From acdc469e39a193882f4559240b202eccb66cf0c8 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Mon, 1 Feb 2021 21:05:34 +0900 Subject: [PATCH 01/11] Add A New Baseline: DoubleEnsemble --- .../DoubleEnsemble/requirements.txt | 3 + ...rkflow_config_doubleensemble_Alpha158.yaml | 91 +++++++ ...rkflow_config_doubleensemble_Alpha360.yaml | 98 +++++++ qlib/contrib/model/double_ensemble.py | 241 ++++++++++++++++++ 4 files changed, 433 insertions(+) create mode 100644 examples/benchmarks/DoubleEnsemble/requirements.txt create mode 100644 examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml create mode 100644 examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml create mode 100644 qlib/contrib/model/double_ensemble.py diff --git a/examples/benchmarks/DoubleEnsemble/requirements.txt b/examples/benchmarks/DoubleEnsemble/requirements.txt new file mode 100644 index 0000000000..f9117361d5 --- /dev/null +++ b/examples/benchmarks/DoubleEnsemble/requirements.txt @@ -0,0 +1,3 @@ +pandas==1.1.2 +numpy==1.17.4 +lightgbm==3.1.0 \ No newline at end of file diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml new file mode 100644 index 0000000000..b5c669c28f --- /dev/null +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -0,0 +1,91 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: DEnsembleModel + module_path: qlib.contrib.model.double_ensemble + kwargs: + base: "gbm" + loss: mse + k: 6 + enable_sr: True + enable_fs: True + alpha1: 1 + alpha2: 1 + bins_sr: 10 + bins_fs: 5 + decay: 0.5 + sample_ratios: + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.4 + sub_weights: + - 1 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + colsample_bytree: 0.8879 + learning_rate: 0.2 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + verbosity: -1 + num_iterations: 28 + early_stopping_round: None + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml new file mode 100644 index 0000000000..b31eac4cef --- /dev/null +++ 
b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -0,0 +1,98 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: [] + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy.strategy + kwargs: + topk: 50 + n_drop: 5 + backtest: + verbose: False + limit_threshold: 0.095 + account: 100000000 + benchmark: *benchmark + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: DEnsembleModel + module_path: qlib.contrib.model.double_ensemble + kwargs: + base: "gbm" + loss: mse + k: 6 + enable_sr: True + enable_fs: True + alpha1: 1 + alpha2: 1 + bins_sr: 10 + bins_fs: 5 + decay: 0.5 + sample_ratios: + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.4 + sub_weights: + - 1 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + colsample_bytree: 0.8879 + learning_rate: 0.0421 + subsample: 0.8789 + lambda_l1: 205.6999 + lambda_l2: 580.9768 + max_depth: 8 + num_leaves: 210 + num_threads: 20 + verbosity: -1 + num_iterations: 28 + early_stopping_round: None + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha360 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: {} + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config \ No newline at end of file diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py new file mode 100644 index 0000000000..68680b8145 --- /dev/null +++ b/qlib/contrib/model/double_ensemble.py @@ -0,0 +1,241 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +import lightgbm as lgb +import numpy as np +import pandas as pd + +from ...model.base import Model +from ...data.dataset import DatasetH +from ...data.dataset.handler import DataHandlerLP +from ...log import get_module_logger + + +class DEnsembleModel(Model): + """Double Ensemble Model""" + + def __init__( + self, + base="gbm", + loss="mse", + k=6, + enable_sr=True, + enable_fs=True, + alpha1=1., + alpha2=1., + bins_sr=10, + bins_fs=5, + decay=None, + sample_ratios=None, + sub_weights=None, + **kwargs): + self.base = base # "gbm" or "mlp", specifically, we use lgbm for "gbm" + self.k = k + self.enable_sr = enable_sr + self.enable_fs = enable_fs + self.alpha1 = alpha1 + self.alpha2 = alpha2 + self.bins_sr = bins_sr + self.bins_fs = bins_fs + self.decay = decay + if not len(sample_ratios) == bins_fs: + raise ValueError("The length of sample_ratios should be equal to bins_fs.") + self.sample_ratios = sample_ratios + if not len(sub_weights) == k: + raise ValueError("The length of sub_weights should be equal to k.") + self.sub_weights = sub_weights + self.logger = get_module_logger("DEnsembleModel") + self.logger.info("Double Ensemble Model...") + self.ensemble = [] # the current ensemble model, a list contains all the sub-models + self.sub_features = [] # the features for each sub model in the form of pandas.Index + self.params = {"objective": loss} + self.params.update(kwargs) + self.loss = loss + + def fit( + self, + dataset: DatasetH + ): + df_train, df_valid = dataset.prepare( + ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L + ) + x_train, y_train = df_train["feature"], df_train["label"] + # initialize the sample weights + N, F = x_train.shape + weights = pd.Series(np.ones(N, dtype=float)) + # initialize the features + features = x_train.columns + pred_sub = pd.DataFrame(np.zeros((N, self.k), dtype=float), index=x_train.index) + # train k sub-models + for i_k in range(self.k): + self.sub_features.append(features) + self.logger.info("Training sub-model: ({}/{})".format(i_k+1, self.k)) + model_k = self.train_submodel(df_train, df_valid, weights, features) + self.ensemble.append(model_k) + # no further sample re-weight and feature selection needed for the last sub-model + if i_k + 1 == self.k: + break + + self.logger.info("Retrieving loss curve and loss values...") + loss_curve = self.retrieve_loss_curve(model_k, df_train, features) + pred_k = self.predict_sub(model_k, df_train, features) + pred_sub.iloc[:, i_k] = pred_k + pred_ensemble = pred_sub.iloc[:, :i_k+1].mean(axis=1) + loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values)) + + if self.enable_sr: + self.logger.info("Sample re-weighting...") + weights = self.sample_reweight(loss_curve, loss_values, i_k+1) + + if self.enable_fs: + self.logger.info("Feature selection...") + features = self.feature_selection(df_train, loss_values) + + def train_submodel(self, df_train, df_valid, weights, features): + dtrain, dvalid = self._prepare_data_gbm(df_train, df_valid, weights, features) + evals_result = dict() + model = lgb.train( + self.params, + dtrain, + valid_sets=[dtrain, dvalid], + valid_names=["train", "valid"], + verbose_eval=20, + evals_result=evals_result, + ) + evals_result["train"] = list(evals_result["train"].values())[0] + evals_result["valid"] = list(evals_result["valid"].values())[0] + return model + + def _prepare_data_gbm(self, df_train, df_valid, weights, features): + x_train, y_train = df_train["feature"].loc[:, features], df_train["label"] + x_valid, y_valid = 
df_valid["feature"].loc[:, features], df_valid["label"]
+
+        # LightGBM needs a 1D array as its label
+        if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+            y_train, y_valid = np.squeeze(y_train.values), np.squeeze(y_valid.values)
+        else:
+            raise ValueError("LightGBM doesn't support multi-label training")
+
+        dtrain = lgb.Dataset(x_train.values, label=y_train, weight=weights)
+        dvalid = lgb.Dataset(x_valid.values, label=y_valid)
+        return dtrain, dvalid
+
+    def sample_reweight(self, loss_curve, loss_values, k_th):
+        """
+        the SR module of Double Ensemble
+        :param loss_curve: the shape is NxT
+            the loss curve for the previous sub-model, where the element (i, t) is the error on the i-th sample
+            after the t-th iteration in the training of the previous sub-model.
+        :param loss_values: the shape is N
+            the i-th element is the loss of the current ensemble on the i-th sample.
+        :param k_th: the index of the current sub-model, starting from 1
+        :return: weights
+            the weights for all the samples.
+        """
+        # normalize loss_curve and loss_values with ranking
+        loss_curve_norm = loss_curve.rank(axis=0, pct=True)
+        loss_values_norm = (-loss_values).rank(pct=True)
+
+        # calculate l_start and l_end from loss_curve
+        N, T = loss_curve.shape
+        part = np.maximum(int(T * 0.1), 1)
+        l_start = loss_curve_norm.iloc[:, :part].mean(axis=1)
+        l_end = loss_curve_norm.iloc[:, -part:].mean(axis=1)
+
+        # calculate h-value for each sample
+        h1 = loss_values_norm
+        h2 = (l_end / l_start).rank(pct=True)
+        h = pd.DataFrame({'h_value': self.alpha1 * h1 + self.alpha2 * h2})
+
+        # calculate weights
+        h['bins'] = pd.cut(h['h_value'], self.bins_sr)
+        h_avg = h.groupby('bins')['h_value'].mean()
+        weights = pd.Series(np.zeros(N, dtype=float))
+        for i_b, b in enumerate(h_avg.index):
+            weights[h['bins'] == b] = 1. / (self.decay ** k_th * h_avg[i_b] + 0.1)
+        return weights
+
+    def feature_selection(self, df_train, loss_values):
+        """
+        the FS module of Double Ensemble
+        :param df_train: the shape is NxF
+        :param loss_values: the shape is N
+            the i-th element is the loss of the current ensemble on the i-th sample.
+        :return: res_feat: in the form of pandas.Index
+
+        """
+        x_train, y_train = df_train["feature"], df_train["label"]
+        features = x_train.columns
+        N, F = x_train.shape
+        g = pd.DataFrame({'g_value': np.zeros(F, dtype=float)})
+        M = len(self.ensemble)
+
+        # shuffle specific columns and calculate g-value for each feature
+        x_train_tmp = x_train.copy()
+        for i_f, feat in enumerate(features):
+            x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values)
+            pred = pd.Series(np.zeros(N), index=x_train_tmp.index)
+            for i_s, submodel in enumerate(self.ensemble):
+                pred += pd.Series(submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values),
+                                  index=x_train_tmp.index) / M
+            loss_feat = self.get_loss(y_train.values.squeeze(), pred.values)
+            g.loc[i_f, 'g_value'] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values)
+            x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy()
+
+        # one column in train features is all-nan # if g['g_value'].isna().any()
+        g['g_value'].replace(np.nan, 0, inplace=True)
+
+        # divide features into bins_fs bins
+        g['bins'] = pd.cut(g['g_value'], self.bins_fs)
+
+        # randomly sample features from bins to construct the new features
+        res_feat = []
+        sorted_bins = sorted(g['bins'].unique(), reverse=True)
+        for i_b, b in enumerate(sorted_bins):
+            b_feat = features[g['bins'] == b]
+            num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat)))
+            res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist()
+        return pd.Index(res_feat)
+
+    def get_loss(self, label, pred):
+        if self.loss == "mse":
+            return (label - pred) ** 2
+        else:
+            raise ValueError("not implemented yet")
+
+    def retrieve_loss_curve(self, model, df_train, features):
+        if self.base == "gbm":
+            num_trees = model.num_trees()
+            x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
+            # LightGBM needs a 1D array as its label
+            if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
+                y_train = np.squeeze(y_train.values)
+            else:
+                raise ValueError("LightGBM doesn't support multi-label training")
+
+            N = x_train.shape[0]
+            loss_curve = pd.DataFrame(np.zeros((N, num_trees)))
+            pred_tree = np.zeros(N, dtype=float)
+            for i_tree in range(num_trees):
+                pred_tree += model.predict(x_train.values, start_iteration=i_tree, num_iteration=1)
+                loss_curve.iloc[:, i_tree] = self.get_loss(y_train, pred_tree)
+        else:
+            raise ValueError("not implemented yet")
+        return loss_curve
+
+    def predict(self, dataset):
+        if self.ensemble is None:
+            raise ValueError("model is not fitted yet!")
+        x_test = dataset.prepare("test", col_set="feature", data_key=DataHandlerLP.DK_I)
+        pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index)
+        for i_sub, submodel in enumerate(self.ensemble):
+            feat_sub = self.sub_features[i_sub]
+            pred += pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) * self.sub_weights[i_sub]
+        return pred
+
+    def predict_sub(self, submodel, df_data, features):
+        x_data, y_data = df_data["feature"].loc[:, features], df_data["label"]
+        pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index)
+        return pred_sub
+
+

From 8c3ec164ff5bb73fea2cc0d63f91776148ed2760 Mon Sep 17 00:00:00 2001
From: meng-ustc
Date: Tue, 2 Feb 2021 11:46:37 +0900
Subject: [PATCH 02/11] Add A New Baseline: DoubleEnsemble

---
 ...rkflow_config_doubleensemble_Alpha158.yaml |  5 +-
 ...rkflow_config_doubleensemble_Alpha360.yaml | 47 +++++++++----------
 examples/run_all_model.py                     |  2 +-
 qlib/contrib/model/double_ensemble.py         |  3 ++
 4 files
changed, 29 insertions(+), 28 deletions(-) diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index b5c669c28f..c468eb320a 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -33,7 +33,7 @@ task: base: "gbm" loss: mse k: 6 - enable_sr: True + enable_sr: False enable_fs: True alpha1: 1 alpha2: 1 @@ -53,6 +53,7 @@ task: - 0.2 - 0.2 - 0.2 + epochs: 28 colsample_bytree: 0.8879 learning_rate: 0.2 subsample: 0.8789 @@ -62,8 +63,6 @@ task: num_leaves: 210 num_threads: 20 verbosity: -1 - num_iterations: 28 - early_stopping_round: None dataset: class: DatasetH module_path: qlib.data.dataset diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml index b31eac4cef..3351cefc5a 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -38,28 +38,29 @@ task: module_path: qlib.contrib.model.double_ensemble kwargs: base: "gbm" - loss: mse - k: 6 - enable_sr: True - enable_fs: True - alpha1: 1 - alpha2: 1 - bins_sr: 10 - bins_fs: 5 - decay: 0.5 - sample_ratios: - - 0.8 - - 0.7 - - 0.6 - - 0.5 - - 0.4 - sub_weights: - - 1 - - 0.2 - - 0.2 - - 0.2 - - 0.2 - - 0.2 + loss: mse + k: 6 + enable_sr: True + enable_fs: True + alpha1: 1 + alpha2: 1 + bins_sr: 10 + bins_fs: 5 + decay: 0.5 + sample_ratios: + - 0.8 + - 0.7 + - 0.6 + - 0.5 + - 0.4 + sub_weights: + - 1 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + - 0.2 + epochs: 136 colsample_bytree: 0.8879 learning_rate: 0.0421 subsample: 0.8789 @@ -69,8 +70,6 @@ task: num_leaves: 210 num_threads: 20 verbosity: -1 - num_iterations: 28 - early_stopping_round: None dataset: class: DatasetH module_path: qlib.data.dataset diff --git a/examples/run_all_model.py b/examples/run_all_model.py index d587eff155..1809bac2e5 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/meng-ustc/qlib#egg=pyqlib" ) # TODO: FIX ME! 
sys.stderr.write("\n") # run workflow_by_config for multiple times diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index 68680b8145..786b3987cf 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -28,6 +28,7 @@ def __init__( decay=None, sample_ratios=None, sub_weights=None, + epochs=100, **kwargs): self.base = base # "gbm" or "mlp", specifically, we use lgbm for "gbm" self.k = k @@ -44,6 +45,7 @@ def __init__( if not len(sub_weights) == k: raise ValueError("The length of sub_weights should be equal to k.") self.sub_weights = sub_weights + self.epochs = epochs self.logger = get_module_logger("DEnsembleModel") self.logger.info("Double Ensemble Model...") self.ensemble = [] # the current ensemble model, a list contains all the sub-models @@ -97,6 +99,7 @@ def train_submodel(self, df_train, df_valid, weights, features): model = lgb.train( self.params, dtrain, + num_boost_round=self.epochs, valid_sets=[dtrain, dvalid], valid_names=["train", "valid"], verbose_eval=20, From fd5c68a7d1b9e9074464431ab384dfbdda7aa206 Mon Sep 17 00:00:00 2001 From: Meng Dong Date: Tue, 2 Feb 2021 12:39:07 +0800 Subject: [PATCH 03/11] Update workflow_config_doubleensemble_Alpha158.yaml --- .../workflow_config_doubleensemble_Alpha158.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index c468eb320a..74923c0e5e 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -33,7 +33,7 @@ task: base: "gbm" loss: mse k: 6 - enable_sr: False + enable_sr: True enable_fs: True alpha1: 1 alpha2: 1 @@ -87,4 +87,4 @@ task: - class: PortAnaRecord module_path: qlib.workflow.record_temp kwargs: - config: *port_analysis_config \ No newline at end of file + config: *port_analysis_config From d27dc8bab8955bc8900c1a0a0d9d564cd4c9400b Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Thu, 18 Feb 2021 19:02:33 +0900 Subject: [PATCH 04/11] Add A New Baseline: DoubleEnsemble --- examples/run_all_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_all_model.py b/examples/run_all_model.py index 1809bac2e5..d587eff155 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/meng-ustc/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" ) # TODO: FIX ME! sys.stderr.write("\n") # run workflow_by_config for multiple times From 42590972e49457d659749a65e8c8ad02172c8fe5 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Thu, 18 Feb 2021 19:15:02 +0900 Subject: [PATCH 05/11] Modify run_all_model.py --- examples/run_all_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_all_model.py b/examples/run_all_model.py index d587eff155..f9d4a62a04 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! 
else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft----/qlib#egg=pyqlib" ) # TODO: FIX ME! sys.stderr.write("\n") # run workflow_by_config for multiple times From cd5b721bc6e91de0d0c83978ec14475a52f94a2e Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Fri, 19 Feb 2021 11:56:50 +0900 Subject: [PATCH 06/11] Update --- examples/run_all_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_all_model.py b/examples/run_all_model.py index f9d4a62a04..d587eff155 100644 --- a/examples/run_all_model.py +++ b/examples/run_all_model.py @@ -265,7 +265,7 @@ def run(times=1, models=None, dataset="Alpha360", exclude=False): ) # TODO: FIX ME! else: execute( - f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft----/qlib#egg=pyqlib" + f"cd {env_path} && {python_path} -m pip install --upgrade --force-reinstall -e git+https://github.com/microsoft/qlib#egg=pyqlib" ) # TODO: FIX ME! sys.stderr.write("\n") # run workflow_by_config for multiple times From 1a990fdd25e5fd25e6b868d56122f92b7caa30f7 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Tue, 23 Feb 2021 19:08:11 +0900 Subject: [PATCH 07/11] Add Risk Prediction Demo --- examples/workflow_by_code_lgb_risk_demo.py | 179 +++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 examples/workflow_by_code_lgb_risk_demo.py diff --git a/examples/workflow_by_code_lgb_risk_demo.py b/examples/workflow_by_code_lgb_risk_demo.py new file mode 100644 index 0000000000..b250993d33 --- /dev/null +++ b/examples/workflow_by_code_lgb_risk_demo.py @@ -0,0 +1,179 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+
+import sys
+from pathlib import Path
+
+import qlib
+from qlib.config import REG_CN
+from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
+from qlib.workflow import R
+from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
+from qlib.data.dataset.handler import DataHandlerLP
+
+import seaborn as sns
+import matplotlib.pyplot as plt
+import math
+import pandas as pd
+from scipy.stats.stats import pearsonr
+import numpy as np
+
+if __name__ == "__main__":
+
+    # use default data
+    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
+    if not exists_qlib_data(provider_uri):
+        print(f"Qlib data is not found in {provider_uri}")
+        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
+        from get_data import GetData
+
+        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
+
+    qlib.init(provider_uri=provider_uri, region=REG_CN)
+
+    market = "csi300"
+    benchmark = "SH000300"
+
+    ###################################
+    # train model
+    ###################################
+    data_handler_config = {
+        "start_time": "2008-01-01",
+        "end_time": "2020-08-01",
+        "fit_start_time": "2008-01-01",
+        "fit_end_time": "2014-12-31",
+        "instruments": market,
+        "infer_processors": [
+            {"class": "ProcessInf", "kwargs": {}},
+            {"class": "ZScoreNorm", "kwargs": {"fields_group": "feature"}},
+            {"class": "Fillna", "kwargs": {}},
+        ],
+        "learn_processors": [{
+            "class": "DropnaLabel", },
+        ],
+        "label": (["Ref(Min($low, 5), -4)/$close - 1"], ["LABEL0"])  # the period for risk prediction is 5 days
+    }
+
+    task = {
+        "model": {
+            "class": "LGBModel",
+            "module_path": "qlib.contrib.model.gbdt",
+            "kwargs": {
+                "loss": "mse",
+                "colsample_bytree": 0.8999,
+                "learning_rate": 0.02,
+                "subsample": 0.7,
+                "lambda_l1": 11.9668,
+                "lambda_l2": 339.1301,
+                "max_depth": 16,
+                "num_leaves": 31,
+                "num_threads": 20,
+            },
+        },
+        "dataset": {
+            "class": "DatasetH",
+            "module_path": "qlib.data.dataset",
+            "kwargs": {
+                "handler": {
+                    "class": "Alpha360",
+                    "module_path": "qlib.contrib.data.handler",
+                    "kwargs": data_handler_config,
+                },
+                "segments": {
+                    "train": ("2008-01-01", "2014-12-31"),
+                    "valid": ("2015-01-01", "2016-12-31"),
+                    "test": ("2017-01-01", "2020-08-01"),
+                },
+            },
+        },
+    }
+
+    port_analysis_config = {
+        "strategy": {
+            "class": "TopkDropoutStrategy",
+            "module_path": "qlib.contrib.strategy.strategy",
+            "kwargs": {
+                "topk": 50,
+                "n_drop": 5,
+            },
+        },
+        "backtest": {
+            "verbose": False,
+            "limit_threshold": 0.095,
+            "account": 100000000,
+            "benchmark": benchmark,
+            "deal_price": "close",
+            "open_cost": 0.0005,
+            "close_cost": 0.0015,
+            "min_cost": 5,
+            "return_order": True,
+        },
+    }
+
+    # model initialization
+    model = init_instance_by_config(task["model"])
+    dataset = init_instance_by_config(task["dataset"])
+
+    # NOTE: This line is optional
+    # It demonstrates that the dataset can be used standalone.
+    example_df = dataset.prepare("train")
+    print(example_df.head())
+
+    def heatmap(actual_risk, predicted_risk, step=0.02):
+        """
+        plot the precision heatmap as a visualized evaluation for risk prediction
+        :param actual_risk: the LABEL0 of test samples
+        :param predicted_risk: the predicted results of test samples
+        :param step: the interval size of risk values on the axes
+        :return:
+        """
+        num_step = math.ceil(-predicted_risk.min() / step)
+        matrix = np.zeros((num_step, num_step), dtype=np.float)
+        for pred_thresh in range(num_step):
+            for act_thresh in range(num_step):
+                actual_positive = actual_risk < -act_thresh*step
+                predicted_alarm = predicted_risk < -pred_thresh*step
+                num_alarm = predicted_alarm.sum()
+                num_tp = (actual_positive & predicted_alarm).sum()
+                matrix[pred_thresh, act_thresh] = num_tp / num_alarm
+        axis_labels = ['{:.3f}'.format(-x * step) for x in range(num_step)]
+        return matrix, axis_labels
+
+    # start exp
+    with R.start(experiment_name="workflow"):
+        R.log_params(**flatten_dict(task))
+        model.fit(dataset)
+
+        # prediction
+        actual_risk = dataset.prepare("test", col_set="label", data_key=DataHandlerLP.DK_I)['LABEL0']
+        pred = model.predict(dataset)
+
+        result_df = pd.concat((actual_risk, pred), axis=1)
+        result_df.columns = ['Actual Risk', 'Predicted Risk']
+        result_df.dropna(inplace=True)
+        actual_risk, predicted_risk = result_df.iloc[:, 0], result_df.iloc[:, 1]
+        corr = pearsonr(actual_risk, predicted_risk)[0]
+        print('The correlation between predicted risk and actual risk is: {:.6f}'.format(corr))
+
+        # visualized results
+        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+        sns.histplot(actual_risk, ax=axes[0, 0])
+        axes[0, 0].set_title('Market: {} Actual Risk'.format(market))
+        axes[0, 0].grid()
+
+        sns.histplot(predicted_risk, ax=axes[0, 1])
+        axes[0, 1].set_title('Feature: {} Predicted Risk'.format(task['dataset']['kwargs']['handler']['class']))
+        axes[0, 1].grid()
+
+        sns.scatterplot(data=result_df, ax=axes[1, 0], x='Actual Risk', y='Predicted Risk', s=20)
+        axes[1, 0].set_title('Market: {} Feature: {} Corr: {:.5f}'.format(
+            market, task['dataset']['kwargs']['handler']['class'], corr))
+        axes[1, 0].grid()
+
+        matrix, ax_labels = heatmap(actual_risk, predicted_risk)
+        sns.heatmap(matrix, annot=True, fmt=".3f", xticklabels=ax_labels, yticklabels=ax_labels, ax=axes[1, 1],
+                    )
+        axes[1, 1].set_xlabel('Predicted Alarm Threshold')
+        axes[1, 1].set_ylabel('Actual Positive Threshold')
+        axes[1, 1].set_title('Risk Prediction Precision Heatmap')
+        plt.show()

From ce60097722d46bb431224d1bf9e67e59b2e03224 Mon Sep 17 00:00:00 2001
From: meng-ustc
Date: Wed, 24 Feb 2021 16:59:31 +0900
Subject: [PATCH 08/11] Add README and Formatted

---
 README.md                                    |  1 +
 examples/benchmarks/DoubleEnsemble/README.md |  4 +
 qlib/contrib/model/double_ensemble.py        | 77 ++++++++++----------
 3 files changed, 45 insertions(+), 37 deletions(-)
 create mode 100644 examples/benchmarks/DoubleEnsemble/README.md

diff --git a/README.md b/README.md
index 787075d6ad..e1608c37da 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,7 @@ Here is a list of models built on `Qlib`.
 - [SFM based on pytorch (Liheng Zhang, et al. 2017)](qlib/contrib/model/pytorch_sfm.py)
 - [TFT based on tensorflow (Bryan Lim, et al. 2019)](examples/benchmarks/TFT/tft.py)
 - [TabNet based on pytorch (Sercan O. Arik, et al. 2019)](qlib/contrib/model/pytorch_tabnet.py)
+- [DoubleEnsemble based on LightGBM (Chuheng Zhang, et al. 2020)](qlib/contrib/model/double_ensemble.py)

 Your PR of new Quant models is highly welcomed.
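[Editor's sketch, not part of the patch series] A registry entry like the one above is resolved the same way the benchmark YAMLs are: qlib's `init_instance_by_config` imports `module_path` and constructs `class` with `kwargs` (the deleted risk demo in PATCH 07 used the same helper). The following minimal Python sketch shows that path for the new baseline; the kwargs are a subset of the Alpha158 config from PATCH 01/02 (note `base`/`k` are the pre-PATCH-11 parameter names), and it assumes a qlib installation with this series applied.

    # Hedged sketch: build the DoubleEnsemble baseline from a config dict,
    # exactly as the workflow YAMLs above are resolved by qlib.
    from qlib.utils import init_instance_by_config

    model_config = {
        "class": "DEnsembleModel",
        "module_path": "qlib.contrib.model.double_ensemble",
        "kwargs": {
            "base": "gbm",          # pre-PATCH-11 name; later renamed to `base_model`
            "loss": "mse",
            "k": 6,                 # pre-PATCH-11 name; later renamed to `num_models`
            "enable_sr": True,      # sample reweighting (SR) module
            "enable_fs": True,      # feature selection (FS) module
            "decay": 0.5,
            "sample_ratios": [0.8, 0.7, 0.6, 0.5, 0.4],   # length must equal bins_fs (default 5)
            "sub_weights": [1, 0.2, 0.2, 0.2, 0.2, 0.2],  # length must equal k
            "epochs": 28,           # boosting rounds per sub-model (added in PATCH 02)
        },
    }
    model = init_instance_by_config(model_config)  # returns a DEnsembleModel instance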
diff --git a/examples/benchmarks/DoubleEnsemble/README.md b/examples/benchmarks/DoubleEnsemble/README.md
new file mode 100644
index 0000000000..67e741050f
--- /dev/null
+++ b/examples/benchmarks/DoubleEnsemble/README.md
@@ -0,0 +1,4 @@
+# DoubleEnsemble
+* DoubleEnsemble is an ensemble framework that leverages learning-trajectory-based sample reweighting and shuffling-based feature selection to tackle both the low signal-to-noise ratio and the ever-growing feature set in financial data. It identifies the key samples from the training dynamics on each sample, and elicits the key features from the ablation impact of each feature via shuffling. The framework is applicable to a wide range of base models, is able to extract complex patterns, and mitigates the overfitting and instability issues of financial market prediction.
+* The code used in Qlib is our own implementation.
+* Paper: DoubleEnsemble: A New Ensemble Method Based on Sample Reweighting and Feature Selection for Financial Data Analysis [https://arxiv.org/pdf/2010.01265.pdf](https://arxiv.org/pdf/2010.01265.pdf).
\ No newline at end of file
diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py
index 786b3987cf..c96b69e8b4 100644
--- a/qlib/contrib/model/double_ensemble.py
+++ b/qlib/contrib/model/double_ensemble.py
@@ -15,21 +15,22 @@ class DEnsembleModel(Model):
     """Double Ensemble Model"""

     def __init__(
-            self,
-            base="gbm",
-            loss="mse",
-            k=6,
-            enable_sr=True,
-            enable_fs=True,
-            alpha1=1.,
-            alpha2=1.,
-            bins_sr=10,
-            bins_fs=5,
-            decay=None,
-            sample_ratios=None,
-            sub_weights=None,
-            epochs=100,
-            **kwargs):
+        self,
+        base="gbm",
+        loss="mse",
+        k=6,
+        enable_sr=True,
+        enable_fs=True,
+        alpha1=1.0,
+        alpha2=1.0,
+        bins_sr=10,
+        bins_fs=5,
+        decay=None,
+        sample_ratios=None,
+        sub_weights=None,
+        epochs=100,
+        **kwargs
+    ):
         self.base = base  # "gbm" or "mlp", specifically, we use lgbm for "gbm"
         self.k = k
         self.enable_sr = enable_sr
@@ -54,10 +55,7 @@ def __init__(
         self.params.update(kwargs)
         self.loss = loss

-    def fit(
-        self,
-        dataset: DatasetH
-    ):
+    def fit(self, dataset: DatasetH):
         df_train, df_valid = dataset.prepare(
             ["train", "valid"], col_set=["feature", "label"], data_key=DataHandlerLP.DK_L
         )
@@ -71,7 +69,7 @@ def fit(self, dataset: DatasetH):
         # train k sub-models
         for i_k in range(self.k):
             self.sub_features.append(features)
-            self.logger.info("Training sub-model: ({}/{})".format(i_k+1, self.k))
+            self.logger.info("Training sub-model: ({}/{})".format(i_k + 1, self.k))
             model_k = self.train_submodel(df_train, df_valid, weights, features)
             self.ensemble.append(model_k)
             # no further sample re-weight and feature selection needed for the last sub-model
@@ -82,12 +80,12 @@ def fit(self, dataset: DatasetH):
             loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
             pred_k = self.predict_sub(model_k, df_train, features)
             pred_sub.iloc[:, i_k] = pred_k
-            pred_ensemble = pred_sub.iloc[:, :i_k+1].mean(axis=1)
+            pred_ensemble = pred_sub.iloc[:, : i_k + 1].mean(axis=1)
             loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))

             if self.enable_sr:
                 self.logger.info("Sample re-weighting...")
-                weights = self.sample_reweight(loss_curve, loss_values, i_k+1)
+                weights = self.sample_reweight(loss_curve, loss_values, i_k + 1)

             if self.enable_fs:
                 self.logger.info("Feature selection...")
@@ -148,14 +146,14 @@ def sample_reweight(self, loss_curve, loss_values, k_th):
         # calculate h-value for each sample
         h1 = loss_values_norm
         h2 = (l_end / l_start).rank(pct=True)
-        h = pd.DataFrame({'h_value': self.alpha1 * h1 + self.alpha2 * 
h2}) + h = pd.DataFrame({"h_value": self.alpha1 * h1 + self.alpha2 * h2}) # calculate weights - h['bins'] = pd.cut(h['h_value'], self.bins_sr) - h_avg = h.groupby('bins')['h_value'].mean() + h["bins"] = pd.cut(h["h_value"], self.bins_sr) + h_avg = h.groupby("bins")["h_value"].mean() weights = pd.Series(np.zeros(N, dtype=float)) for i_b, b in enumerate(h_avg.index): - weights[h['bins'] == b] = 1. / (self.decay ** k_th * h_avg[i_b] + 0.1) + weights[h["bins"] == b] = 1.0 / (self.decay ** k_th * h_avg[i_b] + 0.1) return weights def feature_selection(self, df_train, loss_values): @@ -170,7 +168,7 @@ def feature_selection(self, df_train, loss_values): x_train, y_train = df_train["feature"], df_train["label"] features = x_train.columns N, F = x_train.shape - g = pd.DataFrame({'g_value': np.zeros(F, dtype=float)}) + g = pd.DataFrame({"g_value": np.zeros(F, dtype=float)}) M = len(self.ensemble) # shuffle specific columns and calculate g-value for each feature @@ -179,23 +177,27 @@ def feature_selection(self, df_train, loss_values): x_train_tmp.loc[:, feat] = np.random.permutation(x_train_tmp.loc[:, feat].values) pred = pd.Series(np.zeros(N), index=x_train_tmp.index) for i_s, submodel in enumerate(self.ensemble): - pred += pd.Series(submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), - index=x_train_tmp.index) / M + pred += ( + pd.Series( + submodel.predict(x_train_tmp.loc[:, self.sub_features[i_s]].values), index=x_train_tmp.index + ) + / M + ) loss_feat = self.get_loss(y_train.values.squeeze(), pred.values) - g.loc[i_f, 'g_value'] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values) + g.loc[i_f, "g_value"] = np.mean(loss_feat - loss_values) / np.std(loss_feat - loss_values) x_train_tmp.loc[:, feat] = x_train.loc[:, feat].copy() # one column in train features is all-nan # if g['g_value'].isna().any() - g['g_value'].replace(np.nan, 0, inplace=True) + g["g_value"].replace(np.nan, 0, inplace=True) # divide features into bins_fs bins - g['bins'] = pd.cut(g['g_value'], self.bins_fs) + g["bins"] = pd.cut(g["g_value"], self.bins_fs) # randomly sample features from bins to construct the new features res_feat = [] - sorted_bins = sorted(g['bins'].unique(), reverse=True) + sorted_bins = sorted(g["bins"].unique(), reverse=True) for i_b, b in enumerate(sorted_bins): - b_feat = features[g['bins'] == b] + b_feat = features[g["bins"] == b] num_feat = int(np.ceil(self.sample_ratios[i_b] * len(b_feat))) res_feat = res_feat + np.random.choice(b_feat, size=num_feat).tolist() return pd.Index(res_feat) @@ -233,12 +235,13 @@ def predict(self, dataset): pred = pd.Series(np.zeros(x_test.shape[0]), index=x_test.index) for i_sub, submodel in enumerate(self.ensemble): feat_sub = self.sub_features[i_sub] - pred += pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) * self.sub_weights[i_sub] + pred += ( + pd.Series(submodel.predict(x_test.loc[:, feat_sub].values), index=x_test.index) + * self.sub_weights[i_sub] + ) return pred def predict_sub(self, submodel, df_data, features): x_data, y_data = df_data["feature"].loc[:, features], df_data["label"] pred_sub = pd.Series(submodel.predict(x_data.values), index=x_data.index) return pred_sub - - From 70575e8a1ce4f63cced98a55de1914009012bdcd Mon Sep 17 00:00:00 2001 From: Meng Dong Date: Wed, 24 Feb 2021 16:10:38 +0800 Subject: [PATCH 09/11] Delete workflow_by_code_lgb_risk_demo.py --- examples/workflow_by_code_lgb_risk_demo.py | 179 --------------------- 1 file changed, 179 deletions(-) delete mode 100644 
examples/workflow_by_code_lgb_risk_demo.py
diff --git a/examples/workflow_by_code_lgb_risk_demo.py b/examples/workflow_by_code_lgb_risk_demo.py
deleted file mode 100644
index b250993d33..0000000000
--- a/examples/workflow_by_code_lgb_risk_demo.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT License.
-
-import sys
-from pathlib import Path
-
-import qlib
-from qlib.config import REG_CN
-from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
-from qlib.workflow import R
-from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
-from qlib.data.dataset.handler import DataHandlerLP
-
-import seaborn as sns
-import matplotlib.pyplot as plt
-import math
-import pandas as pd
-from scipy.stats.stats import pearsonr
-import numpy as np
-
-if __name__ == "__main__":
-
-    # use default data
-    provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
-    if not exists_qlib_data(provider_uri):
-        print(f"Qlib data is not found in {provider_uri}")
-        sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts")))
-        from get_data import GetData
-
-        GetData().qlib_data(target_dir=provider_uri, region=REG_CN)
-
-    qlib.init(provider_uri=provider_uri, region=REG_CN)
-
-    market = "csi300"
-    benchmark = "SH000300"
-
-    ###################################
-    # train model
-    ###################################
-    data_handler_config = {
-        "start_time": "2008-01-01",
-        "end_time": "2020-08-01",
-        "fit_start_time": "2008-01-01",
-        "fit_end_time": "2014-12-31",
-        "instruments": market,
-        "infer_processors": [
-            {"class": "ProcessInf", "kwargs": {}},
-            {"class": "ZScoreNorm", "kwargs": {"fields_group": "feature"}},
-            {"class": "Fillna", "kwargs": {}},
-        ],
-        "learn_processors": [{
-            "class": "DropnaLabel", },
-        ],
-        "label": (["Ref(Min($low, 5), -4)/$close - 1"], ["LABEL0"])  # the period for risk prediction is 5 days
-    }
-
-    task = {
-        "model": {
-            "class": "LGBModel",
-            "module_path": "qlib.contrib.model.gbdt",
-            "kwargs": {
-                "loss": "mse",
-                "colsample_bytree": 0.8999,
-                "learning_rate": 0.02,
-                "subsample": 0.7,
-                "lambda_l1": 11.9668,
-                "lambda_l2": 339.1301,
-                "max_depth": 16,
-                "num_leaves": 31,
-                "num_threads": 20,
-            },
-        },
-        "dataset": {
-            "class": "DatasetH",
-            "module_path": "qlib.data.dataset",
-            "kwargs": {
-                "handler": {
-                    "class": "Alpha360",
-                    "module_path": "qlib.contrib.data.handler",
-                    "kwargs": data_handler_config,
-                },
-                "segments": {
-                    "train": ("2008-01-01", "2014-12-31"),
-                    "valid": ("2015-01-01", "2016-12-31"),
-                    "test": ("2017-01-01", "2020-08-01"),
-                },
-            },
-        },
-    }
-
-    port_analysis_config = {
-        "strategy": {
-            "class": "TopkDropoutStrategy",
-            "module_path": "qlib.contrib.strategy.strategy",
-            "kwargs": {
-                "topk": 50,
-                "n_drop": 5,
-            },
-        },
-        "backtest": {
-            "verbose": False,
-            "limit_threshold": 0.095,
-            "account": 100000000,
-            "benchmark": benchmark,
-            "deal_price": "close",
-            "open_cost": 0.0005,
-            "close_cost": 0.0015,
-            "min_cost": 5,
-            "return_order": True,
-        },
-    }
-
-    # model initialization
-    model = init_instance_by_config(task["model"])
-    dataset = init_instance_by_config(task["dataset"])
-
-    # NOTE: This line is optional
-    # It demonstrates that the dataset can be used standalone.
-    example_df = dataset.prepare("train")
-    print(example_df.head())
-
-    def heatmap(actual_risk, predicted_risk, step=0.02):
-        """
-        plot the precision heatmap as a visualized evaluation for risk prediction
-        :param actual_risk: the LABEL0 of test samples
-        :param predicted_risk: the predicted results of test samples
-        :param step: the interval size of risk values on the axes
-        :return:
-        """
-        num_step = math.ceil(-predicted_risk.min() / step)
-        matrix = np.zeros((num_step, num_step), dtype=np.float)
-        for pred_thresh in range(num_step):
-            for act_thresh in range(num_step):
-                actual_positive = actual_risk < -act_thresh*step
-                predicted_alarm = predicted_risk < -pred_thresh*step
-                num_alarm = predicted_alarm.sum()
-                num_tp = (actual_positive & predicted_alarm).sum()
-                matrix[pred_thresh, act_thresh] = num_tp / num_alarm
-        axis_labels = ['{:.3f}'.format(-x * step) for x in range(num_step)]
-        return matrix, axis_labels
-
-    # start exp
-    with R.start(experiment_name="workflow"):
-        R.log_params(**flatten_dict(task))
-        model.fit(dataset)
-
-        # prediction
-        actual_risk = dataset.prepare("test", col_set="label", data_key=DataHandlerLP.DK_I)['LABEL0']
-        pred = model.predict(dataset)
-
-        result_df = pd.concat((actual_risk, pred), axis=1)
-        result_df.columns = ['Actual Risk', 'Predicted Risk']
-        result_df.dropna(inplace=True)
-        actual_risk, predicted_risk = result_df.iloc[:, 0], result_df.iloc[:, 1]
-        corr = pearsonr(actual_risk, predicted_risk)[0]
-        print('The correlation between predicted risk and actual risk is: {:.6f}'.format(corr))
-
-        # visualized results
-        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
-        sns.histplot(actual_risk, ax=axes[0, 0])
-        axes[0, 0].set_title('Market: {} Actual Risk'.format(market))
-        axes[0, 0].grid()
-
-        sns.histplot(predicted_risk, ax=axes[0, 1])
-        axes[0, 1].set_title('Feature: {} Predicted Risk'.format(task['dataset']['kwargs']['handler']['class']))
-        axes[0, 1].grid()
-
-        sns.scatterplot(data=result_df, ax=axes[1, 0], x='Actual Risk', y='Predicted Risk', s=20)
-        axes[1, 0].set_title('Market: {} Feature: {} Corr: {:.5f}'.format(
-            market, task['dataset']['kwargs']['handler']['class'], corr))
-        axes[1, 0].grid()
-
-        matrix, ax_labels = heatmap(actual_risk, predicted_risk)
-        sns.heatmap(matrix, annot=True, fmt=".3f", xticklabels=ax_labels, yticklabels=ax_labels, ax=axes[1, 1],
-                    )
-        axes[1, 1].set_xlabel('Predicted Alarm Threshold')
-        axes[1, 1].set_ylabel('Actual Positive Threshold')
-        axes[1, 1].set_title('Risk Prediction Precision Heatmap')
-        plt.show()

From 6e2ce6f1dce676f1c82bed2446e84ba43e4bc29b Mon Sep 17 00:00:00 2001
From: meng-ustc
Date: Tue, 2 Mar 2021 12:17:05 +0900
Subject: [PATCH 10/11] Add the results of DoubleEnsemble

---
 examples/benchmarks/README.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md
index bcd2279b33..c5bfced6dc 100644
--- a/examples/benchmarks/README.md
+++ b/examples/benchmarks/README.md
@@ -16,7 +16,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of
 | LSTM (Sepp Hochreiter, et al.) | Alpha360 | 0.0443±0.01 | 0.3401±0.05| 0.0536±0.01 | 0.4248±0.05 | 0.0627±0.03 | 0.8441±0.48| -0.0882±0.03 |
 | ALSTM (Yao Qin, et al.) | Alpha360 | 0.0493±0.01 | 0.3778±0.06| 0.0585±0.00 | 0.4606±0.04 | 0.0513±0.03 | 0.6727±0.38| -0.1085±0.02 |
 | GATs (Petar Velickovic, et al.) | Alpha360 | 0.0475±0.00 | 0.3515±0.02| 0.0592±0.00 | 0.4585±0.01 | 0.0876±0.02 | 1.1513±0.27| -0.0795±0.02 |
-
+| DoubleEnsemble (Chuheng Zhang, et al.)
| Alpha360 | 0.0407±0.00| 0.3053±0.00 | 0.0490±0.00 | 0.3840±0.00 | 0.0380±0.02 | 0.5000±0.21 | -0.0984±0.02 | ## Alpha158 dataset | Model Name | Dataset | IC | ICIR | Rank IC | Rank ICIR | Annualized Return | Information Ratio | Max Drawdown | |---|---|---|---|---|---|---|---|---| @@ -31,5 +31,7 @@ The numbers shown below demonstrate the performance of the entire `workflow` of | LSTM (Sepp Hochreiter, et al.) | Alpha158 (with selected 20 features) | 0.0312±0.00 | 0.2394±0.04| 0.0418±0.00 | 0.3324±0.03 | 0.0298±0.02 | 0.4198±0.33| -0.1348±0.03 | | ALSTM (Yao Qin, et al.) | Alpha158 (with selected 20 features) | 0.0385±0.01 | 0.3022±0.06| 0.0478±0.00 | 0.3874±0.04 | 0.0486±0.03 | 0.7141±0.45| -0.1088±0.03 | | GATs (Petar Velickovic, et al.) | Alpha158 (with selected 20 features) | 0.0349±0.00 | 0.2511±0.01| 0.0457±0.00 | 0.3537±0.01 | 0.0578±0.02 | 0.8221±0.25| -0.0824±0.02 | +| DoubleEnsemble (Chuheng Zhang, et al.) | Alpha158 | 0.0544±0.00 | 0.4338±0.01 | 0.0523±0.00 | 0.4257±0.01 | 0.1253±0.01 | 1.4105±0.14 | -0.0902±0.01 | - The selected 20 features are based on the feature importance of a lightgbm-based model. +- The base model of DoubleEnsemble is LGBM. From 1de4def444a44247aeb80db1adadb8fcad5beb39 Mon Sep 17 00:00:00 2001 From: meng-ustc Date: Tue, 2 Mar 2021 16:14:56 +0900 Subject: [PATCH 11/11] Update parameter names: 'k' and 'base' --- ...rkflow_config_doubleensemble_Alpha158.yaml | 4 +-- ...rkflow_config_doubleensemble_Alpha360.yaml | 4 +-- qlib/contrib/model/double_ensemble.py | 30 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index 74923c0e5e..a12df802da 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -30,9 +30,9 @@ task: class: DEnsembleModel module_path: qlib.contrib.model.double_ensemble kwargs: - base: "gbm" + base_model: "gbm" loss: mse - k: 6 + num_models: 6 enable_sr: True enable_fs: True alpha1: 1 diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml index 3351cefc5a..415448f0be 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -37,9 +37,9 @@ task: class: DEnsembleModel module_path: qlib.contrib.model.double_ensemble kwargs: - base: "gbm" + base_model: "gbm" loss: mse - k: 6 + num_models: 6 enable_sr: True enable_fs: True alpha1: 1 diff --git a/qlib/contrib/model/double_ensemble.py b/qlib/contrib/model/double_ensemble.py index c96b69e8b4..a340489c20 100644 --- a/qlib/contrib/model/double_ensemble.py +++ b/qlib/contrib/model/double_ensemble.py @@ -16,9 +16,9 @@ class DEnsembleModel(Model): def __init__( self, - base="gbm", + base_model="gbm", loss="mse", - k=6, + num_models=6, enable_sr=True, enable_fs=True, alpha1=1.0, @@ -31,8 +31,8 @@ def __init__( epochs=100, **kwargs ): - self.base = base # "gbm" or "mlp", specifically, we use lgbm for "gbm" - self.k = k + self.base_model = base_model # "gbm" or "mlp", specifically, we use lgbm for "gbm" + self.num_models = num_models # the number of sub-models self.enable_sr = enable_sr self.enable_fs = enable_fs self.alpha1 = alpha1 @@ -43,8 +43,8 @@ def 
__init__(
         if not len(sample_ratios) == bins_fs:
             raise ValueError("The length of sample_ratios should be equal to bins_fs.")
         self.sample_ratios = sample_ratios
-        if not len(sub_weights) == k:
-            raise ValueError("The length of sub_weights should be equal to k.")
+        if not len(sub_weights) == num_models:
+            raise ValueError("The length of sub_weights should be equal to num_models.")
         self.sub_weights = sub_weights
         self.epochs = epochs
         self.logger = get_module_logger("DEnsembleModel")
         self.logger.info("Double Ensemble Model...")
@@ -65,27 +65,27 @@ def fit(self, dataset: DatasetH):
         weights = pd.Series(np.ones(N, dtype=float))
         # initialize the features
         features = x_train.columns
-        pred_sub = pd.DataFrame(np.zeros((N, self.k), dtype=float), index=x_train.index)
-        # train k sub-models
-        for i_k in range(self.k):
+        pred_sub = pd.DataFrame(np.zeros((N, self.num_models), dtype=float), index=x_train.index)
+        # train sub-models
+        for k in range(self.num_models):
             self.sub_features.append(features)
-            self.logger.info("Training sub-model: ({}/{})".format(i_k + 1, self.k))
+            self.logger.info("Training sub-model: ({}/{})".format(k + 1, self.num_models))
             model_k = self.train_submodel(df_train, df_valid, weights, features)
             self.ensemble.append(model_k)
             # no further sample re-weight and feature selection needed for the last sub-model
-            if i_k + 1 == self.k:
+            if k + 1 == self.num_models:
                 break

             self.logger.info("Retrieving loss curve and loss values...")
             loss_curve = self.retrieve_loss_curve(model_k, df_train, features)
             pred_k = self.predict_sub(model_k, df_train, features)
-            pred_sub.iloc[:, i_k] = pred_k
-            pred_ensemble = pred_sub.iloc[:, : i_k + 1].mean(axis=1)
+            pred_sub.iloc[:, k] = pred_k
+            pred_ensemble = pred_sub.iloc[:, : k + 1].mean(axis=1)
             loss_values = pd.Series(self.get_loss(y_train.values.squeeze(), pred_ensemble.values))

             if self.enable_sr:
                 self.logger.info("Sample re-weighting...")
-                weights = self.sample_reweight(loss_curve, loss_values, i_k + 1)
+                weights = self.sample_reweight(loss_curve, loss_values, k + 1)

             if self.enable_fs:
                 self.logger.info("Feature selection...")
@@ -209,7 +209,7 @@ def get_loss(self, label, pred):
             raise ValueError("not implemented yet")

     def retrieve_loss_curve(self, model, df_train, features):
-        if self.base == "gbm":
+        if self.base_model == "gbm":
             num_trees = model.num_trees()
             x_train, y_train = df_train["feature"].loc[:, features], df_train["label"]
             # LightGBM needs a 1D array as its label
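[Editor's sketch, not part of the patch series] After PATCH 11, the model is constructed with `base_model`/`num_models` instead of `base`/`k`. The sketch below shows the resulting end-to-end call sequence; it assumes qlib with these patches is installed and that `dataset` is a `DatasetH` built from one of the Alpha158/Alpha360 handler configs above (with train/valid/test segments).

    # Hedged usage sketch for the final (post-rename) interface.
    from qlib.contrib.model.double_ensemble import DEnsembleModel

    model = DEnsembleModel(
        base_model="gbm",       # renamed from `base` in PATCH 11; "gbm" selects LightGBM
        loss="mse",
        num_models=6,           # renamed from `k`; the number of sub-models to train
        enable_sr=True,         # learning-trajectory-based sample reweighting
        enable_fs=True,         # shuffling-based feature selection
        decay=0.5,
        sample_ratios=[0.8, 0.7, 0.6, 0.5, 0.4],   # one ratio per FS bin (bins_fs=5)
        sub_weights=[1, 0.2, 0.2, 0.2, 0.2, 0.2],  # one weight per sub-model
        epochs=28,              # boosting rounds per LightGBM sub-model
    )
    model.fit(dataset)             # trains the sub-models, reweighting samples and
                                   # re-selecting features between rounds
    pred = model.predict(dataset)  # weighted sum of sub-model predictions on the
                                   # "test" segment, returned as a pandas Series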