From 39f88daaa7b78fdb6b2a10ce6518b6bf154f225c Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Thu, 7 Mar 2024 14:41:21 +0800 Subject: [PATCH 01/37] download orderbook data (#1754) * download orderbook data * fix CI error * fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * test fix CI error * optimize get_data code * optimize get_data code * optimize get_data code * optimize README --------- Co-authored-by: Linlang --- examples/benchmarks/TRA/src/model.py | 3 --- examples/orderbook_data/README.md | 5 +---- qlib/backtest/__init__.py | 16 +++++++++------- qlib/backtest/report.py | 8 +++++--- qlib/contrib/eva/alpha.py | 1 + qlib/contrib/model/pytorch_tra.py | 3 --- qlib/contrib/strategy/signal_strategy.py | 1 - qlib/model/ens/ensemble.py | 2 -- qlib/model/riskmodel/shrink.py | 4 +--- qlib/workflow/online/strategy.py | 1 - scripts/dump_bin.py | 4 +--- scripts/dump_pit.py | 8 +++++--- setup.py | 2 ++ tests/test_workflow.py | 6 +++++- 14 files changed, 30 insertions(+), 34 deletions(-) diff --git a/examples/benchmarks/TRA/src/model.py b/examples/benchmarks/TRA/src/model.py index affb115a10..ebafd6a521 100644 --- a/examples/benchmarks/TRA/src/model.py +++ b/examples/benchmarks/TRA/src/model.py @@ -324,7 +324,6 @@ def predict(self, dataset, segment="test"): class LSTM(nn.Module): - """LSTM Model Args: @@ -414,7 +413,6 @@ def forward(self, x): class Transformer(nn.Module): - """Transformer Model Args: @@ -475,7 +473,6 @@ def forward(self, x): class TRA(nn.Module): - """Temporal Routing Adaptor (TRA) TRA takes historical prediction errors & latent representation as inputs, diff --git a/examples/orderbook_data/README.md b/examples/orderbook_data/README.md index 059ee27056..890e11f41e 100644 --- a/examples/orderbook_data/README.md +++ b/examples/orderbook_data/README.md @@ -27,13 +27,11 @@ pip install arctic # NOTE: pip may fail to resolve the right package dependency 2. Please follow following steps to download example data ```bash cd examples/orderbook_data/ -wget http://fintech.msra.cn/stock_data/downloads/highfreq_orderboook_example_data.tar.bz2 -tar xf highfreq_orderboook_example_data.tar.bz2 +python ../../scripts/get_data.py download_data --target_dir . --file_name highfreq_orderbook_example_data.zip ``` 3. Please import the example data to your mongo db ```bash -cd examples/orderbook_data/ python create_dataset.py initialize_library # Initialization Libraries python create_dataset.py import_data # Initialization Libraries ``` @@ -42,7 +40,6 @@ python create_dataset.py import_data # Initialization Libraries After importing these data, you run `example.py` to create some high-frequency features. 
```bash -cd examples/orderbook_data/ pytest -s --disable-warnings example.py # If you want run all examples pytest -s --disable-warnings example.py::TestClass::test_exp_10 # If you want to run specific example ``` diff --git a/qlib/backtest/__init__.py b/qlib/backtest/__init__.py index d784aed57e..9daba91153 100644 --- a/qlib/backtest/__init__.py +++ b/qlib/backtest/__init__.py @@ -162,13 +162,15 @@ def create_account_instance( init_cash=init_cash, position_dict=position_dict, pos_type=pos_type, - benchmark_config={} - if benchmark is None - else { - "benchmark": benchmark, - "start_time": start_time, - "end_time": end_time, - }, + benchmark_config=( + {} + if benchmark is None + else { + "benchmark": benchmark, + "start_time": start_time, + "end_time": end_time, + } + ), ) diff --git a/qlib/backtest/report.py b/qlib/backtest/report.py index 8e7440ba9e..e7c6041efd 100644 --- a/qlib/backtest/report.py +++ b/qlib/backtest/report.py @@ -622,9 +622,11 @@ def cal_trade_indicators( print( "[Indicator({}) {}]: FFR: {}, PA: {}, POS: {}".format( freq, - trade_start_time - if isinstance(trade_start_time, str) - else trade_start_time.strftime("%Y-%m-%d %H:%M:%S"), + ( + trade_start_time + if isinstance(trade_start_time, str) + else trade_start_time.strftime("%Y-%m-%d %H:%M:%S") + ), fulfill_rate, price_advantage, positive_rate, diff --git a/qlib/contrib/eva/alpha.py b/qlib/contrib/eva/alpha.py index 95ec9b91e9..86d366d205 100644 --- a/qlib/contrib/eva/alpha.py +++ b/qlib/contrib/eva/alpha.py @@ -3,6 +3,7 @@ The interface should be redesigned carefully in the future. """ + import pandas as pd from typing import Tuple from qlib import get_module_logger diff --git a/qlib/contrib/model/pytorch_tra.py b/qlib/contrib/model/pytorch_tra.py index 964febf11c..bc9a6aa977 100644 --- a/qlib/contrib/model/pytorch_tra.py +++ b/qlib/contrib/model/pytorch_tra.py @@ -511,7 +511,6 @@ def predict(self, dataset, segment="test"): class RNN(nn.Module): - """RNN Model Args: @@ -601,7 +600,6 @@ def forward(self, x): class Transformer(nn.Module): - """Transformer Model Args: @@ -649,7 +647,6 @@ def forward(self, x): class TRA(nn.Module): - """Temporal Routing Adaptor (TRA) TRA takes historical prediction errors & latent representation as inputs, diff --git a/qlib/contrib/strategy/signal_strategy.py b/qlib/contrib/strategy/signal_strategy.py index 9ba960eebd..bad19ddfdc 100644 --- a/qlib/contrib/strategy/signal_strategy.py +++ b/qlib/contrib/strategy/signal_strategy.py @@ -373,7 +373,6 @@ def generate_trade_decision(self, execute_result=None): class EnhancedIndexingStrategy(WeightStrategyBase): - """Enhanced Indexing Strategy Enhanced indexing combines the arts of active management and passive management, diff --git a/qlib/model/ens/ensemble.py b/qlib/model/ens/ensemble.py index ede1f8e3ad..1ebb16f18b 100644 --- a/qlib/model/ens/ensemble.py +++ b/qlib/model/ens/ensemble.py @@ -30,7 +30,6 @@ def __call__(self, ensemble_dict: dict, *args, **kwargs): class SingleKeyEnsemble(Ensemble): - """ Extract the object if there is only one key and value in the dict. Make the result more readable. {Only key: Only value} -> Only value @@ -64,7 +63,6 @@ def __call__(self, ensemble_dict: Union[dict, object], recursion: bool = True) - class RollingEnsemble(Ensemble): - """Merge a dict of rolling dataframe like `prediction` or `IC` into an ensemble. NOTE: The values of dict must be pd.DataFrame, and have the index "datetime". 
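For context, the merge that the `RollingEnsemble` docstring above describes can be pictured with a minimal pandas sketch. This is an illustration of the documented contract only, not the class's exact implementation; `rolling_merge` is a hypothetical helper name.

```python
import pandas as pd

def rolling_merge(ensemble_dict: dict) -> pd.DataFrame:
    # Concatenate the rolling artifacts in key order along the "datetime" index,
    # keeping the most recent value wherever rolling windows overlap.
    merged = pd.concat([ensemble_dict[k] for k in sorted(ensemble_dict)], axis=0)
    return merged[~merged.index.duplicated(keep="last")].sort_index()
```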
diff --git a/qlib/model/riskmodel/shrink.py b/qlib/model/riskmodel/shrink.py index b2594f707d..c3c0e48ef8 100644 --- a/qlib/model/riskmodel/shrink.py +++ b/qlib/model/riskmodel/shrink.py @@ -247,9 +247,7 @@ def _get_shrink_param_lw_single_factor(self, X: np.ndarray, S: np.ndarray, F: np v1 = y.T.dot(z) / t - cov_mkt[:, None] * S roff1 = np.sum(v1 * cov_mkt[:, None].T) / var_mkt - np.sum(np.diag(v1) * cov_mkt) / var_mkt v3 = z.T.dot(z) / t - var_mkt * S - roff3 = ( - np.sum(v3 * np.outer(cov_mkt, cov_mkt)) / var_mkt**2 - np.sum(np.diag(v3) * cov_mkt**2) / var_mkt**2 - ) + roff3 = np.sum(v3 * np.outer(cov_mkt, cov_mkt)) / var_mkt**2 - np.sum(np.diag(v3) * cov_mkt**2) / var_mkt**2 roff = 2 * roff1 - roff3 rho = rdiag + roff diff --git a/qlib/workflow/online/strategy.py b/qlib/workflow/online/strategy.py index f2988d843f..d545e4bc9a 100644 --- a/qlib/workflow/online/strategy.py +++ b/qlib/workflow/online/strategy.py @@ -90,7 +90,6 @@ def get_collector(self) -> Collector: class RollingStrategy(OnlineStrategy): - """ This example strategy always uses the latest rolling model sas online models. """ diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py index 92abc8beec..a65b1f58ee 100644 --- a/scripts/dump_bin.py +++ b/scripts/dump_bin.py @@ -146,9 +146,7 @@ def get_dump_fields(self, df_columns: Iterable[str]) -> Iterable[str]: return ( self._include_fields if self._include_fields - else set(df_columns) - set(self._exclude_fields) - if self._exclude_fields - else df_columns + else set(df_columns) - set(self._exclude_fields) if self._exclude_fields else df_columns ) @staticmethod diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py index 34d304ed78..1ca9cfc942 100644 --- a/scripts/dump_pit.py +++ b/scripts/dump_pit.py @@ -132,9 +132,11 @@ def get_dump_fields(self, df: Iterable[str]) -> Iterable[str]: return ( set(self._include_fields) if self._include_fields - else set(df[self.field_column_name]) - set(self._exclude_fields) - if self._exclude_fields - else set(df[self.field_column_name]) + else ( + set(df[self.field_column_name]) - set(self._exclude_fields) + if self._exclude_fields + else set(df[self.field_column_name]) + ) ) def get_filenames(self, symbol, field, interval): diff --git a/setup.py b/setup.py index 508fd8c3a4..adafefd614 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,8 @@ def get_version(rel_path: str) -> str: # To ensure stable operation of the experiment manager, we have limited the version of mlflow, # and we need to verify whether version 2.0 of mlflow can serve qlib properly. "mlflow>=1.12.1, <=1.30.0", + # mlflow 1.30.0 requires packaging<22, so we limit the packaging version, otherwise the CI will fail. + "packaging<22", "tqdm", "loguru", "lightgbm>=3.3.0", diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 129abc0fbb..cf17b3d18a 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -9,7 +9,9 @@ class WorkflowTest(TestAutoData): - TMP_PATH = Path("./.mlruns_tmp/") + # Creating the directory manually doesn't work with mlflow, + # so we add a subfolder named .trash when we create the directory. 
+ TMP_PATH = Path("./.mlruns_tmp/.trash") def tearDown(self) -> None: if self.TMP_PATH.exists(): @@ -17,6 +19,8 @@ def tearDown(self) -> None: def test_get_local_dir(self): """ """ + self.TMP_PATH.mkdir(parents=True, exist_ok=True) + with R.start(uri=str(self.TMP_PATH)): pass From 1bb8f2fa23498b915bee50d71ff3e31d655fb022 Mon Sep 17 00:00:00 2001 From: Xisen Wang <118058822+Xisen-Wang@users.noreply.github.com> Date: Wed, 20 Mar 2024 12:48:52 +0000 Subject: [PATCH 02/37] Enhance README with LightGBM Installation Guidance for Mac M1 Users (#1766) * Update README.md * Update README.md * Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index a25c29c117..a9d5e4cc23 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,8 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor **Tips**: If you fail to install `Qlib` or run the examples in your environment, comparing your steps and the [CI workflow](.github/workflows/test_qlib_from_source.yml) may help you find the problem. +**Tips for Mac**: If you are using Mac with M1, you might encounter issues in building the wheel for LightGBM, which is due to missing dependencies from OpenMP. To solve the problem, install openmp first with ``brew install libomp`` and then run ``pip install .`` to build it successfully. + ## Data Preparation Load and prepare data by running the following code: From 194284b1ac88192770e1acc9eea89269bd490048 Mon Sep 17 00:00:00 2001 From: Young Date: Sun, 14 Apr 2024 00:09:30 +0800 Subject: [PATCH 03/37] Update version --- .github/workflows/python-publish.yml | 10 +++++----- .github/workflows/release-drafter.yml | 6 ++++++ .github/workflows/test_qlib_from_pip.yml | 5 ++++- .github/workflows/test_qlib_from_source.yml | 5 ++++- .github/workflows/test_qlib_from_source_slow.yml | 5 ++++- qlib/__init__.py | 2 +- 6 files changed, 24 insertions(+), 9 deletions(-) diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 5d88b2959a..9310cd5e97 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -51,8 +51,8 @@ jobs: python setup.py bdist_wheel - name: Build and publish env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | twine upload dist/* @@ -72,10 +72,10 @@ jobs: python-version: 3.7 - name: Install dependencies run: | - pip install twine + pip install twine - name: Build and publish env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | twine upload dist/pyqlib-*-manylinux*.whl diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml index 113a4f009f..7eefec5801 100644 --- a/.github/workflows/release-drafter.yml +++ b/.github/workflows/release-drafter.yml @@ -6,8 +6,14 @@ on: branches: - main +permissions: + contents: read + jobs: update_release_draft: + permissions: + contents: write + pull-requests: read runs-on: ubuntu-latest steps: # Drafts your next Release notes as Pull Requests are merged into "master" diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index bde41d8026..4cc842b223 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -13,7 +13,10 @@ jobs: runs-on: ${{ matrix.os }} strategy: 
matrix: - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-latest] + # Since macos-latest changed from 12.7.4 to 14.4.1, + # the minimum python version that matches a 14.4.1 version of macos is 3.10, + # so we limit the macos version to macos-12. + os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 9205a13641..38f32da8ed 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -14,7 +14,10 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-latest] + # Since macos-latest changed from 12.7.4 to 14.4.1, + # the minimum python version that matches a 14.4.1 version of macos is 3.10, + # so we limit the macos version to macos-12. + os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index caab6f444e..8725d4fe03 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -14,7 +14,10 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-latest] + # Since macos-latest changed from 12.7.4 to 14.4.1, + # the minimum python version that matches a 14.4.1 version of macos is 3.10, + # so we limit the macos version to macos-12. + os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/qlib/__init__.py b/qlib/__init__.py index ed95f589e4..8714df8d83 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. from pathlib import Path -__version__ = "0.9.3.99" +__version__ = "0.9.4" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From 3779b5186addad545683a4acde02d9c636b86261 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Wed, 8 May 2024 13:50:55 +0800 Subject: [PATCH 04/37] bump version (#1784) Co-authored-by: Linlang Lv (iSoftStone Information) --- qlib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 8714df8d83..98ba7f95ce 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
from pathlib import Path -__version__ = "0.9.4" +__version__ = "0.9.4.99" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From ea245f543516dc600d711d5ac8886431ba09b139 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Fri, 10 May 2024 11:04:59 +0800 Subject: [PATCH 05/37] Fix issue 1729 (#1776) * fix issue 1729 * fix issue 1729 * fix issue 1729 --------- Co-authored-by: Linlang Lv (iSoftStone Information) --- scripts/data_collector/cn_index/collector.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/scripts/data_collector/cn_index/collector.py b/scripts/data_collector/cn_index/collector.py index 96f68ef9cd..237df6fe87 100644 --- a/scripts/data_collector/cn_index/collector.py +++ b/scripts/data_collector/cn_index/collector.py @@ -396,14 +396,7 @@ def get_history_companies(self) -> pd.DataFrame: today = pd.Timestamp.now() date_range = pd.DataFrame(pd.date_range(start="2007-01-15", end=today, freq="7D"))[0].dt.date ret_list = [] - col = ["date", "symbol", "code_name"] for date in tqdm(date_range, desc="Download CSI500"): - rs = bs.query_zz500_stocks(date=str(date)) - zz500_stocks = [] - while (rs.error_code == "0") & rs.next(): - zz500_stocks.append(rs.get_row_data()) - result = pd.DataFrame(zz500_stocks, columns=col) - result["symbol"] = result["symbol"].apply(lambda x: x.replace(".", "").upper()) result = self.get_data_from_baostock(date) ret_list.append(result[["date", "symbol"]]) bs.logout() From b1e0e77c97b55ae0305d3d8c3192c61db032fcc6 Mon Sep 17 00:00:00 2001 From: Chuan Xu Date: Fri, 10 May 2024 01:09:39 -0400 Subject: [PATCH 06/37] Fix the bug of reading string NA as NaN in the function exists_qlib_data. (#1736) * Fix the bug of reading NA string as NaN in exists_qlib_data. * Fix the .gitignore file. * Update the fix and add some comments. 
* format with black --------- Co-authored-by: Chuan Xu Co-authored-by: Linlang Lv (iSoftStone Information) --- .gitignore | 2 +- qlib/utils/__init__.py | 70 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 8854c25e99..29ea1cd5e3 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,4 @@ tags *.swp ./pretrain -.idea/ +.idea/ \ No newline at end of file diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 9e63c104a1..732638b236 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -25,7 +25,12 @@ from pathlib import Path from typing import List, Union, Optional, Callable from packaging import version -from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer +from .file import ( + get_or_create_path, + save_multiple_parts_file, + unpack_archive_with_buffer, + get_tmp_file_with_buffer, +) from ..config import C from ..log import get_module_logger, set_log_with_config @@ -37,7 +42,12 @@ #################### Server #################### def get_redis_connection(): """get redis connection instance.""" - return redis.StrictRedis(host=C.redis_host, port=C.redis_port, db=C.redis_task_db, password=C.redis_password) + return redis.StrictRedis( + host=C.redis_host, + port=C.redis_port, + db=C.redis_task_db, + password=C.redis_password, + ) #################### Data #################### @@ -96,7 +106,14 @@ def get_period_offset(first_year, period, quarterly): return offset -def read_period_data(index_path, data_path, period, cur_date_int: int, quarterly, last_period_index: int = None): +def read_period_data( + index_path, + data_path, + period, + cur_date_int: int, + quarterly, + last_period_index: int = None, +): """ At `cur_date`(e.g. 20190102), read the information at `period`(e.g. 201803). Only the updating info before cur_date or at cur_date will be used. @@ -273,7 +290,10 @@ def parse_field(field): # \uff09 -> ) chinese_punctuation_regex = r"\u3001\uff1a\uff08\uff09" for pattern, new in [ - (rf"\$\$([\w{chinese_punctuation_regex}]+)", r'PFeature("\1")'), # $$ must be before $ + ( + rf"\$\$([\w{chinese_punctuation_regex}]+)", + r'PFeature("\1")', + ), # $$ must be before $ (rf"\$([\w{chinese_punctuation_regex}]+)", r'Feature("\1")'), (r"(\w+\s*)\(", r"Operators.\1("), ]: # Features # Operators @@ -383,7 +403,14 @@ def get_date_range(trading_date, left_shift=0, right_shift=0, future=False): return calendar -def get_date_by_shift(trading_date, shift, future=False, clip_shift=True, freq="day", align: Optional[str] = None): +def get_date_by_shift( + trading_date, + shift, + future=False, + clip_shift=True, + freq="day", + align: Optional[str] = None, +): """get trading date with shift bias will cur_date e.g. 
: shift == 1, return next trading date shift == -1, return previous trading date @@ -569,7 +596,38 @@ def exists_qlib_data(qlib_dir): # check instruments code_names = set(map(lambda x: fname_to_code(x.name.lower()), features_dir.iterdir())) _instrument = instruments_dir.joinpath("all.txt") - miss_code = set(pd.read_csv(_instrument, sep="\t", header=None).loc[:, 0].apply(str.lower)) - set(code_names) + # Removed two possible ticker names "NA" and "NULL" from the default na_values list for column 0 + miss_code = set( + pd.read_csv( + _instrument, + sep="\t", + header=None, + keep_default_na=False, + na_values={ + 0: [ + " ", + "#N/A", + "#N/A N/A", + "#NA", + "-1.#IND", + "-1.#QNAN", + "-NaN", + "-nan", + "1.#IND", + "1.#QNAN", + "", + "N/A", + "NaN", + "None", + "n/a", + "nan", + "null ", + ] + }, + ) + .loc[:, 0] + .apply(str.lower) + ) - set(code_names) if miss_code and any(map(lambda x: "sht" not in x, miss_code)): return False From 917e3a725e9464ed04a981571ac45fa268f97fdf Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Fri, 10 May 2024 15:42:41 +0900 Subject: [PATCH 07/37] Update dump_pit.py (#1759) seperated -> separated Co-authored-by: Linlang Lv (iSoftStone Information) --- scripts/dump_pit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py index 1ca9cfc942..806bbd0cc9 100644 --- a/scripts/dump_pit.py +++ b/scripts/dump_pit.py @@ -3,7 +3,7 @@ """ TODO: - A more well-designed PIT database is required. - - seperated insert, delete, update, query operations are required. + - separated insert, delete, update, query operations are required. """ import shutil From 6ed83f7c04175767165f7891cc0a80218d25d70f Mon Sep 17 00:00:00 2001 From: fei long Date: Fri, 17 May 2024 18:43:12 +0800 Subject: [PATCH 08/37] data_collector: cn_index: fix missing dependencies package in requirements.txt (#1770) add yahooquery and openpyxl in requirements.txt Signed-off-by: YuLong Yao Co-authored-by: Linlang Lv (iSoftStone Information) --- scripts/data_collector/cn_index/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/data_collector/cn_index/requirements.txt b/scripts/data_collector/cn_index/requirements.txt index bff59525cd..87933e9d9d 100644 --- a/scripts/data_collector/cn_index/requirements.txt +++ b/scripts/data_collector/cn_index/requirements.txt @@ -5,3 +5,5 @@ pandas lxml loguru tqdm +yahooquery +openpyxl From 2ae4be426a6645e58e50764deabd3ac0d770e15f Mon Sep 17 00:00:00 2001 From: playfund Date: Fri, 17 May 2024 18:45:07 +0800 Subject: [PATCH 09/37] Delete redundant copy() code to speed up (#1732) Delete redundant copy() code to speed up Co-authored-by: Linlang Lv (iSoftStone Information) --- qlib/data/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index 116827f232..1b1353ee4e 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -536,7 +536,6 @@ def get_column_names(fields): """ if len(fields) == 0: raise ValueError("fields cannot be empty") - fields = fields.copy() column_names = [str(f) for f in fields] return column_names From 8a087d0db9d9aec2e4a5685536e79ee75a4b45d7 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Fri, 17 May 2024 19:19:45 +0800 Subject: [PATCH 10/37] fix docs (#1721) * fix docs * modify file extension * modify file extension --------- Co-authored-by: Linlang Lv (iSoftStone Information) --- .readthedocs.yml => .readthedocs.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) 
rename .readthedocs.yml => .readthedocs.yaml (80%)

diff --git a/.readthedocs.yml b/.readthedocs.yaml
similarity index 80%
rename from .readthedocs.yml
rename to .readthedocs.yaml
index 7d4cb854ae..71b29a2279 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yaml
@@ -5,6 +5,12 @@
 # Required
 version: 2
 
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.7"
+
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   configuration: docs/conf.py
@@ -14,7 +20,6 @@ formats: all
 
 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.7
   install:
     - requirements: docs/requirements.txt
     - method: pip

From f79a0eeaffe6da63a5967aaf2e9f85652f9eec68 Mon Sep 17 00:00:00 2001
From: Linlang <30293408+SunsetWolf@users.noreply.github.com>
Date: Tue, 21 May 2024 04:23:55 +0800
Subject: [PATCH 11/37] fix docs (#1788)

Co-authored-by: Linlang Lv (iSoftStone Information)
---
 docs/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index c10a86d4ee..9444c55737 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -5,3 +5,4 @@ scipy
 scikit-learn
 pandas
 tianshou
+sphinx_rtd_theme

From 63021018d680b55d65f03c919c86332e8bad23d0 Mon Sep 17 00:00:00 2001
From: you-n-g
Date: Tue, 21 May 2024 08:15:18 +0800
Subject: [PATCH 12/37] Update README.md's dataset

---
 README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/README.md b/README.md
index a9d5e4cc23..65c4420e6b 100644
--- a/README.md
+++ b/README.md
@@ -175,6 +175,20 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor
 
 **Tips for Mac**: If you are using Mac with M1, you might encounter issues in building the wheel for LightGBM, which is due to missing dependencies from OpenMP. To solve the problem, install openmp first with ``brew install libomp`` and then run ``pip install .`` to build it successfully.
 
 ## Data Preparation
+❗ Due to a stricter data security policy, the official dataset is disabled temporarily. You can try [this data source](https://github.com/chenditc/investment_data/releases) contributed by the community.
+Here is an example to download the data updated on 20220720.
+```bash
+wget https://github.com/chenditc/investment_data/releases/download/20220720/qlib_bin.tar.gz
+mkdir -p ~/.qlib/qlib_data/cn_data
+tar -zxvf qlib_bin.tar.gz -C ~/.qlib/qlib_data/cn_data --strip-components=2
+rm -f qlib_bin.tar.gz
+```
+
+The official dataset below will resume in the near future.
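As a quick sanity check after extracting the community archive above, you can confirm that Qlib sees the data. This is a minimal illustration; the `provider_uri` simply matches the target directory used in the snippet.

```python
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")
# If the data was extracted correctly, this prints a few trading days.
print(D.calendar(start_time="2020-01-01", end_time="2020-01-10"))
```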
+ + +---- + Load and prepare data by running the following code: ### Get with module From 155f80323c6fc09e2f19a22767e20f569989d0cd Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Fri, 24 May 2024 12:59:50 +0800 Subject: [PATCH 13/37] fix get data error (#1793) * fix get data error * fix get v0 data error * optimize get_data code * fix pylint error * add comments --- examples/orderbook_data/README.md | 5 ++- qlib/tests/data.py | 68 ++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/examples/orderbook_data/README.md b/examples/orderbook_data/README.md index 890e11f41e..53fd523d7f 100644 --- a/examples/orderbook_data/README.md +++ b/examples/orderbook_data/README.md @@ -16,7 +16,7 @@ Current version of script with default value tries to connect localhost **via de Run following command to install necessary libraries ``` -pip install pytest coverage +pip install pytest coverage gdown pip install arctic # NOTE: pip may fail to resolve the right package dependency !!! Please make sure the dependency are satisfied. ``` @@ -27,7 +27,8 @@ pip install arctic # NOTE: pip may fail to resolve the right package dependency 2. Please follow following steps to download example data ```bash cd examples/orderbook_data/ -python ../../scripts/get_data.py download_data --target_dir . --file_name highfreq_orderbook_example_data.zip +gdown https://drive.google.com/uc?id=15nZF7tFT_eKVZAcMFL1qPS4jGyJflH7e # Proxies may be necessary here. +python ../../scripts/get_data.py _unzip --file_path highfreq_orderbook_example_data.zip --target_dir . ``` 3. Please import the example data to your mongo db diff --git a/qlib/tests/data.py b/qlib/tests/data.py index f6bd780905..2fa76855b5 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -12,15 +12,11 @@ from tqdm import tqdm from pathlib import Path from loguru import logger -from cryptography.fernet import Fernet from qlib.utils import exists_qlib_data class GetData: - REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" - # "?" is not included in the token. - TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" - KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" + REMOTE_URL = "https://github.com/SunsetWolf/qlib_dataset/releases/download" def __init__(self, delete_zip_file=False): """ @@ -33,9 +29,45 @@ def __init__(self, delete_zip_file=False): self.delete_zip_file = delete_zip_file def merge_remote_url(self, file_name: str): - fernet = Fernet(self.KEY) - token = fernet.decrypt(self.TOKEN).decode() - return f"{self.REMOTE_URL}/{file_name}?{token}" + """ + Generate download links. + + Parameters + ---------- + file_name: str + The name of the file to be downloaded. + The file name can be accompanied by a version number, (e.g.: v2/qlib_data_simple_cn_1d_latest.zip), + if no version number is attached, it will be downloaded from v0 by default. + """ + return f"{self.REMOTE_URL}/{file_name}" if "/" in file_name else f"{self.REMOTE_URL}/v0/{file_name}" + + def download(self, url: str, target_path: [Path, str]): + """ + Download a file from the specified url. + + Parameters + ---------- + url: str + The url of the data. + target_path: str + The location where the data is saved, including the file name. 
+ """ + file_name = str(target_path).rsplit("/", maxsplit=1)[-1] + resp = requests.get(url, stream=True, timeout=60) + resp.raise_for_status() + if resp.status_code != 200: + raise requests.exceptions.HTTPError() + + chunk_size = 1024 + logger.warning( + f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)" + ) + logger.info(f"{os.path.basename(file_name)} downloading......") + with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: + with target_path.open("wb") as fp: + for chunk in resp.iter_content(chunk_size=chunk_size): + fp.write(chunk) + p_bar.update(chunk_size) def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): """ @@ -70,21 +102,7 @@ def download_data(self, file_name: str, target_dir: [Path, str], delete_old: boo target_path = target_dir.joinpath(_target_file_name) url = self.merge_remote_url(file_name) - resp = requests.get(url, stream=True, timeout=60) - resp.raise_for_status() - if resp.status_code != 200: - raise requests.exceptions.HTTPError() - - chunk_size = 1024 - logger.warning( - f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. (You can refer to the original data source: https://finance.yahoo.com/lookup.)" - ) - logger.info(f"{os.path.basename(file_name)} downloading......") - with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: - with target_path.open("wb") as fp: - for chunk in resp.iter_content(chunk_size=chunk_size): - fp.write(chunk) - p_bar.update(chunk_size) + self.download(url=url, target_path=target_path) self._unzip(target_path, target_dir, delete_old) if self.delete_zip_file: @@ -99,7 +117,9 @@ def check_dataset(self, file_name: str): return status @staticmethod - def _unzip(file_path: Path, target_dir: Path, delete_old: bool = True): + def _unzip(file_path: [Path, str], target_dir: [Path, str], delete_old: bool = True): + file_path = Path(file_path) + target_dir = Path(target_dir) if delete_old: logger.warning( f"will delete the old qlib data directory(features, instruments, calendars, features_cache, dataset_cache): {target_dir}" From b892b21045df4cfc65fde9f7b9714e21bbbf41af Mon Sep 17 00:00:00 2001 From: Linlang Date: Fri, 24 May 2024 15:14:49 +0800 Subject: [PATCH 14/37] update version --- qlib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 98ba7f95ce..39935fd3c4 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. from pathlib import Path -__version__ = "0.9.4.99" +__version__ = "0.9.5" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From 02fe6b6974573a730add42036d13301eb346e3e8 Mon Sep 17 00:00:00 2001 From: Linlang Date: Fri, 24 May 2024 16:38:48 +0800 Subject: [PATCH 15/37] bump verison --- qlib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 39935fd3c4..fca74e4567 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
from pathlib import Path -__version__ = "0.9.5" +__version__ = "0.9.5.99" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From 907c888c23a5e40fa13ceccb6028b29d85aedfc1 Mon Sep 17 00:00:00 2001 From: igeni Date: Tue, 28 May 2024 07:13:12 +0300 Subject: [PATCH 16/37] changed concat of strings to f-strings and redundant type conversion was removed (#1767) Co-authored-by: Linlang --- qlib/data/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/client.py b/qlib/data/client.py index b6733fd3ad..a9b4b2edf7 100644 --- a/qlib/data/client.py +++ b/qlib/data/client.py @@ -35,7 +35,7 @@ def __init__(self, host, port): def connect_server(self): """Connect to server.""" try: - self.sio.connect("ws://" + self.server_host + ":" + str(self.server_port)) + self.sio.connect(f"ws://{self.server_host}:{self.server_port}") except socketio.exceptions.ConnectionError: self.logger.error("Cannot connect to server - check your network or server status") From 598017f634ad95e223d2130b76ea83d0da73838a Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 29 May 2024 17:44:18 +0800 Subject: [PATCH 17/37] Update Dev in README.md (#1800) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 65c4420e6b..2f833f04d4 100644 --- a/README.md +++ b/README.md @@ -166,7 +166,7 @@ Also, users can install the latest dev version ``Qlib`` by the source code accor * Clone the repository and install ``Qlib`` as follows. ```bash git clone https://github.com/microsoft/qlib.git && cd qlib - pip install . + pip install . # `pip install -e .[dev]` is recommended for development. check details in docs/developer/code_standard_and_dev_guide.rst ``` **Note**: You can install Qlib with `python setup.py install` as well. But it is not the recommended approach. It will skip `pip` and cause obscure problems. For example, **only** the command ``pip install .`` **can** overwrite the stable version installed by ``pip install pyqlib``, while the command ``python setup.py install`` **can't**. From 35e0fdd1c036ff03d029f3a49dfc0ccfa1bab593 Mon Sep 17 00:00:00 2001 From: Hao Zhao Date: Sat, 1 Jun 2024 08:07:34 +0800 Subject: [PATCH 18/37] fix the bug that the HS_SYMBOLS_URL is 404 (#1758) * fix the bug that the HS_SYMBOLS_URL is 404 * fix bug * format with black * fix pylint error * change error code * fix ci error * fix ci error * optimize code * optimize code * add comments --------- Co-authored-by: Linlang --- .github/workflows/test_qlib_from_pip.yml | 3 ++ scripts/data_collector/utils.py | 49 ++++++++++++++++++------ setup.py | 3 ++ 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index 4cc842b223..4b9fa7c34d 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -45,6 +45,9 @@ jobs: - name: Qlib installation test run: | + # 2024-05-30 scs has released a new version: 3.2.4.post2, + # This will cause the CI to fail, so we have limited the version of scs for now. 
+ python -m pip install "scs<=3.2.4" python -m pip install pyqlib - name: Install Lightgbm for MacOS diff --git a/scripts/data_collector/utils.py b/scripts/data_collector/utils.py index 596eae60ef..feec170bb1 100644 --- a/scripts/data_collector/utils.py +++ b/scripts/data_collector/utils.py @@ -15,7 +15,6 @@ import numpy as np import pandas as pd -from lxml import etree from loguru import logger from yahooquery import Ticker from tqdm import tqdm @@ -190,17 +189,43 @@ def get_hs_stock_symbols() -> list: global _HS_SYMBOLS # pylint: disable=W0603 def _get_symbol(): - _res = set() - for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")): - resp = requests.get(HS_SYMBOLS_URL.format(s_type=_k), timeout=None) - _res |= set( - map( - lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v), # pylint: disable=W0640 - etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"), # pylint: disable=I1101 - ) - ) - time.sleep(3) - return _res + """ + Get the stock pool from a web page and process it into the format required by yahooquery. + Format of data retrieved from the web page: 600519, 000001 + The data format required by yahooquery: 600519.ss, 000001.sz + + Returns + ------- + set: Returns the set of symbol codes. + + Examples: + ------- + {600000.ss, 600001.ss, 600002.ss, 600003.ss, ...} + """ + url = "http://99.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&po=1&np=1&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12" + try: + resp = requests.get(url, timeout=None) + resp.raise_for_status() + except requests.exceptions.HTTPError as e: + raise requests.exceptions.HTTPError(f"Request to {url} failed with status code {resp.status_code}") from e + + try: + _symbols = [_v["f12"] for _v in resp.json()["data"]["diff"]] + except Exception as e: + logger.warning("An error occurred while extracting data from the response.") + raise + + if len(_symbols) < 3900: + raise ValueError("The complete list of stocks is not available.") + + # Add suffix after the stock code to conform to yahooquery standard, otherwise the data will not be fetched. + _symbols = [ + _symbol + ".ss" if _symbol.startswith("6") else _symbol + ".sz" if _symbol.startswith(("0", "3")) else None + for _symbol in _symbols + ] + _symbols = [_symbol for _symbol in _symbols if _symbol is not None] + + return set(_symbols) if _HS_SYMBOLS is None: symbols = set() diff --git a/setup.py b/setup.py index adafefd614..1feabd30c1 100644 --- a/setup.py +++ b/setup.py @@ -166,6 +166,9 @@ def get_version(rel_path: str) -> str: "lxml", "baostock", "yahooquery", + # 2024-05-30 scs has released a new version: 3.2.4.post2, + # this version, causes qlib installation to fail, so we've limited the scs version a bit for now. 
+ "scs<=3.2.4", "beautifulsoup4", # In version 0.4.11 of tianshou, the code: # logits, hidden = self.actor(batch.obs, state=state, info=batch.info) From 7db83d84b7a39d78718e06d863a3cbc6449b2d0d Mon Sep 17 00:00:00 2001 From: block-gpt <5486391+block-gpt@users.noreply.github.com> Date: Sat, 1 Jun 2024 19:33:23 +0800 Subject: [PATCH 19/37] Update utils.py for typo (#1751) Fix typo Co-authored-by: Linlang --- qlib/data/dataset/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index f19dfe08fa..688cde99af 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -9,7 +9,7 @@ from qlib.data.dataset import DataHandler -def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int: +def get_level_index(df: pd.DataFrame, level: Union[str, int]) -> int: """ get the level index of `df` given `level` From 41b94059aab19c8616efb8d54728a5ee58d9cf97 Mon Sep 17 00:00:00 2001 From: Yang <3349368+m3ngyang@users.noreply.github.com> Date: Sun, 2 Jun 2024 06:54:39 +0800 Subject: [PATCH 20/37] fix panic during normalizing the invalid data (#1698) * fix panic during normalizing the invalid data * fix yaml load * change error to warning * change error code * optimize code --------- Co-authored-by: Linlang --- scripts/data_collector/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index 2517e9bce8..2efc2feadc 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -301,6 +301,7 @@ def _executor(self, file_path: Path): na_values={col: symbol_na if col == self._symbol_field_name else default_na for col in columns}, ) + # NOTE: It has been reported that there may be some problems here, and the specific issues will be dealt with when they are identified. df = self._normalize_obj.normalize(df) if df is not None and not df.empty: if self._end_date is not None: From 155c17f8ff83b503ad9731f1d6ce53858b078731 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Thu, 6 Jun 2024 13:39:49 +0800 Subject: [PATCH 21/37] fix logo display error (#1804) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f833f04d4..773eeaf39b 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ Recent released features Features released before 2021 are not listed here.
<p align="center">
-<img src="…old logo URL, lost in extraction…" />
+<img src="…new logo URL, lost in extraction…" />
</p>
Qlib is an open-source, AI-oriented quantitative investment platform that aims to realize the potential, empower research, and create value using AI technologies in quantitative investment, from exploring ideas to implementing productions. Qlib supports diverse machine learning modeling paradigms, including supervised learning, market dynamics modeling, and reinforcement learning. From 73ec0f40036aa12a9d26e46984ae5bb9ad8443f5 Mon Sep 17 00:00:00 2001 From: raikiriww Date: Wed, 19 Jun 2024 17:31:47 +0800 Subject: [PATCH 22/37] Add "mse" metric option to ALSTM.metric_fn (#1810) --- qlib/contrib/model/pytorch_alstm_ts.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index 008d789402..3fb7cb9e19 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -160,6 +160,10 @@ def metric_fn(self, pred, label): if self.metric in ("", "loss"): return -self.loss_fn(pred[mask], label[mask]) + elif self.metric == "mse": + mask = ~torch.isnan(label) + weight = torch.ones_like(label) + return -self.mse(pred[mask], label[mask], weight[mask]) raise ValueError("unknown metric `%s`" % self.metric) From 37b908792bbfb77882df7a872186548317f5d1fa Mon Sep 17 00:00:00 2001 From: Lee Yuntong <89683513+akazeakari@users.noreply.github.com> Date: Wed, 19 Jun 2024 17:31:57 +0800 Subject: [PATCH 23/37] Fix typo (#1809) Co-authored-by: LeeYuntong --- qlib/data/data.py | 2 +- qlib/workflow/task/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/qlib/data/data.py b/qlib/data/data.py index 1b1353ee4e..aba75c0b1a 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -616,7 +616,7 @@ def inst_calculator(inst, start_time, end_time, freq, column_names, spans=None, data = pd.DataFrame(obj) if not data.empty and not np.issubdtype(data.index.dtype, np.dtype("M")): - # If the underlaying provides the data not in datatime formmat, we'll convert it into datetime format + # If the underlaying provides the data not in datetime format, we'll convert it into datetime format _calendar = Cal.calendar(freq=freq) data.index = _calendar[data.index.values.astype(int)] data.index.names = ["datetime"] diff --git a/qlib/workflow/task/utils.py b/qlib/workflow/task/utils.py index 19837b3c79..4b4a7c06b8 100644 --- a/qlib/workflow/task/utils.py +++ b/qlib/workflow/task/utils.py @@ -242,7 +242,7 @@ def _add_step(self, index, step): def shift(self, seg: tuple, step: int, rtype=SHIFT_SD) -> tuple: """ - Shift the datatime of segment + Shift the datetime of segment If there are None (which indicates unbounded index) in the segment, this method will return None. 
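The shifting rule that the `shift` docstring above describes can be sketched in a few lines. This is an illustration only; `shift_segment` and the toy calendar are hypothetical, not qlib's actual `TimeAdjuster` implementation.

```python
def shift_segment(seg, step, calendar):
    # A None endpoint marks an unbounded index, so the shifted segment is undefined.
    if any(t is None for t in seg):
        return None
    # Move each endpoint by `step` positions along the trading calendar.
    return tuple(calendar[calendar.index(t) + step] for t in seg)

calendar = ["2024-06-18", "2024-06-19", "2024-06-20", "2024-06-21", "2024-06-24"]
print(shift_segment(("2024-06-18", "2024-06-20"), 1, calendar))  # ('2024-06-19', '2024-06-21')
print(shift_segment((None, "2024-06-20"), 1, calendar))          # None
```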
From 3a348aec9f6e7ffc46f3b5aeba6ac971f6f73d36 Mon Sep 17 00:00:00 2001 From: Lee Yuntong <89683513+akazeakari@users.noreply.github.com> Date: Thu, 20 Jun 2024 18:12:07 +0800 Subject: [PATCH 24/37] Fix typo (#1811) Co-authored-by: LeeYuntong --- docs/component/model.rst | 2 +- examples/workflow_by_code.ipynb | 2 +- qlib/model/trainer.py | 2 +- scripts/data_collector/crypto/README.md | 2 +- tests/test_all_pipeline.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/component/model.rst b/docs/component/model.rst index e0c630ccaa..60cfa58fab 100644 --- a/docs/component/model.rst +++ b/docs/component/model.rst @@ -86,7 +86,7 @@ Example }, } - # model initiaiton + # model initialization model = init_instance_by_config(task["model"]) dataset = init_instance_by_config(task["dataset"]) diff --git a/examples/workflow_by_code.ipynb b/examples/workflow_by_code.ipynb index ebdf2d33bb..2cf3f1fa2a 100644 --- a/examples/workflow_by_code.ipynb +++ b/examples/workflow_by_code.ipynb @@ -161,7 +161,7 @@ " },\n", "}\n", "\n", - "# model initiaiton\n", + "# model initialization\n", "model = init_instance_by_config(task[\"model\"])\n", "dataset = init_instance_by_config(task[\"dataset\"])\n", "\n", diff --git a/qlib/model/trainer.py b/qlib/model/trainer.py index 65842d81fa..606c2154e2 100644 --- a/qlib/model/trainer.py +++ b/qlib/model/trainer.py @@ -41,7 +41,7 @@ def _log_task_info(task_config: dict): def _exe_task(task_config: dict): rec = R.get_recorder() - # model & dataset initiation + # model & dataset initialization model: Model = init_instance_by_config(task_config["model"], accept_types=Model) dataset: Dataset = init_instance_by_config(task_config["dataset"], accept_types=Dataset) reweighter: Reweighter = task_config.get("reweighter", None) diff --git a/scripts/data_collector/crypto/README.md b/scripts/data_collector/crypto/README.md index b88d80c063..13943f07ee 100644 --- a/scripts/data_collector/crypto/README.md +++ b/scripts/data_collector/crypto/README.md @@ -9,7 +9,7 @@ pip install -r requirements.txt ``` ## Usage of the dataset -> *Crypto dateset only support Data retrieval function but not support backtest function due to the lack of OHLC data.* +> *Crypto dataset only support Data retrieval function but not support backtest function due to the lack of OHLC data.* ## Collector Data diff --git a/tests/test_all_pipeline.py b/tests/test_all_pipeline.py index d0f48564db..7bbdaefe3c 100644 --- a/tests/test_all_pipeline.py +++ b/tests/test_all_pipeline.py @@ -27,7 +27,7 @@ def train(uri_path: str = None): model performance """ - # model initiaiton + # model initialization model = init_instance_by_config(CSI300_GBDT_TASK["model"]) dataset = init_instance_by_config(CSI300_GBDT_TASK["dataset"]) # To test __repr__ From ebc0ca893eaedff46d17b4b73731d16d5979bf07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=88=E5=B1=B9=E5=8D=8E?= <37462254+YeewahChan@users.noreply.github.com> Date: Fri, 21 Jun 2024 09:25:23 +0800 Subject: [PATCH 25/37] Fix TSDataSampler Slicing Bug #1716 (#1803) * Fix TSDataSampler Slicing Bug #1716 * Fix TSDataSampler Slicing Bug #1716 * Fix TSDataSampler Slicing Bug #1716 * Fix TSDataSampler Slicing Bug with simplyer implmentation#1716 with Simplified Implementation * Refactor: Fix CI errors by addressing pylint formatting issues * Refactor: Remove extraneous whitespace for improved code formatting with Black --- qlib/data/dataset/__init__.py | 2 +- tests/data_mid_layer_tests/test_dataset.py | 51 +++++++++++++++++++++- 2 files changed, 51 insertions(+), 2 
deletions(-) diff --git a/qlib/data/dataset/__init__.py b/qlib/data/dataset/__init__.py index aacd58389a..0b6c552a37 100644 --- a/qlib/data/dataset/__init__.py +++ b/qlib/data/dataset/__init__.py @@ -403,7 +403,7 @@ def __init__( np.full((1, self.data_arr.shape[1]), np.nan, dtype=self.data_arr.dtype), axis=0, ) - self.nan_idx = -1 # The last line is all NaN + self.nan_idx = len(self.data_arr) - 1 # The last line is all NaN; setting it to -1 can cause bug #1716 # the data type will be changed # The index of usable data is between start_idx and end_idx diff --git a/tests/data_mid_layer_tests/test_dataset.py b/tests/data_mid_layer_tests/test_dataset.py index dc2ec812f1..9eb2083aa7 100755 --- a/tests/data_mid_layer_tests/test_dataset.py +++ b/tests/data_mid_layer_tests/test_dataset.py @@ -5,8 +5,9 @@ import pytest import sys from qlib.tests import TestAutoData -from qlib.data.dataset import TSDatasetH +from qlib.data.dataset import TSDatasetH, TSDataSampler import numpy as np +import pandas as pd import time from qlib.data.dataset.handler import DataHandlerLP @@ -98,6 +99,54 @@ def testTSDataset(self): print(idx[i]) +class TestTSDataSampler(unittest.TestCase): + def test_TSDataSampler(self): + """ + Test TSDataSampler for issue #1716 + """ + datetime_list = ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30", "2000-05-31"] + instruments = ["000001", "000002", "000003", "000004", "000005"] + index = pd.MultiIndex.from_product( + [pd.to_datetime(datetime_list), instruments], names=["datetime", "instrument"] + ) + data = np.random.randn(len(datetime_list) * len(instruments)) + test_df = pd.DataFrame(data=data, index=index, columns=["factor"]) + dataset = TSDataSampler(test_df, datetime_list[0], datetime_list[-1], step_len=2) + print() + print("--------------dataset[0]--------------") + print(dataset[0]) + print("--------------dataset[1]--------------") + print(dataset[1]) + assert len(dataset[0]) == 2 + self.assertTrue(np.isnan(dataset[0][0])) + self.assertEqual(dataset[0][1], dataset[1][0]) + self.assertEqual(dataset[1][1], dataset[2][0]) + self.assertEqual(dataset[2][1], dataset[3][0]) + + def test_TSDataSampler2(self): + """ + Extra test TSDataSampler to prevent incorrect filling of nan for the values at the front + """ + datetime_list = ["2000-01-31", "2000-02-29", "2000-03-31", "2000-04-30", "2000-05-31"] + instruments = ["000001", "000002", "000003", "000004", "000005"] + index = pd.MultiIndex.from_product( + [pd.to_datetime(datetime_list), instruments], names=["datetime", "instrument"] + ) + data = np.random.randn(len(datetime_list) * len(instruments)) + test_df = pd.DataFrame(data=data, index=index, columns=["factor"]) + dataset = TSDataSampler(test_df, datetime_list[2], datetime_list[-1], step_len=3) + print() + print("--------------dataset[0]--------------") + print(dataset[0]) + print("--------------dataset[1]--------------") + print(dataset[1]) + for i in range(3): + self.assertFalse(np.isnan(dataset[0][i])) + self.assertFalse(np.isnan(dataset[1][i])) + self.assertEqual(dataset[0][1], dataset[1][0]) + self.assertEqual(dataset[0][2], dataset[1][1]) + + if __name__ == "__main__": unittest.main(verbosity=10) From 47bd13295b48c591ae5d4b864690f7afd6e30d42 Mon Sep 17 00:00:00 2001 From: Fivele-Li <128388363+Fivele-Li@users.noreply.github.com> Date: Fri, 21 Jun 2024 11:22:23 +0800 Subject: [PATCH 26/37] Fix Yahoo daily data format inconsistent (#1517) * Fix FutureWarning: Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. 
Pass 'datetime64[ns]' instead * align index format while end date contains current day data * fix black * fix black * optimize code * optimize code * optimize code * fix ci error * check ci error * fix ci error * check ci error * check ci error * check ci error * check ci error * check ci error * check ci error * fix ci error * fix ci error * fix ci error * fix ci error * fix ci error --------- Co-authored-by: Cadenza-Li <362237642@qq.com> Co-authored-by: Linlang --- .github/workflows/test_qlib_from_pip.yml | 3 +++ .github/workflows/test_qlib_from_source.yml | 9 ++++++++- scripts/data_collector/yahoo/collector.py | 3 +++ setup.py | 4 ++-- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index 4b9fa7c34d..fd1e8c4cf4 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -68,5 +68,8 @@ jobs: cd qlib - name: Test workflow by config + # On macos-11 system, it will lead to "Segmentation fault: 11" error, + # which may be caused by the excessive memory overhead of macos-11 system, so we disable macos-11 temporarily here. + if: ${{ matrix.os != 'macos-11' }} run: | qrun examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 38f32da8ed..885d8fa439 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -72,8 +72,10 @@ jobs: black . -l 120 --check --diff - name: Make html with sphinx + # Since read the docs builds on ubuntu 22.04, we only need to test that the build passes on ubuntu 22.04. + if: ${{ matrix.os == 'ubuntu-22.04' }} run: | - cd docs + cd docs sphinx-build -W --keep-going -b html . _build cd .. @@ -159,11 +161,16 @@ jobs: # Run after data downloads - name: Check Qlib ipynb with nbconvert + # Running the nbconvert check on a macos-11 system results in a "Kernel died" error, so we've temporarily disabled macos-11 here. + if: ${{ matrix.os != 'macos-11' }} run: | # add more ipynb files in future jupyter nbconvert --to notebook --execute examples/workflow_by_code.ipynb - name: Test workflow by config (install from source) + # On macos-11 system, it will lead to "Segmentation fault: 11" error, + # which may be caused by the excessive memory overhead of macos-11 system, so we disable macos-11 temporarily here. 
+ if: ${{ matrix.os != 'macos-11' }} run: | python -m pip install numba python qlib/workflow/cli.py examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml diff --git a/scripts/data_collector/yahoo/collector.py b/scripts/data_collector/yahoo/collector.py index 25e2963883..d2fa0b06f7 100644 --- a/scripts/data_collector/yahoo/collector.py +++ b/scripts/data_collector/yahoo/collector.py @@ -796,6 +796,9 @@ def download_data( # get 1m data $ python collector.py download_data --source_dir ~/.qlib/stock_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1m """ + if self.interval == "1d" and pd.Timestamp(end) > pd.Timestamp(datetime.datetime.now().strftime("%Y-%m-%d")): + raise ValueError(f"end_date: {end} is greater than the current date.") + super(Run, self).download_data(max_collector_count, delay, start, end, check_data_length, limit_nums) def normalize_data( diff --git a/setup.py b/setup.py index 1feabd30c1..a0dc9962c6 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ def get_version(rel_path: str) -> str: REQUIRED = [ "numpy>=1.12.0, <1.24", "pandas>=0.25.1", - "scipy>=1.0.0", + "scipy>=1.7.3", "requests>=2.18.0", "sacred>=0.7.4", "python-socketio", @@ -82,7 +82,7 @@ def get_version(rel_path: str) -> str: "dill", "dataclasses;python_version<'3.7'", "filelock", - "jinja2<3.1.0", # for passing the readthedocs workflow. + "jinja2", "gym", # Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail. "protobuf<=3.20.1;python_version<='3.8'", From 33482047dcc24a3cd27c695e916cc9fc5168c0c6 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Fri, 21 Jun 2024 13:05:53 +0800 Subject: [PATCH 27/37] change weight data download url (#1812) --- examples/portfolio/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/portfolio/README.md b/examples/portfolio/README.md index 829e19ea80..57e86e15aa 100644 --- a/examples/portfolio/README.md +++ b/examples/portfolio/README.md @@ -20,7 +20,7 @@ We use China stock market data for our example. 1. Prepare CSI300 weight: ```bash - wget http://fintech.msra.cn/stock_data/downloads/csi300_weight.zip + wget https://github.com/SunsetWolf/qlib_dataset/releases/download/v0/csi300_weight.zip unzip -d ~/.qlib/qlib_data/cn_data csi300_weight.zip rm -f csi300_weight.zip ``` From a339fc11d1984e2a66f682195c030570277bd001 Mon Sep 17 00:00:00 2001 From: cyncyw <47289405+taozhiwang@users.noreply.github.com> Date: Mon, 24 Jun 2024 03:33:45 -0400 Subject: [PATCH 28/37] add a note for code standard (#1814) * add a note for code standard * handle both cases --------- Co-authored-by: taozhiwang --- docs/developer/code_standard_and_dev_guide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developer/code_standard_and_dev_guide.rst b/docs/developer/code_standard_and_dev_guide.rst index 87f193b8e5..26aeb4f0aa 100644 --- a/docs/developer/code_standard_and_dev_guide.rst +++ b/docs/developer/code_standard_and_dev_guide.rst @@ -60,4 +60,4 @@ The `[dev]` option will help you to install some related packages when developin .. 
code-block:: bash - pip install -e .[dev] + pip install -e ".[dev]" \ No newline at end of file From cde80206e416be919c6652b4fd07f0cba0520789 Mon Sep 17 00:00:00 2001 From: cyncyw <47289405+taozhiwang@users.noreply.github.com> Date: Mon, 24 Jun 2024 03:34:48 -0400 Subject: [PATCH 29/37] Update index_data.py for datatype conversion and alignment (#1813) * Update index_data.py for data convertion and alignment * Update qlib/utils/index_data.py * Update qlib/utils/index_data.py * fix linting --------- Co-authored-by: taozhiwang Co-authored-by: you-n-g --- qlib/utils/index_data.py | 13 ++++++++++++- tests/misc/test_index_data.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/qlib/utils/index_data.py b/qlib/utils/index_data.py index 113f9802d7..6c4525ce36 100644 --- a/qlib/utils/index_data.py +++ b/qlib/utils/index_data.py @@ -108,6 +108,12 @@ def __init__(self, idx_list: Union[List, pd.Index, "Index", int]): self.index_map = self.idx_list = np.arange(idx_list) self._is_sorted = True else: + # Check if all elements in idx_list are of the same type + if not all(isinstance(x, type(idx_list[0])) for x in idx_list): + raise TypeError("All elements in idx_list must be of the same type") + # Check if all elements in idx_list are of the same datetime64 precision + if isinstance(idx_list[0], np.datetime64) and not all(x.dtype == idx_list[0].dtype for x in idx_list): + raise TypeError("All elements in idx_list must be of the same datetime64 precision") self.idx_list = np.array(idx_list) # NOTE: only the first appearance is indexed self.index_map = dict(zip(self.idx_list, range(len(self)))) @@ -131,7 +137,12 @@ def _convert_type(self, item): if self.idx_list.dtype.type is np.datetime64: if isinstance(item, pd.Timestamp): # This happens often when creating index based on pandas.DatetimeIndex and query with pd.Timestamp - return item.to_numpy() + return item.to_numpy().astype(self.idx_list.dtype) + elif isinstance(item, np.datetime64): + # This happens often when creating index based on np.datetime64 and query with another precision + return item.astype(self.idx_list.dtype) + # NOTE: It is hard to consider every case at first. 
+ # We just try to cover part of cases to make it more user-friendly return item def index(self, item) -> int: diff --git a/tests/misc/test_index_data.py b/tests/misc/test_index_data.py index 2db644f8a6..b3045a5c7f 100644 --- a/tests/misc/test_index_data.py +++ b/tests/misc/test_index_data.py @@ -94,6 +94,24 @@ def test_corner_cases(self): print(sd) self.assertTrue(sd.iloc[0] == 2) + # test different precisions of time data + timeindex = [ + np.datetime64("2024-06-22T00:00:00.000000000"), + np.datetime64("2024-06-21T00:00:00.000000000"), + np.datetime64("2024-06-20T00:00:00.000000000"), + ] + sd = idd.SingleData([1, 2, 3], index=timeindex) + self.assertTrue( + sd.index.index(np.datetime64("2024-06-21T00:00:00.000000000")) + == sd.index.index(np.datetime64("2024-06-21T00:00:00")) + ) + self.assertTrue(sd.index.index(pd.Timestamp("2024-06-21 00:00")) == 1) + + # Bad case: the input is not aligned + timeindex[1] = (np.datetime64("2024-06-21T00:00:00.00"),) + with self.assertRaises(TypeError): + sd = idd.SingleData([1, 2, 3], index=timeindex) + def test_ops(self): sd1 = idd.SingleData([1, 2, 3, 4], index=["foo", "bar", "f", "g"]) sd2 = idd.SingleData([1, 2, 3, 4], index=["foo", "bar", "f", "g"]) From 5190332c7ecdc274e8d4af5e5ef8a4c4f0ee6963 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 26 Jun 2024 18:34:00 +0800 Subject: [PATCH 30/37] Add some misc features. (#1816) * Normal mod * Black linting * Linting --- .../benchmarks_dynamic/DDG-DA/workflow.py | 9 +- .../baseline/rolling_benchmark.py | 9 +- qlib/contrib/meta/data_selection/dataset.py | 34 ++++++-- qlib/contrib/meta/data_selection/model.py | 12 ++- qlib/contrib/meta/data_selection/utils.py | 9 +- qlib/contrib/model/linear.py | 1 + qlib/contrib/model/pytorch_gru.py | 84 ++++++++++++------- qlib/contrib/report/data/ana.py | 16 ++++ qlib/contrib/report/data/base.py | 18 ++++ qlib/contrib/report/utils.py | 6 +- qlib/contrib/rolling/base.py | 35 ++++++-- qlib/contrib/rolling/ddgda.py | 72 ++++++++++++---- qlib/model/meta/task.py | 3 + qlib/utils/mod.py | 8 +- qlib/workflow/cli.py | 48 +++++++++-- 15 files changed, 289 insertions(+), 75 deletions(-) diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index 7593fe374f..8209e0e906 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +import os from pathlib import Path from typing import Union @@ -35,6 +36,10 @@ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwa if __name__ == "__main__": - GetData().qlib_data(exists_skip=True) - auto_init() + kwargs = {} + if os.environ.get("PROVIDER_URI", "") == "": + GetData().qlib_data(exists_skip=True) + else: + kwargs["provider_uri"] = os.environ["PROVIDER_URI"] + auto_init(**kwargs) fire.Fire(DDGDABench) diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index 1ce30ef8a7..02b7ed4650 100644 --- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py +++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
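The type and precision checks above close a subtle NumPy trap: two `np.datetime64` values that compare equal can still fail a dict-based lookup when their units differ, because `Index.index_map` is keyed by the exact scalars in `idx_list`. A minimal standalone sketch of the pitfall and of the `astype`-based alignment that `_convert_type` applies (illustrative code, not part of the patch; exact hashing behaviour may vary across NumPy versions):

```python
import numpy as np

# An index stored at nanosecond precision, as pandas.DatetimeIndex produces.
idx_list = np.array(
    [np.datetime64("2024-06-20T00:00:00.000000000"), np.datetime64("2024-06-21T00:00:00.000000000")]
)
index_map = dict(zip(idx_list, range(len(idx_list))))

query = np.datetime64("2024-06-21T00:00:00")  # second precision: == compares True, but the dict lookup may miss
aligned = query.astype(idx_list.dtype)        # align the precision first, as _convert_type now does
print(index_map[aligned])                     # -> 1
```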
+import os from pathlib import Path from typing import Union @@ -31,6 +32,10 @@ def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwa if __name__ == "__main__": - GetData().qlib_data(exists_skip=True) - auto_init() + kwargs = {} + if os.environ.get("PROVIDER_URI", "") == "": + GetData().qlib_data(exists_skip=True) + else: + kwargs["provider_uri"] = os.environ["PROVIDER_URI"] + auto_init(**kwargs) fire.Fire(RollingBenchmark) diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index 9349a12fe5..58e160f110 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -243,7 +243,7 @@ def __init__( trunc_days: int = None, rolling_ext_days: int = 0, exp_name: Union[str, InternalData], - segments: Union[Dict[Text, Tuple], float], + segments: Union[Dict[Text, Tuple], float, str], hist_step_n: int = 10, task_mode: str = MetaTask.PROC_MODE_FULL, fill_method: str = "max", @@ -271,12 +271,16 @@ def __init__( - str: the name of the experiment to store the performance of data - InternalData: a prepared internal data segments: Union[Dict[Text, Tuple], float] - the segments to divide data - both left and right + if the segment is a Dict + the segments to divide data + both left and right are included if segments is a float: the float represents the percentage of data for training + if segments is a string: + it will try its best to put its data in training and ensure that the date `segments` is in the test set hist_step_n: int length of historical steps for the meta infomation + Number of steps of the data similarity information task_mode : str Please refer to the docs of MetaTask """ @@ -383,10 +387,30 @@ def _prepare_seg(self, segment: Text) -> List[MetaTask]: if isinstance(self.segments, float): train_task_n = int(len(self.meta_task_l) * self.segments) if segment == "train": - return self.meta_task_l[:train_task_n] + train_tasks = self.meta_task_l[:train_task_n] + get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}") + return train_tasks elif segment == "test": - return self.meta_task_l[train_task_n:] + test_tasks = self.meta_task_l[train_task_n:] + get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}") + return test_tasks else: raise NotImplementedError(f"This type of input is not supported") + elif isinstance(self.segments, str): + train_tasks = [] + test_tasks = [] + for t in self.meta_task_l: + test_end = t.task["dataset"]["kwargs"]["segments"]["test"][1] + if test_end is None or pd.Timestamp(test_end) < pd.Timestamp(self.segments): + train_tasks.append(t) + else: + test_tasks.append(t) + get_module_logger("MetaDatasetDS").info(f"The first train meta task: {train_tasks[0]}") + get_module_logger("MetaDatasetDS").info(f"The first test meta task: {test_tasks[0]}") + if segment == "train": + return train_tasks + elif segment == "test": + return test_tasks + raise NotImplementedError(f"This type of input is not supported") else: raise NotImplementedError(f"This type of input is not supported") diff --git a/qlib/contrib/meta/data_selection/model.py b/qlib/contrib/meta/data_selection/model.py index 068f15f9d6..7aaa0cad79 100644 --- a/qlib/contrib/meta/data_selection/model.py +++ b/qlib/contrib/meta/data_selection/model.py @@ -53,7 +53,12 @@ def __init__( max_epoch=100, seed=43, alpha=0.0, + loss_skip_thresh=50, ): + """ + loss_skip_size: int + The number of threshold to skip the loss calculation for each day. 
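The two `segments` modes can be mirrored in a few lines of standalone code (a sketch only; the real entries are `MetaTask` objects whose task config is reached via `t.task`, simplified here to plain dicts):

```python
import pandas as pd

def split_meta_tasks(meta_task_l, segments):
    """Split a task list by a train ratio (float) or a cutoff date (str)."""
    if isinstance(segments, float):
        n = int(len(meta_task_l) * segments)
        return meta_task_l[:n], meta_task_l[n:]
    if isinstance(segments, str):
        train, test = [], []
        for task in meta_task_l:
            test_end = task["dataset"]["kwargs"]["segments"]["test"][1]
            if test_end is None or pd.Timestamp(test_end) < pd.Timestamp(segments):
                train.append(task)
            else:
                test.append(task)
        return train, test
    raise NotImplementedError("This type of input is not supported")
```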
+ """ self.step = step self.hist_step_n = hist_step_n self.clip_method = clip_method @@ -63,6 +68,7 @@ def __init__( self.max_epoch = max_epoch self.fitted = False self.alpha = alpha + self.loss_skip_thresh = loss_skip_thresh torch.manual_seed(seed) def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): @@ -88,12 +94,14 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False): criterion = nn.MSELoss() loss = criterion(pred, meta_input["y_test"]) elif self.criterion == "ic_loss": - criterion = ICLoss() + criterion = ICLoss(self.loss_skip_thresh) try: - loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"], skip_size=50) + loss = criterion(pred, meta_input["y_test"], meta_input["test_idx"]) except ValueError as e: get_module_logger("MetaModelDS").warning(f"Exception `{e}` when calculating IC loss") continue + else: + raise ValueError(f"Unknown criterion: {self.criterion}") assert not np.isnan(loss.detach().item()), "NaN loss!" diff --git a/qlib/contrib/meta/data_selection/utils.py b/qlib/contrib/meta/data_selection/utils.py index 7da5028085..2fddb00963 100644 --- a/qlib/contrib/meta/data_selection/utils.py +++ b/qlib/contrib/meta/data_selection/utils.py @@ -10,7 +10,11 @@ class ICLoss(nn.Module): - def forward(self, pred, y, idx, skip_size=50): + def __init__(self, skip_size=50): + super().__init__() + self.skip_size = skip_size + + def forward(self, pred, y, idx): """forward. FIXME: - Some times it will be a slightly different from the result from `pandas.corr()` @@ -33,7 +37,7 @@ def forward(self, pred, y, idx, skip_size=50): skip_n = 0 for start_i, end_i in zip(diff_point, diff_point[1:]): pred_focus = pred[start_i:end_i] # TODO: just for fake - if pred_focus.shape[0] < skip_size: + if pred_focus.shape[0] < self.skip_size: # skip some days which have very small amount of stock. skip_n += 1 continue @@ -50,6 +54,7 @@ def forward(self, pred, y, idx, skip_size=50): ) ic_all += ic_day if len(diff_point) - 1 - skip_n <= 0: + __import__("ipdb").set_trace() raise ValueError("No enough data for calculating IC") if skip_n > 0: get_module_logger("ICLoss").info( diff --git a/qlib/contrib/model/linear.py b/qlib/contrib/model/linear.py index 7fd3d156b5..15cdb739e9 100644 --- a/qlib/contrib/model/linear.py +++ b/qlib/contrib/model/linear.py @@ -63,6 +63,7 @@ def fit(self, dataset: DatasetH, reweighter: Reweighter = None): df_train = pd.concat([df_train, df_valid]) except KeyError: get_module_logger("LinearModel").info("include_valid=True, but valid does not exist") + df_train = df_train.dropna() if df_train.empty: raise ValueError("Empty data from dataset, please check your dataset config.") if reweighter is not None: diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 2a476a657d..e0f883f094 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -1,25 +1,25 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
- from __future__ import division from __future__ import print_function +import copy +from typing import Text, Union import numpy as np import pandas as pd -from typing import Text, Union -import copy -from ...utils import get_or_create_path -from ...log import get_module_logger - import torch import torch.nn as nn import torch.optim as optim -from .pytorch_utils import count_parameters -from ...model.base import Model +from qlib.workflow import R + from ...data.dataset import DatasetH from ...data.dataset.handler import DataHandlerLP +from ...log import get_module_logger +from ...model.base import Model +from ...utils import get_or_create_path +from .pytorch_utils import count_parameters class GRU(Model): @@ -212,16 +212,31 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( - ["train", "valid", "test"], - col_set=["feature", "label"], - data_key=DataHandlerLP.DK_L, - ) - if df_train.empty or df_valid.empty: - raise ValueError("Empty data from dataset, please check your dataset config.") + # prepare training and validation data + dfs = { + k: dataset.prepare( + k, + col_set=["feature", "label"], + data_key=DataHandlerLP.DK_L, + ) + for k in ["train", "valid"] + if k in dataset.segments + } + df_train, df_valid = dfs.get("train", pd.DataFrame()), dfs.get("valid", pd.DataFrame()) + + # check if training data is empty + if df_train.empty: + raise ValueError("Empty training data from dataset, please check your dataset config.") + df_train = df_train.dropna() x_train, y_train = df_train["feature"], df_train["label"] - x_valid, y_valid = df_valid["feature"], df_valid["label"] + + # check if validation data is provided + if not df_valid.empty: + df_valid = df_valid.dropna() + x_valid, y_valid = df_valid["feature"], df_valid["label"] + else: + x_valid, y_valid = None, None save_path = get_or_create_path(save_path) stop_steps = 0 @@ -235,32 +250,42 @@ def fit( self.logger.info("training...") self.fitted = True + best_param = copy.deepcopy(self.gru_model.state_dict()) for step in range(self.n_epochs): self.logger.info("Epoch%d:", step) self.logger.info("training...") self.train_epoch(x_train, y_train) self.logger.info("evaluating...") train_loss, train_score = self.test_epoch(x_train, y_train) - val_loss, val_score = self.test_epoch(x_valid, y_valid) - self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) evals_result["train"].append(train_score) - evals_result["valid"].append(val_score) - if val_score > best_score: - best_score = val_score - stop_steps = 0 - best_epoch = step - best_param = copy.deepcopy(self.gru_model.state_dict()) - else: - stop_steps += 1 - if stop_steps >= self.early_stop: - self.logger.info("early stop") - break + # evaluate on validation data if provided + if x_valid is not None and y_valid is not None: + val_loss, val_score = self.test_epoch(x_valid, y_valid) + self.logger.info("train %.6f, valid %.6f" % (train_score, val_score)) + evals_result["valid"].append(val_score) + + if val_score > best_score: + best_score = val_score + stop_steps = 0 + best_epoch = step + best_param = copy.deepcopy(self.gru_model.state_dict()) + else: + stop_steps += 1 + if stop_steps >= self.early_stop: + self.logger.info("early stop") + break self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch)) self.gru_model.load_state_dict(best_param) torch.save(best_param, save_path) + # Logging + rec = R.get_recorder() + for k, v_l in evals_result.items(): + for i, v in enumerate(v_l): + rec.log_metrics(step=i, **{k: v}) + if 
self.use_gpu: torch.cuda.empty_cache() @@ -292,6 +317,7 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): class GRUModel(nn.Module): + def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0): super().__init__() diff --git a/qlib/contrib/report/data/ana.py b/qlib/contrib/report/data/ana.py index 567ef311d5..d01e852cee 100644 --- a/qlib/contrib/report/data/ana.py +++ b/qlib/contrib/report/data/ana.py @@ -1,5 +1,17 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +""" +Here we have a comprehensive set of analysis classes. + +Here is an example. + +.. code-block:: python + + from qlib.contrib.report.data.ana import FeaMeanStd + fa = FeaMeanStd(ret_df) + fa.plot_all(wspace=0.3, sub_figsize=(12, 3), col_n=5) + +""" import pandas as pd import numpy as np from qlib.contrib.report.data.base import FeaAnalyser @@ -152,6 +164,7 @@ def plot_single(self, col, ax): self._kurt[col].plot(ax=right_ax, label="kurt", color="green") right_ax.set_xlabel("") right_ax.set_ylabel("kurt") + right_ax.grid(None) # set the grid to None to avoid two layers of grid h1, l1 = ax.get_legend_handles_labels() h2, l2 = right_ax.get_legend_handles_labels() @@ -171,12 +184,15 @@ def plot_single(self, col, ax): ax.set_xlabel("") ax.set_ylabel("mean") ax.legend() + ax.tick_params(axis="x", rotation=90) right_ax = ax.twinx() self._std[col].plot(ax=right_ax, label="std", color="green") right_ax.set_xlabel("") right_ax.set_ylabel("std") + right_ax.tick_params(axis="x", rotation=90) + right_ax.grid(None) # set the grid to None to avoid two layers of grid h1, l1 = ax.get_legend_handles_labels() h2, l2 = right_ax.get_legend_handles_labels() diff --git a/qlib/contrib/report/data/base.py b/qlib/contrib/report/data/base.py index a91eda48e6..0861233b6d 100644 --- a/qlib/contrib/report/data/base.py +++ b/qlib/contrib/report/data/base.py @@ -14,6 +14,24 @@ class FeaAnalyser: def __init__(self, dataset: pd.DataFrame): + """ + + Parameters + ---------- + dataset : pd.DataFrame + + We often have multiple columns for the dataset. Each column corresponds to one sub figure. + There will be a datetime column in the index levels. + Aggregation will be used for more summarized metrics over time. + Here is an example of data: + + .. code-block:: + + return + datetime instrument + 2007-02-06 equity_tpx 0.010087 + equity_spx 0.000786 + """ self._dataset = dataset with TimeInspector.logt("calc_stat_values"): self.calc_stat_values() diff --git a/qlib/contrib/report/utils.py b/qlib/contrib/report/utils.py index 70de85198a..8d3d3fac9a 100644 --- a/qlib/contrib/report/utils.py +++ b/qlib/contrib/report/utils.py @@ -4,7 +4,7 @@ import pandas as pd -def sub_fig_generator(sub_fs=(3, 3), col_n=10, row_n=1, wspace=None, hspace=None, sharex=False, sharey=False): +def sub_fig_generator(sub_figsize=(3, 3), col_n=10, row_n=1, wspace=None, hspace=None, sharex=False, sharey=False): """sub_fig_generator. it will return a generator, each row contains <col_n> sub graphs In each row, it will return subgraphs one by one. Parameters ---------- - sub_fs : + sub_figsize : the figure size of each subgraph in <row> * <col> subgraphs col_n : the number of subgraphs in each row; It will generate a new graph after generating <col_n> of subgraphs.
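A hypothetical snippet showing how to build the `(datetime, instrument)` indexed frame that the `FeaAnalyser` docstring above describes, reusing its example values (the second day's numbers are made up for illustration):

```python
import pandas as pd

idx = pd.MultiIndex.from_product(
    [pd.to_datetime(["2007-02-06", "2007-02-07"]), ["equity_tpx", "equity_spx"]],
    names=["datetime", "instrument"],
)
ret_df = pd.DataFrame({"return": [0.010087, 0.000786, 0.003152, -0.001034]}, index=idx)
# ret_df can then be passed to an analyser such as FeaMeanStd(ret_df)
# and plotted with fa.plot_all(wspace=0.3, sub_figsize=(12, 3), col_n=5).
```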
@@ -33,7 +33,7 @@ def sub_fig_generator(sub_fs=(3, 3), col_n=10, row_n=1, wspace=None, hspace=None while True: fig, axes = plt.subplots( - row_n, col_n, figsize=(sub_fs[0] * col_n, sub_fs[1] * row_n), sharex=sharex, sharey=sharey + row_n, col_n, figsize=(sub_figsize[0] * col_n, sub_figsize[1] * row_n), sharex=sharex, sharey=sharey ) plt.subplots_adjust(wspace=wspace, hspace=hspace) axes = axes.reshape(row_n, col_n) diff --git a/qlib/contrib/rolling/base.py b/qlib/contrib/rolling/base.py index d179efb38b..05467a6be2 100644 --- a/qlib/contrib/rolling/base.py +++ b/qlib/contrib/rolling/base.py @@ -73,8 +73,8 @@ def __init__( The horizon of the prediction target. This is used to override the prediction horizon of the file. h_path : Optional[str] - the dumped data handler; - It may come from other data source. It will override the data handler in the config. + It is another data source that is dumped as a handler. It will override the data handler section in the config. + If it is not given, it will create a customized cache for the handler when `enable_handler_cache=True` test_end : Optional[str] the test end for the data. It is typically used together with the handler You can do the same thing with task_ext_conf in a more complicated way @@ -119,7 +119,7 @@ def _raw_conf(self) -> dict: with self.conf_path.open("r") as f: return yaml.safe_load(f) - def _replace_hanler_with_cache(self, task: dict): + def _replace_handler_with_cache(self, task: dict): """ Due to the data processing part in original rolling is slow. So we have to This class tries to add more feature @@ -159,13 +159,20 @@ def basic_task(self, enable_handler_cache: Optional[bool] = True): # - get horizon automatically from the expression!!!! raise NotImplementedError(f"This type of input is not supported") else: - self.logger.info("The prediction horizon is overrided") - task["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [ - "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1) - ] + if enable_handler_cache and self.h_path is not None: + self.logger.info("Failed to override the horizon due to the data handler cache") + else: + self.logger.info("The prediction horizon is overridden") + if isinstance(task["dataset"]["kwargs"]["handler"], dict): + task["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [ + "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1) + ] + else: + self.logger.warning("Tried to automatically configure the label but failed.") if self.h_path is not None or enable_handler_cache: + # if we already have a provided data source or we want to create one + task = self._replace_handler_with_cache(task) task = self._update_start_end_time(task) if self.task_ext_conf is not None: @@ -173,6 +180,16 @@ self.logger.info(task) return task + def run_basic_task(self): + """ + Run the basic task without rolling. + This is for fast testing during model tuning.
+ """ + task = self.basic_task() + print(task) + trainer = TrainerR(experiment_name=self.exp_name) + trainer([task]) + def get_task_list(self) -> List[dict]: """return a batch of tasks for rolling.""" task = self.basic_task() diff --git a/qlib/contrib/rolling/ddgda.py b/qlib/contrib/rolling/ddgda.py index 25fb4c36e2..b62820ccea 100644 --- a/qlib/contrib/rolling/ddgda.py +++ b/qlib/contrib/rolling/ddgda.py @@ -80,6 +80,11 @@ def __init__( sim_task_model: UTIL_MODEL_TYPE = "gbdt", meta_1st_train_end: Optional[str] = None, alpha: float = 0.01, + loss_skip_thresh: int = 50, + fea_imp_n: Optional[int] = 30, + meta_data_proc: Optional[str] = "V01", + segments: Union[float, str] = 0.62, + hist_step_n: int = 30, working_dir: Optional[Union[str, Path]] = None, **kwargs, ): @@ -94,6 +99,15 @@ def __init__( alpha: float Setting the L2 regularization for ridge The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..) + loss_skip_thresh: int + The thresh to skip the loss calculation for each day. If the number of item is less than it, it will skip the loss on that day. + meta_data_proc : Optional[str] + How we process the meta dataset for learning meta model. + segments : Union[float, str] + if segments is a float: + The ratio of training data in the meta task dataset + if segments is a string: + it will try its best to put its data in training and ensure that the date `segments` is in the test set """ # NOTE: # the horizon must match the meaning in the base task template @@ -104,14 +118,22 @@ def __init__( super().__init__(**kwargs) self.working_dir = self.conf_path.parent if working_dir is None else Path(working_dir) self.proxy_hd = self.working_dir / "handler_proxy.pkl" + self.fea_imp_n = fea_imp_n + self.meta_data_proc = meta_data_proc + self.loss_skip_thresh = loss_skip_thresh + self.segments = segments + self.hist_step_n = hist_step_n def _adjust_task(self, task: dict, astype: UTIL_MODEL_TYPE): """ - some task are use for special purpose. + Base on the original task, we need to do some extra things. + For example: - GBDT for calculating feature importance - Linear or GBDT for calculating similarity - Datset (well processed) that aligned to Linear that for meta learning + + So we may need to change the dataset and model for the special purpose and other settings remains the same. """ # NOTE: here is just for aligning with previous implementation # It is not necessary for the current implementation @@ -119,12 +141,16 @@ def _adjust_task(self, task: dict, astype: UTIL_MODEL_TYPE): if astype == "gbdt": task["model"] = LGBM_MODEL if isinstance(handler, dict): + # We don't need preprocessing when using GBDT model for k in ["infer_processors", "learn_processors"]: if k in handler.setdefault("kwargs", {}): handler["kwargs"].pop(k) elif astype == "linear": task["model"] = LINEAR_MODEL - handler["kwargs"].update(PROC_ARGS) + if isinstance(handler, dict): + handler["kwargs"].update(PROC_ARGS) + else: + self.logger.warning("The handler can't be adjusted.") else: raise ValueError(f"astype not supported: {astype}") return task @@ -155,12 +181,15 @@ def _dump_data_for_proxy_model(self): The meta model will be trained upon the proxy forecasting model. This dataset is for the proxy forecasting model. """ - topk = 30 - fi = self._get_feature_importance() - col_selected = fi.nlargest(topk) + # NOTE: adjusting to `self.sim_task_model` just for aligning with previous implementation. + # In previous version. 
the data for the proxy model was processed in the sim_task_model's way. task = self._adjust_task(self.basic_task(enable_handler_cache=False), self.sim_task_model) task = replace_task_handler_with_cache(task, self.working_dir) + # if self.meta_data_proc is not None: + # else: + # # Otherwise, we don't need further processing + # task = self.basic_task() dataset = init_instance_by_config(task["dataset"]) prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) feature_df = prep_ds["feature"] label_df = prep_ds["label"] - feature_selected = feature_df.loc[:, col_selected.index] + if self.fea_imp_n is not None: + fi = self._get_feature_importance() + col_selected = fi.nlargest(self.fea_imp_n) + feature_selected = feature_df.loc[:, col_selected.index] + else: + feature_selected = feature_df - feature_selected = feature_selected.groupby("datetime", group_keys=False).apply( - lambda df: (df - df.mean()).div(df.std()) - ) - feature_selected = feature_selected.fillna(0.0) + if self.meta_data_proc == "V01": + feature_selected = feature_selected.groupby("datetime", group_keys=False).apply( + lambda df: (df - df.mean()).div(df.std()) + ) + feature_selected = feature_selected.fillna(0.0) df_all = { "label": label_df.reindex(feature_selected.index), @@ -223,7 +258,10 @@ def _train_meta_model(self, fill_method="max"): # 1) leverage the simplified proxy forecasting model to train meta model. # - Only the dataset part is important, in current version of meta model will integrate the - # the train_start for training meta model does not necessarily align with final rolling + # NOTE: + # - The train_start for training meta model does not necessarily align with final rolling + # But please select the right time to make sure the final rolling tasks are not leaked in the training data. + # - The test_start is automatically aligned to the next day of test_end. Validation is ignored.
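The branchy diff above reduces to a small pipeline; here is a standalone sketch, under the assumption that `fi` is a pandas Series of per-feature importances (as `_get_feature_importance` returns) and that the frame carries a `datetime` index level:

```python
import pandas as pd

def select_and_normalize(feature_df, fi, fea_imp_n=30, meta_data_proc="V01"):
    # keep the most important columns, or all of them when fea_imp_n is None
    selected = feature_df.loc[:, fi.nlargest(fea_imp_n).index] if fea_imp_n is not None else feature_df
    if meta_data_proc == "V01":
        # cross-sectional z-score per day, NaNs filled with 0
        selected = selected.groupby("datetime", group_keys=False).apply(
            lambda df: (df - df.mean()).div(df.std())
        ).fillna(0.0)
    return selected
```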
train_start = "2008-01-01" if self.train_start is None else self.train_start train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d") @@ -249,9 +287,9 @@ def _train_meta_model(self, fill_method="max"): kwargs = dict( task_tpl=proxy_forecast_model_task, step=self.step, - segments=0.62, # keep test period consistent with the dataset yaml + segments=self.segments, # keep test period consistent with the dataset yaml trunc_days=1 + self.horizon, - hist_step_n=30, + hist_step_n=self.hist_step_n, fill_method=fill_method, rolling_ext_days=0, ) @@ -268,7 +306,13 @@ def _train_meta_model(self, fill_method="max"): with R.start(experiment_name=self.meta_exp_name): R.log_params(**kwargs) mm = MetaModelDS( - step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha + step=self.step, + hist_step_n=kwargs["hist_step_n"], + lr=0.001, + max_epoch=30, + seed=43, + alpha=self.alpha, + loss_skip_thresh=self.loss_skip_thresh, ) mm.fit(md) R.save_objects(model=mm) diff --git a/qlib/model/meta/task.py b/qlib/model/meta/task.py index 3204910010..a051acf146 100644 --- a/qlib/model/meta/task.py +++ b/qlib/model/meta/task.py @@ -51,3 +51,6 @@ def get_meta_input(self) -> object: Return the **processed** meta_info """ return self.meta_info + + def __repr__(self): + return f"MetaTask(task={self.task}, meta_info={self.meta_info})" diff --git a/qlib/utils/mod.py b/qlib/utils/mod.py index e539572606..4e0cb707f3 100644 --- a/qlib/utils/mod.py +++ b/qlib/utils/mod.py @@ -161,7 +161,13 @@ def init_instance_by_config( # path like 'file:////obj.pkl' pr = urlparse(config) if pr.scheme == "file": - pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc + + # To enable relative path like file://data/a/b/c.pkl. pr.netloc will be data + path = pr.path + if pr.netloc != "": + path = path.lstrip("/") + + pr_path = os.path.join(pr.netloc, path) if bool(pr.path) else pr.netloc with open(os.path.normpath(pr_path), "rb") as f: return pickle.load(f) else: diff --git a/qlib/workflow/cli.py b/qlib/workflow/cli.py index c2265ea5db..cda3fdbe16 100644 --- a/qlib/workflow/cli.py +++ b/qlib/workflow/cli.py @@ -1,18 +1,20 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
import logging -import sys import os from pathlib import Path +import sys -import qlib import fire +from jinja2 import Template, meta import ruamel.yaml as yaml + +import qlib from qlib.config import C -from qlib.model.trainer import task_train -from qlib.utils.data import update_config from qlib.log import get_module_logger +from qlib.model.trainer import task_train from qlib.utils import set_log_with_config +from qlib.utils.data import update_config set_log_with_config(C.logging_config) logger = get_module_logger("qrun", logging.INFO) @@ -47,6 +49,39 @@ def sys_config(config, config_path): sys.path.append(str(Path(config_path).parent.resolve().absolute() / p)) +def render_template(config_path: str) -> str: + """ + render the template based on the environment + + Parameters + ---------- + config_path : str + configuration path + + Returns + ------- + str + the rendered content + """ + with open(config_path, "r") as f: + config = f.read() + # Set up the Jinja2 environment + template = Template(config) + + # Parse the template to find undeclared variables + env = template.environment + parsed_content = env.parse(config) + variables = meta.find_undeclared_variables(parsed_content) + + # Get context from os.environ according to the variables + context = {var: os.getenv(var, "") for var in variables if var in os.environ} + logger.info(f"Render the template with the context: {context}") + + # Render the template with the context + rendered_content = template.render(context) + return rendered_content + + # workflow handler function def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"): """ @@ -67,8 +102,9 @@ def workflow(config_path, experiment_name="workflow", uri_folder="mlruns"): market: csi300 """ - with open(config_path) as fp: - config = yaml.safe_load(fp) + # Render the template + rendered_yaml = render_template(config_path) + config = yaml.safe_load(rendered_yaml) base_config_path = config.get("BASE_CONFIG_PATH", None) if base_config_path: From a7d5a9b500de5df053e32abf00f6a679546636eb Mon Sep 17 00:00:00 2001 From: you-n-g Date: Fri, 5 Jul 2024 15:44:16 +0800 Subject: [PATCH 31/37] Nested data loader (#1822) * nested data loader * Amend * add data loder test * fix pylint error * fix pytest error * fix pytest error * delete comments * Update qlib/contrib/data/handler.py --------- Co-authored-by: Linlang --- qlib/contrib/data/handler.py | 281 +--------------- qlib/contrib/data/loader.py | 310 ++++++++++++++++++ qlib/data/dataset/loader.py | 55 +++- tests/data_mid_layer_tests/test_dataloader.py | 50 +++ 4 files changed, 417 insertions(+), 279 deletions(-) create mode 100644 qlib/contrib/data/loader.py create mode 100644 tests/data_mid_layer_tests/test_dataloader.py diff --git a/qlib/contrib/data/handler.py b/qlib/contrib/data/handler.py index ce052f5506..7c63e5a639 100644 --- a/qlib/contrib/data/handler.py +++ b/qlib/contrib/data/handler.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
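A usage sketch for `render_template` above: a workflow config may now reference environment variables, and only the variables actually present in `os.environ` get substituted. The config snippet and variable name below are illustrative:

```python
import os
from jinja2 import Template, meta

config = "qlib_init:\n    provider_uri: {{ PROVIDER_URI }}\n    region: cn\n"
os.environ["PROVIDER_URI"] = "~/.qlib/qlib_data/cn_data"

template = Template(config)
variables = meta.find_undeclared_variables(template.environment.parse(config))
context = {var: os.getenv(var, "") for var in variables if var in os.environ}
print(template.render(context))
# qlib_init:
#     provider_uri: ~/.qlib/qlib_data/cn_data
#     region: cn
```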
+from qlib.contrib.data.loader import Alpha158DL, Alpha360DL from ...data.dataset.handler import DataHandlerLP from ...data.dataset.processor import Processor from ...utils import get_callable_kwargs @@ -66,7 +67,7 @@ def __init__( "class": "QlibDataLoader", "kwargs": { "config": { - "feature": self.get_feature_config(), + "feature": Alpha360DL.get_feature_config(), "label": kwargs.pop("label", self.get_label_config()), }, "filter_pipe": filter_pipe, @@ -88,51 +89,6 @@ def __init__( def get_label_config(self): return ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"] - @staticmethod - def get_feature_config(): - # NOTE: - # Alpha360 tries to provide a dataset with original price data - # the original price data includes the prices and volume in the last 60 days. - # To make it easier to learn models from this dataset, all the prices and volume - # are normalized by the latest price and volume data ( dividing by $close, $volume) - # So the latest normalized $close will be 1 (with name CLOSE0), the latest normalized $volume will be 1 (with name VOLUME0) - # If further normalization are executed (e.g. centralization), CLOSE0 and VOLUME0 will be 0. - fields = [] - names = [] - - for i in range(59, 0, -1): - fields += ["Ref($close, %d)/$close" % i] - names += ["CLOSE%d" % i] - fields += ["$close/$close"] - names += ["CLOSE0"] - for i in range(59, 0, -1): - fields += ["Ref($open, %d)/$close" % i] - names += ["OPEN%d" % i] - fields += ["$open/$close"] - names += ["OPEN0"] - for i in range(59, 0, -1): - fields += ["Ref($high, %d)/$close" % i] - names += ["HIGH%d" % i] - fields += ["$high/$close"] - names += ["HIGH0"] - for i in range(59, 0, -1): - fields += ["Ref($low, %d)/$close" % i] - names += ["LOW%d" % i] - fields += ["$low/$close"] - names += ["LOW0"] - for i in range(59, 0, -1): - fields += ["Ref($vwap, %d)/$close" % i] - names += ["VWAP%d" % i] - fields += ["$vwap/$close"] - names += ["VWAP0"] - for i in range(59, 0, -1): - fields += ["Ref($volume, %d)/($volume+1e-12)" % i] - names += ["VOLUME%d" % i] - fields += ["$volume/($volume+1e-12)"] - names += ["VOLUME0"] - - return fields, names - class Alpha360vwap(Alpha360): def get_label_config(self): @@ -190,242 +146,11 @@ def get_feature_config(self): }, "rolling": {}, } - return self.parse_config_to_fields(conf) + return Alpha158DL.get_feature_config(conf) def get_label_config(self): return ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"] - @staticmethod - def parse_config_to_fields(config): - """create factors from config - - config = { - 'kbar': {}, # whether to use some hard-code kbar features - 'price': { # whether to use raw price features - 'windows': [0, 1, 2, 3, 4], # use price at n days ago - 'feature': ['OPEN', 'HIGH', 'LOW'] # which price field to use - }, - 'volume': { # whether to use raw volume features - 'windows': [0, 1, 2, 3, 4], # use volume at n days ago - }, - 'rolling': { # whether to use rolling operator based features - 'windows': [5, 10, 20, 30, 60], # rolling windows size - 'include': ['ROC', 'MA', 'STD'], # rolling operator to use - #if include is None we will use default operators - 'exclude': ['RANK'], # rolling operator not to use - } - } - """ - fields = [] - names = [] - if "kbar" in config: - fields += [ - "($close-$open)/$open", - "($high-$low)/$open", - "($close-$open)/($high-$low+1e-12)", - "($high-Greater($open, $close))/$open", - "($high-Greater($open, $close))/($high-$low+1e-12)", - "(Less($open, $close)-$low)/$open", - "(Less($open, $close)-$low)/($high-$low+1e-12)", - "(2*$close-$high-$low)/$open", 
- "(2*$close-$high-$low)/($high-$low+1e-12)", - ] - names += [ - "KMID", - "KLEN", - "KMID2", - "KUP", - "KUP2", - "KLOW", - "KLOW2", - "KSFT", - "KSFT2", - ] - if "price" in config: - windows = config["price"].get("windows", range(5)) - feature = config["price"].get("feature", ["OPEN", "HIGH", "LOW", "CLOSE", "VWAP"]) - for field in feature: - field = field.lower() - fields += ["Ref($%s, %d)/$close" % (field, d) if d != 0 else "$%s/$close" % field for d in windows] - names += [field.upper() + str(d) for d in windows] - if "volume" in config: - windows = config["volume"].get("windows", range(5)) - fields += ["Ref($volume, %d)/($volume+1e-12)" % d if d != 0 else "$volume/($volume+1e-12)" for d in windows] - names += ["VOLUME" + str(d) for d in windows] - if "rolling" in config: - windows = config["rolling"].get("windows", [5, 10, 20, 30, 60]) - include = config["rolling"].get("include", None) - exclude = config["rolling"].get("exclude", []) - # `exclude` in dataset config unnecessary filed - # `include` in dataset config necessary field - - def use(x): - return x not in exclude and (include is None or x in include) - - # Some factor ref: https://guorn.com/static/upload/file/3/134065454575605.pdf - if use("ROC"): - # https://www.investopedia.com/terms/r/rateofchange.asp - # Rate of change, the price change in the past d days, divided by latest close price to remove unit - fields += ["Ref($close, %d)/$close" % d for d in windows] - names += ["ROC%d" % d for d in windows] - if use("MA"): - # https://www.investopedia.com/ask/answers/071414/whats-difference-between-moving-average-and-weighted-moving-average.asp - # Simple Moving Average, the simple moving average in the past d days, divided by latest close price to remove unit - fields += ["Mean($close, %d)/$close" % d for d in windows] - names += ["MA%d" % d for d in windows] - if use("STD"): - # The standard diviation of close price for the past d days, divided by latest close price to remove unit - fields += ["Std($close, %d)/$close" % d for d in windows] - names += ["STD%d" % d for d in windows] - if use("BETA"): - # The rate of close price change in the past d days, divided by latest close price to remove unit - # For example, price increase 10 dollar per day in the past d days, then Slope will be 10. - fields += ["Slope($close, %d)/$close" % d for d in windows] - names += ["BETA%d" % d for d in windows] - if use("RSQR"): - # The R-sqaure value of linear regression for the past d days, represent the trend linear - fields += ["Rsquare($close, %d)" % d for d in windows] - names += ["RSQR%d" % d for d in windows] - if use("RESI"): - # The redisdual for linear regression for the past d days, represent the trend linearity for past d days. 
- fields += ["Resi($close, %d)/$close" % d for d in windows] - names += ["RESI%d" % d for d in windows] - if use("MAX"): - # The max price for past d days, divided by latest close price to remove unit - fields += ["Max($high, %d)/$close" % d for d in windows] - names += ["MAX%d" % d for d in windows] - if use("LOW"): - # The low price for past d days, divided by latest close price to remove unit - fields += ["Min($low, %d)/$close" % d for d in windows] - names += ["MIN%d" % d for d in windows] - if use("QTLU"): - # The 80% quantile of past d day's close price, divided by latest close price to remove unit - # Used with MIN and MAX - fields += ["Quantile($close, %d, 0.8)/$close" % d for d in windows] - names += ["QTLU%d" % d for d in windows] - if use("QTLD"): - # The 20% quantile of past d day's close price, divided by latest close price to remove unit - fields += ["Quantile($close, %d, 0.2)/$close" % d for d in windows] - names += ["QTLD%d" % d for d in windows] - if use("RANK"): - # Get the percentile of current close price in past d day's close price. - # Represent the current price level comparing to past N days, add additional information to moving average. - fields += ["Rank($close, %d)" % d for d in windows] - names += ["RANK%d" % d for d in windows] - if use("RSV"): - # Represent the price position between upper and lower resistent price for past d days. - fields += ["($close-Min($low, %d))/(Max($high, %d)-Min($low, %d)+1e-12)" % (d, d, d) for d in windows] - names += ["RSV%d" % d for d in windows] - if use("IMAX"): - # The number of days between current date and previous highest price date. - # Part of Aroon Indicator https://www.investopedia.com/terms/a/aroon.asp - # The indicator measures the time between highs and the time between lows over a time period. - # The idea is that strong uptrends will regularly see new highs, and strong downtrends will regularly see new lows. - fields += ["IdxMax($high, %d)/%d" % (d, d) for d in windows] - names += ["IMAX%d" % d for d in windows] - if use("IMIN"): - # The number of days between current date and previous lowest price date. - # Part of Aroon Indicator https://www.investopedia.com/terms/a/aroon.asp - # The indicator measures the time between highs and the time between lows over a time period. - # The idea is that strong uptrends will regularly see new highs, and strong downtrends will regularly see new lows. - fields += ["IdxMin($low, %d)/%d" % (d, d) for d in windows] - names += ["IMIN%d" % d for d in windows] - if use("IMXD"): - # The time period between previous lowest-price date occur after highest price date. - # Large value suggest downward momemtum. - fields += ["(IdxMax($high, %d)-IdxMin($low, %d))/%d" % (d, d, d) for d in windows] - names += ["IMXD%d" % d for d in windows] - if use("CORR"): - # The correlation between absolute close price and log scaled trading volume - fields += ["Corr($close, Log($volume+1), %d)" % d for d in windows] - names += ["CORR%d" % d for d in windows] - if use("CORD"): - # The correlation between price change ratio and volume change ratio - fields += ["Corr($close/Ref($close,1), Log($volume/Ref($volume, 1)+1), %d)" % d for d in windows] - names += ["CORD%d" % d for d in windows] - if use("CNTP"): - # The percentage of days in past d days that price go up. - fields += ["Mean($close>Ref($close, 1), %d)" % d for d in windows] - names += ["CNTP%d" % d for d in windows] - if use("CNTN"): - # The percentage of days in past d days that price go down. 
- fields += ["Mean($closeRef($close, 1), %d)-Mean($closeRef($close, 1), %d)" % d for d in windows] + names += ["CNTP%d" % d for d in windows] + if use("CNTN"): + # The percentage of days in past d days that price go down. + fields += ["Mean($closeRef($close, 1), %d)-Mean($close dict: def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame: self._maybe_load_raw_data() + + # 1) Filter by instruments if instruments is None: df = self._data else: df = self._data.loc(axis=0)[:, instruments] + + # 2) Filter by Datetime if start_time is None and end_time is None: return df # NOTE: avoid copy by loc # pd.Timestamp(None) == NaT, use NaT as index can not fetch correct thing, so do not change None. @@ -275,6 +279,55 @@ def _maybe_load_raw_data(self): self._data = self._config +class NestedDataLoader(DataLoader): + """ + We have multiple DataLoader, we can use this class to combine them. + """ + + def __init__(self, dataloader_l: List[Dict], join="left") -> None: + """ + + Parameters + ---------- + dataloader_l : list[dict] + A list of dataloader, for exmaple + + .. code-block:: python + + nd = NestedDataLoader( + dataloader_l=[ + { + "class": "qlib.contrib.data.loader.Alpha158DL", + }, { + "class": "qlib.contrib.data.loader.Alpha360DL", + "kwargs": { + "config": { + "label": ( ["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]) + } + } + } + ] + ) + join : + it will pass to pd.concat when merging it. + """ + super().__init__() + self.data_loader_l = [ + (dl if isinstance(dl, DataLoader) else init_instance_by_config(dl)) for dl in dataloader_l + ] + self.join = join + + def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame: + df_full = None + for dl in self.data_loader_l: + df_current = dl.load(instruments, start_time, end_time) + if df_full is None: + df_full = df_current + else: + df_full = pd.merge(df_full, df_current, left_index=True, right_index=True, how=self.join) + return df_full.sort_index(axis=1) + + class DataLoaderDH(DataLoader): """DataLoaderDH DataLoader based on (D)ata (H)andler diff --git a/tests/data_mid_layer_tests/test_dataloader.py b/tests/data_mid_layer_tests/test_dataloader.py new file mode 100644 index 0000000000..e3cb741bb7 --- /dev/null +++ b/tests/data_mid_layer_tests/test_dataloader.py @@ -0,0 +1,50 @@ +# TODO: +# dump alpha 360 to dataframe and merge it with Alpha158 + +import sys +import unittest +import qlib +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent)) +from qlib.data.dataset.loader import NestedDataLoader +from qlib.contrib.data.loader import Alpha158DL, Alpha360DL + + +class TestDataLoader(unittest.TestCase): + + def test_nested_data_loader(self): + qlib.init() + nd = NestedDataLoader( + dataloader_l=[ + { + "class": "qlib.contrib.data.loader.Alpha158DL", + }, + { + "class": "qlib.contrib.data.loader.Alpha360DL", + "kwargs": {"config": {"label": (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"])}}, + }, + ] + ) + # Of course you can use StaticDataLoader + + dataset = nd.load() + + assert dataset is not None + + columns = dataset.columns.tolist() + columns_list = [tup[1] for tup in columns] + + for col in Alpha158DL.get_feature_config()[1]: + assert col in columns_list + + for col in Alpha360DL.get_feature_config()[1]: + assert col in columns_list + + assert "LABEL0" in columns_list + + # Then you can use it wth DataHandler; + + +if __name__ == "__main__": + unittest.main() From 2c33332dd6d25fb430fa4366abc1eaa8dc80f1ee Mon Sep 17 00:00:00 2001 From: Linlang 
<30293408+SunsetWolf@users.noreply.github.com> Date: Wed, 10 Jul 2024 14:48:44 +0800 Subject: [PATCH 32/37] More dataloader example (#1823) * More dataloader example * optimize code * optimeze code * optimeze code * optimeze code * optimeze code * optimeze code * fix pylint error * fix CI error * fix CI error * Comments * fix error type --------- Co-authored-by: Young --- .github/workflows/test_qlib_from_pip.yml | 2 +- .github/workflows/test_qlib_from_source.yml | 2 +- .../workflows/test_qlib_from_source_slow.yml | 2 +- qlib/data/dataset/loader.py | 14 +++++++- tests/data_mid_layer_tests/test_dataloader.py | 33 ++++++++++++++++++- 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index fd1e8c4cf4..029e292d37 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -16,7 +16,7 @@ jobs: # Since macos-latest changed from 12.7.4 to 14.4.1, # the minimum python version that matches a 14.4.1 version of macos is 3.10, # so we limit the macos version to macos-12. - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] + os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 885d8fa439..8238db9bb2 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -17,7 +17,7 @@ jobs: # Since macos-latest changed from 12.7.4 to 14.4.1, # the minimum python version that matches a 14.4.1 version of macos is 3.10, # so we limit the macos version to macos-12. - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] + os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index 8725d4fe03..3401ea3fd0 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -17,7 +17,7 @@ jobs: # Since macos-latest changed from 12.7.4 to 14.4.1, # the minimum python version that matches a 14.4.1 version of macos is 3.10, # so we limit the macos version to macos-12. - os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-11, macos-12] + os: [windows-latest, ubuntu-20.04, ubuntu-22.04, macos-12] # not supporting 3.6 due to annotations is not supported https://stackoverflow.com/a/52890129 python-version: [3.7, 3.8] diff --git a/qlib/data/dataset/loader.py b/qlib/data/dataset/loader.py index f2921124b7..06e199bca5 100644 --- a/qlib/data/dataset/loader.py +++ b/qlib/data/dataset/loader.py @@ -41,6 +41,7 @@ def load(self, instruments, start_time=None, end_time=None) -> pd.DataFrame: ---------- instruments : str or dict it can either be the market name or the config file of instruments generated by InstrumentProvider. + If the value of instruments is None, it means that no filtering is done. start_time : str start of the time range. 
end_time : str end of the time range. Returns ------- pd.DataFrame: data load from the under layer source + + Raises + ------ + KeyError: + if the instruments filter is not supported, raise KeyError """ @@ -320,7 +326,13 @@ def __init__(self, dataloader_l: List[Dict], join="left") -> None: def load(self, instruments=None, start_time=None, end_time=None) -> pd.DataFrame: df_full = None for dl in self.data_loader_l: - df_current = dl.load(instruments, start_time, end_time) + try: + df_current = dl.load(instruments, start_time, end_time) + except KeyError: + warnings.warn( + "The value of `instruments` cannot be processed by this DataLoader, so instruments is set to None to get all the data." + ) + df_current = dl.load(instruments=None, start_time=start_time, end_time=end_time) if df_full is None: df_full = df_current else: diff --git a/tests/data_mid_layer_tests/test_dataloader.py b/tests/data_mid_layer_tests/test_dataloader.py index e3cb741bb7..4d057be4fc 100644 --- a/tests/data_mid_layer_tests/test_dataloader.py +++ b/tests/data_mid_layer_tests/test_dataloader.py @@ -7,8 +7,10 @@ from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent)) -from qlib.data.dataset.loader import NestedDataLoader +from qlib.data.dataset.loader import NestedDataLoader, QlibDataLoader +from qlib.data.dataset.handler import DataHandlerLP from qlib.contrib.data.loader import Alpha158DL, Alpha360DL +from qlib.data import D class TestDataLoader(unittest.TestCase): @@ -44,6 +46,35 @@ def test_nested_data_loader(self): assert "LABEL0" in columns_list # Then you can use it with DataHandler; + # NOTE: please note that the data processors are missing!!! You should add them based on your requirements + + """ + dataset.to_pickle("test_df.pkl") + nested_data_loader = NestedDataLoader( + dataloader_l=[ + { + "class": "qlib.contrib.data.loader.Alpha158DL", + "kwargs": {"config": {"label": (["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"])}}, + }, + { + "class": "qlib.contrib.data.loader.Alpha360DL", + }, + { + "class": "qlib.data.dataset.loader.StaticDataLoader", + "kwargs": {"config": "test_df.pkl"}, + }, + ] + ) + data_handler_config = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "instruments": "csi300", + "data_loader": nested_data_loader, + } + data_handler = DataHandlerLP(**data_handler_config) + data = data_handler.fetch() + print(data) + """ if __name__ == "__main__": unittest.main() From c9ed050ef034fe6519c14b59f3d207abcb693282 Mon Sep 17 00:00:00 2001 From: cyncyw <47289405+taozhiwang@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:59:18 +0800 Subject: [PATCH 33/37] Ptnn4both datatypes and alignment tests (#1827) * Init model for both dataset * Remove some deprecated code * Add model template; * We must align with previous results * We choose another mode as the initial version * Almost success to run GRU * Successfully run training * Passed general_nn test * gru test * Alignment test passed * comment * fix readme & minor errors * general nn updates & benchmarks * Update examples/benchmarks/GeneralPtNN/workflow_config_gru2mlp.yaml --------- Co-authored-by: Young Co-authored-by: you-n-g --- examples/benchmarks/GeneralPtNN/README.md | 19 + .../GeneralPtNN/workflow_config_gru.yaml | 100 +++++ .../GeneralPtNN/workflow_config_gru2mlp.yaml | 93 +++++ .../GeneralPtNN/workflow_config_mlp.yaml | 98 +++++ qlib/contrib/model/pytorch_general_nn.py | 353 ++++++++++++++++++ qlib/contrib/model/pytorch_gru.py | 1 - tests/model/test_general_nn.py | 76 ++++ 7 files
changed, 739 insertions(+), 1 deletion(-) create mode 100644 examples/benchmarks/GeneralPtNN/README.md create mode 100755 examples/benchmarks/GeneralPtNN/workflow_config_gru.yaml create mode 100644 examples/benchmarks/GeneralPtNN/workflow_config_gru2mlp.yaml create mode 100644 examples/benchmarks/GeneralPtNN/workflow_config_mlp.yaml create mode 100644 qlib/contrib/model/pytorch_general_nn.py create mode 100644 tests/model/test_general_nn.py diff --git a/examples/benchmarks/GeneralPtNN/README.md b/examples/benchmarks/GeneralPtNN/README.md new file mode 100644 index 0000000000..9778322204 --- /dev/null +++ b/examples/benchmarks/GeneralPtNN/README.md @@ -0,0 +1,19 @@ + + +# Introduction + +What is GeneralPtNN +- Fixes the previous design that failed to support both time-series and tabular data +- Now you can simply replace the Pytorch model structure to run a different NN model. + +We provide an example to demonstrate the effectiveness of the current design. +- `workflow_config_gru.yaml` aligns with the previous results of [GRU(Kyunghyun Cho, et al.)](../README.md#Alpha158-dataset) + - `workflow_config_gru2mlp.yaml` demonstrates that we can convert a config from time-series to tabular data with minimal changes + - You only have to change the net & dataset class to make the conversion. +- `workflow_config_mlp.yaml` achieves similar functionality to [MLP](../README.md#Alpha158-dataset) + +# TODO + +- We will align existing models to the current design. + +- The result of `workflow_config_mlp.yaml` differs from the result of [MLP](../README.md#Alpha158-dataset) since GeneralPtNN has a different stopping method compared to previous implementations. Specifically, GeneralPtNN controls training according to epochs, whereas previous methods were controlled by max_steps. diff --git a/examples/benchmarks/GeneralPtNN/workflow_config_gru.yaml b/examples/benchmarks/GeneralPtNN/workflow_config_gru.yaml new file mode 100755 index 0000000000..74900fc3fd --- /dev/null +++ b/examples/benchmarks/GeneralPtNN/workflow_config_gru.yaml @@ -0,0 +1,100 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: + - class: FilterCol + kwargs: + fields_group: feature + col_list: ["RESI5", "WVMA5", "RSQR5", "KLEN", "RSQR10", "CORR5", "CORD5", "CORR10", + "ROC60", "RESI10", "VSTD5", "RSQR60", "CORR60", "WVMA60", "STD5", + "RSQR20", "CORD60", "CORD10", "CORR20", "KLOW" + ] + - class: RobustZScoreNorm + kwargs: + fields_group: feature + clip_outlier: true + - class: Fillna + kwargs: + fields_group: feature + learn_processors: + - class: DropnaLabel + - class: CSRankNorm + kwargs: + fields_group: label + label: ["Ref($close, -2) / Ref($close, -1) - 1"] + +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: <PRED> + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: GeneralPTNN + module_path: qlib.contrib.model.pytorch_general_nn + kwargs: + n_epochs: 200 + lr: 2e-4 + early_stop: 10 + batch_size: 800 + metric: loss + loss: mse + n_jobs: 20 + GPU: 0 + pt_model_uri: "qlib.contrib.model.pytorch_gru_ts.GRUModel" +
pt_model_kwargs: { + "d_feat": 20, + "hidden_size": 64, + "num_layers": 2, + "dropout": 0., + } + dataset: + class: TSDatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + step_len: 20 + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: <MODEL> + dataset: <DATASET> + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/examples/benchmarks/GeneralPtNN/workflow_config_gru2mlp.yaml b/examples/benchmarks/GeneralPtNN/workflow_config_gru2mlp.yaml new file mode 100644 index 0000000000..3c2e4fabb1 --- /dev/null +++ b/examples/benchmarks/GeneralPtNN/workflow_config_gru2mlp.yaml @@ -0,0 +1,93 @@ +qlib_init: + provider_uri: "~/.qlib/qlib_data/cn_data" + region: cn +market: &market csi300 +benchmark: &benchmark
SH000300 +data_handler_config: &data_handler_config + start_time: 2008-01-01 + end_time: 2020-08-01 + fit_start_time: 2008-01-01 + fit_end_time: 2014-12-31 + instruments: *market + infer_processors: [ + { + "class" : "DropCol", + "kwargs":{"col_list": ["VWAP0"]} + }, + { + "class" : "CSZFillna", + "kwargs":{"fields_group": "feature"} + } + ] + learn_processors: [ + { + "class" : "DropCol", + "kwargs":{"col_list": ["VWAP0"]} + }, + { + "class" : "DropnaProcessor", + "kwargs":{"fields_group": "feature"} + }, + "DropnaLabel", + { + "class": "CSZScoreNorm", + "kwargs": {"fields_group": "label"} + } + ] + process_type: "independent" + +port_analysis_config: &port_analysis_config + strategy: + class: TopkDropoutStrategy + module_path: qlib.contrib.strategy + kwargs: + signal: + topk: 50 + n_drop: 5 + backtest: + start_time: 2017-01-01 + end_time: 2020-08-01 + account: 100000000 + benchmark: *benchmark + exchange_kwargs: + limit_threshold: 0.095 + deal_price: close + open_cost: 0.0005 + close_cost: 0.0015 + min_cost: 5 +task: + model: + class: GeneralPTNN + module_path: qlib.contrib.model.pytorch_general_nn + kwargs: + # FIXME: wrong parameters. + lr: 2e-3 + batch_size: 8192 + loss: mse + weight_decay: 0.0002 + optimizer: adam + pt_model_uri: "qlib.contrib.model.pytorch_nn.Net" + pt_model_kwargs: + input_dim: 157 + dataset: + class: DatasetH + module_path: qlib.data.dataset + kwargs: + handler: + class: Alpha158 + module_path: qlib.contrib.data.handler + kwargs: *data_handler_config + segments: + train: [2008-01-01, 2014-12-31] + valid: [2015-01-01, 2016-12-31] + test: [2017-01-01, 2020-08-01] + record: + - class: SignalRecord + module_path: qlib.workflow.record_temp + kwargs: + model: + dataset: + - class: SigAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + ana_long_short: False + ann_scaler: 252 + - class: PortAnaRecord + module_path: qlib.workflow.record_temp + kwargs: + config: *port_analysis_config diff --git a/qlib/contrib/model/pytorch_general_nn.py b/qlib/contrib/model/pytorch_general_nn.py new file mode 100644 index 0000000000..696a20254f --- /dev/null +++ b/qlib/contrib/model/pytorch_general_nn.py @@ -0,0 +1,353 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +from __future__ import division +from __future__ import print_function + +from torch.utils.data import DataLoader + + +import numpy as np +import pandas as pd +from typing import Union +import copy + +import torch +import torch.optim as optim + +from qlib.data.dataset.weight import Reweighter + +from .pytorch_utils import count_parameters +from ...model.base import Model +from ...data.dataset import DatasetH, TSDatasetH +from ...data.dataset.handler import DataHandlerLP +from ...utils import ( + init_instance_by_config, + get_or_create_path, +) +from ...log import get_module_logger + +from ...model.utils import ConcatDataset + + +class GeneralPTNN(Model): + """ + Motivation: + We want to provide a Qlib General Pytorch Model Adaptor + You can reuse it for all kinds of Pytorch models. 
+        It covers both the training and prediction process.
+
+    Parameters
+    ----------
+    metric : str
+        the evaluation metric used in early stop
+    optimizer : str
+        optimizer name
+    GPU : int
+        the GPU ID used for training
+    pt_model_uri : str
+        module path of the wrapped PyTorch model class
+    pt_model_kwargs : dict
+        keyword arguments used to construct the wrapped PyTorch model
+        (e.g. `d_feat`, the input dimension for each time step)
+    """
+
+    def __init__(
+        self,
+        n_epochs=200,
+        lr=0.001,
+        metric="",
+        batch_size=2000,
+        early_stop=20,
+        loss="mse",
+        weight_decay=0.0,
+        optimizer="adam",
+        n_jobs=10,
+        GPU=0,
+        seed=None,
+        pt_model_uri="qlib.contrib.model.pytorch_gru_ts.GRUModel",
+        pt_model_kwargs={
+            "d_feat": 6,
+            "hidden_size": 64,
+            "num_layers": 2,
+            "dropout": 0.0,
+        },
+    ):
+        # Set logger.
+        self.logger = get_module_logger("GeneralPTNN")
+        self.logger.info("GeneralPTNN pytorch version...")
+
+        # set hyper-parameters.
+        self.n_epochs = n_epochs
+        self.lr = lr
+        self.metric = metric
+        self.batch_size = batch_size
+        self.early_stop = early_stop
+        self.optimizer = optimizer.lower()
+        self.loss = loss
+        self.weight_decay = weight_decay
+        self.device = torch.device("cuda:%d" % (GPU) if torch.cuda.is_available() and GPU >= 0 else "cpu")
+        self.n_jobs = n_jobs
+        self.seed = seed
+
+        self.pt_model_uri, self.pt_model_kwargs = pt_model_uri, pt_model_kwargs
+        self.dnn_model = init_instance_by_config({"class": pt_model_uri, "kwargs": pt_model_kwargs})
+
+        self.logger.info(
+            "GeneralPTNN parameters setting:"
+            "\nn_epochs : {}"
+            "\nlr : {}"
+            "\nmetric : {}"
+            "\nbatch_size : {}"
+            "\nearly_stop : {}"
+            "\noptimizer : {}"
+            "\nloss_type : {}"
+            "\ndevice : {}"
+            "\nn_jobs : {}"
+            "\nuse_GPU : {}"
+            "\nweight_decay : {}"
+            "\nseed : {}"
+            "\npt_model_uri: {}"
+            "\npt_model_kwargs: {}".format(
+                n_epochs,
+                lr,
+                metric,
+                batch_size,
+                early_stop,
+                optimizer.lower(),
+                loss,
+                self.device,
+                n_jobs,
+                self.use_gpu,
+                weight_decay,
+                seed,
+                pt_model_uri,
+                pt_model_kwargs,
+            )
+        )
+
+        if self.seed is not None:
+            np.random.seed(self.seed)
+            torch.manual_seed(self.seed)
+
+        self.logger.info("model:\n{:}".format(self.dnn_model))
+        self.logger.info("model size: {:.4f} MB".format(count_parameters(self.dnn_model)))
+
+        if optimizer.lower() == "adam":
+            self.train_optimizer = optim.Adam(self.dnn_model.parameters(), lr=self.lr, weight_decay=weight_decay)
+        elif optimizer.lower() == "gd":
+            self.train_optimizer = optim.SGD(self.dnn_model.parameters(), lr=self.lr, weight_decay=weight_decay)
+        else:
+            raise NotImplementedError("optimizer {} is not supported!".format(optimizer))
+
+        self.fitted = False
+        self.dnn_model.to(self.device)
+
+    @property
+    def use_gpu(self):
+        return self.device != torch.device("cpu")
+
+    def mse(self, pred, label, weight):
+        loss = weight * (pred - label) ** 2
+        return torch.mean(loss)
+
+    def loss_fn(self, pred, label, weight=None):
+        mask = ~torch.isnan(label)
+
+        if weight is None:
+            weight = torch.ones_like(label)
+
+        if self.loss == "mse":
+            return self.mse(pred[mask], label[mask], weight[mask])
+
+        raise ValueError("unknown loss `%s`" % self.loss)
+
+    def metric_fn(self, pred, label):
+        mask = torch.isfinite(label)
+
+        if self.metric in ("", "loss"):
+            return -self.loss_fn(pred[mask], label[mask])
+
+        raise ValueError("unknown metric `%s`" % self.metric)
+
+    def _get_fl(self, data: torch.Tensor):
+        """
+        Get the feature and label from the data.
+        - Handles the different data shapes of time-series and tabular data.
+
+        Parameters
+        ----------
+        data : torch.Tensor
+            input data, which may be 3-dimensional or 2-dimensional
+            - 3dim: [batch_size, time_step, feature_dim]
+            - 2dim: [batch_size, feature_dim]
+
+        Returns
+        -------
+        Tuple[torch.Tensor,
torch.Tensor]
+        """
+        if data.dim() == 3:
+            # it is a time-series dataset
+            feature = data[:, :, 0:-1].to(self.device)
+            label = data[:, -1, -1].to(self.device)
+        elif data.dim() == 2:
+            # it is a tabular dataset
+            feature = data[:, 0:-1].to(self.device)
+            label = data[:, -1].to(self.device)
+        else:
+            raise ValueError("Unsupported data shape.")
+        return feature, label
+
+    def train_epoch(self, data_loader):
+        self.dnn_model.train()
+
+        for data, weight in data_loader:
+            feature, label = self._get_fl(data)
+
+            pred = self.dnn_model(feature.float())
+            loss = self.loss_fn(pred, label, weight.to(self.device))
+
+            self.train_optimizer.zero_grad()
+            loss.backward()
+            torch.nn.utils.clip_grad_value_(self.dnn_model.parameters(), 3.0)
+            self.train_optimizer.step()
+
+    def test_epoch(self, data_loader):
+        self.dnn_model.eval()
+
+        scores = []
+        losses = []
+
+        for data, weight in data_loader:
+            feature, label = self._get_fl(data)
+
+            with torch.no_grad():
+                pred = self.dnn_model(feature.float())
+                loss = self.loss_fn(pred, label, weight.to(self.device))
+                losses.append(loss.item())
+
+                score = self.metric_fn(pred, label)
+                scores.append(score.item())
+
+        return np.mean(losses), np.mean(scores)
+
+    def fit(
+        self,
+        dataset: Union[DatasetH, TSDatasetH],
+        evals_result=dict(),
+        save_path=None,
+        reweighter=None,
+    ):
+        ists = isinstance(dataset, TSDatasetH)  # is this a time-series dataset?
+
+        dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+        dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+        if dl_train.empty or dl_valid.empty:
+            raise ValueError("Empty data from dataset, please check your dataset config.")
+
+        if reweighter is None:
+            wl_train = np.ones(len(dl_train))
+            wl_valid = np.ones(len(dl_valid))
+        elif isinstance(reweighter, Reweighter):
+            wl_train = reweighter.reweight(dl_train)
+            wl_valid = reweighter.reweight(dl_valid)
+        else:
+            raise ValueError("Unsupported reweighter type.")
+
+        # Preprocess the data
to align it with the Dataset interface expected by DataLoader
+        if ists:
+            dl_train.config(fillna_type="ffill+bfill")  # handle NaNs introduced by the dataloader
+            dl_valid.config(fillna_type="ffill+bfill")  # handle NaNs introduced by the dataloader
+        else:
+            # If it is a tabular dataset, convert the dataframe to numpy so it is indexable by DataLoader
+            dl_train = dl_train.values
+            dl_valid = dl_valid.values
+
+        train_loader = DataLoader(
+            ConcatDataset(dl_train, wl_train),
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.n_jobs,
+            drop_last=True,
+        )
+        valid_loader = DataLoader(
+            ConcatDataset(dl_valid, wl_valid),
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.n_jobs,
+            drop_last=True,
+        )
+        del dl_train, dl_valid, wl_train, wl_valid
+
+        save_path = get_or_create_path(save_path)
+
+        stop_steps = 0
+        train_loss = 0
+        best_score = -np.inf
+        best_epoch = 0
+        evals_result["train"] = []
+        evals_result["valid"] = []
+
+        # train
+        self.logger.info("training...")
+        self.fitted = True
+
+        for step in range(self.n_epochs):
+            self.logger.info("Epoch%d:", step)
+            self.logger.info("training...")
+            self.train_epoch(train_loader)
+            self.logger.info("evaluating...")
+            train_loss, train_score = self.test_epoch(train_loader)
+            val_loss, val_score = self.test_epoch(valid_loader)
+            self.logger.info("train %.6f, valid %.6f" % (train_score, val_score))
+            evals_result["train"].append(train_score)
+            evals_result["valid"].append(val_score)
+
+            if step == 0:
+                best_param = copy.deepcopy(self.dnn_model.state_dict())
+            if val_score > best_score:
+                best_score = val_score
+                stop_steps = 0
+                best_epoch = step
+                best_param = copy.deepcopy(self.dnn_model.state_dict())
+            else:
+                stop_steps += 1
+                if stop_steps >= self.early_stop:
+                    self.logger.info("early stop")
+                    break
+
+        self.logger.info("best score: %.6lf @ %d" % (best_score, best_epoch))
+        self.dnn_model.load_state_dict(best_param)
+        torch.save(best_param, save_path)
+
+        if self.use_gpu:
+            torch.cuda.empty_cache()
+
+    def predict(self, dataset: Union[DatasetH, TSDatasetH]):
+        if not self.fitted:
+            raise ValueError("model is not fitted yet!")
+
+        dl_test = dataset.prepare("test", col_set=["feature", "label"], data_key=DataHandlerLP.DK_I)
+
+        if isinstance(dataset, TSDatasetH):
+            dl_test.config(fillna_type="ffill+bfill")  # handle NaNs introduced by the dataloader
+            index = dl_test.get_index()
+        else:
+            # If it is a tabular dataset, convert the dataframe to numpy so it is indexable by DataLoader
+            index = dl_test.index
+            dl_test = dl_test.values
+
+        test_loader = DataLoader(dl_test, batch_size=self.batch_size, num_workers=self.n_jobs)
+        self.dnn_model.eval()
+        preds = []
+
+        for data in test_loader:
+            feature, _ = self._get_fl(data)
+            feature = feature.to(self.device)
+
+            with torch.no_grad():
+                pred = self.dnn_model(feature.float()).detach().cpu().numpy()
+
+            preds.append(pred)
+
+        preds_concat = np.concatenate(preds)
+        if preds_concat.ndim != 1:
+            preds_concat = preds_concat.ravel()
+
+        return pd.Series(preds_concat, index=index)

diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py
index e0f883f094..3306115507 100755
--- a/qlib/contrib/model/pytorch_gru.py
+++ b/qlib/contrib/model/pytorch_gru.py
@@ -317,7 +317,6 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"):
 
 
 class GRUModel(nn.Module):
-
     def __init__(self, d_feat=6, hidden_size=64, num_layers=2, dropout=0.0):
         super().__init__()
 
diff --git a/tests/model/test_general_nn.py b/tests/model/test_general_nn.py
new file mode 100644
index 0000000000..dd695efcc5
--- /dev/null
+++
b/tests/model/test_general_nn.py @@ -0,0 +1,76 @@ +import unittest +from qlib.tests import TestAutoData + + +class TestNN(TestAutoData): + def test_both_dataset(self): + try: + from qlib.contrib.model.pytorch_general_nn import GeneralPTNN + from qlib.data.dataset import DatasetH, TSDatasetH + from qlib.data.dataset.handler import DataHandlerLP + except ImportError: + print("Import error.") + return + + data_handler_config = { + "start_time": "2008-01-01", + "end_time": "2020-08-01", + "instruments": "csi300", + "data_loader": { + "class": "QlibDataLoader", # Assuming QlibDataLoader is a string reference to the class + "kwargs": { + "config": { + "feature": [["$high", "$close", "$low"], ["H", "C", "L"]], + "label": [["Ref($close, -2)/Ref($close, -1) - 1"], ["LABEL0"]], + }, + "freq": "day", + }, + }, + # TODO: processors + "learn_processors": [ + { + "class": "DropnaLabel", + }, + {"class": "CSZScoreNorm", "kwargs": {"fields_group": "label"}}, + ], + } + segments = { + "train": ["2008-01-01", "2014-12-31"], + "valid": ["2015-01-01", "2016-12-31"], + "test": ["2017-01-01", "2020-08-01"], + } + data_handler = DataHandlerLP(**data_handler_config) + + # time-series dataset + tsds = TSDatasetH(handler=data_handler, segments=segments) + + # tabular dataset + tbds = DatasetH(handler=data_handler, segments=segments) + + model_l = [ + GeneralPTNN( + n_epochs=2, + pt_model_uri="qlib.contrib.model.pytorch_gru_ts.GRUModel", + pt_model_kwargs={ + "d_feat": 3, + "hidden_size": 8, + "num_layers": 1, + "dropout": 0.0, + }, + ), + GeneralPTNN( + n_epochs=2, + pt_model_uri="qlib.contrib.model.pytorch_nn.Net", # it is a MLP + pt_model_kwargs={ + "input_dim": 3, + }, + ), + ] + + for ds, model in list(zip((tsds, tbds), model_l)): + model.fit(ds) # It works + model.predict(ds) # It works + + +if __name__ == "__main__": + unittest.main() From b7ace1a6226136f63d3867f913ff7cec48a56d8c Mon Sep 17 00:00:00 2001 From: you-n-g Date: Fri, 9 Aug 2024 20:14:58 +0800 Subject: [PATCH 34/37] =?UTF-8?q?=F0=9F=94=A5LLM-driven=20Auto=20Quant=20F?= =?UTF-8?q?actory=F0=9F=94=A5=20(#1840)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update README.md * Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 773eeaf39b..d3c060036e 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ Recent released features | Feature | Status | | -- | ------ | +| 🔥LLM-driven Auto Quant Factory🔥 | 🚀 Released in [RD-Agent](https://github.com/microsoft/RD-Agent) on Aug 8, 2024 | | KRNN and Sandwich models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1414/) on May 26, 2023 | | Release Qlib v0.9.0 | :octocat: [Released](https://github.com/microsoft/qlib/releases/tag/v0.9.0) on Dec 9, 2022 | | RL Learning Framework | :hammer: :chart_with_upwards_trend: Released on Nov 10, 2022. 
[#1332](https://github.com/microsoft/qlib/pull/1332), [#1322](https://github.com/microsoft/qlib/pull/1322), [#1316](https://github.com/microsoft/qlib/pull/1316),[#1299](https://github.com/microsoft/qlib/pull/1299),[#1263](https://github.com/microsoft/qlib/pull/1263), [#1244](https://github.com/microsoft/qlib/pull/1244), [#1169](https://github.com/microsoft/qlib/pull/1169), [#1125](https://github.com/microsoft/qlib/pull/1125), [#1076](https://github.com/microsoft/qlib/pull/1076)| From 9e635168c09ec290dec2de1d4e84dbde1d4ace34 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Fri, 9 Aug 2024 20:23:13 +0800 Subject: [PATCH 35/37] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d3c060036e..547e286c95 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ Recent released features | Feature | Status | | -- | ------ | -| 🔥LLM-driven Auto Quant Factory🔥 | 🚀 Released in [RD-Agent](https://github.com/microsoft/RD-Agent) on Aug 8, 2024 | +| 🔥LLM-driven Auto Quant Factory🔥 | 🚀 Released in [♾️RD-Agent](https://github.com/microsoft/RD-Agent) on Aug 8, 2024 | | KRNN and Sandwich models | :chart_with_upwards_trend: [Released](https://github.com/microsoft/qlib/pull/1414/) on May 26, 2023 | | Release Qlib v0.9.0 | :octocat: [Released](https://github.com/microsoft/qlib/releases/tag/v0.9.0) on Dec 9, 2022 | | RL Learning Framework | :hammer: :chart_with_upwards_trend: Released on Nov 10, 2022. [#1332](https://github.com/microsoft/qlib/pull/1332), [#1322](https://github.com/microsoft/qlib/pull/1322), [#1316](https://github.com/microsoft/qlib/pull/1316),[#1299](https://github.com/microsoft/qlib/pull/1299),[#1263](https://github.com/microsoft/qlib/pull/1263), [#1244](https://github.com/microsoft/qlib/pull/1244), [#1169](https://github.com/microsoft/qlib/pull/1169), [#1125](https://github.com/microsoft/qlib/pull/1125), [#1076](https://github.com/microsoft/qlib/pull/1076)| From 82cf438401bc37524fbfa2a9ecb836e25b50346b Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Wed, 14 Aug 2024 14:59:28 +0800 Subject: [PATCH 36/37] fix break img (#1842) --- README.md | 16 ++++++++-------- qlib/contrib/model/pytorch_hist.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 547e286c95..9edcc10cce 100644 --- a/README.md +++ b/README.md @@ -309,19 +309,19 @@ Qlib provides a tool named `qrun` to run the whole workflow automatically (inclu 2. 
Graphical Reports Analysis: Run `examples/workflow_by_code.ipynb` with `jupyter notebook` to get graphical reports - Forecasting signal (model prediction) analysis - Cumulative Return of groups - ![Cumulative Return](http://fintech.msra.cn/images_v070/analysis/analysis_model_cumulative_return.png?v=0.1) + ![Cumulative Return](https://github.com/microsoft/qlib/blob/main/docs/_static/img/analysis/analysis_model_cumulative_return.png) - Return distribution - ![long_short](http://fintech.msra.cn/images_v070/analysis/analysis_model_long_short.png?v=0.1) + ![long_short](https://github.com/microsoft/qlib/blob/main/docs/_static/img/analysis/analysis_model_long_short.png) - Information Coefficient (IC) - ![Information Coefficient](http://fintech.msra.cn/images_v070/analysis/analysis_model_IC.png?v=0.1) - ![Monthly IC](http://fintech.msra.cn/images_v070/analysis/analysis_model_monthly_IC.png?v=0.1) - ![IC](http://fintech.msra.cn/images_v070/analysis/analysis_model_NDQ.png?v=0.1) + ![Information Coefficient](https://github.com/microsoft/qlib/blob/main/docs/_static/img/analysis/analysis_model_IC.png) + ![Monthly IC](https://github.com/microsoft/qlib/blob/main/docs/_static/img/analysis/analysis_model_monthly_IC.png) + ![IC](https://github.com/microsoft/qlib/blob/main/docs/_static/img/analysis/analysis_model_NDQ.png) - Auto Correlation of forecasting signal (model prediction) - ![Auto Correlation](http://fintech.msra.cn/images_v070/analysis/analysis_model_auto_correlation.png?v=0.1) + ![Auto Correlation](https://github.com/microsoft/qlib/blob/main/docs/_static/img/analysis/analysis_model_auto_correlation.png) - Portfolio analysis - Backtest return - ![Report](http://fintech.msra.cn/images_v070/analysis/report.png?v=0.1) + ![Report](https://github.com/microsoft/qlib/blob/main/docs/_static/img/analysis/report.png)
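---

A note on the GeneralPtNN patch above: the trick that lets `GeneralPTNN` serve both time-series and tabular data is the rank check in `_get_fl`. A 3-D batch is read as `[batch_size, time_step, feature_dim]` with the label taken from the last column of the last time step, while a 2-D batch is read as `[batch_size, feature_dim]` with the label in the last column. The minimal sketch below replays that convention outside the class to make the two layouts concrete; `split_features_and_label` is a hypothetical standalone helper used only for illustration and is not part of the patch.

```python
import torch


def split_features_and_label(data: torch.Tensor):
    """Mirror the shape convention of GeneralPTNN._get_fl (illustrative only)."""
    if data.dim() == 3:
        # time-series batch: [batch_size, time_step, feature_dim]
        feature = data[:, :, 0:-1]  # every time step, all columns except the label
        label = data[:, -1, -1]     # label of the last time step
    elif data.dim() == 2:
        # tabular batch: [batch_size, feature_dim]
        feature = data[:, 0:-1]
        label = data[:, -1]
    else:
        raise ValueError("Unsupported data shape.")
    return feature, label


# A TSDatasetH-style batch: 4 samples, 20 time steps, 3 features + 1 label column.
ts_batch = torch.randn(4, 20, 4)
feat, lab = split_features_and_label(ts_batch)
assert feat.shape == (4, 20, 3) and lab.shape == (4,)

# A DatasetH-style (tabular) batch: 4 samples, 3 features + 1 label column.
tab_batch = torch.randn(4, 4)
feat, lab = split_features_and_label(tab_batch)
assert feat.shape == (4, 3) and lab.shape == (4,)
```

Because the split happens per batch, the same training loop works unchanged whether `pt_model_uri` points at a sequence model such as `qlib.contrib.model.pytorch_gru_ts.GRUModel` or a tabular `qlib.contrib.model.pytorch_nn.Net`, which is exactly what `tests/model/test_general_nn.py` exercises.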