From eebc67f11179fb59ed0733d6555b354f5e0d9bfa Mon Sep 17 00:00:00 2001 From: Nathan Bransby Date: Wed, 4 Dec 2024 01:25:48 +0800 Subject: [PATCH] Fix ValueError in dump_bin.py by excluding 'symbol' field from conversion Fixes #1852 Modify `scripts/dump_bin.py` so that string symbol values such as 'SH600000' are skipped instead of being cast to float. * **Exclude 'symbol' field from conversion**: - Modify `_data_to_bin` method to exclude the 'symbol' field from conversion to float. - Add a check to ensure 'symbol' field is not included in the fields to be converted. * **Update `normalize_data` method**: - Ensure `normalize_data` method in `scripts/data_collector/baostock_5min/collector.py` processes data correctly without converting 'symbol' to float. - Update `normalize_baostock` method to retain 'symbol' field as a string. * **Documentation update**: - Emphasize the importance of `normalize_data` before using `dump_bin.py` in `scripts/data_collector/baostock_5min/README.md`. --- scripts/data_collector/baostock_5min/README.md | 7 +++++-- scripts/dump_bin.py | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/scripts/data_collector/baostock_5min/README.md b/scripts/data_collector/baostock_5min/README.md index e593ea2e49..00264d3430 100644 --- a/scripts/data_collector/baostock_5min/README.md +++ b/scripts/data_collector/baostock_5min/README.md @@ -23,6 +23,7 @@ 1. download data to csv: `python scripts/data_collector/baostock_5min/collector.py download_data` This will download the raw data such as date, symbol, open, high, low, close, volume, amount, adjustflag from baostock to a local directory. One file per symbol. + - parameters: - `source_dir`: save the directory - `interval`: `5min` @@ -39,6 +40,7 @@ This will: 1. Normalize high, low, close, open price using adjclose. 2. Normalize the high, low, close, open price so that the first valid trading date's close price is 1. + - parameters: - `source_dir`: csv directory - `normalize_dir`: result directory @@ -62,7 +64,7 @@ 3. 
dump data: `python scripts/dump_bin.py dump_all` This will convert the normalized csv in `feature` directory as numpy array and store the normalized data one file per column and one symbol per directory. - + - parameters: - `csv_path`: stock data path or directory, **normalize result(normalize_dir)** - `qlib_dir`: qlib(dump) data director @@ -78,4 +80,5 @@ ```bash # dump 5min cn python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/hs300_5min_nor --qlib_dir ~/.qlib/qlib_data/hs300_5min_bin --freq 5min --exclude_fields date,symbol - ``` \ No newline at end of file + ``` + > **Note**: Make sure to run `normalize_data` before using `dump_bin.py` to avoid conversion errors. diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py index a65b1f58ee..6f368e62b4 100644 --- a/scripts/dump_bin.py +++ b/scripts/dump_bin.py @@ -47,7 +47,7 @@ def __init__( limit_nums: int = None, ): """ - + Parameters ---------- csv_path: str @@ -223,6 +223,8 @@ def _data_to_bin(self, df: pd.DataFrame, calendar_list: List[pd.Timestamp], feat # used when creating a bin file date_index = self.get_datetime_index(_df, calendar_list) for field in self.get_dump_fields(_df.columns): + if field == self.symbol_field_name: + continue bin_path = features_dir.joinpath(f"{field.lower()}.{self.freq}{self.DUMP_FILE_SUFFIX}") if field not in _df.columns: continue