diff --git a/scripts/data_collector/baostock_5min/README.md b/scripts/data_collector/baostock_5min/README.md index e593ea2e49..00264d3430 100644 --- a/scripts/data_collector/baostock_5min/README.md +++ b/scripts/data_collector/baostock_5min/README.md @@ -23,6 +23,7 @@ 1. download data to csv: `python scripts/data_collector/baostock_5min/collector.py download_data` This will download the raw data such as date, symbol, open, high, low, close, volume, amount, adjustflag from baostock to a local directory. One file per symbol. + - parameters: - `source_dir`: save the directory - `interval`: `5min` @@ -39,6 +40,7 @@ This will: 1. Normalize high, low, close, open price using adjclose. 2. Normalize the high, low, close, open price so that the first valid trading date's close price is 1. + - parameters: - `source_dir`: csv directory - `normalize_dir`: result directory @@ -62,7 +64,7 @@ 3. dump data: `python scripts/dump_bin.py dump_all` This will convert the normalized csv in `feature` directory as numpy array and store the normalized data one file per column and one symbol per directory. - + - parameters: - `csv_path`: stock data path or directory, **normalize result(normalize_dir)** - `qlib_dir`: qlib(dump) data director @@ -78,4 +80,5 @@ ```bash # dump 5min cn python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/hs300_5min_nor --qlib_dir ~/.qlib/qlib_data/hs300_5min_bin --freq 5min --exclude_fields date,symbol - ``` \ No newline at end of file + ``` + > **Note**: Make sure to run `normalize_data` before using `dump_bin.py` to avoid conversion errors. 
diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py index a65b1f58ee..6f368e62b4 100644 --- a/scripts/dump_bin.py +++ b/scripts/dump_bin.py @@ -47,7 +47,7 @@ def __init__( limit_nums: int = None, ): """ - + Parameters ---------- csv_path: str @@ -223,6 +223,8 @@ def _data_to_bin(self, df: pd.DataFrame, calendar_list: List[pd.Timestamp], feat # used when creating a bin file date_index = self.get_datetime_index(_df, calendar_list) for field in self.get_dump_fields(_df.columns): + if field == self.symbol_field_name: + continue bin_path = features_dir.joinpath(f"{field.lower()}.{self.freq}{self.DUMP_FILE_SUFFIX}") if field not in _df.columns: continue