From eebc67f11179fb59ed0733d6555b354f5e0d9bfa Mon Sep 17 00:00:00 2001
From: Nathan Bransby <brnsb.dev@gmail.com>
Date: Wed, 4 Dec 2024 01:25:48 +0800
Subject: [PATCH] Fix ValueError in dump_bin.py by excluding 'symbol' field
 from conversion

Fixes #1852

Modify `scripts/dump_bin.py` so that the symbol column (string values such as 'SH600000') is skipped instead of being converted to float, which raised the ValueError.

* **Exclude 'symbol' field from conversion**:
  - Modify `_data_to_bin` method to exclude the 'symbol' field from conversion to float.
  - Add a check to ensure 'symbol' field is not included in the fields to be converted.

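* **`normalize_data` method (not modified by this patch)**:
  - The `normalize_data` method in `scripts/data_collector/baostock_5min/collector.py` is expected to process data without converting 'symbol' to float.
  - The `normalize_baostock` method is expected to retain the 'symbol' field as a string.
  - Note: `collector.py` is not among the files changed here; only the README and `scripts/dump_bin.py` are touched.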

* **Documentation update**:
  - In `scripts/data_collector/baostock_5min/README.md`, add a note that `normalize_data` must be run before `dump_bin.py`.
---
 scripts/data_collector/baostock_5min/README.md | 7 +++++--
 scripts/dump_bin.py                            | 4 +++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/scripts/data_collector/baostock_5min/README.md b/scripts/data_collector/baostock_5min/README.md
index e593ea2e49..00264d3430 100644
--- a/scripts/data_collector/baostock_5min/README.md
+++ b/scripts/data_collector/baostock_5min/README.md
@@ -23,6 +23,7 @@
   1. download data to csv: `python scripts/data_collector/baostock_5min/collector.py download_data`
      
      This will download the raw data such as date, symbol, open, high, low, close, volume, amount, adjustflag from baostock to a local directory. One file per symbol.
+     
      - parameters:
           - `source_dir`: save the directory
           - `interval`: `5min`
@@ -39,6 +40,7 @@
      This will:
      1. Normalize high, low, close, open price using adjclose.
      2. Normalize the high, low, close, open price so that the first valid trading date's close price is 1. 
+
      - parameters:
           - `source_dir`: csv directory
           - `normalize_dir`: result directory
@@ -62,7 +64,7 @@
   3. dump data: `python scripts/dump_bin.py dump_all`
     
      This will convert the normalized csv in `feature` directory as numpy array and store the normalized data one file per column and one symbol per directory. 
-    
+     
      - parameters:
        - `csv_path`: stock data path or directory, **normalize result(normalize_dir)**
        - `qlib_dir`: qlib(dump) data director
@@ -78,4 +80,5 @@
        ```bash
        # dump 5min cn
        python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/hs300_5min_nor --qlib_dir ~/.qlib/qlib_data/hs300_5min_bin --freq 5min --exclude_fields date,symbol
-       ```
\ No newline at end of file
+       ```
+       > **Note**: Make sure to run `normalize_data` before using `dump_bin.py` to avoid conversion errors.
diff --git a/scripts/dump_bin.py b/scripts/dump_bin.py
index a65b1f58ee..6f368e62b4 100644
--- a/scripts/dump_bin.py
+++ b/scripts/dump_bin.py
@@ -47,7 +47,7 @@ def __init__(
         limit_nums: int = None,
     ):
         """
-
+        
         Parameters
         ----------
         csv_path: str
@@ -223,6 +223,8 @@ def _data_to_bin(self, df: pd.DataFrame, calendar_list: List[pd.Timestamp], feat
         # used when creating a bin file
         date_index = self.get_datetime_index(_df, calendar_list)
         for field in self.get_dump_fields(_df.columns):
+            if field == self.symbol_field_name:
+                continue
             bin_path = features_dir.joinpath(f"{field.lower()}.{self.freq}{self.DUMP_FILE_SUFFIX}")
             if field not in _df.columns:
                 continue