Skip to content

Commit

Permalink
update code
Browse files Browse the repository at this point in the history
  • Loading branch information
SunsetWolf committed Jan 9, 2025
1 parent 22d388a commit 5515a64
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 27 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ We recommend users to prepare their own data if they have a high-quality dataset
```
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --missing_data_num 30055 --large_step_threshold_volume 94485 --large_step_threshold_price 20
```
* For more information about `check_data_health`, see the [documentation](https://qlib.readthedocs.io/en/latest/component/data.html#checking-the-health-of-the-data).
<!--
- Run the initialization code and get stock data:
Expand Down
2 changes: 1 addition & 1 deletion docs/component/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ Checking the health of the data
for 1min data:
.. code-block:: bash
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data_1min --freq 1min --missing_data_num 30055 --large_step_threshold_volume 94485 --large_step_threshold_price 20
python scripts/check_data_health.py check_data --qlib_dir ~/.qlib/qlib_data/cn_data --freq 1min --missing_data_num 35806 --large_step_threshold_volume 3205452000000 --large_step_threshold_price 0.91
Stock Pool (Market)
-------------------
Expand Down
54 changes: 28 additions & 26 deletions scripts/check_data_health.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import enum
from loguru import logger
import os
from typing import List, Optional, Tuple
from typing import Optional

import fire
import pandas as pd
Expand All @@ -19,12 +19,6 @@ class DataHealthChecker:
- any factor is missing
"""

# Categories of problem that a data health check can report; each member is
# a distinct integer tag.
# NOTE(review): the meaning of each member is inferred from its name only --
# confirm against the code that produces and consumes these values.
class DataProblem(enum.Enum):
MISSING_REQUIRED_COLUMN = 1
MISSING_DATA = 2
LARGE_STEP_CHANGE = 3
MISSING_FACTOR = 4

def __init__(
self,
csv_path=None,
Expand Down Expand Up @@ -57,7 +51,7 @@ def __init__(

def load_qlib_data(self):
instruments = D.instruments(market="all")
instrument_list = D.list_instruments(instruments=instruments, as_list=True)
instrument_list = D.list_instruments(instruments=instruments, as_list=True, freq=self.freq)
required_fields = ["$open", "$close", "$low", "$high", "$volume", "$factor"]
for instrument in instrument_list:
df = D.features([instrument], required_fields, freq=self.freq)
Expand All @@ -73,8 +67,9 @@ def load_qlib_data(self):
inplace=True,
)
self.data[instrument] = df
print(df)

def check_missing_data(self) -> Optional[Tuple[DataProblem, List[str]]]:
def check_missing_data(self) -> Optional[pd.DataFrame]:
"""Check if any data is missing in the DataFrame."""
result_dict = {
"instruments": [],
Expand All @@ -99,8 +94,9 @@ def check_missing_data(self) -> Optional[Tuple[DataProblem, List[str]]]:
return result_df
else:
logger.info(f"✅ There are no missing data.")
return None

def check_large_step_changes(self) -> Optional[Tuple[DataProblem, List[str]]]:
def check_large_step_changes(self) -> Optional[pd.DataFrame]:
"""Check if there are any large step changes above the threshold in the OHLCV columns."""
result_dict = {
"instruments": [],
Expand All @@ -127,8 +123,9 @@ def check_large_step_changes(self) -> Optional[Tuple[DataProblem, List[str]]]:
return result_df
else:
logger.info(f"✅ There are no large step changes in the OHLCV column above the threshold.")
return None

def check_required_columns(self) -> Optional[Tuple[DataProblem, List[str]]]:
def check_required_columns(self) -> Optional[pd.DataFrame]:
"""Check if any of the required columns (OLHCV) are missing in the DataFrame."""
required_columns = ["open", "high", "low", "close", "volume"]
result_dict = {
Expand All @@ -146,15 +143,18 @@ def check_required_columns(self) -> Optional[Tuple[DataProblem, List[str]]]:
return result_df
else:
logger.info(f"✅ The columns (OLHCV) are complete and not missing.")
return None

def check_missing_factor(self) -> Optional[Tuple[DataProblem, List[str]]]:
def check_missing_factor(self) -> Optional[pd.DataFrame]:
"""Check if the 'factor' column is missing in the DataFrame."""
result_dict = {
"instruments": [],
"missing_factor_col": [],
"missing_factor_data": [],
}
for filename, df in self.data.items():
if "000300" in filename or "000903" in filename or "000905" in filename:
continue
if "factor" not in df.columns:
result_dict["instruments"].append(filename)
result_dict["missing_factor_col"].append(True)
Expand All @@ -171,26 +171,28 @@ def check_missing_factor(self) -> Optional[Tuple[DataProblem, List[str]]]:
return result_df
else:
logger.info(f"✅ The `factor` column already exists and is not empty.")
return None

def check_data(self):
    """Run every health check and print a summary of the problems found.

    Calls ``check_missing_data``, ``check_large_step_changes``,
    ``check_required_columns`` and ``check_missing_factor``, in that order.
    Each check returns either a ``pandas.DataFrame`` describing the problems
    it found or ``None``.  When at least one check reports a problem, a
    summary header is printed, followed by a warning and the result frame
    for each failing check.  When every check returns ``None`` this method
    prints nothing itself.

    Returns:
        None.
    """
    check_missing_data_result = self.check_missing_data()
    check_large_step_changes_result = self.check_large_step_changes()
    check_required_columns_result = self.check_required_columns()
    check_missing_factor_result = self.check_missing_factor()

    # All four results must take part in this test: leaving one out would
    # hide the summary whenever that check is the only one reporting a
    # problem (e.g. missing data with no other issue).
    has_problem = any(
        result is not None
        for result in (
            check_missing_data_result,
            check_large_step_changes_result,
            check_required_columns_result,
            check_missing_factor_result,
        )
    )
    if not has_problem:
        return

    print(f"\nSummary of data health check ({len(self.data)} files checked):")
    print("-------------------------------------------------")
    if isinstance(check_missing_data_result, pd.DataFrame):
        logger.warning("There is missing data.")
        print(check_missing_data_result)
    if isinstance(check_large_step_changes_result, pd.DataFrame):
        logger.warning("The OHLCV column has large step changes.")
        print(check_large_step_changes_result)
    if isinstance(check_required_columns_result, pd.DataFrame):
        logger.warning("Columns (OLHCV) are missing.")
        print(check_required_columns_result)
    if isinstance(check_missing_factor_result, pd.DataFrame):
        logger.warning("The factor column does not exist or is empty")
        print(check_missing_factor_result)


if __name__ == "__main__":
Expand Down

0 comments on commit 5515a64

Please sign in to comment.