代码修复 (16 个模块): - GARCH 模型统一改用 t 分布 + 收敛检查 (returns/volatility/anomaly) - KS 检验替换为 Lilliefors 检验 (returns) - 修复数据泄漏: StratifiedKFold→TimeSeriesSplit, scaler 逐折 fit (anomaly) - 前兆标签 shift(-1) 预测次日异常 (anomaly) - PSD 归一化加入采样频率和单边谱×2 (fft) - AR(1) 红噪声基线经验缩放 (fft) - 盒计数法独立 x/y 归一化, MF-DFA q=0 (fractal) - ADF 平稳性检验 + 移除双重 Bonferroni (causality) - R/S Hurst 添加 R² 拟合优度 (hurst) - Prophet 递推预测避免信息泄露 (time_series) - IC 计算过滤零信号, 中性形态 hit_rate=NaN (indicators/patterns) - 聚类阈值自适应化 (clustering) - 日历效应前后半段稳健性检查 (calendar) - 证据评分标准文本与代码对齐 (visualization) - 核心管道 NaN/空值防护 (data_loader/preprocessing/main) 报告修复 (docs/REPORT.md, 15 处): - 标度指数 H_scaling 与 Hurst 指数消歧 - GBM 6 个月概率锥数值重算 - CLT 限定、减半措辞弱化、情景概率逻辑修正 - GPD 形状参数解读修正、异常 AUC 证据降级 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
93 lines
3.1 KiB
Python
93 lines
3.1 KiB
Python
"""数据预处理模块 - 收益率、去趋势、标准化、衍生指标"""
|
||
|
||
import pandas as pd
|
||
import numpy as np
|
||
from typing import Optional
|
||
|
||
|
||
def log_returns(prices: pd.Series) -> pd.Series:
    """Compute logarithmic returns of a price series.

    Each value is ``log(p_t / p_{t-1})``. Rows that evaluate to NaN
    (including the first row, which has no predecessor) are dropped.
    """
    previous = prices.shift(1)
    price_ratio = prices / previous
    log_ret = np.log(price_ratio)
    return log_ret.dropna()
|
||
|
||
|
||
def simple_returns(prices: pd.Series) -> pd.Series:
    """Compute simple (percentage-change) returns of a price series.

    Delegates to ``Series.pct_change`` and drops the resulting NaN rows,
    so the first observation is never present in the output.
    """
    pct = prices.pct_change()
    return pct.dropna()
|
||
|
||
|
||
def detrend_log_diff(prices: pd.Series) -> pd.Series:
    """Detrend a price series by first-differencing its natural log.

    NaN rows produced by the differencing (at least the first row) are
    dropped from the result.
    """
    log_prices = np.log(prices)
    differenced = log_prices.diff()
    return differenced.dropna()
|
||
|
||
|
||
def detrend_linear(series: pd.Series) -> pd.Series:
    """Remove a least-squares linear trend from a series, ignoring NaNs.

    The line is fitted only on the non-missing observations, using their
    positions in the *original* series as the x-coordinate, so gaps in the
    data do not distort the slope. The fitted trend is then subtracted at
    every position; entries that were missing stay NaN.

    Args:
        series: Input series, possibly containing NaN values.

    Returns:
        A series with the same index as ``series`` holding the residuals
        around the fitted line. When fewer than two valid observations
        exist no line can be fitted and the mean-centered series is
        returned instead.
    """
    valid = series.notna().to_numpy()
    if valid.sum() < 2:
        return series - series.mean()

    positions = np.arange(len(series))
    raw = series.values
    # Fit on the true positions of the valid points; compacting them to
    # 0..k-1 would misalign the fit with the full-length evaluation below.
    coeffs = np.polyfit(positions[valid], raw[valid], 1)
    trend = np.polyval(coeffs, positions)
    return pd.Series(raw - trend, index=series.index)
|
||
|
||
|
||
def hp_filter(series: pd.Series, lamb: float = 1600) -> tuple:
    """Apply the Hodrick-Prescott filter to a series.

    NaN values are dropped before filtering.

    Args:
        series: Input series.
        lamb: Smoothing parameter forwarded to statsmodels' ``hpfilter``.

    Returns:
        A ``(cycle, trend)`` tuple as produced by ``hpfilter``.
    """
    # Imported lazily so statsmodels is only needed when this is called.
    from statsmodels.tsa.filters.hp_filter import hpfilter

    observed = series.dropna()
    cyclical, smooth = hpfilter(observed, lamb=lamb)
    return cyclical, smooth
|
||
|
||
|
||
def rolling_volatility(returns: pd.Series, window: int = 30, periods_per_year: int = 365) -> pd.Series:
    """Compute annualized rolling volatility.

    Args:
        returns: Series of returns.
        window: Number of observations in each rolling window.
        periods_per_year: Annualization factor; the rolling standard
            deviation is multiplied by its square root.

    Returns:
        Rolling standard deviation scaled by ``sqrt(periods_per_year)``.
        Positions without a full window are NaN.
    """
    windowed_std = returns.rolling(window=window).std()
    annualizer = np.sqrt(periods_per_year)
    return windowed_std * annualizer
|
||
|
||
|
||
def realized_volatility(returns: pd.Series, window: int = 30) -> pd.Series:
    """Compute realized volatility over a rolling window.

    Defined here as the square root of the rolling sum of squared
    returns. Positions without a full window are NaN.
    """
    squared = returns ** 2
    rolling_sum_sq = squared.rolling(window=window).sum()
    return np.sqrt(rolling_sum_sq)
|
||
|
||
|
||
def taker_buy_ratio(df: pd.DataFrame) -> pd.Series:
    """Compute the ratio of taker buy volume to total volume.

    Reads the ``taker_buy_volume`` and ``volume`` columns. Zero total
    volume is replaced by NaN first, so those rows yield NaN rather than
    a division by zero.
    """
    total_volume = df["volume"].replace(0, np.nan)
    taker_volume = df["taker_buy_volume"]
    return taker_volume / total_volume
|
||
|
||
|
||
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with commonly used derived columns.

    Reads the ``open``, ``high``, ``low``, ``close``, ``volume`` and
    ``taker_buy_volume`` columns and appends, in this order:
    ``log_return``, ``simple_return``, ``log_price``, ``range_pct``,
    ``body_pct``, ``taker_buy_ratio``, ``vol_30d``, ``vol_7d``,
    ``volume_ma20``, ``volume_ratio``, ``abs_return``, ``squared_return``.

    Note:
        The leading rows (up to the first 30) contain NaN in some columns
        because of the rolling-window calculations; downstream modules
        are expected to handle these as needed.
    """
    close = df["close"]
    open_ = df["open"]
    volume = df["volume"]

    enriched = df.copy()

    # Return and price-level features.
    enriched["log_return"] = log_returns(close)
    enriched["simple_return"] = simple_returns(close)
    enriched["log_price"] = np.log(close)

    # Candle-shape features.
    enriched["range_pct"] = (df["high"] - df["low"]) / close
    enriched["body_pct"] = (close - open_) / open_

    enriched["taker_buy_ratio"] = taker_buy_ratio(df)

    # Volatility is computed from the index-aligned log-return column.
    enriched["vol_30d"] = rolling_volatility(enriched["log_return"], 30)
    enriched["vol_7d"] = rolling_volatility(enriched["log_return"], 7)

    # Volume relative to its 20-period moving average.
    enriched["volume_ma20"] = volume.rolling(20).mean()
    enriched["volume_ratio"] = volume / enriched["volume_ma20"]

    enriched["abs_return"] = enriched["log_return"].abs()
    enriched["squared_return"] = enriched["log_return"] ** 2
    return enriched
|
||
|
||
|
||
def standardize(series: pd.Series) -> pd.Series:
    """Z-score standardize a series.

    Returns ``(series - mean) / std``. When the standard deviation is
    zero or NaN, an all-zero float series on the same index is returned
    instead.
    """
    sigma = series.std()
    degenerate = np.isnan(sigma) or sigma == 0
    if degenerate:
        return pd.Series(0.0, index=series.index)
    centered = series - series.mean()
    return centered / sigma
|
||
|
||
|
||
def winsorize(series: pd.Series, lower: float = 0.01, upper: float = 0.99) -> pd.Series:
    """Winsorize a series by clipping it to two of its own quantiles.

    Args:
        series: Input series.
        lower: Quantile level used as the lower clipping bound.
        upper: Quantile level used as the upper clipping bound.

    Returns:
        The series with values below the ``lower`` quantile or above the
        ``upper`` quantile replaced by the respective bound.
    """
    floor = series.quantile(lower)
    ceiling = series.quantile(upper)
    return series.clip(lower=floor, upper=ceiling)
|