Files
btc_price_anany/src/preprocessing.py
riba2534 d480712b40 fix: 全面修复代码质量和报告准确性问题
代码修复 (16 个模块):
- GARCH 模型统一改用 t 分布 + 收敛检查 (returns/volatility/anomaly)
- KS 检验替换为 Lilliefors 检验 (returns)
- 修复数据泄漏: StratifiedKFold→TimeSeriesSplit, scaler 逐折 fit (anomaly)
- 前兆标签 shift(-1) 预测次日异常 (anomaly)
- PSD 归一化加入采样频率和单边谱×2 (fft)
- AR(1) 红噪声基线经验缩放 (fft)
- 盒计数法独立 x/y 归一化, MF-DFA q=0 (fractal)
- ADF 平稳性检验 + 移除双重 Bonferroni (causality)
- R/S Hurst 添加 R² 拟合优度 (hurst)
- Prophet 递推预测避免信息泄露 (time_series)
- IC 计算过滤零信号, 中性形态 hit_rate=NaN (indicators/patterns)
- 聚类阈值自适应化 (clustering)
- 日历效应前后半段稳健性检查 (calendar)
- 证据评分标准文本与代码对齐 (visualization)
- 核心管道 NaN/空值防护 (data_loader/preprocessing/main)

报告修复 (docs/REPORT.md, 15 处):
- 标度指数 H_scaling 与 Hurst 指数消歧
- GBM 6 个月概率锥数值重算
- CLT 限定、减半措辞弱化、情景概率逻辑修正
- GPD 形状参数解读修正、异常 AUC 证据降级

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 01:07:50 +08:00

93 lines
3.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""数据预处理模块 - 收益率、去趋势、标准化、衍生指标"""
import pandas as pd
import numpy as np
from typing import Optional
def log_returns(prices: pd.Series) -> pd.Series:
    """Compute log returns of a price series.

    Each value is the natural logarithm of the ratio between a price and
    the price one position earlier. NaN results (including the first
    entry, which has no predecessor) are dropped from the output.
    """
    previous = prices.shift(1)
    ratio = prices / previous
    return np.log(ratio).dropna()
def simple_returns(prices: pd.Series) -> pd.Series:
    """Compute simple (arithmetic) returns of a price series.

    Each value is ``price[t] / price[t-1] - 1``. NaN results (the first
    entry and any entry adjacent to a missing price) are dropped.

    Missing prices are NOT forward-filled: a gap yields no return rather
    than an artificial 0% return followed by a return measured against a
    stale price. This keeps the output aligned with ``np.log(p / p.shift(1))``
    style log returns, which also drop rows around gaps.
    """
    # fill_method=None disables pandas' legacy implicit forward-fill, whose
    # default is deprecated and would otherwise fabricate returns over gaps.
    return prices.pct_change(fill_method=None).dropna()
def detrend_log_diff(prices: pd.Series) -> pd.Series:
    """Detrend a price series by first-differencing its natural log.

    NaN results (including the first entry, which has no predecessor)
    are dropped from the output.
    """
    log_prices = np.log(prices)
    differenced = log_prices.diff()
    return differenced.dropna()
def detrend_linear(series: pd.Series) -> pd.Series:
    """Remove a least-squares linear trend from a series, ignoring NaN.

    The line is fitted only on the non-NaN observations, using each
    observation's actual position in ``series`` as its x-coordinate, and
    is then subtracted at every position. NaN inputs stay NaN in the
    output, and the original index is preserved.

    If fewer than two non-NaN values are available no line can be
    fitted; the series minus its mean is returned instead.
    """
    valid = series.notna().to_numpy()
    clean = series.dropna()
    if len(clean) < 2:
        return series - series.mean()

    positions = np.arange(len(series))
    # Fit against the true positions of the valid points. Re-numbering the
    # valid points 0..k-1 would compress gaps and distort slope/intercept
    # whenever NaNs sit inside the series.
    coeffs = np.polyfit(positions[valid], clean.values, 1)
    trend = np.polyval(coeffs, positions)
    return pd.Series(series.values - trend, index=series.index)
def hp_filter(series: pd.Series, lamb: float = 1600) -> tuple:
    """Decompose a series with the Hodrick-Prescott filter.

    NaN entries are dropped before filtering, and ``lamb`` is forwarded
    unchanged to statsmodels' ``hpfilter``.

    Returns:
        A ``(cycle, trend)`` tuple, in that order.
    """
    # Function-scope import: statsmodels is only loaded when this is called.
    from statsmodels.tsa.filters.hp_filter import hpfilter

    observed = series.dropna()
    cycle_component, trend_component = hpfilter(observed, lamb=lamb)
    return cycle_component, trend_component
def rolling_volatility(returns: pd.Series, window: int = 30, periods_per_year: int = 365) -> pd.Series:
    """Compute annualized rolling volatility.

    Takes the rolling standard deviation of ``returns`` over ``window``
    observations and scales it by ``sqrt(periods_per_year)``. Positions
    without a full window are NaN.
    """
    windowed_std = returns.rolling(window=window).std()
    annualization = np.sqrt(periods_per_year)
    return windowed_std * annualization
def realized_volatility(returns: pd.Series, window: int = 30) -> pd.Series:
    """Compute realized volatility over a rolling window.

    The result is the square root of the rolling sum of squared returns
    over ``window`` observations. Positions without a full window are NaN.
    """
    squared = returns ** 2
    windowed_sum = squared.rolling(window=window).sum()
    return np.sqrt(windowed_sum)
def taker_buy_ratio(df: pd.DataFrame) -> pd.Series:
    """Compute the taker-buy share of total volume.

    Divides the ``taker_buy_volume`` column by the ``volume`` column.
    Rows whose volume is exactly 0 produce NaN, because zeros in the
    denominator are replaced with NaN before dividing.
    """
    taker_volume = df["taker_buy_volume"]
    total_volume = df["volume"].replace(0, np.nan)
    return taker_volume / total_volume
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with commonly used derived columns added.

    The input frame is not modified. It is read for the columns
    ``close``, ``open``, ``high``, ``low``, ``volume`` and (via
    ``taker_buy_ratio``) ``taker_buy_volume``.

    Added columns: ``log_return``, ``simple_return``, ``log_price``,
    ``range_pct``, ``body_pct``, ``taker_buy_ratio``, ``vol_30d``,
    ``vol_7d``, ``volume_ma20``, ``volume_ratio``, ``abs_return`` and
    ``squared_return``.

    Note: the return and rolling-window columns contain NaN in their
    leading rows (up to the first 30 rows for ``vol_30d``). Downstream
    modules are expected to handle those rows as they see fit.
    """
    out = df.copy()
    close = df["close"]

    # Return series; assignment aligns them on the index, so the first row is NaN.
    out["log_return"] = log_returns(close)
    out["simple_return"] = simple_returns(close)
    out["log_price"] = np.log(close)

    # Candle shape: high-low range relative to close, open-to-close body relative to open.
    out["range_pct"] = (df["high"] - df["low"]) / close
    open_price = df["open"]
    out["body_pct"] = (close - open_price) / open_price

    out["taker_buy_ratio"] = taker_buy_ratio(df)

    # Rolling volatility of the log returns over 30 and 7 observations.
    log_ret = out["log_return"]
    out["vol_30d"] = rolling_volatility(log_ret, 30)
    out["vol_7d"] = rolling_volatility(log_ret, 7)

    # Volume relative to its own 20-observation moving average.
    volume = df["volume"]
    volume_ma = volume.rolling(20).mean()
    out["volume_ma20"] = volume_ma
    out["volume_ratio"] = volume / out["volume_ma20"]

    out["abs_return"] = log_ret.abs()
    out["squared_return"] = log_ret ** 2
    return out
def standardize(series: pd.Series) -> pd.Series:
    """Z-score standardize a series.

    Subtracts the mean and divides by the standard deviation. When the
    standard deviation is 0 or NaN, an all-zero series with the same
    index is returned instead.
    """
    spread = series.std()
    if np.isnan(spread) or spread == 0:
        return pd.Series(0.0, index=series.index)
    centered = series - series.mean()
    return centered / spread
def winsorize(series: pd.Series, lower: float = 0.01, upper: float = 0.99) -> pd.Series:
    """Winsorize a series to limit extreme values.

    Values below the ``lower`` quantile are raised to it and values above
    the ``upper`` quantile are lowered to it; everything in between is
    left unchanged.
    """
    floor = series.quantile(lower)
    ceiling = series.quantile(upper)
    return series.clip(lower=floor, upper=ceiling)