Add comprehensive BTC/USDT price analysis framework with 17 modules
Complete statistical analysis pipeline covering:
- FFT spectral analysis, wavelet CWT, ACF/PACF autocorrelation
- Returns distribution (fat tails, kurtosis=15.65), GARCH volatility modeling
- Hurst exponent (H=0.593), fractal dimension, power law corridor
- Volume-price causality (Granger), calendar effects, halving cycle analysis
- Technical indicator validation (0 of 21 pass FDR correction), candlestick pattern testing
- Market state clustering (K-Means/GMM), Markov chain transitions
- Time series forecasting (ARIMA/Prophet/LSTM benchmarks)
- Anomaly detection ensemble (IF+LOF+COPOD, AUC=0.9935)

Key finding: volatility is predictable (GARCH persistence=0.973), but price direction is statistically indistinguishable from a random walk.

Includes REPORT.md with a 16-section analysis report and future projections, 70+ charts in output/, and all source modules in src/.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
80
src/preprocessing.py
Normal file
80
src/preprocessing.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""Data preprocessing module - returns, detrending, standardization, derived features."""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Optional
|
||||
|
||||
|
||||
def log_returns(prices: pd.Series) -> pd.Series:
    """Compute log returns of a price series.

    Each value is ``log(p_t / p_{t-1})``. Entries that evaluate to NaN,
    including the first observation (which has no predecessor), are dropped.

    Args:
        prices: Price series.

    Returns:
        Log returns, indexed like ``prices`` minus the dropped rows.
    """
    previous = prices.shift(1)
    price_ratio = prices / previous
    log_ratio = np.log(price_ratio)
    return log_ratio.dropna()
|
||||
|
||||
|
||||
def simple_returns(prices: pd.Series) -> pd.Series:
    """Compute simple (arithmetic) returns of a price series.

    Uses ``Series.pct_change`` with its default settings and drops the
    resulting NaN entries (at least the first observation).

    Args:
        prices: Price series.

    Returns:
        Period-over-period fractional changes.
    """
    pct = prices.pct_change()
    return pct.dropna()
|
||||
|
||||
|
||||
def detrend_log_diff(prices: pd.Series) -> pd.Series:
    """Detrend a price series by first-differencing its logarithm.

    Args:
        prices: Price series.

    Returns:
        ``log(prices)`` differenced once, with NaN entries dropped.
    """
    log_prices = np.log(prices)
    first_difference = log_prices.diff()
    return first_difference.dropna()
|
||||
|
||||
|
||||
def detrend_linear(series: pd.Series) -> pd.Series:
    """Remove a least-squares linear trend from a series.

    The trend is fitted against the positional index (0, 1, 2, ...) using
    only the finite observations, so missing or infinite values neither
    break the fit nor turn the whole result into NaN. Non-finite inputs stay
    non-finite at their own positions in the output.

    Args:
        series: Numeric series to detrend.

    Returns:
        Series of residuals (value minus fitted trend) with the same index
        as ``series``. An input with no finite values is returned as floats
        unchanged; with a single finite value that value is used as a
        constant trend.
    """
    values = series.to_numpy(dtype=float)
    x = np.arange(len(values), dtype=float)
    finite = np.isfinite(values)
    n_finite = int(finite.sum())

    if n_finite == 0:
        # Nothing to fit (empty or all-missing input).
        return pd.Series(values, index=series.index)

    if n_finite == 1:
        # A line is under-determined by one point; fall back to a constant.
        trend = np.full(len(values), values[finite][0])
    else:
        coeffs = np.polyfit(x[finite], values[finite], 1)
        trend = np.polyval(coeffs, x)

    return pd.Series(values - trend, index=series.index)
|
||||
|
||||
|
||||
def hp_filter(series: pd.Series, lamb: float = 1600) -> tuple:
    """Apply the Hodrick-Prescott filter to a series.

    Missing values are dropped before filtering.

    Args:
        series: Input series.
        lamb: Smoothing parameter forwarded to statsmodels' ``hpfilter``.

    Returns:
        A ``(cycle, trend)`` tuple as produced by ``hpfilter``.
    """
    # Imported lazily so statsmodels is only needed when this is called.
    from statsmodels.tsa.filters.hp_filter import hpfilter

    cleaned = series.dropna()
    components = hpfilter(cleaned, lamb=lamb)
    cycle, trend = components
    return cycle, trend
|
||||
|
||||
|
||||
def rolling_volatility(
    returns: pd.Series,
    window: int = 30,
    periods_per_year: float = 365,
) -> pd.Series:
    """Compute annualized rolling volatility.

    The rolling sample standard deviation of ``returns`` is scaled by
    ``sqrt(periods_per_year)``.

    Args:
        returns: Return series.
        window: Number of observations per rolling window.
        periods_per_year: Number of observations in one year, used as the
            annualization factor. The default of 365 treats each
            observation as one calendar day; pass e.g. 252 for
            trading-day data or 365 * 24 for hourly bars.

    Returns:
        Annualized volatility aligned with ``returns``; positions without a
        full window are NaN (pandas' default ``min_periods``).
    """
    annualization = np.sqrt(periods_per_year)
    return returns.rolling(window=window).std() * annualization
|
||||
|
||||
|
||||
def realized_volatility(returns: pd.Series, window: int = 30) -> pd.Series:
    """Compute rolling realized volatility.

    Defined as the square root of the rolling sum of squared returns.

    Args:
        returns: Return series.
        window: Number of observations per rolling window.

    Returns:
        Realized volatility aligned with ``returns``; positions without a
        full window are NaN (pandas' default ``min_periods``).
    """
    squared = returns ** 2
    window_sum = squared.rolling(window=window).sum()
    # Rolling sums are updated incrementally and can carry floating-point
    # error, so a sum of squares may come out as a tiny negative number.
    # Clip at zero so the square root does not yield a spurious NaN.
    return np.sqrt(window_sum.clip(lower=0))
|
||||
|
||||
|
||||
def taker_buy_ratio(df: pd.DataFrame) -> pd.Series:
    """Compute the taker-buy share of total volume.

    Args:
        df: Frame with ``taker_buy_volume`` and ``volume`` columns.

    Returns:
        ``taker_buy_volume / volume``. Rows whose volume is 0 yield NaN
        because zero volumes are replaced with NaN before dividing.
    """
    taker_volume = df["taker_buy_volume"]
    total_volume = df["volume"].replace(0, np.nan)
    return taker_volume / total_volume
|
||||
|
||||
|
||||
def add_derived_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with commonly used derived columns.

    The input frame is not modified. It must provide the columns ``open``,
    ``high``, ``low``, ``close``, ``volume`` and ``taker_buy_volume``.

    Added columns: ``log_return``, ``simple_return``, ``log_price``,
    ``range_pct``, ``body_pct``, ``taker_buy_ratio``, ``vol_30d``,
    ``vol_7d``, ``volume_ma20``, ``volume_ratio``, ``abs_return`` and
    ``squared_return``.

    Args:
        df: Frame with the columns listed above.

    Returns:
        A new DataFrame containing the original and the derived columns.
    """
    out = df.copy()
    close = df["close"]

    # Return and price transforms.
    out["log_return"] = log_returns(close)
    out["simple_return"] = simple_returns(close)
    out["log_price"] = np.log(close)

    # Candle shape: high-low range relative to close, body relative to open.
    out["range_pct"] = (df["high"] - df["low"]) / close
    open_price = df["open"]
    out["body_pct"] = (close - open_price) / open_price

    out["taker_buy_ratio"] = taker_buy_ratio(df)

    # Rolling volatility of log returns over 30 and 7 observations.
    log_ret = out["log_return"]
    for column, span in (("vol_30d", 30), ("vol_7d", 7)):
        out[column] = rolling_volatility(log_ret, span)

    # Volume relative to its 20-observation moving average.
    volume = df["volume"]
    out["volume_ma20"] = volume.rolling(20).mean()
    out["volume_ratio"] = volume / out["volume_ma20"]

    out["abs_return"] = log_ret.abs()
    out["squared_return"] = log_ret ** 2
    return out
|
||||
|
||||
|
||||
def standardize(series: pd.Series) -> pd.Series:
    """Z-score standardize a series.

    Args:
        series: Numeric series.

    Returns:
        ``(series - mean) / std`` using the series' own mean and sample
        standard deviation.
    """
    center = series.mean()
    spread = series.std()
    centered = series - center
    return centered / spread
|
||||
|
||||
|
||||
def winsorize(series: pd.Series, lower: float = 0.01, upper: float = 0.99) -> pd.Series:
    """Winsorize a series by clipping it to two of its own quantiles.

    Args:
        series: Numeric series.
        lower: Quantile level used as the lower clipping bound.
        upper: Quantile level used as the upper clipping bound.

    Returns:
        Series with values below the ``lower`` quantile raised to it and
        values above the ``upper`` quantile lowered to it.
    """
    floor = series.quantile(lower)
    ceiling = series.quantile(upper)
    return series.clip(lower=floor, upper=ceiling)
|
||||
Reference in New Issue
Block a user