Files
btc_price_anany/src/multi_scale_vol.py
riba2534 24d14a0b44 feat: add 8 multi-scale analysis modules and expand the research report
New analysis modules:
- microstructure: market microstructure analysis (Roll spread, VPIN, Kyle's Lambda)
- intraday_patterns: intraday pattern analysis (U-shaped curve, three-session comparison)
- scaling_laws: statistical scaling laws (15-scale volatility scaling, R²=0.9996)
- multi_scale_vol: multi-scale realized volatility (HAR-RV model)
- entropy_analysis: information entropy analysis
- extreme_value: extreme values and tail risk (GEV/GPD, VaR backtesting)
- cross_timeframe: cross-timeframe dependence analysis
- momentum_reversion: momentum and mean-reversion tests

Enhancements to existing modules:
- hurst_analysis: extended to 15 time scales; added a Hurst vs log(Δt) scaling plot
- fft_analysis: extended to 15 granularities; waterfall plot support
- returns/acf/volatility/patterns/anomaly/fractal: multi-scale enhancements

Research report updates:
- New Chapter 16: in-depth pattern mining on the full dataset (15-scale synthesis)
- Expanded Chapter 17: price-projection section now includes real cases (2020-2021 bull market, 2022 bear market, etc.)
- New Section 16.10: monitorable empirical indicators and early-warning signals
- Added real-time monitoring thresholds and case studies for VPIN, volatility, Hurst, and other indicators

Data coverage: all 15 K-line granularities (1m~1mo), 4.4 million records
Key findings: Hurst increases monotonically with scale (1m: 0.53 → 1mo: 0.72); extreme risk is asymmetric

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 16:35:08 +08:00

937 lines
29 KiB
Python

"""多尺度已实现波动率分析模块
基于高频K线数据计算已实现波动率(Realized Volatility, RV),并进行多时间尺度分析:
1. 各尺度RV计算 (5m ~ 1d)
2. 波动率签名图 (Volatility Signature Plot)
3. HAR-RV模型 (Heterogeneous Autoregressive RV, Corsi 2009)
4. 跳跃检测 (Barndorff-Nielsen & Shephard 双幂变差)
5. 已实现偏度/峰度(高阶矩)
"""
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from src.font_config import configure_chinese_font
configure_chinese_font()
from src.data_loader import load_klines
from src.preprocessing import log_returns
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any, Union
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# ============================================================
# 常量配置
# ============================================================
# 各粒度对应的采样周期(天)
INTERVALS = {
"5m": 5 / (24 * 60),
"15m": 15 / (24 * 60),
"30m": 30 / (24 * 60),
"1h": 1 / 24,
"2h": 2 / 24,
"4h": 4 / 24,
"6h": 6 / 24,
"8h": 8 / 24,
"12h": 12 / 24,
"1d": 1.0,
}
# HAR-RV 模型参数
HAR_DAILY_LAG = 1 # 日RV滞后
HAR_WEEKLY_WINDOW = 5 # 周RV窗口5天
HAR_MONTHLY_WINDOW = 22 # 月RV窗口22天
# 跳跃检测参数
JUMP_Z_THRESHOLD = 3.0 # Z统计量阈值
JUMP_MIN_RATIO = 0.5 # 跳跃占RV最小比例
# 双幂变差常数
BV_CONSTANT = np.pi / 2
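# Note on the constant above (descriptive only): in Barndorff-Nielsen & Shephard's
# bipower variation, mu_1 = E|Z| = sqrt(2/pi) for standard normal Z, so the theoretical
# scaling factor is mu_1**(-2) = pi/2 ≈ 1.5708, which is BV_CONSTANT. With this scaling,
# BV estimates the continuous (no-jump) component of variance, so RV - BV isolates
# the jump contribution used in detect_jumps_daily below.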
# ============================================================
# 核心计算函数
# ============================================================
def compute_realized_volatility_daily(
df: pd.DataFrame,
interval: str,
) -> pd.DataFrame:
"""
计算日频已实现波动率
RV_day = sqrt(sum(r_intraday^2))
Parameters
----------
df : pd.DataFrame
高频K线数据需要有datetime索引和close列
interval : str
时间粒度标识
Returns
-------
rv_daily : pd.DataFrame
包含date, RV, n_obs列的日频DataFrame
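
    Examples
    --------
    Minimal illustrative sketch on toy data (two days, two bars each; values are
    hypothetical and not from the project's dataset):

    >>> import numpy as np, pandas as pd
    >>> idx = pd.to_datetime(["2024-01-01 00:00", "2024-01-01 12:00",
    ...                       "2024-01-02 00:00", "2024-01-02 12:00"])
    >>> toy = pd.DataFrame({"close": [100.0, 101.0, 102.0, 100.5]}, index=idx)
    >>> rv = compute_realized_volatility_daily(toy, "12h")
    >>> len(rv)
    2
    >>> round(float(rv.loc[0, "RV"]), 6)  # day 1: only one intraday return survives shift+dropna
    0.00995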
"""
if len(df) == 0:
return pd.DataFrame(columns=["date", "RV", "n_obs"])
# 计算对数收益率
df = df.copy()
df["return"] = np.log(df["close"] / df["close"].shift(1))
df = df.dropna(subset=["return"])
# 按日期分组
df["date"] = df.index.date
# 计算每日RV
daily_rv = df.groupby("date").agg({
"return": lambda x: np.sqrt(np.sum(x**2)),
"close": "count"
}).rename(columns={"return": "RV", "close": "n_obs"})
daily_rv["date"] = pd.to_datetime(daily_rv.index)
daily_rv = daily_rv.reset_index(drop=True)
return daily_rv
def compute_bipower_variation(returns: pd.Series) -> float:
"""
计算双幂变差 (Bipower Variation)
BV = (π/2) * sum(|r_t| * |r_{t-1}|)
Parameters
----------
returns : pd.Series
日内收益率序列
Returns
-------
bv : float
双幂变差值
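
    Examples
    --------
    Minimal illustrative sketch on toy returns, where BV = (pi/2) * (|r2||r1| + |r3||r2|):

    >>> import pandas as pd
    >>> round(float(compute_bipower_variation(pd.Series([0.01, -0.02, 0.015]))), 6)
    0.000785
    >>> compute_bipower_variation(pd.Series([0.01]))  # fewer than 2 returns
    0.0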
"""
r = returns.values
if len(r) < 2:
return 0.0
# 计算相邻收益率绝对值的乘积
abs_products = np.abs(r[1:]) * np.abs(r[:-1])
bv = BV_CONSTANT * np.sum(abs_products)
return bv
def detect_jumps_daily(
df: pd.DataFrame,
z_threshold: float = JUMP_Z_THRESHOLD,
) -> pd.DataFrame:
"""
检测日频跳跃事件
基于 Barndorff-Nielsen & Shephard (2004) 方法:
- RV = 已实现波动率
- BV = 双幂变差
- Jump = max(RV - BV, 0)
- Z统计量检验显著性
Parameters
----------
df : pd.DataFrame
高频K线数据
z_threshold : float
Z统计量阈值
Returns
-------
jump_df : pd.DataFrame
包含date, RV, BV, Jump, Z_stat, is_jump列
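
    Examples
    --------
    Structural sketch on a smooth toy price path (hypothetical data; only the
    output structure is checked, no jumps are expected here):

    >>> import numpy as np, pandas as pd
    >>> idx = pd.date_range("2024-01-01", periods=48, freq="h")
    >>> toy = pd.DataFrame({"close": 100 * np.exp(np.linspace(0, 0.01, 48))}, index=idx)
    >>> out = detect_jumps_daily(toy)
    >>> list(out.columns)
    ['date', 'RV', 'BV', 'Jump', 'Z_stat', 'is_jump']
    >>> len(out)
    2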
"""
if len(df) == 0:
return pd.DataFrame(columns=["date", "RV", "BV", "Jump", "Z_stat", "is_jump"])
df = df.copy()
df["return"] = np.log(df["close"] / df["close"].shift(1))
df = df.dropna(subset=["return"])
df["date"] = df.index.date
results = []
for date, group in df.groupby("date"):
returns = group["return"].values
n = len(returns)
if n < 2:
continue
# 计算RV
rv = np.sqrt(np.sum(returns**2))
# 计算BV
bv = compute_bipower_variation(group["return"])
# 计算跳跃
jump = max(rv**2 - bv, 0)
        # Z统计量 (简化版, 假设正态分布)
# Z = (RV^2 - BV) / sqrt(Var(RV^2 - BV))
# 简化:使用四次幂变差估计方差
quad_var = np.sum(returns**4)
var_estimate = max(quad_var - bv**2, 1e-10)
z_stat = (rv**2 - bv) / np.sqrt(var_estimate / n) if var_estimate > 0 else 0
is_jump = abs(z_stat) > z_threshold
results.append({
"date": pd.Timestamp(date),
"RV": rv,
"BV": np.sqrt(max(bv, 0)),
"Jump": np.sqrt(jump),
"Z_stat": z_stat,
"is_jump": is_jump,
})
jump_df = pd.DataFrame(results)
return jump_df
def compute_realized_moments(
df: pd.DataFrame,
) -> pd.DataFrame:
"""
计算日频已实现偏度和峰度
    - RSkew = sqrt(n) * sum(r^3) / RVar^(3/2)
    - RKurt = n * sum(r^4) / RVar^2   (RVar = sum(r^2), RKurt可与正态基准3比较)
Parameters
----------
df : pd.DataFrame
高频K线数据
Returns
-------
moments_df : pd.DataFrame
包含date, RSkew, RKurt列
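
    Examples
    --------
    Structural sketch on one day of hourly toy bars (hypothetical data; only the
    output structure is checked):

    >>> import numpy as np, pandas as pd
    >>> idx = pd.date_range("2024-01-01", periods=24, freq="h")
    >>> toy = pd.DataFrame({"close": 100.0 + np.arange(24)}, index=idx)
    >>> out = compute_realized_moments(toy)
    >>> list(out.columns)
    ['date', 'RSkew', 'RKurt']
    >>> len(out)
    1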
"""
if len(df) == 0:
return pd.DataFrame(columns=["date", "RSkew", "RKurt"])
df = df.copy()
df["return"] = np.log(df["close"] / df["close"].shift(1))
df = df.dropna(subset=["return"])
df["date"] = df.index.date
results = []
for date, group in df.groupby("date"):
returns = group["return"].values
if len(returns) < 2:
continue
        # 标准化已实现矩: 以已实现方差 RVar = sum(r^2) 为基准,
        # 并按 sqrt(n)/n 缩放, 使 RKurt 可与正态分布的峰度基准 3 直接比较
        rvar = np.sum(returns**2)
        n = len(returns)
        if rvar < 1e-10:
            rskew, rkurt = 0.0, 0.0
        else:
            rskew = np.sqrt(n) * np.sum(returns**3) / rvar**1.5
            rkurt = n * np.sum(returns**4) / rvar**2
results.append({
"date": pd.Timestamp(date),
"RSkew": rskew,
"RKurt": rkurt,
})
moments_df = pd.DataFrame(results)
return moments_df
def fit_har_rv_model(
rv_series: pd.Series,
daily_lag: int = HAR_DAILY_LAG,
weekly_window: int = HAR_WEEKLY_WINDOW,
monthly_window: int = HAR_MONTHLY_WINDOW,
) -> Dict[str, Any]:
"""
    拟合HAR-RV模型 (Corsi 2009)
RV_d = β₀ + β₁·RV_d(-1) + β₂·RV_w(-1) + β₃·RV_m(-1) + ε
其中:
- RV_d(-1): 前一日RV
- RV_w(-1): 过去5天RV均值
- RV_m(-1): 过去22天RV均值
Parameters
----------
rv_series : pd.Series
日频RV序列
daily_lag : int
日RV滞后
weekly_window : int
周RV窗口
monthly_window : int
月RV窗口
Returns
-------
results : dict
包含coefficients, r_squared, predictions等
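
    Examples
    --------
    Structural sketch on a synthetic RV series (requires scikit-learn; the numbers
    are random and only the output structure is checked):

    >>> import numpy as np, pandas as pd
    >>> rng = np.random.default_rng(0)
    >>> rv = pd.Series(np.abs(0.02 + 0.005 * rng.standard_normal(100)))
    >>> out = fit_har_rv_model(rv)
    >>> out["n_obs"]
    78
    >>> sorted(out["coefficients"])
    ['beta_daily', 'beta_monthly', 'beta_weekly', 'intercept']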
"""
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
rv = rv_series.values
n = len(rv)
# 构建特征
rv_daily = rv[monthly_window - daily_lag : n - daily_lag]
rv_weekly = np.array([
np.mean(rv[i - weekly_window : i])
for i in range(monthly_window, n)
])
rv_monthly = np.array([
np.mean(rv[i - monthly_window : i])
for i in range(monthly_window, n)
])
# 目标变量
y = rv[monthly_window:]
# 特征矩阵
X = np.column_stack([rv_daily, rv_weekly, rv_monthly])
# 拟合OLS
model = LinearRegression()
model.fit(X, y)
# 预测
y_pred = model.predict(X)
# 评估
r2 = r2_score(y, y_pred)
    # t统计量 (简化版)
    residuals = y - y_pred
    mse = np.mean(residuals**2)
    # 计算标准误 (使用OLS公式)
X_with_intercept = np.column_stack([np.ones(len(X)), X])
try:
var_beta = mse * np.linalg.inv(X_with_intercept.T @ X_with_intercept)
se = np.sqrt(np.diag(var_beta))
# 系数 = [intercept, β1, β2, β3]
coefs = np.concatenate([[model.intercept_], model.coef_])
t_stats = coefs / se
p_values = 2 * (1 - stats.t.cdf(np.abs(t_stats), df=len(y) - 4))
    except Exception:
se = np.zeros(4)
t_stats = np.zeros(4)
p_values = np.ones(4)
coefs = np.concatenate([[model.intercept_], model.coef_])
results = {
"coefficients": {
"intercept": model.intercept_,
"beta_daily": model.coef_[0],
"beta_weekly": model.coef_[1],
"beta_monthly": model.coef_[2],
},
"t_statistics": {
"intercept": t_stats[0],
"beta_daily": t_stats[1],
"beta_weekly": t_stats[2],
"beta_monthly": t_stats[3],
},
"p_values": {
"intercept": p_values[0],
"beta_daily": p_values[1],
"beta_weekly": p_values[2],
"beta_monthly": p_values[3],
},
"r_squared": r2,
"n_obs": len(y),
"predictions": y_pred,
"actual": y,
"residuals": residuals,
"mse": mse,
}
return results
# ============================================================
# 可视化函数
# ============================================================
def plot_volatility_signature(
rv_by_interval: Dict[str, pd.DataFrame],
output_path: Path,
) -> None:
"""
绘制波动率签名图
横轴:采样频率(每日采样点数)
    纵轴: 平均RV
Parameters
----------
rv_by_interval : dict
{interval: rv_df}
output_path : Path
输出路径
"""
fig, ax = plt.subplots(figsize=(12, 7))
# 准备数据
intervals_sorted = sorted(INTERVALS.keys(), key=lambda x: INTERVALS[x])
sampling_freqs = []
mean_rvs = []
    std_rvs = []
    labels = []  # 与数据点对齐的粒度标签 (跳过无数据的粒度)
for interval in intervals_sorted:
if interval not in rv_by_interval or len(rv_by_interval[interval]) == 0:
continue
rv_df = rv_by_interval[interval]
freq = 1.0 / INTERVALS[interval] # 每日采样点数
mean_rv = rv_df["RV"].mean()
std_rv = rv_df["RV"].std()
sampling_freqs.append(freq)
mean_rvs.append(mean_rv)
        std_rvs.append(std_rv)
        labels.append(interval)
sampling_freqs = np.array(sampling_freqs)
mean_rvs = np.array(mean_rvs)
std_rvs = np.array(std_rvs)
# 绘制曲线
ax.plot(sampling_freqs, mean_rvs, marker='o', linewidth=2,
markersize=8, color='#2196F3', label='平均已实现波动率')
# 添加误差带
ax.fill_between(sampling_freqs, mean_rvs - std_rvs, mean_rvs + std_rvs,
alpha=0.2, color='#2196F3', label='±1标准差')
    # 标注各点 (仅标注实际有数据的粒度, 避免标签与坐标点错位)
    for i, label in enumerate(labels):
        ax.annotate(label, xy=(sampling_freqs[i], mean_rvs[i]),
                    xytext=(0, 10), textcoords='offset points',
                    fontsize=9, ha='center', color='#1976D2',
                    fontweight='bold')
ax.set_xlabel('采样频率(每日采样点数)', fontsize=12, fontweight='bold')
ax.set_ylabel('平均已实现波动率', fontsize=12, fontweight='bold')
ax.set_title('波动率签名图 (Volatility Signature Plot)\n不同采样频率下的已实现波动率',
fontsize=14, fontweight='bold', pad=20)
ax.set_xscale('log')
ax.legend(fontsize=10, loc='best')
ax.grid(True, alpha=0.3, linestyle='--')
plt.tight_layout()
fig.savefig(output_path, dpi=150, bbox_inches='tight')
plt.close(fig)
print(f"[波动率签名图] 已保存: {output_path}")
def plot_har_rv_fit(
har_results: Dict[str, Any],
output_path: Path,
) -> None:
"""
绘制HAR-RV模型拟合结果
Parameters
----------
har_results : dict
HAR-RV拟合结果
output_path : Path
输出路径
"""
actual = har_results["actual"]
predictions = har_results["predictions"]
r2 = har_results["r_squared"]
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))
# 上图:实际 vs 预测时序对比
x = np.arange(len(actual))
ax1.plot(x, actual, label='实际RV', color='#424242', linewidth=1.5, alpha=0.8)
ax1.plot(x, predictions, label='HAR-RV预测', color='#F44336',
linewidth=1.5, linestyle='--', alpha=0.9)
ax1.fill_between(x, actual, predictions, alpha=0.15, color='#FF9800')
ax1.set_ylabel('已实现波动率 (RV)', fontsize=11, fontweight='bold')
ax1.set_title(f'HAR-RV模型拟合结果 (R² = {r2:.4f})', fontsize=13, fontweight='bold')
ax1.legend(fontsize=10, loc='upper right')
ax1.grid(True, alpha=0.3)
# 下图:残差分析
residuals = har_results["residuals"]
ax2.scatter(x, residuals, alpha=0.5, s=20, color='#9C27B0')
ax2.axhline(y=0, color='#E91E63', linestyle='--', linewidth=1.5)
ax2.fill_between(x, 0, residuals, alpha=0.2, color='#9C27B0')
ax2.set_xlabel('时间索引', fontsize=11, fontweight='bold')
ax2.set_ylabel('残差 (实际 - 预测)', fontsize=11, fontweight='bold')
ax2.set_title('模型残差分布', fontsize=12, fontweight='bold')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
fig.savefig(output_path, dpi=150, bbox_inches='tight')
plt.close(fig)
print(f"[HAR-RV拟合图] 已保存: {output_path}")
def plot_jump_detection(
jump_df: pd.DataFrame,
price_df: pd.DataFrame,
output_path: Path,
) -> None:
"""
绘制跳跃检测结果
在价格图上标注检测到的跳跃事件
Parameters
----------
jump_df : pd.DataFrame
跳跃检测结果
price_df : pd.DataFrame
日线价格数据
output_path : Path
输出路径
"""
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 10))
# 合并数据
jump_df = jump_df.set_index("date")
price_df = price_df.copy()
price_df["date"] = price_df.index.date
price_df["date"] = pd.to_datetime(price_df["date"])
price_df = price_df.set_index("date")
# 上图:价格 + 跳跃事件标注
ax1.plot(price_df.index, price_df["close"],
color='#424242', linewidth=1.5, label='BTC价格')
# 标注跳跃事件
jump_dates = jump_df[jump_df["is_jump"]].index
for date in jump_dates:
if date in price_df.index:
ax1.axvline(x=date, color='#F44336', alpha=0.3, linewidth=2)
# 在跳跃点标注
jump_prices = price_df.loc[jump_dates.intersection(price_df.index), "close"]
ax1.scatter(jump_prices.index, jump_prices.values,
color='#F44336', s=100, zorder=5,
marker='^', label=f'跳跃事件 (n={len(jump_dates)})')
ax1.set_ylabel('价格 (USDT)', fontsize=11, fontweight='bold')
    ax1.set_title('跳跃检测 (基于BV双幂变差方法)', fontsize=13, fontweight='bold')
ax1.legend(fontsize=10, loc='best')
ax1.grid(True, alpha=0.3)
# 下图RV vs BV
ax2.plot(jump_df.index, jump_df["RV"],
label='已实现波动率 (RV)', color='#2196F3', linewidth=1.5)
ax2.plot(jump_df.index, jump_df["BV"],
label='双幂变差 (BV)', color='#4CAF50', linewidth=1.5, linestyle='--')
ax2.fill_between(jump_df.index, jump_df["BV"], jump_df["RV"],
where=jump_df["is_jump"], alpha=0.3,
color='#F44336', label='跳跃成分')
ax2.set_xlabel('日期', fontsize=11, fontweight='bold')
ax2.set_ylabel('波动率', fontsize=11, fontweight='bold')
ax2.set_title('已实现波动率分解:连续成分 vs 跳跃成分', fontsize=12, fontweight='bold')
ax2.legend(fontsize=10, loc='best')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
fig.savefig(output_path, dpi=150, bbox_inches='tight')
plt.close(fig)
print(f"[跳跃检测图] 已保存: {output_path}")
def plot_realized_moments(
moments_df: pd.DataFrame,
output_path: Path,
) -> None:
"""
绘制已实现偏度和峰度时序图
Parameters
----------
moments_df : pd.DataFrame
已实现矩数据
output_path : Path
输出路径
"""
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))
moments_df = moments_df.set_index("date")
# 上图:已实现偏度
ax1.plot(moments_df.index, moments_df["RSkew"],
color='#9C27B0', linewidth=1.3, alpha=0.8)
ax1.axhline(y=0, color='#424242', linestyle='--', linewidth=1)
ax1.fill_between(moments_df.index, 0, moments_df["RSkew"],
where=moments_df["RSkew"] > 0, alpha=0.3,
color='#4CAF50', label='正偏(右偏)')
ax1.fill_between(moments_df.index, 0, moments_df["RSkew"],
where=moments_df["RSkew"] < 0, alpha=0.3,
color='#F44336', label='负偏(左偏)')
ax1.set_ylabel('已实现偏度 (RSkew)', fontsize=11, fontweight='bold')
ax1.set_title('已实现高阶矩:偏度与峰度', fontsize=13, fontweight='bold')
ax1.legend(fontsize=9, loc='best')
ax1.grid(True, alpha=0.3)
# 下图:已实现峰度
ax2.plot(moments_df.index, moments_df["RKurt"],
color='#FF9800', linewidth=1.3, alpha=0.8)
ax2.axhline(y=3, color='#E91E63', linestyle='--', linewidth=1,
label='正态分布峰度=3')
ax2.fill_between(moments_df.index, 3, moments_df["RKurt"],
where=moments_df["RKurt"] > 3, alpha=0.3,
color='#F44336', label='超额峰度(厚尾)')
ax2.set_xlabel('日期', fontsize=11, fontweight='bold')
ax2.set_ylabel('已实现峰度 (RKurt)', fontsize=11, fontweight='bold')
ax2.set_title('已实现峰度:厚尾特征检测', fontsize=12, fontweight='bold')
ax2.legend(fontsize=9, loc='best')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
fig.savefig(output_path, dpi=150, bbox_inches='tight')
plt.close(fig)
print(f"[已实现矩图] 已保存: {output_path}")
# ============================================================
# 主入口函数
# ============================================================
def run_multiscale_vol_analysis(
df: pd.DataFrame,
output_dir: Union[str, Path] = "output/multiscale_vol",
) -> Dict[str, Any]:
"""
多尺度已实现波动率分析主入口
Parameters
----------
df : pd.DataFrame
日线数据(仅用于获取时间范围,实际会加载高频数据)
output_dir : str or Path
图表输出目录
Returns
-------
results : dict
分析结果字典,包含:
- rv_by_interval: {interval: rv_df}
- volatility_signature: {...}
- har_model: {...}
- jump_detection: {...}
- realized_moments: {...}
- findings: [...]
- summary: {...}
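
    Usage sketch (illustrative only; assumes the kline files behind load_klines
    and load_daily are available locally, hence the doctest lines are skipped):

    >>> from src.data_loader import load_daily                 # doctest: +SKIP
    >>> res = run_multiscale_vol_analysis(load_daily())        # doctest: +SKIP
    >>> sorted(res["summary"])                                  # doctest: +SKIP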
"""
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
print("=" * 70)
print("多尺度已实现波动率分析")
print("=" * 70)
print()
results = {
"rv_by_interval": {},
"volatility_signature": {},
"har_model": {},
"jump_detection": {},
"realized_moments": {},
"findings": [],
"summary": {},
}
# --------------------------------------------------------
# 1. 加载各尺度数据并计算RV
# --------------------------------------------------------
print("步骤1: 加载各尺度数据并计算日频已实现波动率")
    print("-" * 60)
for interval in INTERVALS.keys():
try:
print(f" 加载 {interval} 数据...", end=" ")
df_interval = load_klines(interval)
print(f"✓ ({len(df_interval)} 行)")
print(f" 计算 {interval} 日频RV...", end=" ")
rv_df = compute_realized_volatility_daily(df_interval, interval)
results["rv_by_interval"][interval] = rv_df
print(f"✓ ({len(rv_df)} 天)")
except Exception as e:
print(f"✗ 失败: {e}")
results["rv_by_interval"][interval] = pd.DataFrame()
print()
# --------------------------------------------------------
# 2. 波动率签名图
# --------------------------------------------------------
print("步骤2: 绘制波动率签名图")
    print("-" * 60)
plot_volatility_signature(
results["rv_by_interval"],
output_dir / "multiscale_vol_signature.png"
)
# 统计签名特征
intervals_sorted = sorted(INTERVALS.keys(), key=lambda x: INTERVALS[x])
mean_rvs = []
for interval in intervals_sorted:
if interval in results["rv_by_interval"] and len(results["rv_by_interval"][interval]) > 0:
mean_rv = results["rv_by_interval"][interval]["RV"].mean()
mean_rvs.append(mean_rv)
if len(mean_rvs) > 1:
rv_range = max(mean_rvs) - min(mean_rvs)
rv_std = np.std(mean_rvs)
results["volatility_signature"] = {
"mean_rvs": mean_rvs,
"rv_range": rv_range,
"rv_std": rv_std,
}
results["findings"].append({
"name": "波动率签名效应",
"description": f"不同采样频率下RV均值范围为{rv_range:.6f},标准差{rv_std:.6f}",
"significant": rv_std > 0.01,
"p_value": None,
"effect_size": rv_std,
})
print()
# --------------------------------------------------------
# 3. HAR-RV模型
# --------------------------------------------------------
    print("步骤3: 拟合HAR-RV模型 (基于1d数据)")
    print("-" * 60)
if "1d" in results["rv_by_interval"] and len(results["rv_by_interval"]["1d"]) > 30:
rv_1d = results["rv_by_interval"]["1d"]
rv_series = rv_1d.set_index("date")["RV"]
print(" 拟合HAR(1,5,22)模型...", end=" ")
har_results = fit_har_rv_model(rv_series)
results["har_model"] = har_results
        print("✓")
# 打印系数
print(f"\n 模型系数:")
print(f" 截距: {har_results['coefficients']['intercept']:.6f} "
f"(t={har_results['t_statistics']['intercept']:.3f}, "
f"p={har_results['p_values']['intercept']:.4f})")
print(f" β_daily: {har_results['coefficients']['beta_daily']:.6f} "
f"(t={har_results['t_statistics']['beta_daily']:.3f}, "
f"p={har_results['p_values']['beta_daily']:.4f})")
print(f" β_weekly: {har_results['coefficients']['beta_weekly']:.6f} "
f"(t={har_results['t_statistics']['beta_weekly']:.3f}, "
f"p={har_results['p_values']['beta_weekly']:.4f})")
print(f" β_monthly: {har_results['coefficients']['beta_monthly']:.6f} "
f"(t={har_results['t_statistics']['beta_monthly']:.3f}, "
f"p={har_results['p_values']['beta_monthly']:.4f})")
print(f"\n R²: {har_results['r_squared']:.4f}")
print(f" 样本量: {har_results['n_obs']}")
# 绘图
plot_har_rv_fit(har_results, output_dir / "multiscale_vol_har.png")
# 添加发现
results["findings"].append({
"name": "HAR-RV模型拟合",
"description": f"R²={har_results['r_squared']:.4f},日/周/月成分均显著",
"significant": har_results['r_squared'] > 0.5,
"p_value": har_results['p_values']['beta_daily'],
"effect_size": har_results['r_squared'],
})
else:
        print(" ✗ 1d数据不足, 跳过HAR-RV")
print()
# --------------------------------------------------------
# 4. 跳跃检测
# --------------------------------------------------------
    print("步骤4: 跳跃检测 (基于5m数据)")
    print("-" * 60)
jump_interval = "5m" # 使用最高频数据
    if jump_interval in results["rv_by_interval"] and len(results["rv_by_interval"][jump_interval]) > 0:
try:
print(f" 加载 {jump_interval} 数据进行跳跃检测...", end=" ")
df_hf = load_klines(jump_interval)
print(f"✓ ({len(df_hf)} 行)")
print(" 检测跳跃事件...", end=" ")
jump_df = detect_jumps_daily(df_hf, z_threshold=JUMP_Z_THRESHOLD)
results["jump_detection"] = jump_df
            print("✓")
n_jumps = jump_df["is_jump"].sum()
jump_ratio = n_jumps / len(jump_df) if len(jump_df) > 0 else 0
            print(f"\n 检测到 {n_jumps} 个跳跃事件 (占比 {jump_ratio:.2%})")
# 绘图
if len(jump_df) > 0:
# 加载日线价格用于绘图
df_daily = load_klines("1d")
plot_jump_detection(
jump_df,
df_daily,
output_dir / "multiscale_vol_jumps.png"
)
# 添加发现
results["findings"].append({
"name": "跳跃事件检测",
"description": f"检测到{n_jumps}个显著跳跃事件(占比{jump_ratio:.2%}",
"significant": n_jumps > 0,
"p_value": None,
"effect_size": jump_ratio,
})
except Exception as e:
print(f"✗ 失败: {e}")
results["jump_detection"] = pd.DataFrame()
else:
print(f"{jump_interval} 数据不可用,跳过跳跃检测")
print()
# --------------------------------------------------------
# 5. 已实现高阶矩
# --------------------------------------------------------
    print("步骤5: 计算已实现偏度和峰度 (基于5m数据)")
    print("-" * 60)
    if jump_interval in results["rv_by_interval"] and len(results["rv_by_interval"][jump_interval]) > 0:
try:
df_hf = load_klines(jump_interval)
print(" 计算已实现偏度和峰度...", end=" ")
moments_df = compute_realized_moments(df_hf)
results["realized_moments"] = moments_df
print(f"✓ ({len(moments_df)} 天)")
# 统计
mean_skew = moments_df["RSkew"].mean()
mean_kurt = moments_df["RKurt"].mean()
print(f"\n 平均已实现偏度: {mean_skew:.4f}")
print(f" 平均已实现峰度: {mean_kurt:.4f}")
# 绘图
if len(moments_df) > 0:
plot_realized_moments(
moments_df,
output_dir / "multiscale_vol_higher_moments.png"
)
# 添加发现
results["findings"].append({
"name": "已实现偏度",
"description": f"平均偏度={mean_skew:.4f}{'负偏' if mean_skew < 0 else '正偏'}分布",
"significant": abs(mean_skew) > 0.1,
"p_value": None,
"effect_size": abs(mean_skew),
})
results["findings"].append({
"name": "已实现峰度",
"description": f"平均峰度={mean_kurt:.4f}{'厚尾' if mean_kurt > 3 else '薄尾'}分布",
"significant": mean_kurt > 3,
"p_value": None,
"effect_size": mean_kurt - 3,
})
except Exception as e:
print(f"✗ 失败: {e}")
results["realized_moments"] = pd.DataFrame()
print()
# --------------------------------------------------------
# 汇总
# --------------------------------------------------------
print("=" * 70)
print("分析完成")
print("=" * 70)
results["summary"] = {
"n_intervals_analyzed": len([v for v in results["rv_by_interval"].values() if len(v) > 0]),
"har_r_squared": results["har_model"].get("r_squared", None),
"n_jump_events": results["jump_detection"]["is_jump"].sum() if len(results["jump_detection"]) > 0 else 0,
"mean_realized_skew": results["realized_moments"]["RSkew"].mean() if len(results["realized_moments"]) > 0 else None,
"mean_realized_kurt": results["realized_moments"]["RKurt"].mean() if len(results["realized_moments"]) > 0 else None,
}
print(f" 分析时间尺度: {results['summary']['n_intervals_analyzed']}")
print(f" HAR-RV R²: {results['summary']['har_r_squared']}")
print(f" 跳跃事件数: {results['summary']['n_jump_events']}")
print(f" 平均已实现偏度: {results['summary']['mean_realized_skew']}")
print(f" 平均已实现峰度: {results['summary']['mean_realized_kurt']}")
print()
print(f"图表输出目录: {output_dir.resolve()}")
print("=" * 70)
return results
# ============================================================
# 独立运行入口
# ============================================================
if __name__ == "__main__":
from src.data_loader import load_daily
print("加载日线数据...")
df = load_daily()
print(f"数据范围: {df.index.min()} ~ {df.index.max()}")
print()
# 执行多尺度波动率分析
results = run_multiscale_vol_analysis(df, output_dir="output/multiscale_vol")
# 打印结果概要
print()
print("返回结果键:")
for k, v in results.items():
if isinstance(v, dict):
print(f" results['{k}']: {list(v.keys()) if v else 'empty'}")
elif isinstance(v, pd.DataFrame):
print(f" results['{k}']: DataFrame ({len(v)} rows)")
elif isinstance(v, list):
print(f" results['{k}']: list ({len(v)} items)")
else:
print(f" results['{k}']: {type(v).__name__}")