btc_price_anany/src/volatility_analysis.py

"""波动率聚集与非对称GARCH建模模块

分析内容：
- 多窗口已实现波动率（7d, 30d, 90d）
- 波动率自相关幂律衰减检验（长记忆性）
- GARCH/EGARCH/GJR-GARCH 模型对比
- 杠杆效应分析：收益率与未来波动率的相关性
"""

import matplotlib
matplotlib.use('Agg')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.optimize import curve_fit
from statsmodels.tsa.stattools import acf
from pathlib import Path
from typing import Optional

from src.data_loader import load_daily
from src.preprocessing import log_returns


# ============================================================
# 1. 多窗口已实现波动率
# ============================================================

def multi_window_realized_vol(returns: pd.Series,
                               windows: list = [7, 30, 90]) -> pd.DataFrame:
    """
    计算多窗口已实现波动率（年化）

    Parameters
    ----------
    returns : pd.Series
        日对数收益率
    windows : list
        滚动窗口列表（天数）

    Returns
    -------
    pd.DataFrame
        各窗口已实现波动率，列名为 'rv_7d', 'rv_30d', 'rv_90d' 等
    """
    vol_df = pd.DataFrame(index=returns.index)
    for w in windows:
        # 已实现波动率 = sqrt(sum(r^2)) * sqrt(365/window) 进行年化
        rv = np.sqrt((returns ** 2).rolling(window=w).sum()) * np.sqrt(365 / w)
        vol_df[f'rv_{w}d'] = rv
    return vol_df.dropna(how='all')


# ============================================================
# 2. 波动率自相关幂律衰减检验（长记忆性）
# ============================================================

def volatility_acf_power_law(returns: pd.Series,
                              max_lags: int = 200) -> dict:
    """
    检验|收益率|的自相关函数是否服从幂律衰减：ACF(k) ~ k^(-d)

    长记忆性判断：若 0 < d < 1，则存在长记忆

    Parameters
    ----------
    returns : pd.Series
        日对数收益率
    max_lags : int
        最大滞后阶数

    Returns
    -------
    dict
        包含幂律拟合参数d、拟合优度R²、ACF值等
    """
    abs_returns = returns.dropna().abs()

    # 计算ACF
    acf_values = acf(abs_returns, nlags=max_lags, fft=True)
    # 从lag=1开始（lag=0始终为1）
    lags = np.arange(1, max_lags + 1)
    acf_vals = acf_values[1:]

    # 只取正的ACF值来做对数拟合
    positive_mask = acf_vals > 0
    lags_pos = lags[positive_mask]
    acf_pos = acf_vals[positive_mask]

    if len(lags_pos) < 10:
        print("[警告] 正的ACF值过少，无法可靠拟合幂律")
        return {
            'd': np.nan, 'r_squared': np.nan,
            'lags': lags, 'acf_values': acf_vals,
            'is_long_memory': False,
        }

    # 对数-对数线性回归: log(ACF) = -d * log(k) + c
    log_lags = np.log(lags_pos)
    log_acf = np.log(acf_pos)
    slope, intercept, r_value, p_value, std_err = stats.linregress(log_lags, log_acf)

    d = -slope  # 幂律衰减指数
    r_squared = r_value ** 2

    # 非线性拟合作为对照（幂律函数直接拟合）
    def power_law(k, a, d_param):
        return a * k ** (-d_param)

    try:
        popt, pcov = curve_fit(power_law, lags_pos, acf_pos,
                               p0=[acf_pos[0], d], maxfev=5000)
        d_nonlinear = popt[1]
    except (RuntimeError, ValueError):
        d_nonlinear = np.nan

    results = {
        'd': d,
        'd_nonlinear': d_nonlinear,
        'r_squared': r_squared,
        'slope': slope,
        'intercept': intercept,
        'p_value': p_value,
        'std_err': std_err,
        'lags': lags,
        'acf_values': acf_vals,
        'lags_positive': lags_pos,
        'acf_positive': acf_pos,
        'is_long_memory': 0 < d < 1,
    }
    return results


# ============================================================
# 3. GARCH / EGARCH / GJR-GARCH 模型对比
# ============================================================

def compare_garch_models(returns: pd.Series) -> dict:
    """
    拟合GARCH(1,1)、EGARCH(1,1)、GJR-GARCH(1,1)并比较AIC/BIC

    Parameters
    ----------
    returns : pd.Series
        日对数收益率

    Returns
    -------
    dict
        各模型参数、AIC/BIC、杠杆效应参数
    """
    from arch import arch_model

    r_pct = returns.dropna() * 100  # 百分比收益率
    results = {}

    # --- GARCH(1,1) ---
    model_garch = arch_model(r_pct, vol='Garch', p=1, q=1,
                              mean='Constant', dist='Normal')
    res_garch = model_garch.fit(disp='off')
    results['GARCH'] = {
        'params': dict(res_garch.params),
        'aic': res_garch.aic,
        'bic': res_garch.bic,
        'log_likelihood': res_garch.loglikelihood,
        'conditional_volatility': res_garch.conditional_volatility / 100,
        'result_obj': res_garch,
    }

    # --- EGARCH(1,1) ---
    model_egarch = arch_model(r_pct, vol='EGARCH', p=1, q=1,
                               mean='Constant', dist='Normal')
    res_egarch = model_egarch.fit(disp='off')
    # EGARCH的gamma参数反映杠杆效应（负值表示负收益增大波动率）
    egarch_params = dict(res_egarch.params)
    results['EGARCH'] = {
        'params': egarch_params,
        'aic': res_egarch.aic,
        'bic': res_egarch.bic,
        'log_likelihood': res_egarch.loglikelihood,
        'conditional_volatility': res_egarch.conditional_volatility / 100,
        'leverage_param': egarch_params.get('gamma[1]', np.nan),
        'result_obj': res_egarch,
    }

    # --- GJR-GARCH(1,1) ---
    # GJR-GARCH 在 arch 库中通过 vol='Garch', o=1 实现
    model_gjr = arch_model(r_pct, vol='Garch', p=1, o=1, q=1,
                            mean='Constant', dist='Normal')
    res_gjr = model_gjr.fit(disp='off')
    gjr_params = dict(res_gjr.params)
    results['GJR-GARCH'] = {
        'params': gjr_params,
        'aic': res_gjr.aic,
        'bic': res_gjr.bic,
        'log_likelihood': res_gjr.loglikelihood,
        'conditional_volatility': res_gjr.conditional_volatility / 100,
        # gamma[1] > 0 表示负冲击产生更大波动
        'leverage_param': gjr_params.get('gamma[1]', np.nan),
        'result_obj': res_gjr,
    }

    return results


# ============================================================
# 4. 杠杆效应分析
# ============================================================

def leverage_effect_analysis(returns: pd.Series,
                              forward_windows: list = [5, 10, 20]) -> dict:
    """
    分析收益率与未来波动率的相关性（杠杆效应）

    杠杆效应：负收益倾向于增加未来波动率，正收益倾向于减少未来波动率
    表现为 corr(r_t, vol_{t+k}) < 0

    Parameters
    ----------
    returns : pd.Series
        日对数收益率
    forward_windows : list
        前瞻波动率窗口列表

    Returns
    -------
    dict
        各窗口下的相关系数及显著性
    """
    r = returns.dropna()
    results = {}

    for w in forward_windows:
        # 前瞻已实现波动率
        future_vol = r.abs().rolling(window=w).mean().shift(-w)
        # 对齐有效数据
        valid = pd.DataFrame({'return': r, 'future_vol': future_vol}).dropna()

        if len(valid) < 30:
            results[f'{w}d'] = {
                'correlation': np.nan,
                'p_value': np.nan,
                'n_samples': len(valid),
            }
            continue

        corr, p_val = stats.pearsonr(valid['return'], valid['future_vol'])
        # Spearman秩相关作为稳健性检查
        spearman_corr, spearman_p = stats.spearmanr(valid['return'], valid['future_vol'])

        results[f'{w}d'] = {
            'pearson_correlation': corr,
            'pearson_pvalue': p_val,
            'spearman_correlation': spearman_corr,
            'spearman_pvalue': spearman_p,
            'n_samples': len(valid),
            'return_series': valid['return'],
            'future_vol_series': valid['future_vol'],
        }

    return results


# ============================================================
# 5. 可视化
# ============================================================

def plot_realized_volatility(vol_df: pd.DataFrame, output_dir: Path):
    """绘制多窗口已实现波动率时序图"""
    fig, ax = plt.subplots(figsize=(14, 6))

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
    labels = {'rv_7d': '7天', 'rv_30d': '30天', 'rv_90d': '90天'}

    for idx, col in enumerate(vol_df.columns):
        label = labels.get(col, col)
        ax.plot(vol_df.index, vol_df[col], linewidth=0.8,
                color=colors[idx % len(colors)],
                label=f'{label}已实现波动率（年化）', alpha=0.85)

    ax.set_xlabel('日期', fontsize=12)
    ax.set_ylabel('年化波动率', fontsize=12)
    ax.set_title('BTC 多窗口已实现波动率', fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

    fig.savefig(output_dir / 'realized_volatility_multiwindow.png',
                dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"[保存] {output_dir / 'realized_volatility_multiwindow.png'}")


def plot_acf_power_law(acf_results: dict, output_dir: Path):
    """绘制ACF幂律衰减拟合图"""
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    lags = acf_results['lags']
    acf_vals = acf_results['acf_values']

    # 左图：ACF原始值
    ax1 = axes[0]
    ax1.bar(lags, acf_vals, width=1, alpha=0.6, color='steelblue')
    ax1.set_xlabel('滞后阶数', fontsize=11)
    ax1.set_ylabel('ACF', fontsize=11)
    ax1.set_title('|收益率| 自相关函数', fontsize=12)
    ax1.grid(True, alpha=0.3)
    ax1.axhline(y=0, color='black', linewidth=0.5)

    # 右图：对数-对数图 + 幂律拟合
    ax2 = axes[1]
    lags_pos = acf_results['lags_positive']
    acf_pos = acf_results['acf_positive']

    ax2.scatter(np.log(lags_pos), np.log(acf_pos), s=10, alpha=0.5,
                color='steelblue', label='实际ACF')

    # 拟合线
    d = acf_results['d']
    intercept = acf_results['intercept']
    x_fit = np.linspace(np.log(lags_pos.min()), np.log(lags_pos.max()), 100)
    y_fit = -d * x_fit + intercept
    ax2.plot(x_fit, y_fit, 'r-', linewidth=2,
             label=f'幂律拟合: d={d:.3f}, R²={acf_results["r_squared"]:.3f}')

    ax2.set_xlabel('log(滞后阶数)', fontsize=11)
    ax2.set_ylabel('log(ACF)', fontsize=11)
    ax2.set_title('幂律衰减拟合（双对数坐标）', fontsize=12)
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(output_dir / 'acf_power_law_fit.png',
                dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"[保存] {output_dir / 'acf_power_law_fit.png'}")


def plot_model_comparison(model_results: dict, output_dir: Path):
    """绘制GARCH模型对比图（AIC/BIC + 条件波动率对比）"""
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    model_names = list(model_results.keys())
    aic_values = [model_results[m]['aic'] for m in model_names]
    bic_values = [model_results[m]['bic'] for m in model_names]

    # 上图：AIC/BIC 对比柱状图
    ax1 = axes[0]
    x = np.arange(len(model_names))
    width = 0.35
    bars1 = ax1.bar(x - width / 2, aic_values, width, label='AIC',
                     color='steelblue', alpha=0.8)
    bars2 = ax1.bar(x + width / 2, bic_values, width, label='BIC',
                     color='coral', alpha=0.8)

    ax1.set_xlabel('模型', fontsize=12)
    ax1.set_ylabel('信息准则值', fontsize=12)
    ax1.set_title('GARCH 模型信息准则对比（越小越好）', fontsize=13)
    ax1.set_xticks(x)
    ax1.set_xticklabels(model_names, fontsize=11)
    ax1.legend(fontsize=11)
    ax1.grid(True, alpha=0.3, axis='y')

    # 在柱状图上标注数值
    for bar in bars1:
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom', fontsize=9)
    for bar in bars2:
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3), textcoords="offset points",
                     ha='center', va='bottom', fontsize=9)

    # 下图：各模型条件波动率时序对比
    ax2 = axes[1]
    colors = {'GARCH': '#1f77b4', 'EGARCH': '#ff7f0e', 'GJR-GARCH': '#2ca02c'}
    for name in model_names:
        cv = model_results[name]['conditional_volatility']
        ax2.plot(cv.index, cv.values, linewidth=0.7,
                 color=colors.get(name, 'gray'),
                 label=name, alpha=0.8)

    ax2.set_xlabel('日期', fontsize=12)
    ax2.set_ylabel('条件波动率', fontsize=12)
    ax2.set_title('各GARCH模型条件波动率对比', fontsize=13)
    ax2.legend(fontsize=11)
    ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(output_dir / 'garch_model_comparison.png',
                dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"[保存] {output_dir / 'garch_model_comparison.png'}")


def plot_leverage_effect(leverage_results: dict, output_dir: Path):
    """绘制杠杆效应散点图"""
    # 找到有数据的窗口
    valid_windows = [w for w, r in leverage_results.items()
                     if 'return_series' in r]
    n_plots = len(valid_windows)
    if n_plots == 0:
        print("[警告] 无有效杠杆效应数据可绘制")
        return

    fig, axes = plt.subplots(1, n_plots, figsize=(6 * n_plots, 5))
    if n_plots == 1:
        axes = [axes]

    for idx, window_key in enumerate(valid_windows):
        ax = axes[idx]
        data = leverage_results[window_key]
        ret = data['return_series']
        fvol = data['future_vol_series']

        # 散点图（采样避免过多点）
        n_sample = min(len(ret), 2000)
        sample_idx = np.random.choice(len(ret), n_sample, replace=False)
        ax.scatter(ret.values[sample_idx], fvol.values[sample_idx],
                   s=5, alpha=0.3, color='steelblue')

        # 回归线
        z = np.polyfit(ret.values, fvol.values, 1)
        p = np.poly1d(z)
        x_line = np.linspace(ret.min(), ret.max(), 100)
        ax.plot(x_line, p(x_line), 'r-', linewidth=2)

        corr = data['pearson_correlation']
        p_val = data['pearson_pvalue']
        ax.set_xlabel('当日对数收益率', fontsize=11)
        ax.set_ylabel(f'未来{window_key}平均|收益率|', fontsize=11)
        ax.set_title(f'杠杆效应 ({window_key})\n'
                     f'Pearson r={corr:.4f}, p={p_val:.2e}', fontsize=11)
        ax.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(output_dir / 'leverage_effect_scatter.png',
                dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"[保存] {output_dir / 'leverage_effect_scatter.png'}")


# ============================================================
# 6. 结果打印
# ============================================================

def print_realized_vol_summary(vol_df: pd.DataFrame):
    """打印已实现波动率统计摘要"""
    print("\n" + "=" * 60)
    print("多窗口已实现波动率统计（年化）")
    print("=" * 60)
    summary = vol_df.describe().T
    for col in vol_df.columns:
        s = vol_df[col].dropna()
        print(f"\n  {col}:")
        print(f"    均值:   {s.mean():.4f} ({s.mean() * 100:.2f}%)")
        print(f"    中位数: {s.median():.4f} ({s.median() * 100:.2f}%)")
        print(f"    最大值: {s.max():.4f} ({s.max() * 100:.2f}%)")
        print(f"    最小值: {s.min():.4f} ({s.min() * 100:.2f}%)")
        print(f"    标准差: {s.std():.4f}")


def print_acf_power_law_results(results: dict):
    """打印ACF幂律衰减检验结果"""
    print("\n" + "=" * 60)
    print("波动率自相关幂律衰减检验（长记忆性）")
    print("=" * 60)
    print(f"  幂律衰减指数 d (线性拟合):   {results['d']:.4f}")
    print(f"  幂律衰减指数 d (非线性拟合): {results['d_nonlinear']:.4f}")
    print(f"  拟合优度 R²:                  {results['r_squared']:.4f}")
    print(f"  回归斜率:                     {results['slope']:.4f}")
    print(f"  回归截距:                     {results['intercept']:.4f}")
    print(f"  p值:                          {results['p_value']:.2e}")
    print(f"  标准误:                       {results['std_err']:.4f}")
    print(f"\n  长记忆性判断 (0 < d < 1):     "
          f"{'是 - 存在长记忆性' if results['is_long_memory'] else '否'}")
    if results['is_long_memory']:
        print(f"    → |收益率|的自相关以幂律速度缓慢衰减")
        print(f"    → 波动率聚集具有长记忆特征，GARCH模型的持续性可能不足以刻画")


def print_model_comparison(model_results: dict):
    """打印GARCH模型对比结果"""
    print("\n" + "=" * 60)
    print("GARCH / EGARCH / GJR-GARCH 模型对比")
    print("=" * 60)

    print(f"\n  {'模型':<14} {'AIC':>12} {'BIC':>12} {'对数似然':>12}")
    print("  " + "-" * 52)
    for name, res in model_results.items():
        print(f"  {name:<14} {res['aic']:>12.2f} {res['bic']:>12.2f} "
              f"{res['log_likelihood']:>12.2f}")

    # 找到最优模型
    best_aic = min(model_results.items(), key=lambda x: x[1]['aic'])
    best_bic = min(model_results.items(), key=lambda x: x[1]['bic'])
    print(f"\n  AIC最优模型: {best_aic[0]} (AIC={best_aic[1]['aic']:.2f})")
    print(f"  BIC最优模型: {best_bic[0]} (BIC={best_bic[1]['bic']:.2f})")

    # 杠杆效应参数
    print("\n  杠杆效应参数:")
    for name in ['EGARCH', 'GJR-GARCH']:
        if name in model_results and 'leverage_param' in model_results[name]:
            gamma = model_results[name]['leverage_param']
            print(f"    {name} gamma[1] = {gamma:.6f}")
            if name == 'EGARCH':
                # EGARCH中gamma<0表示负冲击增大波动
                if gamma < 0:
                    print(f"      → gamma < 0: 负收益（下跌）产生更大波动，存在杠杆效应")
                else:
                    print(f"      → gamma >= 0: 未观察到明显杠杆效应")
            elif name == 'GJR-GARCH':
                # GJR-GARCH中gamma>0表示负冲击的额外影响
                if gamma > 0:
                    print(f"      → gamma > 0: 负冲击产生额外波动增量，存在杠杆效应")
                else:
                    print(f"      → gamma <= 0: 未观察到明显杠杆效应")

    # 打印各模型详细参数
    print("\n  各模型详细参数:")
    for name, res in model_results.items():
        print(f"\n  [{name}]")
        for param_name, param_val in res['params'].items():
            print(f"    {param_name}: {param_val:.6f}")


def print_leverage_results(leverage_results: dict):
    """打印杠杆效应分析结果"""
    print("\n" + "=" * 60)
    print("杠杆效应分析：收益率与未来波动率的相关性")
    print("=" * 60)
    print(f"\n  {'窗口':<8} {'Pearson r':>12} {'p值':>12} "
          f"{'Spearman r':>12} {'p值':>12} {'样本数':>8}")
    print("  " + "-" * 66)
    for window, data in leverage_results.items():
        if 'pearson_correlation' in data:
            print(f"  {window:<8} "
                  f"{data['pearson_correlation']:>12.4f} "
                  f"{data['pearson_pvalue']:>12.2e} "
                  f"{data['spearman_correlation']:>12.4f} "
                  f"{data['spearman_pvalue']:>12.2e} "
                  f"{data['n_samples']:>8d}")
        else:
            print(f"  {window:<8} {'N/A':>12} {'N/A':>12} "
                  f"{'N/A':>12} {'N/A':>12} {data.get('n_samples', 0):>8d}")

    # 总结
    print("\n  解读:")
    print("    - 相关系数 < 0: 负收益（下跌）后波动率上升 → 存在杠杆效应")
    print("    - 相关系数 ≈ 0: 收益率方向与未来波动率无关")
    print("    - 相关系数 > 0: 正收益（上涨）后波动率上升（反向杠杆/波动率反馈效应）")
    print("    - 注意: BTC作为加密货币，杠杆效应可能与传统股票不同")


# ============================================================
# 7. 主入口
# ============================================================

def run_volatility_analysis(df: pd.DataFrame, output_dir: str = "output/volatility"):
    """
    波动率聚集与非对称GARCH分析主函数

    Parameters
    ----------
    df : pd.DataFrame
        日线K线数据（含'close'列，DatetimeIndex索引）
    output_dir : str
        图表输出目录
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 60)
    print("BTC 波动率聚集与非对称 GARCH 分析")
    print("=" * 60)
    print(f"数据范围: {df.index.min()} ~ {df.index.max()}")
    print(f"样本数量: {len(df)}")

    # 计算日对数收益率
    daily_returns = log_returns(df['close'])
    print(f"日对数收益率样本数: {len(daily_returns)}")

    from src.font_config import configure_chinese_font
    configure_chinese_font()

    # 固定随机种子以保证杠杆效应散点图采样可复现
    np.random.seed(42)

    # --- 多窗口已实现波动率 ---
    print("\n>>> 计算多窗口已实现波动率 (7d, 30d, 90d)...")
    vol_df = multi_window_realized_vol(daily_returns, windows=[7, 30, 90])
    print_realized_vol_summary(vol_df)
    plot_realized_volatility(vol_df, output_dir)

    # --- ACF幂律衰减检验 ---
    print("\n>>> 执行波动率自相关幂律衰减检验...")
    acf_results = volatility_acf_power_law(daily_returns, max_lags=200)
    print_acf_power_law_results(acf_results)
    plot_acf_power_law(acf_results, output_dir)

    # --- GARCH模型对比 ---
    print("\n>>> 拟合 GARCH / EGARCH / GJR-GARCH 模型...")
    model_results = compare_garch_models(daily_returns)
    print_model_comparison(model_results)
    plot_model_comparison(model_results, output_dir)

    # --- 杠杆效应分析 ---
    print("\n>>> 执行杠杆效应分析...")
    leverage_results = leverage_effect_analysis(daily_returns,
                                                forward_windows=[5, 10, 20])
    print_leverage_results(leverage_results)
    plot_leverage_effect(leverage_results, output_dir)

    print("\n" + "=" * 60)
    print("波动率分析完成！")
    print(f"图表已保存至: {output_dir.resolve()}")
    print("=" * 60)

    # 返回所有结果供后续使用
    return {
        'realized_vol': vol_df,
        'acf_power_law': acf_results,
        'model_comparison': model_results,
        'leverage_effect': leverage_results,
    }


# ============================================================
# 独立运行入口
# ============================================================

if __name__ == '__main__':
    df = load_daily()
    run_volatility_analysis(df)