btc_price_anany/src/anomaly.py

"""异常检测与前兆模式提取模块

分析内容：
- 集成异常检测（Isolation Forest + LOF + COPOD，≥2/3 一致判定）
- GARCH 条件波动率异常检测（标准化残差 > 3）
- 异常前兆模式提取（Random Forest 分类器）
- 事件对齐分析（比特币减半等重大事件）
- 可视化：异常标记价格图、特征分布对比、ROC 曲线、特征重要性
"""

import matplotlib
matplotlib.use('Agg')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from pathlib import Path
from typing import Optional, Dict, List, Tuple

from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, roc_curve

from src.data_loader import load_klines
from src.preprocessing import add_derived_features

try:
    from pyod.models.copod import COPOD
    HAS_COPOD = True
except ImportError:
    HAS_COPOD = False
    print("[警告] pyod 未安装，COPOD 检测将跳过，使用 2/2 一致判定")


# ============================================================
# 1. 检测特征定义
# ============================================================

# 用于异常检测的特征列
DETECTION_FEATURES = [
    'log_return',
    'abs_return',
    'volume_ratio',
    'range_pct',
    'taker_buy_ratio',
    'vol_7d',
]

# 比特币减半及其他重大事件日期
KNOWN_EVENTS = {
    '2012-11-28': '第一次减半',
    '2016-07-09': '第二次减半',
    '2020-05-11': '第三次减半',
    '2024-04-20': '第四次减半',
    '2017-12-17': '2017年牛市顶点',
    '2018-12-15': '2018年熊市底部',
    '2020-03-12': '新冠黑色星期四',
    '2021-04-14': '2021年牛市中期高点',
    '2021-11-10': '2021年牛市顶点',
    '2022-06-18': 'Luna/3AC 暴跌',
    '2022-11-09': 'FTX 崩盘',
    '2024-01-11': 'BTC ETF 获批',
}


# ============================================================
# 2. 集成异常检测
# ============================================================

def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    准备异常检测特征矩阵

    Parameters
    ----------
    df : pd.DataFrame
        含衍生特征的日线数据

    Returns
    -------
    features_df : pd.DataFrame
        特征子集（已去除 NaN）
    X_scaled : np.ndarray
        标准化后的特征矩阵
    """
    # 选取可用特征
    available = [f for f in DETECTION_FEATURES if f in df.columns]
    if len(available) < 3:
        raise ValueError(f"可用特征不足: {available}，至少需要 3 个")

    features_df = df[available].dropna()

    # 标准化
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features_df.values)

    return features_df, X_scaled


def detect_isolation_forest(X: np.ndarray, contamination: float = 0.05) -> np.ndarray:
    """Isolation Forest 异常检测"""
    model = IsolationForest(
        n_estimators=200,
        contamination=contamination,
        random_state=42,
        n_jobs=-1,
    )
    # -1 = 异常, 1 = 正常
    labels = model.fit_predict(X)
    return (labels == -1).astype(int)


def detect_lof(X: np.ndarray, contamination: float = 0.05) -> np.ndarray:
    """Local Outlier Factor 异常检测"""
    model = LocalOutlierFactor(
        n_neighbors=20,
        contamination=contamination,
        novelty=False,
        n_jobs=-1,
    )
    labels = model.fit_predict(X)
    return (labels == -1).astype(int)


def detect_copod(X: np.ndarray, contamination: float = 0.05) -> np.ndarray:
    """COPOD 异常检测（基于 Copula）"""
    if not HAS_COPOD:
        return None

    model = COPOD(contamination=contamination)
    labels = model.fit_predict(X)
    return labels.astype(int)


def ensemble_anomaly_detection(
    df: pd.DataFrame,
    contamination: float = 0.05,
    min_agreement: int = 2,
) -> pd.DataFrame:
    """
    集成异常检测：要求 ≥ min_agreement / n_methods 一致判定

    Parameters
    ----------
    df : pd.DataFrame
        含衍生特征的日线数据
    contamination : float
        预期异常比例
    min_agreement : int
        最少多少个方法一致才标记为异常

    Returns
    -------
    pd.DataFrame
        添加了各方法检测结果及集成结果的数据
    """
    features_df, X_scaled = prepare_features(df)

    print(f"  特征矩阵: {X_scaled.shape[0]} 样本 x {X_scaled.shape[1]} 特征")

    # 执行各方法检测
    print("  [1/3] Isolation Forest...")
    if_labels = detect_isolation_forest(X_scaled, contamination)

    print("  [2/3] Local Outlier Factor...")
    lof_labels = detect_lof(X_scaled, contamination)

    n_methods = 2
    vote_matrix = np.column_stack([if_labels, lof_labels])
    method_names = ['iforest', 'lof']

    print("  [3/3] COPOD...")
    copod_labels = detect_copod(X_scaled, contamination)
    if copod_labels is not None:
        vote_matrix = np.column_stack([vote_matrix, copod_labels])
        method_names.append('copod')
        n_methods = 3
    else:
        print("    COPOD 不可用，使用 2 方法集成")

    # 投票
    vote_sum = vote_matrix.sum(axis=1)
    ensemble_label = (vote_sum >= min_agreement).astype(int)

    # 构建结果 DataFrame
    result = features_df.copy()
    for i, name in enumerate(method_names):
        result[f'anomaly_{name}'] = vote_matrix[:, i]
    result['anomaly_votes'] = vote_sum
    result['anomaly_ensemble'] = ensemble_label

    # 打印各方法统计
    print(f"\n  异常检测统计:")
    for name in method_names:
        n_anom = result[f'anomaly_{name}'].sum()
        print(f"    {name:>12}: {n_anom} 个异常 ({n_anom / len(result) * 100:.2f}%)")
    n_ensemble = ensemble_label.sum()
    print(f"    {'集成(≥' + str(min_agreement) + ')':>12}: {n_ensemble} 个异常 ({n_ensemble / len(result) * 100:.2f}%)")

    # 方法间重叠度
    print(f"\n  方法间重叠:")
    for i in range(len(method_names)):
        for j in range(i + 1, len(method_names)):
            overlap = ((vote_matrix[:, i] == 1) & (vote_matrix[:, j] == 1)).sum()
            n_i = vote_matrix[:, i].sum()
            n_j = vote_matrix[:, j].sum()
            if min(n_i, n_j) > 0:
                jaccard = overlap / ((vote_matrix[:, i] == 1) | (vote_matrix[:, j] == 1)).sum()
            else:
                jaccard = 0.0
            print(f"    {method_names[i]} ∩ {method_names[j]}: "
                  f"{overlap} 个 (Jaccard={jaccard:.3f})")

    return result


# ============================================================
# 3. GARCH 条件波动率异常
# ============================================================

def garch_anomaly_detection(
    df: pd.DataFrame,
    threshold: float = 3.0,
) -> pd.Series:
    """
    基于 GARCH(1,1) 的条件波动率异常检测

    标准化残差 |ε_t / σ_t| > threshold 的日期标记为异常

    Parameters
    ----------
    df : pd.DataFrame
        含 log_return 列的数据
    threshold : float
        标准化残差阈值

    Returns
    -------
    pd.Series
        异常标记（1 = 异常，0 = 正常），索引与输入对齐
    """
    from arch import arch_model

    returns = df['log_return'].dropna()
    r_pct = returns * 100  # arch 库使用百分比收益率

    # 拟合 GARCH(1,1)
    model = arch_model(r_pct, vol='Garch', p=1, q=1, mean='Constant', dist='Normal')
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = model.fit(disp='off')

    # 计算标准化残差
    std_resid = result.resid / result.conditional_volatility
    anomaly = (std_resid.abs() > threshold).astype(int)

    n_anom = anomaly.sum()
    print(f"  GARCH 异常: {n_anom} 个 (|标准化残差| > {threshold})")
    print(f"  GARCH 模型: α={result.params.get('alpha[1]', np.nan):.4f}, "
          f"β={result.params.get('beta[1]', np.nan):.4f}, "
          f"持续性={result.params.get('alpha[1]', 0) + result.params.get('beta[1]', 0):.4f}")

    return anomaly


# ============================================================
# 4. 前兆模式提取
# ============================================================

def extract_precursor_features(
    df: pd.DataFrame,
    anomaly_labels: pd.Series,
    lookback_windows: List[int] = None,
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    提取异常日前若干天的特征作为前兆信号

    Parameters
    ----------
    df : pd.DataFrame
        含衍生特征的数据
    anomaly_labels : pd.Series
        异常标记（1 = 异常）
    lookback_windows : list of int
        向前回溯的天数窗口

    Returns
    -------
    X : pd.DataFrame
        前兆特征矩阵
    y : pd.Series
        标签（1 = 后续发生异常, 0 = 正常）
    """
    if lookback_windows is None:
        lookback_windows = [5, 10, 20]

    # 确保对齐
    common_idx = df.index.intersection(anomaly_labels.index)
    df_aligned = df.loc[common_idx]
    labels_aligned = anomaly_labels.loc[common_idx]

    base_features = [f for f in DETECTION_FEATURES if f in df.columns]
    precursor_features = {}

    for window in lookback_windows:
        for feat in base_features:
            if feat not in df_aligned.columns:
                continue
            series = df_aligned[feat]

            # 滚动统计作为前兆特征
            precursor_features[f'{feat}_mean_{window}d'] = series.rolling(window).mean()
            precursor_features[f'{feat}_std_{window}d'] = series.rolling(window).std()
            precursor_features[f'{feat}_max_{window}d'] = series.rolling(window).max()
            precursor_features[f'{feat}_min_{window}d'] = series.rolling(window).min()

            # 趋势特征（最近值 vs 窗口均值的偏离）
            rolling_mean = series.rolling(window).mean()
            precursor_features[f'{feat}_deviation_{window}d'] = series - rolling_mean

    X = pd.DataFrame(precursor_features, index=df_aligned.index)

    # 标签: 预测次日是否出现异常（前瞻1天）
    y = labels_aligned.shift(-1).dropna()
    X = X.loc[y.index]  # 对齐特征和标签

    # 去除 NaN
    valid_mask = X.notna().all(axis=1) & y.notna()
    X = X[valid_mask]
    y = y[valid_mask]

    return X, y


def train_precursor_classifier(
    X: pd.DataFrame,
    y: pd.Series,
) -> Dict:
    """
    训练前兆模式分类器（Random Forest）

    使用分层 K 折交叉验证评估

    Parameters
    ----------
    X : pd.DataFrame
        前兆特征矩阵
    y : pd.Series
        标签

    Returns
    -------
    dict
        AUC、特征重要性等结果
    """
    if len(X) < 50 or y.sum() < 10:
        print(f"  [警告] 样本不足 (n={len(X)}, 正例={y.sum()})，跳过分类器训练")
        return {}

    # 时间序列交叉验证
    n_splits = min(5, int(y.sum()))
    if n_splits < 2:
        print("  [警告] 正例数过少，无法进行交叉验证")
        return {}

    cv = TimeSeriesSplit(n_splits=n_splits)

    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )

    # 手动交叉验证（每折单独 fit scaler，防止数据泄漏）
    try:
        y_prob = np.full(len(y), np.nan)
        for train_idx, val_idx in cv.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            clf.fit(X_train_scaled, y_train)
            y_prob[val_idx] = clf.predict_proba(X_val_scaled)[:, 1]
        # 去除未被验证的样本（如有）
        valid_prob_mask = ~np.isnan(y_prob)
        y_eval = y[valid_prob_mask]
        y_prob_eval = y_prob[valid_prob_mask]
        auc = roc_auc_score(y_eval, y_prob_eval)
    except Exception as e:
        print(f"  [错误] 交叉验证失败: {e}")
        return {}

    # 在全量数据上训练获取特征重要性
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    clf.fit(X_scaled, y)
    importances = pd.Series(clf.feature_importances_, index=X.columns)
    importances = importances.sort_values(ascending=False)

    # ROC 曲线数据
    fpr, tpr, thresholds = roc_curve(y_eval, y_prob_eval)

    results = {
        'auc': auc,
        'feature_importances': importances,
        'y_true': y_eval,
        'y_prob': y_prob_eval,
        'fpr': fpr,
        'tpr': tpr,
    }

    print(f"\n  前兆分类器结果:")
    print(f"    AUC: {auc:.4f}")
    print(f"    样本: {len(y)} (异常: {y.sum()}, 正常: {(y == 0).sum()})")
    print(f"    Top-10 重要特征:")
    for feat, imp in importances.head(10).items():
        print(f"      {feat:<40} {imp:.4f}")

    return results


# ============================================================
# 5. 事件对齐分析
# ============================================================

def align_with_events(
    anomaly_dates: pd.DatetimeIndex,
    tolerance_days: int = 5,
) -> pd.DataFrame:
    """
    将异常日期与已知事件对齐

    Parameters
    ----------
    anomaly_dates : pd.DatetimeIndex
        异常日期列表
    tolerance_days : int
        容差天数（异常日期与事件日期相差 ≤ tolerance_days 天即视为匹配）

    Returns
    -------
    pd.DataFrame
        匹配结果
    """
    matches = []

    for event_date_str, event_name in KNOWN_EVENTS.items():
        event_date = pd.Timestamp(event_date_str)

        for anom_date in anomaly_dates:
            diff_days = abs((anom_date - event_date).days)
            if diff_days <= tolerance_days:
                matches.append({
                    'anomaly_date': anom_date,
                    'event_date': event_date,
                    'event_name': event_name,
                    'diff_days': diff_days,
                })

    if matches:
        result = pd.DataFrame(matches)
        print(f"\n  事件对齐 (容差 {tolerance_days} 天):")
        for _, row in result.iterrows():
            print(f"    异常 {row['anomaly_date'].strftime('%Y-%m-%d')} ↔ "
                  f"{row['event_name']} ({row['event_date'].strftime('%Y-%m-%d')}, "
                  f"差 {row['diff_days']} 天)")
        return result
    else:
        print(f"  [信息] 无异常日期与已知事件匹配 (容差 {tolerance_days} 天)")
        return pd.DataFrame()


# ============================================================
# 6. 可视化
# ============================================================

def plot_price_with_anomalies(
    df: pd.DataFrame,
    anomaly_result: pd.DataFrame,
    garch_anomaly: Optional[pd.Series],
    output_dir: Path,
):
    """绘制价格图，标注异常点"""
    fig, axes = plt.subplots(2, 1, figsize=(16, 10), gridspec_kw={'height_ratios': [3, 1]})

    # 上图：价格 + 异常标记
    ax1 = axes[0]
    ax1.plot(df.index, df['close'], linewidth=0.6, color='steelblue', alpha=0.8, label='BTC 收盘价')

    # 集成异常
    ensemble_anom = anomaly_result[anomaly_result['anomaly_ensemble'] == 1]
    if not ensemble_anom.empty:
        # 获取异常日期对应的收盘价
        anom_prices = df.loc[df.index.isin(ensemble_anom.index), 'close']
        ax1.scatter(anom_prices.index, anom_prices.values,
                    color='red', s=30, zorder=5, label=f'集成异常 (n={len(anom_prices)})',
                    alpha=0.7, edgecolors='darkred', linewidths=0.5)

    # GARCH 异常
    if garch_anomaly is not None:
        garch_anom_dates = garch_anomaly[garch_anomaly == 1].index
        garch_prices = df.loc[df.index.isin(garch_anom_dates), 'close']
        if not garch_prices.empty:
            ax1.scatter(garch_prices.index, garch_prices.values,
                        color='orange', s=20, zorder=4, marker='^',
                        label=f'GARCH 异常 (n={len(garch_prices)})',
                        alpha=0.7, edgecolors='darkorange', linewidths=0.5)

    ax1.set_ylabel('价格 (USDT)', fontsize=12)
    ax1.set_title('BTC 价格与异常检测结果', fontsize=14)
    ax1.legend(fontsize=10, loc='upper left')
    ax1.grid(True, alpha=0.3)
    ax1.set_yscale('log')

    # 下图：成交量 + 异常标记
    ax2 = axes[1]
    if 'volume' in df.columns:
        ax2.bar(df.index, df['volume'], width=1, color='steelblue', alpha=0.4, label='成交量')
        if not ensemble_anom.empty:
            anom_vol = df.loc[df.index.isin(ensemble_anom.index), 'volume']
            ax2.bar(anom_vol.index, anom_vol.values, width=1, color='red', alpha=0.7, label='异常日成交量')
    ax2.set_ylabel('成交量', fontsize=12)
    ax2.set_xlabel('日期', fontsize=12)
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(output_dir / 'anomaly_price_chart.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [保存] {output_dir / 'anomaly_price_chart.png'}")


def plot_anomaly_feature_distributions(
    anomaly_result: pd.DataFrame,
    output_dir: Path,
):
    """绘制异常日 vs 正常日的特征分布对比"""
    features_to_plot = [f for f in DETECTION_FEATURES if f in anomaly_result.columns]
    n_feats = len(features_to_plot)
    if n_feats == 0:
        print("  [警告] 无可绘制特征")
        return

    n_cols = 3
    n_rows = (n_feats + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
    axes = np.array(axes).flatten()

    normal = anomaly_result[anomaly_result['anomaly_ensemble'] == 0]
    anomaly = anomaly_result[anomaly_result['anomaly_ensemble'] == 1]

    for idx, feat in enumerate(features_to_plot):
        ax = axes[idx]

        # 正常分布
        vals_normal = normal[feat].dropna()
        vals_anomaly = anomaly[feat].dropna()

        ax.hist(vals_normal, bins=50, density=True, alpha=0.6,
                color='steelblue', label=f'正常 (n={len(vals_normal)})', edgecolor='white', linewidth=0.3)
        if len(vals_anomaly) > 0:
            ax.hist(vals_anomaly, bins=30, density=True, alpha=0.6,
                    color='red', label=f'异常 (n={len(vals_anomaly)})', edgecolor='white', linewidth=0.3)

        ax.set_title(feat, fontsize=11)
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    # 隐藏多余子图
    for idx in range(n_feats, len(axes)):
        axes[idx].set_visible(False)

    fig.suptitle('异常日 vs 正常日 特征分布对比', fontsize=14, y=1.02)
    fig.tight_layout()
    fig.savefig(output_dir / 'anomaly_feature_distributions.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [保存] {output_dir / 'anomaly_feature_distributions.png'}")


def plot_precursor_roc(precursor_results: Dict, output_dir: Path):
    """绘制前兆分类器 ROC 曲线"""
    if not precursor_results or 'fpr' not in precursor_results:
        print("  [警告] 无前兆分类器结果，跳过 ROC 曲线")
        return

    fig, ax = plt.subplots(figsize=(8, 8))

    fpr = precursor_results['fpr']
    tpr = precursor_results['tpr']
    auc = precursor_results['auc']

    ax.plot(fpr, tpr, color='steelblue', linewidth=2,
            label=f'Random Forest (AUC = {auc:.4f})')
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='随机基线')

    ax.set_xlabel('假阳性率 (FPR)', fontsize=12)
    ax.set_ylabel('真阳性率 (TPR)', fontsize=12)
    ax.set_title('异常前兆分类器 ROC 曲线', fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    ax.set_xlim([-0.02, 1.02])
    ax.set_ylim([-0.02, 1.02])

    fig.savefig(output_dir / 'precursor_roc_curve.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [保存] {output_dir / 'precursor_roc_curve.png'}")


def plot_feature_importance(precursor_results: Dict, output_dir: Path, top_n: int = 20):
    """绘制前兆特征重要性条形图"""
    if not precursor_results or 'feature_importances' not in precursor_results:
        print("  [警告] 无特征重要性数据，跳过")
        return

    importances = precursor_results['feature_importances'].head(top_n)

    fig, ax = plt.subplots(figsize=(10, max(6, top_n * 0.35)))

    colors = plt.cm.RdYlBu_r(np.linspace(0.2, 0.8, len(importances)))
    ax.barh(range(len(importances)), importances.values[::-1],
            color=colors[::-1], edgecolor='white', linewidth=0.5)
    ax.set_yticks(range(len(importances)))
    ax.set_yticklabels(importances.index[::-1], fontsize=9)
    ax.set_xlabel('特征重要性', fontsize=12)
    ax.set_title(f'异常前兆 Top-{top_n} 特征重要性 (Random Forest)', fontsize=13)
    ax.grid(True, alpha=0.3, axis='x')

    fig.savefig(output_dir / 'precursor_feature_importance.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [保存] {output_dir / 'precursor_feature_importance.png'}")


# ============================================================
# 9. 多尺度异常检测
# ============================================================

def multi_scale_anomaly_detection(intervals=None, contamination=0.05) -> Dict:
    """多尺度异常检测"""
    if intervals is None:
        intervals = ['1h', '4h', '1d']

    results = {}
    for interval in intervals:
        try:
            print(f"\n  加载 {interval} 数据进行异常检测...")
            df_tf = load_klines(interval)
            df_tf = add_derived_features(df_tf)

            # 截断大数据
            if len(df_tf) > 50000:
                df_tf = df_tf.iloc[-50000:]

            if len(df_tf) < 200:
                print(f"    {interval} 数据不足，跳过")
                continue

            # 集成异常检测
            anomaly_result = ensemble_anomaly_detection(df_tf, contamination=contamination, min_agreement=2)

            # 提取异常日期
            anomaly_dates = anomaly_result[anomaly_result['anomaly_ensemble'] == 1].index

            results[interval] = {
                'anomaly_dates': anomaly_dates,
                'n_anomalies': len(anomaly_dates),
                'n_total': len(anomaly_result),
                'anomaly_pct': len(anomaly_dates) / len(anomaly_result) * 100,
            }

            print(f"    {interval}: {len(anomaly_dates)} 个异常 ({len(anomaly_dates)/len(anomaly_result)*100:.2f}%)")

        except FileNotFoundError:
            print(f"    {interval} 数据文件不存在，跳过")
        except Exception as e:
            print(f"    {interval} 异常检测失败: {e}")

    return results


def cross_scale_anomaly_consensus(ms_results: Dict, tolerance_hours: int = 24) -> pd.DataFrame:
    """
    跨尺度异常共识：多个尺度在同一时间窗口内同时报异常 → 高置信度

    Parameters
    ----------
    ms_results : Dict
        多尺度异常检测结果字典
    tolerance_hours : int
        时间容差（小时）

    Returns
    -------
    pd.DataFrame
        共识异常数据
    """
    # 将所有尺度的异常日期映射到日频
    all_dates = []
    for interval, result in ms_results.items():
        dates = result['anomaly_dates']
        # 转换为日期（去除时间部分）
        daily_dates = pd.to_datetime(dates.date).unique()
        for date in daily_dates:
            all_dates.append({'date': date, 'interval': interval})

    if not all_dates:
        return pd.DataFrame()

    df_dates = pd.DataFrame(all_dates)

    # 统计每个日期被多少个尺度报为异常
    consensus_counts = df_dates.groupby('date').size().reset_index(name='n_scales')
    consensus_counts = consensus_counts.sort_values('date')

    # >=2 个尺度报异常 = "共识异常"
    consensus_counts['is_consensus'] = (consensus_counts['n_scales'] >= 2).astype(int)

    # 添加参与的尺度列表
    scale_groups = df_dates.groupby('date')['interval'].apply(list).reset_index()
    consensus_counts = consensus_counts.merge(scale_groups, on='date')

    n_consensus = consensus_counts['is_consensus'].sum()
    print(f"\n  跨尺度共识异常: {n_consensus} 天 (≥2 个尺度同时报异常)")

    return consensus_counts


def plot_multi_scale_anomaly_timeline(df: pd.DataFrame, ms_results: Dict, consensus: pd.DataFrame, output_dir: Path):
    """多尺度异常共识时间线"""
    fig, axes = plt.subplots(2, 1, figsize=(16, 10), gridspec_kw={'height_ratios': [2, 1]})

    # 上图: 价格图（对数尺度）+ 共识异常点标注
    ax1 = axes[0]
    ax1.plot(df.index, df['close'], linewidth=0.6, color='steelblue', alpha=0.8, label='BTC 收盘价')

    if not consensus.empty:
        # 标注共识异常点
        consensus_dates = consensus[consensus['is_consensus'] == 1]['date']
        if len(consensus_dates) > 0:
            # 获取对应的价格
            consensus_prices = df.loc[df.index.isin(consensus_dates), 'close']
            if not consensus_prices.empty:
                ax1.scatter(consensus_prices.index, consensus_prices.values,
                           color='red', s=50, zorder=5, label=f'共识异常 (n={len(consensus_prices)})',
                           alpha=0.8, edgecolors='darkred', linewidths=1, marker='*')

    ax1.set_ylabel('价格 (USDT)', fontsize=12)
    ax1.set_title('多尺度异常检测：价格与共识异常', fontsize=14)
    ax1.legend(fontsize=10, loc='upper left')
    ax1.grid(True, alpha=0.3)
    ax1.set_yscale('log')

    # 下图: 各尺度异常时间线（类似甘特图）
    ax2 = axes[1]

    interval_labels = list(ms_results.keys())
    y_positions = range(len(interval_labels))

    colors = {'1h': 'lightcoral', '4h': 'orange', '1d': 'steelblue'}

    for idx, interval in enumerate(interval_labels):
        anomaly_dates = ms_results[interval]['anomaly_dates']
        # 转换为日期
        daily_dates = pd.to_datetime(anomaly_dates.date).unique()

        # 绘制时间线（每个异常日期用竖线表示）
        for date in daily_dates:
            ax2.axvline(x=date, ymin=idx/len(interval_labels), ymax=(idx+0.8)/len(interval_labels),
                       color=colors.get(interval, 'gray'), alpha=0.6, linewidth=2)

    # 标注共识异常区域
    if not consensus.empty:
        consensus_dates = consensus[consensus['is_consensus'] == 1]['date']
        for date in consensus_dates:
            ax2.axvspan(date, date + pd.Timedelta(days=1),
                       color='red', alpha=0.15, zorder=0)

    ax2.set_yticks(y_positions)
    ax2.set_yticklabels(interval_labels)
    ax2.set_ylabel('时间尺度', fontsize=12)
    ax2.set_xlabel('日期', fontsize=12)
    ax2.set_title('各尺度异常时间线（红色背景 = 共识异常）', fontsize=12)
    ax2.grid(True, alpha=0.3, axis='x')
    ax2.set_xlim(df.index.min(), df.index.max())

    fig.tight_layout()
    fig.savefig(output_dir / 'anomaly_multi_scale_timeline.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [保存] {output_dir / 'anomaly_multi_scale_timeline.png'}")


# ============================================================
# 7. 结果打印
# ============================================================

def print_anomaly_summary(
    anomaly_result: pd.DataFrame,
    garch_anomaly: Optional[pd.Series],
    precursor_results: Dict,
):
    """打印异常检测汇总"""
    print("\n" + "=" * 70)
    print("异常检测结果汇总")
    print("=" * 70)

    # 集成异常统计
    n_total = len(anomaly_result)
    n_ensemble = anomaly_result['anomaly_ensemble'].sum()
    print(f"\n  总样本数: {n_total}")
    print(f"  集成异常数: {n_ensemble} ({n_ensemble / n_total * 100:.2f}%)")

    # 各方法统计
    method_cols = [c for c in anomaly_result.columns if c.startswith('anomaly_') and c != 'anomaly_ensemble' and c != 'anomaly_votes']
    for col in method_cols:
        method_name = col.replace('anomaly_', '')
        n_anom = anomaly_result[col].sum()
        print(f"    {method_name:>12}: {n_anom} ({n_anom / n_total * 100:.2f}%)")

    # GARCH 异常
    if garch_anomaly is not None:
        n_garch = garch_anomaly.sum()
        print(f"    {'GARCH':>12}: {n_garch} ({n_garch / len(garch_anomaly) * 100:.2f}%)")

        # 集成异常与 GARCH 异常的重叠
        common_idx = anomaly_result.index.intersection(garch_anomaly.index)
        if len(common_idx) > 0:
            ensemble_set = set(anomaly_result.loc[common_idx][anomaly_result.loc[common_idx, 'anomaly_ensemble'] == 1].index)
            garch_set = set(garch_anomaly[garch_anomaly == 1].index)
            overlap = len(ensemble_set & garch_set)
            print(f"\n  集成 ∩ GARCH 重叠: {overlap} 个")

    # 前兆分类器
    if precursor_results and 'auc' in precursor_results:
        print(f"\n  前兆分类器 AUC: {precursor_results['auc']:.4f}")
        print(f"  Top-5 前兆特征:")
        for feat, imp in precursor_results['feature_importances'].head(5).items():
            print(f"    {feat:<40} {imp:.4f}")


# ============================================================
# 8. 主入口
# ============================================================

def run_anomaly_analysis(
    df: pd.DataFrame,
    output_dir: str = "output/anomaly",
) -> Dict:
    """
    异常检测与前兆模式分析主函数

    Parameters
    ----------
    df : pd.DataFrame
        日线数据（已通过 add_derived_features 添加衍生特征）
    output_dir : str
        图表输出目录

    Returns
    -------
    dict
        包含所有分析结果的字典
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 70)
    print("BTC 异常检测与前兆模式分析")
    print("=" * 70)
    print(f"数据范围: {df.index.min()} ~ {df.index.max()}")
    print(f"样本数量: {len(df)}")

    from src.font_config import configure_chinese_font
    configure_chinese_font()

    # --- 集成异常检测 ---
    print("\n>>> [1/5] 执行集成异常检测...")
    anomaly_result = ensemble_anomaly_detection(df, contamination=0.05, min_agreement=2)

    # --- GARCH 条件波动率异常 ---
    print("\n>>> [2/5] 执行 GARCH 条件波动率异常检测...")
    garch_anomaly = None
    try:
        garch_anomaly = garch_anomaly_detection(df, threshold=3.0)
    except Exception as e:
        print(f"  [错误] GARCH 异常检测失败: {e}")

    # --- 事件对齐 ---
    print("\n>>> [3/5] 执行事件对齐分析...")
    ensemble_anom_dates = anomaly_result[anomaly_result['anomaly_ensemble'] == 1].index
    event_alignment = align_with_events(ensemble_anom_dates, tolerance_days=5)

    # --- 前兆模式提取 ---
    print("\n>>> [4/5] 提取前兆模式并训练分类器...")
    precursor_results = {}
    try:
        X_precursor, y_precursor = extract_precursor_features(
            df, anomaly_result['anomaly_ensemble'], lookback_windows=[5, 10, 20]
        )
        print(f"  前兆特征矩阵: {X_precursor.shape[0]} 样本 x {X_precursor.shape[1]} 特征")
        precursor_results = train_precursor_classifier(X_precursor, y_precursor)
    except Exception as e:
        print(f"  [错误] 前兆模式提取失败: {e}")

    # --- 可视化 ---
    print("\n>>> [5/5] 生成可视化图表...")
    plot_price_with_anomalies(df, anomaly_result, garch_anomaly, output_dir)
    plot_anomaly_feature_distributions(anomaly_result, output_dir)
    plot_precursor_roc(precursor_results, output_dir)
    plot_feature_importance(precursor_results, output_dir)

    # --- 汇总打印 ---
    print_anomaly_summary(anomaly_result, garch_anomaly, precursor_results)

    # --- 多尺度异常检测 ---
    print("\n>>> [额外] 多尺度异常检测与共识分析...")
    ms_anomaly = multi_scale_anomaly_detection(['1h', '4h', '1d'])
    consensus = None
    if len(ms_anomaly) >= 2:
        consensus = cross_scale_anomaly_consensus(ms_anomaly)
        plot_multi_scale_anomaly_timeline(df, ms_anomaly, consensus, output_dir)

    print("\n" + "=" * 70)
    print("异常检测与前兆模式分析完成！")
    print(f"图表已保存至: {output_dir.resolve()}")
    print("=" * 70)

    return {
        'anomaly_result': anomaly_result,
        'garch_anomaly': garch_anomaly,
        'event_alignment': event_alignment,
        'precursor_results': precursor_results,
        'multi_scale_anomaly': ms_anomaly,
        'cross_scale_consensus': consensus,
    }


# ============================================================
# 独立运行入口
# ============================================================

if __name__ == '__main__':
    from src.data_loader import load_daily
    from src.preprocessing import add_derived_features

    df = load_daily()
    df = add_derived_features(df)
    run_anomaly_analysis(df)