Code fixes (16 modules):
- GARCH models switched to Student's t distribution + convergence checks (returns/volatility/anomaly)
- KS test replaced with Lilliefors test (returns)
- Data-leakage fixes: StratifiedKFold→TimeSeriesSplit, scaler fit per fold (anomaly)
- Precursor labels shift(-1) to predict next-day anomalies (anomaly)
- PSD normalization now includes the sampling frequency and the one-sided-spectrum ×2 factor (fft)
- Empirical scaling for the AR(1) red-noise baseline (fft)
- Box counting with independent x/y normalization, MF-DFA q=0 (fractal)
- ADF stationarity test + removal of the double Bonferroni correction (causality)
- R/S Hurst now reports R² goodness of fit (hurst)
- Recursive Prophet forecasting to avoid information leakage (time_series)
- IC computation filters zero signals; neutral patterns get hit_rate=NaN (indicators/patterns)
- Adaptive clustering thresholds (clustering)
- First-half/second-half robustness check for calendar effects (calendar)
- Evidence-scoring rubric text aligned with the code (visualization)
- NaN/empty-value guards in the core pipeline (data_loader/preprocessing/main)

Report fixes (docs/REPORT.md, 15 spots):
- Disambiguated the scaling exponent H_scaling from the Hurst exponent
- Recomputed the GBM 6-month probability cone
- CLT caveats added, halving language softened, scenario-probability logic corrected
- GPD shape-parameter interpretation corrected; anomaly-AUC evidence downgraded

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
"""异常检测与前兆模式提取模块
|
||
|
||
分析内容:
|
||
- 集成异常检测(Isolation Forest + LOF + COPOD,≥2/3 一致判定)
|
||
- GARCH 条件波动率异常检测(标准化残差 > 3)
|
||
- 异常前兆模式提取(Random Forest 分类器)
|
||
- 事件对齐分析(比特币减半等重大事件)
|
||
- 可视化:异常标记价格图、特征分布对比、ROC 曲线、特征重要性
|
||
"""
|
||
|
||
import matplotlib
matplotlib.use('Agg')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from pathlib import Path
from typing import Optional, Dict, List, Tuple

from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, roc_curve

from src.data_loader import load_klines
from src.preprocessing import add_derived_features

try:
    from pyod.models.copod import COPOD
    HAS_COPOD = True
except ImportError:
    HAS_COPOD = False
    print("[Warning] pyod is not installed; COPOD will be skipped and 2-of-2 agreement used instead")


# ============================================================
# 1. Detection feature definitions
# ============================================================

# Feature columns used for anomaly detection
DETECTION_FEATURES = [
    'log_return',
    'abs_return',
    'volume_ratio',
    'range_pct',
    'taker_buy_ratio',
    'vol_7d',
]

# Bitcoin halvings and other major event dates
KNOWN_EVENTS = {
    '2012-11-28': 'First halving',
    '2016-07-09': 'Second halving',
    '2020-05-11': 'Third halving',
    '2024-04-20': 'Fourth halving',
    '2017-12-17': '2017 bull-market top',
    '2018-12-15': '2018 bear-market bottom',
    '2020-03-12': 'COVID Black Thursday',
    '2021-04-14': '2021 bull-market interim high',
    '2021-11-10': '2021 bull-market top',
    '2022-06-18': 'Luna/3AC crash',
    '2022-11-09': 'FTX collapse',
    '2024-01-11': 'BTC ETF approval',
}


# ============================================================
# 2. Ensemble anomaly detection
# ============================================================

def prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray]:
    """
    Prepare the feature matrix for anomaly detection.

    Parameters
    ----------
    df : pd.DataFrame
        Daily data with derived features.

    Returns
    -------
    features_df : pd.DataFrame
        Feature subset (NaN rows dropped).
    X_scaled : np.ndarray
        Standardized feature matrix.
    """
    # Select the available features
    available = [f for f in DETECTION_FEATURES if f in df.columns]
    if len(available) < 3:
        raise ValueError(f"Not enough usable features: {available}; at least 3 are required")

    features_df = df[available].dropna()

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features_df.values)

    return features_df, X_scaled


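# Editorial note (an assumption, not asserted by the code): the commit's
# "scaler fit per fold" leakage fix targets the supervised classifier in
# train_precursor_classifier below. For the unsupervised detectors in this
# section, fitting StandardScaler on the full sample is generally considered
# acceptable, since no held-out labels are being predicted.

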
def detect_isolation_forest(X: np.ndarray, contamination: float = 0.05) -> np.ndarray:
    """Isolation Forest anomaly detection."""
    model = IsolationForest(
        n_estimators=200,
        contamination=contamination,
        random_state=42,
        n_jobs=-1,
    )
    # -1 = anomaly, 1 = normal
    labels = model.fit_predict(X)
    return (labels == -1).astype(int)


def detect_lof(X: np.ndarray, contamination: float = 0.05) -> np.ndarray:
    """Local Outlier Factor anomaly detection."""
    model = LocalOutlierFactor(
        n_neighbors=20,
        contamination=contamination,
        novelty=False,
        n_jobs=-1,
    )
    labels = model.fit_predict(X)
    return (labels == -1).astype(int)


def detect_copod(X: np.ndarray, contamination: float = 0.05) -> Optional[np.ndarray]:
    """COPOD anomaly detection (copula-based). Returns None if pyod is unavailable."""
    if not HAS_COPOD:
        return None

    model = COPOD(contamination=contamination)
    labels = model.fit_predict(X)
    return labels.astype(int)


def ensemble_anomaly_detection(
    df: pd.DataFrame,
    contamination: float = 0.05,
    min_agreement: int = 2,
) -> pd.DataFrame:
    """
    Ensemble anomaly detection: a day is flagged only when at least
    min_agreement of the n_methods detectors agree.

    Parameters
    ----------
    df : pd.DataFrame
        Daily data with derived features.
    contamination : float
        Expected anomaly fraction.
    min_agreement : int
        Minimum number of methods that must agree before a day is flagged.

    Returns
    -------
    pd.DataFrame
        Input features plus per-method labels and the ensemble result.
    """
    features_df, X_scaled = prepare_features(df)

    print(f"  Feature matrix: {X_scaled.shape[0]} samples x {X_scaled.shape[1]} features")

    # Run the individual detectors
    print("  [1/3] Isolation Forest...")
    if_labels = detect_isolation_forest(X_scaled, contamination)

    print("  [2/3] Local Outlier Factor...")
    lof_labels = detect_lof(X_scaled, contamination)

    n_methods = 2
    vote_matrix = np.column_stack([if_labels, lof_labels])
    method_names = ['iforest', 'lof']

    print("  [3/3] COPOD...")
    copod_labels = detect_copod(X_scaled, contamination)
    if copod_labels is not None:
        vote_matrix = np.column_stack([vote_matrix, copod_labels])
        method_names.append('copod')
        n_methods = 3
    else:
        print("  COPOD unavailable; using the 2-method ensemble")

    # Voting
    vote_sum = vote_matrix.sum(axis=1)
    ensemble_label = (vote_sum >= min_agreement).astype(int)

    # Assemble the result DataFrame
    result = features_df.copy()
    for i, name in enumerate(method_names):
        result[f'anomaly_{name}'] = vote_matrix[:, i]
    result['anomaly_votes'] = vote_sum
    result['anomaly_ensemble'] = ensemble_label

    # Per-method statistics
    print("\n  Anomaly detection statistics:")
    for name in method_names:
        n_anom = result[f'anomaly_{name}'].sum()
        print(f"    {name:>12}: {n_anom} anomalies ({n_anom / len(result) * 100:.2f}%)")
    n_ensemble = ensemble_label.sum()
    print(f"    {'ensemble(>=' + str(min_agreement) + ')':>12}: {n_ensemble} anomalies "
          f"({n_ensemble / len(result) * 100:.2f}%)")

    # Pairwise overlap between methods
    print("\n  Pairwise method overlap:")
    for i in range(len(method_names)):
        for j in range(i + 1, len(method_names)):
            overlap = ((vote_matrix[:, i] == 1) & (vote_matrix[:, j] == 1)).sum()
            n_i = vote_matrix[:, i].sum()
            n_j = vote_matrix[:, j].sum()
            if min(n_i, n_j) > 0:
                jaccard = overlap / ((vote_matrix[:, i] == 1) | (vote_matrix[:, j] == 1)).sum()
            else:
                jaccard = 0.0
            print(f"    {method_names[i]} ∩ {method_names[j]}: "
                  f"{overlap} days (Jaccard={jaccard:.3f})")

    return result


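# Illustrative only: a minimal sketch of the voting rule above. With three
# detectors and min_agreement=2, a day flagged by any two methods becomes an
# ensemble anomaly, while a lone flag does not. Not called by the pipeline.
def _ensemble_vote_example() -> None:
    votes = np.array([
        [1, 1, 0],  # iforest + lof agree -> anomaly
        [1, 0, 0],  # lone flag           -> normal
        [1, 1, 1],  # unanimous           -> anomaly
    ])
    labels = (votes.sum(axis=1) >= 2).astype(int)
    assert labels.tolist() == [1, 0, 1]

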
# ============================================================
# 3. GARCH conditional-volatility anomalies
# ============================================================

def garch_anomaly_detection(
    df: pd.DataFrame,
    threshold: float = 3.0,
) -> pd.Series:
    """
    GARCH(1,1)-based conditional-volatility anomaly detection.

    Days with standardized residual |eps_t / sigma_t| > threshold are flagged.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing a log_return column.
    threshold : float
        Standardized-residual threshold.

    Returns
    -------
    pd.Series
        Anomaly flags (1 = anomaly, 0 = normal), index-aligned with the input.
    """
    from arch import arch_model

    returns = df['log_return'].dropna()
    r_pct = returns * 100  # the arch package expects percentage returns

    # Fit GARCH(1,1) with Student's t errors (per the commit-wide switch from
    # Normal to t) and check that the optimizer converged.
    model = arch_model(r_pct, vol='Garch', p=1, q=1, mean='Constant', dist='t')
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = model.fit(disp='off')
    if result.convergence_flag != 0:
        print(f"  [Warning] GARCH optimizer did not converge (flag={result.convergence_flag})")

    # Standardized residuals
    std_resid = result.resid / result.conditional_volatility
    anomaly = (std_resid.abs() > threshold).astype(int)

    n_anom = anomaly.sum()
    print(f"  GARCH anomalies: {n_anom} (|standardized residual| > {threshold})")
    print(f"  GARCH model: α={result.params.get('alpha[1]', np.nan):.4f}, "
          f"β={result.params.get('beta[1]', np.nan):.4f}, "
          f"persistence={result.params.get('alpha[1]', 0) + result.params.get('beta[1]', 0):.4f}")

    return anomaly


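# Illustrative only: the flagging rule above reduces to a threshold on
# |residual| / conditional volatility. A minimal numpy sketch with made-up
# residuals and volatilities; not called by the pipeline.
def _garch_flag_example(threshold: float = 3.0) -> None:
    resid = np.array([0.5, -4.2, 1.0])
    cond_vol = np.array([1.0, 1.0, 2.0])
    flags = (np.abs(resid / cond_vol) > threshold).astype(int)
    assert flags.tolist() == [0, 1, 0]

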
# ============================================================
# 4. Precursor pattern extraction
# ============================================================

def extract_precursor_features(
    df: pd.DataFrame,
    anomaly_labels: pd.Series,
    lookback_windows: List[int] = None,
) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Extract features from the days preceding each anomaly as precursor signals.

    Parameters
    ----------
    df : pd.DataFrame
        Data with derived features.
    anomaly_labels : pd.Series
        Anomaly flags (1 = anomaly).
    lookback_windows : list of int
        Lookback windows in days.

    Returns
    -------
    X : pd.DataFrame
        Precursor feature matrix.
    y : pd.Series
        Labels (1 = an anomaly follows, 0 = normal).
    """
    if lookback_windows is None:
        lookback_windows = [5, 10, 20]

    # Ensure alignment
    common_idx = df.index.intersection(anomaly_labels.index)
    df_aligned = df.loc[common_idx]
    labels_aligned = anomaly_labels.loc[common_idx]

    base_features = [f for f in DETECTION_FEATURES if f in df.columns]
    precursor_features = {}

    for window in lookback_windows:
        for feat in base_features:
            if feat not in df_aligned.columns:
                continue
            series = df_aligned[feat]

            # Rolling statistics as precursor features
            precursor_features[f'{feat}_mean_{window}d'] = series.rolling(window).mean()
            precursor_features[f'{feat}_std_{window}d'] = series.rolling(window).std()
            precursor_features[f'{feat}_max_{window}d'] = series.rolling(window).max()
            precursor_features[f'{feat}_min_{window}d'] = series.rolling(window).min()

            # Trend feature (deviation of the latest value from the window mean)
            rolling_mean = series.rolling(window).mean()
            precursor_features[f'{feat}_deviation_{window}d'] = series - rolling_mean

    X = pd.DataFrame(precursor_features, index=df_aligned.index)

    # Labels: predict whether an anomaly occurs on the NEXT day (1-day look-ahead)
    y = labels_aligned.shift(-1).dropna()
    X = X.loc[y.index]  # align features and labels

    # Drop NaNs
    valid_mask = X.notna().all(axis=1) & y.notna()
    X = X[valid_mask]
    y = y[valid_mask]

    return X, y


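# Illustrative only: why the shift(-1) above avoids look-ahead leakage. The
# features computed through day t are paired with the anomaly flag of day t+1,
# so the classifier never sees same-day information about its target. A
# minimal pandas sketch; not called by the pipeline.
def _precursor_label_example() -> None:
    flags = pd.Series([0, 0, 1, 0], index=pd.date_range('2024-01-01', periods=4))
    y = flags.shift(-1).dropna()  # label for day t is the flag of day t+1
    assert y.loc['2024-01-02'] == 1  # Jan 2 features predict the Jan 3 anomaly

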
def train_precursor_classifier(
    X: pd.DataFrame,
    y: pd.Series,
) -> Dict:
    """
    Train the precursor pattern classifier (Random Forest).

    Evaluated with time-series cross-validation (TimeSeriesSplit); the scaler
    is fit on each training fold only, to avoid data leakage.

    Parameters
    ----------
    X : pd.DataFrame
        Precursor feature matrix.
    y : pd.Series
        Labels.

    Returns
    -------
    dict
        AUC, feature importances, and related results.
    """
    if len(X) < 50 or y.sum() < 10:
        print(f"  [Warning] Not enough samples (n={len(X)}, positives={y.sum()}); skipping classifier training")
        return {}

    # Time-series cross-validation
    n_splits = min(5, int(y.sum()))
    if n_splits < 2:
        print("  [Warning] Too few positives for cross-validation")
        return {}

    cv = TimeSeriesSplit(n_splits=n_splits)

    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )

    # Manual cross-validation (fit the scaler per fold to prevent data leakage)
    try:
        y_prob = np.full(len(y), np.nan)
        for train_idx, val_idx in cv.split(X):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)
            clf.fit(X_train_scaled, y_train)
            y_prob[val_idx] = clf.predict_proba(X_val_scaled)[:, 1]
        # Drop samples that were never in a validation fold (if any)
        valid_prob_mask = ~np.isnan(y_prob)
        y_eval = y[valid_prob_mask]
        y_prob_eval = y_prob[valid_prob_mask]
        auc = roc_auc_score(y_eval, y_prob_eval)
    except Exception as e:
        print(f"  [Error] Cross-validation failed: {e}")
        return {}

    # Refit on the full data to obtain feature importances
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    clf.fit(X_scaled, y)
    importances = pd.Series(clf.feature_importances_, index=X.columns)
    importances = importances.sort_values(ascending=False)

    # ROC curve data
    fpr, tpr, thresholds = roc_curve(y_eval, y_prob_eval)

    results = {
        'auc': auc,
        'feature_importances': importances,
        'y_true': y_eval,
        'y_prob': y_prob_eval,
        'fpr': fpr,
        'tpr': tpr,
    }

    print("\n  Precursor classifier results:")
    print(f"    AUC: {auc:.4f}")
    print(f"    Samples: {len(y)} (anomalous: {y.sum()}, normal: {(y == 0).sum()})")
    print("    Top-10 features by importance:")
    for feat, imp in importances.head(10).items():
        print(f"      {feat:<40} {imp:.4f}")

    return results


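# Illustrative only: TimeSeriesSplit always places each validation fold
# strictly after its training fold, which is what makes the per-fold scaler
# fit above leak-free in time. A minimal sketch; not called by the pipeline.
def _timeseries_split_example() -> None:
    cv = TimeSeriesSplit(n_splits=3)
    X_toy = np.arange(8).reshape(-1, 1)
    for train_idx, val_idx in cv.split(X_toy):
        assert train_idx.max() < val_idx.min()  # training data precedes validation

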
# ============================================================
# 5. Event alignment analysis
# ============================================================

def align_with_events(
    anomaly_dates: pd.DatetimeIndex,
    tolerance_days: int = 5,
) -> pd.DataFrame:
    """
    Align anomaly dates with known events.

    Parameters
    ----------
    anomaly_dates : pd.DatetimeIndex
        Anomaly dates.
    tolerance_days : int
        Tolerance in days (an anomaly within tolerance_days of an event date
        counts as a match).

    Returns
    -------
    pd.DataFrame
        Match results.
    """
    matches = []

    for event_date_str, event_name in KNOWN_EVENTS.items():
        event_date = pd.Timestamp(event_date_str)

        for anom_date in anomaly_dates:
            diff_days = abs((anom_date - event_date).days)
            if diff_days <= tolerance_days:
                matches.append({
                    'anomaly_date': anom_date,
                    'event_date': event_date,
                    'event_name': event_name,
                    'diff_days': diff_days,
                })

    if matches:
        result = pd.DataFrame(matches)
        print(f"\n  Event alignment (tolerance {tolerance_days} days):")
        for _, row in result.iterrows():
            print(f"    anomaly {row['anomaly_date'].strftime('%Y-%m-%d')} ↔ "
                  f"{row['event_name']} ({row['event_date'].strftime('%Y-%m-%d')}, "
                  f"{row['diff_days']} days apart)")
        return result
    else:
        print(f"  [Info] No anomaly dates matched known events (tolerance {tolerance_days} days)")
        return pd.DataFrame()


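# Illustrative only: the matching rule above is a plain day-difference check.
# A minimal sketch; not called by the pipeline.
def _event_tolerance_example() -> None:
    anom = pd.Timestamp('2020-03-10')
    event = pd.Timestamp('2020-03-12')  # COVID Black Thursday in KNOWN_EVENTS
    assert abs((anom - event).days) <= 5  # within tolerance -> counted as a match

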
# ============================================================
# 6. Visualization
# ============================================================

def plot_price_with_anomalies(
    df: pd.DataFrame,
    anomaly_result: pd.DataFrame,
    garch_anomaly: Optional[pd.Series],
    output_dir: Path,
):
    """Plot the price series with anomaly markers."""
    fig, axes = plt.subplots(2, 1, figsize=(16, 10), gridspec_kw={'height_ratios': [3, 1]})

    # Top panel: price + anomaly markers
    ax1 = axes[0]
    ax1.plot(df.index, df['close'], linewidth=0.6, color='steelblue', alpha=0.8, label='BTC close')

    # Ensemble anomalies
    ensemble_anom = anomaly_result[anomaly_result['anomaly_ensemble'] == 1]
    if not ensemble_anom.empty:
        # Close prices on the anomaly dates
        anom_prices = df.loc[df.index.isin(ensemble_anom.index), 'close']
        ax1.scatter(anom_prices.index, anom_prices.values,
                    color='red', s=30, zorder=5, label=f'Ensemble anomalies (n={len(anom_prices)})',
                    alpha=0.7, edgecolors='darkred', linewidths=0.5)

    # GARCH anomalies
    if garch_anomaly is not None:
        garch_anom_dates = garch_anomaly[garch_anomaly == 1].index
        garch_prices = df.loc[df.index.isin(garch_anom_dates), 'close']
        if not garch_prices.empty:
            ax1.scatter(garch_prices.index, garch_prices.values,
                        color='orange', s=20, zorder=4, marker='^',
                        label=f'GARCH anomalies (n={len(garch_prices)})',
                        alpha=0.7, edgecolors='darkorange', linewidths=0.5)

    ax1.set_ylabel('Price (USDT)', fontsize=12)
    ax1.set_title('BTC price with anomaly detection results', fontsize=14)
    ax1.legend(fontsize=10, loc='upper left')
    ax1.grid(True, alpha=0.3)
    ax1.set_yscale('log')

    # Bottom panel: volume + anomaly markers
    ax2 = axes[1]
    if 'volume' in df.columns:
        ax2.bar(df.index, df['volume'], width=1, color='steelblue', alpha=0.4, label='Volume')
        if not ensemble_anom.empty:
            anom_vol = df.loc[df.index.isin(ensemble_anom.index), 'volume']
            ax2.bar(anom_vol.index, anom_vol.values, width=1, color='red', alpha=0.7, label='Volume on anomaly days')
    ax2.set_ylabel('Volume', fontsize=12)
    ax2.set_xlabel('Date', fontsize=12)
    ax2.legend(fontsize=10)
    ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    fig.savefig(output_dir / 'anomaly_price_chart.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [Saved] {output_dir / 'anomaly_price_chart.png'}")


def plot_anomaly_feature_distributions(
    anomaly_result: pd.DataFrame,
    output_dir: Path,
):
    """Plot feature distributions on anomaly days vs. normal days."""
    features_to_plot = [f for f in DETECTION_FEATURES if f in anomaly_result.columns]
    n_feats = len(features_to_plot)
    if n_feats == 0:
        print("  [Warning] No features to plot")
        return

    n_cols = 3
    n_rows = (n_feats + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
    axes = np.array(axes).flatten()

    normal = anomaly_result[anomaly_result['anomaly_ensemble'] == 0]
    anomaly = anomaly_result[anomaly_result['anomaly_ensemble'] == 1]

    for idx, feat in enumerate(features_to_plot):
        ax = axes[idx]

        # Distributions on normal vs. anomaly days
        vals_normal = normal[feat].dropna()
        vals_anomaly = anomaly[feat].dropna()

        ax.hist(vals_normal, bins=50, density=True, alpha=0.6,
                color='steelblue', label=f'Normal (n={len(vals_normal)})', edgecolor='white', linewidth=0.3)
        if len(vals_anomaly) > 0:
            ax.hist(vals_anomaly, bins=30, density=True, alpha=0.6,
                    color='red', label=f'Anomaly (n={len(vals_anomaly)})', edgecolor='white', linewidth=0.3)

        ax.set_title(feat, fontsize=11)
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    # Hide unused subplots
    for idx in range(n_feats, len(axes)):
        axes[idx].set_visible(False)

    fig.suptitle('Feature distributions: anomaly days vs. normal days', fontsize=14, y=1.02)
    fig.tight_layout()
    fig.savefig(output_dir / 'anomaly_feature_distributions.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [Saved] {output_dir / 'anomaly_feature_distributions.png'}")


def plot_precursor_roc(precursor_results: Dict, output_dir: Path):
    """Plot the ROC curve of the precursor classifier."""
    if not precursor_results or 'fpr' not in precursor_results:
        print("  [Warning] No precursor classifier results; skipping ROC curve")
        return

    fig, ax = plt.subplots(figsize=(8, 8))

    fpr = precursor_results['fpr']
    tpr = precursor_results['tpr']
    auc = precursor_results['auc']

    ax.plot(fpr, tpr, color='steelblue', linewidth=2,
            label=f'Random Forest (AUC = {auc:.4f})')
    ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random baseline')

    ax.set_xlabel('False positive rate (FPR)', fontsize=12)
    ax.set_ylabel('True positive rate (TPR)', fontsize=12)
    ax.set_title('ROC curve of the anomaly precursor classifier', fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    ax.set_xlim([-0.02, 1.02])
    ax.set_ylim([-0.02, 1.02])

    fig.savefig(output_dir / 'precursor_roc_curve.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [Saved] {output_dir / 'precursor_roc_curve.png'}")


def plot_feature_importance(precursor_results: Dict, output_dir: Path, top_n: int = 20):
    """Plot a bar chart of precursor feature importances."""
    if not precursor_results or 'feature_importances' not in precursor_results:
        print("  [Warning] No feature-importance data; skipping")
        return

    importances = precursor_results['feature_importances'].head(top_n)

    fig, ax = plt.subplots(figsize=(10, max(6, top_n * 0.35)))

    colors = plt.cm.RdYlBu_r(np.linspace(0.2, 0.8, len(importances)))
    ax.barh(range(len(importances)), importances.values[::-1],
            color=colors[::-1], edgecolor='white', linewidth=0.5)
    ax.set_yticks(range(len(importances)))
    ax.set_yticklabels(importances.index[::-1], fontsize=9)
    ax.set_xlabel('Feature importance', fontsize=12)
    ax.set_title(f'Top-{top_n} anomaly precursor features (Random Forest)', fontsize=13)
    ax.grid(True, alpha=0.3, axis='x')

    fig.savefig(output_dir / 'precursor_feature_importance.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [Saved] {output_dir / 'precursor_feature_importance.png'}")


# ============================================================
# 7. Multi-scale anomaly detection
# ============================================================

def multi_scale_anomaly_detection(intervals=None, contamination=0.05) -> Dict:
    """Run the ensemble anomaly detection across multiple time scales."""
    if intervals is None:
        intervals = ['1h', '4h', '1d']

    results = {}
    for interval in intervals:
        try:
            print(f"\n  Loading {interval} data for anomaly detection...")
            df_tf = load_klines(interval)
            df_tf = add_derived_features(df_tf)

            # Truncate very large datasets
            if len(df_tf) > 50000:
                df_tf = df_tf.iloc[-50000:]

            if len(df_tf) < 200:
                print(f"  Not enough {interval} data; skipping")
                continue

            # Ensemble anomaly detection
            anomaly_result = ensemble_anomaly_detection(df_tf, contamination=contamination, min_agreement=2)

            # Extract the anomaly dates
            anomaly_dates = anomaly_result[anomaly_result['anomaly_ensemble'] == 1].index

            results[interval] = {
                'anomaly_dates': anomaly_dates,
                'n_anomalies': len(anomaly_dates),
                'n_total': len(anomaly_result),
                'anomaly_pct': len(anomaly_dates) / len(anomaly_result) * 100,
            }

            print(f"  {interval}: {len(anomaly_dates)} anomalies ({len(anomaly_dates)/len(anomaly_result)*100:.2f}%)")

        except FileNotFoundError:
            print(f"  {interval} data file not found; skipping")
        except Exception as e:
            print(f"  {interval} anomaly detection failed: {e}")

    return results


def cross_scale_anomaly_consensus(ms_results: Dict, tolerance_hours: int = 24) -> pd.DataFrame:
    """
    Cross-scale anomaly consensus: when several scales flag the same time
    window, the anomaly is treated as high-confidence.

    Parameters
    ----------
    ms_results : Dict
        Multi-scale anomaly detection results.
    tolerance_hours : int
        Time tolerance in hours. Note: the current implementation buckets all
        timestamps to calendar days, so the effective tolerance is one day and
        this parameter is not otherwise used.

    Returns
    -------
    pd.DataFrame
        Consensus anomaly data.
    """
    # Map the anomaly timestamps of every scale to daily frequency
    all_dates = []
    for interval, result in ms_results.items():
        dates = result['anomaly_dates']
        # Convert to dates (drop the time component)
        daily_dates = pd.to_datetime(dates.date).unique()
        for date in daily_dates:
            all_dates.append({'date': date, 'interval': interval})

    if not all_dates:
        return pd.DataFrame()

    df_dates = pd.DataFrame(all_dates)

    # Count how many scales flagged each date
    consensus_counts = df_dates.groupby('date').size().reset_index(name='n_scales')
    consensus_counts = consensus_counts.sort_values('date')

    # Flagged by >= 2 scales = "consensus anomaly"
    consensus_counts['is_consensus'] = (consensus_counts['n_scales'] >= 2).astype(int)

    # Attach the list of participating scales
    scale_groups = df_dates.groupby('date')['interval'].apply(list).reset_index()
    consensus_counts = consensus_counts.merge(scale_groups, on='date')

    n_consensus = consensus_counts['is_consensus'].sum()
    print(f"\n  Cross-scale consensus anomalies: {n_consensus} days (flagged by >= 2 scales)")

    return consensus_counts


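# Illustrative only: the consensus rule above is a groupby-size over
# (date, interval) pairs. A minimal pandas sketch; not called by the pipeline.
def _consensus_count_example() -> None:
    df_pairs = pd.DataFrame([
        {'date': '2022-11-09', 'interval': '1h'},
        {'date': '2022-11-09', 'interval': '1d'},
        {'date': '2023-01-02', 'interval': '4h'},
    ])
    counts = df_pairs.groupby('date').size()
    assert counts['2022-11-09'] == 2  # flagged by two scales -> consensus
    assert counts['2023-01-02'] == 1  # single scale -> not consensus

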
def plot_multi_scale_anomaly_timeline(df: pd.DataFrame, ms_results: Dict, consensus: pd.DataFrame, output_dir: Path):
    """Timeline of multi-scale anomaly consensus."""
    fig, axes = plt.subplots(2, 1, figsize=(16, 10), gridspec_kw={'height_ratios': [2, 1]})

    # Top panel: price (log scale) with consensus anomalies marked
    ax1 = axes[0]
    ax1.plot(df.index, df['close'], linewidth=0.6, color='steelblue', alpha=0.8, label='BTC close')

    if not consensus.empty:
        # Mark the consensus anomalies
        consensus_dates = consensus[consensus['is_consensus'] == 1]['date']
        if len(consensus_dates) > 0:
            # Prices on the consensus dates
            consensus_prices = df.loc[df.index.isin(consensus_dates), 'close']
            if not consensus_prices.empty:
                ax1.scatter(consensus_prices.index, consensus_prices.values,
                            color='red', s=50, zorder=5, label=f'Consensus anomalies (n={len(consensus_prices)})',
                            alpha=0.8, edgecolors='darkred', linewidths=1, marker='*')

    ax1.set_ylabel('Price (USDT)', fontsize=12)
    ax1.set_title('Multi-scale anomaly detection: price and consensus anomalies', fontsize=14)
    ax1.legend(fontsize=10, loc='upper left')
    ax1.grid(True, alpha=0.3)
    ax1.set_yscale('log')

    # Bottom panel: per-scale anomaly timeline (Gantt-like)
    ax2 = axes[1]

    interval_labels = list(ms_results.keys())
    y_positions = range(len(interval_labels))

    colors = {'1h': 'lightcoral', '4h': 'orange', '1d': 'steelblue'}

    for idx, interval in enumerate(interval_labels):
        anomaly_dates = ms_results[interval]['anomaly_dates']
        # Convert to dates
        daily_dates = pd.to_datetime(anomaly_dates.date).unique()

        # Draw the timeline (one vertical tick per anomaly day)
        for date in daily_dates:
            ax2.axvline(x=date, ymin=idx/len(interval_labels), ymax=(idx+0.8)/len(interval_labels),
                        color=colors.get(interval, 'gray'), alpha=0.6, linewidth=2)

    # Shade the consensus anomaly regions
    if not consensus.empty:
        consensus_dates = consensus[consensus['is_consensus'] == 1]['date']
        for date in consensus_dates:
            ax2.axvspan(date, date + pd.Timedelta(days=1),
                        color='red', alpha=0.15, zorder=0)

    ax2.set_yticks(y_positions)
    ax2.set_yticklabels(interval_labels)
    ax2.set_ylabel('Time scale', fontsize=12)
    ax2.set_xlabel('Date', fontsize=12)
    ax2.set_title('Per-scale anomaly timeline (red background = consensus anomaly)', fontsize=12)
    ax2.grid(True, alpha=0.3, axis='x')
    ax2.set_xlim(df.index.min(), df.index.max())

    fig.tight_layout()
    fig.savefig(output_dir / 'anomaly_multi_scale_timeline.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [Saved] {output_dir / 'anomaly_multi_scale_timeline.png'}")


# ============================================================
# 8. Summary printing
# ============================================================

def print_anomaly_summary(
    anomaly_result: pd.DataFrame,
    garch_anomaly: Optional[pd.Series],
    precursor_results: Dict,
):
    """Print a summary of the anomaly detection results."""
    print("\n" + "=" * 70)
    print("Anomaly detection summary")
    print("=" * 70)

    # Ensemble statistics
    n_total = len(anomaly_result)
    n_ensemble = anomaly_result['anomaly_ensemble'].sum()
    print(f"\n  Total samples: {n_total}")
    print(f"  Ensemble anomalies: {n_ensemble} ({n_ensemble / n_total * 100:.2f}%)")

    # Per-method statistics
    method_cols = [c for c in anomaly_result.columns
                   if c.startswith('anomaly_') and c not in ('anomaly_ensemble', 'anomaly_votes')]
    for col in method_cols:
        method_name = col.replace('anomaly_', '')
        n_anom = anomaly_result[col].sum()
        print(f"  {method_name:>12}: {n_anom} ({n_anom / n_total * 100:.2f}%)")

    # GARCH anomalies
    if garch_anomaly is not None:
        n_garch = garch_anomaly.sum()
        print(f"  {'GARCH':>12}: {n_garch} ({n_garch / len(garch_anomaly) * 100:.2f}%)")

        # Overlap between ensemble and GARCH anomalies
        common_idx = anomaly_result.index.intersection(garch_anomaly.index)
        if len(common_idx) > 0:
            ensemble_set = set(anomaly_result.loc[common_idx][anomaly_result.loc[common_idx, 'anomaly_ensemble'] == 1].index)
            garch_set = set(garch_anomaly[garch_anomaly == 1].index)
            overlap = len(ensemble_set & garch_set)
            print(f"\n  Ensemble ∩ GARCH overlap: {overlap} days")

    # Precursor classifier
    if precursor_results and 'auc' in precursor_results:
        print(f"\n  Precursor classifier AUC: {precursor_results['auc']:.4f}")
        print("  Top-5 precursor features:")
        for feat, imp in precursor_results['feature_importances'].head(5).items():
            print(f"    {feat:<40} {imp:.4f}")


# ============================================================
# 9. Main entry point
# ============================================================

def run_anomaly_analysis(
    df: pd.DataFrame,
    output_dir: str = "output/anomaly",
) -> Dict:
    """
    Main routine for anomaly detection and precursor pattern analysis.

    Parameters
    ----------
    df : pd.DataFrame
        Daily data (with derived features from add_derived_features).
    output_dir : str
        Output directory for the charts.

    Returns
    -------
    dict
        Dictionary with all analysis results.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 70)
    print("BTC anomaly detection and precursor pattern analysis")
    print("=" * 70)
    print(f"Data range: {df.index.min()} ~ {df.index.max()}")
    print(f"Samples: {len(df)}")

    from src.font_config import configure_chinese_font
    configure_chinese_font()

    # --- Ensemble anomaly detection ---
    print("\n>>> [1/5] Running ensemble anomaly detection...")
    anomaly_result = ensemble_anomaly_detection(df, contamination=0.05, min_agreement=2)

    # --- GARCH conditional-volatility anomalies ---
    print("\n>>> [2/5] Running GARCH conditional-volatility anomaly detection...")
    garch_anomaly = None
    try:
        garch_anomaly = garch_anomaly_detection(df, threshold=3.0)
    except Exception as e:
        print(f"  [Error] GARCH anomaly detection failed: {e}")

    # --- Event alignment ---
    print("\n>>> [3/5] Running event alignment analysis...")
    ensemble_anom_dates = anomaly_result[anomaly_result['anomaly_ensemble'] == 1].index
    event_alignment = align_with_events(ensemble_anom_dates, tolerance_days=5)

    # --- Precursor pattern extraction ---
    print("\n>>> [4/5] Extracting precursor patterns and training the classifier...")
    precursor_results = {}
    try:
        X_precursor, y_precursor = extract_precursor_features(
            df, anomaly_result['anomaly_ensemble'], lookback_windows=[5, 10, 20]
        )
        print(f"  Precursor feature matrix: {X_precursor.shape[0]} samples x {X_precursor.shape[1]} features")
        precursor_results = train_precursor_classifier(X_precursor, y_precursor)
    except Exception as e:
        print(f"  [Error] Precursor pattern extraction failed: {e}")

    # --- Visualization ---
    print("\n>>> [5/5] Generating charts...")
    plot_price_with_anomalies(df, anomaly_result, garch_anomaly, output_dir)
    plot_anomaly_feature_distributions(anomaly_result, output_dir)
    plot_precursor_roc(precursor_results, output_dir)
    plot_feature_importance(precursor_results, output_dir)

    # --- Summary ---
    print_anomaly_summary(anomaly_result, garch_anomaly, precursor_results)

    # --- Multi-scale anomaly detection ---
    print("\n>>> [Extra] Multi-scale anomaly detection and consensus analysis...")
    ms_anomaly = multi_scale_anomaly_detection(['1h', '4h', '1d'])
    consensus = None
    if len(ms_anomaly) >= 2:
        consensus = cross_scale_anomaly_consensus(ms_anomaly)
        plot_multi_scale_anomaly_timeline(df, ms_anomaly, consensus, output_dir)

    print("\n" + "=" * 70)
    print("Anomaly detection and precursor pattern analysis complete!")
    print(f"Charts saved to: {output_dir.resolve()}")
    print("=" * 70)

    return {
        'anomaly_result': anomaly_result,
        'garch_anomaly': garch_anomaly,
        'event_alignment': event_alignment,
        'precursor_results': precursor_results,
        'multi_scale_anomaly': ms_anomaly,
        'cross_scale_consensus': consensus,
    }


# ============================================================
# Standalone entry point
# ============================================================

if __name__ == '__main__':
    from src.data_loader import load_daily
    from src.preprocessing import add_derived_features

    df = load_daily()
    df = add_derived_features(df)
    run_anomaly_analysis(df)