Files
btc_price_anany/src/time_series.py
riba2534 d480712b40 fix: 全面修复代码质量和报告准确性问题
代码修复 (16 个模块):
- GARCH 模型统一改用 t 分布 + 收敛检查 (returns/volatility/anomaly)
- KS 检验替换为 Lilliefors 检验 (returns)
- 修复数据泄漏: StratifiedKFold→TimeSeriesSplit, scaler 逐折 fit (anomaly)
- 前兆标签 shift(-1) 预测次日异常 (anomaly)
- PSD 归一化加入采样频率和单边谱×2 (fft)
- AR(1) 红噪声基线经验缩放 (fft)
- 盒计数法独立 x/y 归一化, MF-DFA q=0 (fractal)
- ADF 平稳性检验 + 移除双重 Bonferroni (causality)
- R/S Hurst 添加 R² 拟合优度 (hurst)
- Prophet 递推预测避免信息泄露 (time_series)
- IC 计算过滤零信号, 中性形态 hit_rate=NaN (indicators/patterns)
- 聚类阈值自适应化 (clustering)
- 日历效应前后半段稳健性检查 (calendar)
- 证据评分标准文本与代码对齐 (visualization)
- 核心管道 NaN/空值防护 (data_loader/preprocessing/main)

报告修复 (docs/REPORT.md, 15 处):
- 标度指数 H_scaling 与 Hurst 指数消歧
- GBM 6 个月概率锥数值重算
- CLT 限定、减半措辞弱化、情景概率逻辑修正
- GPD 形状参数解读修正、异常 AUC 证据降级

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-04 01:07:50 +08:00

803 lines
27 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""时间序列预测模块 - ARIMA、Prophet、LSTM/GRU
对BTC日线数据进行多模型预测与对比评估。
每个模型独立运行,单个模型失败不影响其他模型。
"""
import warnings
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Optional, Tuple, Dict, List
from scipy import stats
from src.data_loader import split_data
# ============================================================
# 评估指标
# ============================================================
def _direction_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
"""方向准确率:预测涨跌方向正确的比例"""
if len(y_true) < 2:
return np.nan
true_dir = np.sign(y_true)
pred_dir = np.sign(y_pred)
return np.mean(true_dir == pred_dir)
def _rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
"""均方根误差"""
return np.sqrt(np.mean((y_true - y_pred) ** 2))
def _diebold_mariano_test(e1: np.ndarray, e2: np.ndarray, h: int = 1) -> Tuple[float, float]:
"""
Diebold-Mariano检验比较两个预测的损失差异是否显著
H0: 两个模型预测精度无差异
e1, e2: 两个模型的预测误差序列
Returns
-------
dm_stat : DM统计量
p_value : 双侧p值
"""
d = e1 ** 2 - e2 ** 2 # 平方损失差
n = len(d)
if n < 10:
return np.nan, np.nan
mean_d = np.mean(d)
# Newey-West方差估计考虑自相关
gamma_0 = np.var(d, ddof=1)
gamma_sum = 0
for k in range(1, h):
gamma_k = np.cov(d[k:], d[:-k])[0, 1] if len(d[k:]) > 1 else 0
gamma_sum += 2 * gamma_k
var_d = (gamma_0 + gamma_sum) / n
if var_d <= 0:
return np.nan, np.nan
dm_stat = mean_d / np.sqrt(var_d)
p_value = 2 * stats.norm.sf(np.abs(dm_stat))
return dm_stat, p_value
def _evaluate_model(name: str, y_true: np.ndarray, y_pred: np.ndarray,
                    rw_errors: np.ndarray) -> Dict:
    """Compute the standard metric set for one model against the random-walk baseline."""
    errors = y_true - y_pred
    rmse_val = _rmse(y_true, y_pred)
    # Random-walk reference: predicted return is always zero.
    rw_rmse = _rmse(y_true, np.zeros_like(y_true))
    # DM test of this model's errors vs the random-walk errors.
    dm_stat, dm_pval = _diebold_mariano_test(errors, rw_errors)
    return {
        "name": name,
        "rmse": rmse_val,
        "rmse_ratio_vs_rw": rmse_val / rw_rmse if rw_rmse > 0 else np.nan,
        "direction_accuracy": _direction_accuracy(y_true, y_pred),
        "dm_stat_vs_rw": dm_stat,
        "dm_pval_vs_rw": dm_pval,
        "predictions": y_pred,
        "errors": errors,
    }
# ============================================================
# 基准模型
# ============================================================
def _baseline_random_walk(y_true: np.ndarray) -> np.ndarray:
"""随机游走基准:预测收益率=0"""
return np.zeros_like(y_true)
def _baseline_historical_mean(train_returns: np.ndarray, n_pred: int) -> np.ndarray:
"""历史均值基准:预测收益率=训练集均值"""
return np.full(n_pred, np.mean(train_returns))
# ============================================================
# ARIMA 模型
# ============================================================
def _run_arima(train_returns: pd.Series, val_returns: pd.Series) -> Dict:
"""
ARIMA模型使用auto_arima自动选参 + walk-forward预测
Returns
-------
dict : 包含预测结果和诊断信息
"""
try:
import pmdarima as pm
from statsmodels.stats.diagnostic import acorr_ljungbox
except ImportError:
print(" [ARIMA] 跳过 - pmdarima 未安装。pip install pmdarima")
return None
print("\n" + "=" * 60)
print("ARIMA 模型")
print("=" * 60)
# 自动选择ARIMA参数
print(" [1/3] auto_arima 参数搜索...")
model = pm.auto_arima(
train_returns.values,
start_p=0, max_p=5,
start_q=0, max_q=5,
d=0, # 对数收益率已经是平稳的
seasonal=False,
stepwise=True,
suppress_warnings=True,
error_action='ignore',
trace=False,
information_criterion='aic',
)
print(f" 最优模型: ARIMA{model.order}")
print(f" AIC: {model.aic():.2f}")
# Ljung-Box 残差诊断
print(" [2/3] Ljung-Box 残差白噪声检验...")
residuals = model.resid()
lb_result = acorr_ljungbox(residuals, lags=[10, 20], return_df=True)
print(f" Ljung-Box 检验 (lag=10): 统计量={lb_result.iloc[0]['lb_stat']:.2f}, "
f"p值={lb_result.iloc[0]['lb_pvalue']:.4f}")
print(f" Ljung-Box 检验 (lag=20): 统计量={lb_result.iloc[1]['lb_stat']:.2f}, "
f"p值={lb_result.iloc[1]['lb_pvalue']:.4f}")
if lb_result.iloc[0]['lb_pvalue'] > 0.05:
print(" 残差通过白噪声检验 (p>0.05),模型拟合充分")
else:
print(" 残差未通过白噪声检验 (p<=0.05),可能存在未捕获的自相关结构")
# Walk-forward 预测
print(" [3/3] Walk-forward 验证集预测...")
val_values = val_returns.values
n_val = len(val_values)
predictions = np.zeros(n_val)
# 使用滚动窗口预测
history = list(train_returns.values)
for i in range(n_val):
# 一步预测
fc = model.predict(n_periods=1)
predictions[i] = fc[0]
# 更新模型(添加真实观测值)
model.update(val_values[i:i+1])
if (i + 1) % 100 == 0:
print(f" 进度: {i+1}/{n_val}")
print(f" Walk-forward 预测完成,共{n_val}")
return {
"predictions": predictions,
"order": model.order,
"aic": model.aic(),
"ljung_box": lb_result,
}
# ============================================================
# Prophet 模型
# ============================================================
def _run_prophet(train_df: pd.DataFrame, val_df: pd.DataFrame) -> Dict:
"""
Prophet模型基于日收盘价的时间序列预测
Returns
-------
dict : 包含预测结果
"""
try:
from prophet import Prophet
except ImportError:
print(" [Prophet] 跳过 - prophet 未安装。pip install prophet")
return None
print("\n" + "=" * 60)
print("Prophet 模型")
print("=" * 60)
# 准备Prophet格式数据
prophet_train = pd.DataFrame({
'ds': train_df.index,
'y': train_df['close'].values,
})
print(" [1/3] 构建Prophet模型并添加自定义季节性...")
model = Prophet(
daily_seasonality=False,
weekly_seasonality=False,
yearly_seasonality=False,
changepoint_prior_scale=0.05,
)
# 添加自定义季节性
model.add_seasonality(name='weekly', period=7, fourier_order=3)
model.add_seasonality(name='monthly', period=30, fourier_order=5)
model.add_seasonality(name='yearly', period=365, fourier_order=10)
model.add_seasonality(name='halving_cycle', period=1458, fourier_order=5)
print(" [2/3] 拟合模型...")
with warnings.catch_warnings():
warnings.simplefilter("ignore")
model.fit(prophet_train)
# 预测验证期
print(" [3/3] 预测验证期...")
future_dates = pd.DataFrame({'ds': val_df.index})
forecast = model.predict(future_dates)
# 转换为对数收益率预测(与其他模型对齐)
pred_close = forecast['yhat'].values
# 使用递推方式首个prev_close用训练集末尾真实价格后续用模型预测价格
prev_close = np.concatenate([[train_df['close'].iloc[-1]], pred_close[:-1]])
pred_returns = np.log(pred_close / prev_close)
print(f" 预测完成,验证期: {val_df.index[0]} ~ {val_df.index[-1]}")
print(f" 预测价格范围: {pred_close.min():.0f} ~ {pred_close.max():.0f}")
return {
"predictions_return": pred_returns,
"predictions_close": pred_close,
"forecast": forecast,
"model": model,
}
# ============================================================
# LSTM/GRU 模型 (PyTorch)
# ============================================================
def _run_lstm(train_df: pd.DataFrame, val_df: pd.DataFrame,
              lookback: int = 60, hidden_size: int = 128,
              num_layers: int = 2, max_epochs: int = 100,
              patience: int = 10, batch_size: int = 64) -> Optional[Dict]:
    """
    LSTM model: PyTorch deep-learning forecast of daily log returns.

    Parameters
    ----------
    train_df, val_df : daily data; must contain a 'log_return' column
    lookback : input sequence length in days
    hidden_size, num_layers : LSTM capacity
    max_epochs, patience : training length and early-stopping patience
    batch_size : mini-batch size

    Returns
    -------
    dict with predictions, true returns, loss histories and the model, or
    None when torch is missing or the data is too short to build sequences
    """
    try:
        import torch
        import torch.nn as nn
        from torch.utils.data import DataLoader, TensorDataset
    except ImportError:
        print(" [LSTM] 跳过 - PyTorch 未安装。pip install torch")
        return None
    print("\n" + "=" * 60)
    print("LSTM 模型 (PyTorch)")
    print("=" * 60)
    device = torch.device('cuda' if torch.cuda.is_available() else
                          'mps' if torch.backends.mps.is_available() else 'cpu')
    print(f" 设备: {device}")
    # ---- Data preparation ----
    # Target is the log return of the close price.
    feature_cols = ['log_return', 'volume_ratio', 'taker_buy_ratio']
    available_cols = [c for c in feature_cols if c in train_df.columns]
    if not available_cols:
        # Degrade gracefully to a univariate model on returns only.
        print(" [警告] 特征列不可用,仅使用收盘价收益率")
        available_cols = ['log_return']
    print(f" 特征: {available_cols}")
    # Concatenate train and val so lookback windows can span the boundary.
    all_data = pd.concat([train_df, val_df])
    features = all_data[available_cols].values
    target = all_data['log_return'].values
    # Drop rows with NaN in any feature or in the target.
    mask = ~np.isnan(features).any(axis=1) & ~np.isnan(target)
    features_clean = features[mask]
    target_clean = target[mask]
    # Standardize with training-set statistics only (avoids data leakage);
    # train_len counts the clean rows that fall within the training span.
    train_len = mask[:len(train_df)].sum()
    feat_mean = features_clean[:train_len].mean(axis=0)
    feat_std = features_clean[:train_len].std(axis=0) + 1e-10
    features_norm = (features_clean - feat_mean) / feat_std
    target_mean = target_clean[:train_len].mean()
    target_std = target_clean[:train_len].std() + 1e-10
    target_norm = (target_clean - target_mean) / target_std
    # Build (lookback window -> next-step target) samples.
    def create_sequences(feat, tgt, seq_len):
        # Returns X of shape (n, seq_len, n_features) and y of shape (n,).
        X, y = [], []
        for i in range(seq_len, len(feat)):
            X.append(feat[i - seq_len:i])
            y.append(tgt[i])
        return np.array(X), np.array(y)
    X_all, y_all = create_sequences(features_norm, target_norm, lookback)
    # Split back into train/val; the first `lookback` rows are consumed as context.
    train_samples = max(0, train_len - lookback)
    X_train = X_all[:train_samples]
    y_train = y_all[:train_samples]
    X_val = X_all[train_samples:]
    y_val = y_all[train_samples:]
    if len(X_train) == 0 or len(X_val) == 0:
        print(" [LSTM] 跳过 - 数据不足以创建训练/验证序列")
        return None
    print(f" 训练样本: {len(X_train)}, 验证样本: {len(X_val)}")
    print(f" 回看窗口: {lookback}, 隐藏维度: {hidden_size}, 层数: {num_layers}")
    # Move everything to the selected device up front.
    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)
    y_val_t = torch.FloatTensor(y_val).to(device)
    train_dataset = TensorDataset(X_train_t, y_train_t)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # ---- Model definition ----
    class LSTMModel(nn.Module):
        # Stacked LSTM followed by a small MLP head producing one scalar.
        def __init__(self, input_size, hidden_size, num_layers, dropout=0.2):
            super().__init__()
            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True,
                dropout=dropout if num_layers > 1 else 0,
            )
            self.fc = nn.Sequential(
                nn.Linear(hidden_size, 64),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(64, 1),
            )
        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            # Use only the final time step's hidden output.
            last_out = lstm_out[:, -1, :]
            return self.fc(last_out).squeeze(-1)
    input_size = len(available_cols)
    model = LSTMModel(input_size, hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    # Halve the learning rate when the validation loss plateaus.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5, verbose=False
    )
    # ---- Training ----
    print(f" 开始训练 (最多{max_epochs}轮, 早停耐心={patience})...")
    best_val_loss = np.inf
    patience_counter = 0
    train_losses = []
    val_losses = []
    for epoch in range(max_epochs):
        # -- train one epoch --
        model.train()
        epoch_loss = 0
        n_batches = 0
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            pred = model(batch_X)
            loss = criterion(pred, batch_y)
            loss.backward()
            # Gradient clipping for stability on noisy financial data.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            epoch_loss += loss.item()
            n_batches += 1
        avg_train_loss = epoch_loss / max(n_batches, 1)
        train_losses.append(avg_train_loss)
        # -- validate --
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val_t)
            val_loss = criterion(val_pred, y_val_t).item()
        val_losses.append(val_loss)
        scheduler.step(val_loss)
        if (epoch + 1) % 10 == 0:
            lr = optimizer.param_groups[0]['lr']
            print(f" Epoch {epoch+1}/{max_epochs}: "
                  f"train_loss={avg_train_loss:.6f}, val_loss={val_loss:.6f}, lr={lr:.1e}")
        # Early stopping: snapshot weights on each new best validation loss,
        # stop after `patience` epochs without improvement.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f" 早停触发 (epoch {epoch+1})")
                break
    # Restore the best checkpoint before inference.
    # NOTE(review): assumes at least one epoch improved on inf so that
    # best_state was assigned — holds unless val_loss is NaN; confirm.
    model.load_state_dict(best_state)
    model.eval()
    # ---- Prediction ----
    with torch.no_grad():
        val_pred_norm = model(X_val_t).cpu().numpy()
    # Undo the target standardization to recover log returns.
    val_pred_returns = val_pred_norm * target_std + target_mean
    val_true_returns = y_val * target_std + target_mean
    print(f" 训练完成,最佳验证损失: {best_val_loss:.6f}")
    return {
        "predictions_return": val_pred_returns,
        "true_returns": val_true_returns,
        "train_losses": train_losses,
        "val_losses": val_losses,
        "model": model,
        "device": str(device),
    }
# ============================================================
# 可视化
# ============================================================
def _plot_predictions(val_dates, y_true, model_preds: Dict[str, np.ndarray],
                      output_dir: Path):
    """One panel per model: actual vs predicted log returns over the validation period."""
    n_models = len(model_preds)
    fig, axes = plt.subplots(n_models, 1, figsize=(16, 4 * n_models), sharex=True)
    if n_models == 1:
        axes = [axes]
    for ax, (name, y_pred) in zip(axes, model_preds.items()):
        # Align series lengths (LSTM predictions may be shorter due to lookback).
        n = min(len(y_true), len(y_pred))
        dates = val_dates[:n] if len(val_dates) >= n else val_dates
        ax.plot(dates, y_true[:n], 'b-', alpha=0.6, linewidth=0.8, label='实际收益率')
        ax.plot(dates, y_pred[:n], 'r-', alpha=0.6, linewidth=0.8, label='预测收益率')
        ax.set_title(f"{name} - 实际 vs 预测", fontsize=13)
        ax.set_ylabel("对数收益率", fontsize=11)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)
        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    axes[-1].set_xlabel("日期", fontsize=11)
    plt.tight_layout()
    fig.savefig(output_dir / "ts_predictions_comparison.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f" [保存] ts_predictions_comparison.png")
def _plot_direction_accuracy(metrics: Dict[str, Dict], output_dir: Path):
    """Bar chart comparing direction-prediction accuracy across models."""
    names = list(metrics.keys())
    accs = [metrics[n]["direction_accuracy"] * 100 for n in names]
    fig, ax = plt.subplots(figsize=(10, 6))
    palette = plt.cm.Set2(np.linspace(0, 1, len(names)))
    bars = ax.bar(names, accs, color=palette, edgecolor='gray', linewidth=0.5)
    # Annotate each bar with its percentage value.
    for rect, acc in zip(bars, accs):
        ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 0.5,
                f"{acc:.1f}%", ha='center', va='bottom', fontsize=11, fontweight='bold')
    # 50% is the coin-flip reference line.
    ax.axhline(y=50, color='red', linestyle='--', alpha=0.7, label='随机基准 (50%)')
    ax.set_ylabel("方向准确率 (%)", fontsize=12)
    ax.set_title("各模型方向预测准确率对比", fontsize=14)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3, axis='y')
    ax.set_ylim(0, max(accs) * 1.2 if accs else 100)
    fig.savefig(output_dir / "ts_direction_accuracy.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f" [保存] ts_direction_accuracy.png")
def _plot_cumulative_error(val_dates, metrics: Dict[str, Dict], output_dir: Path):
    """Cumulative squared prediction error per model over the validation period."""
    fig, ax = plt.subplots(figsize=(16, 7))
    for name, m in metrics.items():
        errors = m.get("errors")
        if errors is None:
            continue
        # NOTE(review): errors are plotted against the FIRST len(errors) dates;
        # for models whose samples align with the tail of the validation set
        # (e.g. LSTM with lookback) this head alignment may be off — confirm
        # against the caller before relying on the x-axis for those models.
        span = val_dates[:len(errors)]
        ax.plot(span, np.cumsum(errors ** 2), linewidth=1.2, label=f"{name}")
    ax.set_xlabel("日期", fontsize=12)
    ax.set_ylabel("累计平方误差", fontsize=12)
    ax.set_title("各模型累计预测误差对比", fontsize=14)
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    fig.savefig(output_dir / "ts_cumulative_error.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f" [保存] ts_cumulative_error.png")
def _plot_lstm_training(train_losses: List, val_losses: List, output_dir: Path):
    """Plot LSTM training and validation loss curves across epochs."""
    fig, ax = plt.subplots(figsize=(10, 6))
    for series, fmt, label in ((train_losses, 'b-', '训练损失'),
                               (val_losses, 'r-', '验证损失')):
        ax.plot(series, fmt, label=label, linewidth=1.5)
    ax.set_xlabel("Epoch", fontsize=12)
    ax.set_ylabel("MSE Loss", fontsize=12)
    ax.set_title("LSTM 训练过程", fontsize=14)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    fig.savefig(output_dir / "ts_lstm_training.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f" [保存] ts_lstm_training.png")
def _plot_prophet_components(prophet_result: Dict, output_dir: Path):
"""Prophet预测 - 实际价格 vs 预测价格"""
try:
from prophet import Prophet
except ImportError:
return
forecast = prophet_result.get("forecast")
if forecast is None:
return
fig, ax = plt.subplots(figsize=(16, 7))
ax.plot(forecast['ds'], forecast['yhat'], 'r-', linewidth=1.2, label='Prophet预测')
ax.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'],
alpha=0.15, color='red', label='置信区间')
ax.set_xlabel("日期", fontsize=12)
ax.set_ylabel("BTC 价格 (USDT)", fontsize=12)
ax.set_title("Prophet 价格预测(验证期)", fontsize=14)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)
fig.savefig(output_dir / "ts_prophet_forecast.png", dpi=150, bbox_inches='tight')
plt.close(fig)
print(f" [保存] ts_prophet_forecast.png")
# ============================================================
# 结果打印
# ============================================================
def _print_metrics_table(all_metrics: Dict[str, Dict]):
"""打印所有模型的评估指标表"""
print("\n" + "=" * 80)
print(" 模型评估汇总")
print("=" * 80)
print(f" {'模型':<20s} {'RMSE':>10s} {'RMSE/RW':>10s} {'方向准确率':>10s} "
f"{'DM统计量':>10s} {'DM p值':>10s}")
print("-" * 80)
for name, m in all_metrics.items():
rmse_str = f"{m['rmse']:.6f}"
ratio_str = f"{m['rmse_ratio_vs_rw']:.4f}" if not np.isnan(m['rmse_ratio_vs_rw']) else "N/A"
dir_str = f"{m['direction_accuracy']*100:.1f}%"
dm_str = f"{m['dm_stat_vs_rw']:.3f}" if not np.isnan(m['dm_stat_vs_rw']) else "N/A"
pv_str = f"{m['dm_pval_vs_rw']:.4f}" if not np.isnan(m['dm_pval_vs_rw']) else "N/A"
print(f" {name:<20s} {rmse_str:>10s} {ratio_str:>10s} {dir_str:>10s} "
f"{dm_str:>10s} {pv_str:>10s}")
print("-" * 80)
# 解读
print("\n [解读]")
print(" - RMSE/RW < 1.0 表示优于随机游走基准")
print(" - 方向准确率 > 50% 表示有一定方向预测能力")
print(" - DM检验 p值 < 0.05 表示与随机游走有显著差异")
# ============================================================
# 主入口
# ============================================================
def run_time_series_analysis(df: pd.DataFrame, output_dir: "str | Path" = "output/time_series") -> Dict:
    """
    Time-series forecasting analysis - main entry point.

    Runs baselines, ARIMA, Prophet and LSTM on daily BTC data; each model is
    isolated so a single failure does not abort the others.

    Parameters
    ----------
    df : pd.DataFrame
        Daily data that already went through add_derived_features()
    output_dir : str or Path
        Directory for output charts

    Returns
    -------
    results : dict
        Predictions and evaluation metrics for every model that ran
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    from src.font_config import configure_chinese_font
    configure_chinese_font()
    print("=" * 60)
    print(" BTC 时间序列预测分析")
    print("=" * 60)
    # ---- Data split ----
    train_df, val_df, test_df = split_data(df)
    print(f"\n 训练集: {train_df.index[0]} ~ {train_df.index[-1]} ({len(train_df)}天)")
    print(f" 验证集: {val_df.index[0]} ~ {val_df.index[-1]} ({len(val_df)}天)")
    print(f" 测试集: {test_df.index[0]} ~ {test_df.index[-1]} ({len(test_df)}天)")
    # Log-return series: validation returns are the common forecast target.
    train_returns = train_df['log_return'].dropna()
    val_returns = val_df['log_return'].dropna()
    val_dates = val_returns.index
    y_true = val_returns.values
    # ---- Baselines ----
    print("\n" + "=" * 60)
    print("基准模型")
    print("=" * 60)
    # Random-walk baseline (predicted return = 0).
    rw_pred = _baseline_random_walk(y_true)
    rw_errors = y_true - rw_pred
    print(f" Random Walk (预测收益=0): RMSE = {_rmse(y_true, rw_pred):.6f}")
    # Historical-mean baseline.
    hm_pred = _baseline_historical_mean(train_returns.values, len(y_true))
    print(f" Historical Mean (收益={train_returns.mean():.6f}): RMSE = {_rmse(y_true, hm_pred):.6f}")
    # Collect all model metrics and predictions.
    all_metrics = {}
    model_preds = {}
    all_metrics["Random Walk"] = _evaluate_model("Random Walk", y_true, rw_pred, rw_errors)
    model_preds["Random Walk"] = rw_pred
    all_metrics["Historical Mean"] = _evaluate_model("Historical Mean", y_true, hm_pred, rw_errors)
    model_preds["Historical Mean"] = hm_pred
    # Fix: initialize all result variables up front so the summary section
    # never references an unbound name when a model raises before assignment
    # (the original relied on a fragile `'arima_result' in dir()` check and
    # left arima_result unassigned in its except branch).
    arima_result = None
    prophet_result = None
    lstm_result = None
    # ---- ARIMA ----
    try:
        arima_result = _run_arima(train_returns, val_returns)
        if arima_result is not None:
            arima_pred = arima_result["predictions"]
            all_metrics["ARIMA"] = _evaluate_model("ARIMA", y_true, arima_pred, rw_errors)
            model_preds["ARIMA"] = arima_pred
            print(f"\n ARIMA 验证集: RMSE={all_metrics['ARIMA']['rmse']:.6f}, "
                  f"方向准确率={all_metrics['ARIMA']['direction_accuracy']*100:.1f}%")
    except Exception as e:
        print(f"\n [ARIMA] 运行失败: {e}")
        arima_result = None
    # ---- Prophet ----
    try:
        prophet_result = _run_prophet(train_df, val_df)
        if prophet_result is not None:
            prophet_pred = prophet_result["predictions_return"]
            # Align lengths before evaluation.
            n = min(len(y_true), len(prophet_pred))
            all_metrics["Prophet"] = _evaluate_model(
                "Prophet", y_true[:n], prophet_pred[:n], rw_errors[:n]
            )
            model_preds["Prophet"] = prophet_pred[:n]
            print(f"\n Prophet 验证集: RMSE={all_metrics['Prophet']['rmse']:.6f}, "
                  f"方向准确率={all_metrics['Prophet']['direction_accuracy']*100:.1f}%")
            # Prophet-specific chart.
            _plot_prophet_components(prophet_result, output_dir)
    except Exception as e:
        print(f"\n [Prophet] 运行失败: {e}")
        prophet_result = None
    # ---- LSTM ----
    try:
        lstm_result = _run_lstm(train_df, val_df)
        if lstm_result is not None:
            lstm_pred = lstm_result["predictions_return"]
            lstm_true = lstm_result["true_returns"]
            # LSTM sample count differs because of the lookback window, so it
            # is evaluated against its own aligned true returns.
            lstm_rw_errors = lstm_true - np.zeros_like(lstm_true)
            all_metrics["LSTM"] = _evaluate_model(
                "LSTM", lstm_true, lstm_pred, lstm_rw_errors
            )
            model_preds["LSTM"] = lstm_pred
            print(f"\n LSTM 验证集: RMSE={all_metrics['LSTM']['rmse']:.6f}, "
                  f"方向准确率={all_metrics['LSTM']['direction_accuracy']*100:.1f}%")
            # LSTM training curves.
            _plot_lstm_training(lstm_result["train_losses"],
                                lstm_result["val_losses"], output_dir)
    except Exception as e:
        print(f"\n [LSTM] 运行失败: {e}")
        lstm_result = None
    # ---- Evaluation summary ----
    _print_metrics_table(all_metrics)
    # ---- Visualization ----
    print("\n[可视化] 生成分析图表...")
    # Comparison plot uses only predictions equal in length to y_true (not LSTM).
    aligned_preds = {k: v for k, v in model_preds.items()
                     if k != "LSTM" and len(v) == len(y_true)}
    if aligned_preds:
        _plot_predictions(val_dates, y_true, aligned_preds, output_dir)
    # LSTM is plotted separately with its tail-aligned dates.
    if "LSTM" in model_preds and lstm_result is not None:
        lstm_dates = val_dates[-len(lstm_result["predictions_return"]):]
        _plot_predictions(lstm_dates, lstm_result["true_returns"],
                          {"LSTM": lstm_result["predictions_return"]}, output_dir)
    _plot_direction_accuracy(all_metrics, output_dir)
    _plot_cumulative_error(val_dates, all_metrics, output_dir)
    # ---- Assemble results ----
    results = {
        "metrics": all_metrics,
        "model_predictions": model_preds,
        "val_dates": val_dates,
        "y_true": y_true,
    }
    if arima_result is not None:
        results["arima"] = arima_result
    if prophet_result is not None:
        results["prophet"] = prophet_result
    if lstm_result is not None:
        results["lstm"] = lstm_result
    print("\n" + "=" * 60)
    print(" 时间序列预测分析完成!")
    print("=" * 60)
    return results
# ============================================================
# 命令行入口
# ============================================================
if __name__ == "__main__":
    # Standalone entry: load daily data, derive features, run the full analysis.
    from data_loader import load_daily
    from preprocessing import add_derived_features
    df = load_daily()
    df = add_derived_features(df)
    results = run_time_series_analysis(df, output_dir="output/time_series")