Add comprehensive BTC/USDT price analysis framework with 17 modules

Complete statistical analysis pipeline covering:
- FFT spectral analysis, wavelet CWT, ACF/PACF autocorrelation
- Returns distribution (fat tails, kurtosis=15.65), GARCH volatility modeling
- Hurst exponent (H=0.593), fractal dimension, power law corridor
- Volume-price causality (Granger), calendar effects, halving cycle analysis
- Technical indicator validation (0/21 pass FDR), candlestick pattern testing
- Market state clustering (K-Means/GMM), Markov chain transitions
- Time series forecasting (ARIMA/Prophet/LSTM benchmarks)
- Anomaly detection ensemble (IF+LOF+COPOD, AUC=0.9935)

Key finding: volatility is predictable (GARCH persistence=0.973),
but price direction is statistically indistinguishable from a random walk.
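(For a GARCH(1,1) fit, persistence is alpha+beta; at 0.973 a volatility
shock decays with a half-life of ln(0.5)/ln(0.973) ≈ 25 days.)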

Includes REPORT.md (a 16-section analysis report with future projections),
70+ charts in output/, and all source modules in src/.
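
Example invocation of the clustering module (a minimal sketch; assumes the
src/ layout above and the data_loader/preprocessing helpers that
clustering.py itself imports):

    from data_loader import load_daily
    from preprocessing import add_derived_features
    from clustering import run_clustering_analysis

    df = add_derived_features(load_daily())
    results = run_clustering_analysis(df, output_dir="output/clustering")
    print(results["markov"]["stationary_distribution"])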

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit f4c4408708 (parent 3ab7ba6c7f)
Date: 2026-02-03 10:29:54 +08:00

96 changed files with 13218 additions and 0 deletions

src/clustering.py (new file, 742 lines)
@@ -0,0 +1,742 @@
"""市场状态聚类与马尔可夫链分析模块
基于K-Means、GMM、HDBSCAN对BTC日线特征进行聚类
构建状态转移矩阵并计算平稳分布。
"""
import warnings
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from pathlib import Path
from typing import Optional, Tuple, Dict, List
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
try:
    import hdbscan
    HAS_HDBSCAN = True
except ImportError:
    HAS_HDBSCAN = False
    warnings.warn("hdbscan is not installed; HDBSCAN clustering will be skipped. Try: pip install hdbscan")
# ============================================================
# Feature engineering
# ============================================================
FEATURE_COLS = [
"log_return", "abs_return", "vol_7d", "vol_30d",
"volume_ratio", "taker_buy_ratio", "range_pct", "body_pct",
"log_return_lag1", "log_return_lag2",
]
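# Assumed semantics (these columns are expected from add_derived_features()
# upstream; they are not defined in this module): log_return = ln(close_t /
# close_{t-1}), vol_7d / vol_30d = rolling return volatility, volume_ratio =
# volume vs. its rolling mean, taker_buy_ratio = taker buy volume share,
# range_pct / body_pct = candle range and body relative to price.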
def _prepare_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, StandardScaler]:
    """
    Prepare clustering features: add lagged returns, standardize, drop NaN rows.

    Returns
    -------
    df_clean : cleaned DataFrame (index preserved for later mapping)
    X_scaled : standardized feature matrix
    scaler : fitted scaler (usable for inverse transforms)
    """
    out = df.copy()
    # Add lagged return features
    out["log_return_lag1"] = out["log_return"].shift(1)
    out["log_return_lag2"] = out["log_return"].shift(2)
    # Keep only the required feature columns and drop rows containing NaN
    df_feat = out[FEATURE_COLS].copy()
    mask = df_feat.notna().all(axis=1)
    df_clean = out.loc[mask].copy()
    X_raw = df_feat.loc[mask].values
    # Z-score standardization
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_raw)
    print(f"[features] valid samples: {X_scaled.shape[0]}, feature dims: {X_scaled.shape[1]}")
    return df_clean, X_scaled, scaler
# ============================================================
# K-Means clustering
# ============================================================
def _run_kmeans(X: np.ndarray, k_range: Optional[List[int]] = None) -> Tuple[int, np.ndarray, Dict]:
    """
    K-Means clustering; selects the best k by silhouette score.

    Returns
    -------
    best_k : optimal number of clusters
    labels : cluster labels for the best k
    info : silhouette score, inertia, etc. for each k
    """
    if k_range is None:
        k_range = [3, 4, 5, 6, 7]
    results = {}
    best_score = -1
    best_k = k_range[0]
    best_labels = None
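    # Per-sample silhouette: s = (b - a) / max(a, b), where a is the mean
    # intra-cluster distance and b is the mean distance to the nearest other
    # cluster; the mean over samples lies in [-1, 1], higher meaning
    # better-separated clusters.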
print("\n" + "=" * 60)
print("K-Means 聚类分析")
print("=" * 60)
for k in k_range:
km = KMeans(n_clusters=k, n_init=20, max_iter=500, random_state=42)
labels = km.fit_predict(X)
sil = silhouette_score(X, labels)
inertia = km.inertia_
results[k] = {"silhouette": sil, "inertia": inertia, "labels": labels, "model": km}
print(f" k={k}: 轮廓系数={sil:.4f}, 惯性={inertia:.1f}")
if sil > best_score:
best_score = sil
best_k = k
best_labels = labels
print(f"\n >>> 最优 k = {best_k} (轮廓系数 = {best_score:.4f})")
return best_k, best_labels, results
# ============================================================
# GMM (Gaussian mixture model)
# ============================================================
def _run_gmm(X: np.ndarray, k_range: Optional[List[int]] = None) -> Tuple[int, np.ndarray, Dict]:
    """
    GMM clustering; selects the number of components by BIC.

    Returns
    -------
    best_k : component count with the lowest BIC
    labels : corresponding cluster labels
    info : BIC, AIC, labels, etc. for each k
    """
    if k_range is None:
        k_range = [3, 4, 5, 6, 7]
    results = {}
    best_bic = np.inf
    best_k = k_range[0]
    best_labels = None
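    # BIC = p * ln(n) - 2 * ln(L_hat), with p free parameters and n samples;
    # lower is better. Its stronger complexity penalty (vs. AIC's 2p) favors
    # more parsimonious mixtures.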
print("\n" + "=" * 60)
print("GMM (高斯混合模型) 聚类分析")
print("=" * 60)
for k in k_range:
gmm = GaussianMixture(n_components=k, covariance_type='full',
n_init=5, max_iter=500, random_state=42)
gmm.fit(X)
labels = gmm.predict(X)
bic = gmm.bic(X)
aic = gmm.aic(X)
sil = silhouette_score(X, labels)
results[k] = {"bic": bic, "aic": aic, "silhouette": sil,
"labels": labels, "model": gmm}
print(f" k={k}: BIC={bic:.1f}, AIC={aic:.1f}, 轮廓系数={sil:.4f}")
if bic < best_bic:
best_bic = bic
best_k = k
best_labels = labels
print(f"\n >>> 最优 k = {best_k} (BIC = {best_bic:.1f})")
return best_k, best_labels, results
# ============================================================
# HDBSCAN (density-based clustering)
# ============================================================
def _run_hdbscan(X: np.ndarray) -> Tuple[Optional[np.ndarray], Dict]:
    """
    HDBSCAN density-based clustering.

    Returns
    -------
    labels : cluster labels (-1 denotes noise)
    info : clustering statistics
    """
    if not HAS_HDBSCAN:
        print("\n[HDBSCAN] skipped - hdbscan is not installed")
        return None, {}
    print("\n" + "=" * 60)
    print("HDBSCAN density clustering analysis")
    print("=" * 60)
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=30,
        min_samples=10,
        metric='euclidean',
        cluster_selection_method='eom',
    )
    labels = clusterer.fit_predict(X)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = (labels == -1).sum()
    noise_pct = n_noise / len(labels) * 100
    info = {
        "n_clusters": n_clusters,
        "n_noise": n_noise,
        "noise_pct": noise_pct,
        "labels": labels,
        "model": clusterer,
    }
    print(f"  clusters: {n_clusters}")
    print(f"  noise points: {n_noise} ({noise_pct:.1f}%)")
    # Compute the silhouette score after excluding noise points
    if n_clusters >= 2:
        mask = labels >= 0
        if mask.sum() > n_clusters:
            sil = silhouette_score(X[mask], labels[mask])
            info["silhouette"] = sil
            print(f"  silhouette (noise removed): {sil:.4f}")
    return labels, info
# ============================================================
# Cluster interpretation and label mapping
# ============================================================
# State label definitions
STATE_LABELS = {
    "sideways": "Sideways",
    "mild_up": "Mild rally",
    "mild_down": "Mild decline",
    "surge": "Strong rally",
    "crash": "Sharp sell-off",
    "high_vol": "High volatility",
    "low_vol": "Low volatility",
}
def _interpret_clusters(df_clean: pd.DataFrame, labels: np.ndarray,
                        method_name: str = "K-Means") -> pd.DataFrame:
    """
    Interpret clustering results: compute per-cluster feature means and
    auto-assign state names.

    Returns
    -------
    cluster_desc : per-cluster feature-mean table plus a state_label column
    """
    df_work = df_clean.copy()
    col_name = f"cluster_{method_name}"
    df_work[col_name] = labels
    # Compute feature means for each cluster
    cluster_means = df_work.groupby(col_name)[FEATURE_COLS].mean()
    print(f"\n{'=' * 60}")
    print(f"{method_name} cluster feature means")
    print("=" * 60)
    # Auto-assign state labels
    state_labels = {}
    for cid in cluster_means.index:
        row = cluster_means.loc[cid]
        lr = row["log_return"]
        vol = row["vol_7d"]
        abs_r = row["abs_return"]
        # Rule-based decision on mean return and volatility
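        # The cutoffs below (2% for surge/crash, 0.5% for mild moves, 1.5%
        # absolute return or 1.5x the median vol_7d for high-vol) are
        # heuristics tuned for daily BTC log returns, not fitted parameters.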
        if lr > 0.02 and abs_r > 0.02:
            label = "surge"
        elif lr < -0.02 and abs_r > 0.02:
            label = "crash"
        elif lr > 0.005:
            label = "mild_up"
        elif lr < -0.005:
            label = "mild_down"
        elif abs_r > 0.015 or vol > cluster_means["vol_7d"].median() * 1.5:
            label = "high_vol"
        else:
            label = "sideways"
        state_labels[cid] = label
    cluster_means["state_label"] = pd.Series(state_labels)
    cluster_means["state_cn"] = cluster_means["state_label"].map(STATE_LABELS)
    # Count samples and share per cluster
    counts = df_work[col_name].value_counts().sort_index()
    cluster_means["count"] = counts
    cluster_means["pct"] = (counts / counts.sum() * 100).round(1)
    for cid in cluster_means.index:
        row = cluster_means.loc[cid]
        print(f"\n  cluster {cid} [{row['state_cn']}] (n={int(row['count'])}, {row['pct']:.1f}%)")
        print(f"    log_return: {row['log_return']:.5f}, abs_return: {row['abs_return']:.5f}")
        print(f"    vol_7d: {row['vol_7d']:.4f}, vol_30d: {row['vol_30d']:.4f}")
        print(f"    volume_ratio: {row['volume_ratio']:.3f}, taker_buy_ratio: {row['taker_buy_ratio']:.4f}")
        print(f"    range_pct: {row['range_pct']:.5f}, body_pct: {row['body_pct']:.5f}")
    return cluster_means
# ============================================================
# Markov transition matrix
# ============================================================
def _compute_transition_matrix(labels: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the state transition probability matrix, the stationary
    distribution, and the mean holding time per state.

    Parameters
    ----------
    labels : cluster labels in time order

    Returns
    -------
    trans_matrix : transition probability matrix (n_states x n_states)
    stationary : stationary distribution vector
    holding_time : mean holding time of each state
    """
    states = np.sort(np.unique(labels))
    n_states = len(states)
    # Map states to contiguous indices
    state_to_idx = {s: i for i, s in enumerate(states)}
    # Count matrix
    count_matrix = np.zeros((n_states, n_states), dtype=np.float64)
    for t in range(len(labels) - 1):
        i = state_to_idx[labels[t]]
        j = state_to_idx[labels[t + 1]]
        count_matrix[i, j] += 1
    # Transition probability matrix (row-normalized)
    row_sums = count_matrix.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0] = 1  # avoid division by zero
    trans_matrix = count_matrix / row_sums
    # Stationary distribution: the left eigenvector of the transition matrix
    # for eigenvalue 1, since pi @ P = pi  <=>  P.T @ pi.T = pi.T
    eigenvalues, eigenvectors = np.linalg.eig(trans_matrix.T)
    # Pick the eigenvector whose eigenvalue is closest to 1
    idx = np.argmin(np.abs(eigenvalues - 1.0))
    stationary = np.real(eigenvectors[:, idx])
    stationary = stationary / stationary.sum()  # normalize to probabilities
    # Ensure non-negativity (numerical error can produce tiny negatives)
    stationary = np.abs(stationary)
    stationary = stationary / stationary.sum()
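    # With self-transition probability p_ii, the run length in a state is
    # geometric: P(T = t) = p_ii^(t-1) * (1 - p_ii), so E[T] = 1 / (1 - p_ii).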
    # Mean holding time = 1 / (1 - p_ii)
    diag = np.diag(trans_matrix)
    holding_time = np.where(diag < 1.0, 1.0 / (1.0 - diag), np.inf)
    return trans_matrix, stationary, holding_time
def _print_markov_results(trans_matrix: np.ndarray, stationary: np.ndarray,
                          holding_time: np.ndarray, cluster_desc: pd.DataFrame):
    """Print the Markov chain analysis results."""
    states = cluster_desc.index.tolist()
    state_names = cluster_desc["state_cn"].tolist()
    print("\n" + "=" * 60)
    print("Markov chain state transition analysis")
    print("=" * 60)
    # Transition probability matrix
    print("\nTransition probability matrix:")
    header = "              " + "".join([f"{state_names[j][:12]:>14s}" for j in range(len(states))])
    print(header)
    for i, s in enumerate(states):
        row_str = f"  {state_names[i][:12]:>12s}"
        for j in range(len(states)):
            row_str += f"{trans_matrix[i, j]:14.3f}"
        print(row_str)
    # Stationary distribution
    print("\nStationary distribution (long-run equilibrium probabilities):")
    for i, s in enumerate(states):
        print(f"  {state_names[i]}: {stationary[i]:.4f} ({stationary[i]*100:.1f}%)")
    # Mean holding time
    print("\nMean holding time (days):")
    for i, s in enumerate(states):
        if np.isinf(holding_time[i]):
            print(f"  {state_names[i]}: inf (absorbing state)")
        else:
            print(f"  {state_names[i]}: {holding_time[i]:.2f}")
# ============================================================
# Visualization
# ============================================================
def _plot_pca_scatter(X: np.ndarray, labels: np.ndarray,
                      cluster_desc: pd.DataFrame, method_name: str,
                      output_dir: Path):
    """2D PCA scatter plot, colored by cluster."""
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X)
    fig, ax = plt.subplots(figsize=(12, 8))
    states = np.sort(np.unique(labels))
    colors = plt.cm.Set2(np.linspace(0, 1, len(states)))
    for i, s in enumerate(states):
        mask = labels == s
        label_name = cluster_desc.loc[s, "state_cn"] if s in cluster_desc.index else f"Cluster {s}"
        ax.scatter(X_2d[mask, 0], X_2d[mask, 1], c=[colors[i]], label=label_name,
                   alpha=0.5, s=15, edgecolors='none')
    ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)", fontsize=12)
    ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)", fontsize=12)
    ax.set_title(f"{method_name} clusters - 2D PCA projection", fontsize=14)
    ax.legend(fontsize=10, loc='best')
    ax.grid(True, alpha=0.3)
    fig.savefig(output_dir / f"cluster_pca_{method_name.lower().replace(' ', '_')}.png",
                dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [saved] cluster_pca_{method_name.lower().replace(' ', '_')}.png")
def _plot_silhouette(X: np.ndarray, labels: np.ndarray, method_name: str, output_dir: Path):
    """Silhouette analysis plot."""
    n_clusters = len(set(labels) - {-1})
    if n_clusters < 2:
        return
    # Exclude noise points
    mask = labels >= 0
    if mask.sum() < n_clusters + 1:
        return
    sil_vals = silhouette_samples(X[mask], labels[mask])
    avg_sil = silhouette_score(X[mask], labels[mask])
    fig, ax = plt.subplots(figsize=(10, 7))
    y_lower = 10
    valid_labels = np.sort(np.unique(labels[mask]))
    colors = plt.cm.Set2(np.linspace(0, 1, len(valid_labels)))
    for i, c in enumerate(valid_labels):
        c_sil = sil_vals[labels[mask] == c]
        c_sil.sort()
        size = c_sil.shape[0]
        y_upper = y_lower + size
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, c_sil,
                         facecolor=colors[i], edgecolor=colors[i], alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size, str(c), fontsize=10)
        y_lower = y_upper + 10
    ax.axvline(x=avg_sil, color="red", linestyle="--", label=f"mean={avg_sil:.3f}")
    ax.set_xlabel("Silhouette coefficient", fontsize=12)
    ax.set_ylabel("Cluster label", fontsize=12)
    ax.set_title(f"{method_name} silhouette analysis (mean={avg_sil:.3f})", fontsize=14)
    ax.legend(fontsize=10)
    fig.savefig(output_dir / f"cluster_silhouette_{method_name.lower().replace(' ', '_')}.png",
                dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [saved] cluster_silhouette_{method_name.lower().replace(' ', '_')}.png")
def _plot_cluster_heatmap(cluster_desc: pd.DataFrame, method_name: str, output_dir: Path):
    """Heatmap of per-cluster feature means."""
    # Keep only the numeric feature columns
    feat_cols = [c for c in FEATURE_COLS if c in cluster_desc.columns]
    data = cluster_desc[feat_cols].copy()
    # Z-score each column so features on different scales are comparable
    data_norm = (data - data.mean()) / (data.std() + 1e-10)
    fig, ax = plt.subplots(figsize=(14, max(6, len(data) * 1.2)))
    # Use state names as row labels
    row_labels = [f"{idx}-{cluster_desc.loc[idx, 'state_cn']}" for idx in data.index]
    im = ax.imshow(data_norm.values, cmap='RdYlGn', aspect='auto')
    ax.set_xticks(range(len(feat_cols)))
    ax.set_xticklabels(feat_cols, rotation=45, ha='right', fontsize=10)
    ax.set_yticks(range(len(row_labels)))
    ax.set_yticklabels(row_labels, fontsize=11)
    # Show the raw values inside the cells
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            val = data.iloc[i, j]
            ax.text(j, i, f"{val:.4f}", ha='center', va='center', fontsize=8,
                    color='black' if abs(data_norm.iloc[i, j]) < 1.5 else 'white')
    plt.colorbar(im, ax=ax, shrink=0.8, label="standardized value")
    ax.set_title(f"{method_name} per-cluster feature heatmap", fontsize=14)
    fig.savefig(output_dir / f"cluster_heatmap_{method_name.lower().replace(' ', '_')}.png",
                dpi=150, bbox_inches='tight')
    plt.close(fig)
    print(f"  [saved] cluster_heatmap_{method_name.lower().replace(' ', '_')}.png")
def _plot_transition_heatmap(trans_matrix: np.ndarray, cluster_desc: pd.DataFrame,
                             output_dir: Path):
    """Heatmap of the state transition probability matrix."""
    state_names = [cluster_desc.loc[idx, "state_cn"] for idx in cluster_desc.index]
    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(trans_matrix, cmap='YlOrRd', vmin=0, vmax=1, aspect='auto')
    n = len(state_names)
    ax.set_xticks(range(n))
    ax.set_xticklabels(state_names, rotation=45, ha='right', fontsize=11)
    ax.set_yticks(range(n))
    ax.set_yticklabels(state_names, fontsize=11)
    # Annotate the probabilities
    for i in range(n):
        for j in range(n):
            color = 'white' if trans_matrix[i, j] > 0.5 else 'black'
            ax.text(j, i, f"{trans_matrix[i, j]:.3f}", ha='center', va='center',
                    fontsize=11, color=color, fontweight='bold')
    plt.colorbar(im, ax=ax, shrink=0.8, label="transition probability")
    ax.set_xlabel("Next state", fontsize=12)
    ax.set_ylabel("Current state", fontsize=12)
    ax.set_title("Markov state transition probability matrix", fontsize=14)
    fig.savefig(output_dir / "cluster_transition_matrix.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print("  [saved] cluster_transition_matrix.png")
def _plot_state_timeseries(df_clean: pd.DataFrame, labels: np.ndarray,
                           cluster_desc: pd.DataFrame, output_dir: Path):
    """Time series plot of market state over time."""
    fig, axes = plt.subplots(2, 1, figsize=(18, 10), height_ratios=[2, 1], sharex=True)
    dates = df_clean.index
    close = df_clean["close"].values
    states = np.sort(np.unique(labels))
    colors = plt.cm.Set2(np.linspace(0, 1, len(states)))
    color_map = {s: colors[i] for i, s in enumerate(states)}
    # Top panel: price path, colored by state
    ax1 = axes[0]
    for i in range(len(dates) - 1):
        ax1.plot([dates[i], dates[i + 1]], [close[i], close[i + 1]],
                 color=color_map[labels[i]], linewidth=0.8)
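    # NOTE: drawing one Line2D per day creates O(n) artists and is slow for
    # long histories; matplotlib.collections.LineCollection would render the
    # same segments as a single artist.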
    # Add a legend
    from matplotlib.patches import Patch
    legend_patches = []
    for s in states:
        name = cluster_desc.loc[s, "state_cn"] if s in cluster_desc.index else f"Cluster {s}"
        legend_patches.append(Patch(color=color_map[s], label=name))
    ax1.legend(handles=legend_patches, fontsize=9, loc='upper left')
    ax1.set_ylabel("BTC price (USDT)", fontsize=12)
    ax1.set_title("BTC price and market state over time", fontsize=14)
    ax1.set_yscale('log')
    ax1.grid(True, alpha=0.3)
    # Bottom panel: state label timeline
    ax2 = axes[1]
    state_colors = [color_map[l] for l in labels]
    ax2.bar(dates, np.ones(len(dates)), color=state_colors, width=1.5, edgecolor='none')
    ax2.set_yticks([])
    ax2.set_ylabel("Market state", fontsize=12)
    ax2.set_xlabel("Date", fontsize=12)
    plt.tight_layout()
    fig.savefig(output_dir / "cluster_state_timeseries.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print("  [saved] cluster_state_timeseries.png")
def _plot_kmeans_selection(kmeans_results: Dict, gmm_results: Dict, output_dir: Path):
    """Model-selection comparison plots: silhouette, elbow, and BIC."""
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    # 1. K-Means silhouette scores
    ks_km = sorted(kmeans_results.keys())
    sils_km = [kmeans_results[k]["silhouette"] for k in ks_km]
    axes[0].plot(ks_km, sils_km, 'bo-', linewidth=2, markersize=8)
    best_k_km = ks_km[np.argmax(sils_km)]
    axes[0].axvline(x=best_k_km, color='red', linestyle='--', alpha=0.7)
    axes[0].set_xlabel("k", fontsize=12)
    axes[0].set_ylabel("Silhouette score", fontsize=12)
    axes[0].set_title("K-Means silhouette scores", fontsize=13)
    axes[0].grid(True, alpha=0.3)
    # 2. K-Means inertia (elbow method)
    inertias = [kmeans_results[k]["inertia"] for k in ks_km]
    axes[1].plot(ks_km, inertias, 'gs-', linewidth=2, markersize=8)
    axes[1].set_xlabel("k", fontsize=12)
    axes[1].set_ylabel("Inertia", fontsize=12)
    axes[1].set_title("K-Means elbow method", fontsize=13)
    axes[1].grid(True, alpha=0.3)
    # 3. GMM BIC
    ks_gmm = sorted(gmm_results.keys())
    bics = [gmm_results[k]["bic"] for k in ks_gmm]
    axes[2].plot(ks_gmm, bics, 'r^-', linewidth=2, markersize=8)
    best_k_gmm = ks_gmm[np.argmin(bics)]
    axes[2].axvline(x=best_k_gmm, color='blue', linestyle='--', alpha=0.7)
    axes[2].set_xlabel("k", fontsize=12)
    axes[2].set_ylabel("BIC", fontsize=12)
    axes[2].set_title("GMM BIC selection", fontsize=13)
    axes[2].grid(True, alpha=0.3)
    plt.tight_layout()
    fig.savefig(output_dir / "cluster_k_selection.png", dpi=150, bbox_inches='tight')
    plt.close(fig)
    print("  [saved] cluster_k_selection.png")
# ============================================================
# Main entry point
# ============================================================
def run_clustering_analysis(df: pd.DataFrame, output_dir: "str | Path" = "output/clustering") -> Dict:
    """
    Market state clustering and Markov chain analysis - main entry point.

    Parameters
    ----------
    df : pd.DataFrame
        Daily-bar data with derived features already added via add_derived_features()
    output_dir : str or Path
        Output directory for the charts

    Returns
    -------
    results : dict
        Clustering results, transition matrix, stationary distribution, etc.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # Font fallback chain (macOS first, then Windows, then matplotlib's default)
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False
    print("=" * 60)
    print("  BTC market state clustering and Markov chain analysis")
    print("=" * 60)
    # ---- 1. Feature preparation ----
    df_clean, X_scaled, scaler = _prepare_features(df)
    # ---- 2. K-Means clustering ----
    best_k_km, km_labels, kmeans_results = _run_kmeans(X_scaled)
    # ---- 3. GMM clustering ----
    best_k_gmm, gmm_labels, gmm_results = _run_gmm(X_scaled)
    # ---- 4. HDBSCAN clustering ----
    hdbscan_labels, hdbscan_info = _run_hdbscan(X_scaled)
    # ---- 5. Model-selection comparison plots ----
    print("\n[plots] generating model-selection comparison charts...")
    _plot_kmeans_selection(kmeans_results, gmm_results, output_dir)
    # ---- 6. K-Means cluster interpretation ----
    km_desc = _interpret_clusters(df_clean, km_labels, "K-Means")
    # ---- 7. GMM cluster interpretation ----
    gmm_desc = _interpret_clusters(df_clean, gmm_labels, "GMM")
    # ---- 8. Markov chain analysis (on the K-Means labels) ----
    trans_matrix, stationary, holding_time = _compute_transition_matrix(km_labels)
    _print_markov_results(trans_matrix, stationary, holding_time, km_desc)
    # ---- 9. Visualization ----
    print("\n[plots] generating analysis charts...")
    # PCA scatter plots
    _plot_pca_scatter(X_scaled, km_labels, km_desc, "K-Means", output_dir)
    _plot_pca_scatter(X_scaled, gmm_labels, gmm_desc, "GMM", output_dir)
    if hdbscan_labels is not None and hdbscan_info.get("n_clusters", 0) >= 2:
        # Interpret the HDBSCAN clusters as well (noise points labeled -1)
        hdb_desc = _interpret_clusters(df_clean, hdbscan_labels, "HDBSCAN")
        _plot_pca_scatter(X_scaled, hdbscan_labels, hdb_desc, "HDBSCAN", output_dir)
    # Silhouette plot
    _plot_silhouette(X_scaled, km_labels, "K-Means", output_dir)
    # Per-cluster feature heatmaps
    _plot_cluster_heatmap(km_desc, "K-Means", output_dir)
    _plot_cluster_heatmap(gmm_desc, "GMM", output_dir)
    # Transition matrix heatmap
    _plot_transition_heatmap(trans_matrix, km_desc, output_dir)
    # State time series plot
    _plot_state_timeseries(df_clean, km_labels, km_desc, output_dir)
    # ---- 10. Collect results ----
results = {
"kmeans": {
"best_k": best_k_km,
"labels": km_labels,
"cluster_desc": km_desc,
"all_results": kmeans_results,
},
"gmm": {
"best_k": best_k_gmm,
"labels": gmm_labels,
"cluster_desc": gmm_desc,
"all_results": gmm_results,
},
"hdbscan": {
"labels": hdbscan_labels,
"info": hdbscan_info,
},
"markov": {
"transition_matrix": trans_matrix,
"stationary_distribution": stationary,
"holding_time": holding_time,
},
"features": {
"df_clean": df_clean,
"X_scaled": X_scaled,
"scaler": scaler,
},
}
print("\n" + "=" * 60)
print(" 聚类与马尔可夫链分析完成!")
print("=" * 60)
return results
# ============================================================
# CLI entry point
# ============================================================
if __name__ == "__main__":
from data_loader import load_daily
from preprocessing import add_derived_features
df = load_daily()
df = add_derived_features(df)
results = run_clustering_analysis(df, output_dir="output/clustering")