From 2b0eb4449ffe5256f987a5d8a990692ca59be493 Mon Sep 17 00:00:00 2001
From: riba2534
Date: Wed, 4 Feb 2026 01:20:55 +0800
Subject: [PATCH] feat: add a one-click download script for K-line data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add download_data.py, which downloads all 15 K-line granularities from the Binance API
- Support resumable downloads, rate-limit retries, and safe Ctrl+C interruption
- Update the README data-acquisition instructions and project structure
- Add the requests dependency to requirements.txt

Co-Authored-By: Claude Opus 4.5
---
 README.md        |  27 +++--
 download_data.py | 263 +++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |   1 +
 3 files changed, 283 insertions(+), 8 deletions(-)
 create mode 100644 download_data.py

diff --git a/README.md b/README.md
index f349d1a..dbd7143 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,10 @@
 ```
 btc_price_anany/
 ├── main.py               # CLI entry point
+├── download_data.py      # data download script
 ├── requirements.txt      # Python dependencies
 ├── LICENSE               # MIT license
-├── data/                 # 15 BTC/USDT K-line CSVs (1m ~ 1M)
+├── data/                 # 15 BTC/USDT K-line CSVs (download required)
 ├── src/                  # 30 analysis and utility modules
 │   ├── data_loader.py    # data loading and validation
 │   ├── preprocessing.py  # derived feature engineering
@@ -44,8 +45,8 @@ btc_price_anany/
 
 ### Installation
 
 ```bash
-git clone https://github.com/riba2534/btc_price_anany.git
-cd btc_price_anany
+git clone https://github.com/riba2534/bitcoin-all-klines-analysis.git
+cd bitcoin-all-klines-analysis
 pip install -r requirements.txt
 ```
@@ -85,13 +86,23 @@ python main.py --start 2020-01-01 --end 2025-12-31
 | `btcusdt_1w.csv` | 1 week | ~450 |
 | `btcusdt_1mo.csv` | 1 month | ~100 |
 
-All data comes from the public Binance API, covering 2017-08 to 2026-02.
+All data comes from the public Binance API, covering 2017-08-17 (the BTCUSDT listing date) to the present.
 
-> **The data is not included in the repository.** Download it from the official Binance data source and place it in the `data/` directory:
+> **The data is not included in the repository.** Use the bundled script to download everything in one step:
 >
-> - K-line data download page:
-> - Replace `1m` in the URL with the desired granularity (`3m`, `5m`, `15m`, `30m`, `1h`, `2h`, `4h`, `6h`, `8h`, `12h`, `1d`, `3d`, `1w`, `1mo`) to get data for that interval
-> - After downloading, merge the files into a single CSV named `btcusdt_{interval}.csv` and place it in the `data/` directory
+> ```bash
+> # Download all 15 granularities (roughly 30-60 minutes; resumable)
+> python download_data.py
+>
+> # Download only the given granularities
+> python download_data.py 1d 1h 4h
+>
+> # List the available granularities
+> python download_data.py --list
+> ```
+>
+> You can also download the data manually from Binance:
+> (replace `1m` in the URL with the desired granularity)
 
 ## Analysis Modules
 
diff --git a/download_data.py b/download_data.py
new file mode 100644
index 0000000..5cc135a
--- /dev/null
+++ b/download_data.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+"""
+BTC/USDT K-line data download script.
+
+Downloads the full K-line history for all 15 time granularities from the
+public Binance API.
+Data range: 2017-08-17 (the BTCUSDT listing date) to the present.
+Resumable: data that has already been downloaded is never fetched again.
+
+Usage:
+    python download_data.py              # download all 15 granularities
+    python download_data.py 1d 1h 4h     # download only the given granularities
+    python download_data.py --list       # list the available granularities
+"""
+
+import csv
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+import requests
+
+# ============================================================
+# Configuration
+# ============================================================
+
+SYMBOL = "BTCUSDT"
+BASE_URL = "https://api.binance.com/api/v3/klines"
+LIMIT = 1000  # maximum rows per request
+
+# BTCUSDT listing time
+START_MS = int(datetime(2017, 8, 17, tzinfo=timezone.utc).timestamp() * 1000)
+
+# All 15 granularities (Binance API parameter values)
+ALL_INTERVALS = [
+    "1m", "3m", "5m", "15m", "30m",
+    "1h", "2h", "4h", "6h", "8h", "12h",
+    "1d", "3d", "1w", "1M",
+]
+
+# API interval → granularity tag used in local file names
+INTERVAL_TO_FILENAME = {i: i for i in ALL_INTERVALS}
+INTERVAL_TO_FILENAME["1M"] = "1mo"  # the Binance API uses '1M', project files use '1mo'
+
+# CSV header, matching the column names expected by src/data_loader.py
+CSV_HEADER = [
+    "open_time", "open", "high", "low", "close", "volume",
+    "close_time", "quote_volume", "trades",
+    "taker_buy_volume", "taker_buy_quote_volume", "ignore",
+]
+
+
+# ============================================================
+# Download logic
+# ============================================================
+
+def get_last_timestamp(filepath: Path) -> int | None:
+    """Read the close_time of the last row of an existing CSV, used for resuming."""
+    if not filepath.exists() or filepath.stat().st_size == 0:
+        return None
+    last_line = ""
+    with open(filepath, "rb") as f:
+        # Scan backwards from the end of the file for the last data row.
+        f.seek(0, 2)
+        pos = f.tell() - 1
+        f.seek(pos)
+        if f.read(1) == b"\n":  # skip the trailing newline, if any
+            pos -= 1
+        while pos > 0:
+            f.seek(pos)
+            if f.read(1) == b"\n":
+                last_line = f.readline().decode().strip()
+                break
+            pos -= 1
+        if not last_line:
+            # Single-row file (or no newline found): fall back to a forward scan.
+            f.seek(0)
+            for line in f:
+                last_line = line.decode().strip()
+    if not last_line or last_line.startswith("open_time"):
+        return None
+    try:
+        close_time = int(last_line.split(",")[6])
+        return close_time
+    except (IndexError, ValueError):
+        return None
+
+
+def count_lines(filepath: Path) -> int:
+    """Quickly count the CSV data rows (excluding the header)."""
+    if not filepath.exists():
+        return 0
+    with open(filepath, "rb") as f:
+        count = sum(1 for _ in f) - 1  # subtract the header row
+    return max(0, count)
+
+
+def download_interval(interval: str, output_dir: Path) -> int:
+    """Download the full K-line history for one granularity; return the final row count."""
+    tag = INTERVAL_TO_FILENAME[interval]
+    filepath = output_dir / f"btcusdt_{tag}.csv"
+
+    existing_rows = count_lines(filepath)
+    last_ts = get_last_timestamp(filepath)
+
+    if last_ts is not None:
+        start_time = last_ts + 1
+        print(f"  resuming: {existing_rows:,} rows already on disk, "
+              f"continuing from {ms_to_date(start_time)}")
+    else:
+        start_time = START_MS
+
+    now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
+    if start_time >= now_ms:
+        print("  already up to date, skipping")
+        return existing_rows
+
+    # Write mode: append when resuming, otherwise create a new file.
+    mode = "a" if existing_rows > 0 else "w"
+    new_rows = 0
+    retries = 0
+    max_retries = 10
+
+    with open(filepath, mode, newline="") as f:
+        writer = csv.writer(f)
+        if existing_rows == 0:
+            writer.writerow(CSV_HEADER)
+
+        current = start_time
+        while current < now_ms:
+            params = {
+                "symbol": SYMBOL,
+                "interval": interval,
+                "startTime": current,
+                "limit": LIMIT,
+            }
+            try:
+                resp = requests.get(BASE_URL, params=params, timeout=30)
+
+                if resp.status_code == 429:
+                    wait = int(resp.headers.get("Retry-After", 60))
+                    print(f"\n  [rate limited] waiting {wait}s...")
+                    time.sleep(wait)
+                    continue
+                if resp.status_code == 418:
+                    print("\n  [IP banned] waiting 120s...")
+                    time.sleep(120)
+                    continue
+
+                resp.raise_for_status()
+                data = resp.json()
+
+                if not data:
+                    break
+
+                for row in data:
+                    writer.writerow(row)
+                new_rows += len(data)
+
+                # Start of the next batch
+                current = data[-1][6] + 1  # last close_time + 1
+
+                # Progress
+                total = existing_rows + new_rows
+                pct = min(100, (current - START_MS) / max(1, now_ms - START_MS) * 100)
+                print(f"\r  {ms_to_date(current)} | "
+                      f"{total:>10,} rows | {pct:5.1f}%", end="", flush=True)
+
+                retries = 0
+                time.sleep(0.05)
+
+            except KeyboardInterrupt:
+                print(f"\n  [interrupted] {existing_rows + new_rows:,} rows saved")
+                return existing_rows + new_rows
+            except requests.exceptions.RequestException as e:
+                retries += 1
+                if retries > max_retries:
+                    print(f"\n  [failed] {max_retries} consecutive errors, aborting: {e}")
+                    break
+                wait = min(2 ** retries, 60)
+                print(f"\n  [retry {retries}/{max_retries}] in {wait}s: {e}")
+                time.sleep(wait)
+
+    total = existing_rows + new_rows
+    print(f"\n  done: +{new_rows:,} rows, {total:,} total → {filepath.name}")
+    return total
+
+
+def ms_to_date(ms: int) -> str:
+    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
+
+
+# ============================================================
+# Entry point
+# ============================================================
+
+def parse_interval(arg: str) -> str:
+    """Map a user-supplied granularity tag to a Binance API interval."""
+    s = arg.strip()
+    # Accept both the file tag '1mo' and the API value '1M' for monthly data.
+    if s.lower() == "1mo" or s == "1M":
+        return "1M"
+    for iv in ALL_INTERVALS:
+        if iv.lower() == s.lower():
+            return iv
+    return ""
+
+
+def main():
+    output_dir = Path(__file__).resolve().parent / "data"
+    output_dir.mkdir(exist_ok=True)
+
+    # --list mode
+    if "--list" in sys.argv:
+        print("Available granularities:")
+        for iv in ALL_INTERVALS:
+            tag = INTERVAL_TO_FILENAME[iv]
+            print(f"  {tag:5s} (API: {iv})")
+        return
+
+    # Parse arguments
+    if len(sys.argv) > 1:
+        intervals = []
+        for arg in sys.argv[1:]:
+            iv = parse_interval(arg)
+            if not iv:
+                print(f"Unknown granularity: {arg}")
+                tags = [INTERVAL_TO_FILENAME[i] for i in ALL_INTERVALS]
+                print(f"Available: {', '.join(tags)}")
+                sys.exit(1)
+            intervals.append(iv)
+    else:
+        intervals = list(ALL_INTERVALS)
+
+    tags = [INTERVAL_TO_FILENAME[i] for i in intervals]
+    print("=" * 60)
+    print("BTC/USDT K-line data download")
+    print("=" * 60)
+    print(f"Symbol:        {SYMBOL}")
+    print(f"Granularities: {', '.join(tags)}")
+    print(f"Start date:    {ms_to_date(START_MS)}")
+    print(f"Output dir:    {output_dir}")
+    print("Dependency:    pip install requests")
+    print("=" * 60)
+
+    results = {}
+    t0 = time.time()
+
+    for i, interval in enumerate(intervals, 1):
+        tag = INTERVAL_TO_FILENAME[interval]
+        print(f"\n[{i}/{len(intervals)}] {tag}")
+        rows = download_interval(interval, output_dir)
+        results[tag] = rows
+
+    elapsed = time.time() - t0
+    m, s = divmod(int(elapsed), 60)
+
+    print(f"\n{'=' * 60}")
+    print(f"All done (elapsed {m}m{s}s):")
+    print("=" * 60)
+    for tag, rows in results.items():
+        print(f"  {tag:5s} → {rows:>10,} rows")
+    print(f"\nData directory: {output_dir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index d481281..41553d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+requests>=2.28
 pandas>=2.0
 numpy>=1.24
 scipy>=1.11
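
As a quick sanity check after applying the patch and running the script, a downloaded file can be loaded with pandas (a minimal sketch, not part of the patch; it assumes `data/btcusdt_1d.csv` has already been downloaded and that pandas is installed per requirements.txt):

```python
import pandas as pd

# Column names come straight from CSV_HEADER in download_data.py.
df = pd.read_csv("data/btcusdt_1d.csv")

# Binance timestamps are epoch milliseconds in UTC.
df["open_time"] = pd.to_datetime(df["open_time"], unit="ms", utc=True)

print(df[["open_time", "open", "high", "low", "close", "volume"]].tail())
print(f"{len(df):,} rows, {df['open_time'].min()} to {df['open_time'].max()}")
```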