Files
ca_auto_table/chain/append_page2_from_coingecko.py
2025-12-04 15:50:55 +08:00

208 lines
5.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import csv
import re
import time
from typing import List
import requests
def fetch_html(
    url: str,
    timeout: int = 15,
    attempts: int = 3,
    retry_delay: float = 1.0,
) -> str:
    """Download a page and return its HTML text, retrying on failure.

    The request is sent with browser-like headers. A transport error raised
    by ``requests`` or any non-200 response counts as a failed attempt.

    Args:
        url: Address of the page to fetch.
        timeout: Per-request timeout in seconds.
        attempts: Maximum number of requests to make. With a value below 1
            no request is sent and the function raises immediately.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: If the final attempt failed at the
            transport level (connection error, timeout, ...).
        RuntimeError: If the final attempt returned a non-200 status
            (message ``"HTTP <code>"``) or no attempt was made.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    last_exc = None
    for attempt in range(attempts):
        # Wait only *between* attempts, so the final failure is reported
        # without a pointless extra delay.
        if attempt:
            time.sleep(retry_delay)
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            last_exc = exc
            continue
        if resp.status_code == 200:
            return resp.text
        last_exc = RuntimeError(f"HTTP {resp.status_code}")
    raise last_exc if last_exc else RuntimeError("未知的网络错误")
# Anchor whose href points at a chain detail page:
# /zh/chains/<slug> or /en/chains/<slug>. Group 1 is the inner markup.
_CHAIN_LINK_RE = re.compile(
    r'<a[^>]+href="/(?:zh|en)/chains/[^"]+"[^>]*>(.*?)</a>',
    re.IGNORECASE | re.DOTALL,
)
# Any HTML tag nested inside the anchor text.
_INNER_TAG_RE = re.compile(r"<[^>]+>")
# Link labels that match the URL pattern but are not chain names.
_NON_NAME_LABELS = frozenset({"chains", "tvl", "volume", "market share"})
_MAX_NAME_LENGTH = 100
_MAX_NAMES = 100


def parse_chain_names(html: str) -> List[str]:
    """Extract chain names from the HTML of a CoinGecko chains page.

    Only the text of anchors linking to a chain detail page is used, so
    other navigation and page copy is not collected. For each anchor the
    nested tags are stripped, HTML entities are decoded and runs of
    whitespace are collapsed to a single space.

    Args:
        html: Page HTML as a string.

    Returns:
        Up to 100 distinct chain names in order of first appearance.
        Empty labels, known non-name labels and labels longer than 100
        characters are skipped.
    """
    # Imported locally by name because the parameter ``html`` shadows the
    # standard-library module of the same name inside this function.
    from html import unescape

    names: List[str] = []
    seen = set()
    for match in _CHAIN_LINK_RE.finditer(html):
        text = unescape(_INNER_TAG_RE.sub("", match.group(1)))
        text = " ".join(text.split())
        if not text or text in seen:
            continue
        if text.lower() in _NON_NAME_LABELS or len(text) > _MAX_NAME_LENGTH:
            continue
        seen.add(text)
        names.append(text)
        # De-duplicating before applying the cap keeps repeated links from
        # crowding real names out of the result.
        if len(names) == _MAX_NAMES:
            break
    return names
def read_current_max_index(csv_path: str) -> int:
    """Return the largest rank found in the first column of the CSV.

    The first row is treated as a header and ignored. Empty rows and rows
    whose first cell is not an integer are skipped.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        The highest rank, or 0 when the file is missing, holds only a
        header, or contains no positive integer rank.
    """
    try:
        handle = open(csv_path, "r", encoding="utf-8")
    except FileNotFoundError:
        return 0

    highest = 0
    with handle:
        records = csv.reader(handle)
        next(records, None)  # discard the header row
        for record in records:
            if not record:
                continue
            try:
                rank = int(record[0])
            except ValueError:
                continue
            if rank > highest:
                highest = rank
    return highest
def _lacks_trailing_newline(path: str) -> bool:
    """Return True if *path* exists, is non-empty and does not end in a line break."""
    import os

    try:
        with open(path, "rb") as f:
            f.seek(0, os.SEEK_END)
            if f.tell() == 0:
                return False
            f.seek(-1, os.SEEK_END)
            return f.read(1) not in (b"\n", b"\r")
    except FileNotFoundError:
        return False


def append_to_csv(csv_path: str, names: List[str], start_index: int) -> None:
    """Append one row per chain name to the CSV.

    Each row holds the rank, the name, and fixed placeholder values
    ("unknown" / "to be filled in" / "not applicable") for the rest.

    Args:
        csv_path: Path of the CSV file; it is created if it does not exist.
        names: Chain names to append, in order.
        start_index: Rank given to the first name (e.g. 101); later names
            get consecutive ranks.
    """
    # Checked before opening in append mode: if the existing file's last
    # line is unterminated, the first new row would otherwise be glued
    # onto it.
    needs_separator = bool(names) and _lacks_trailing_newline(csv_path)
    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        if needs_separator:
            f.write(writer.dialect.lineterminator)
        writer.writerows(
            [
                idx,
                name,
                "未知",
                "未知",
                "待补充",
                "",
                "不适用",
                "不适用",
            ]
            for idx, name in enumerate(names, start=start_index)
        )
def read_existing_names(csv_path: str) -> List[str]:
    """Return the chain names already stored in the CSV (second column).

    The first row is treated as a header and ignored; rows with fewer
    than two cells are skipped.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        The names in file order, or an empty list if the file is missing.
    """
    collected: List[str] = []
    try:
        with open(csv_path, "r", encoding="utf-8") as handle:
            records = csv.reader(handle)
            next(records, None)  # discard the header row
            for record in records:
                if len(record) < 2:
                    continue
                collected.append(record[1])
    except FileNotFoundError:
        return []
    return collected
def main() -> None:
    """Scrape one CoinGecko chains page and append new chain names to the CSV.

    The page number defaults to 2 and can be overridden by the first
    command-line argument, e.g. ``python script.py 3``. Both the Chinese
    and English versions of the page are fetched and their results merged.
    Names already present in the CSV are skipped, and ranks continue from
    the CSV's current maximum so numbering is not repeated.

    Raises:
        RuntimeError: If no chain name could be parsed from either page.
            The last fetch/parse error, if any, is attached as the cause.
    """
    import sys

    page = 2
    if len(sys.argv) >= 2:
        try:
            page = int(sys.argv[1])
        except ValueError:
            # Keep the original fallback to the default page, but say so.
            print(
                f"warning: ignoring invalid page argument {sys.argv[1]!r}; "
                f"using page {page}",
                file=sys.stderr,
            )

    urls = (
        f"https://www.coingecko.com/zh/chains?page={page}",
        f"https://www.coingecko.com/en/chains?page={page}",
    )
    csv_path = "chain/top_100_chains.csv"

    names: List[str] = []
    last_error = None
    for url in urls:
        # Best effort: a failure on one language page must not prevent
        # the other from being used, but the failure is reported.
        try:
            names.extend(parse_chain_names(fetch_html(url)))
        except Exception as exc:
            last_error = exc
            print(f"warning: failed to process {url}: {exc}", file=sys.stderr)

    # De-duplicate while keeping first-seen order.
    names = list(dict.fromkeys(names))
    if not names:
        raise RuntimeError("未解析到链名称,请稍后重试或更换解析策略") from last_error

    # Append only names the CSV does not already contain.
    existing = set(read_existing_names(csv_path))
    names_to_append = [n for n in names if n not in existing]
    if not names_to_append:
        return

    start_index = read_current_max_index(csv_path) + 1
    append_to_csv(csv_path, names_to_append, start_index)


if __name__ == "__main__":
    main()