import csv
import re
import sys
import time
from typing import List

import requests


def fetch_html(url: str, timeout: int = 15) -> str:
    """
    Fetch the target page's HTML (with a browser UA and retries).

    Args:
        url: Target page URL.
        timeout: Request timeout in seconds.

    Returns:
        The page HTML as a string; raises on failure.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        ),
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
    }
    last_exc = None
    for _ in range(3):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp.text
            last_exc = Exception(f"HTTP {resp.status_code}")
        except Exception as e:
            last_exc = e
        time.sleep(1.0)
    raise last_exc if last_exc else Exception("Unknown network error")


def parse_chain_names(html: str) -> List[str]:
    """
    Parse chain names from a CoinGecko chains page HTML (tolerant of
    several markup variants).

    Args:
        html: Page HTML string.

    Returns:
        Chain names in the order they appear on the page.
    """
    names: List[str] = []

    # Strategy 1: match the link text of anchors that point at chain detail
    # pages, i.e. hrefs under /zh/chains/ or /en/chains/.
    link_name_pattern = re.compile(
        r'<a[^>]+href="/(?:zh|en)/chains/[^"]+"[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )
    for m in link_name_pattern.finditer(html):
        # Strip any tags nested inside the anchor before keeping the text.
        text = re.sub(r"<[^>]+>", "", m.group(1)).strip()
        if text:
            names.append(text)
    # Only link matching is used, to avoid scooping up unrelated nav/copy text.

    # Trim likely non-name noise (deduplication happens in main()).
    cleaned: List[str] = []
    for n in names:
        # Filter out common column-header text and overly long strings.
        if n.lower() in {"chains", "tvl", "volume", "market share"}:
            continue
        if len(n) > 100:
            continue
        cleaned.append(n)

    # Some pages include extra link text; keep at most the first 100 entries.
    return cleaned[:100]
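
# A minimal sanity check for parse_chain_names, assuming anchors shaped like
# the hypothetical snippet below (real CoinGecko markup may differ):
#
#   >>> parse_chain_names('<a href="/en/chains/ethereum">Ethereum</a>')
#   ['Ethereum']
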
def read_current_max_index(csv_path: str) -> int:
    """
    Read the current maximum rank (first column) from the CSV.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        The maximum rank; 0 if the file is missing or only has a header.
    """
    try:
        with open(csv_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            rows = list(reader)
        max_idx = 0
        for row in rows[1:]:
            if not row:
                continue
            try:
                max_idx = max(max_idx, int(row[0]))
            except Exception:
                pass
        return max_idx
    except FileNotFoundError:
        return 0


def append_to_csv(csv_path: str, names: List[str], start_index: int) -> None:
    """
    Append the parsed chain names to the CSV (remaining fields are filled
    with unknown / not-applicable placeholders).

    Args:
        csv_path: Path to the CSV file.
        names: Chain names to append.
        start_index: Starting rank number (e.g. 101).
    """
    with open(csv_path, "a", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        for i, name in enumerate(names):
            idx = start_index + i
            # Placeholder values stay in Chinese to match the existing rows:
            # 未知 = unknown, 待补充 = to be filled in, 否 = no,
            # 不适用 = not applicable.
            writer.writerow(
                [
                    idx,
                    name,
                    "未知",
                    "未知",
                    "待补充",
                    "否",
                    "不适用",
                    "不适用",
                ]
            )


def read_existing_names(csv_path: str) -> List[str]:
    """
    Read the chain names already present in the CSV (second column).

    Args:
        csv_path: Path to the CSV file.

    Returns:
        List of names; empty if the file does not exist.
    """
    try:
        with open(csv_path, "r", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            return [row[1] for row in reader if row and len(row) >= 2]
    except FileNotFoundError:
        return []


def main() -> None:
    """
    Scrape one page of chain listings and append them to the CSV in order.

    Notes:
        - Page 2 is scraped by default; pass a page number on the command
          line to override, e.g. `python script.py 3`.
        - The starting rank continues from the CSV's current maximum rank,
          so numbering is never repeated.
    """
    page = 2
    if len(sys.argv) >= 2:
        try:
            page = int(sys.argv[1])
        except Exception:
            pass

    url_zh = f"https://www.coingecko.com/zh/chains?page={page}"
    url_en = f"https://www.coingecko.com/en/chains?page={page}"
    csv_path = "chain/top_100_chains.csv"

    names: List[str] = []
    # Merge the parses of both language versions of the page for coverage.
    try:
        html_zh = fetch_html(url_zh)
        names.extend(parse_chain_names(html_zh))
    except Exception:
        pass
    try:
        html_en = fetch_html(url_en)
        names.extend(parse_chain_names(html_en))
    except Exception:
        pass

    # Deduplicate while preserving order.
    seen = set()
    names = [n for n in names if not (n in seen or seen.add(n))]

    if not names:
        raise RuntimeError(
            "No chain names parsed; retry later or adjust the parsing strategy"
        )

    # Append only names the CSV does not already contain, to avoid duplicates.
    existing = set(read_existing_names(csv_path))
    names_to_append = [n for n in names if n not in existing]
    if not names_to_append:
        return

    start_index = read_current_max_index(csv_path) + 1
    append_to_csv(csv_path, names_to_append, start_index)


if __name__ == "__main__":
    main()
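
# Example run (hypothetical name and output; the column layout is assumed from
# append_to_csv above, and the starting rank from read_current_max_index):
#   $ python script.py 3
#   appends rows like: 101,Ethereum,未知,未知,待补充,否,不适用,不适用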