From e2d2b0b75b4f6e4676ab800675c4d254d95b5fcd Mon Sep 17 00:00:00 2001 From: bvwl <2201101122@qq.com> Date: Thu, 20 Nov 2025 11:42:18 +0800 Subject: [PATCH] 0.0.3 --- .gitignore | 2 + README.md | 4 + {app => back}/apis/__init__.py | 0 {app => back}/apis/country/__init__.py | 0 {app => back}/apis/country/food/schema.py | 0 {app => back}/apis/country/food/view.py | 0 {app => back}/apis/country/info/schema.py | 32 +- {app => back}/apis/country/info/view.py | 40 +- {app => back}/apis/country/models.py | 34 +- {app => back}/apis/country/shop/schema.py | 0 {app => back}/apis/country/shop/view.py | 0 {app => back}/main.py | 0 .../models/0_20251118164406_init.py | 0 {app => back}/pyproject.toml | 0 {app => back}/settings.py | 0 {app => back}/utils/__init__.py | 0 {app => back}/utils/browser_api.py | 0 {app => back}/utils/decorators.py | 0 {app => back}/utils/exceptions.py | 0 {app => back}/utils/logs.py | 0 {app => back}/utils/out_base.py | 0 {app => back}/utils/redis_tool.py | 0 {app => back}/utils/session_store.py | 0 {app => back}/utils/time_tool.py | 0 spider/bit_browser.py | 522 +++++++++++ spider/mail_.py | 835 ++++++++++++++++++ spider/main.py | 323 +++++++ spider/requirements.txt | 23 + spider/work.py | 333 +++++++ 29 files changed, 2100 insertions(+), 48 deletions(-) rename {app => back}/apis/__init__.py (100%) rename {app => back}/apis/country/__init__.py (100%) rename {app => back}/apis/country/food/schema.py (100%) rename {app => back}/apis/country/food/view.py (100%) rename {app => back}/apis/country/info/schema.py (64%) rename {app => back}/apis/country/info/view.py (85%) rename {app => back}/apis/country/models.py (74%) rename {app => back}/apis/country/shop/schema.py (100%) rename {app => back}/apis/country/shop/view.py (100%) rename {app => back}/main.py (100%) rename {app => back}/migrations/models/0_20251118164406_init.py (100%) rename {app => back}/pyproject.toml (100%) rename {app => back}/settings.py (100%) rename {app => back}/utils/__init__.py 
(100%) rename {app => back}/utils/browser_api.py (100%) rename {app => back}/utils/decorators.py (100%) rename {app => back}/utils/exceptions.py (100%) rename {app => back}/utils/logs.py (100%) rename {app => back}/utils/out_base.py (100%) rename {app => back}/utils/redis_tool.py (100%) rename {app => back}/utils/session_store.py (100%) rename {app => back}/utils/time_tool.py (100%) create mode 100644 spider/bit_browser.py create mode 100644 spider/mail_.py create mode 100644 spider/main.py create mode 100644 spider/requirements.txt create mode 100644 spider/work.py diff --git a/.gitignore b/.gitignore index 142a69f..5a77b81 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ logs/sessions.json logs/sessions.log 222.py 333.py +444.py +ran diff --git a/README.md b/README.md index feaa5bf..4c5f7f9 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,6 @@ +# 0.0.3 +- 添加自动化脚本 +# 0.0.2 +- 修复随机接口 # 0.0.1 - 初始化项目 diff --git a/app/apis/__init__.py b/back/apis/__init__.py similarity index 100% rename from app/apis/__init__.py rename to back/apis/__init__.py diff --git a/app/apis/country/__init__.py b/back/apis/country/__init__.py similarity index 100% rename from app/apis/country/__init__.py rename to back/apis/country/__init__.py diff --git a/app/apis/country/food/schema.py b/back/apis/country/food/schema.py similarity index 100% rename from app/apis/country/food/schema.py rename to back/apis/country/food/schema.py diff --git a/app/apis/country/food/view.py b/back/apis/country/food/view.py similarity index 100% rename from app/apis/country/food/view.py rename to back/apis/country/food/view.py diff --git a/app/apis/country/info/schema.py b/back/apis/country/info/schema.py similarity index 64% rename from app/apis/country/info/schema.py rename to back/apis/country/info/schema.py index c906144..c7d8702 100644 --- a/app/apis/country/info/schema.py +++ b/back/apis/country/info/schema.py @@ -9,20 +9,22 @@ CHINA_TZ = timezone(timedelta(hours=8)) class Base(BaseModel): """ - 
基础地址信息模型 + 基础信息模型 - 包含地址相关的通用字段,供创建与输出模型复用 + 字段与数据库模型 Info 保持一致 """ - firstname: str = Field(..., description='名') - lastname: str = Field(..., description='姓') - full_name: str = Field(..., description='全名') + first_name: str = Field(..., description='名') + last_name: str = Field(..., description='姓') birthday: str = Field(..., description='生日') - street_address: str = Field(..., description='街道地址') + current_address: str = Field(..., description='街道地址') city: str = Field(..., description='城市') phone: str = Field(..., description='电话') - zip_code: str = Field(..., description='邮编') - state_fullname: str = Field(..., description='州全称') + postal_code: str = Field(..., description='邮编') + province: str = Field(..., description='州全称') status: bool = Field(False, description='状态') + email: str | None = Field(None, description='邮箱') + email_content: str | None = Field(None, description='邮件内容') + text: str | None = Field(None, description='文本内容') class Create(Base): @@ -36,16 +38,18 @@ class Update(BaseModel): """ 更新请求模型,支持部分更新 """ - firstname: str | None = Field(None, description='名') - lastname: str | None = Field(None, description='姓') - full_name: str | None = Field(None, description='全名') + first_name: str | None = Field(None, description='名') + last_name: str | None = Field(None, description='姓') birthday: str | None = Field(None, description='生日') - street_address: str | None = Field(None, description='街道地址') + current_address: str | None = Field(None, description='街道地址') city: str | None = Field(None, description='城市') phone: str | None = Field(None, description='电话') - zip_code: str | None = Field(None, description='邮编') - state_fullname: str | None = Field(None, description='州全称') + postal_code: str | None = Field(None, description='邮编') + province: str | None = Field(None, description='州全称') status: bool | None = Field(None, description='状态') + email: str | None = Field(None, description='邮箱') + email_content: str | None = Field(None, description='邮件内容') + 
text: str | None = Field(None, description='文本内容') class Out(TimestampModel, Base): diff --git a/app/apis/country/info/view.py b/back/apis/country/info/view.py similarity index 85% rename from app/apis/country/info/view.py rename to back/apis/country/info/view.py index c6de005..f28766c 100644 --- a/app/apis/country/info/view.py +++ b/back/apis/country/info/view.py @@ -30,18 +30,18 @@ async def post(item: Create = Body(..., description='创建数据')): @handle_exceptions_unified() async def gets( id: UUID | None = Query(None, description='主键ID'), - firstname: str | None = Query(None, description='名'), - lastname: str | None = Query(None, description='姓'), - full_name: str | None = Query(None, description='全名'), + first_name: str | None = Query(None, description='名'), + last_name: str | None = Query(None, description='姓'), birthday: str | None = Query(None, description='生日'), - street_address: str | None = Query(None, description='街道地址'), + current_address: str | None = Query(None, description='街道地址'), city: str | None = Query(None, description='城市'), phone: str | None = Query(None, description='电话'), - zip_code: str | None = Query(None, description='邮编'), - state_fullname: str | None = Query(None, description='州全称'), + postal_code: str | None = Query(None, description='邮编'), + province: str | None = Query(None, description='州全称'), status: bool | None = Query(None, description='状态'), + email: str | None = Query(None, description='邮箱'), order_by: str | None = Query('create_time', description='排序字段', - regex='^(-)?(id|firstname|lastname|city|zip_code|create_time|update_time)$'), + regex='^(-)?(id|first_name|last_name|city|postal_code|province|create_time|update_time)$'), res_count: bool = Query(False, description='是否返回总数'), create_time_start: str | int | None = Query( None, description='创建时间开始 (支持 YYYY-MM-DD / YYYY-MM-DD HH:mm:ss / 13位时间戳)'), @@ -60,24 +60,24 @@ async def gets( query = Info.all() if id: query = query.filter(id=id) - if firstname: - query = 
query.filter(firstname=firstname) - if lastname: - query = query.filter(lastname=lastname) - if full_name: - query = query.filter(full_name=full_name) + if first_name: + query = query.filter(first_name=first_name) + if last_name: + query = query.filter(last_name=last_name) if birthday: query = query.filter(birthday=birthday) - if street_address: - query = query.filter(street_address=street_address) + if current_address: + query = query.filter(current_address=current_address) if city: query = query.filter(city=city) if phone: query = query.filter(phone=phone) - if zip_code: - query = query.filter(zip_code=zip_code) - if state_fullname: - query = query.filter(state_fullname=state_fullname) + if postal_code: + query = query.filter(postal_code=postal_code) + if province: + query = query.filter(province=province) + if email: + query = query.filter(email=email) if status is not None: query = query.filter(status=status) if create_time_start: @@ -152,7 +152,7 @@ async def delete(id: UUID = Query(..., description='主键ID'), # 随机获取一条状态修改为True的记录 -@app.put("/one", response_model=Out, description='随机获取一条状态修改为True的记录', summary='随机获取一条状态修改为True的记录') +@app.get("/one", response_model=Out, description='随机获取一条状态修改为True的记录', summary='随机获取一条状态修改为True的记录') @handle_exceptions_unified() async def random_update_status(): """ diff --git a/app/apis/country/models.py b/back/apis/country/models.py similarity index 74% rename from app/apis/country/models.py rename to back/apis/country/models.py index 22107c6..ada533f 100644 --- a/app/apis/country/models.py +++ b/back/apis/country/models.py @@ -70,27 +70,33 @@ class Info(Model): 字段: id (UUIDField): 主键,默认使用 UUID 生成 - firstname (CharField): 名,最大长度 255 - lastname (CharField): 姓,最大长度 255 - full_name (CharField): 全名,最大长度 255 + first_name (CharField): 名,最大长度 255 + last_name (CharField): 姓,最大长度 255 birthday (CharField): 生日(原始字符串),最大长度 32 - street_address (CharField): 街道地址,最大长度 255 + current_address (CharField): 街道地址,最大长度 255 city (CharField): 城市,最大长度 
255 phone (CharField): 电话,最大长度 64 - zip_code (CharField): 邮编,最大长度 20 - state_fullname (CharField): 州全称,最大长度 255 + postal_code (CharField): 邮编,最大长度 20 + province (CharField): 州全称,最大长度 255 + status (BooleanField): 状态,默认值 False + email (CharField): 邮箱,最大长度 255, nullable 为 True + text (TextField): 文本内容, nullable 为 True + """ id = fields.UUIDField(pk=True, default=uuid.uuid4, description="ID") - firstname = fields.CharField(max_length=255, index=True, description="名") - lastname = fields.CharField(max_length=255, index=True, description="姓") - full_name = fields.CharField(max_length=255, index=True, description="全名") + first_name = fields.CharField(max_length=255, index=True, description="名") + last_name = fields.CharField(max_length=255, index=True, description="姓") birthday = fields.CharField(max_length=32, description="生日") - street_address = fields.CharField(max_length=255, index=True, description="街道地址") + current_address = fields.CharField(max_length=255, index=True, description="街道地址") city = fields.CharField(max_length=255, index=True, description="城市") phone = fields.CharField(max_length=64, description="电话") - zip_code = fields.CharField(max_length=20, index=True, description="邮编") - state_fullname = fields.CharField(max_length=255, index=True, description="州全称") + postal_code = fields.CharField(max_length=20, index=True, description="邮编") + province = fields.CharField(max_length=255, index=True, description="州全称") status = fields.BooleanField(default=False, description="状态") + # 邮件内容 + email = fields.CharField(max_length=255, unique=True, index=True, description="邮箱") + email_content = fields.TextField(null=True, description="邮件内容") + text = fields.TextField(null=True, description="文本内容") create_time = fields.DatetimeField(auto_now_add=True, index=True, description='创建时间') update_time = fields.DatetimeField(auto_now=True, description='更新时间') @@ -100,8 +106,8 @@ class Info(Model): table_description = "信息表" ordering = ["create_time"] indexes = [ - ("city", 
"zip_code", "state_fullname"), - ("firstname", "lastname"), + ("city", "postal_code", "province"), + ("first_name", "last_name"), ] def __repr__(self): diff --git a/app/apis/country/shop/schema.py b/back/apis/country/shop/schema.py similarity index 100% rename from app/apis/country/shop/schema.py rename to back/apis/country/shop/schema.py diff --git a/app/apis/country/shop/view.py b/back/apis/country/shop/view.py similarity index 100% rename from app/apis/country/shop/view.py rename to back/apis/country/shop/view.py diff --git a/app/main.py b/back/main.py similarity index 100% rename from app/main.py rename to back/main.py diff --git a/app/migrations/models/0_20251118164406_init.py b/back/migrations/models/0_20251118164406_init.py similarity index 100% rename from app/migrations/models/0_20251118164406_init.py rename to back/migrations/models/0_20251118164406_init.py diff --git a/app/pyproject.toml b/back/pyproject.toml similarity index 100% rename from app/pyproject.toml rename to back/pyproject.toml diff --git a/app/settings.py b/back/settings.py similarity index 100% rename from app/settings.py rename to back/settings.py diff --git a/app/utils/__init__.py b/back/utils/__init__.py similarity index 100% rename from app/utils/__init__.py rename to back/utils/__init__.py diff --git a/app/utils/browser_api.py b/back/utils/browser_api.py similarity index 100% rename from app/utils/browser_api.py rename to back/utils/browser_api.py diff --git a/app/utils/decorators.py b/back/utils/decorators.py similarity index 100% rename from app/utils/decorators.py rename to back/utils/decorators.py diff --git a/app/utils/exceptions.py b/back/utils/exceptions.py similarity index 100% rename from app/utils/exceptions.py rename to back/utils/exceptions.py diff --git a/app/utils/logs.py b/back/utils/logs.py similarity index 100% rename from app/utils/logs.py rename to back/utils/logs.py diff --git a/app/utils/out_base.py b/back/utils/out_base.py similarity index 100% rename from 
app/utils/out_base.py rename to back/utils/out_base.py diff --git a/app/utils/redis_tool.py b/back/utils/redis_tool.py similarity index 100% rename from app/utils/redis_tool.py rename to back/utils/redis_tool.py diff --git a/app/utils/session_store.py b/back/utils/session_store.py similarity index 100% rename from app/utils/session_store.py rename to back/utils/session_store.py diff --git a/app/utils/time_tool.py b/back/utils/time_tool.py similarity index 100% rename from app/utils/time_tool.py rename to back/utils/time_tool.py diff --git a/spider/bit_browser.py b/spider/bit_browser.py new file mode 100644 index 0000000..35057da --- /dev/null +++ b/spider/bit_browser.py @@ -0,0 +1,522 @@ +import os +import time +import aiohttp +import asyncio +import requests +from loguru import logger +from functools import wraps + + +def retry(max_retries: int = 3, delay: float = 1.0, backoff: float = 1.0): + """ + 通用重试装饰器 + :param max_retries: 最大重试次数 + :param delay: 每次重试的初始延迟(秒) + :param backoff: 每次重试延迟的递增倍数 + """ + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + retries = 0 + current_delay = delay + while retries < max_retries: + try: + return func(*args, **kwargs) + except Exception as e: + retries += 1 + if retries >= max_retries: + logger.warning(f"函数 {func.__name__} 在尝试了 {max_retries} 次后失败,错误信息: {e}") + return None # 重试次数用尽后返回 None + logger.warning(f"正在重试 {func.__name__} {retries + 1}/{max_retries} 因错误: {e}") + time.sleep(current_delay) + current_delay *= backoff + + return None # 三次重试仍未成功,返回 None + + return wrapper + + return decorator + + +def async_retry(max_retries: int = 3, delay: float = 1.0, backoff: float = 1.0): + """ + 支持异步函数的通用重试装饰器 + :param max_retries: 最大重试次数 + :param delay: 每次重试的初始延迟(秒) + :param backoff: 每次重试延迟的递增倍数 + """ + + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + retries = 0 + current_delay = delay + while retries < max_retries: + try: + return await func(*args, **kwargs) # 直接执行原始方法 + except 
Exception as e: + retries += 1 + if retries >= max_retries: + logger.warning(f"函数 {func.__name__} 在尝试了 {max_retries} 次后失败,错误信息: {e}") + return None # 重试次数用尽后返回 None + logger.warning(f"正在重试 {func.__name__} {retries + 1}/{max_retries} 因错误: {e}") + + await asyncio.sleep(current_delay) # 异步延迟 + current_delay *= backoff # 根据backoff递增延迟 + + return None # 三次重试仍未成功,返回 None + + return wrapper + + return decorator + + +# 比特浏览器模块 +class BitBrowser: + def __init__(self): + self.bit_host = "http://127.0.0.1" + pass + + # 创建比特币浏览器 + @retry(max_retries=3, delay=1.0, backoff=1.0) + def bit_browser_create(self, remark: str = '指纹浏览器', ua: str = None, host: str = None, port: str = None, + proxy_user: str = None, + proxy_pwd: str = None, proxy_type: str = 'noproxy', urls: str = None, + bit_port: str = "54345") -> str: + """ + 创建比特币浏览器 + :param bit_port: 可选,默认54345 + :param ua: 可选,默认随机 + :param proxy_type: 代理类型 (可选) ['noproxy', 'http', 'https', 'socks5', 'ssh'] + :param urls: 额外打开的url (可选) 多个用,分割 + :param host: 代理IP地址 (可选) + :param port: 代理IP端口 (可选) + :param proxy_user: 代理账号 (可选) + :param proxy_pwd: 代理密码 (可选) + :param remark: 备注 (可选) + :param bit_port: 可选,默认54345 + :return: 返回浏览器ID + """ + url = f"{self.bit_host}:{bit_port}/browser/update" + headers = {'Content-Type': 'application/json'} + data = { + 'name': f'{remark if len(remark) < 40 else remark[:40]}', # 窗口名称 + 'remark': f'{remark}', # 备注 + 'proxyMethod': 2, # 代理方式 2自定义 3 提取IP + # 代理类型 ['noproxy', 'http', 'https', 'socks5', 'ssh'] + 'proxyType': f'{proxy_type}', + "browserFingerPrint": {"userAgent": ua} # 留空,随机指纹 + } + if host is not None: + data['host'] = host + if port is not None: + data['port'] = port + if proxy_user is not None: + data['proxyUserName'] = proxy_user + if proxy_pwd is not None: + data['proxyPassword'] = proxy_pwd + if urls is not None: + data['url'] = urls # 额外打开的url 多个用,分割 + res = requests.post(url, json=data, headers=headers).json() + if not res.get('success'): + raise Exception(res) + browser_pk = 
res['data']['id'] + return browser_pk + + # 修改比特币浏览器 + @retry(max_retries=3, delay=1.0, backoff=1.0) + def bit_browser_update(self, pk: str, remark: str = None, proxyType: str = 'noproxy', host: str = None, + port: str = None, proxy_user: str = None, proxy_pwd: str = None, urls: str = None, + bit_port: str = "54345") -> bool: + """ + 修改比特币浏览器 传入某个参数则修改某个参数 + :param proxyType: 代理类型 noproxy|http|https|socks5(默认noproxy) + :param pk: # 浏览器ID + :param remark: # 备注 + :param host: # 代理主机 + :param port: # 代理端口 + :param proxy_user: # 代理账号 + :param proxy_pwd: # 代理密码 + :param urls: # 额外打开的url 多个用,分割 + :param bit_port: # 可选,默认54345 + :return: bool + """ + url = f"{self.bit_host}:{bit_port}/browser/update/partial" + headers = {'Content-Type': 'application/json'} + data = dict() + data['ids'] = [pk] + if remark is not None: + data['remark'] = remark + data['name'] = remark + if urls is not None: + data['url'] = urls + if proxyType != 'noproxy': + data['proxyType'] = proxyType + if host is not None: + data['host'] = host + if port is not None: + data['port'] = port if isinstance(port, int) else int(port) + if proxy_user is not None: + data['proxyUserName'] = proxy_user + if proxy_pwd is not None: + data['proxyPassword'] = proxy_pwd + res = requests.post(url, json=data, headers=headers).json() + if not res.get('success'): + raise Exception(res) + return True + + # 打开比特币浏览器 + @retry(max_retries=3, delay=1.0, backoff=1.0) + def bit_browser_open(self, pk: str, bit_port: str = "54345") -> str: + """ + 打开比特币浏览器 + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 返回浏览器地址 + """ + url = f"{self.bit_host}:{bit_port}/browser/open" + data = {"id": f'{pk}'} + headers = {'Content-Type': 'application/json'} + res = requests.post(url, json=data, headers=headers).json() + if not res.get('success'): + raise Exception(res) + debugger_address = res['data']['http'] + return debugger_address + + # 关闭比特币浏览器 + def bit_browser_close(self, pk: str, bit_port: str = "54345"): + """ + 关闭比特币浏览器 - 
执行后需要等待5s + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 无返回值 + """ + url = f"{self.bit_host}:{bit_port}/browser/close" + headers = {'Content-Type': 'application/json'} + data = {'id': f'{pk}'} + requests.post(url, json=data, headers=headers).json() + time.sleep(5) # 等待5s,等待浏览器关闭 + # 关闭浏览器进程 + # pid = self.bit_browser_pid(pk, bit_port) + # if pid is not None: + # os.system(f"kill -9 {pid}") + + # 删除比特币浏览器 + def bit_browser_delete(self, pk: str, bit_port: str = "54345"): + """ + 删除比特币浏览器 + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 无返回值 + """ + url = f"{self.bit_host}:{bit_port}/browser/delete" + headers = {'Content-Type': 'application/json'} + data = {'id': f'{pk}'} + print(requests.post(url, json=data, headers=headers).json()) + + # 获取所有比特币浏览器 + @retry(max_retries=3, delay=1.0, backoff=1.0) + def bit_browser_get(self, page: int = 0, limit: int = 10, group_id: str | None = None, + bit_port: str | None = "54345") -> dict: + """ + 获取所有比特币浏览器 + :param page: 页码 + :param limit: 每页数量 + :param group_id: 组ID(可选) + :param bit_port: 可选,默认54345 + :return: {'success': True, 'data': {'page': 1, 'pageSize': 10, 'totalNum': 128, 'list': [{'id': '12a3126accc14c93bd34adcccfc3083c'},{'id':'edc5d61a56214e9f8a8bbf1a2e1b405d'}]}} + """ + + url = f"{self.bit_host}:{bit_port}/browser/list" + headers = {'Content-Type': 'application/json'} + data = {'page': page, 'pageSize': limit} + if group_id is not None: + data['groupId'] = group_id + res = requests.post(url, json=data, headers=headers).json() + if not res.get('success'): + raise Exception(res) + return res + + # 获取比特浏览器窗口详情 + @retry(max_retries=3, delay=1.0, backoff=1.0) + def bit_browser_detail(self, pk: str, bit_port: str = "54345") -> dict: + """ + 获取比特浏览器窗口详情 + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: {'success': True, 'data': {'id': '12a3126accc14c93bd34adcccfc3083c', 'name': '12a3126accc14c93bd34adcccfc3083c', 'remark': '12a3126accc14c93bd34adcccfc3083c', ' + """ + url = 
f"{self.bit_host}:{bit_port}/browser/detail" + headers = {'Content-Type': 'application/json'} + data = {'id': f'{pk}'} + res = requests.post(url, json=data, headers=headers).json() + if not res.get('success'): + raise Exception(res) + return res + + # 获取比特浏览器的进程id + def bit_browser_pid(self, pk: str, bit_port: str = "54345") -> str: + """ + 获取比特浏览器的进程id + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 返回进程id + """ + url = f"{self.bit_host}:{bit_port}/browser/pids/alive" + headers = {'Content-Type': 'application/json'} + data = { + "ids": [pk] + } + res = requests.post(url, json=data, headers=headers).json() + if not res.get('success'): + raise Exception(res) + return res['data'][pk] + + @staticmethod + async def __request(method: str, url: str, params: dict = None, **kwargs) -> dict: + """ + 通用异步请求方法 + :param method: HTTP方法 (GET, POST, PUT, DELETE) + :param endpoint: API接口地址 + :param kwargs: 其他请求参数 (json, params等) + :return: 返回JSON数据 + """ + if params: + # 将布尔值转换为字符串或整数 + params = {k: str(v).lower() if isinstance(v, bool) else v for k, v in params.items()} + async with aiohttp.ClientSession() as session: + async with session.request(method, url, params=params, **kwargs) as response: + return await response.json() + + # 创建比特币浏览器 + @async_retry(max_retries=3, delay=1.0, backoff=1.0) + async def _bit_browser_create(self, remark: str = '指纹浏览器', ua: str = None, host: str = None, port: str = None, + proxy_user: str = None, + proxy_pwd: str = None, proxy_type: str = 'noproxy', urls: str = None, + bit_port: str = "54345") -> str: + """ + 创建比特币浏览器 + :param urls: 额外打开的url (可选) 多个用,分割 + :param remark: 备注 (可选) + :param bit_port: 可选,默认54345 + :return: 返回浏览器ID + """ + url = f"{self.bit_host}:{bit_port}/browser/update" + headers = {'Content-Type': 'application/json'} + data = { + 'name': f'{remark if len(remark) < 40 else remark[:40]}', # 窗口名称 + 'remark': f'{remark}', # 备注 + 'proxyType': f'{proxy_type}', + "browserFingerPrint": {"userAgent": ua} # 留空,随机指纹 + } + if host 
is not None: + data['host'] = host + if port is not None: + data['port'] = port + if proxy_user is not None: + data['proxyUserName'] = proxy_user + if proxy_pwd is not None: + data['proxyPassword'] = proxy_pwd + if urls is not None: + data['url'] = urls # 额外打开的url 多个用,分割 + res = await self.__request('POST', url, json=data, headers=headers) + if not res.get('success'): + raise Exception(res) + browser_pk = res['data']['id'] + return browser_pk + + # 修改比特币浏览器 + @async_retry(max_retries=3, delay=1.0, backoff=1.0) + async def _bit_browser_update(self, pk: str, remark: str = None, proxyType: str = 'noproxy', host: str = None, + port: str = None, proxy_user: str = None, proxy_pwd: str = None, urls: str = None, + bit_port: str = "54345") -> bool: + """ + 修改比特币浏览器 传入某个参数则修改某个参数 + :param pk: # 浏览器ID + :param remark: # 备注 + :param urls: # 额外打开的url 多个用,分割 + :param bit_port: # 可选,默认54345 + :return: + """ + url = f"{self.bit_host}:{bit_port}/browser/update/partial" + headers = {'Content-Type': 'application/json'} + data = dict() + data['ids'] = [pk] + if remark is not None: + data['remark'] = remark + data['name'] = remark + if urls is not None: + data['url'] = urls + if proxyType != 'noproxy': + data['proxyType'] = proxyType + if host is not None: + data['host'] = host + if port is not None: + data['port'] = port if isinstance(port, int) else int(port) + if proxy_user is not None: + data['proxyUserName'] = proxy_user + if proxy_pwd is not None: + data['proxyPassword'] = proxy_pwd + res = await self.__request('POST', url, json=data, headers=headers) + if not res.get('success'): + raise Exception(res) + return True + + # 打开比特币浏览器 + @async_retry(max_retries=3, delay=1.0, backoff=1.0) + async def _bit_browser_open(self, pk: str, bit_port: str = "54345") -> str: + """ + 打开比特币浏览器 + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 返回浏览器地址 + """ + url = f"{self.bit_host}:{bit_port}/browser/open" + data = {"id": f'{pk}'} + headers = {'Content-Type': 'application/json'} + res = 
await self.__request('POST', url, json=data, headers=headers) + if not res.get('success'): + raise Exception(res) + debugger_address = res['data']['http'] + return debugger_address + + # 关闭比特币浏览器 + async def _bit_browser_close(self, pk: str, bit_port: str = "54345"): + """ + 关闭比特币浏览器 - 执行后需要等待5s + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 无返回值 + """ + url = f"{self.bit_host}:{bit_port}/browser/close" + headers = {'Content-Type': 'application/json'} + data = {'id': f'{pk}'} + await self.__request('POST', url, json=data, headers=headers) + await asyncio.sleep(5) # 等待5s,等待浏览器关闭 + + # 删除比特币浏览器 + async def _bit_browser_delete(self, pk: str, bit_port: str = "54345"): + """ + 删除比特币浏览器 + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 无返回值 + """ + url = f"{self.bit_host}:{bit_port}/browser/delete" + headers = {'Content-Type': 'application/json'} + data = {'id': f'{pk}'} + print(await self.__request('POST', url, json=data, headers=headers)) + + # 获取所有比特币浏览器 + @async_retry(max_retries=3, delay=1.0, backoff=1.0) + async def _bit_browser_get(self, page: int = 0, limit: int = 10, group_id: str | None = None, + bit_port: str | None = "54345", + ) -> dict: + """ + 获取所有比特币浏览器 + :param page: 页码 + :param group_id: 分组ID + :param limit: 每页数量 + :param bit_port: 可选,默认54345 + :return: {'success': True, 'data': {'page': 1, 'pageSize': 10, 'totalNum': 128, 'list': [{'id': '12a3126accc14c93bd34adcccfc3083c'},{'id':'edc5d61a56214e9f8a8bbf1a2e1b405d'}]}} + """ + + url = f"{self.bit_host}:{bit_port}/browser/list" + headers = {'Content-Type': 'application/json'} + data = {'page': page, 'pageSize': limit} + if group_id is not None: + data['groupId'] = group_id + res = await self.__request('POST', url, json=data, headers=headers) + if not res.get('success'): + raise Exception(res) + return res + + # 获取比特浏览器窗口详情 + @async_retry(max_retries=3, delay=1.0, backoff=1.0) + async def _bit_browser_detail(self, pk: str, bit_port: str = "54345") -> dict: + """ + 获取比特浏览器窗口详情 + 
:param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: {'success': True, 'data': {'id': '12a3126accc14c93bd34adcccfc3083c', 'name': '12a3126accc14c93bd34adcccfc3083c', 'remark': '12a3126accc14c93bd34adcccfc3083c', 'groupId': '12a3126accc14c93bd34adcccfc3083c', 'proxyType + """ + url = f"{self.bit_host}:{bit_port}/browser/detail" + headers = {'Content-Type': 'application/json'} + data = { + "id": pk + } + res = await self.__request('POST', url, json=data, headers=headers) + if not res.get('success'): + raise Exception(res) + return res + + # 获取比特浏览器的进程id并杀死进程 + @async_retry(max_retries=3, delay=1.0, backoff=1.0) + async def _bit_browser_kill_pid(self, pk: str, bit_port: str = "54345") -> str: + """ + 获取比特浏览器的进程id + :param pk: 浏览器ID + :param bit_port: 可选,默认54345 + :return: 返回进程id + """ + url = f"{self.bit_host}:{bit_port}/browser/pids/alive" + headers = {'Content-Type': 'application/json'} + data = { + "ids": [pk] + } + res = await self.__request('POST', url, json=data, headers=headers) + if not res.get('success'): + raise Exception(res) + pid = res['data'][pk] + # 检测系统 并杀死进程 + if pid is not None: + if os.name == 'nt': + os.system(f"taskkill /F /PID {pid}") + else: + os.system(f"kill -9 {pid}") + return pid + + +async def main(): + bit = BitBrowser() + # res = await bit._bit_browser_get() + jc = 0 + while 1: + res = await bit._bit_browser_get( + page=jc, + limit=100, + group_id='4028808b9a52223a019a581bbea1275c') + li = res["data"]["list"] + if len(li) == 0: + break + + for i in li: + id = i["id"] + # 读取浏览器详情 + res = await bit._bit_browser_detail(id) + + # print(f'id -->{id} --> {res}') + data = res["data"] + ua = data["browserFingerPrint"]["userAgent"] + proxy_type = data.get("proxyType") + host = data.get("host") + port = data.get("port") + proxy_account = data.get("proxyUserName") + proxy_password = data.get("proxyPassword") + print(f'id -->{id}') + print(f'ua -->{ua}') + print(f'proxy_type -->{proxy_type}') + print(f'host -->{host}') + print(f'port -->{port}') 
+ print(f'proxy_account -->{proxy_account}') + print(f'proxy_password -->{proxy_password}') + print(f'='*50) + jc += 1 + + + +bit_browser = BitBrowser() + +# if __name__ == '__main__': +# asyncio.run(main()) diff --git a/spider/mail_.py b/spider/mail_.py new file mode 100644 index 0000000..4bfbc17 --- /dev/null +++ b/spider/mail_.py @@ -0,0 +1,835 @@ +import asyncio +import imaplib +import email +import random +import socket +import string +import time +from email.header import decode_header +from datetime import timezone, timedelta +import email.utils +import aiohttp +import socks +import requests +import smtplib +from email.mime.text import MIMEText +from email.header import Header +from functools import wraps +from loguru import logger + + +def retry(max_retries: int = 3, delay: float = 1.0, backoff: float = 1.0): + """ + 通用重试装饰器 + :param max_retries: 最大重试次数 + :param delay: 每次重试的初始延迟(秒) + :param backoff: 每次重试延迟的递增倍数 + """ + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + retries = 0 + current_delay = delay + while retries < max_retries: + try: + return func(*args, **kwargs) + except Exception as e: + retries += 1 + if retries >= max_retries: + logger.warning(f"函数 {func.__name__} 在尝试了 {max_retries} 次后失败,错误信息: {e}") + return None # 重试次数用尽后返回 None + logger.warning(f"正在重试 {func.__name__} {retries + 1}/{max_retries} 因错误: {e}") + time.sleep(current_delay) + current_delay *= backoff + + return None # 三次重试仍未成功,返回 None + + return wrapper + + return decorator + + +def async_retry(max_retries: int = 3, delay: float = 1.0, backoff: float = 1.0): + """ + 支持异步函数的通用重试装饰器 + :param max_retries: 最大重试次数 + :param delay: 每次重试的初始延迟(秒) + :param backoff: 每次重试延迟的递增倍数 + """ + + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + retries = 0 + current_delay = delay + while retries < max_retries: + try: + return await func(*args, **kwargs) # 直接执行原始方法 + except Exception as e: + retries += 1 + if retries >= max_retries: + logger.warning(f"函数 
# Domain registry: a single place that owns every mailbox-domain lookup.
class DomainManager:
    """
    Registry of the mailbox domains supported by the mail helpers.

    A "mail type" is the position of a domain in the internal list, so the
    order of the list is part of the public contract and must not change.
    Domain lookups ignore letter case and surrounding whitespace.
    """

    # Index used whenever a type/domain is unknown (qianyouduo.com).
    _DEFAULT_TYPE = 1

    def __init__(self):
        # To support a new domain, append it here (appending keeps existing
        # type numbers stable).
        # NOTE(review): "qianyouduo.com" appears twice (index 1 and 5). The
        # duplicate is kept on purpose: removing it would renumber every
        # later domain.
        self._domains = [
            "gmail.com",
            "qianyouduo.com",
            "rxybb.com",
            "cqrxy.vip",
            "0n.lv",
            "qianyouduo.com",
            "ziyouzuan.com",
            "emaing.online",
            "emaing.fun",
            "emaing.asia",
            "isemaing.site",
            "emaing.cyou",
            "emaing.site",
            "emaing.icu",
            "emaing.store",
            "emaing.pw",
            "emaing.xyz",
            "qydkjgs.asia",
            "qydgs.autos",
            "qydkj.homes",
            "qydkjgs.baby",
            "qydkj.baby",
            "qydkj.cyou",
            "qydkjgs.autos",
            "qydkj.autos",
            "qydkjgs.cyou",
            "qydkjgs.homes",
            "qydgs.asia",
            "qydkj.asia",
            "qydgs.baby",
            "qydgs.cyou",
            "qydgs.homes",
            "lulanjing.asia",
            "lisihan.asia",
            "mmwan.asia",
            "xyttan.asia",
            "zpaily.asia",
            "youxinzhiguo.asia",
            "huijinfenmu.asia",
            "linghao.asia",
            "cqhc.asia",
            "huacun.asia",
            "huachen.asia",
            "yisabeier.asia",
            "xinxinr.cyou",
            "lilisi.asia",
            "xybbwan.cyou",
            "zhongjing.cyou",
            "zprxy.cyou",
            "cqhuacun.cyou",
            "huazong.icu",
            "huacun.cyou",
        ]
        # domain -> first index; setdefault keeps the first occurrence so the
        # result matches what list.index() returned before.
        self._index = {}
        for position, name in enumerate(self._domains):
            self._index.setdefault(name, position)

    @staticmethod
    def _normalize(domain):
        """Return the lookup key for *domain*, or None when it is not a string."""
        if not isinstance(domain, str):
            return None
        return domain.strip().lower()

    def get_domain_by_type(self, mail_type: int) -> str:
        """
        Return the domain registered for a mail type.

        :param mail_type: mail type number (index into the domain list)
        :return: the matching domain, or qianyouduo.com when out of range
        """
        if 0 <= mail_type < len(self._domains):
            return self._domains[mail_type]
        return self._domains[self._DEFAULT_TYPE]

    def get_domain_type(self, domain: str) -> int:
        """
        Return the mail type number of a domain.

        :param domain: domain name (case-insensitive)
        :return: the type number, or 1 (qianyouduo.com) when unknown
        """
        return self._index.get(self._normalize(domain), self._DEFAULT_TYPE)

    def get_imap_server(self, mail_type: int) -> str:
        """
        Return the IMAP host for a mail type.

        :param mail_type: mail type number
        :return: ``imap.<domain>``
        """
        return f"imap.{self.get_domain_by_type(mail_type)}"

    def get_imap_server_by_domain(self, domain: str) -> str:
        """
        Return the IMAP host for a domain.

        :param domain: domain name (used as given, not validated)
        :return: ``imap.<domain>``
        """
        return f"imap.{domain}"

    def is_valid_domain(self, domain: str) -> bool:
        """
        Tell whether a domain is in the supported list.

        :param domain: domain name (case-insensitive)
        :return: True when supported
        """
        return self._normalize(domain) in self._index

    def get_all_domains(self) -> list:
        """
        Return every registered domain.

        :return: a copy of the domain list (safe for the caller to mutate)
        """
        return self._domains.copy()

    def get_domain_count(self) -> int:
        """
        Return the number of registered entries (duplicates included).

        :return: length of the domain list
        """
        return len(self._domains)

    def get_creatable_domains(self) -> list:
        """
        Return the domains on which mailboxes can be created.

        :return: every registered domain except gmail.com
        """
        return [domain for domain in self._domains if domain != "gmail.com"]

    def get_creatable_domain_by_type(self, mail_type: int) -> str:
        """
        Return a creatable domain for a mail type.

        :param mail_type: mail type number
        :return: the matching domain; qianyouduo.com replaces gmail.com
        """
        domain = self.get_domain_by_type(mail_type)
        if domain == "gmail.com":
            return self._domains[self._DEFAULT_TYPE]
        return domain
# Create a mailbox with a random local part (method of ``Mail``).
@retry(max_retries=3, delay=1.0, backoff=1.0)
def email_create_random(self, count: int = 8, pwd: str = 'Zpaily88', mail_type: int = 1,
                        timeout: float = 30.0) -> str:
    """
    Create a mailbox whose local part is random lowercase letters and digits.

    :param count: length of the random local part (default 8)
    :param pwd: mailbox password (default Zpaily88)
    :param mail_type: mail type number (1 = qianyouduo.com, 2 = rxybb.com,
        3 = cqrxy.vip, 4 = 0n.lv; default 1). gmail.com is never used.
    :param timeout: seconds to wait for the admin API before giving up; a
        timeout raises and is retried like any other failure
    :return: the mailbox address; ``None`` when the ``retry`` decorator
        gives up after repeated failures
    """
    # NOTE(review): the admin credential is hard-coded in this header;
    # consider loading it from configuration instead of source control.
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Authorization": "Basic YWRtaW5AcWlhbnlvdWR1by5jb206WnBhaWx5ODgh",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "Origin": "https://mail.qianyouduo.com",
        "Pragma": "no-cache",
        "Referer": "https://mail.qianyouduo.com/admin/api/doc",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "accept": "*/*",
        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    url = "https://mail.qianyouduo.com/admin/api/v1/boxes"
    name = ''.join(random.choices(string.ascii_letters + string.digits, k=count)).lower()

    # Ask the domain manager for a creatable domain (gmail.com is excluded).
    mail_end = self.domain_manager.get_creatable_domain_by_type(mail_type)
    address = f"{name}@{mail_end}"
    data = {
        "name": name,
        "email": address,
        "passwordPlaintext": pwd
    }
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # An already-existing mailbox is treated as success.
    if 'Validation errors: [user] This combination of username and domain is already in database' in response.text:
        return address
    if response.status_code != 201:
        raise Exception(response.status_code)
    return address
# Create a mailbox with a caller-chosen address (method of ``Mail``).
@retry(max_retries=3, delay=1.0, backoff=1.0)
def email_create(self, account: str, pwd: str = 'Zpaily88', timeout: float = 30.0) -> str | None:
    """
    Create the mailbox ``account`` through the mail admin API.

    :param account: full mailbox address (``local@domain``)
    :param pwd: mailbox password (default Zpaily88)
    :param timeout: seconds to wait for the admin API before giving up; a
        timeout raises and is retried like any other failure
    :return: the mailbox address; ``None`` for gmail.com addresses, and
        ``None`` when the ``retry`` decorator gives up (this includes
        malformed addresses and unsupported domains, which raise inside
        the decorated body)
    """
    # NOTE(review): the admin credential is hard-coded in this header;
    # consider loading it from configuration instead of source control.
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Authorization": "Basic YWRtaW5AcWlhbnlvdWR1by5jb206WnBhaWx5ODgh",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "Origin": "https://mail.qianyouduo.com",
        "Pragma": "no-cache",
        "Referer": "https://mail.qianyouduo.com/admin/api/doc",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "accept": "*/*",
        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    url = "https://mail.qianyouduo.com/admin/api/v1/boxes"
    # Split once; an address without "@" raises IndexError here, as before.
    pieces = account.split('@')
    name, mail_end = pieces[0], pieces[1]

    # gmail.com mailboxes cannot be created through this API.
    if mail_end == "gmail.com":
        return None
    # Reject domains the domain manager does not know.
    if not self.domain_manager.is_valid_domain(mail_end):
        raise ValueError(f"不支持的域名: {mail_end},支持的域名列表: {self.domain_manager.get_all_domains()}")

    address = f"{name}@{mail_end}"
    data = {
        "name": name,
        "email": address,
        "passwordPlaintext": pwd
    }
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    print(f'创建邮箱响应: {response.status_code}')
    # 400 is accepted alongside 201 (presumably "already exists" — confirm against the API).
    if response.status_code not in [201, 400]:
        raise Exception(response.status_code)
    return address
# Delete a mailbox (method of ``Mail``).
@retry(max_retries=3, delay=1.0, backoff=1.0)
def email_delete(self, account: str, timeout: float = 30.0) -> bool:
    """
    Delete the mailbox ``account`` through the mail admin API.

    :param account: full mailbox address
    :param timeout: seconds to wait for the admin API before giving up; a
        timeout raises and is retried like any other failure
    :return: True when the API answers 204 or 404 (already gone); False for
        gmail.com addresses, which are never deleted; ``None`` when the
        ``retry`` decorator gives up after repeated failures
    """
    # Gmail accounts are not managed by this server; skip before any I/O.
    if '@gmail.com' in account:
        return False

    # NOTE(review): the admin credential is hard-coded in this header;
    # consider loading it from configuration instead of source control.
    headers = {
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Authorization": "Basic YWRtaW5AcWlhbnlvdWR1by5jb206WnBhaWx5ODgh",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "Origin": "https://mail.qianyouduo.com",
        "Pragma": "no-cache",
        "Referer": "https://mail.qianyouduo.com/admin/api/doc",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "accept": "*/*",
        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    url = f"https://mail.qianyouduo.com/admin/api/v1/boxes/{account}"
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.delete(url, headers=headers, timeout=timeout)
    print(f'删除邮箱响应: --> {response.status_code}')
    if response.status_code not in [204, 404]:
        raise Exception(response.status_code)
    return True
"https://mail.qianyouduo.com", + "Pragma": "no-cache", + "Referer": "https://mail.qianyouduo.com/admin/api/doc", + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-origin", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + "accept": "*/*", + "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"", + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": "\"macOS\"" + } + url = f"https://mail.qianyouduo.com/admin/api/v1/boxes/{account}" + if '@gmail.com' in account: + return False + async with aiohttp.ClientSession() as session: + async with session.delete(url, headers=headers) as response: + status = response.status + if status not in [204, 404]: + raise Exception(f'status code: {status}') + return True + + # 处理邮件正文 + @staticmethod + def extract_body(msg): + """ + 提取邮件正文,优先返回 HTML 文本 + - 更健壮的字符集解析:优先使用 part 的 charset 信息,失败回退到 utf-8 / latin-1 + - 仅处理 inline 的 text/html 与 text/plain 内容 + """ + html_text = None + plain_text = None + + def _decode_part(part): + payload = part.get_payload(decode=True) + if payload is None: + return None + # 优先从内容中解析 charset + charset = (part.get_content_charset() or part.get_param('charset') or 'utf-8') + try: + return payload.decode(charset, errors='replace') + except LookupError: + # 未知编码时回退 + try: + return payload.decode('utf-8', errors='replace') + except Exception: + return payload.decode('latin-1', errors='replace') + + if msg.is_multipart(): + for part in msg.walk(): + content_type = part.get_content_type() + content_disposition = part.get_content_disposition() + + if content_type == "text/html" and (not content_disposition or content_disposition == "inline"): + html_text = _decode_part(part) or html_text + elif content_type == "text/plain" and (not content_disposition or content_disposition == "inline"): + plain_text = _decode_part(part) or plain_text + else: + content_type = 
# Convert a mail "Date" header to epoch seconds (static method of ``Mail``).
@staticmethod
def convert_to_china_time(date_str):
    """
    Convert an RFC 2822 date string into a 10-digit Unix timestamp.

    The header's own UTC offset is honoured; a date without an offset is
    interpreted as UTC. A Unix timestamp is independent of time zone, so no
    conversion to China time is needed to obtain it (the previous
    ``astimezone(+08:00)`` step did not change the result and was removed).

    :param date_str: value of the mail's ``Date`` header
    :return: epoch seconds as ``int``; the current time when the value
        cannot be parsed, so a bad header never aborts mail processing
    """
    try:
        email_date = email.utils.parsedate_to_datetime(date_str)
        if email_date is None:
            return int(time.time())
        if email_date.tzinfo is None:
            # No offset in the header: treat the wall-clock time as UTC.
            email_date = email_date.replace(tzinfo=timezone.utc)
        return int(email_date.timestamp())
    except Exception:
        # Deliberate best-effort: any parsing problem falls back to "now".
        return int(time.time())
保存原始socket + original_socket = None + if proxy_host is not None and proxy_port is not None: + original_socket = socket.socket + if proxy_user is not None and proxy_pwd is not None: + socks.setdefaultproxy(socks.SOCKS5, proxy_host, int(proxy_port), True, proxy_user, proxy_pwd) + else: + socks.setdefaultproxy(socks.SOCKS5, proxy_host, int(proxy_port), True) + socket.socket = socks.socksocket + + imap_server = None + had_error = False + try: + # 在设置代理后创建IMAP连接 + imap_server = imaplib.IMAP4_SSL(self.domain_manager.get_imap_server(mail_type)) + if not imap_server: + had_error = True + else: + + # pwd去除空格 + pwd = pwd.replace(' ', '') + # print(f'pwd: {pwd}') + imap_server.login(user, pwd) + status, _ = imap_server.select("INBOX") + if status != 'OK': + had_error = True + else: + status, email_ids = imap_server.search(None, "ALL") + if status != 'OK': + had_error = True + else: + email_id_list = email_ids[0].split() + + # 获取最近limit条邮件ID + recent_ids = email_id_list[-20:] # 仍然获取最近20封以确保有足够的邮件可以筛选 + found_count = 0 # 记录找到的符合条件的邮件数量 + + for email_id in recent_ids[::-1]: # 从最新的邮件开始处理 + if found_count >= limit: # 如果已经找到足够数量的邮件,就退出循环 + break + + status, msg_data = imap_server.fetch(email_id, "(RFC822)") + for response in msg_data: + if isinstance(response, tuple): + msg = email.message_from_bytes(response[1]) + # 兼容性发件人匹配:解析地址与显示名,大小写不敏感,支持子串匹配 + from_field = msg.get("From", "") + addresses = email.utils.getaddresses([from_field]) + needle = (from_ or "").lower() + candidates = [] + for name, addr in addresses: + if name: + candidates.append(name.lower()) + if addr: + candidates.append(addr.lower()) + if any(needle in c for c in candidates): + # 标题解码,处理无标题或编码缺失的情况 + raw_subject = msg.get("Subject") + subject = "" + if raw_subject is not None: + dh = decode_header(raw_subject) + if dh: + s, enc = dh[0] + if isinstance(s, bytes): + try: + subject = s.decode(enc or 'utf-8', errors='replace') + except LookupError: + subject = s.decode('utf-8', errors='replace') + else: + subject = 
s + + item = { + "title": subject, + "from": msg["From"], + "content": self.extract_body(msg), + "code": 200 + } + + # 获取并转换邮件时间 + date_str = msg["Date"] + if date_str: + item["date"] = self.convert_to_china_time(date_str) + + items.append(item) + found_count += 1 + + if found_count >= limit: # 如果已经找到足够数量的邮件,就跳出内层循环 + break + + # 读取完成不再对单封邮件做删除标记与 expunge + + except imaplib.IMAP4.error as e: + # items.append({'title': 'error', 'from': 'error', 'content': f'连接邮箱失败: {e}', 'code': 500}) + had_error = True + except Exception as e: + # items.append({'title': 'error', 'from': 'error', 'content': f'获取邮件异常: {e}', 'code': 500}) + had_error = True + finally: + try: + # 检查连接是否建立 + if 'imap_server' in locals() and imap_server is not None: + try: + # 先检查是否处于已选择状态 + if hasattr(imap_server, 'state') and imap_server.state == 'SELECTED': + imap_server.close() + except Exception as e: + logger.error(f"关闭IMAP文件夹时发生错误: {e}") + try: + # 无论如何尝试登出 + imap_server.logout() + except Exception as e: + logger.error(f"登出IMAP服务器时发生错误: {e}") + # 在Windows上可能需要强制关闭socket + try: + if hasattr(imap_server, 'sock') and imap_server.sock is not None: + imap_server.sock.close() + except Exception as sock_err: + logger.error(f"强制关闭socket时发生错误: {sock_err}") + except Exception as outer_e: + logger.error(f"处理IMAP连接关闭时发生错误: {outer_e}") + finally: + # 重置socket设置(如果使用了代理) + if proxy_host is not None and original_socket is not None: + socket.socket = original_socket + + # 若成功获取到至少一封匹配邮件且请求删除,则删除整个邮箱账号 + if is_del and len(items) > 0: + try: + self.email_delete(user) + except Exception as del_err: + logger.error(f"删除邮箱账号失败: {del_err}") + + if had_error: + return None + if len(items) == 0: + return None + return items # 返回邮件列表 + + +async def main(): + """ + 使用示例:展示新的域名管理系统的使用方法 + """ + mail = Mail() + mai = '0gz3vvd4@'+'qydgs.asia' + res = mail.email_create(mai) + print(f"创建的邮箱: {res}") + # random_email = mail.email_create_random(count=8, mail_type=1) + # print(f"创建的随机邮箱: {random_email}") + + # 读取邮件 + # res = 
mail.email_read('0gz3vvd4@qydgs.asia', '@', 1, is_del=True) + # print(f'读取的邮件: {res}') + + # 删除邮箱 + res = mail.email_delete(mai) + print(f"删除的邮箱: {res}") + +mail_ = Mail() + +# if __name__ == '__main__': +# asyncio.run(main()) diff --git a/spider/main.py b/spider/main.py new file mode 100644 index 0000000..a336c0f --- /dev/null +++ b/spider/main.py @@ -0,0 +1,323 @@ +from math import log +import random +from re import S +import time +from tkinter import N +from DrissionPage import Chromium +from loguru import logger +from work import get_random_canada_info +from mail_ import mail_ +from bit_browser import bit_browser + +class Auto: + def __init__(self,http:str): + self.browser = Chromium(http) + self.tab = self.browser.latest_tab + pass + + + # cf打码 + def solve_cloudflare(self): + tab = self.browser.latest_tab + for _ in range(8): + self.tab.wait(1) + try: + shadow1 = tab.ele( + 'x://*[@name="cf-turnstile-response"]').parent().shadow_root + iframe = shadow1.get_frame(1) + if iframe: + logger.debug("找到Cloudflare iframe") + shadow2 = iframe.ele('x:/html/body').shadow_root + if shadow2: + logger.debug("找到Cloudflare iframe body shadow root") + status = shadow2.ele( + 'x://span[text()="Success!"]', timeout=1) + if status: + logger.debug("Cloudflare验证成功") + return True + checkbox = shadow2.ele( + 'x://input[@type="checkbox"]', timeout=1) + if checkbox: + checkbox.click() + logger.debug("点击Cloudflare复选框") + tab.wait(2) + logger.debug("重新获取状态") + # return False + except Exception as e: + # logger.error(f"处理Cloudflare异常: {e}") + logger.debug(f"cloudflare处理通过: {e}") + return True + tab.wait(1) + return False + + # 打开URL + def open_url(self, url: str): + self.tab.get(url) + + # 等待进入首页 + def wait_home(self): + logger.debug("等待进入首页") + jc = 0 + while True: + if jc > 5: + logger.error("等待进入首页超过5次,未成功") + return False + self.tab.wait(1) + # 判断cf是否通过 + bol = self.solve_cloudflare() + if not bol: + logger.debug("Cloudflare验证失败.") + continue + else: + logger.debug("Cloudflare验证成功.") 
+ self.tab.wait(1.5) + html = self.tab.url + logger.debug(f"当前URL: {html}") + if 'https://veritaconnect.ca/canadianbreadsettlement/en-us' == html: + logger.debug("成功进入首页") + return True + jc += 1 + + # 点击continue按钮 + def click_continue(self, bl: bool = False): + logger.debug("点击Continue按钮") + jc = 0 + while True: + if jc > 5: + logger.error("点击Continue按钮超过5次,未成功") + return False + try: + continue_button = self.tab.ele( + 't:button@text():Continue', timeout=1) + if continue_button: + # 判断cf是否通过 + bol = self.solve_cloudflare() + if not bol: + logger.debug("Cloudflare验证失败..") + continue + else: + logger.debug("Cloudflare验证成功..") + continue_button.click() + logger.debug("点击Continue按钮成功") + self.tab.wait(1.5) + + bol = self.tab.ele( + 't:li@text():There was a problem, please try again.', timeout=1) + if bol: + if bl: + logger.debug("多次异常界面, 结束继续点击") + return False + logger.debug("异常界面") + self.tab.wait(1) + return self.click_continue(bl=True) + # bol = self.tab.ele('t:h2@text()=Claim Form', timeout=1) + # if bol: + # logger.debug("成功进入问卷界面") + # return True + html = self.tab.url + logger.debug(f"当前URL: {html}") + if 'https://veritaconnect.ca/canadianbreadsettlement/en-us/Claimant/UnknownClaimForm' in html: + logger.debug("成功进入问卷界面") + return True + except Exception as e: + logger.error(f"点击Continue按钮异常: {e}") + self.tab.wait(1) + return False + + # 随机取城市 + def get_random_city(self, province: str|None=None): + cities = { + "Alberta": ["Calgary", "Edmonton"], + "British Columbia": ["Vancouver"], + # "Manitoba": ["Winnipeg", "Rochester"], + # "New Brunswick": ["Fredericton", "Moncton"], + # "Newfoundland and Labrador": ["St. John's", "Halifax"], + "Nova Scotia": ["Halifax"], + "Ontario": ["Toronto"], + # "Prince Edward Island": ["Charlottetown", "St. 
John's"], + # "Quebec": ["Quebec City", "Montreal"], + # "Saskatchewan": ["Saskatoon", "Regina"], + } + if province is None: + province = random.choice(list(cities.keys())) + return province,random.choice(cities.get(province, [])) + + # 填写问卷 + def fill_questionnaire(self): + province, city = self.get_random_city() + info = get_random_canada_info(province, city) + first_name = info["firstname"] + last_name = info["lastname"] + # 将生日格式从 '8/28/1995' 转为 'yyyy-mm-dd',日月不足两位补0 + birthday = info["birthday"] + current_address = info["address_str"] + city = info["city_name"] + province = info["province"] + postal_code = info["postcode"] + email = 'sfsf@qq.com' + phone = info["phone"] + text = '3333' + # 人数 + person_count = str(random.randint(3, 5)) + logger.debug("填写问卷") + self.tab.wait(0.1) + logger.debug(f"填写first_name: {first_name}") + self.tab.ele('t:input@id=FirstName').set.value(first_name) + self.tab.wait(0.1) + logger.debug(f"填写last_name: {last_name}") + self.tab.ele('t:input@id=LastName').set.value(last_name) + self.tab.wait(0.1) + logger.debug(f"填写birthday: {birthday}") + self.tab.ele('t:input@id=DateOfBirth').set.value(birthday) + self.tab.wait(0.1) + logger.debug(f"填写current_address: {current_address}") + self.tab.ele('t:input@id=AddressLine1').set.value(current_address) + self.tab.wait(0.1) + logger.debug(f"填写city: {city}") + self.tab.ele('t:input@id=City').set.value(city) + self.tab.wait(0.1) + logger.debug(f"填写province: {province}") + self.tab.ele( + 't:select@id=CanProv').ele(f't:option@text()={province}').click() + self.tab.wait(0.1) + logger.debug(f"填写postal_code: {postal_code}") + self.tab.ele('t:input@id=CanPostal').set.value(postal_code) + self.tab.wait(0.1) + logger.debug(f"填写NumberOfAdults: {person_count}") + self.tab.ele( + 't:select@id=NumberOfAdults').ele(f't:option@text()={person_count}').click() + self.tab.wait(0.1) + logger.debug(f"选择地址没变") + self.tab.eles('t:input@id=IsDifferentAddress')[1].click() + self.tab.wait(0.1) + logger.debug(f"填写email: 
{email}") + self.tab.ele('t:input@id=EmailAddress').set.value(email) + self.tab.wait(0.1) + logger.debug(f"填写ConfirmEmailAddress: {email}") + self.tab.ele('t:input@id=ConfirmEmailAddress').set.value(email) + self.tab.wait(0.1) + logger.debug(f"填写phone: {phone}") + self.tab.ele('t:input@id=PhoneNumber').set.value(phone) + self.tab.wait(0.1) + logger.debug(f"选择同意条款") + self.tab.ele('t:input@id=IVerify').click() + self.tab.wait(0.1) + logger.debug(f"选择没有申请过") + self.tab.eles('t:input@id=IsCompensated')[1].click() + self.tab.wait(0.1) + logger.debug(f"填写text: {text}") + self.tab.ele('t:textarea@id=MetaAnswerA').set.value(text) + self.tab.wait(0.1) + logger.debug(f"勾选同意我的名字") + self.tab.ele('t:input@id=IDeclare').click() + self.tab.wait(0.1) + logger.debug(f"填写PrintName: {last_name+' '+first_name}") + self.tab.ele( + 't:input@id=PrintName').set.value(last_name+' '+first_name) + self.tab.wait(0.1) + # logger.debug(f"点击Submit按钮") + # self.tab.ele('t:button@text():Submit').click() + + +# 取对应城市的代理 +def get_proxy( city: str): + if city == "Calgary": + return "us.novproxy.io:1000:uwqr8065-region-CA-st-Alberta-city-Calgary:d6vqwerx".split(':') + elif city =='Edmonton': + return 'us.novproxy.io:1000:uwqr8065-region-CA-st-Alberta-city-Edmonton:d6vqwerx'.split(':') + elif city =='Vancouver': + return 'us.novproxy.io:1000:uwqr8065-region-CA-st-British Columbia-city-Vancouver:d6vqwerx'.split(':') + elif city =='Halifax': + return 'us.novproxy.io:1000:uwqr8065-region-CA-st-Nova Scotia-city-Halifax:d6vqwerx'.split(':') + elif city == 'Toronto': + return 'us.novproxy.io:1000:uwqr8065-region-CA-st-Ontario-city-Toronto:d6vqwerx'.split(':') + else: + return None + + +"""指纹浏览器操作""" +# 创建指纹浏览器 +def create_fingerprint_browser(city: str): + """ + 根据城市创建指纹浏览器并执行问卷流程 + + 参数: + city (str): 城市名称,例如 `Calgary`、`Edmonton` 等 + """ + browser_id = None + try: + proxy = get_proxy(city) + logger.info(f"{city} 准备创建指纹浏览器") + browser_id = bit_browser.bit_browser_create( + remark=city, + host=proxy[0], + 
port=proxy[1], + proxy_user=proxy[2], + proxy_pwd=proxy[3], + proxy_type='socks5' + ) + logger.debug(browser_id) + # 打开指纹浏览器 + http = bit_browser.bit_browser_open(browser_id) + logger.debug(http) + auto = Auto(http) + auto.open_url( + "https://veritaconnect.ca/canadianbreadsettlement/en-us/Claimant/UnknownClaimForm") + bol = auto.wait_home() + if not bol: + logger.error(f"{city} 进入首页失败,结束该线程") + return + + bol = auto.click_continue() + if not bol: + logger.error(f"{city} 点击 Continue 失败,结束该线程") + return + auto.fill_questionnaire() + time.sleep(5) + finally: + if browser_id: + # 关闭指纹浏览器 + try: + bit_browser.bit_browser_close(browser_id) + except Exception as e: + logger.error(f"{city} 关闭浏览器异常: {e}") + # 删除指纹浏览器 + try: + bit_browser.bit_browser_delete(browser_id) + except Exception as e: + logger.error(f"{city} 删除浏览器异常: {e}") + +def run_city_forever(city: str): + """ + 持续循环运行指定城市流程:完成一次即关闭并删除浏览器,然后重新创建继续运行 + + 参数: + city (str): 城市名称 + """ + while True: + try: + create_fingerprint_browser(city) + except Exception as e: + logger.error(f"{city} 流程异常: {e}") + time.sleep(2) + +def run_all_cities_concurrently(): + """ + 多线程并发运行所有城市流程 + """ + import threading + cities = ['Calgary', 'Edmonton', 'Vancouver', 'Halifax', 'Toronto'] + threads = [] + for city in cities: + t = threading.Thread(target=run_city_forever, args=(city,), name=f"{city}-thread") + t.start() + threads.append(t) + logger.info(f"{city} 线程已启动") + time.sleep(2) + for t in threads: + t.join() + logger.info("所有城市流程执行完成") + +if __name__ == "__main__": + run_all_cities_concurrently() + \ No newline at end of file diff --git a/spider/requirements.txt b/spider/requirements.txt new file mode 100644 index 0000000..085a4c2 --- /dev/null +++ b/spider/requirements.txt @@ -0,0 +1,23 @@ +aiohttp +requests +curl_cffi +aiohttp-socks +requests[socks] +fake_useragent +apscheduler +aiofiles +loguru +portalocker +aiomultiprocess +faker +eth_account +eth_utils +solders +toncli +ecdsa +base58 +ddddocr +aiohttp_socks +websockets 
"""Random Canadian identity and address generator (spider/work.py).

Picks a base coordinate for a province/city, jitters it, reverse-geocodes the
point through Nominatim and combines the resulting address with a random
name, birthday and phone number.
"""
import random
import time
from datetime import date, timedelta
from typing import Optional, Dict


# Full province/territory name -> two-letter abbreviation.
CA_PROVINCE_ABBR = {
    "Alberta": "AB",
    "British Columbia": "BC",
    "Manitoba": "MB",
    "New Brunswick": "NB",
    "Newfoundland and Labrador": "NL",
    "Nova Scotia": "NS",
    "Ontario": "ON",
    "Prince Edward Island": "PE",
    "Quebec": "QC",
    "Saskatchewan": "SK",
    "Northwest Territories": "NT",
    "Nunavut": "NU",
    "Yukon": "YT",
}

# Derived lookups so name matching is case-insensitive and the reverse
# (abbreviation -> full name) lookup does not need a linear scan.
_CA_PROVINCE_BY_LOWER_NAME = {name.lower(): abbr for name, abbr in CA_PROVINCE_ABBR.items()}
_CA_PROVINCE_NAMES = {abbr: name for name, abbr in CA_PROVINCE_ABBR.items()}


# Abbreviation -> list of (latitude, longitude, label) base points.
CA_COORDS = {
    "AB": [(51.044733, -114.071883, "Calgary"), (53.546124, -113.493823, "Edmonton")],
    "BC": [(49.282729, -123.120738, "Vancouver"), (48.428421, -123.365644, "Victoria")],
    "MB": [(49.895137, -97.138374, "Winnipeg"), (50.445211, -96.823611, "East St Paul")],
    "NB": [(45.963589, -66.643115, "Fredericton"), (46.510712, -67.255044, "Woodstock")],
    "NL": [(53.135509, -57.660435, "Labrador City"), (47.561510, -52.712585, "St. John's")],
    "NS": [(44.648862, -63.575320, "Halifax"), (45.010474, -63.416817, "Truro")],
    "ON": [(43.653225, -79.383186, "Toronto"), (45.421532, -75.697189, "Ottawa")],
    "PE": [(46.238240, -63.131074, "Charlottetown"), (46.392410, -63.787629, "Summerside")],
    "QC": [(45.501689, -73.567256, "Montreal"), (46.813878, -71.207980, "Quebec City")],
    "SK": [(52.133214, -106.670046, "Saskatoon"), (50.445211, -104.618896, "Regina")],
    "NT": [(62.4540, -114.3725, "Yellowknife"), (61.251955, -114.352482, "Yellowknife")],
    "NU": [(63.7467, -68.5167, "Iqaluit"), (64.282327, -76.614813, "Nunavut")],
    "YT": [(60.7212, -135.0568, "Whitehorse"), (64.000000, -138.000000, "Yukon")],
}


# Abbreviation -> telephone area codes used when generating phone numbers.
CA_AREA_CODES = {
    "AB": ["403", "587", "825"],
    "BC": ["236", "250", "604", "672", "778"],
    "MB": ["204", "431"],
    "NB": ["506"],
    "NL": ["709"],
    "NS": ["782", "902"],
    "ON": ["226", "249", "289", "343", "365", "416", "437", "519", "548", "613", "639", "647", "705", "807", "905"],
    "PE": ["902"],
    "QC": ["418", "438", "450", "514", "579", "581", "819", "873"],
    "SK": ["306", "639"],
    "NT": ["867"],
    "NU": ["867"],
    "YT": ["867"],
}


# Provinces for which a partial address (no house number/road) is accepted.
REMOTE_PROVINCES = {"NL", "NT", "NU", "YT"}


# Word lists for _random_name, built once at import time instead of per call.
_COMMON_FIRST_NAMES = (
    "James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda", "William", "Elizabeth",
    "David", "Barbara", "Richard", "Susan", "Joseph", "Jessica", "Thomas", "Sarah", "Charles", "Karen",
    "Christopher", "Nancy", "Daniel", "Lisa", "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra",
    "Donald", "Ashley", "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle",
    "Kenneth", "Dorothy", "Kevin", "Carol", "Brian", "Amanda", "George", "Melissa", "Edward", "Deborah",
    "Ronald", "Stephanie", "Timothy", "Rebecca", "Jason", "Laura", "Jeffrey", "Sharon", "Ryan", "Cynthia",
    "Jacob", "Kathleen", "Gary", "Amy", "Nicholas", "Shirley", "Eric", "Angela", "Stephen", "Helen",
    "Jonathan", "Anna", "Larry", "Brenda", "Justin", "Pamela", "Scott", "Nicole", "Brandon", "Samantha",
    "Frank", "Katherine", "Benjamin", "Christine", "Gregory", "Emma", "Raymond", "Ruth", "Samuel", "Julie",
    "Patrick", "Olivia", "Alexander", "Victoria",
)
_COMMON_LAST_NAMES = (
    "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
    "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin",
    "Lee", "Perez", "Thompson", "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson",
    "Walker", "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores",
    "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", "Mitchell", "Carter", "Roberts",
    "Turner", "Phillips", "Parker", "Evans", "Edwards", "Collins", "Stewart", "Sanchez", "Morris", "Rogers",
    "Reed", "Cook", "Morgan", "Bell", "Murphy", "Bailey", "Cooper", "Richardson", "Cox", "Howard",
    "Ward", "Torres", "Peterson", "Gray", "Ramirez", "James", "Watson", "Brooks", "Kelly", "Sanders",
    "Price", "Bennett", "Wood", "Barnes", "Ross", "Henderson", "Coleman", "Jenkins", "Perry", "Powell",
    "Long", "Patterson", "Hughes", "Flores",
)

# Syllable pools for synthesized names: beginning / middle / ending.
_FIRST_BEGIN = (
    "al", "ben", "car", "dan", "el", "fran", "ge", "har", "isa", "jo", "ka", "li", "mar", "no",
    "ol", "pa", "qui", "ra", "sa", "ta", "ul", "vi", "wil", "xa", "ya", "zo",
)
_FIRST_MIDDLE = (
    "a", "e", "i", "o", "u", "ae", "ai", "ia", "ie", "oa", "ou",
)
_FIRST_END = (
    "n", "ne", "na", "son", "ton", "la", "ra", "rie", "ry", "ley", "ly", "ah",
)
_LAST_BEGIN = (
    "sm", "john", "dav", "wil", "and", "tho", "tay", "mo", "jack", "mar", "lee", "tho", "whi", "har",
    "san", "cla", "ram", "lew", "rob", "walk", "young", "all", "king", "wri", "scott", "tor", "nguy",
    "hil", "flo", "gre", "ada", "nel", "bak", "hal", "riv", "camp", "mit", "car", "rob",
)
_LAST_MIDDLE = (
    "a", "e", "i", "o", "u", "ar", "er", "or", "an", "en", "in", "on", "un",
)
_LAST_END = (
    "son", "ton", "man", "ley", "ford", "wood", "well", "er", "ers", "ing", "s", "son", "es",
)


def _normalize_province(province: str) -> str:
    """
    Normalize a province argument to its two-letter abbreviation.

    Args:
        province: Full name or abbreviation (e.g. "Alberta", "alberta", "ab").
            Surrounding whitespace is ignored and matching is case-insensitive.

    Returns:
        The upper-cased abbreviation for two-letter input or a known full
        name; any other value is returned stripped but otherwise unchanged.

    Raises:
        ValueError: If ``province`` is empty or contains only whitespace.
    """
    p = province.strip() if province else ""
    if not p:
        raise ValueError("province 不能为空")
    if len(p) == 2:
        return p.upper()
    return _CA_PROVINCE_BY_LOWER_NAME.get(p.lower(), p)


def _pick_coords(province_abbr: str, city: Optional[str]) -> tuple[float, float, str]:
    """
    Choose a base coordinate for a province, optionally pinned to a city.

    Args:
        province_abbr: Province abbreviation used as key into ``CA_COORDS``.
        city: Optional city label (e.g. "Calgary"); matched case-insensitively.

    Returns:
        ``(lat, lon, city_name)``. The matching city entry if found, otherwise
        a random entry for the province. Unknown provinces fall back to
        Calgary.
    """
    coords = CA_COORDS.get(province_abbr)
    if not coords:
        # Default fallback: Calgary.
        return 51.044733, -114.071883, "Calgary"
    if city:
        wanted = city.strip().lower()
        for lat, lon, cname in coords:
            if cname.lower() == wanted:
                return lat, lon, cname
    return random.choice(coords)


def _random_near(lat: float, lon: float) -> tuple[float, float]:
    """
    Return a coordinate randomly offset from the given point.

    Each axis is shifted independently by a value in [-0.05, 0.05) degrees.

    Args:
        lat: Base latitude.
        lon: Base longitude.

    Returns:
        ``(new_lat, new_lon)``.
    """
    return lat + (random.random() - 0.5) * 0.1, lon + (random.random() - 0.5) * 0.1


def _reverse_geocode(lat: float, lon: float) -> Dict:
    """
    Reverse-geocode a coordinate with the Nominatim HTTP API.

    Args:
        lat: Latitude.
        lon: Longitude.

    Returns:
        The decoded JSON response (expected to carry an ``address`` entry).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Imported here so the offline helpers in this module stay importable
    # when the HTTP client is not installed.
    import requests

    url = "https://nominatim.openstreetmap.org/reverse"
    params = {"format": "json", "lat": lat, "lon": lon, "zoom": 18, "addressdetails": 1}
    headers = {"User-Agent": "ca_auto_table/1.0"}
    r = requests.get(url, params=params, headers=headers, timeout=15)
    r.raise_for_status()
    return r.json()


def _format_address(address: Dict, province_abbr: str) -> str:
    """
    Format a Nominatim ``address`` mapping as a single address string.

    Args:
        address: The ``address`` mapping from a reverse-geocode response.
        province_abbr: Province abbreviation (e.g. "AB").

    Returns:
        ``"<house> <road>, <city>, <abbr> <postcode>, Canada"`` when house
        number, road and city are all present; otherwise a partial
        ``"<city>, <abbr> <postcode>, Canada"``. Missing pieces are left out
        instead of leaving dangling separators.
    """
    house = address.get("house_number")
    road = address.get("road") or address.get("residential") or address.get("footway")
    city = address.get("city") or address.get("town") or address.get("village")
    postcode = address.get("postcode") or ""
    region = " ".join(part for part in (province_abbr, postcode) if part)
    if house and road and city:
        return f"{house} {road}, {city}, {region}, Canada"
    # Partial address (accepted for remote provinces).
    return ", ".join(part for part in (city, region, "Canada") if part)


def _build_name(beg, mid, end, syllables=(2, 3)) -> str:
    """
    Assemble a capitalized name from syllable pools.

    Picks one beginning, then one or more middles (``random.choice(syllables)
    - 1`` of them), then one ending.
    """
    parts = [random.choice(beg)]
    for _ in range(random.choice(syllables) - 1):
        parts.append(random.choice(mid))
    parts.append(random.choice(end))
    return "".join(parts).capitalize()


def _random_name() -> tuple[str, str]:
    """
    Generate a random English-looking first and last name.

    With probability 0.6 both names are drawn from the common-name lists;
    otherwise both are synthesized from syllable pools.

    Returns:
        ``(firstname, lastname)``.
    """
    if random.random() < 0.6:
        return random.choice(_COMMON_FIRST_NAMES), random.choice(_COMMON_LAST_NAMES)

    first = _build_name(_FIRST_BEGIN, _FIRST_MIDDLE, _FIRST_END)
    last = _build_name(_LAST_BEGIN, _LAST_MIDDLE, _LAST_END)
    return first, last


def _random_birthday() -> str:
    """
    Generate a random birthday between 1950-01-01 and 2000-12-31 inclusive.

    Returns:
        The date formatted as ``yyyy-mm-dd``.
    """
    start = date(1950, 1, 1)
    end = date(2000, 12, 31)
    delta_days = (end - start).days
    return (start + timedelta(days=random.randint(0, delta_days))).isoformat()


def _random_phone(province_abbr: str) -> str:
    """
    Generate a random phone number using one of the province's area codes.

    Args:
        province_abbr: Province abbreviation; unknown values use area "000".

    Returns:
        A string such as ``"(403) 555-1234"``.
    """
    codes = CA_AREA_CODES.get(province_abbr, ["000"])
    area = random.choice(codes)
    exchange = random.randint(200, 899)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange}-{line}"


def generate_canada_info(province: str, city: Optional[str] = None, max_attempts: int = 15, sleep_sec: float = 0.6) -> Dict[str, str]:
    """
    Generate random Canadian personal and address data for a province.

    Args:
        province: Province full name or abbreviation (e.g. "Alberta", "AB").
        city: Optional city label (e.g. "Calgary"); a random base point of the
            province is used when omitted or not found.
        max_attempts: Maximum number of reverse-geocode lookups.
        sleep_sec: Pause between consecutive lookups, in seconds.

    Returns:
        A dict with the keys ``firstname``, ``lastname``, ``full_name``,
        ``birthday``, ``address_str`` (first comma-separated component of the
        formatted address), ``city_name``, ``phone``, ``postcode`` and
        ``province`` (full name when known).

    Raises:
        ValueError: If ``province`` is empty.
        requests.RequestException: If a reverse-geocode request fails.
    """
    prov_abbr = _normalize_province(province)
    base_lat, base_lon, chosen_city = _pick_coords(prov_abbr, city)

    address_str = ""
    # Start from the chosen base city so the result is still meaningful when
    # no lookup is performed (max_attempts <= 0).
    city_name = chosen_city
    postcode = ""
    for attempt in range(max_attempts):
        if attempt:
            # Pause only between lookups, not after the final one.
            time.sleep(sleep_sec)
        lat, lon = _random_near(base_lat, base_lon)
        data = _reverse_geocode(lat, lon)
        addr = data.get("address") or {}
        city_name = addr.get("city") or addr.get("town") or addr.get("village") or chosen_city
        postcode = addr.get("postcode") or ""
        address_str = _format_address(addr, prov_abbr)
        if prov_abbr in REMOTE_PROVINCES:
            # Remote provinces accept whatever the first lookup returns.
            break
        has_road = addr.get("road") or addr.get("residential") or addr.get("footway")
        if addr.get("house_number") and has_road and city_name:
            break

    firstname, lastname = _random_name()
    full_name = f"{firstname} {lastname}"
    birthday = _random_birthday()
    phone = _random_phone(prov_abbr)

    return {
        "firstname": firstname,
        "lastname": lastname,
        "full_name": full_name,
        "birthday": birthday,
        "address_str": address_str.split(",")[0],
        "city_name": city_name,
        "phone": phone,
        "postcode": postcode,
        "province": _CA_PROVINCE_NAMES.get(prov_abbr, prov_abbr),
    }


def get_random_canada_info(province: Optional[str] = None, city: Optional[str] = None) -> Dict[str, str]:
    """
    Generate random Canadian personal and address data.

    Args:
        province: Province full name or abbreviation; a random province is
            chosen when ``None``.
        city: Optional city label; a random base point is used when omitted.

    Returns:
        The same dict as :func:`generate_canada_info`.
    """
    if province is None:
        province = random.choice(list(CA_PROVINCE_ABBR.values()))
    return generate_canada_info(province, city)


def main() -> None:
    """
    Demo: print random data for Calgary, Alberta.
    """
    info = generate_canada_info("Alberta", "Calgary")
    print(info)


if __name__ == "__main__":
    main()