Files
ca_auto_table/spider/work.py
2025-11-21 12:25:54 +08:00

406 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import random
import time
from datetime import date, timedelta
from typing import Optional, Dict
import requests
from bit_browser import retry
# Full province/territory name -> two-letter abbreviation.
# The ten provinces come first, followed by the three territories.
CA_PROVINCE_ABBR = {
    # Provinces
    "Alberta": "AB",
    "British Columbia": "BC",
    "Manitoba": "MB",
    "New Brunswick": "NB",
    "Newfoundland and Labrador": "NL",
    "Nova Scotia": "NS",
    "Ontario": "ON",
    "Prince Edward Island": "PE",
    "Quebec": "QC",
    "Saskatchewan": "SK",
    # Territories
    "Northwest Territories": "NT",
    "Nunavut": "NU",
    "Yukon": "YT",
}
# Seed coordinates per province abbreviation: a list of
# (latitude, longitude, label) tuples. One of them is used as the centre
# point around which random nearby coordinates are generated.
CA_COORDS = {
    "AB": [
        (51.044733, -114.071883, "Calgary"),
        (53.546124, -113.493823, "Edmonton"),
    ],
    "BC": [
        (49.282729, -123.120738, "Vancouver"),
        (48.428421, -123.365644, "Victoria"),
    ],
    "MB": [
        (49.895137, -97.138374, "Winnipeg"),
        (50.445211, -96.823611, "East St Paul"),
    ],
    "NB": [
        (45.963589, -66.643115, "Fredericton"),
        (46.510712, -67.255044, "Woodstock"),
    ],
    "NL": [
        (53.135509, -57.660435, "Labrador City"),
        (47.561510, -52.712585, "St. John's"),
    ],
    "NS": [
        (44.648862, -63.575320, "Halifax"),
        (45.010474, -63.416817, "Truro"),
    ],
    "ON": [
        (43.653225, -79.383186, "Toronto"),
        (45.421532, -75.697189, "Ottawa"),
    ],
    "PE": [
        (46.238240, -63.131074, "Charlottetown"),
        (46.392410, -63.787629, "Summerside"),
    ],
    "QC": [
        (45.501689, -73.567256, "Montreal"),
        (46.813878, -71.207980, "Quebec City"),
    ],
    "SK": [
        (52.133214, -106.670046, "Saskatoon"),
        (50.445211, -104.618896, "Regina"),
    ],
    # NOTE: both NT entries carry the label "Yellowknife", so a lookup by
    # city name always resolves to the first one.
    "NT": [
        (62.4540, -114.3725, "Yellowknife"),
        (61.251955, -114.352482, "Yellowknife"),
    ],
    "NU": [
        (63.7467, -68.5167, "Iqaluit"),
        (64.282327, -76.614813, "Nunavut"),
    ],
    "YT": [
        (60.7212, -135.0568, "Whitehorse"),
        (64.000000, -138.000000, "Yukon"),
    ],
}
# Telephone area codes per province abbreviation, used as the province-wide
# pool when no city-specific list applies.
CA_AREA_CODES = {
    # "780" added: it is the Edmonton code already listed in CITY_AREA_CODES,
    # so the province-wide pool must contain it as well.
    "AB": ["403", "587", "780", "825"],
    "BC": ["236", "250", "604", "672", "778"],
    "MB": ["204", "431"],
    "NB": ["506"],
    "NL": ["709"],
    "NS": ["782", "902"],
    # "639" removed: it is a Saskatchewan code (see "SK" below), not Ontario.
    "ON": ["226", "249", "289", "343", "365", "416", "437", "519", "548", "613", "647", "705", "807", "905"],
    "PE": ["902"],
    "QC": ["418", "438", "450", "514", "579", "581", "819", "873"],
    "SK": ["306", "639"],
    # The three territories share a single area code.
    "NT": ["867"],
    "NU": ["867"],
    "YT": ["867"],
}
# Area codes for major cities: a tighter, city-level constraint that takes
# priority over the province-wide lists when the city is known.
CITY_AREA_CODES = {
    "Calgary": ["403", "587", "825"],
    "Edmonton": ["780", "587", "825"],
    "Vancouver": ["604", "778", "236", "672"],
    "Halifax": ["902", "782"],
    "Toronto": ["416", "647", "437"],
}
# Valid first letters of a postal code, keyed by province abbreviation.
POSTAL_PREFIXES = {
    "AB": {"T"},
    "BC": {"V"},
    "MB": {"R"},
    "NB": {"E"},
    "NL": {"A"},
    "NS": {"B"},
    # Ontario uses five letters; "N" (southwestern) and "P" (northern) were
    # missing, which made valid Ontario postal codes fail validation.
    "ON": {"K", "L", "M", "N", "P"},
    "PE": {"C"},
    "QC": {"G", "H", "J"},
    "SK": {"S"},
    # Northwest Territories and Nunavut share the "X" prefix.
    "NT": {"X"},
    "NU": {"X"},
    "YT": {"Y"},
}

# Provinces/territories for which a partial address (no house number or
# street) is accepted instead of retrying the reverse-geocoding lookup.
REMOTE_PROVINCES = {"NL", "NT", "NU", "YT"}
def _normalize_province(province: str) -> str:
"""
省份入参规范化,支持全称或缩写,返回缩写
参数:
province (str): 省份,可为全称或缩写(如 "Alberta""AB"
返回值:
str: 省份缩写(如 "AB"
"""
if not province:
raise ValueError("province 不能为空")
p = province.strip()
if len(p) == 2:
return p.upper()
return CA_PROVINCE_ABBR.get(p, p)
def _pick_coords(province_abbr: str, city: Optional[str]) -> tuple[float, float, str]:
    """
    Choose a seed coordinate for a province, optionally for a specific city.

    Args:
        province_abbr (str): Province abbreviation used as the CA_COORDS key.
        city (Optional[str]): City name (e.g. "Calgary"); may be None/empty.
            It is compared case-insensitively, ignoring surrounding
            whitespace, against the labels stored for the province.

    Returns:
        tuple[float, float, str]: ``(lat, lon, city_name)`` of the chosen
        point. When the province has no entry the Calgary coordinates are
        returned; when no city is given or it does not match, one of the
        province's points is picked at random.
    """
    candidates = CA_COORDS.get(province_abbr)
    if not candidates:
        # Unknown province: fall back to Calgary.
        return 51.044733, -114.071883, "Calgary"

    if city:
        wanted = city.strip().lower()
        matched = next(
            (point for point in candidates if point[2].lower() == wanted),
            None,
        )
        if matched is not None:
            return matched

    return random.choice(candidates)
def _random_near(lat: float, lon: float) -> tuple[float, float]:
"""
在给定坐标附近生成一个随机偏移坐标
参数:
lat (float): 基准纬度
lon (float): 基准经度
返回值:
(new_lat, new_lon): 随机偏移后的坐标
"""
return lat + (random.random() - 0.5) * 0.1, lon + (random.random() - 0.5) * 0.1
@retry(max_retries=3, delay=1.0, backoff=1.0)
def _reverse_geocode(lat: float, lon: float) -> Dict:
    """
    Reverse-geocode a coordinate through the public Nominatim service.

    Args:
        lat (float): Latitude.
        lon (float): Longitude.

    Returns:
        dict: The decoded JSON response; callers read its ``address`` field.

    Raises:
        requests.HTTPError: On a non-success HTTP status (raised by
            ``raise_for_status``).
        NOTE(review): the ``retry`` decorator comes from ``bit_browser``;
        whether it re-raises or returns a falsy value once the retries are
        exhausted is not visible here - confirm against its implementation.
    """
    response = requests.get(
        f"https://nominatim.openstreetmap.org/reverse?format=json&lat={lat}&lon={lon}&zoom=18&addressdetails=1",
        headers={"User-Agent": "ca_auto_table/1.0"},
        timeout=15,
    )
    response.raise_for_status()
    return response.json()
def _format_address(address: Dict, province_abbr: str) -> str:
"""
将 Nominatim 的 address 格式化为完整地址字符串
参数:
address (dict): Nominatim 返回的 address 字段
province_abbr (str): 省份缩写(如 "AB"
返回值:
str: 格式化后的地址字符串
"""
house = address.get("house_number")
road = address.get("road") or address.get("residential") or address.get("footway")
city = address.get("city") or address.get("town") or address.get("village")
postcode = address.get("postcode") or ""
if house and road and city:
return f"{house} {road}, {city}, {province_abbr} {postcode}, Canada"
# 远端省份允许部分地址
return f"{city or ''}, {province_abbr} {postcode}, Canada".strip(", ")
def _random_name() -> tuple[str, str]:
"""
生成随机英文名Firstname, Lastname组合空间可达数百万以上
实现策略:
- 60% 概率使用常见英文名与姓氏列表(更自然)
- 40% 概率使用音节组合算法动态生成(数量级远超百万)
返回值:
(firstname, lastname)
"""
common_first = [
"James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda", "William", "Elizabeth",
"David", "Barbara", "Richard", "Susan", "Joseph", "Jessica", "Thomas", "Sarah", "Charles", "Karen",
"Christopher", "Nancy", "Daniel", "Lisa", "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra",
"Donald", "Ashley", "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle",
"Kenneth", "Dorothy", "Kevin", "Carol", "Brian", "Amanda", "George", "Melissa", "Edward", "Deborah",
"Ronald", "Stephanie", "Timothy", "Rebecca", "Jason", "Laura", "Jeffrey", "Sharon", "Ryan", "Cynthia",
"Jacob", "Kathleen", "Gary", "Amy", "Nicholas", "Shirley", "Eric", "Angela", "Stephen", "Helen",
"Jonathan", "Anna", "Larry", "Brenda", "Justin", "Pamela", "Scott", "Nicole", "Brandon", "Samantha",
"Frank", "Katherine", "Benjamin", "Christine", "Gregory", "Emma", "Raymond", "Ruth", "Samuel", "Julie",
"Patrick", "Olivia", "Alexander", "Victoria"
]
common_last = [
"Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
"Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin",
"Lee", "Perez", "Thompson", "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson",
"Walker", "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores",
"Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", "Mitchell", "Carter", "Roberts",
"Turner", "Phillips", "Parker", "Evans", "Edwards", "Collins", "Stewart", "Sanchez", "Morris", "Rogers",
"Reed", "Cook", "Morgan", "Bell", "Murphy", "Bailey", "Cooper", "Richardson", "Cox", "Howard",
"Ward", "Torres", "Peterson", "Gray", "Ramirez", "James", "Watson", "Brooks", "Kelly", "Sanders",
"Price", "Bennett", "Wood", "Barnes", "Ross", "Henderson", "Coleman", "Jenkins", "Perry", "Powell",
"Long", "Patterson", "Hughes", "Flores"
]
if random.random() < 0.6:
return random.choice(common_first), random.choice(common_last)
# 动态音节组合生成,支持数百万组合
f_beg = [
"al", "ben", "car", "dan", "el", "fran", "ge", "har", "isa", "jo", "ka", "li", "mar", "no",
"ol", "pa", "qui", "ra", "sa", "ta", "ul", "vi", "wil", "xa", "ya", "zo"
]
f_mid = [
"a", "e", "i", "o", "u", "ae", "ai", "ia", "ie", "oa", "ou"
]
f_end = [
"n", "ne", "na", "son", "ton", "la", "ra", "rie", "ry", "ley", "ly", "ah"
]
l_beg = [
"sm", "john", "dav", "wil", "and", "tho", "tay", "mo", "jack", "mar", "lee", "tho", "whi", "har",
"san", "cla", "ram", "lew", "rob", "walk", "young", "all", "king", "wri", "scott", "tor", "nguy",
"hil", "flo", "gre", "ada", "nel", "bak", "hal", "riv", "camp", "mit", "car", "rob"
]
l_mid = [
"a", "e", "i", "o", "u", "ar", "er", "or", "an", "en", "in", "on", "un"
]
l_suf = [
"son", "ton", "man", "ley", "ford", "wood", "well", "er", "ers", "ing", "s", "son", "es"
]
def build_name(beg, mid, end, syllables=(2, 3)) -> str:
parts = [random.choice(beg)]
for _ in range(random.choice(syllables) - 1):
parts.append(random.choice(mid))
parts.append(random.choice(end))
name = "".join(parts)
return name.capitalize()
first = build_name(f_beg, f_mid, f_end)
last = build_name(l_beg, l_mid, l_suf)
return first, last
def _random_birthday() -> str:
"""
生成随机生日,格式为 yyyy-mm-dd
返回值:
str: 生日字符串
"""
start = date(1950, 1, 1)
end = date(2000, 12, 31)
delta_days = (end - start).days
d = start + timedelta(days=random.randint(0, delta_days))
return f"{d.year}-{d.month:02d}-{d.day:02d}"
def _random_phone(province_abbr: str) -> str:
    """
    Generate a random Canadian phone number using a province-level area code.

    Args:
        province_abbr (str): Province abbreviation used to look up the area
            codes in CA_AREA_CODES; "000" is used when the province has no
            entry.

    Returns:
        str: A number formatted like "(403) 555-1234".
    """
    area = random.choice(CA_AREA_CODES.get(province_abbr, ["000"]))
    # Exchange is drawn from 200-899 and the line number from 1000-9999.
    exchange = format(random.randint(200, 899), "03d")
    line = format(random.randint(1000, 9999), "04d")
    return f"({area}) {exchange}-{line}"
def _random_phone_city(province_abbr: str, city: Optional[str]) -> str:
    """
    Generate a random phone number, preferring the city's own area codes.

    The city is matched against CITY_AREA_CODES case-insensitively and
    ignoring surrounding whitespace (the same way the coordinate lookup
    matches city names). When the city is missing or not configured, the
    province-level list from CA_AREA_CODES is used, and "000" when the
    province is unknown as well.

    Args:
        province_abbr (str): Province abbreviation.
        city (Optional[str]): City name; may be None or empty.

    Returns:
        str: A number formatted like "(403) 555-1234".
    """
    codes = None
    if city:
        wanted = city.strip().lower()
        codes = next(
            (
                city_codes
                for name, city_codes in CITY_AREA_CODES.items()
                if name.lower() == wanted
            ),
            None,
        )
    if not codes:
        codes = CA_AREA_CODES.get(province_abbr, ["000"])

    area = random.choice(codes)
    # Exchange is drawn from 200-899 and the line number from 1000-9999.
    exchange = random.randint(200, 899)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange:03d}-{line:04d}"
def _postal_valid_for_province(province_abbr: str, postcode: str) -> bool:
"""
校验邮编首字母是否符合省份规范
参数:
province_abbr (str): 省份缩写
postcode (str): 邮编字符串
返回值:
bool: 合法返回 True否则 False
"""
if not postcode:
return False
prefixes = POSTAL_PREFIXES.get(province_abbr)
if not prefixes:
return True
return postcode[0].upper() in prefixes
def generate_canada_info(
    province: str,
    city: Optional[str] = None,
    max_attempts: int = 15,
    sleep_sec: float = 0.6,
) -> Dict[str, str]:
    """
    Generate random Canadian personal and address information.

    A seed coordinate is picked for the province (and city, when it matches
    a configured one), random nearby points are reverse-geocoded until an
    acceptable address is found, and a random name, birthday and phone
    number are added.

    Args:
        province (str): Province as a full name or abbreviation
            (e.g. "Alberta" or "AB").
        city (Optional[str]): City (e.g. "Calgary"); when omitted or not
            configured for the province, a configured city is chosen at
            random.
        max_attempts (int): Maximum number of reverse-geocoding attempts.
        sleep_sec (float): Pause in seconds between consecutive attempts, to
            respect the Nominatim rate limit.

    Returns:
        dict: Keys ``firstname``, ``lastname``, ``full_name``, ``birthday``,
        ``address_str`` (street address), ``city_name``, ``phone``,
        ``postcode`` and ``province`` (full name when known). This is
        best-effort: if no attempt yields an acceptable address, the values
        from the last response are returned, and the address fields stay
        empty when no response was obtained at all.

    Raises:
        ValueError: If ``province`` is empty (raised by the normalizer).
    """
    prov_abbr = _normalize_province(province)
    base_lat, base_lon, chosen_city = _pick_coords(prov_abbr, city)

    street = ""
    city_name = ""
    postcode = ""
    for attempt in range(max_attempts):
        if attempt:
            # Pause only between requests; sleeping after the final attempt
            # would just delay the return.
            time.sleep(sleep_sec)

        lat, lon = _random_near(base_lat, base_lon)
        data = _reverse_geocode(lat, lon)
        if not data:
            continue

        addr = data.get("address", {})
        house = addr.get("house_number")
        road = addr.get("road") or addr.get("residential") or addr.get("footway")
        city_name = addr.get("city") or addr.get("town") or addr.get("village") or chosen_city
        postcode = addr.get("postcode") or ""

        if house and road:
            # Build the street directly from its components. Taking the first
            # comma-separated piece of the formatted address yields the
            # province/postcode instead whenever the response has no
            # city/town/village of its own.
            street = f"{house} {road}"
        else:
            # Partial address: keep the leading component of the formatted
            # string (city, or province when no city is present).
            street = _format_address(addr, prov_abbr).split(",")[0]

        if prov_abbr in REMOTE_PROVINCES:
            # Remote provinces accept whatever the first response contains.
            break
        if house and road and city_name and _postal_valid_for_province(prov_abbr, postcode):
            break

    firstname, lastname = _random_name()
    # Use the city that actually seeded the coordinates, so the area code
    # agrees with the address even when ``city`` was given in another
    # letter case or did not match any configured city.
    phone = _random_phone_city(prov_abbr, chosen_city)
    province_name = next(
        (name for name, abbr in CA_PROVINCE_ABBR.items() if abbr == prov_abbr),
        prov_abbr,
    )
    return {
        "firstname": firstname,
        "lastname": lastname,
        "full_name": f"{firstname} {lastname}",
        "birthday": _random_birthday(),
        "address_str": street,
        "city_name": city_name,
        "phone": phone,
        "postcode": postcode,
        "province": province_name,
    }
def get_random_canada_info(province, city) -> Dict[str, str]:
    """
    Generate random Canadian personal and address information.

    Thin wrapper that forwards both arguments to ``generate_canada_info``
    and leaves the attempt count and pause at their defaults. Nothing is
    randomized here beyond what ``generate_canada_info`` itself does.

    Args:
        province: Province as a full name or abbreviation.
        city: City name, or None.

    Returns:
        dict: The dictionary produced by ``generate_canada_info``.
    """
    return generate_canada_info(province=province, city=city)
def main() -> None:
    """
    Demo: generate and print random information for Calgary, Alberta.

    Change the arguments to try another province or city.
    """
    print(generate_canada_info("Alberta", "Calgary"))


if __name__ == "__main__":
    main()