This commit is contained in:
2025-11-20 11:42:18 +08:00
parent 1bd91df9a1
commit e2d2b0b75b
29 changed files with 2100 additions and 48 deletions

333
spider/work.py Normal file
View File

@@ -0,0 +1,333 @@
import random
import time
from datetime import date, timedelta
from typing import Optional, Dict
import requests
# Maps the full English name of each Canadian province/territory to its
# two-letter abbreviation. Insertion order is relied on by nothing but is kept
# stable so reverse lookups stay deterministic.
CA_PROVINCE_ABBR = {
    # Provinces
    "Alberta": "AB",
    "British Columbia": "BC",
    "Manitoba": "MB",
    "New Brunswick": "NB",
    "Newfoundland and Labrador": "NL",
    "Nova Scotia": "NS",
    "Ontario": "ON",
    "Prince Edward Island": "PE",
    "Quebec": "QC",
    "Saskatchewan": "SK",
    # Territories
    "Northwest Territories": "NT",
    "Nunavut": "NU",
    "Yukon": "YT",
}
# Base points used as the centre of the random search area, keyed by province
# abbreviation. Each entry is (latitude, longitude, label).
#
# NOTE(review): the second NT point reuses the label "Yellowknife", so a lookup
# by city name in _pick_coords can only ever reach the first NT entry.
# NOTE(review): the second NU/YT points are labelled with the territory name
# ("Nunavut", "Yukon") rather than a city.
# NOTE(review): "East St Paul" has exactly the same latitude as "Regina"
# (50.445211) - possibly a copy/paste slip; verify the coordinates.
CA_COORDS = {
    "AB": [
        (51.044733, -114.071883, "Calgary"),
        (53.546124, -113.493823, "Edmonton"),
    ],
    "BC": [
        (49.282729, -123.120738, "Vancouver"),
        (48.428421, -123.365644, "Victoria"),
    ],
    "MB": [
        (49.895137, -97.138374, "Winnipeg"),
        (50.445211, -96.823611, "East St Paul"),
    ],
    "NB": [
        (45.963589, -66.643115, "Fredericton"),
        (46.510712, -67.255044, "Woodstock"),
    ],
    "NL": [
        (53.135509, -57.660435, "Labrador City"),
        (47.561510, -52.712585, "St. John's"),
    ],
    "NS": [
        (44.648862, -63.575320, "Halifax"),
        (45.010474, -63.416817, "Truro"),
    ],
    "ON": [
        (43.653225, -79.383186, "Toronto"),
        (45.421532, -75.697189, "Ottawa"),
    ],
    "PE": [
        (46.238240, -63.131074, "Charlottetown"),
        (46.392410, -63.787629, "Summerside"),
    ],
    "QC": [
        (45.501689, -73.567256, "Montreal"),
        (46.813878, -71.207980, "Quebec City"),
    ],
    "SK": [
        (52.133214, -106.670046, "Saskatoon"),
        (50.445211, -104.618896, "Regina"),
    ],
    "NT": [
        (62.4540, -114.3725, "Yellowknife"),
        (61.251955, -114.352482, "Yellowknife"),
    ],
    "NU": [
        (63.7467, -68.5167, "Iqaluit"),
        (64.282327, -76.614813, "Nunavut"),
    ],
    "YT": [
        (60.7212, -135.0568, "Whitehorse"),
        (64.000000, -138.000000, "Yukon"),
    ],
}
# Telephone area codes per province abbreviation, used to build plausible
# phone numbers. The lists are representative, not exhaustive.
CA_AREA_CODES = {
    # 780 (Edmonton and northern Alberta) was missing from the original list.
    "AB": ["403", "587", "780", "825"],
    "BC": ["236", "250", "604", "672", "778"],
    "MB": ["204", "431"],
    "NB": ["506"],
    "NL": ["709"],
    "NS": ["782", "902"],
    # 639 used to be listed here, but it is a Saskatchewan overlay code.
    "ON": ["226", "249", "289", "343", "365", "416", "437", "519", "548", "613", "647", "705", "807", "905"],
    # Prince Edward Island shares the 902/782 overlay with Nova Scotia.
    "PE": ["782", "902"],
    "QC": ["418", "438", "450", "514", "579", "581", "819", "873"],
    "SK": ["306", "639"],
    # The three territories share a single area code.
    "NT": ["867"],
    "NU": ["867"],
    "YT": ["867"],
}
# Abbreviations for which generate_canada_info accepts the first geocoding
# result as-is, even when it lacks a complete street address.
REMOTE_PROVINCES = {
    "NL",
    "NT",
    "NU",
    "YT",
}
def _normalize_province(province: str) -> str:
    """Normalize a province given as full name or abbreviation to its abbreviation.

    Args:
        province: Province as a full English name (e.g. "Alberta") or a
            two-letter abbreviation (e.g. "AB"). Surrounding whitespace is
            ignored and matching is case-insensitive.

    Returns:
        The upper-cased two-letter abbreviation. Any two-character input is
        upper-cased without validation; a longer name that is not found in
        CA_PROVINCE_ABBR is returned stripped but otherwise unchanged.

    Raises:
        ValueError: If ``province`` is empty, None, or only whitespace.
    """
    cleaned = province.strip() if province else ""
    if not cleaned:
        # Whitespace-only input used to slip through and come back as "".
        raise ValueError("province 不能为空")
    if len(cleaned) == 2:
        return cleaned.upper()
    # Compare case-insensitively so "alberta" / "ALBERTA" resolve like "Alberta".
    wanted = cleaned.casefold()
    for full_name, abbr in CA_PROVINCE_ABBR.items():
        if full_name.casefold() == wanted:
            return abbr
    return cleaned
def _pick_coords(province_abbr: str, city: Optional[str]) -> tuple[float, float, str]:
    """Choose a base coordinate for a province, optionally pinned to a city.

    Args:
        province_abbr: Two-letter province abbreviation used as the key into
            CA_COORDS.
        city: Optional city name (e.g. "Calgary"); compared case-insensitively
            after stripping whitespace against the labels stored for the
            province.

    Returns:
        A ``(lat, lon, city_name)`` tuple. When the province has no entries the
        Calgary point is returned; when ``city`` is missing or matches no label
        one of the province's points is picked at random.
    """
    candidates = CA_COORDS.get(province_abbr)
    if not candidates:
        # Unknown province: fall back to Calgary.
        return 51.044733, -114.071883, "Calgary"

    if city:
        wanted = city.strip().lower()
        match = next((point for point in candidates if point[2].lower() == wanted), None)
        if match is not None:
            return match

    return random.choice(candidates)
def _random_near(lat: float, lon: float) -> tuple[float, float]:
    """Return a point randomly displaced from the given coordinate.

    Each axis is shifted independently by a value drawn from [-0.05, 0.05)
    degrees; latitude is drawn first, then longitude.

    Args:
        lat: Base latitude.
        lon: Base longitude.

    Returns:
        ``(new_lat, new_lon)`` after applying the random offsets.
    """
    span = 0.1  # total width of the jitter window, in degrees
    lat_shift = (random.random() - 0.5) * span
    lon_shift = (random.random() - 0.5) * span
    return lat + lat_shift, lon + lon_shift
def _reverse_geocode(lat: float, lon: float) -> Dict:
    """Reverse-geocode a coordinate through the public Nominatim service.

    Args:
        lat: Latitude of the point to look up.
        lon: Longitude of the point to look up.

    Returns:
        The decoded JSON body of the response. Callers read its ``address``
        field.

    Raises:
        requests.HTTPError: If the service answers with an error status
            (raised by ``raise_for_status``).
    """
    endpoint = f"https://nominatim.openstreetmap.org/reverse?format=json&lat={lat}&lon={lon}&zoom=18&addressdetails=1"
    # The request identifies itself with an explicit User-Agent.
    request_headers = {"User-Agent": "ca_auto_table/1.0"}
    response = requests.get(endpoint, headers=request_headers, timeout=15)
    response.raise_for_status()
    payload = response.json()
    return payload
def _format_address(address: Dict, province_abbr: str) -> str:
    """Render a Nominatim ``address`` mapping as a single-line Canadian address.

    Args:
        address: The ``address`` field of a Nominatim response. The keys read
            are ``house_number``, ``road``/``residential``/``footway``,
            ``city``/``town``/``village`` and ``postcode``.
        province_abbr: Two-letter province abbreviation (e.g. "AB").

    Returns:
        ``"<house> <road>, <city>, <abbr> <postcode>, Canada"`` when house
        number, road and city are all present; otherwise the partial form
        ``"<city>, <abbr> <postcode>, Canada"``. Missing components are left
        out entirely, so the result never contains empty segments or a dangling
        space before a comma.
    """
    house = address.get("house_number")
    road = address.get("road") or address.get("residential") or address.get("footway")
    city = address.get("city") or address.get("town") or address.get("village")
    postcode = address.get("postcode") or ""

    # Without a postcode the naive f-string produced "AB , Canada".
    region = f"{province_abbr} {postcode}".strip()

    if house and road and city:
        parts = [f"{house} {road}", city, region, "Canada"]
    else:
        # Partial address (accepted for remote provinces): no street line.
        parts = [city, region, "Canada"]
    return ", ".join(part for part in parts if part)
def _random_name() -> tuple[str, str]:
    """Generate a random English-looking ``(firstname, lastname)`` pair.

    Strategy:
        - With probability 0.6 both names are drawn from fixed pools of common
          first names and surnames.
        - Otherwise each name is assembled from fragments: one beginning, one
          or two middle fragments, and one ending, then capitalized.

    Some surnames and fragments occur more than once in the pools, which makes
    them proportionally more likely to be drawn.

    Returns:
        ``(firstname, lastname)``.
    """
    given_pool = (
        "James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda", "William", "Elizabeth",
        "David", "Barbara", "Richard", "Susan", "Joseph", "Jessica", "Thomas", "Sarah", "Charles", "Karen",
        "Christopher", "Nancy", "Daniel", "Lisa", "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra",
        "Donald", "Ashley", "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle",
        "Kenneth", "Dorothy", "Kevin", "Carol", "Brian", "Amanda", "George", "Melissa", "Edward", "Deborah",
        "Ronald", "Stephanie", "Timothy", "Rebecca", "Jason", "Laura", "Jeffrey", "Sharon", "Ryan", "Cynthia",
        "Jacob", "Kathleen", "Gary", "Amy", "Nicholas", "Shirley", "Eric", "Angela", "Stephen", "Helen",
        "Jonathan", "Anna", "Larry", "Brenda", "Justin", "Pamela", "Scott", "Nicole", "Brandon", "Samantha",
        "Frank", "Katherine", "Benjamin", "Christine", "Gregory", "Emma", "Raymond", "Ruth", "Samuel", "Julie",
        "Patrick", "Olivia", "Alexander", "Victoria",
    )
    family_pool = (
        "Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
        "Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin",
        "Lee", "Perez", "Thompson", "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson",
        "Walker", "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores",
        "Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", "Mitchell", "Carter", "Roberts",
        "Turner", "Phillips", "Parker", "Evans", "Edwards", "Collins", "Stewart", "Sanchez", "Morris", "Rogers",
        "Reed", "Cook", "Morgan", "Bell", "Murphy", "Bailey", "Cooper", "Richardson", "Cox", "Howard",
        "Ward", "Torres", "Peterson", "Gray", "Ramirez", "James", "Watson", "Brooks", "Kelly", "Sanders",
        "Price", "Bennett", "Wood", "Barnes", "Ross", "Henderson", "Coleman", "Jenkins", "Perry", "Powell",
        "Long", "Patterson", "Hughes", "Flores",
    )

    # Common-name branch: taken when the draw falls below 0.6.
    if random.random() < 0.6:
        given = random.choice(given_pool)
        family = random.choice(family_pool)
        return given, family

    # Fragment pools for the synthesized branch.
    given_heads = (
        "al", "ben", "car", "dan", "el", "fran", "ge", "har", "isa", "jo", "ka", "li", "mar", "no",
        "ol", "pa", "qui", "ra", "sa", "ta", "ul", "vi", "wil", "xa", "ya", "zo",
    )
    given_cores = ("a", "e", "i", "o", "u", "ae", "ai", "ia", "ie", "oa", "ou")
    given_tails = ("n", "ne", "na", "son", "ton", "la", "ra", "rie", "ry", "ley", "ly", "ah")
    family_heads = (
        "sm", "john", "dav", "wil", "and", "tho", "tay", "mo", "jack", "mar", "lee", "tho", "whi", "har",
        "san", "cla", "ram", "lew", "rob", "walk", "young", "all", "king", "wri", "scott", "tor", "nguy",
        "hil", "flo", "gre", "ada", "nel", "bak", "hal", "riv", "camp", "mit", "car", "rob",
    )
    family_cores = ("a", "e", "i", "o", "u", "ar", "er", "or", "an", "en", "in", "on", "un")
    family_tails = ("son", "ton", "man", "ley", "ford", "wood", "well", "er", "ers", "ing", "s", "son", "es")

    def assemble(heads, cores, tails, syllables=(2, 3)) -> str:
        # Draw order matters for reproducibility under a fixed seed:
        # head, syllable count, each core fragment, then tail.
        head = random.choice(heads)
        core_count = random.choice(syllables) - 1
        core = "".join(random.choice(cores) for _ in range(core_count))
        tail = random.choice(tails)
        return (head + core + tail).capitalize()

    given = assemble(given_heads, given_cores, given_tails)
    family = assemble(family_heads, family_cores, family_tails)
    return given, family
def _random_birthday() -> str:
    """Pick a random birthday between 1950-01-01 and 2000-12-31 inclusive.

    Returns:
        The date formatted as ``yyyy-mm-dd``.
    """
    earliest = date(1950, 1, 1)
    latest = date(2000, 12, 31)
    span_days = (latest - earliest).days
    offset = timedelta(days=random.randint(0, span_days))
    # isoformat() yields the same zero-padded yyyy-mm-dd text for these years.
    return (earliest + offset).isoformat()
def _random_phone(province_abbr: str) -> str:
    """Build a random phone number using an area code of the given province.

    Args:
        province_abbr: Two-letter province abbreviation used as the key into
            CA_AREA_CODES. Unknown abbreviations get the placeholder area code
            "000".

    Returns:
        A number shaped like ``"(403) 555-1234"``: the exchange is drawn from
        200-899 and the line number from 1000-9999.
    """
    area = random.choice(CA_AREA_CODES.get(province_abbr, ["000"]))
    exchange = f"{random.randint(200, 899):03d}"
    line = f"{random.randint(1000, 9999):04d}"
    return f"({area}) {exchange}-{line}"
def generate_canada_info(
    province: str,
    city: Optional[str] = None,
    max_attempts: int = 15,
    sleep_sec: float = 0.6,
) -> Dict[str, str]:
    """Generate a random Canadian identity with an address from reverse geocoding.

    A base point is chosen for the province (and city, if given), then random
    nearby points are reverse-geocoded until one yields a house number, a road
    and a city, or until ``max_attempts`` is exhausted. For provinces listed in
    REMOTE_PROVINCES the first result is accepted even if it is partial.

    Args:
        province: Province as a full name or abbreviation (e.g. "Alberta", "AB").
        city: Optional city name (e.g. "Calgary"); when omitted or unmatched a
            random base point within the province is used.
        max_attempts: Maximum number of reverse-geocoding requests.
        sleep_sec: Pause between consecutive requests, to respect the geocoding
            service's rate limit.

    Returns:
        A dict with the keys ``firstname``, ``lastname``, ``full_name``,
        ``birthday``, ``address_str`` (street line), ``city_name``, ``phone``,
        ``postcode`` and ``province`` (full name when known, otherwise the
        abbreviation).

    Raises:
        ValueError: If ``province`` is empty (raised by _normalize_province).
        requests.RequestException: If a geocoding request fails; errors from
            _reverse_geocode are not caught here.
    """
    prov_abbr = _normalize_province(province)
    base_lat, base_lon, chosen_city = _pick_coords(prov_abbr, city)

    street = ""
    city_name = ""
    postcode = ""
    for attempt in range(max_attempts):
        lat, lon = _random_near(base_lat, base_lon)
        data = _reverse_geocode(lat, lon)
        addr = data.get("address", {})

        house = addr.get("house_number")
        road = addr.get("road") or addr.get("residential") or addr.get("footway")
        city_name = addr.get("city") or addr.get("town") or addr.get("village") or chosen_city
        postcode = addr.get("postcode") or ""

        # Build the street line from the address components themselves. Taking
        # the first comma-separated segment of the formatted address returned
        # the city name (or "<abbr> <postcode>") whenever the formatted address
        # was in its partial form.
        if house and road:
            street = f"{house} {road}"
        elif road:
            street = road
        else:
            # Nothing street-like is known: keep the legacy value so the field
            # is never emptier than it used to be.
            street = _format_address(addr, prov_abbr).split(",")[0]

        if prov_abbr in REMOTE_PROVINCES:
            # Remote provinces accept partial addresses.
            break
        if house and road and city_name:
            break
        # Pause only when another request will actually follow.
        if attempt < max_attempts - 1:
            time.sleep(sleep_sec)

    firstname, lastname = _random_name()
    full_name = f"{firstname} {lastname}"
    birthday = _random_birthday()
    phone = _random_phone(prov_abbr)
    return {
        "firstname": firstname,
        "lastname": lastname,
        "full_name": full_name,
        "birthday": birthday,
        "address_str": street,
        "city_name": city_name,
        "phone": phone,
        "postcode": postcode,
        # Reverse lookup: abbreviation back to the full province name.
        "province": next((k for k, v in CA_PROVINCE_ABBR.items() if v == prov_abbr), prov_abbr),
    }
def get_random_canada_info(province: Optional[str] = None, city: Optional[str] = None) -> Dict[str, str]:
    """Generate random Canadian personal and address information.

    Both arguments are optional. When ``province`` is None a province is chosen
    at random from CA_PROVINCE_ABBR; when ``city`` is None a random base point
    within the province is used. Explicit values are passed through unchanged,
    so existing two-argument calls behave as before.

    Args:
        province: Province as a full name or abbreviation, or None for random.
        city: City name, or None for random.

    Returns:
        The dict produced by generate_canada_info (firstname, lastname,
        full_name, birthday, address_str, city_name, phone, postcode, province).
    """
    if province is None:
        province = random.choice(list(CA_PROVINCE_ABBR.values()))
    return generate_canada_info(province, city)
def main() -> None:
    """Demo entry point: print random information for Calgary, Alberta.

    Change the arguments to try another province or city.
    """
    print(generate_canada_info("Alberta", "Calgary"))


if __name__ == "__main__":
    main()