# Files
# us_youtube_auto/spider/work.py
# 2025-12-12 14:40:04 +08:00
#
# 1052 lines
# 37 KiB
# Python
# Raw Permalink Blame History
#
# This file contains ambiguous Unicode characters
#
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import random
import time
from datetime import date, timedelta
from typing import Optional, Dict
import re
import requests
try:
    from bit_browser import retry
except ImportError:
    import functools

    def retry(max_retries: int = 3, delay: float = 1.0, backoff: float = 1.0):
        """
        Minimal local retry decorator used when ``bit_browser`` is not installed.

        Parameters:
            max_retries (int): Maximum number of attempts in total; values below 1
                still result in a single attempt.
            delay (float): Seconds to sleep before the first retry.
            backoff (float): Multiplier applied to the delay after every retry.
        Returns:
            Callable: A decorator that re-invokes the wrapped function whenever it
            raises ``Exception`` and re-raises the last error once the attempts
            are exhausted.
        """
        def _decorator(func):
            # Preserve __name__/__doc__ so decorated functions stay introspectable.
            @functools.wraps(func)
            def _wrapper(*args, **kwargs):
                tries = 0
                cur_delay = delay
                while True:
                    try:
                        return func(*args, **kwargs)
                    except Exception:
                        tries += 1
                        if tries >= max_retries:
                            # Out of attempts: propagate the original error.
                            raise
                        time.sleep(cur_delay)
                        cur_delay *= backoff
            return _wrapper
        return _decorator
# Canadian province/territory full name -> two-letter abbreviation.
CA_PROVINCE_ABBR = {
    "Alberta": "AB",
    "British Columbia": "BC",
    "Manitoba": "MB",
    "New Brunswick": "NB",
    "Newfoundland and Labrador": "NL",
    "Nova Scotia": "NS",
    "Ontario": "ON",
    "Prince Edward Island": "PE",
    "Quebec": "QC",
    "Saskatchewan": "SK",
    "Northwest Territories": "NT",
    "Nunavut": "NU",
    "Yukon": "YT",
}
# Province abbreviation -> list of (latitude, longitude, label) base points
# used as centres for random coordinate generation.
# NOTE(review): the second NT entry reuses the label "Yellowknife" with
# different coordinates, and the second NU/YT entries are labelled with the
# territory name rather than a city — confirm these are intentional.
# NOTE(review): the "East St Paul" latitude is identical to Regina's
# (50.445211) — verify the coordinate.
CA_COORDS = {
    "AB": [(51.044733, -114.071883, "Calgary"), (53.546124, -113.493823, "Edmonton")],
    "BC": [(49.282729, -123.120738, "Vancouver"), (48.428421, -123.365644, "Victoria")],
    "MB": [(49.895137, -97.138374, "Winnipeg"), (50.445211, -96.823611, "East St Paul")],
    "NB": [(45.963589, -66.643115, "Fredericton"), (46.510712, -67.255044, "Woodstock")],
    "NL": [(53.135509, -57.660435, "Labrador City"), (47.561510, -52.712585, "St. John's")],
    "NS": [(44.648862, -63.575320, "Halifax"), (45.010474, -63.416817, "Truro")],
    "ON": [(43.653225, -79.383186, "Toronto"), (45.421532, -75.697189, "Ottawa")],
    "PE": [(46.238240, -63.131074, "Charlottetown"), (46.392410, -63.787629, "Summerside")],
    "QC": [(45.501689, -73.567256, "Montreal"), (46.813878, -71.207980, "Quebec City")],
    "SK": [(52.133214, -106.670046, "Saskatoon"), (50.445211, -104.618896, "Regina")],
    "NT": [(62.4540, -114.3725, "Yellowknife"), (61.251955, -114.352482, "Yellowknife")],
    "NU": [(63.7467, -68.5167, "Iqaluit"), (64.282327, -76.614813, "Nunavut")],
    "YT": [(60.7212, -135.0568, "Whitehorse"), (64.000000, -138.000000, "Yukon")],
}
# Province abbreviation -> telephone area codes used for phone generation.
# "780" is included for Alberta (CITY_AREA_CODES already lists it for Edmonton)
# and "639" is listed only under Saskatchewan, the province it is assigned to.
CA_AREA_CODES = {
    "AB": ["403", "587", "780", "825"],
    "BC": ["236", "250", "604", "672", "778"],
    "MB": ["204", "431"],
    "NB": ["506"],
    "NL": ["709"],
    "NS": ["782", "902"],
    "ON": ["226", "249", "289", "343", "365", "416", "437", "519", "548", "613", "647", "705", "807", "905"],
    "PE": ["902"],
    "QC": ["418", "438", "450", "514", "579", "581", "819", "873"],
    "SK": ["306", "639"],
    "NT": ["867"],
    "NU": ["867"],
    "YT": ["867"],
}
# Area codes of major cities (a finer, city-level constraint that takes
# precedence over the province-level list when the city is known).
CITY_AREA_CODES = {
    "Calgary": ["403", "587", "825"],
    "Edmonton": ["780", "587", "825"],
    "Vancouver": ["604", "778", "236", "672"],
    "Halifax": ["902", "782"],
    "Toronto": ["416", "647", "437"],
}
# Valid first letters of a postal code, keyed by province abbreviation.
POSTAL_PREFIXES = {
    "AB": {"T"},
    "BC": {"V"},
    "MB": {"R"},
    "NB": {"E"},
    "NL": {"A"},
    "NS": {"B"},
    "ON": {"K", "L", "M"},
    "PE": {"C"},
    "QC": {"G", "H", "J"},
    "SK": {"S"},
    "NT": {"X"},
    "NU": {"X"},
    "YT": {"Y"},
}
# Provinces/territories for which generate_canada_info accepts the first
# geocoding result even when it is only a partial address.
REMOTE_PROVINCES = {"NL", "NT", "NU", "YT"}
def _normalize_province(province: str) -> str:
"""
省份入参规范化,支持全称或缩写,返回缩写
参数:
province (str): 省份,可为全称或缩写(如 "Alberta""AB"
返回值:
str: 省份缩写(如 "AB"
"""
if not province:
raise ValueError("province 不能为空")
p = province.strip()
if len(p) == 2:
return p.upper()
return CA_PROVINCE_ABBR.get(p, p)
def _pick_coords(province_abbr: str, city: Optional[str]) -> tuple[float, float, str]:
    """
    Choose a base coordinate for a province, optionally constrained to a city.

    Parameters:
        province_abbr (str): Province abbreviation.
        city (Optional[str]): City name (e.g. "Calgary"); may be empty.
    Returns:
        (lat, lon, city_name): The selected base coordinate and its label.
    """
    candidates = CA_COORDS.get(province_abbr)
    if not candidates:
        # Unknown province: fall back to Calgary.
        return 51.044733, -114.071883, "Calgary"
    wanted = city.strip().lower() if city else None
    if wanted is not None:
        match = next((entry for entry in candidates if entry[2].lower() == wanted), None)
        if match is not None:
            return match
    # No (matching) city requested: pick any configured point of the province.
    return random.choice(candidates)
def _random_near(lat: float, lon: float) -> tuple[float, float]:
"""
在给定坐标附近生成一个随机偏移坐标
参数:
lat (float): 基准纬度
lon (float): 基准经度
返回值:
(new_lat, new_lon): 随机偏移后的坐标
"""
return lat + (random.random() - 0.5) * 0.1, lon + (random.random() - 0.5) * 0.1
@retry(max_retries=3, delay=1.0, backoff=1.0)
def _reverse_geocode(lat: float, lon: float) -> Dict:
    """
    Reverse-geocode a coordinate through the Nominatim HTTP API.

    Parameters:
        lat (float): Latitude.
        lon (float): Longitude.
    Returns:
        dict: The decoded JSON response (callers read its "address" field).
    Raises:
        requests.HTTPError: On a non-success HTTP status (raise_for_status).
    """
    endpoint = (
        "https://nominatim.openstreetmap.org/reverse"
        f"?format=json&lat={lat}&lon={lon}&zoom=18&addressdetails=1"
    )
    response = requests.get(endpoint, headers={"User-Agent": "ca_auto_table/1.0"}, timeout=15)
    response.raise_for_status()
    return response.json()
def _format_address(address: Dict, province_abbr: str) -> str:
"""
将 Nominatim 的 address 格式化为完整地址字符串
参数:
address (dict): Nominatim 返回的 address 字段
province_abbr (str): 省份缩写(如 "AB"
返回值:
str: 格式化后的地址字符串
"""
house = address.get("house_number")
road = address.get("road") or address.get("residential") or address.get("footway")
city = address.get("city") or address.get("town") or address.get("village")
postcode = address.get("postcode") or ""
if house and road and city:
return f"{house} {road}, {city}, {province_abbr} {postcode}, Canada"
# 远端省份允许部分地址
return f"{city or ''}, {province_abbr} {postcode}, Canada".strip(", ")
def _random_name() -> tuple[str, str]:
"""
生成随机英文名Firstname, Lastname组合空间可达数百万以上
实现策略:
- 60% 概率使用常见英文名与姓氏列表(更自然)
- 40% 概率使用音节组合算法动态生成(数量级远超百万)
返回值:
(firstname, lastname)
"""
common_first = [
"James", "Mary", "Robert", "Patricia", "John", "Jennifer", "Michael", "Linda", "William", "Elizabeth",
"David", "Barbara", "Richard", "Susan", "Joseph", "Jessica", "Thomas", "Sarah", "Charles", "Karen",
"Christopher", "Nancy", "Daniel", "Lisa", "Matthew", "Betty", "Anthony", "Margaret", "Mark", "Sandra",
"Donald", "Ashley", "Steven", "Kimberly", "Paul", "Emily", "Andrew", "Donna", "Joshua", "Michelle",
"Kenneth", "Dorothy", "Kevin", "Carol", "Brian", "Amanda", "George", "Melissa", "Edward", "Deborah",
"Ronald", "Stephanie", "Timothy", "Rebecca", "Jason", "Laura", "Jeffrey", "Sharon", "Ryan", "Cynthia",
"Jacob", "Kathleen", "Gary", "Amy", "Nicholas", "Shirley", "Eric", "Angela", "Stephen", "Helen",
"Jonathan", "Anna", "Larry", "Brenda", "Justin", "Pamela", "Scott", "Nicole", "Brandon", "Samantha",
"Frank", "Katherine", "Benjamin", "Christine", "Gregory", "Emma", "Raymond", "Ruth", "Samuel", "Julie",
"Patrick", "Olivia", "Alexander", "Victoria"
]
common_last = [
"Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez",
"Hernandez", "Lopez", "Gonzalez", "Wilson", "Anderson", "Thomas", "Taylor", "Moore", "Jackson", "Martin",
"Lee", "Perez", "Thompson", "White", "Harris", "Sanchez", "Clark", "Ramirez", "Lewis", "Robinson",
"Walker", "Young", "Allen", "King", "Wright", "Scott", "Torres", "Nguyen", "Hill", "Flores",
"Green", "Adams", "Nelson", "Baker", "Hall", "Rivera", "Campbell", "Mitchell", "Carter", "Roberts",
"Turner", "Phillips", "Parker", "Evans", "Edwards", "Collins", "Stewart", "Sanchez", "Morris", "Rogers",
"Reed", "Cook", "Morgan", "Bell", "Murphy", "Bailey", "Cooper", "Richardson", "Cox", "Howard",
"Ward", "Torres", "Peterson", "Gray", "Ramirez", "James", "Watson", "Brooks", "Kelly", "Sanders",
"Price", "Bennett", "Wood", "Barnes", "Ross", "Henderson", "Coleman", "Jenkins", "Perry", "Powell",
"Long", "Patterson", "Hughes", "Flores"
]
if random.random() < 0.6:
return random.choice(common_first), random.choice(common_last)
# 动态音节组合生成,支持数百万组合
f_beg = [
"al", "ben", "car", "dan", "el", "fran", "ge", "har", "isa", "jo", "ka", "li", "mar", "no",
"ol", "pa", "qui", "ra", "sa", "ta", "ul", "vi", "wil", "xa", "ya", "zo"
]
f_mid = [
"a", "e", "i", "o", "u", "ae", "ai", "ia", "ie", "oa", "ou"
]
f_end = [
"n", "ne", "na", "son", "ton", "la", "ra", "rie", "ry", "ley", "ly", "ah"
]
l_beg = [
"sm", "john", "dav", "wil", "and", "tho", "tay", "mo", "jack", "mar", "lee", "tho", "whi", "har",
"san", "cla", "ram", "lew", "rob", "walk", "young", "all", "king", "wri", "scott", "tor", "nguy",
"hil", "flo", "gre", "ada", "nel", "bak", "hal", "riv", "camp", "mit", "car", "rob"
]
l_mid = [
"a", "e", "i", "o", "u", "ar", "er", "or", "an", "en", "in", "on", "un"
]
l_suf = [
"son", "ton", "man", "ley", "ford", "wood", "well", "er", "ers", "ing", "s", "son", "es"
]
def build_name(beg, mid, end, syllables=(2, 3)) -> str:
parts = [random.choice(beg)]
for _ in range(random.choice(syllables) - 1):
parts.append(random.choice(mid))
parts.append(random.choice(end))
name = "".join(parts)
return name.capitalize()
first = build_name(f_beg, f_mid, f_end)
last = build_name(l_beg, l_mid, l_suf)
return first, last
def _random_birthday() -> str:
"""
生成随机生日,格式为 yyyy-mm-dd
返回值:
str: 生日字符串
"""
start = date(1950, 1, 1)
end = date(2000, 12, 31)
delta_days = (end - start).days
d = start + timedelta(days=random.randint(0, delta_days))
return f"{d.year}-{d.month:02d}-{d.day:02d}"
def _random_phone(province_abbr: str) -> str:
    """
    Build a random Canadian phone number using one of the province's area codes.

    Parameters:
        province_abbr (str): Province abbreviation.
    Returns:
        str: Phone number such as "(403) 555-1234"; area code "000" is used
        when the province has no configured codes.
    """
    area = random.choice(CA_AREA_CODES.get(province_abbr, ["000"]))
    exchange = random.randint(200, 899)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange:03d}-{line:04d}"
def _random_phone_city(province_abbr: str, city: Optional[str]) -> str:
    """
    Build a random Canadian phone number, preferring city-level area codes.

    The city is matched against CITY_AREA_CODES ignoring case and surrounding
    whitespace (the same tolerance _pick_coords applies); when the city is
    missing or not configured, the province-level codes are used instead.

    Parameters:
        province_abbr (str): Province abbreviation.
        city (Optional[str]): City name.
    Returns:
        str: Phone number such as "(403) 555-1234"; area code "000" is used
        when neither the city nor the province has configured codes.
    """
    codes = None
    if city:
        wanted = city.strip().casefold()
        codes = next(
            (city_codes for name, city_codes in CITY_AREA_CODES.items() if name.casefold() == wanted),
            None,
        )
    codes = codes or CA_AREA_CODES.get(province_abbr, ["000"])
    area = random.choice(codes)
    exchange = random.randint(200, 899)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange:03d}-{line:04d}"
def _postal_valid_for_province(province_abbr: str, postcode: str) -> bool:
"""
校验邮编首字母是否符合省份规范
参数:
province_abbr (str): 省份缩写
postcode (str): 邮编字符串
返回值:
bool: 合法返回 True否则 False
"""
if not postcode:
return False
prefixes = POSTAL_PREFIXES.get(province_abbr)
if not prefixes:
return True
return postcode[0].upper() in prefixes
def generate_canada_info(province: str, city: Optional[str] = None, max_attempts: int = 15, sleep_sec: float = 0.6) -> Dict[str, str]:
    """
    Generate random Canadian personal data with a reverse-geocoded address.

    Parameters:
        province (str): Province as full name or abbreviation (e.g. "Alberta" or "AB").
        city (Optional[str]): City (e.g. "Calgary"); when omitted a configured
            city of the province is chosen at random.
        max_attempts (int): Maximum number of reverse-geocoding attempts.
        sleep_sec (float): Seconds to wait after an unsuccessful attempt
            (to respect the Nominatim rate limit).
    Returns:
        dict: firstname, lastname, full_name, birthday, address_str, city_name,
        phone, postcode and province (full name when known).
    """
    prov_abbr = _normalize_province(province)
    base_lat, base_lon, chosen_city = _pick_coords(prov_abbr, city)
    address_str = city_name = postcode = ""

    for _attempt in range(max_attempts):
        data = _reverse_geocode(*_random_near(base_lat, base_lon))
        if data:
            addr = data.get("address", {})
            city_name = addr.get("city") or addr.get("town") or addr.get("village") or chosen_city
            postcode = addr.get("postcode") or ""
            address_str = _format_address(addr, prov_abbr)
            if prov_abbr in REMOTE_PROVINCES:
                # Remote provinces: accept whatever the first lookup returns.
                break
            has_street = addr.get("house_number") and (
                addr.get("road") or addr.get("residential") or addr.get("footway")
            )
            if has_street and city_name and _postal_valid_for_province(prov_abbr, postcode):
                break
        time.sleep(sleep_sec)

    firstname, lastname = _random_name()
    birthday = _random_birthday()
    phone = _random_phone_city(prov_abbr, city or chosen_city)
    abbr_to_name = {abbr: name for name, abbr in CA_PROVINCE_ABBR.items()}
    # NOTE(review): when only a partial address was found, the first
    # comma-separated component is the city (or "<abbr> <postcode>"), not a
    # street line — confirm this is acceptable for callers.
    street_line = address_str.split(",")[0]
    return {
        "firstname": firstname,
        "lastname": lastname,
        "full_name": f"{firstname} {lastname}",
        "birthday": birthday,
        "address_str": street_line,
        "city_name": city_name,
        "phone": phone,
        "postcode": postcode,
        "province": abbr_to_name.get(prov_abbr, prov_abbr),
    }
def get_random_canada_info(province: str, city: Optional[str] = None) -> Dict[str, str]:
    """
    Generate Canadian personal and address data locally (no network access).

    Parameters:
        province (str): Province as full name or abbreviation (e.g. "Alberta" or "AB").
        city (Optional[str]): City (e.g. "Calgary"); when omitted or empty, a
            configured city of the province is picked at random.
    Returns:
        dict: firstname, lastname, full_name, birthday, address_str (street
        line), city_name, phone, postcode and province (full name when known).
    """
    # Letters used for the generated postal code (also the fallback set for
    # the first letter when the province has no configured prefixes).
    postal_letters = "ABCEGHJKLMNPRSTVXY"

    def _random_street() -> str:
        """Return a made-up street line such as '123 Maple Ave'."""
        house = random.randint(10, 9999)
        street_roots = [
            "Maple", "Oak", "Pine", "Cedar", "Elm", "Birch", "Willow", "Spruce", "Ash",
            "River", "Lake", "Hill", "Queen", "King", "Main", "Victoria", "Wellington",
            "Church", "College", "Centre",
        ]
        suffixes = ["St", "Ave", "Rd", "Blvd", "Dr", "Ct", "Pl", "Ln", "Way", "Terrace"]
        return f"{house} {random.choice(street_roots)} {random.choice(suffixes)}"

    def _random_letter() -> str:
        return random.choice(postal_letters)

    def _random_digit() -> str:
        return str(random.randint(0, 9))

    def _random_postal(p_abbr: str) -> str:
        """Return an 'A1A 1A1' postal code whose first letter fits the province."""
        prefixes = POSTAL_PREFIXES.get(p_abbr) or set(postal_letters)
        # Sort so the choice does not depend on set iteration order.
        first_letter = random.choice(sorted(prefixes))
        return (
            f"{first_letter}{_random_digit()}{_random_letter()} "
            f"{_random_digit()}{_random_letter()}{_random_digit()}"
        )

    prov_abbr = _normalize_province(province)
    _, _, chosen_city = _pick_coords(prov_abbr, city)
    firstname, lastname = _random_name()
    birthday = _random_birthday()
    phone = _random_phone_city(prov_abbr, city or chosen_city)
    address_str = _random_street()
    postcode = _random_postal(prov_abbr)
    province_full = next((k for k, v in CA_PROVINCE_ABBR.items() if v == prov_abbr), prov_abbr)
    return {
        "firstname": firstname,
        "lastname": lastname,
        "full_name": f"{firstname} {lastname}",
        "birthday": birthday,
        "address_str": address_str,
        "city_name": city or chosen_city,
        "phone": phone,
        "postcode": postcode,
        "province": province_full,
    }
# US state full name -> two-letter postal abbreviation (50 states).
US_STATE_ABBR = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}
# State abbreviation -> list of (latitude, longitude, city) base points.
# Only a subset of the states in US_STATE_ABBR is configured here; lookups
# for other states fall back to a default point in _us_pick_coords.
US_COORDS = {
    "CA": [(34.052235, -118.243683, "Los Angeles"), (37.774929, -122.419416, "San Francisco")],
    "NY": [(40.712776, -74.005974, "New York"), (42.886447, -78.878369, "Buffalo")],
    "TX": [(29.760427, -95.369804, "Houston"), (32.776665, -96.796989, "Dallas")],
    "FL": [(25.761681, -80.191788, "Miami"), (28.538336, -81.379234, "Orlando")],
    "IL": [(41.878113, -87.629799, "Chicago"), (39.781721, -89.650148, "Springfield")],
    "WA": [(47.606209, -122.332069, "Seattle"), (47.658779, -117.426047, "Spokane")],
    "MA": [(42.360082, -71.058880, "Boston"), (42.262593, -71.802293, "Worcester")],
    "PA": [(39.952583, -75.165222, "Philadelphia"), (40.440624, -79.995888, "Pittsburgh")],
    "AZ": [(33.448376, -112.074036, "Phoenix"), (32.222607, -110.974711, "Tucson")],
    "GA": [(33.748997, -84.387985, "Atlanta"), (32.080898, -81.091203, "Savannah")],
    "OH": [(39.961178, -82.998795, "Columbus"), (41.499321, -81.694359, "Cleveland")],
    "NC": [(35.227085, -80.843124, "Charlotte"), (35.779590, -78.638179, "Raleigh")],
    "MI": [(42.331427, -83.045754, "Detroit"), (42.963240, -85.668086, "Grand Rapids")],
    "CO": [(39.739236, -104.990251, "Denver"), (38.833881, -104.821363, "Colorado Springs")],
    "VA": [(37.540725, -77.436048, "Richmond"), (36.852926, -75.977985, "Virginia Beach")],
    "NJ": [(40.735657, -74.172366, "Newark"), (40.717754, -74.043143, "Jersey City")],
    "MD": [(39.290385, -76.612189, "Baltimore"), (39.083997, -77.152757, "Rockville")],
    "MN": [(44.977753, -93.265011, "Minneapolis"), (44.953703, -93.089958, "Saint Paul")],
    "WI": [(43.038902, -87.906474, "Milwaukee"), (43.073051, -89.401230, "Madison")],
    "MO": [(38.627003, -90.199404, "St. Louis"), (39.099724, -94.578331, "Kansas City")],
    "IN": [(39.768403, -86.158068, "Indianapolis"), (41.079273, -85.139351, "Fort Wayne")],
    "TN": [(36.162664, -86.781602, "Nashville"), (35.149532, -90.048981, "Memphis")],
    "OR": [(45.515232, -122.678385, "Portland"), (44.942898, -123.035095, "Salem")],
    "NV": [(36.169941, -115.139830, "Las Vegas"), (39.529633, -119.813803, "Reno")],
}
# City name -> telephone area codes used when generating a phone number
# for that city.
# NOTE(review): "Springfield" appears in US_COORDS but has no entry here, so
# it contributes no codes to US_AREA_CODES["IL"] — confirm whether intended.
US_CITY_AREA_CODES = {
    "Los Angeles": ["213", "310", "323", "424", "661"],
    "San Francisco": ["415", "628"],
    "New York": ["212", "347", "718", "929", "646"],
    "Buffalo": ["716"],
    "Houston": ["713", "281", "832"],
    "Dallas": ["214", "469", "972"],
    "Miami": ["305", "786"],
    "Orlando": ["407", "689"],
    "Chicago": ["312", "773", "872"],
    "Seattle": ["206"],
    "Spokane": ["509"],
    "Boston": ["617", "857"],
    "Worcester": ["508", "774"],
    "Philadelphia": ["215", "267", "445"],
    "Pittsburgh": ["412", "878"],
    "Phoenix": ["602", "480", "623"],
    "Tucson": ["520"],
    "Atlanta": ["404", "470", "678", "770"],
    "Savannah": ["912"],
    "Columbus": ["614", "380"],
    "Cleveland": ["216", "440"],
    "Charlotte": ["704", "980"],
    "Raleigh": ["919", "984"],
    "Detroit": ["313", "734", "586"],
    "Grand Rapids": ["616"],
    "Denver": ["303", "720"],
    "Colorado Springs": ["719"],
    "Richmond": ["804"],
    "Virginia Beach": ["757"],
    "Newark": ["973", "862"],
    "Jersey City": ["201", "551"],
    "Baltimore": ["410", "443", "667"],
    "Rockville": ["240", "301"],
    "Minneapolis": ["612"],
    "Saint Paul": ["651"],
    "Milwaukee": ["414"],
    "Madison": ["608"],
    "St. Louis": ["314", "636"],
    "Kansas City": ["816"],
    "Indianapolis": ["317", "463"],
    "Fort Wayne": ["260"],
    "Nashville": ["615", "629"],
    "Memphis": ["901"],
    "Portland": ["503", "971"],
    "Salem": ["503"],
    "Las Vegas": ["702", "725"],
    "Reno": ["775"],
}
# State abbreviation -> sorted, de-duplicated union of the area codes of the
# state's cities listed in US_COORDS (derived, so it stays in sync with the
# two tables above).
US_AREA_CODES = {
    abbr: sorted({code for _, _, city in cities for code in US_CITY_AREA_CODES.get(city, [])})
    for abbr, cities in US_COORDS.items()
}
# State abbreviation -> inclusive (low, high) range of the first three ZIP
# digits; _us_random_zip_for_state zero-pads the value to three digits
# (e.g. 10 -> "010").
US_ZIP_RANGES = {
    "CA": (900, 961),
    "NY": (100, 149),
    "TX": (750, 799),
    "FL": (320, 349),
    "IL": (600, 629),
    "WA": (980, 994),
    "MA": (10, 27),
    "PA": (150, 196),
    "AZ": (850, 865),
    "GA": (300, 319),
    "OH": (430, 459),
    "NC": (270, 289),
    "MI": (480, 499),
    "CO": (800, 816),
    "VA": (220, 246),
    "NJ": (70, 89),
    "MD": (206, 219),
    "MN": (550, 567),
    "WI": (530, 549),
    "MO": (630, 658),
    "IN": (460, 479),
    "TN": (370, 385),
    "OR": (970, 979),
    "NV": (889, 898),
}
def _normalize_state(state: str) -> str:
"""
州入参规范化,支持全称或缩写,返回缩写
参数:
state (str): 州名,可为全称或缩写(如 "California""CA"
返回值:
str: 州缩写(如 "CA"
"""
if not state:
raise ValueError("state 不能为空")
s = state.strip()
if len(s) == 2:
return s.upper()
return US_STATE_ABBR.get(s, s)
def _us_pick_coords(state_abbr: str, city: Optional[str]) -> tuple[float, float, str]:
    """
    Choose a base coordinate for a state, optionally constrained to a city.

    Parameters:
        state_abbr (str): State abbreviation.
        city (Optional[str]): City name (e.g. "Los Angeles"); may be empty.
    Returns:
        (lat, lon, city_name): The selected base coordinate and its city.
    """
    candidates = US_COORDS.get(state_abbr)
    if not candidates:
        # State without configured points: fall back to New York.
        return 40.712776, -74.005974, "New York"
    wanted = city.strip().lower() if city else None
    if wanted is not None:
        match = next((entry for entry in candidates if entry[2].lower() == wanted), None)
        if match is not None:
            return match
    return random.choice(candidates)
def _us_format_address(address: Dict, state_abbr: str) -> str:
"""
将 Nominatim 的 address 格式化为美国地址字符串
参数:
address (dict): Nominatim 返回的 address 字段
state_abbr (str): 州缩写(如 "CA"
返回值:
str: 格式化后的地址字符串
"""
house = address.get("house_number")
road = address.get("road") or address.get("residential") or address.get("footway")
city = address.get("city") or address.get("town") or address.get("village")
postcode = address.get("postcode") or ""
if house and road and city:
return f"{house} {road}, {city}, {state_abbr} {postcode}, United States"
return f"{city or ''}, {state_abbr} {postcode}, United States".strip(", ")
def _us_random_phone_state(state_abbr: str, city: Optional[str]) -> str:
    """
    Build a random US phone number, preferring city-level area codes.

    The city is matched against US_CITY_AREA_CODES ignoring case and
    surrounding whitespace (the same tolerance _us_pick_coords applies); when
    the city is missing or not configured, the state-level codes are used.

    Parameters:
        state_abbr (str): State abbreviation.
        city (Optional[str]): City name.
    Returns:
        str: Phone number such as "(213) 555-1234"; area code "000" is used
        when neither the city nor the state has configured codes.
    """
    codes = None
    if city:
        wanted = city.strip().casefold()
        codes = next(
            (city_codes for name, city_codes in US_CITY_AREA_CODES.items() if name.casefold() == wanted),
            None,
        )
    codes = codes or US_AREA_CODES.get(state_abbr, ["000"])
    area = random.choice(codes)
    exchange = random.randint(200, 899)
    line = random.randint(1000, 9999)
    return f"({area}) {exchange:03d}-{line:04d}"
def _us_random_zip_for_state(state_abbr: str) -> str:
    """
    Build a random five-digit ZIP code within the state's configured prefix range.

    Parameters:
        state_abbr (str): State abbreviation.
    Returns:
        str: ZIP code such as "90012"; states without a configured range get
        a three-digit prefix drawn from 100-999.
    """
    bounds = US_ZIP_RANGES.get(state_abbr)
    low, high = bounds if bounds else (100, 999)
    prefix = random.randint(low, high)
    suffix = random.randint(0, 99)
    return f"{prefix:03d}{suffix:02d}"
def generate_us_info(state: str, city: Optional[str] = None, max_attempts: int = 15, sleep_sec: float = 0.6) -> Dict[str, str]:
    """
    Generate random US personal data with a reverse-geocoded address.

    Parameters:
        state (str): State as full name or abbreviation (e.g. "California" or "CA").
        city (Optional[str]): City (e.g. "Los Angeles"); when omitted a
            configured city of the state is chosen at random.
        max_attempts (int): Maximum number of reverse-geocoding attempts.
        sleep_sec (float): Seconds to wait after an unsuccessful attempt
            (to respect the Nominatim rate limit).
    Returns:
        dict: firstname, lastname, full_name, birthday, address_str, city_name,
        phone, postcode and state (full name when known).
    """
    state_abbr = _normalize_state(state)
    base_lat, base_lon, chosen_city = _us_pick_coords(state_abbr, city)
    address_str = city_name = postcode = ""

    for _attempt in range(max_attempts):
        data = _reverse_geocode(*_random_near(base_lat, base_lon))
        if data:
            addr = data.get("address", {})
            city_name = addr.get("city") or addr.get("town") or addr.get("village") or chosen_city
            postcode = addr.get("postcode") or ""
            address_str = _us_format_address(addr, state_abbr)
            has_street = addr.get("house_number") and (
                addr.get("road") or addr.get("residential") or addr.get("footway")
            )
            # Accept only a complete street address with a ZIP or ZIP+4 code.
            if has_street and city_name and re.fullmatch(r"\d{5}(-\d{4})?", postcode or ""):
                break
        time.sleep(sleep_sec)

    firstname, lastname = _random_name()
    birthday = _random_birthday()
    phone = _us_random_phone_state(state_abbr, city or chosen_city)
    abbr_to_name = {abbr: name for name, abbr in US_STATE_ABBR.items()}
    # NOTE(review): when only a partial address was found, the first
    # comma-separated component is the city (or "<abbr> <postcode>"), not a
    # street line — confirm this is acceptable for callers.
    street_line = address_str.split(",")[0]
    return {
        "firstname": firstname,
        "lastname": lastname,
        "full_name": f"{firstname} {lastname}",
        "birthday": birthday,
        "address_str": street_line,
        "city_name": city_name,
        "phone": phone,
        "postcode": postcode,
        "state": abbr_to_name.get(state_abbr, state_abbr),
    }
def get_random_us_info(state: str, city: Optional[str] = None) -> Dict[str, str]:
    """
    Generate US personal and address data locally (no network access).

    Parameters:
        state (str): State as full name or abbreviation (e.g. "California" or "CA").
        city (Optional[str]): City (e.g. "Los Angeles"); when omitted or empty,
            a configured city of the state is picked at random.
    Returns:
        dict: firstname, lastname, full_name, birthday, address_str (street
        line), city_name, phone, postcode and state (full name when known).
    """
    def _random_street_us() -> str:
        """Return a made-up street line such as '123 Maple Ave'."""
        house = random.randint(10, 9999)
        street_roots = [
            "Maple", "Oak", "Pine", "Cedar", "Elm", "Birch", "Willow", "Spruce", "Ash",
            "River", "Lake", "Hill", "Queen", "King", "Main", "Washington", "Lincoln",
            "Church", "College", "Center",
        ]
        suffixes = ["St", "Ave", "Rd", "Blvd", "Dr", "Ct", "Pl", "Ln", "Way", "Terrace"]
        return f"{house} {random.choice(street_roots)} {random.choice(suffixes)}"

    state_abbr = _normalize_state(state)
    _, _, chosen_city = _us_pick_coords(state_abbr, city)
    firstname, lastname = _random_name()
    birthday = _random_birthday()
    phone = _us_random_phone_state(state_abbr, city or chosen_city)
    address_str = _random_street_us()
    postcode = _us_random_zip_for_state(state_abbr)
    state_full = next((k for k, v in US_STATE_ABBR.items() if v == state_abbr), state_abbr)
    return {
        "firstname": firstname,
        "lastname": lastname,
        "full_name": f"{firstname} {lastname}",
        "birthday": birthday,
        "address_str": address_str,
        "city_name": city or chosen_city,
        "phone": phone,
        "postcode": postcode,
        "state": state_full,
    }
def _random_birthday_by_age_range(min_age: int, max_age: int) -> str:
"""
按年龄区间生成随机生日,格式为 yyyy-mm-dd
参数:
min_age (int): 最小年龄(含)
max_age (int): 最大年龄(含)
返回值:
str: 生日字符串
"""
if min_age < 0:
min_age = 0
if max_age < min_age:
max_age = min_age
today = date.today()
start = today - timedelta(days=max_age * 365 + 366)
end = today - timedelta(days=min_age * 365)
delta_days = (end - start).days
d = start + timedelta(days=random.randint(0, max(delta_days, 1)))
return f"{d.year}-{d.month:02d}-{d.day:02d}"
def _random_date_between(start: date, end: date) -> str:
"""
在指定日期区间内生成随机日期,格式为 yyyy-mm-dd
参数:
start (date): 起始日期(含)
end (date): 结束日期(含)
返回值:
str: 随机日期字符串
"""
if end < start:
start, end = end, start
delta_days = (end - start).days
d = start + timedelta(days=random.randint(0, max(delta_days, 1)))
return f"{d.year}-{d.month:02d}-{d.day:02d}"
def generate_child_parent_names(
    enforce_period_under13: bool = True,
    period_start: str = "2013-07-01",
    period_end: str = "2020-04-01",
    min_child_age: int = 1,
    max_child_age: int = 17,
    min_parent_age: int = 25,
    max_parent_age: int = 65,
    country: str = "US",
    province_or_state: Optional[str] = None,
    city: Optional[str] = None,
    use_network: bool = False,
    separate_phones: bool = True,
) -> Dict[str, str]:
    """
    Generate a minor child and a parent who share a surname and an address.

    Parameters:
        enforce_period_under13 (bool): When True (default), the child's
            birthday is drawn so that the child is still under 13 on
            ``period_end``; the parent is then 20-45 years older than the child.
        period_start (str): ISO date; only validated, it does not influence
            the generated birthday. Default "2013-07-01".
        period_end (str): ISO date marking the end of the period. Default "2020-04-01".
        min_child_age (int): Minimum child age (used when the period rule is off).
        max_child_age (int): Maximum child age (used when the period rule is off).
        min_parent_age (int): Minimum parent age (used when the period rule is off).
        max_parent_age (int): Maximum parent age (used when the period rule is off).
        country (str): "US" (default) selects US data; any other value selects
            Canadian data, and an empty value is treated as "CA".
        province_or_state (Optional[str]): Province/state as full name or
            abbreviation; when omitted one with configured data is picked at random.
        city (Optional[str]): City; when omitted one is picked at random.
        use_network (bool): Use reverse geocoding for the address instead of
            local generation. Default False.
        separate_phones (bool): Generate a separate phone number for the child
            instead of reusing the parent's. Default True.
    Returns:
        dict: ``child_*`` and ``parent_*`` entries for firstname, lastname,
        full_name, birthday, address_str, city_name, phone, postcode and
        state (US) or province (Canada).
    Raises:
        ValueError: If ``period_start`` or ``period_end`` is not a valid ISO
            date while ``enforce_period_under13`` is True.
    """
    def _minus_years_safe(dt: date, years: int) -> date:
        # Feb 29 has no counterpart in non-leap years: use Feb 28 instead.
        try:
            return dt.replace(year=dt.year - years)
        except ValueError:
            return dt.replace(year=dt.year - years, day=28)

    parent_first, parent_last = _random_name()
    child_first, _ = _random_name()

    if enforce_period_under13:
        date.fromisoformat(period_start)  # validation only
        pe = date.fromisoformat(period_end)
        # Born strictly after (period_end - 13 years) => not yet 13 on period_end.
        child_min = _minus_years_safe(pe, 13) + timedelta(days=1)
        child_birthday = _random_date_between(child_min, pe)
        child_dt = date(*map(int, child_birthday.split("-")))
        # Derive the parent's birthday from the child's with a plausible gap.
        parent_dt = _minus_years_safe(child_dt, random.randint(20, 45))
        parent_birthday = f"{parent_dt.year}-{parent_dt.month:02d}-{parent_dt.day:02d}"
    else:
        child_birthday = _random_birthday_by_age_range(min_child_age, max_child_age)
        parent_birthday = _random_birthday_by_age_range(min_parent_age, max_parent_age)

    country = (country or "CA").upper()
    if country == "US":
        # Pick only among states that have coordinates/area codes configured;
        # other states would fall back to New York data under a foreign name.
        region = province_or_state or random.choice(list(US_COORDS))
        build_address = generate_us_info if use_network else get_random_us_info
        normalize_region, make_phone = _normalize_state, _us_random_phone_state
        region_key = "state"
    else:
        region = province_or_state or random.choice(list(CA_PROVINCE_ABBR.values()))
        build_address = generate_canada_info if use_network else get_random_canada_info
        normalize_region, make_phone = _normalize_province, _random_phone_city
        region_key = "province"

    addr_info: Dict[str, str] = build_address(region, city)
    parent_phone = addr_info.get("phone")
    child_phone = parent_phone
    if separate_phones:
        # Normalise so a full region name still resolves to its area codes.
        child_phone = make_phone(normalize_region(region), addr_info.get("city_name"))

    def _person(prefix: str, first: str, birthday: str, phone: Optional[str]) -> Dict[str, str]:
        # Both persons share the surname and every address field.
        return {
            f"{prefix}_firstname": first,
            f"{prefix}_lastname": parent_last,
            f"{prefix}_full_name": f"{first} {parent_last}",
            f"{prefix}_birthday": birthday,
            f"{prefix}_address_str": addr_info.get("address_str"),
            f"{prefix}_city_name": addr_info.get("city_name"),
            f"{prefix}_phone": phone,
            f"{prefix}_postcode": addr_info.get("postcode"),
            f"{prefix}_{region_key}": addr_info.get(region_key),
        }

    return {
        **_person("child", child_first, child_birthday, child_phone),
        **_person("parent", parent_first, parent_birthday, parent_phone),
    }
def main() -> None:
    """
    Demo: print random data for Calgary, Alberta (network-backed lookup).

    Change the arguments to try another province or city.
    """
    print(generate_canada_info("Alberta", "Calgary"))
if __name__ == "__main__":
    # Script entry point: print one locally generated child/parent record.
    # (main() is the alternative, network-backed Canadian demo.)
    info = generate_child_parent_names()
    print(info)