|
| 1 | +""" |
| 2 | +Scrape DLA Piper Data Protection handbook for global privacy law info. |
| 3 | +Outputs src/data/globalJurisdictions.json with country-level privacy law data. |
| 4 | +
|
| 5 | +Usage: |
| 6 | + python3 scripts/scrape_global_privacy.py |
| 7 | +""" |
| 8 | + |
| 9 | +import json |
| 10 | +import re |
| 11 | +import time |
| 12 | +import logging |
| 13 | +from pathlib import Path |
| 14 | + |
| 15 | +import requests |
| 16 | +from bs4 import BeautifulSoup |
| 17 | + |
# Module-level logger: timestamped, level-tagged lines on stderr.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
log = logging.getLogger(__name__)

# Landing page of the DLA Piper "Data Protection Laws of the World" handbook.
BASE_URL = "https://www.dlapiperdataprotection.com/"
# Output file: <repo root>/src/data/globalJurisdictions.json
# (parents[1] walks up from scripts/ to the repository root).
OUTPUT = Path(__file__).resolve().parents[1] / "src" / "data" / "globalJurisdictions.json"

# Browser-like User-Agent so the site serves the normal HTML page
# instead of blocking an obvious script client.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
}

# ISO 3166-1 alpha-2 to numeric mapping (for map highlighting).
# Numeric codes are kept as zero-padded strings to match topojson country ids.
ALPHA2_TO_NUMERIC = {
    "AL": "008", "DZ": "012", "AO": "024", "AR": "032", "AM": "051",
    "AU": "036", "AT": "040", "AZ": "031", "BS": "044", "BH": "048",
    "BD": "050", "BB": "052", "BY": "112", "BE": "056", "BJ": "204",
    "BM": "060", "BO": "068", "BA": "070", "BW": "072", "BR": "076",
    "BN": "096", "BG": "100", "BF": "854", "KH": "116", "CA": "124",
    "CV": "132", "KY": "136", "CL": "152", "CN": "156", "CO": "170",
    "CG": "178", "CD": "180", "CR": "188", "HR": "191", "CY": "196",
    "CZ": "203", "DK": "208", "DO": "214", "EC": "218", "EG": "818",
    "SV": "222", "EE": "233", "ET": "231", "FI": "246", "FR": "250",
    "DE": "276", "GH": "288", "GR": "300", "GT": "320", "HK": "344",
    "HU": "348", "IS": "352", "IN": "356", "ID": "360", "IE": "372",
    "IL": "376", "IT": "380", "JM": "388", "JP": "392", "JO": "400",
    "KZ": "398", "KE": "404", "KR": "410", "KW": "414", "LV": "428",
    "LT": "440", "LU": "442", "MO": "446", "MY": "458", "MT": "470",
    "MU": "480", "MX": "484", "MD": "498", "MA": "504", "MZ": "508",
    "NL": "528", "NZ": "554", "NG": "566", "NO": "578", "OM": "512",
    "PK": "586", "PA": "591", "PY": "600", "PE": "604", "PH": "608",
    "PL": "616", "PT": "620", "QA": "634", "RO": "642", "RU": "643",
    "RW": "646", "SA": "682", "SN": "686", "RS": "688", "SG": "702",
    "SK": "703", "SI": "705", "ZA": "710", "ES": "724", "LK": "144",
    "SE": "752", "CH": "756", "TW": "158", "TZ": "834", "TH": "764",
    "TN": "788", "TR": "792", "UG": "800", "UA": "804", "AE": "784",
    "GB": "826", "US": "840", "UY": "858", "VN": "704", "ZM": "894",
    "ZW": "716", "CI": "384", "GN": "324", "MG": "450", "NE": "562",
    "TG": "768", "TD": "148", "MW": "454", "NA": "516", "SZ": "748",
    "LS": "426", "GY": "328", "TT": "780", "BZ": "084", "HN": "340",
    "NI": "558", "CU": "192", "HT": "332", "MM": "104", "LA": "418",
    "NP": "524", "MN": "496", "GE": "268", "UZ": "860", "TM": "795",
    "KG": "417", "TJ": "762", "AF": "004", "IQ": "368", "SY": "760",
    "LB": "422", "LY": "434", "SD": "729", "ER": "232", "DJ": "262",
    "SO": "706", "YE": "887", "IR": "364", "CM": "120", "GA": "266",
    "GQ": "226", "CF": "140", "SS": "728",
}
| 63 | + |
| 64 | + |
def get_country_codes() -> list[dict]:
    """Extract country codes and names from the handbook's main page.

    Returns:
        A list of ``{"code": ..., "name": ...}`` dicts, de-duplicated by code.
    """
    resp = requests.get(BASE_URL, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    countries: list[dict] = []
    seen: set[str] = set()

    # Primary strategy: the country <select> dropdown on the landing page.
    for option in soup.select("select option"):
        value = option.get("value", "").strip()
        label = option.get_text(strip=True)
        usable = (
            value
            and label
            and len(value) <= 3
            and value not in seen
            and value != "Select an option"
        )
        if usable:
            seen.add(value)
            countries.append({"code": value, "name": label})

    # Fallback: mine code/name pairs out of inline JavaScript when the
    # dropdown yielded nothing (e.g. markup changed).
    if not countries:
        for match in re.finditer(r'"(\w{2,3})"\s*:\s*"([^"]+)"', resp.text):
            value, label = match.group(1), match.group(2)
            if value not in seen and len(label) > 2:
                seen.add(value)
                countries.append({"code": value, "name": label})

    return countries
| 90 | + |
| 91 | + |
def scrape_country(code: str, name: str) -> dict | None:
    """Scrape a single country page for privacy law data.

    Args:
        code: Handbook country code (usually ISO 3166-1 alpha-2).
        name: Human-readable country name (used only for logging/output).

    Returns:
        A dict with overview/authority/mainLaws/enforcement fields, or
        ``None`` when the page could not be fetched.
    """
    url = f"{BASE_URL}?c={code}"
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except Exception as e:  # network errors, timeouts, HTTP 4xx/5xx
        log.warning(f"Failed to fetch {name} ({code}): {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")
    main = soup.select_one("main.page-content") or soup

    # Group paragraph/list-item text under the nearest preceding h2/h3 heading.
    sections = {}
    current = None
    for el in main.find_all(["h2", "h3", "p", "li"]):
        if el.name in ("h2", "h3"):
            heading = el.get_text(strip=True)
            if 3 < len(heading) < 100:  # skip empty / oversized nav headings
                current = heading.lower()
                sections[current] = []
        elif current and el.name in ("p", "li"):
            text = el.get_text(strip=True)
            if text and len(text) > 15:  # skip tiny boilerplate fragments
                sections[current].append(text)

    # Extract key fields from the grouped sections.
    overview = ""
    authority = ""
    enforcement = ""
    laws = []

    for heading, texts in sections.items():
        combined = " ".join(texts[:5])[:600]
        if not combined:
            continue

        if any(k in heading for k in ("definition of personal data", "law", "authority")):
            if "authority" in heading and not authority:
                authority = combined
            elif not overview and "definition" in heading:
                overview = combined
            # BUG FIX: headings mentioning "law" previously matched this guard
            # but were never collected, so "mainLaws" was always empty.
            if "law" in heading and combined not in laws:
                laws.append(combined)

        if any(k in heading for k in ("enforcement", "sanction", "penalt")):
            enforcement = combined

        if "collection" in heading and not overview:
            overview = combined

    # Build a general overview from the first substantial section if still empty.
    if not overview:
        for texts in sections.values():
            combined = " ".join(texts[:3])
            if len(combined) > 100:
                overview = combined[:600]
                break

    # ISO numeric code for map highlighting; "" when unmapped.
    numeric_code = ALPHA2_TO_NUMERIC.get(code[:2], "")

    return {
        "code": code,
        "name": name,
        "numericCode": numeric_code,
        "overview": overview,
        "authority": authority,
        "mainLaws": laws,
        "enforcementStyle": enforcement,
    }
| 162 | + |
| 163 | + |
def main():
    """Fetch the country list, scrape each country page, write the JSON output."""
    log.info("Fetching country list...")
    countries = get_country_codes()
    total = len(countries)
    log.info(f"Found {total} countries")

    results = []
    for idx, entry in enumerate(countries, start=1):
        code, name = entry["code"], entry["name"]

        # Codes longer than two letters are regional aggregates rather than
        # countries; "BQ1" is the lone exception we still keep.
        if len(code) > 2 and code not in ("BQ1",):
            log.info(f"[{idx}/{total}] Skipping regional entry {name} ({code})")
            continue

        log.info(f"[{idx}/{total}] Scraping {name} ({code})...")
        record = scrape_country(code, name)
        if record:
            results.append(record)

        time.sleep(0.3)  # polite crawl delay between requests

    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT.write_text(json.dumps(results, indent=2, ensure_ascii=False), encoding="utf-8")
    log.info(f"Wrote {len(results)} countries to {OUTPUT}")


if __name__ == "__main__":
    main()