Skip to content

Commit 3dce647

Browse files
viki shi
authored and committed
add singapore and australia
1 parent 8099a17 commit 3dce647

File tree

13 files changed

+32193
-24864
lines changed

13 files changed

+32193
-24864
lines changed

pipeline/collectors/oaic.py

Lines changed: 90 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,113 @@
1-
"""Australia OAIC collector — scrapes OAIC privacy decisions."""
1+
"""Australia OAIC collector — scrapes privacy determinations from oaic.gov.au.
2+
3+
Scrapes all pages of determinations, extracting case metadata and summary text
4+
directly from the OAIC listing. Each case is stored as an HTML document containing
5+
the structured summary (title, date, legislative provision, determination, catchwords).
6+
The AustLII URL is preserved as source_page_url for linking.
7+
"""
28

39
from __future__ import annotations
410

511
import logging
12+
import time
613

714
import requests
815
from bs4 import BeautifulSoup
9-
from urllib.parse import urljoin
1016

1117
from pipeline.collectors.base import BaseCollector
1218
from pipeline.config import HTTP_TIMEOUT
1319
from pipeline.models import DiscoveredDoc
1420

1521
log = logging.getLogger(__name__)
1622

17-
_BASE_URL = "https://www.oaic.gov.au/privacy/privacy-decisions"
23+
_BASE_URL = "https://www.oaic.gov.au/privacy/privacy-assessments-and-decisions/privacy-decisions/privacy-determinations"
1824

1925

2026
class OAICCollector(BaseCollector):
21-
"""Discover OAIC privacy decision documents."""
27+
"""Discover OAIC privacy determinations by scraping the listing pages."""
2228

2329
def discover(self) -> list[DiscoveredDoc]:
2430
docs: list[DiscoveredDoc] = []
25-
26-
try:
27-
resp = requests.get(_BASE_URL, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
28-
resp.raise_for_status()
29-
except requests.RequestException as e:
30-
log.error(f"OAIC fetch failed: {e}")
31-
return []
32-
33-
soup = BeautifulSoup(resp.text, "lxml")
34-
35-
for link in soup.select("a[href*='/privacy-decisions/'], a[href*='/privacy/determinations/']"):
36-
href = link.get("href", "")
37-
title = link.get_text(strip=True)
38-
if not href or not title or len(title) < 5:
39-
continue
40-
41-
case_page_url = urljoin("https://www.oaic.gov.au", href)
42-
43-
# Check for PDF link vs HTML decision
44-
if href.endswith(".pdf"):
45-
doc_url = case_page_url
46-
file_type = "pdf"
47-
else:
48-
doc_url = case_page_url
49-
file_type = "html"
50-
51-
docs.append(DiscoveredDoc(
52-
case_title=title,
53-
source_page_url=case_page_url,
54-
document_url=doc_url,
55-
file_type=file_type,
56-
))
31+
seen_urls: set[str] = set()
32+
page = 1
33+
34+
# First page has no param; subsequent pages use result_26111_result_page=N
35+
next_url: str | None = _BASE_URL
36+
37+
while next_url and page <= 20: # safety limit
38+
try:
39+
resp = requests.get(next_url, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
40+
resp.raise_for_status()
41+
except requests.RequestException as e:
42+
log.warning(f"OAIC page {page} fetch failed: {e}")
43+
break
44+
45+
soup = BeautifulSoup(resp.text, "lxml")
46+
items = soup.select("article.custom-listing__item")
47+
48+
if not items:
49+
break
50+
51+
for item in items:
52+
cells = item.select(".custom-listing__cell")
53+
if len(cells) < 6:
54+
continue
55+
56+
# Extract structured data from cells
57+
title_text = cells[0].get_text(strip=True).removeprefix("Decision").strip()
58+
date_text = cells[1].get_text(strip=True).removeprefix("Decision year").strip()
59+
status_text = cells[2].get_text(strip=True).removeprefix("Status").strip()
60+
provision_text = cells[3].get_text(strip=True).removeprefix("Legislative provision").strip()
61+
determination_text = cells[4].get_text(strip=True).removeprefix("Determination").strip()
62+
catchword_text = cells[5].get_text(strip=True).removeprefix("Catchword summary").strip()
63+
64+
# Get AustLII link
65+
austlii_link = item.select_one("a[href*='austlii']")
66+
austlii_url = austlii_link["href"].split("#")[0] if austlii_link else ""
67+
68+
# Build a synthetic document containing all the summary text
69+
# This will be stored as the "document" for text extraction
70+
doc_content = f"""OAIC Privacy Determination
71+
Title: {title_text}
72+
Date: {date_text}
73+
Status: {status_text}
74+
Legislative Provision: {provision_text}
75+
Determination: {determination_text}
76+
Catchword Summary: {catchword_text}
77+
Source: {austlii_url}"""
78+
79+
# Use a stable URL as document_url for dedup
80+
# Use the OAIC page URL + case identifier
81+
doc_url = austlii_url or f"{_BASE_URL}#{title_text[:50]}"
82+
83+
if doc_url in seen_urls:
84+
continue
85+
seen_urls.add(doc_url)
86+
87+
docs.append(DiscoveredDoc(
88+
case_title=title_text,
89+
source_page_url=austlii_url,
90+
document_url=doc_url,
91+
file_type="text",
92+
# Store the pre-built summary as metadata for the downloader
93+
_oaic_summary=doc_content,
94+
))
95+
96+
log.info(f"OAIC page {page}: found {len(items)} cases (total: {len(docs)})")
97+
98+
# Find next page link
99+
next_link = None
100+
pag_links = soup.select("a.search-results__pagination-navlinks")
101+
for pl in pag_links:
102+
href = pl.get("href", "")
103+
text = pl.get_text(strip=True)
104+
if text == str(page + 1) and href:
105+
next_link = href
106+
break
107+
108+
next_url = next_link
109+
page += 1
110+
time.sleep(0.5) # Be polite
57111

58112
log.info(f"OAIC discovery complete: {len(docs)} documents")
59113
return docs

pipeline/collectors/pdpc.py

Lines changed: 84 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
1-
"""Singapore PDPC collector — scrapes PDPC commission decisions."""
1+
"""Singapore PDPC collector — discovers commission decisions.
2+
3+
Primary source: CSV file with case_url + pdf_url pairs (160 historical decisions).
4+
Secondary: Attempts to scrape the live page for newer decisions (page uses JS rendering,
5+
so this may return 0 results — the CSV is the reliable source).
6+
"""
27

38
from __future__ import annotations
49

10+
import csv
511
import logging
12+
from pathlib import Path
13+
from urllib.parse import urljoin
614

715
import requests
816
from bs4 import BeautifulSoup
9-
from urllib.parse import urljoin
1017

1118
from pipeline.collectors.base import BaseCollector
1219
from pipeline.config import HTTP_TIMEOUT
@@ -15,41 +22,92 @@
1522
log = logging.getLogger(__name__)
1623

1724
_BASE_URL = "https://www.pdpc.gov.sg/all-commissions-decisions"
25+
_CSV_PATH = Path(__file__).resolve().parents[2] / "files" / "inbox" / "Singapore - PDPC" / "pdpc_commission_decisions_pdf_urls.csv"
1826

1927

2028
class PDPCCollector(BaseCollector):
21-
"""Discover PDPC commission decision PDFs."""
29+
"""Discover PDPC commission decision PDFs from CSV + live scraping."""
2230

2331
def discover(self) -> list[DiscoveredDoc]:
2432
docs: list[DiscoveredDoc] = []
33+
seen_urls: set[str] = set()
34+
35+
# ── Primary: CSV with verified case URLs and PDF URLs ────────────
36+
if _CSV_PATH.exists():
37+
with open(_CSV_PATH, newline="", encoding="utf-8") as f:
38+
reader = csv.DictReader(f)
39+
for row in reader:
40+
case_url = (row.get("case_url") or "").strip()
41+
pdf_url = (row.get("pdf_url") or "").strip()
42+
title = (row.get("case_title") or "").strip()
43+
44+
if not pdf_url or not pdf_url.startswith("http"):
45+
continue
46+
47+
# Use PDF URL as the document to download
48+
if pdf_url in seen_urls:
49+
continue
50+
seen_urls.add(pdf_url)
51+
52+
# Extract a readable title from the case URL slug if CSV title is generic
53+
if not title or title.startswith("pdpc_case_"):
54+
title = self._title_from_url(case_url)
55+
56+
docs.append(DiscoveredDoc(
57+
case_title=title,
58+
source_page_url=case_url,
59+
document_url=pdf_url,
60+
file_type="pdf",
61+
))
2562

63+
log.info(f"PDPC CSV: loaded {len(docs)} decisions from {_CSV_PATH.name}")
64+
65+
# ── Secondary: Try live scraping for newer decisions ─────────────
66+
# Note: PDPC uses client-side JS rendering, so this often returns 0.
67+
# It's here as a fallback for when the site structure changes.
2668
try:
2769
resp = requests.get(_BASE_URL, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
2870
resp.raise_for_status()
71+
soup = BeautifulSoup(resp.text, "lxml")
72+
73+
live_count = 0
74+
for link in soup.select("a[href*='/all-commissions-decisions/']"):
75+
href = link.get("href", "")
76+
title = link.get_text(strip=True)
77+
if not href or not title:
78+
continue
79+
# Skip year/month index pages
80+
if href.rstrip("/").count("/") < 5:
81+
continue
82+
83+
case_page_url = urljoin("https://www.pdpc.gov.sg", href)
84+
if case_page_url in seen_urls:
85+
continue
86+
seen_urls.add(case_page_url)
87+
88+
docs.append(DiscoveredDoc(
89+
case_title=title,
90+
source_page_url=case_page_url,
91+
document_url=case_page_url,
92+
file_type="html",
93+
))
94+
live_count += 1
95+
96+
if live_count:
97+
log.info(f"PDPC live scrape: found {live_count} additional decisions")
98+
2999
except requests.RequestException as e:
30-
log.error(f"PDPC fetch failed: {e}")
31-
return []
32-
33-
soup = BeautifulSoup(resp.text, "lxml")
34-
35-
# PDPC lists decisions by year/month with links to individual decision pages
36-
for link in soup.select("a[href*='/all-commissions-decisions/']"):
37-
href = link.get("href", "")
38-
title = link.get_text(strip=True)
39-
if not href or not title:
40-
continue
41-
# Skip year/month index pages — we want individual decision pages
42-
# Decision pages typically have company names in the slug
43-
if href.rstrip("/").count("/") < 5:
44-
continue
45-
46-
case_page_url = urljoin("https://www.pdpc.gov.sg", href)
47-
docs.append(DiscoveredDoc(
48-
case_title=title,
49-
source_page_url=case_page_url,
50-
document_url=case_page_url, # PDFs extracted from case page during download
51-
file_type="html",
52-
))
100+
log.warning(f"PDPC live scrape failed (CSV data still used): {e}")
53101

54102
log.info(f"PDPC discovery complete: {len(docs)} documents")
55103
return docs
104+
105+
@staticmethod
106+
def _title_from_url(url: str) -> str:
107+
"""Extract readable title from PDPC case URL slug."""
108+
if not url:
109+
return "PDPC Decision"
110+
# URL format: .../YYYY/MM/breach-of-the-protection-obligation-by-company-name
111+
slug = url.rstrip("/").split("/")[-1]
112+
# Convert hyphens to spaces, title case
113+
return slug.replace("-", " ").title()

pipeline/export/to_postgres.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,28 @@
88
from __future__ import annotations
99

1010
import json
11+
import re
1112
import logging
1213

1314
from pipeline.db import get_conn
1415
from pipeline.validation.validator import validate_case
1516

17+
18+
def _to_int(val) -> int | None:
19+
"""Safely convert a value to int. Returns None for non-numeric strings like 'Unknown'."""
20+
if val is None:
21+
return None
22+
if isinstance(val, int):
23+
return val
24+
if isinstance(val, float):
25+
return int(val)
26+
if isinstance(val, str):
27+
# Strip non-numeric chars and try to parse
28+
cleaned = re.sub(r"[^\d]", "", val)
29+
if cleaned:
30+
return int(cleaned)
31+
return None
32+
1633
log = logging.getLogger(__name__)
1734

1835
INSERT_SQL = """
@@ -106,7 +123,7 @@ def save_to_cases(document_id: int, file_name: str, file_hash: str,
106123
"file_hash": file_hash,
107124
"case_name": merged.get("case_name"),
108125
"jurisdiction": merged.get("jurisdiction"),
109-
"year": merged.get("year"),
126+
"year": _to_int(merged.get("year")),
110127
"company": merged.get("company"),
111128
"sector": merged.get("sector"),
112129
"data_types": merged.get("data_types"),
@@ -117,7 +134,7 @@ def save_to_cases(document_id: int, file_name: str, file_hash: str,
117134
"enforcement_outcomes": json.dumps(merged.get("enforcement_outcomes", [])),
118135
"penalty_amount_usd": merged.get("penalty_amount_usd"),
119136
"penalty_original": merged.get("penalty_original"),
120-
"individuals_affected": merged.get("individuals_affected"),
137+
"individuals_affected": _to_int(merged.get("individuals_affected")),
121138
"outcome": merged.get("outcome"),
122139
"raw_json": json.dumps(merged),
123140
"document_id": document_id,

pipeline/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class DiscoveredDoc:
1414
source_page_url: str
1515
document_url: str
1616
file_type: str = "pdf"
17+
_oaic_summary: str | None = None # Pre-built summary text (OAIC scrapes)
1718

1819

1920
@dataclass

pipeline/processing/downloader.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,21 @@ def download_document(document_id: int, document_url: str, jurisdiction_id: str)
3939
Returns the raw_documents.id on success, None on failure.
4040
Skips if the content hash already exists (dedup).
4141
"""
42+
# AustLII and some other sites need longer timeouts and browser headers
43+
timeout = HTTP_TIMEOUT
44+
headers = {"User-Agent": USER_AGENT}
45+
if "austlii.edu.au" in document_url:
46+
timeout = 60
47+
headers = {
48+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
49+
"Accept": "application/pdf,text/html,*/*",
50+
}
51+
4252
try:
4353
resp = requests.get(
4454
document_url,
45-
headers={"User-Agent": USER_AGENT},
46-
timeout=HTTP_TIMEOUT,
55+
headers=headers,
56+
timeout=timeout,
4757
allow_redirects=True,
4858
)
4959
resp.raise_for_status()

pipeline/runner.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,22 @@ def run_collect(jurisdiction_id: str | None = None, dry_run: bool = False, limit
8686
)
8787
if cur.rowcount > 0:
8888
inserted += 1
89+
# For OAIC: auto-create raw_document with pre-built summary
90+
if d._oaic_summary:
91+
from pipeline.processing.dedup import sha256_bytes
92+
content = d._oaic_summary.encode("utf-8")
93+
file_hash = sha256_bytes(content)
94+
cur.execute("SELECT id FROM discovered_documents WHERE jurisdiction_id = %s AND document_url = %s", (jid, d.document_url))
95+
dd_row = cur.fetchone()
96+
if dd_row:
97+
dd_id = dd_row[0]
98+
cur.execute(
99+
"""INSERT INTO raw_documents (document_id, file_name, file_hash, file_data, file_size, mime_type, extracted_text)
100+
VALUES (%s, %s, %s, %s, %s, %s, %s)
101+
ON CONFLICT (file_hash) DO NOTHING""",
102+
(dd_id, f"oaic-{dd_id}.txt", file_hash, content, len(content), "text/plain", d._oaic_summary),
103+
)
104+
cur.execute("UPDATE discovered_documents SET status = 'downloaded', downloaded_at = NOW() WHERE id = %s", (dd_id,))
89105
except Exception as e:
90106
log.warning(f"Insert failed for {d.document_url}: {e}")
91107
conn.commit()

src/components/ControlBar.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ const ControlBar = ({
109109
: "hover:bg-secondary"
110110
}`}
111111
>
112-
{selectedJurisdictions.includes(j) ? " " : " "}
112+
{selectedJurisdictions.includes(j) ? " " : " "}
113113
{j}
114114
</button>
115115
))}

0 commit comments

Comments
 (0)