Skip to content

Commit 3dce647

Browse files
viki shi
authored and committed
add singapore and australia
1 parent 8099a17 commit 3dce647

File tree

13 files changed

+32193
-24864
lines changed

13 files changed

+32193
-24864
lines changed

pipeline/collectors/oaic.py

Lines changed: 90 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,113 @@
1-
"""Australia OAIC collector — scrapes OAIC privacy decisions."""
1+
"""Australia OAIC collector — scrapes privacy determinations from oaic.gov.au.
2+
3+
Scrapes all pages of determinations, extracting case metadata and summary text
4+
directly from the OAIC listing. Each case is stored as an HTML document containing
5+
the structured summary (title, date, legislative provision, determination, catchwords).
6+
The AustLII URL is preserved as source_page_url for linking.
7+
"""
28

39
from __future__ import annotations
410

511
import logging
12+
import time
613

714
import requests
815
from bs4 import BeautifulSoup
9-
from urllib.parse import urljoin
1016

1117
from pipeline.collectors.base import BaseCollector
1218
from pipeline.config import HTTP_TIMEOUT
1319
from pipeline.models import DiscoveredDoc
1420

1521
log = logging.getLogger(__name__)
1622

17-
_BASE_URL = "https://www.oaic.gov.au/privacy/privacy-decisions"
23+
_BASE_URL = "https://www.oaic.gov.au/privacy/privacy-assessments-and-decisions/privacy-decisions/privacy-determinations"
1824

1925

2026
class OAICCollector(BaseCollector):
21-
"""Discover OAIC privacy decision documents."""
27+
"""Discover OAIC privacy determinations by scraping the listing pages."""
2228

2329
def discover(self) -> list[DiscoveredDoc]:
2430
docs: list[DiscoveredDoc] = []
25-
26-
try:
27-
resp = requests.get(_BASE_URL, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
28-
resp.raise_for_status()
29-
except requests.RequestException as e:
30-
log.error(f"OAIC fetch failed: {e}")
31-
return []
32-
33-
soup = BeautifulSoup(resp.text, "lxml")
34-
35-
for link in soup.select("a[href*='/privacy-decisions/'], a[href*='/privacy/determinations/']"):
36-
href = link.get("href", "")
37-
title = link.get_text(strip=True)
38-
if not href or not title or len(title) < 5:
39-
continue
40-
41-
case_page_url = urljoin("https://www.oaic.gov.au", href)
42-
43-
# Check for PDF link vs HTML decision
44-
if href.endswith(".pdf"):
45-
doc_url = case_page_url
46-
file_type = "pdf"
47-
else:
48-
doc_url = case_page_url
49-
file_type = "html"
50-
51-
docs.append(DiscoveredDoc(
52-
case_title=title,
53-
source_page_url=case_page_url,
54-
document_url=doc_url,
55-
file_type=file_type,
56-
))
31+
seen_urls: set[str] = set()
32+
page = 1
33+
34+
# First page has no param; subsequent pages use result_26111_result_page=N
35+
next_url: str | None = _BASE_URL
36+
37+
while next_url and page <= 20: # safety limit
38+
try:
39+
resp = requests.get(next_url, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
40+
resp.raise_for_status()
41+
except requests.RequestException as e:
42+
log.warning(f"OAIC page {page} fetch failed: {e}")
43+
break
44+
45+
soup = BeautifulSoup(resp.text, "lxml")
46+
items = soup.select("article.custom-listing__item")
47+
48+
if not items:
49+
break
50+
51+
for item in items:
52+
cells = item.select(".custom-listing__cell")
53+
if len(cells) < 6:
54+
continue
55+
56+
# Extract structured data from cells
57+
title_text = cells[0].get_text(strip=True).removeprefix("Decision").strip()
58+
date_text = cells[1].get_text(strip=True).removeprefix("Decision year").strip()
59+
status_text = cells[2].get_text(strip=True).removeprefix("Status").strip()
60+
provision_text = cells[3].get_text(strip=True).removeprefix("Legislative provision").strip()
61+
determination_text = cells[4].get_text(strip=True).removeprefix("Determination").strip()
62+
catchword_text = cells[5].get_text(strip=True).removeprefix("Catchword summary").strip()
63+
64+
# Get AustLII link
65+
austlii_link = item.select_one("a[href*='austlii']")
66+
austlii_url = austlii_link["href"].split("#")[0] if austlii_link else ""
67+
68+
# Build a synthetic document containing all the summary text
69+
# This will be stored as the "document" for text extraction
70+
doc_content = f"""OAIC Privacy Determination
71+
Title: {title_text}
72+
Date: {date_text}
73+
Status: {status_text}
74+
Legislative Provision: {provision_text}
75+
Determination: {determination_text}
76+
Catchword Summary: {catchword_text}
77+
Source: {austlii_url}"""
78+
79+
# Use a stable URL as document_url for dedup
80+
# Use the OAIC page URL + case identifier
81+
doc_url = austlii_url or f"{_BASE_URL}#{title_text[:50]}"
82+
83+
if doc_url in seen_urls:
84+
continue
85+
seen_urls.add(doc_url)
86+
87+
docs.append(DiscoveredDoc(
88+
case_title=title_text,
89+
source_page_url=austlii_url,
90+
document_url=doc_url,
91+
file_type="text",
92+
# Store the pre-built summary as metadata for the downloader
93+
_oaic_summary=doc_content,
94+
))
95+
96+
log.info(f"OAIC page {page}: found {len(items)} cases (total: {len(docs)})")
97+
98+
# Find next page link
99+
next_link = None
100+
pag_links = soup.select("a.search-results__pagination-navlinks")
101+
for pl in pag_links:
102+
href = pl.get("href", "")
103+
text = pl.get_text(strip=True)
104+
if text == str(page + 1) and href:
105+
next_link = href
106+
break
107+
108+
next_url = next_link
109+
page += 1
110+
time.sleep(0.5) # Be polite
57111

58112
log.info(f"OAIC discovery complete: {len(docs)} documents")
59113
return docs

pipeline/collectors/pdpc.py

Lines changed: 84 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
1-
"""Singapore PDPC collector — scrapes PDPC commission decisions."""
1+
"""Singapore PDPC collector — discovers commission decisions.
2+
3+
Primary source: CSV file with case_url + pdf_url pairs (160 historical decisions).
4+
Secondary: Attempts to scrape the live page for newer decisions (page uses JS rendering,
5+
so this may return 0 results — the CSV is the reliable source).
6+
"""
27

38
from __future__ import annotations
49

10+
import csv
511
import logging
12+
from pathlib import Path
13+
from urllib.parse import urljoin
614

715
import requests
816
from bs4 import BeautifulSoup
9-
from urllib.parse import urljoin
1017

1118
from pipeline.collectors.base import BaseCollector
1219
from pipeline.config import HTTP_TIMEOUT
@@ -15,41 +22,92 @@
1522
log = logging.getLogger(__name__)
1623

1724
_BASE_URL = "https://www.pdpc.gov.sg/all-commissions-decisions"
25+
_CSV_PATH = Path(__file__).resolve().parents[2] / "files" / "inbox" / "Singapore - PDPC" / "pdpc_commission_decisions_pdf_urls.csv"
1826

1927

2028
class PDPCCollector(BaseCollector):
21-
"""Discover PDPC commission decision PDFs."""
29+
"""Discover PDPC commission decision PDFs from CSV + live scraping."""
2230

2331
def discover(self) -> list[DiscoveredDoc]:
2432
docs: list[DiscoveredDoc] = []
33+
seen_urls: set[str] = set()
34+
35+
# ── Primary: CSV with verified case URLs and PDF URLs ────────────
36+
if _CSV_PATH.exists():
37+
with open(_CSV_PATH, newline="", encoding="utf-8") as f:
38+
reader = csv.DictReader(f)
39+
for row in reader:
40+
case_url = (row.get("case_url") or "").strip()
41+
pdf_url = (row.get("pdf_url") or "").strip()
42+
title = (row.get("case_title") or "").strip()
43+
44+
if not pdf_url or not pdf_url.startswith("http"):
45+
continue
46+
47+
# Use PDF URL as the document to download
48+
if pdf_url in seen_urls:
49+
continue
50+
seen_urls.add(pdf_url)
51+
52+
# Extract a readable title from the case URL slug if CSV title is generic
53+
if not title or title.startswith("pdpc_case_"):
54+
title = self._title_from_url(case_url)
55+
56+
docs.append(DiscoveredDoc(
57+
case_title=title,
58+
source_page_url=case_url,
59+
document_url=pdf_url,
60+
file_type="pdf",
61+
))
2562

63+
log.info(f"PDPC CSV: loaded {len(docs)} decisions from {_CSV_PATH.name}")
64+
65+
# ── Secondary: Try live scraping for newer decisions ─────────────
66+
# Note: PDPC uses client-side JS rendering, so this often returns 0.
67+
# It's here as a fallback for when the site structure changes.
2668
try:
2769
resp = requests.get(_BASE_URL, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
2870
resp.raise_for_status()
71+
soup = BeautifulSoup(resp.text, "lxml")
72+
73+
live_count = 0
74+
for link in soup.select("a[href*='/all-commissions-decisions/']"):
75+
href = link.get("href", "")
76+
title = link.get_text(strip=True)
77+
if not href or not title:
78+
continue
79+
# Skip year/month index pages
80+
if href.rstrip("/").count("/") < 5:
81+
continue
82+
83+
case_page_url = urljoin("https://www.pdpc.gov.sg", href)
84+
if case_page_url in seen_urls:
85+
continue
86+
seen_urls.add(case_page_url)
87+
88+
docs.append(DiscoveredDoc(
89+
case_title=title,
90+
source_page_url=case_page_url,
91+
document_url=case_page_url,
92+
file_type="html",
93+
))
94+
live_count += 1
95+
96+
if live_count:
97+
log.info(f"PDPC live scrape: found {live_count} additional decisions")
98+
2999
except requests.RequestException as e:
30-
log.error(f"PDPC fetch failed: {e}")
31-
return []
32-
33-
soup = BeautifulSoup(resp.text, "lxml")
34-
35-
# PDPC lists decisions by year/month with links to individual decision pages
36-
for link in soup.select("a[href*='/all-commissions-decisions/']"):
37-
href = link.get("href", "")
38-
title = link.get_text(strip=True)
39-
if not href or not title:
40-
continue
41-
# Skip year/month index pages — we want individual decision pages
42-
# Decision pages typically have company names in the slug
43-
if href.rstrip("/").count("/") < 5:
44-
continue
45-
46-
case_page_url = urljoin("https://www.pdpc.gov.sg", href)
47-
docs.append(DiscoveredDoc(
48-
case_title=title,
49-
source_page_url=case_page_url,
50-
document_url=case_page_url, # PDFs extracted from case page during download
51-
file_type="html",
52-
))
100+
log.warning(f"PDPC live scrape failed (CSV data still used): {e}")
53101

54102
log.info(f"PDPC discovery complete: {len(docs)} documents")
55103
return docs
104+
105+
@staticmethod
106+
def _title_from_url(url: str) -> str:
107+
"""Extract readable title from PDPC case URL slug."""
108+
if not url:
109+
return "PDPC Decision"
110+
# URL format: .../YYYY/MM/breach-of-the-protection-obligation-by-company-name
111+
slug = url.rstrip("/").split("/")[-1]
112+
# Convert hyphens to spaces, title case
113+
return slug.replace("-", " ").title()

pipeline/export/to_postgres.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,28 @@
88
from __future__ import annotations
99

1010
import json
11+
import re
1112
import logging
1213

1314
from pipeline.db import get_conn
1415
from pipeline.validation.validator import validate_case
1516

17+
18+
def _to_int(val) -> int | None:
19+
"""Safely convert a value to int. Returns None for non-numeric strings like 'Unknown'."""
20+
if val is None:
21+
return None
22+
if isinstance(val, int):
23+
return val
24+
if isinstance(val, float):
25+
return int(val)
26+
if isinstance(val, str):
27+
# Strip non-numeric chars and try to parse
28+
cleaned = re.sub(r"[^\d]", "", val)
29+
if cleaned:
30+
return int(cleaned)
31+
return None
32+
1633
log = logging.getLogger(__name__)
1734

1835
INSERT_SQL = """
@@ -106,7 +123,7 @@ def save_to_cases(document_id: int, file_name: str, file_hash: str,
106123
"file_hash": file_hash,
107124
"case_name": merged.get("case_name"),
108125
"jurisdiction": merged.get("jurisdiction"),
109-
"year": merged.get("year"),
126+
"year": _to_int(merged.get("year")),
110127
"company": merged.get("company"),
111128
"sector": merged.get("sector"),
112129
"data_types": merged.get("data_types"),
@@ -117,7 +134,7 @@ def save_to_cases(document_id: int, file_name: str, file_hash: str,
117134
"enforcement_outcomes": json.dumps(merged.get("enforcement_outcomes", [])),
118135
"penalty_amount_usd": merged.get("penalty_amount_usd"),
119136
"penalty_original": merged.get("penalty_original"),
120-
"individuals_affected": merged.get("individuals_affected"),
137+
"individuals_affected": _to_int(merged.get("individuals_affected")),
121138
"outcome": merged.get("outcome"),
122139
"raw_json": json.dumps(merged),
123140
"document_id": document_id,

pipeline/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ class DiscoveredDoc:
1414
source_page_url: str
1515
document_url: str
1616
file_type: str = "pdf"
17+
_oaic_summary: str | None = None # Pre-built summary text (OAIC scrapes)
1718

1819

1920
@dataclass

pipeline/processing/downloader.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,21 @@ def download_document(document_id: int, document_url: str, jurisdiction_id: str)
3939
Returns the raw_documents.id on success, None on failure.
4040
Skips if the content hash already exists (dedup).
4141
"""
42+
# AustLII and some other sites need longer timeouts and browser headers
43+
timeout = HTTP_TIMEOUT
44+
headers = {"User-Agent": USER_AGENT}
45+
if "austlii.edu.au" in document_url:
46+
timeout = 60
47+
headers = {
48+
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
49+
"Accept": "application/pdf,text/html,*/*",
50+
}
51+
4252
try:
4353
resp = requests.get(
4454
document_url,
45-
headers={"User-Agent": USER_AGENT},
46-
timeout=HTTP_TIMEOUT,
55+
headers=headers,
56+
timeout=timeout,
4757
allow_redirects=True,
4858
)
4959
resp.raise_for_status()

pipeline/runner.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,22 @@ def run_collect(jurisdiction_id: str | None = None, dry_run: bool = False, limit
8686
)
8787
if cur.rowcount > 0:
8888
inserted += 1
89+
# For OAIC: auto-create raw_document with pre-built summary
90+
if d._oaic_summary:
91+
from pipeline.processing.dedup import sha256_bytes
92+
content = d._oaic_summary.encode("utf-8")
93+
file_hash = sha256_bytes(content)
94+
cur.execute("SELECT id FROM discovered_documents WHERE jurisdiction_id = %s AND document_url = %s", (jid, d.document_url))
95+
dd_row = cur.fetchone()
96+
if dd_row:
97+
dd_id = dd_row[0]
98+
cur.execute(
99+
"""INSERT INTO raw_documents (document_id, file_name, file_hash, file_data, file_size, mime_type, extracted_text)
100+
VALUES (%s, %s, %s, %s, %s, %s, %s)
101+
ON CONFLICT (file_hash) DO NOTHING""",
102+
(dd_id, f"oaic-{dd_id}.txt", file_hash, content, len(content), "text/plain", d._oaic_summary),
103+
)
104+
cur.execute("UPDATE discovered_documents SET status = 'downloaded', downloaded_at = NOW() WHERE id = %s", (dd_id,))
89105
except Exception as e:
90106
log.warning(f"Insert failed for {d.document_url}: {e}")
91107
conn.commit()

src/components/ControlBar.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ const ControlBar = ({
109109
: "hover:bg-secondary"
110110
}`}
111111
>
112-
{selectedJurisdictions.includes(j) ? " " : " "}
112+
{selectedJurisdictions.includes(j) ? " " : " "}
113113
{j}
114114
</button>
115115
))}

0 commit comments

Comments
 (0)