"""Canada OPC collector — scrapes Office of the Privacy Commissioner investigation pages.

Scrapes two sources:
 1. Investigations into businesses (PIPEDA): ~164 cases
 2. Investigations into federal institutions (Privacy Act): ~132 cases

All content is inline HTML (no PDFs). Pagination uses ?o=d&Page=N&Filter=True.
"""

from __future__ import annotations

import logging
from urllib.parse import parse_qs, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from pipeline.collectors.base import BaseCollector
from pipeline.config import HTTP_TIMEOUT
from pipeline.models import DiscoveredDoc

log = logging.getLogger(__name__)

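# Each source carries both the absolute listing URL (used for fetching) and the
# matching site-relative path (used to recognize case hrefs, which may appear
# in either absolute or relative form on the listing pages).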
_SOURCES = [
    {
        "name": "businesses",
        "base_url": "https://www.priv.gc.ca/en/opc-actions-and-decisions/investigations/investigations-into-businesses/",
        "base_path": "/en/opc-actions-and-decisions/investigations/investigations-into-businesses/",
    },
    {
        "name": "federal",
        "base_url": "https://www.priv.gc.ca/en/opc-actions-and-decisions/investigations/investigations-into-federal-institutions/",
        "base_path": "/en/opc-actions-and-decisions/investigations/investigations-into-federal-institutions/",
    },
]


class CanadaOPCCollector(BaseCollector):
    """Discover OPC investigation reports from both business and federal pages."""

    def discover(self) -> list[DiscoveredDoc]:
        docs: list[DiscoveredDoc] = []
        seen_urls: set[str] = set()

        for source in _SOURCES:
            page_num = 1
            while True:
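                # Page 1 is the bare listing URL; later pages append the
                # ?o=d&Page=N&Filter=True query string noted in the docstring.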
                if page_num == 1:
                    url = source["base_url"]
                else:
                    url = f"{source['base_url']}?o=d&Page={page_num}&Filter=True"

                try:
                    resp = requests.get(url, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
                    resp.raise_for_status()
                except requests.RequestException as e:
                    log.error(f"OPC {source['name']} page {page_num} fetch failed: {e}")
                    break

                soup = BeautifulSoup(resp.text, "lxml")
                page_docs = self._extract_cases(soup, source["base_path"], seen_urls)

                if not page_docs:
                    break

                docs.extend(page_docs)
                log.info(f"OPC {source['name']} page {page_num}: found {len(page_docs)} cases (total: {len(docs)})")

                # Check for a next page: look for a pagination link whose Page
                # query parameter is exactly page_num + 1.
                has_next = False
                for a in soup.select('a[href*="Page="]'):
                    href_val = a.get("href", "")
                    page_params = parse_qs(urlparse(href_val).query).get("Page", [])
                    if str(page_num + 1) in page_params and "/en/" in href_val:
                        has_next = True
                        break
                if not has_next:
                    break

                page_num += 1
                if page_num > 25:  # Safety limit
                    break

        log.info(f"OPC discovery complete: {len(docs)} documents")
        return docs

    def _extract_cases(self, soup: BeautifulSoup, base_path: str, seen: set[str]) -> list[DiscoveredDoc]:
        """Extract case links from a listing page."""
        docs: list[DiscoveredDoc] = []

        for link in soup.select("a[href]"):
            href = link.get("href", "").strip()
            if not href:
                continue

            # Must be under the investigations path and have a case identifier
            if base_path not in href:
                continue

            # Skip the listing page itself
            remainder = href.split(base_path)[-1].strip("/")
            if not remainder:
                continue

            # Must have a year/identifier pattern (e.g., 2026/pipeda-2026-001)
            if "/" not in remainder:
                continue

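            # The seen set is shared across both sources by the caller, so a
            # case linked from more than one listing is recorded only once.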
            full_url = urljoin("https://www.priv.gc.ca", href)
            if full_url in seen:
                continue
            seen.add(full_url)

            title = link.get_text(strip=True)
            if not title or len(title) < 10:
                continue

            docs.append(DiscoveredDoc(
                case_title=title,
                source_page_url=full_url,
                document_url=full_url,
                file_type="html",
            ))

        return docs
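

if __name__ == "__main__":
    # Manual smoke test: a minimal sketch, assuming CanadaOPCCollector can be
    # constructed with no arguments (BaseCollector's constructor lives
    # elsewhere and is not shown in this module).
    logging.basicConfig(level=logging.INFO)
    found = CanadaOPCCollector().discover()
    for doc in found[:5]:
        print(doc.case_title, "->", doc.document_url)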