Skip to content

Commit 588b13c

Browse files
viki shi
authored and committed
add canada
1 parent 115f761 commit 588b13c

8 files changed

Lines changed: 48489 additions & 26722 deletions

File tree

pipeline/collectors/canada_opc.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
"""Canada OPC collector — scrapes Office of the Privacy Commissioner investigation pages.
2+
3+
Scrapes two sources:
4+
1. Investigations into businesses (PIPEDA): ~164 cases
5+
2. Investigations into federal institutions (Privacy Act): ~132 cases
6+
7+
All content is inline HTML (no PDFs). Pagination uses ?o=d&Page=N&Filter=True.
8+
"""
9+
10+
from __future__ import annotations
11+
12+
import logging
13+
from urllib.parse import urljoin
14+
15+
import requests
16+
from bs4 import BeautifulSoup
17+
18+
from pipeline.collectors.base import BaseCollector
19+
from pipeline.config import HTTP_TIMEOUT
20+
from pipeline.models import DiscoveredDoc
21+
22+
log = logging.getLogger(__name__)
23+
24+
_SOURCES = [
25+
{
26+
"name": "businesses",
27+
"base_url": "https://www.priv.gc.ca/en/opc-actions-and-decisions/investigations/investigations-into-businesses/",
28+
"base_path": "/en/opc-actions-and-decisions/investigations/investigations-into-businesses/",
29+
},
30+
{
31+
"name": "federal",
32+
"base_url": "https://www.priv.gc.ca/en/opc-actions-and-decisions/investigations/investigations-into-federal-institutions/",
33+
"base_path": "/en/opc-actions-and-decisions/investigations/investigations-into-federal-institutions/",
34+
},
35+
]
36+
37+
38+
class CanadaOPCCollector(BaseCollector):
    """Discover OPC investigation reports from both business and federal pages.

    Each entry in ``_SOURCES`` is a paginated listing. ``discover`` walks the
    pages of every source in turn and returns one ``DiscoveredDoc`` per unique
    case URL; the case page itself is recorded as the HTML document.
    """

    # Upper bound on listing pages fetched per source, so that a pagination
    # widget which always advertises a "next" page cannot loop forever.
    _MAX_PAGES = 25

    def discover(self) -> list[DiscoveredDoc]:
        """Collect case documents from every listing page of every source.

        A source is abandoned (and the next one started) as soon as a page
        fails to download, yields no new cases, or has no link to the
        following page. Fetch failures are logged, not raised.

        Returns:
            All discovered documents, de-duplicated by URL across sources.
        """
        docs: list[DiscoveredDoc] = []
        seen_urls: set[str] = set()

        for source in _SOURCES:
            for page_num in range(1, self._MAX_PAGES + 1):
                url = self._page_url(source["base_url"], page_num)

                try:
                    resp = requests.get(url, headers=self.get_headers(), timeout=HTTP_TIMEOUT)
                    resp.raise_for_status()
                except requests.RequestException as e:
                    log.error(f"OPC {source['name']} page {page_num} fetch failed: {e}")
                    break

                soup = BeautifulSoup(resp.text, "lxml")
                page_docs = self._extract_cases(soup, source["base_path"], seen_urls)

                # An empty page means we ran past the end of the listing (or
                # every link on it was already seen).
                if not page_docs:
                    break

                docs.extend(page_docs)
                log.info(f"OPC {source['name']} page {page_num}: found {len(page_docs)} cases (total: {len(docs)})")

                if not self._has_next_page(soup, page_num):
                    break

        log.info(f"OPC discovery complete: {len(docs)} documents")
        return docs

    @staticmethod
    def _page_url(base_url: str, page_num: int) -> str:
        """Return the listing URL for *page_num* (the first page is the bare base URL)."""
        if page_num == 1:
            return base_url
        return f"{base_url}?o=d&Page={page_num}&Filter=True"

    @staticmethod
    def _has_next_page(soup: BeautifulSoup, page_num: int) -> bool:
        """Return True if *soup* contains a pagination link to page ``page_num + 1``."""
        next_marker = f"Page={page_num + 1}"
        for anchor in soup.select('a[href*="Page="]'):
            href_val = anchor.get("href", "")
            # NOTE(review): the "/en/" test presumably restricts matches to
            # English-language pagination links - confirm against the live markup.
            if next_marker in href_val and "/en/" in href_val:
                return True
        return False

    def _extract_cases(self, soup: BeautifulSoup, base_path: str, seen: set[str]) -> list[DiscoveredDoc]:
        """Extract case links from a listing page.

        Args:
            soup: Parsed listing page.
            base_path: Site-relative path of the listing; only links whose
                href contains it are considered.
            seen: URLs already emitted. Mutated in place: every URL returned
                from this call is added to it.

        Returns:
            One ``DiscoveredDoc`` per not-yet-seen case link on the page.
        """
        docs: list[DiscoveredDoc] = []

        for link in soup.select("a[href]"):
            href = link.get("href", "").strip()

            # Must be under the investigations path.
            if not href or base_path not in href:
                continue

            # What follows the listing path must look like year/identifier
            # (e.g., 2026/pipeda-2026-001). An empty remainder is the listing
            # page itself and is rejected by the same test.
            remainder = href.split(base_path)[-1].strip("/")
            if "/" not in remainder:
                continue

            # Validate the title BEFORE marking the URL as seen. Otherwise a
            # short-text anchor (icon, "More", ...) that precedes the real
            # title link would permanently hide the case.
            title = link.get_text(strip=True)
            if len(title) < 10:
                continue

            full_url = urljoin("https://www.priv.gc.ca", href)
            if full_url in seen:
                continue
            seen.add(full_url)

            docs.append(DiscoveredDoc(
                case_title=title,
                source_page_url=full_url,
                document_url=full_url,
                file_type="html",
            ))

        return docs

pipeline/collectors/registry.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"eu_gdpr": ("pipeline.collectors.gdpr", "GDPRCollector"),
1919
"eu_edpb": ("pipeline.collectors.edpb", "EDPBCollector"),
2020
"us_ca_doj": ("pipeline.collectors.california_doj", "CaliforniaDOJCollector"),
21+
"ca_opc": ("pipeline.collectors.canada_opc", "CanadaOPCCollector"),
2122
}
2223

2324

pipeline/migrate.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,17 @@
9696
"access_method": "scrape",
9797
"crawl_frequency": "weekly",
9898
},
99+
{
100+
"id": "ca_opc",
101+
"display_name": "Canada OPC",
102+
"country": "Canada",
103+
"regulator": "Office of the Privacy Commissioner of Canada",
104+
"base_url": "https://www.priv.gc.ca",
105+
"case_list_url": "https://www.priv.gc.ca/en/opc-actions-and-decisions/investigations/investigations-into-businesses/",
106+
"document_types": ["html"],
107+
"access_method": "scrape",
108+
"crawl_frequency": "weekly",
109+
},
99110
]
100111

101112
# Map from display_name (as stored in cases.jurisdiction via norm_jurisdiction) to jurisdiction id
@@ -127,6 +138,12 @@
127138
"australia": "au_oaic",
128139
"australia oaic": "au_oaic",
129140
"australia/oaic": "au_oaic",
141+
"canada": "ca_opc",
142+
"canada opc": "ca_opc",
143+
"canada/pipeda": "ca_opc",
144+
"pipeda": "ca_opc",
145+
"opc": "ca_opc",
146+
"privacy act (canada)": "ca_opc",
130147
}
131148

132149

pipeline/processing/url_resolver.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def url_works(url: str) -> bool:
7575
"edpb.europa.eu", "oag.ca.gov", "gov.uk",
7676
"naih.hu", # Hungarian DPA
7777
"giodo.gov.pl", # Polish DPA
78+
"priv.gc.ca", # Canada OPC
7879
)
7980

8081

pipeline/validation/normalizer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"EU GDPR",
2020
"EU EDPB",
2121
"Australia OAIC",
22+
"Canada OPC",
2223
]
2324

2425
SECTORS = [
@@ -69,6 +70,8 @@ def norm_jurisdiction(j: str | None) -> str:
6970
return "EU GDPR"
7071
if "oaic" in j_lower or "australia" in j_lower:
7172
return "Australia OAIC"
73+
if "canada" in j_lower or "pipeda" in j_lower or "opc" in j_lower or "privacy act (canada)" in j_lower:
74+
return "Canada OPC"
7275
return "US FTC"
7376

7477

src/data/cases.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ export type ViolationType =
2525
| "Improper Data Disposal"
2626
| "Illegal Monitoring/Surveillance";
2727

28-
export type Jurisdiction = "US FTC" | "California DOJ" | "UK ICO" | "Singapore PDPC" | "EU GDPR" | "EU EDPB" | "Australia OAIC";
28+
export type Jurisdiction = "US FTC" | "California DOJ" | "UK ICO" | "Singapore PDPC" | "EU GDPR" | "EU EDPB" | "Australia OAIC" | "Canada OPC";
2929

3030
export type Sector = "Technology" | "Social Media" | "Healthcare" | "E-Commerce" | "Gaming" | "Finance" | "Advertising" | "Food Delivery" | "Hospitality" | "Retail" | "Transportation";
3131

@@ -90,6 +90,7 @@ export const JURISDICTIONS: Jurisdiction[] = [
9090
"EU GDPR",
9191
"EU EDPB",
9292
"Australia OAIC",
93+
"Canada OPC",
9394
];
9495

9596
export const OUTCOME_TYPES: OutcomeType[] = [

0 commit comments

Comments
 (0)