Skip to content

Commit 9a5cb64

Browse files
committed
Update the check links script to take into account false positives.
1 parent 8dccf06 commit 9a5cb64

File tree

1 file changed

+75
-6
lines changed

1 file changed

+75
-6
lines changed

utils/check_links.py

Lines changed: 75 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -70,26 +70,95 @@ def check_url(url: str, timeout: int = 10, allow_redirects: bool = True) -> Tupl
7070
"""
7171
Check if URL is accessible.
7272
Returns: (is_valid, status_code, error_message)
73+
Uses GET request to check content for false positives (pages that return 200 but show 404).
7374
"""
7475
try:
76+
headers = {'User-Agent': 'Mozilla/5.0 (compatible; LinkChecker/1.0)'}
77+
7578
if USE_REQUESTS:
76-
response = requests.head(
79+
# Use GET instead of HEAD to check page content for false positives
80+
response = requests.get(
7781
url,
7882
timeout=timeout,
7983
allow_redirects=allow_redirects,
80-
headers={'User-Agent': 'Mozilla/5.0 (compatible; LinkChecker/1.0)'}
84+
headers=headers
8185
)
8286
status_code = response.status_code
87+
88+
# Check if page is actually valid (not a 404 page with 200 status)
8389
is_valid = status_code < 400
84-
error_message = f"{status_code} {response.reason}" if not is_valid else ""
90+
if is_valid and status_code == 200:
91+
# Check for common 404 indicators in HTML content
92+
content_lower = response.text.lower()
93+
94+
# Extract title tag content more reliably
95+
title_match = None
96+
title_pattern = r'<title[^>]*>(.*?)</title>'
97+
title_matches = re.findall(title_pattern, content_lower, re.DOTALL | re.IGNORECASE)
98+
if title_matches:
99+
title_text = title_matches[0].strip()
100+
# Check for various 404 indicators in title
101+
if any(phrase in title_text for phrase in [
102+
'page not found',
103+
'not found',
104+
'404',
105+
'page does not exist',
106+
'couldn\'t find the page',
107+
'we couldn\'t find'
108+
]):
109+
is_valid = False
110+
error_message = f"200 (but page shows '{title_matches[0].strip()}' in title)"
111+
else:
112+
error_message = ""
113+
else:
114+
# No title tag found, check body content for 404 indicators
115+
if 'page not found' in content_lower[:5000] or '404' in content_lower[:5000]:
116+
is_valid = False
117+
error_message = "200 (but page content suggests 404)"
118+
else:
119+
error_message = ""
120+
else:
121+
error_message = f"{status_code} {response.reason}" if not is_valid else ""
85122
else:
86-
request = urllib.request.Request(url, method='HEAD')
87-
request.add_header('User-Agent', 'Mozilla/5.0 (compatible; LinkChecker/1.0)')
123+
# Fallback to urllib - use GET to check content
124+
request = urllib.request.Request(url, headers=headers)
88125
try:
89126
with urllib.request.urlopen(request, timeout=timeout) as response:
90127
status_code = response.status
91128
is_valid = status_code < 400
92-
error_message = f"{status_code}" if not is_valid else ""
129+
130+
# For 200 status, read content to check for false positives
131+
if is_valid and status_code == 200:
132+
content = response.read().decode('utf-8', errors='ignore').lower()
133+
134+
# Extract title tag content more reliably
135+
title_match = None
136+
title_pattern = r'<title[^>]*>(.*?)</title>'
137+
title_matches = re.findall(title_pattern, content, re.DOTALL | re.IGNORECASE)
138+
if title_matches:
139+
title_text = title_matches[0].strip()
140+
# Check for various 404 indicators in title
141+
if any(phrase in title_text for phrase in [
142+
'page not found',
143+
'not found',
144+
'404',
145+
'page does not exist',
146+
'couldn\'t find the page',
147+
'we couldn\'t find'
148+
]):
149+
is_valid = False
150+
error_message = f"200 (but page shows '{title_matches[0].strip()}' in title)"
151+
else:
152+
error_message = ""
153+
else:
154+
# No title tag found, check body content for 404 indicators
155+
if 'page not found' in content[:5000] or '404' in content[:5000]:
156+
is_valid = False
157+
error_message = "200 (but page content suggests 404)"
158+
else:
159+
error_message = ""
160+
else:
161+
error_message = f"{status_code}" if not is_valid else ""
93162
except urllib.error.HTTPError as e:
94163
status_code = e.code
95164
is_valid = status_code < 400

0 commit comments

Comments
 (0)