@@ -70,26 +70,95 @@ def check_url(url: str, timeout: int = 10, allow_redirects: bool = True) -> Tupl
7070 """
7171 Check if URL is accessible.
7272 Returns: (is_valid, status_code, error_message)
73+ Uses GET request to check content for false positives (pages that return 200 but show 404).
7374 """
7475 try :
76+ headers = {'User-Agent' : 'Mozilla/5.0 (compatible; LinkChecker/1.0)' }
77+
7578 if USE_REQUESTS :
76- response = requests .head (
79+ # Use GET instead of HEAD to check page content for false positives
80+ response = requests .get (
7781 url ,
7882 timeout = timeout ,
7983 allow_redirects = allow_redirects ,
80- headers = { 'User-Agent' : 'Mozilla/5.0 (compatible; LinkChecker/1.0)' }
84+ headers = headers
8185 )
8286 status_code = response .status_code
87+
88+ # Check if page is actually valid (not a 404 page with 200 status)
8389 is_valid = status_code < 400
84- error_message = f"{ status_code } { response .reason } " if not is_valid else ""
90+ if is_valid and status_code == 200 :
91+ # Check for common 404 indicators in HTML content
92+ content_lower = response .text .lower ()
93+
94+ # Extract title tag content more reliably
95+ title_match = None
96+ title_pattern = r'<title[^>]*>(.*?)</title>'
97+ title_matches = re .findall (title_pattern , content_lower , re .DOTALL | re .IGNORECASE )
98+ if title_matches :
99+ title_text = title_matches [0 ].strip ()
100+ # Check for various 404 indicators in title
101+ if any (phrase in title_text for phrase in [
102+ 'page not found' ,
103+ 'not found' ,
104+ '404' ,
105+ 'page does not exist' ,
106+ 'couldn\' t find the page' ,
107+ 'we couldn\' t find'
108+ ]):
109+ is_valid = False
110+ error_message = f"200 (but page shows '{ title_matches [0 ].strip ()} ' in title)"
111+ else :
112+ error_message = ""
113+ else :
114+ # No title tag found, check body content for 404 indicators
115+ if 'page not found' in content_lower [:5000 ] or '404' in content_lower [:5000 ]:
116+ is_valid = False
117+ error_message = "200 (but page content suggests 404)"
118+ else :
119+ error_message = ""
120+ else :
121+ error_message = f"{ status_code } { response .reason } " if not is_valid else ""
85122 else :
86- request = urllib . request . Request ( url , method = 'HEAD' )
87- request . add_header ( 'User-Agent' , 'Mozilla/5.0 (compatible; LinkChecker/1.0)' )
123+ # Fallback to urllib - use GET to check content
124+ request = urllib . request . Request ( url , headers = headers )
88125 try :
89126 with urllib .request .urlopen (request , timeout = timeout ) as response :
90127 status_code = response .status
91128 is_valid = status_code < 400
92- error_message = f"{ status_code } " if not is_valid else ""
129+
130+ # For 200 status, read content to check for false positives
131+ if is_valid and status_code == 200 :
132+ content = response .read ().decode ('utf-8' , errors = 'ignore' ).lower ()
133+
134+ # Extract title tag content more reliably
135+ title_match = None
136+ title_pattern = r'<title[^>]*>(.*?)</title>'
137+ title_matches = re .findall (title_pattern , content , re .DOTALL | re .IGNORECASE )
138+ if title_matches :
139+ title_text = title_matches [0 ].strip ()
140+ # Check for various 404 indicators in title
141+ if any (phrase in title_text for phrase in [
142+ 'page not found' ,
143+ 'not found' ,
144+ '404' ,
145+ 'page does not exist' ,
146+ 'couldn\' t find the page' ,
147+ 'we couldn\' t find'
148+ ]):
149+ is_valid = False
150+ error_message = f"200 (but page shows '{ title_matches [0 ].strip ()} ' in title)"
151+ else :
152+ error_message = ""
153+ else :
154+ # No title tag found, check body content for 404 indicators
155+ if 'page not found' in content [:5000 ] or '404' in content [:5000 ]:
156+ is_valid = False
157+ error_message = "200 (but page content suggests 404)"
158+ else :
159+ error_message = ""
160+ else :
161+ error_message = f"{ status_code } " if not is_valid else ""
93162 except urllib .error .HTTPError as e :
94163 status_code = e .code
95164 is_valid = status_code < 400
0 commit comments