Skip to content

Commit a463569

Browse files
authored
Merge pull request #1 from 0x-pankaj/fix/robots-txt-parsing
fix(crawler): improve robots.txt parsing accuracy
2 parents e68052a + b69fb1d commit a463569

File tree

1 file changed

+43
-15
lines changed

1 file changed

+43
-15
lines changed

src/crawler/robots.ts

Lines changed: 43 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -113,8 +113,10 @@ export class RobotsParser {
113113
this.sitemaps.set(host, sitemaps);
114114
}
115115

116+
116117
/**
117118
* Check if a URL is allowed by robots.txt
119+
* Follows Google's spec: longest matching pattern wins
118120
*/
119121
isAllowed(url: string, userAgent = '*'): boolean {
120122
try {
@@ -136,28 +138,48 @@ export class RobotsParser {
136138
return true; // No matching rules = allow
137139
}
138140

139-
// Check rules (more specific rules take precedence)
141+
// Collect ALL matching patterns from all rules
142+
const matches: { pattern: string; isAllow: boolean }[] = [];
143+
140144
for (const rule of matchingRules) {
141-
// Check disallow first
145+
// Collect matching allow patterns
146+
for (const allow of rule.allow) {
147+
if (this.pathMatches(path, allow)) {
148+
matches.push({ pattern: allow, isAllow: true });
149+
}
150+
}
151+
// Collect matching disallow patterns
142152
for (const disallow of rule.disallow) {
143153
if (this.pathMatches(path, disallow)) {
144-
// Check if there's a more specific allow
145-
for (const allow of rule.allow) {
146-
if (this.pathMatches(path, allow) && allow.length > disallow.length) {
147-
return true;
148-
}
149-
}
150-
return false;
154+
matches.push({ pattern: disallow, isAllow: false });
151155
}
152156
}
153157
}
154158

155-
return true;
159+
// No matching patterns = allow
160+
if (matches.length === 0) {
161+
return true;
162+
}
163+
164+
// Find the longest matching pattern (Google's spec: most specific wins)
165+
// If tied, allow wins (per spec: "allow" takes precedence on equal length)
166+
const longest = matches.reduce((best, current) => {
167+
if (current.pattern.length > best.pattern.length) {
168+
return current;
169+
}
170+
if (current.pattern.length === best.pattern.length && current.isAllow) {
171+
return current; // Allow wins on tie
172+
}
173+
return best;
174+
});
175+
176+
return longest.isAllow;
156177
} catch {
157178
return true;
158179
}
159180
}
160181

182+
161183
/**
162184
* Get crawl delay for a host
163185
*/
@@ -191,17 +213,23 @@ export class RobotsParser {
191213
}
192214

193215
private pathMatches(path: string, pattern: string): boolean {
194-
// Simple pattern matching (supports * and $ wildcards)
216+
// Robots.txt pattern matching (supports * and $ wildcards)
217+
// Per Google spec: * matches any sequence, $ means end-of-URL
195218
if (pattern === '/') return true;
196219

220+
// Check if pattern ends with $ (end-of-URL anchor, not an exact-match marker)
221+
const hasEndAnchor = pattern.endsWith('$');
222+
const patternToConvert = hasEndAnchor ? pattern.slice(0, -1) : pattern;
223+
197224
// Convert pattern to regex
198-
let regex = pattern
225+
// Escape special regex chars EXCEPT * (we handle it separately)
226+
let regex = patternToConvert
199227
.replace(/[.+?^${}()|[\]\\]/g, '\\$&') // Escape special chars
200228
.replace(/\*/g, '.*'); // * becomes .*
201229

202-
// $ at end means exact match
203-
if (regex.endsWith('\\$')) {
204-
regex = regex.slice(0, -2) + '$';
230+
// Add end anchor if pattern ended with $
231+
if (hasEndAnchor) {
232+
regex += '$';
205233
}
206234

207235
try {

0 commit comments

Comments
 (0)