@@ -113,8 +113,10 @@ export class RobotsParser {
113113 this . sitemaps . set ( host , sitemaps ) ;
114114 }
115115
116+
116117 /**
117118 * Check if a URL is allowed by robots.txt
119+ * Follows Google's spec: longest matching pattern wins
118120 */
119121 isAllowed ( url : string , userAgent = '*' ) : boolean {
120122 try {
@@ -136,28 +138,48 @@ export class RobotsParser {
136138 return true ; // No matching rules = allow
137139 }
138140
139- // Check rules (more specific rules take precedence)
141+ // Collect ALL matching patterns from all rules
142+ const matches : { pattern : string ; isAllow : boolean } [ ] = [ ] ;
143+
140144 for ( const rule of matchingRules ) {
141- // Check disallow first
145+ // Collect matching allow patterns
146+ for ( const allow of rule . allow ) {
147+ if ( this . pathMatches ( path , allow ) ) {
148+ matches . push ( { pattern : allow , isAllow : true } ) ;
149+ }
150+ }
151+ // Collect matching disallow patterns
142152 for ( const disallow of rule . disallow ) {
143153 if ( this . pathMatches ( path , disallow ) ) {
144- // Check if there's a more specific allow
145- for ( const allow of rule . allow ) {
146- if ( this . pathMatches ( path , allow ) && allow . length > disallow . length ) {
147- return true ;
148- }
149- }
150- return false ;
154+ matches . push ( { pattern : disallow , isAllow : false } ) ;
151155 }
152156 }
153157 }
154158
155- return true ;
159+ // No matching patterns = allow
160+ if ( matches . length === 0 ) {
161+ return true ;
162+ }
163+
164+ // Find the longest matching pattern (Google's spec: most specific wins)
165+ // If tied, allow wins (per spec: "allow" takes precedence on equal length)
166+ const longest = matches . reduce ( ( best , current ) => {
167+ if ( current . pattern . length > best . pattern . length ) {
168+ return current ;
169+ }
170+ if ( current . pattern . length === best . pattern . length && current . isAllow ) {
171+ return current ; // Allow wins on tie
172+ }
173+ return best ;
174+ } ) ;
175+
176+ return longest . isAllow ;
156177 } catch {
157178 return true ;
158179 }
159180 }
160181
182+
161183 /**
162184 * Get crawl delay for a host
163185 */
@@ -191,17 +213,23 @@ export class RobotsParser {
191213 }
192214
193215 private pathMatches ( path : string , pattern : string ) : boolean {
194- // Simple pattern matching (supports * and $ wildcards)
216+ // Robots.txt pattern matching (supports * and $ wildcards)
217+ // Per Google spec: * matches any sequence, $ means end-of-URL
195218 if ( pattern === '/' ) return true ;
196219
220+ // Check if pattern ends with $ (end-of-URL anchor; wildcards earlier in the pattern still apply)
221+ const hasEndAnchor = pattern . endsWith ( '$' ) ;
222+ const patternToConvert = hasEndAnchor ? pattern . slice ( 0 , - 1 ) : pattern ;
223+
197224 // Convert pattern to regex
198- let regex = pattern
225+ // Escape special regex chars EXCEPT * (we handle it separately)
226+ let regex = patternToConvert
199227 . replace ( / [ . + ? ^ $ { } ( ) | [ \] \\ ] / g, '\\$&' ) // Escape special chars
200228 . replace ( / \* / g, '.*' ) ; // * becomes .*
201229
202- // $ at end means exact match
203- if ( regex . endsWith ( '\\$' ) ) {
204- regex = regex . slice ( 0 , - 2 ) + '$' ;
230+ // Add end anchor if pattern ended with $
231+ if ( hasEndAnchor ) {
232+ regex += '$' ;
205233 }
206234
207235 try {
0 commit comments