|
| 1 | +import { NextRequest, NextResponse } from 'next/server'; |
| 2 | +import TurndownService from 'turndown'; |
| 3 | + |
/** Module-level, in-memory request counters for the fixed-window rate limiter, keyed by client identifier. */
const rateLimitMap = new Map<string, { count: number; resetTime: number }>();
/** Length of one rate-limit window, in milliseconds. */
const RATE_LIMIT_WINDOW_MS = 60 * 1000;
/** Maximum number of requests a single key may make within one window. */
const RATE_LIMIT_MAX_REQUESTS = 10;
/** Earliest timestamp (ms since epoch) at which expired entries will next be swept. */
let nextRateLimitSweep = 0;

/**
 * Removes every entry whose window has already ended.
 *
 * Runs at most once per window so the cost is amortised; without it the map
 * would keep one entry forever for every key that was ever seen.
 */
function pruneExpiredRateLimits(now: number): void {
  if (now < nextRateLimitSweep) return;
  nextRateLimitSweep = now + RATE_LIMIT_WINDOW_MS;
  for (const [key, entry] of rateLimitMap) {
    if (now > entry.resetTime) {
      rateLimitMap.delete(key);
    }
  }
}

/**
 * Records one request for `key` and reports whether it fits in the current window.
 *
 * @param key - Identifier of the client being limited (the caller passes an IP string).
 * @returns `allowed` is false once the key has used all `RATE_LIMIT_MAX_REQUESTS`
 *   requests of its window; `remaining` is how many requests are left in that window.
 */
function checkRateLimit(key: string): { allowed: boolean; remaining: number } {
  const now = Date.now();
  pruneExpiredRateLimits(now);

  const entry = rateLimitMap.get(key);

  // No entry, or the previous window is over: start a fresh window with this request counted.
  if (!entry || now > entry.resetTime) {
    rateLimitMap.set(key, { count: 1, resetTime: now + RATE_LIMIT_WINDOW_MS });
    return { allowed: true, remaining: RATE_LIMIT_MAX_REQUESTS - 1 };
  }

  if (entry.count >= RATE_LIMIT_MAX_REQUESTS) {
    return { allowed: false, remaining: 0 };
  }

  entry.count++;
  return { allowed: true, remaining: RATE_LIMIT_MAX_REQUESTS - entry.count };
}
| 24 | + |
/** Host names (with and without IPv6 brackets) that always refer to the local machine. */
const BLOCKED_HOSTS = new Set(['localhost', '127.0.0.1', '[::1]', '::1', '0.0.0.0']);

/**
 * Extracts the embedded IPv4 address from an IPv4-mapped IPv6 literal.
 *
 * Accepts both the dotted form (`::ffff:127.0.0.1`) and the hexadecimal form
 * (`::ffff:7f00:1`) that the URL parser produces when it serialises the host.
 *
 * @returns The dotted-quad address, or `null` when `bare` is not a recognisable mapped address.
 */
function decodeMappedIpv4(bare: string): string | null {
  const dotted = /^::ffff:(\d{1,3}(?:\.\d{1,3}){3})$/.exec(bare);
  if (dotted) return dotted[1] ?? null;

  const hex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(bare);
  if (!hex) return null;
  const high = parseInt(hex[1] ?? '', 16);
  const low = parseInt(hex[2] ?? '', 16);
  return `${high >> 8}.${high & 0xff}.${low >> 8}.${low & 0xff}`;
}

/**
 * Decides whether `url` may be fetched by the server.
 *
 * Only `http:` and `https:` URLs are accepted, and hosts that name loopback,
 * unspecified, private, carrier-grade NAT, link-local, unique-local or multicast
 * addresses are rejected. The check is purely lexical on the parsed host name;
 * it does not resolve DNS.
 *
 * @param url - Candidate URL string (untrusted input).
 * @returns `true` when the URL parses and its host is not in a blocked range.
 */
function isAllowedUrl(url: string): boolean {
  try {
    const parsed = new URL(url);
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return false;

    // A trailing dot ("localhost.") names the same host, so drop it before comparing.
    const hostname = parsed.hostname.toLowerCase().replace(/\.$/, '');
    const bare = hostname.replace(/^\[|\]$/g, '');

    if (BLOCKED_HOSTS.has(hostname) || BLOCKED_HOSTS.has(bare)) return false;
    if (bare.endsWith('.localhost')) return false;

    // A colon can only appear in the host of an IPv6 literal.
    if (bare.includes(':')) {
      if (bare === '::') return false; // unspecified address
      if (bare.startsWith('::ffff:')) {
        // Re-run the IPv4 rules on the embedded address; reject anything undecodable.
        const mapped = decodeMappedIpv4(bare);
        return mapped !== null && isAllowedUrl(`http://${mapped}`);
      }
      if (/^fe[89ab][0-9a-f]:/.test(bare)) return false; // fe80::/10 link-local
      if (/^f[cd][0-9a-f]{2}:/.test(bare)) return false; // fc00::/7 unique-local
      if (/^ff[0-9a-f]{2}:/.test(bare)) return false; // ff00::/8 multicast
      return true;
    }

    if (/^127\./.test(bare) || /^0\./.test(bare)) return false;
    if (bare.startsWith('10.') || bare.startsWith('192.168.')) return false;
    if (/^172\.(1[6-9]|2\d|3[01])\./.test(bare)) return false;
    if (/^100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\./.test(bare)) return false; // 100.64.0.0/10 CGNAT
    if (bare.startsWith('169.254.')) return false;
    if (/^(22[4-9]|23\d|24\d|25[0-5])\./.test(bare)) return false; // multicast and reserved
    return true;
  } catch {
    // Unparseable input is never fetched.
    return false;
  }
}
| 49 | + |
/**
 * Matches a github.com "blob" file URL.
 * Capture groups, in order: owner, repository, branch (a single path segment), file path.
 */
const GITHUB_URL_PATTERN = /^https?:\/\/github\.com\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/(.+)$/;

/** Matches any URL served from raw.githubusercontent.com. */
const GITHUB_RAW_PATTERN = /^https?:\/\/raw\.githubusercontent\.com\//;

/** Time budget for an outbound fetch, in milliseconds (30 seconds). */
const FETCH_TIMEOUT = 30 * 1000;
| 53 | + |
/**
 * Technology names that are turned into tags when they occur as whole words in
 * extracted content. Iteration order is the insertion order below.
 */
const TECH_KEYWORDS = new Set([
  // UI frameworks and meta-frameworks
  'react',
  'vue',
  'angular',
  'svelte',
  'nextjs',
  'nuxt',
  'remix',
  // Programming languages
  'typescript',
  'javascript',
  'python',
  'rust',
  'go',
  'java',
  'ruby',
  // Runtimes and infrastructure tooling
  'node',
  'deno',
  'bun',
  'docker',
  'kubernetes',
  'terraform',
  // Cloud and hosting platforms
  'aws',
  'gcp',
  'azure',
  'vercel',
  'netlify',
  'cloudflare',
  // API styles, transports and data stores
  'graphql',
  'rest',
  'grpc',
  'websocket',
  'redis',
  'postgres',
  'mongodb',
  'sqlite',
  'mysql',
  'prisma',
  'drizzle',
  // Styling and build tooling
  'tailwind',
  'css',
  'html',
  'sass',
  'webpack',
  'vite',
  'esbuild',
  // Engineering practices
  'git',
  'ci',
  'cd',
  'testing',
  'security',
  'authentication',
  // Interfaces and AI
  'api',
  'cli',
  'sdk',
  'mcp',
  'llm',
  'ai',
  'ml',
  'openai',
  'anthropic',
]);

/** Shape of an acceptable tag: lowercase alphanumeric runs joined by single hyphens. */
const TAG_PATTERN = /^[a-z0-9]+(-[a-z0-9]+)*$/;
| 67 | + |
/** Result of fetching a URL and reducing it to text, as produced by the extractors in this file. */
interface ExtractedContent {
  /** Display title: the HTML `<title>`, a host name, or a file name, depending on the extractor. */
  title: string;
  /** Extracted body: Markdown for web pages, raw text otherwise (code files are wrapped in a fence). */
  content: string;
  /** The URL the caller asked for. */
  sourceUrl: string;
  /** Kind of source; the extractors in this file set 'webpage', 'text' or 'github'. */
  contentType: string;
  /** Language name looked up from the file extension; only set for GitHub content. */
  language?: string;
}

/** Shared HTML-to-Markdown converter: ATX (`#`) headings and fenced code blocks. */
const turndown = new TurndownService({
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
});
| 77 | + |
/** Upper bound, in bytes, on any response body this module will read. */
const MAX_BODY_SIZE = 5 * 1024 * 1024;
/** Maximum number of HTTP redirects followed for a single extraction. */
const MAX_REDIRECTS = 5;

/**
 * Fetches `startUrl`, following redirects manually so every hop can be vetted.
 *
 * Automatic redirect following would let a public URL bounce the server to an
 * internal address after the caller's allow-list check, so each `Location`
 * target is passed through `isAllowedUrl` before it is requested. One timeout
 * signal covers all hops and the subsequent body read.
 *
 * @throws Error when a redirect target is not allowed or the hop limit is exceeded.
 */
async function fetchWithCheckedRedirects(startUrl: string): Promise<Response> {
  const signal = AbortSignal.timeout(FETCH_TIMEOUT);
  let currentUrl = startUrl;

  for (let hop = 0; hop <= MAX_REDIRECTS; hop++) {
    const response = await fetch(currentUrl, { redirect: 'manual', signal });
    const location = response.headers.get('location');
    const isRedirect = response.status >= 300 && response.status < 400 && location !== null;
    if (!isRedirect) return response;

    const nextUrl = new URL(location, currentUrl).toString();
    // Release the redirect response before moving on or bailing out.
    await response.body?.cancel().catch(() => undefined);
    if (!isAllowedUrl(nextUrl)) {
      throw new Error('Redirect target not allowed');
    }
    currentUrl = nextUrl;
  }

  throw new Error('Too many redirects');
}

/**
 * Reads a response body as UTF-8 text while enforcing a byte limit.
 *
 * The declared `Content-Length` is checked first; the stream is then read chunk
 * by chunk and abandoned as soon as `maxBytes` is exceeded, so an absent or
 * understated header cannot force the whole body into memory.
 *
 * @throws Error with message "Response too large" when the limit is exceeded.
 */
async function readBodyWithLimit(response: Response, maxBytes: number): Promise<string> {
  const declaredLength = Number(response.headers.get('content-length') || '0');
  if (declaredLength > maxBytes) {
    await response.body?.cancel().catch(() => undefined);
    throw new Error('Response too large');
  }

  if (!response.body) return response.text();

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let receivedBytes = 0;
  let text = '';

  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    receivedBytes += value.byteLength;
    if (receivedBytes > maxBytes) {
      await reader.cancel().catch(() => undefined);
      throw new Error('Response too large');
    }
    // stream: true keeps multi-byte characters that straddle chunks intact.
    text += decoder.decode(value, { stream: true });
  }

  return text + decoder.decode();
}

/** Returns the last path segment of `url` (query and fragment excluded), or '' if there is none. */
function lastPathSegment(url: string): string {
  try {
    return new URL(url).pathname.split('/').pop() ?? '';
  } catch {
    return '';
  }
}

/** Builds a backtick fence longer than any backtick run in `body`, so the body cannot close it early. */
function fenceFor(body: string): string {
  let longestRun = 0;
  for (const run of body.match(/`+/g) ?? []) {
    longestRun = Math.max(longestRun, run.length);
  }
  return '`'.repeat(Math.max(3, longestRun + 1));
}

/**
 * Fetches `url` and reduces it to title plus text content.
 *
 * GitHub blob and raw URLs are delegated to `fetchGitHubContent`. HTML responses
 * are converted to Markdown with the shared `turndown` instance; anything else
 * is returned as plain text.
 *
 * @param url - URL to fetch; the caller is expected to have vetted it already.
 * @throws Error on a non-2xx response, an oversized body, or a disallowed redirect.
 */
async function extractFromUrl(url: string): Promise<ExtractedContent> {
  if (GITHUB_URL_PATTERN.test(url) || GITHUB_RAW_PATTERN.test(url)) {
    return fetchGitHubContent(url);
  }

  const response = await fetchWithCheckedRedirects(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
  }

  const contentType = (response.headers.get('content-type') ?? '').toLowerCase();
  const body = await readBodyWithLimit(response, MAX_BODY_SIZE);

  if (contentType.includes('text/html')) {
    const titleMatch = body.match(/<title[^>]*>([^<]+)<\/title>/i);
    // `||` rather than `??`: a whitespace-only <title> should also fall back to the host name.
    const title = titleMatch?.[1]?.trim() || new URL(url).hostname;
    const bodyMatch = body.match(/<body[^>]*>([\s\S]*)<\/body>/i);
    const content = turndown.turndown(bodyMatch?.[1] ?? body);
    return { title, content, sourceUrl: url, contentType: 'webpage' };
  }

  // A URL ending in "/" has an empty last segment, which must also fall back.
  const title = lastPathSegment(url) || 'Untitled';
  return { title, content: body, sourceUrl: url, contentType: 'text' };
}

/** File extension (lowercase, with leading dot) to the language name used for fences and tags. */
const LANG_MAP: Record<string, string> = {
  '.ts': 'typescript',
  '.tsx': 'typescript',
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.py': 'python',
  '.rb': 'ruby',
  '.go': 'go',
  '.rs': 'rust',
  '.java': 'java',
  '.kt': 'kotlin',
  '.swift': 'swift',
  '.sh': 'shell',
  '.yml': 'yaml',
  '.yaml': 'yaml',
  '.json': 'json',
  '.md': 'markdown',
  '.html': 'html',
  '.css': 'css',
  '.sql': 'sql',
};

/**
 * Fetches a single file from GitHub.
 *
 * A github.com blob URL is rewritten to its raw.githubusercontent.com form;
 * a raw URL is fetched as given. Files whose extension maps to a language other
 * than Markdown are wrapped in a fenced code block tagged with that language.
 *
 * @param url - GitHub blob or raw URL.
 * @throws Error on a non-2xx response, an oversized body, or a disallowed redirect.
 */
async function fetchGitHubContent(url: string): Promise<ExtractedContent> {
  let rawUrl = url;
  const match = url.match(GITHUB_URL_PATTERN);
  if (match) {
    const [, owner, repo, branch, path] = match;
    rawUrl = `https://raw.githubusercontent.com/${owner}/${repo}/${branch}/${path}`;
  }

  const response = await fetchWithCheckedRedirects(rawUrl);
  if (!response.ok) {
    throw new Error(`Failed to fetch GitHub content: ${response.status} ${response.statusText}`);
  }

  const body = await readBodyWithLimit(response, MAX_BODY_SIZE);

  // Take the name from the parsed path so "?plain=1" or "#L10" cannot leak into the extension.
  const filename = lastPathSegment(rawUrl) || 'file';
  const dotIndex = filename.lastIndexOf('.');
  const ext = dotIndex >= 0 ? filename.slice(dotIndex).toLowerCase() : '';
  const language = LANG_MAP[ext];
  const isCode = language !== undefined && language !== 'markdown';

  let content = body;
  if (isCode) {
    const fence = fenceFor(body);
    content = `${fence}${language}\n${body}\n${fence}`;
  }

  return { title: filename, content, sourceUrl: url, contentType: 'github', language };
}
| 150 | + |
/**
 * Adds `weight` to the score of `tag`, ignoring anything that does not match `TAG_PATTERN`.
 *
 * @param counts - Score table being built; mutated in place.
 */
function addTag(counts: Map<string, number>, tag: string, weight: number): void {
  if (TAG_PATTERN.test(tag)) {
    counts.set(tag, (counts.get(tag) ?? 0) + weight);
  }
}

/**
 * Whole-word matchers for every entry of `TECH_KEYWORDS`, compiled once.
 * They are applied to lowercased text, so no case-insensitive flag is needed.
 */
const KEYWORD_MATCHERS: ReadonlyArray<[string, RegExp]> = Array.from(
  TECH_KEYWORDS,
  (keyword): [string, RegExp] => [keyword, new RegExp(`\\b${keyword}\\b`)],
);

/**
 * Walks Markdown line by line and collects level-1/2 heading texts and the
 * info-string languages of fenced code blocks.
 *
 * Lines inside a fenced block are skipped, so `# comment` lines in code are not
 * mistaken for headings. A fence closes only on a bare backtick line at least as
 * long as the one that opened it.
 */
function scanMarkdownOutline(content: string): { headings: string[]; fenceLanguages: string[] } {
  const headings: string[] = [];
  const fenceLanguages: string[] = [];
  let openFenceLength = 0; // 0 means "not inside a fenced block"

  for (const line of content.split(/\r?\n/)) {
    const fence = /^(`{3,})[ \t]*(\w+)?/.exec(line);
    const fenceLength = fence?.[1]?.length ?? 0;

    if (openFenceLength === 0) {
      if (fence) {
        openFenceLength = fenceLength;
        if (fence[2]) fenceLanguages.push(fence[2]);
        continue;
      }
      const heading = /^#{1,2}\s+(.+)$/.exec(line);
      if (heading?.[1]) headings.push(heading[1]);
    } else if (fenceLength >= openFenceLength && /^`+[ \t]*$/.test(line)) {
      openFenceLength = 0;
    }
  }

  return { headings, fenceLanguages };
}

/**
 * Derives up to ten tags for extracted content.
 *
 * Scores come from, in order: URL path segments (weight 2), words of level-1/2
 * headings (2), fenced code block languages (3), known technology keywords found
 * in the text (1), and the detected file language (3). Tags are returned by
 * descending score; ties keep the order in which tags were first seen.
 *
 * @param extracted - Content and metadata produced by an extractor.
 * @returns At most 10 tags, each matching `TAG_PATTERN`.
 */
function detectTags(extracted: ExtractedContent): string[] {
  const counts = new Map<string, number>();

  try {
    const segments = new URL(extracted.sourceUrl).pathname
      .split('/')
      .filter(Boolean)
      .map((s) => s.toLowerCase().replace(/[^a-z0-9-]/g, ''));
    for (const seg of segments) {
      if (seg.length >= 2 && seg.length <= 30) {
        addTag(counts, seg, 2);
      }
    }
  } catch {
    // An unparseable source URL simply contributes no tags.
  }

  const { headings, fenceLanguages } = scanMarkdownOutline(extracted.content);

  for (const heading of headings) {
    for (const word of heading.toLowerCase().split(/\s+/)) {
      const cleaned = word.replace(/[^a-z0-9-]/g, '');
      if (cleaned.length >= 2) {
        addTag(counts, cleaned, 2);
      }
    }
  }

  for (const rawLang of fenceLanguages) {
    const lang = rawLang.toLowerCase();
    if (lang.length >= 2) {
      addTag(counts, lang, 3);
    }
  }

  const lower = extracted.content.toLowerCase();
  for (const [keyword, matcher] of KEYWORD_MATCHERS) {
    if (matcher.test(lower)) {
      addTag(counts, keyword, 1);
    }
  }

  if (extracted.language) {
    addTag(counts, extracted.language.toLowerCase(), 3);
  }

  return Array.from(counts.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, 10)
    .map(([tag]) => tag);
}
| 206 | + |
/**
 * Converts arbitrary text into a lowercase, hyphen-separated identifier.
 *
 * Accented letters are folded to their base letter first (NFKD decomposition,
 * then combining marks removed), so "Crème" becomes "creme" rather than "cr-me".
 * Every other run of characters outside `a-z0-9` becomes a single hyphen.
 *
 * @param input - Text to convert.
 * @returns A slug of at most 64 characters with no leading or trailing hyphen,
 *   or `'untitled-skill'` when nothing usable remains.
 */
function slugify(input: string): string {
  const slug = input
    .normalize('NFKD')
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '');
  // Truncation can leave a hyphen at the cut point, so trim the tail again.
  return slug.slice(0, 64).replace(/-+$/, '') || 'untitled-skill';
}
| 215 | + |
/**
 * Plain scalars that YAML parsers read as null, boolean or a number rather than a string.
 */
const YAML_AMBIGUOUS_SCALAR =
  /^(?:~|null|true|false|yes|no|on|off|[-+]?(?:\d[\d_]*(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?|0x[0-9a-f]+|0o[0-7]+|[-+]?\.(?:inf|nan))$/i;

/**
 * Renders `value` as a single-line YAML scalar that parses back to the same string.
 *
 * Line breaks are collapsed to spaces and the result is trimmed. The value is
 * returned unquoted when that is unambiguous; otherwise it is wrapped in double
 * quotes with backslashes, double quotes and control characters escaped.
 * Quoting is applied when the value is empty, contains a YAML indicator
 * character, contains a control character, starts with `-`, `'` or `"`, or
 * would be read as null, a boolean or a number.
 *
 * @param value - Text to embed after a `key: ` prefix.
 * @returns The scalar, quoted only when necessary.
 */
function yamlEscape(value: string): string {
  const singleLine = value.replace(/\r\n|[\r\n\u0085\u2028\u2029]/g, ' ').trim();

  const needsQuoting =
    singleLine === '' ||
    /[:#{}[\],&*?|>!%@`]/.test(singleLine) ||
    /[\x00-\x1f\x7f]/.test(singleLine) ||
    /^[-'"]/.test(singleLine) ||
    YAML_AMBIGUOUS_SCALAR.test(singleLine);

  if (!needsQuoting) return singleLine;

  const escaped = singleLine
    .replace(/\\/g, '\\\\')
    .replace(/"/g, '\\"')
    // Done last so the backslashes introduced here are not doubled by the first step.
    .replace(/[\x00-\x1f\x7f]/g, (ch) => `\\x${ch.charCodeAt(0).toString(16).padStart(2, '0')}`);
  return `"${escaped}"`;
}
| 223 | + |
/**
 * POST handler: fetches the page or file at `url` and returns it as a skill
 * document (YAML front matter followed by the extracted content).
 *
 * Request body (JSON): `{ url: string; name?: string }`.
 *
 * Responses:
 * - 200 `{ name, skillMd, tags, description }` with an `X-RateLimit-Remaining` header
 * - 400 invalid JSON, missing `url`, or non-string `name`
 * - 403 `url` rejected by `isAllowedUrl`
 * - 429 rate limit exceeded for the caller
 * - 502 extraction failed (the error message is returned to the client)
 */
export async function POST(request: NextRequest) {
  // The first entry of X-Forwarded-For identifies the caller; requests without it share one bucket.
  const ip = request.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown';
  const { allowed, remaining } = checkRateLimit(ip);

  if (!allowed) {
    return NextResponse.json(
      { error: 'Too many requests. Try again in a minute.' },
      { status: 429, headers: { 'X-RateLimit-Remaining': '0', 'Retry-After': '60' } },
    );
  }

  let payload: unknown;
  try {
    payload = await request.json();
  } catch {
    return NextResponse.json({ error: 'Invalid JSON body' }, { status: 400 });
  }

  // A body of `null` is valid JSON but cannot be destructured; treat any
  // non-object as an empty object so it fails the field checks with a 400.
  const fields: Record<string, unknown> =
    typeof payload === 'object' && payload !== null ? (payload as Record<string, unknown>) : {};
  const { url, name } = fields;

  if (!url || typeof url !== 'string') {
    return NextResponse.json({ error: 'Missing required field: url' }, { status: 400 });
  }

  if (name !== undefined && typeof name !== 'string') {
    return NextResponse.json({ error: 'Field "name" must be a string' }, { status: 400 });
  }

  if (!isAllowedUrl(url)) {
    return NextResponse.json({ error: 'URL not allowed' }, { status: 403 });
  }

  try {
    const extracted = await extractFromUrl(url);
    const tags = detectTags(extracted);

    const skillName = slugify(name || extracted.title || 'untitled');

    // First line with real text; code-fence lines are skipped so code content
    // is not described by its opening fence.
    const firstTextLine = extracted.content
      .split('\n')
      .find((line) => line.trim().length > 0 && !/^\s*`{3,}/.test(line));
    const description =
      firstTextLine?.replace(/^#+\s*/, '').trim().slice(0, 200) || 'Saved skill';

    const savedAt = new Date().toISOString();

    const yamlTags =
      tags.length > 0 ? `tags:\n${tags.map((t) => ` - ${t}`).join('\n')}\n` : '';

    const skillMd =
      `---\n` +
      `name: ${skillName}\n` +
      `description: ${yamlEscape(description)}\n` +
      yamlTags +
      `metadata:\n` +
      ` source: ${yamlEscape(url)}\n` +
      ` savedAt: ${savedAt}\n` +
      `---\n\n` +
      extracted.content +
      '\n';

    return NextResponse.json(
      { name: skillName, skillMd, tags, description },
      { headers: { 'X-RateLimit-Remaining': String(remaining) } },
    );
  } catch (err) {
    const message = err instanceof Error ? err.message : 'Failed to extract content';
    return NextResponse.json({ error: message }, { status: 502 });
  }
}
0 commit comments