-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
79 lines (67 loc) · 1.99 KB
/
config.yaml
File metadata and controls
79 lines (67 loc) · 1.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# GitHub Repository Discovery Configuration
#
# NOTE(review): the nesting in this section was reconstructed from the key
# names and comments; confirm it against the code that reads this file.
repos:
  # Language to search for
  language: Python

  # Date range for repository activity (YYYY-MM-DD).
  # Kept quoted so the values load as strings, not date objects.
  #   cutoff: earliest pushed date to consider
  #   today: latest pushed date (normally today's date)
  cutoff: "2025-07-04"
  today: "2026-01-04"

  # Maximum number of repositories to fetch
  max_repos: 50000

  # Output file name (timestamp will be appended)
  output_file: repos.jsonl

  # Star count bins for query partitioning.
  # Each entry is a [min_stars, max_stars] pair (null means no limit).
  star_bins:
    - [10000, null]
    # Uncomment for more granular searches:
    # - [5000, 9999]
    # - [2000, 4999]
    # - [1000, 1999]
    # - [500, 999]
    # - [200, 499]
    # - [100, 199]
    # - [50, 99]
    # - [20, 49]

  # Query partitioning settings
  # NOTE(review): partition_sleep and page_sleep are assumed to belong under
  # `partition`; move them up one level if the consumer reads them from
  # `repos` directly.
  partition:
    # Target maximum results per query (GitHub API limit is 1000)
    max_per_query: 1000
    # Sleep between partition count checks (seconds)
    partition_sleep: 0.2
    # Sleep between result pages (seconds)
    page_sleep: 0.25

  # API retry settings
  api:
    timeout: 60
    max_retries: 12
    # Conservative backoff: min(max_backoff, 2 ** min(attempt, backoff_exponent))
    # With the values below, 2 ** 6 = 64 exceeds max_backoff, so by that
    # formula the delay is capped at 60.
    max_backoff: 60
    backoff_exponent: 6
# AGENTS.md Download Configuration
#
# NOTE(review): the nesting in this section was reconstructed from the key
# names and comments; confirm it against the code that reads this file.
agents_md:
  # Base output directory name (timestamp will be appended)
  output_dir: agents_md

  # Delay between downloads to avoid rate limiting (seconds)
  delay: 0.1

  # Number of parallel download workers (1 for sequential)
  workers: 2

  # Filename variants to try (case-sensitive search order).
  # Each key names a variant group; its list holds the candidate filenames.
  filename_variants:
    agents_md:
      - AGENTS.md
      # - Agents.md
      # - agents.md
    claude_md:
      - CLAUDE.md
      # - Claude.md
      # - claude.md

  # Download settings
  download:
    timeout: 30
    max_retries: 3
    # Exponential backoff base for rate limit retries
    backoff_base: 2

  # GitHub raw content URL pattern (filename will be inserted)
  # NOTE(review): assumed to sit at the `agents_md` level; move it under
  # `download` if that is where the consumer looks it up.
  raw_url_pattern: "https://raw.githubusercontent.com/{repo}/{branch}/{filename}"