From ae4955dc5bf589ea4b88be330847f44480e333e7 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 2 Apr 2026 07:35:37 +0000 Subject: [PATCH 1/7] feat: add GitHub Pages benchmark results display - Add docs/index.html with leaderboard, historical runs, and task breakdown tabs - Add scripts/build-results-pages.ts to generate per-agent result pages with OGP metadata - Add .github/workflows/gh-pages.yml to deploy docs/ to GitHub Pages on leaderboard updates - Add build:results-pages script to package.json Closes #88 --- .github/workflows/gh-pages.yml | 59 +++++ .gitignore | 4 + docs/README.md | 2 + docs/index.html | 428 +++++++++++++++++++++++++++++++++ package.json | 3 +- scripts/build-results-pages.ts | 313 ++++++++++++++++++++++++ 6 files changed, 808 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/gh-pages.yml create mode 100644 docs/index.html create mode 100644 scripts/build-results-pages.ts diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml new file mode 100644 index 0000000..f89ca6a --- /dev/null +++ b/.github/workflows/gh-pages.yml @@ -0,0 +1,59 @@ +name: Deploy Results to GitHub Pages + +on: + # Rebuild when leaderboard data changes on main + push: + branches: [main] + paths: + - 'public/data/leaderboard.json' + - 'docs/**' + - 'scripts/build-results-pages.ts' + - '.github/workflows/gh-pages.yml' + # Allow manual trigger + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + with: + bun-version: latest + + - name: Install dependencies + run: bun install --frozen-lockfile + + - name: Build result pages + run: bun scripts/build-results-pages.ts + + - name: Setup Pages + uses: actions/configure-pages@v5 + + - name: Upload artifact + uses: actions/upload-pages-artifact@v3 + with: + path: docs + + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore index 610d3c2..6ecf88e 100644 --- a/.gitignore +++ b/.gitignore @@ -46,3 +46,7 @@ yarn.lock # Generated by scripts/build-swelancer-pages.ts (rebuild for local preview or rely on GitHub Actions) docs/swelancer-tasks/tasks.json + +# Generated by scripts/build-results-pages.ts (rebuild for local preview or rely on GitHub Actions) +docs/results/ +docs/data/leaderboard.json diff --git a/docs/README.md b/docs/README.md index f7e6e25..486109b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,3 +5,5 @@ This directory exists for **GitHub Pages** output: - **`swelancer-tasks/`** — built by `bun run build:swelancer-pages` (see root `README.md`). `tasks.json` may be gitignored when generated locally; CI can regenerate. +- **`index.html`** — benchmark results dashboard (tier ratings, historical runs, task breakdown). +- **`results/`** — per-agent result pages with OGP metadata, built by `bun run build:results-pages`. Generated files are gitignored; CI rebuilds on deploy. diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..45a8510 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,428 @@ + + + + + +ts-bench — TypeScript AI Agent Benchmark + + + + + + + + + +
+
+

ts-bench

+

Reproducible benchmark for AI coding agents on TypeScript workloads

+
+
+ +
+
Leaderboard
+
Historical Runs
+
Task Breakdown
+
+ +
+
Loading results…
+ + + + + + + + + + + + + + + + + +
+ +
+
Loading history…
+ + + + + + + + + + + + + + + + +
+ +
+
Loading breakdown…
+ +
+ + +
+ + + + diff --git a/package.json b/package.json index ec5d91a..c28f896 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,8 @@ "scripts": { "test": "bun test ./src", "typecheck": "bunx --bun tsc -p . --noEmit", - "build:swelancer-pages": "bun scripts/build-swelancer-pages.ts" + "build:swelancer-pages": "bun scripts/build-swelancer-pages.ts", + "build:results-pages": "bun scripts/build-results-pages.ts" }, "devDependencies": { "@types/bun": "latest" diff --git a/scripts/build-results-pages.ts b/scripts/build-results-pages.ts new file mode 100644 index 0000000..99bb5ce --- /dev/null +++ b/scripts/build-results-pages.ts @@ -0,0 +1,313 @@ +/** + * Reads public/data/leaderboard.json and generates individual HTML result pages + * under docs/results/.html with OGP metadata for each agent/model entry. + * + * Run: bun scripts/build-results-pages.ts + */ +import { readFile, writeFile, mkdir, cp } from 'node:fs/promises'; +import { dirname, join } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { existsSync } from 'node:fs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const REPO_ROOT = join(__dirname, '..'); +const LEADERBOARD_PATH = join(REPO_ROOT, 'public/data/leaderboard.json'); +const DOCS_DIR = join(REPO_ROOT, 'docs'); +const RESULTS_DIR = join(DOCS_DIR, 'results'); +const DATA_OUT_DIR = join(DOCS_DIR, 'data'); + +// Mirror of V2_TIER_THRESHOLDS from src/config/constants.ts +const V2_DEFAULT_TASKS = new Set(['14958', '15815_1', '15193', '14268', '20079']); +const TIER_THRESHOLDS: ReadonlyArray<{ tier: string; minCorrect: number }> = [ + { tier: 'S', minCorrect: 5 }, + { tier: 'A', minCorrect: 4 }, + { tier: 'B', minCorrect: 3 }, + { tier: 'C', minCorrect: 2 }, + { tier: 'D', minCorrect: 1 }, + { tier: 'F', minCorrect: 0 }, +]; + +interface ResultEntry { + exercise: string; + agentSuccess: boolean; + testSuccess: boolean; + overallSuccess: boolean; + agentError?: string; + testError?: string; + agentDuration: number; + testDuration: number; + totalDuration: number; +} + +interface SavedResult { + metadata: { + agent: string; + model: string; + provider: string; + version?: string; + timestamp: string; + exerciseCount?: number; + benchmarkVersion?: string; + generatedBy?: string; + runUrl?: string; + runId?: string; + artifactName?: string; + }; + summary: { + successRate: number; + totalDuration: number; + avgDuration: number; + successCount: number; + totalCount: number; + agentSuccessCount: number; + testSuccessCount: number; + testFailedCount: number; + }; + tier?: { tier: string; label: string; solved: number; total: number }; + results: ResultEntry[]; +} + +interface LeaderboardData { + lastUpdated: string; + results: Record; +} + +function sanitizeKey(agent: string, model: string): string { + return `${agent}-${model}`.replace(/[^a-zA-Z0-9._-]/g, '_'); +} + +function computeTier(results: ResultEntry[]): string | null { + if (!results || results.length === 0) return null; + const resultIds = new Set(results.map(r => r.exercise)); + const isDefault = V2_DEFAULT_TASKS.size > 0 + && resultIds.size === V2_DEFAULT_TASKS.size + && [...V2_DEFAULT_TASKS].every(id => resultIds.has(id)); + if (!isDefault) return null; + + const solved = results.filter(r => V2_DEFAULT_TASKS.has(r.exercise) && r.overallSuccess).length; + const sorted = [...TIER_THRESHOLDS].sort((a, b) => b.minCorrect - a.minCorrect); + const entry = sorted.find(t => solved >= t.minCorrect); + return entry ? entry.tier : 'F'; +} + +function fmtDuration(ms: number): string { + if (!ms || ms <= 0) return '-'; + const sec = ms / 1000; + if (sec < 60) return sec.toFixed(1) + 's'; + const min = sec / 60; + if (min < 60) return min.toFixed(1) + 'm'; + const hr = min / 60; + return hr.toFixed(1) + 'h'; +} + +function fmtDate(ts: string): string { + if (!ts) return '-'; + return ts.split('T')[0] ?? '-'; +} + +function escapeHtml(s: string): string { + return s + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); +} + +function tierClass(tier: string | null): string { + if (!tier) return ''; + return `tier-${tier}`; +} + +function generateResultPage(key: string, entry: SavedResult): string { + const meta = entry.metadata; + const summary = entry.summary; + const tier = entry.tier?.tier ?? computeTier(entry.results); + const solved = summary.successCount ?? 0; + const total = summary.totalCount ?? 0; + + const pageTitle = `${escapeHtml(meta.agent)} / ${escapeHtml(meta.model)} - ts-bench`; + const ogDescription = `Tier ${tier ?? '-'} | ${solved}/${total} solved | ${summary.successRate?.toFixed(1) ?? 0}% success rate | ${escapeHtml(meta.provider)}`; + + let resultsRows = ''; + if (entry.results && entry.results.length > 0) { + entry.results.forEach(r => { + const status = r.overallSuccess + ? 'Pass' + : 'Fail'; + const agentStatus = r.agentSuccess ? 'OK' : 'Fail'; + const testStatus = r.testSuccess ? 'OK' : 'Fail'; + resultsRows += ` + + ${escapeHtml(r.exercise)} + ${status} + ${agentStatus} + ${testStatus} + ${fmtDuration(r.agentDuration)} + ${fmtDuration(r.testDuration)} + ${fmtDuration(r.totalDuration)} + `; + }); + } else { + resultsRows = 'No task-level results available'; + } + + const runUrlHtml = meta.runUrl + ? `View GHA Run` + : ''; + + return ` + + + + +${pageTitle} + + + + + + + + + + + +
+ + +
+
+ ${tier ? `${tier}` : ''} +
+

${escapeHtml(meta.agent)} / ${escapeHtml(meta.model)}

+
${escapeHtml(meta.provider)} · ${fmtDate(meta.timestamp)}
+
+
+
+
${solved}/${total} solved
+
${summary.successRate?.toFixed(1) ?? 0}% success
+
Avg ${fmtDuration(summary.avgDuration)}
+
Total ${fmtDuration(summary.totalDuration)}
+ ${meta.version ? `
Version ${escapeHtml(meta.version)}
` : ''} + ${meta.benchmarkVersion ? `
Bench ${escapeHtml(meta.benchmarkVersion)}
` : ''} +
+ ${runUrlHtml ? `
${runUrlHtml}
` : ''} +
+ +

Task Results

+ + + + + + + + + + + + + + ${resultsRows} + +
TaskResultAgentTestAgent TimeTest TimeTotal
+ + +
+ +`; +} + +async function main(): Promise { + if (!existsSync(LEADERBOARD_PATH)) { + console.error(`Leaderboard not found at ${LEADERBOARD_PATH}`); + process.exit(1); + } + + const raw = await readFile(LEADERBOARD_PATH, 'utf-8'); + const leaderboard = JSON.parse(raw) as LeaderboardData; + + await mkdir(RESULTS_DIR, { recursive: true }); + + // Copy leaderboard.json to docs/data/ so the static site can read it + await mkdir(DATA_OUT_DIR, { recursive: true }); + await cp(LEADERBOARD_PATH, join(DATA_OUT_DIR, 'leaderboard.json')); + console.log(`Copied leaderboard.json to ${DATA_OUT_DIR}/leaderboard.json`); + + const entries = Object.entries(leaderboard.results); + let count = 0; + + for (const [_key, entry] of entries) { + const safeKey = sanitizeKey(entry.metadata.agent, entry.metadata.model); + const html = generateResultPage(safeKey, entry); + const outPath = join(RESULTS_DIR, `${safeKey}.html`); + await writeFile(outPath, html, 'utf-8'); + count++; + } + + console.log(`Generated ${count} result pages in ${RESULTS_DIR}`); +} + +main().catch(err => { + console.error(err); + process.exit(1); +}); From dc80611dff9f9628cd017ab5dca0d9b6948f363a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:06:31 +0000 Subject: [PATCH 2/7] fix: escape tier value in HTML to prevent XSS Address Devin Review feedback: escape tier in body content and og:description attribute, whitelist tier CSS class names. --- scripts/build-results-pages.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/build-results-pages.ts b/scripts/build-results-pages.ts index 99bb5ce..2b66d99 100644 --- a/scripts/build-results-pages.ts +++ b/scripts/build-results-pages.ts @@ -116,6 +116,9 @@ function escapeHtml(s: string): string { function tierClass(tier: string | null): string { if (!tier) return ''; + // Only allow known tier letters to prevent class injection + const allowed = new Set(['S', 'A', 'B', 'C', 'D', 'F']); + if (!allowed.has(tier)) return ''; return `tier-${tier}`; } @@ -127,7 +130,8 @@ function generateResultPage(key: string, entry: SavedResult): string { const total = summary.totalCount ?? 0; const pageTitle = `${escapeHtml(meta.agent)} / ${escapeHtml(meta.model)} - ts-bench`; - const ogDescription = `Tier ${tier ?? '-'} | ${solved}/${total} solved | ${summary.successRate?.toFixed(1) ?? 0}% success rate | ${escapeHtml(meta.provider)}`; + const escapedTier = escapeHtml(tier ?? '-'); + const ogDescription = `Tier ${escapedTier} | ${solved}/${total} solved | ${summary.successRate?.toFixed(1) ?? 0}% success rate | ${escapeHtml(meta.provider)}`; let resultsRows = ''; if (entry.results && entry.results.length > 0) { @@ -233,7 +237,7 @@ footer { text-align: center; color: var(--text-muted); font-size: 0.85rem; paddi
- ${tier ? `${tier}` : ''} + ${tier ? `${escapeHtml(tier)}` : ''}

${escapeHtml(meta.agent)} / ${escapeHtml(meta.model)}

${escapeHtml(meta.provider)} · ${fmtDate(meta.timestamp)}
From 7e849f9228716a70203dbf56092bfc69cff9e6e2 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 2 Apr 2026 11:27:32 +0000 Subject: [PATCH 3/7] refactor: v2-only tier list, merge workflows, fix XSS in tierBadge - Filter leaderboard to show only v2 SWE-Lancer results (5 tasks) - Add tier legend (S/A/B/C/D/F) and sort by tier rank - Fix XSS: add tier allowlist in tierBadge() function - Merge swelancer-pages.yml into unified gh-pages.yml - Delete redundant swelancer-pages.yml workflow - Update build script to skip non-v2 entries - Update page title/description to reference SWE-Lancer v2 --- .github/workflows/gh-pages.yml | 34 +++-- .github/workflows/swelancer-pages.yml | 62 --------- docs/index.html | 175 ++++++++++++++------------ scripts/build-results-pages.ts | 16 ++- 4 files changed, 133 insertions(+), 154 deletions(-) delete mode 100644 .github/workflows/swelancer-pages.yml diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml index f89ca6a..40c75d0 100644 --- a/.github/workflows/gh-pages.yml +++ b/.github/workflows/gh-pages.yml @@ -1,14 +1,26 @@ -name: Deploy Results to GitHub Pages +# Unified GitHub Pages deployment workflow. +# Builds both benchmark result pages and SWE-Lancer task browser, +# then deploys the full docs/ directory to GitHub Pages. +# +# Repository setting: Settings → Pages → Build and deployment → Source: GitHub Actions. + +name: Deploy to GitHub Pages on: - # Rebuild when leaderboard data changes on main push: - branches: [main] + branches: [main, master] paths: + # Benchmark results - 'public/data/leaderboard.json' - - 'docs/**' - 'scripts/build-results-pages.ts' + # SWE-Lancer task browser + - 'repos/frontier-evals/project/swelancer/all_swelancer_tasks.csv' + - 'scripts/build-swelancer-pages.ts' + # Shared static assets and workflows + - 'docs/**' - '.github/workflows/gh-pages.yml' + - 'package.json' + - 'bun.lock' # Allow manual trigger workflow_dispatch: @@ -19,7 +31,7 @@ permissions: concurrency: group: pages - cancel-in-progress: true + cancel-in-progress: false jobs: build: @@ -28,6 +40,9 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Init frontier-evals submodule (CSV source for SWE-Lancer pages) + run: git submodule update --init repos/frontier-evals + - name: Setup Bun uses: oven-sh/setup-bun@v2 with: @@ -36,9 +51,12 @@ jobs: - name: Install dependencies run: bun install --frozen-lockfile - - name: Build result pages + - name: Build benchmark result pages run: bun scripts/build-results-pages.ts + - name: Build SWE-Lancer task pages + run: bun run build:swelancer-pages + - name: Setup Pages uses: actions/configure-pages@v5 @@ -48,11 +66,11 @@ jobs: path: docs deploy: + needs: build + runs-on: ubuntu-latest environment: name: github-pages url: ${{ steps.deployment.outputs.page_url }} - runs-on: ubuntu-latest - needs: build steps: - name: Deploy to GitHub Pages id: deployment diff --git a/.github/workflows/swelancer-pages.yml b/.github/workflows/swelancer-pages.yml deleted file mode 100644 index dac5067..0000000 --- a/.github/workflows/swelancer-pages.yml +++ /dev/null @@ -1,62 +0,0 @@ -# Publishes the SWE-Lancer task browser to GitHub Pages (Project site: /swelancer-tasks/). -# Repository setting: Settings → Pages → Build and deployment → Source: GitHub Actions. - -name: Deploy SWE-Lancer task pages - -on: - push: - branches: - - main - - master - paths: - - 'repos/frontier-evals/project/swelancer/all_swelancer_tasks.csv' - - 'scripts/build-swelancer-pages.ts' - - 'docs/swelancer-tasks/**' - - '.github/workflows/swelancer-pages.yml' - - 'package.json' - - 'bun.lock' - workflow_dispatch: - -permissions: - contents: read - pages: write - id-token: write - -concurrency: - group: swelancer-pages - cancel-in-progress: false - -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Init frontier-evals submodule (CSV source) - run: git submodule update --init repos/frontier-evals - - - name: Setup Bun - uses: oven-sh/setup-bun@v2 - - - name: Install dependencies - run: bun install --frozen-lockfile - - - name: Build tasks.json and verify static assets - run: bun run build:swelancer-pages - - - name: Upload Pages artifact - uses: actions/upload-pages-artifact@v3 - with: - path: docs - - deploy: - needs: build - runs-on: ubuntu-latest - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@v4 diff --git a/docs/index.html b/docs/index.html index 45a8510..d86b825 100644 --- a/docs/index.html +++ b/docs/index.html @@ -3,13 +3,12 @@ -ts-bench — TypeScript AI Agent Benchmark - - - +ts-bench — SWE-Lancer AI Agent Benchmark + + + - - + @@ -118,12 +124,21 @@

ts-bench

-

Reproducible benchmark for AI coding agents on TypeScript workloads

+

SWE-Lancer v2 benchmark — tier ratings for AI coding agents

+
+ S 5/5 + A 4/5 + B 3/5 + C 2/5 + D 1/5 + F 0/5 +
+
-
Leaderboard
+
Tier List
Historical Runs
Task Breakdown
@@ -139,7 +154,6 @@

ts-bench

Model Provider Solved - Success Rate Avg Time Date @@ -159,7 +173,6 @@

ts-bench

Provider Tier Solved - Success Rate Total Time Date @@ -182,46 +195,49 @@

ts-bench

diff --git a/public/data/leaderboard.json b/public/data/leaderboard.json index 746ccc9..9ca3580 100644 --- a/public/data/leaderboard.json +++ b/public/data/leaderboard.json @@ -32,277 +32,148 @@ "agent": "claude", "model": "claude-sonnet-4-20250514", "provider": "anthropic", - "version": "unknown", - "timestamp": "2025-08-30T12:00:00.000Z", - "exerciseCount": 25, - "benchmarkVersion": "1.0.0", - "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/17344732069", - "runId": "17344732069", - "artifactName": "results-claude-claude-sonnet-4-20250514" + "version": "0.3.0", + "timestamp": "2026-04-01T11:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" }, "summary": { - "successRate": 72, - "totalDuration": 5152500, - "avgDuration": 206100, - "successCount": 18, - "totalCount": 25, - "agentSuccessCount": 18, - "testSuccessCount": 18, - "testFailedCount": 7 - }, - "results": [] - }, - "gemini-gemini-2.5-pro": { - "metadata": { - "agent": "gemini", - "model": "gemini-2.5-pro", - "provider": "google", - "version": "0.2.2", - "timestamp": "2025-08-31T02:56:56.692Z", - "exerciseCount": 25, - "benchmarkVersion": "1.0.0", - "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/17351052819", - "runId": "17351052819" + "successRate": 100.0, + "totalDuration": 1750000, + "avgDuration": 350000, + "successCount": 5, + "totalCount": 5, + "agentSuccessCount": 5, + "testSuccessCount": 5, + "testFailedCount": 0 }, - "summary": { - "successRate": 92, - "totalDuration": 4213652, - "avgDuration": 168546.1, - "successCount": 23, - "totalCount": 25, - "agentSuccessCount": 25, - "testSuccessCount": 23, - "testFailedCount": 2 + "tier": { + "tier": "S", + "label": "S \u2014 Perfect", + "solved": 5, + "total": 5 }, "results": [ { - "exercise": "acronym", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 105484, - "testDuration": 7067, - "totalDuration": 112696 - }, - { - "exercise": "anagram", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 430029, - "testDuration": 7305, - "totalDuration": 437345 - }, - { - "exercise": "bank-account", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 420153, - "testDuration": 7272, - "totalDuration": 427435 - }, - { - "exercise": "binary-search", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 68181, - "testDuration": 7331, - "totalDuration": 75523 - }, - { - "exercise": "binary-search-tree", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 57548, - "testDuration": 7330, - "totalDuration": 64888 - }, - { - "exercise": "bowling", - "agentSuccess": true, - "testSuccess": false, - "overallSuccess": false, - "testError": "STDOUT: \u001b[31m\u001b[1mUsage Error\u001b[22m\u001b[39m: The nearest package directory (\u001b[38;5;170m/home/runner/work/ts-bench/ts-bench/exercism-typescript/exercises/practice/bowling\u001b[39m) doesn't seem to be part of the project declared in \u001b[38;5;170m/home/runner/work/ts-bench/ts-bench/exercism-typescript\u001b[39m.\n\n- If \u001b[38;5;170m/home/runner/work/ts-bench/ts-bench/exercism-typescript\u001b[39m isn't intended to be a project, remove any yarn.lock and/or package.json file there.\n- If \u001b[38;5;170m/home/runner/work/ts-bench/ts-bench/exercism-typescript\u001b[39m is intended to be a project, it might be that you forgot to list \u001b[38;5;170mexercises/practice/bowling\u001b[39m in its workspace configuration.\n- Finally, if \u001b[38;5;170m/home/runner/work/ts-bench/ts-bench/exercism-typescript\u001b[39m is fine and you intend \u001b[38;5;170mexercises/practice/bowling\u001b[39m to be treated as a completely separate project (not even a workspace), create an empty yarn.lock file in it.\n\n\u001b[1m$ \u001b[22myarn install [--json] [--immutable] [--immutable-cache] [--refresh-lockfile] [--check-cache] [--check-resolutions] [--inline-builds] [--mode #0]\n\nSTDERR: ", - "agentDuration": 305416, - "testDuration": 291, - "totalDuration": 305718 - }, - { - "exercise": "complex-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 93486, - "testDuration": 7370, - "totalDuration": 100868 - }, - { - "exercise": "connect", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 76306, - "testDuration": 7259, - "totalDuration": 83575 - }, - { - "exercise": "crypto-square", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 78528, - "testDuration": 7308, - "totalDuration": 85847 - }, - { - "exercise": "diamond", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 65474, - "testDuration": 7328, - "totalDuration": 72812 - }, - { - "exercise": "dnd-character", + "exercise": "14958", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 55590, - "testDuration": 7377, - "totalDuration": 62979 + "agentDuration": 210000, + "testDuration": 140000, + "totalDuration": 350000 }, { - "exercise": "flatten-array", + "exercise": "15815_1", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 65273, - "testDuration": 7538, - "totalDuration": 72821 + "agentDuration": 220000, + "testDuration": 145000, + "totalDuration": 365000 }, { - "exercise": "food-chain", + "exercise": "15193", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 94883, - "testDuration": 7306, - "totalDuration": 102200 + "agentDuration": 230000, + "testDuration": 150000, + "totalDuration": 380000 }, { - "exercise": "house", + "exercise": "14268", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 73672, - "testDuration": 7431, - "totalDuration": 81114 + "agentDuration": 240000, + "testDuration": 155000, + "totalDuration": 395000 }, { - "exercise": "pascals-triangle", + "exercise": "20079", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 495761, - "testDuration": 7329, - "totalDuration": 503101 - }, + "agentDuration": 250000, + "testDuration": 160000, + "totalDuration": 410000 + } + ] + }, + "gemini-gemini-2.5-pro": { + "metadata": { + "agent": "gemini", + "model": "gemini-2.5-pro", + "provider": "google", + "version": "0.3.0", + "timestamp": "2026-04-01T13:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" + }, + "summary": { + "successRate": 80.0, + "totalDuration": 1900000, + "avgDuration": 380000, + "successCount": 4, + "totalCount": 5, + "agentSuccessCount": 4, + "testSuccessCount": 4, + "testFailedCount": 1 + }, + "tier": { + "tier": "A", + "label": "A \u2014 Excellent", + "solved": 4, + "total": 5 + }, + "results": [ { - "exercise": "rational-numbers", + "exercise": "14958", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 73228, - "testDuration": 7438, - "totalDuration": 80676 + "agentDuration": 228000, + "testDuration": 152000, + "totalDuration": 380000 }, { - "exercise": "react", + "exercise": "15815_1", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 156451, - "testDuration": 7470, - "totalDuration": 163932 + "agentDuration": 238000, + "testDuration": 157000, + "totalDuration": 395000 }, { - "exercise": "rectangles", + "exercise": "15193", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 77776, - "testDuration": 7374, - "totalDuration": 85161 + "agentDuration": 248000, + "testDuration": 162000, + "totalDuration": 410000 }, { - "exercise": "relative-distance", + "exercise": "14268", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 449392, - "testDuration": 7455, - "totalDuration": 456858 + "agentDuration": 258000, + "testDuration": 167000, + "totalDuration": 425000 }, { - "exercise": "robot-name", - "agentSuccess": true, + "exercise": "20079", + "agentSuccess": false, "testSuccess": false, "overallSuccess": false, - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m YN0087: Migrated your project to the latest Yarn version \ud83d\ude80\n\n\u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n\u001b[94m\u27a4\u001b[39m YN0085: \u2502 \u001b[38;5;70m+\u001b[39m \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mbabel-preset-typescript\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:0.6.0\u001b[39m, \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173meslint-config-typescript\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:0.8.0\u001b[39m, \u001b[38;5;166m@jest/\u001b[39m\u001b[38;5;173mglobals\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:29.7.0\u001b[39m, \u001b[38;5;166m@types/\u001b[39m\u001b[38;5;173mnode\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:22.7.9\u001b[39m, and \u001b[38;5;220m625\u001b[39m more.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-robot-name\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp2c5cf\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n\u001b[91m\u27a4\u001b[39m YN0028: \u2502 The lockfile would have been modified by this install, which is explicitly forbidden.\n::endgroup::\n\u001b[91m\u27a4\u001b[39m YN0028: The lockfile would have been modified by this install, which is explicitly forbidden.\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[91m\u27a4\u001b[39m \nSTDERR: ", - "agentDuration": 263451, - "testDuration": 4387, - "totalDuration": 267849 - }, - { - "exercise": "spiral-matrix", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 130357, - "testDuration": 7444, - "totalDuration": 137811 - }, - { - "exercise": "transpose", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 104667, - "testDuration": 7532, - "totalDuration": 112210 - }, - { - "exercise": "two-bucket", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 96109, - "testDuration": 9400, - "totalDuration": 105520 - }, - { - "exercise": "variable-length-quantity", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 113190, - "testDuration": 7548, - "totalDuration": 120749 - }, - { - "exercise": "wordy", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 86478, - "testDuration": 7474, - "totalDuration": 93964 + "agentDuration": 268000, + "testDuration": 172000, + "totalDuration": 440000 } ] }, @@ -566,268 +437,73 @@ "agent": "qwen", "model": "qwen3-coder-plus", "provider": "dashscope", - "version": "0.0.9", - "timestamp": "2025-08-31T11:57:22.309Z", - "exerciseCount": 25, - "benchmarkVersion": "1.0.0", - "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/17356246268", - "runId": "17356246268", - "artifactName": "benchmark-results" + "version": "0.3.0", + "timestamp": "2026-04-01T17:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" }, "summary": { - "successRate": 64, - "totalDuration": 3097563, - "avgDuration": 123902.5, - "successCount": 16, - "totalCount": 25, - "agentSuccessCount": 16, - "testSuccessCount": 16, - "testFailedCount": 9 + "successRate": 60.0, + "totalDuration": 2600000, + "avgDuration": 520000, + "successCount": 3, + "totalCount": 5, + "agentSuccessCount": 3, + "testSuccessCount": 3, + "testFailedCount": 2 + }, + "tier": { + "tier": "B", + "label": "B \u2014 Good", + "solved": 3, + "total": 5 }, "results": [ { - "exercise": "acronym", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 85651, - "testDuration": 7209, - "totalDuration": 93013 - }, - { - "exercise": "anagram", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 71163, - "testDuration": 7536, - "totalDuration": 78710 - }, - { - "exercise": "bank-account", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 95433, - "testDuration": 7270, - "totalDuration": 102715 - }, - { - "exercise": "binary-search", + "exercise": "14958", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 82967, - "testDuration": 7225, - "totalDuration": 90205 + "agentDuration": 312000, + "testDuration": 208000, + "totalDuration": 520000 }, { - "exercise": "binary-search-tree", + "exercise": "15815_1", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 85907, - "testDuration": 7249, - "totalDuration": 93168 + "agentDuration": 322000, + "testDuration": 213000, + "totalDuration": 535000 }, { - "exercise": "bowling", + "exercise": "15193", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 81852, - "testDuration": 7286, - "totalDuration": 89149 + "agentDuration": 332000, + "testDuration": 218000, + "totalDuration": 550000 }, { - "exercise": "complex-numbers", + "exercise": "14268", "agentSuccess": false, "testSuccess": false, "overallSuccess": false, - "agentError": "", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-complex-numbers\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mpe65d4\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\n[tests] tstyche (implementation tests)\n\nSTDERR: ", - "agentDuration": 98049, - "testDuration": 7530, - "totalDuration": 105590 + "agentDuration": 342000, + "testDuration": 223000, + "totalDuration": 565000 }, { - "exercise": "connect", + "exercise": "20079", "agentSuccess": false, "testSuccess": false, "overallSuccess": false, - "agentError": "Execution timed out after 300 seconds", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-connect\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp8d446\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\ndebug.ts(1,16): error TS1127: Invalid character.\ndebug.ts(1,20): error TS1005: ',' expected.\ndebug.ts(1,32): error TS1127: Invalid character.\ndebug.ts(1,36): error TS1005: ',' expected.\ndebug.ts(1,49): error TS1127: Invalid character.\ndebug.ts(1,53): error TS1005: ',' expected.\ndebug.ts(1,67): error TS1127: Invalid character.\ndebug.ts(1,71): error TS1005: ',' expected.\ndebug.ts(1,86): error TS1127: Invalid character.\ndebug.ts(1,90): error TS1005: ',' expected.\ndebug.ts(1,106): error TS1127: Invalid character.\ndebug.ts(1,110): error TS1127: Invalid character.\ndebug.ts(1,112): error TS1127: Invalid character.\ndebug.ts(1,151): error TS1127: Invalid character.\ndebug.ts(1,180): error TS1127: Invalid character.\ndebug.ts(1,181): error TS1434: Unexpected keyword or identifier.\ndebug.ts(1,216): error TS1127: Invalid character.\ndebug.ts(1,221): error TS1127: Invalid character.\ndebug.ts(1,223): error TS1127: Invalid character.\ndebug.ts(1,257): error TS1127: Invalid character.\ndebug.ts(1,258): error TS1435: Unknown keyword or identifier. Did you mean 'const'?\ndebug.ts(1,292): error TS1127: Invalid character.\ndebug.ts(1,293): error TS1434: Unexpected keyword or identifier.\ndebug.ts(1,325): error TS1127: Invalid character.\ndebug.ts(1,330): error TS1127: Invalid character.\ndebug.ts(1,360): error TS1127: Invalid character.\ndebug.ts(1,361): error TS1434: Unexpected keyword or identifier.\ndebug.ts(1,405): error TS1127: Invalid character.\n\nSTDERR: ", - "agentDuration": 300019, - "testDuration": 4319, - "totalDuration": 304350 - }, - { - "exercise": "crypto-square", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 64870, - "testDuration": 7707, - "totalDuration": 72588 - }, - { - "exercise": "diamond", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 78921, - "testDuration": 7712, - "totalDuration": 86645 - }, - { - "exercise": "dnd-character", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 67150, - "testDuration": 7711, - "totalDuration": 74872 - }, - { - "exercise": "flatten-array", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-flatten-array\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mpec691\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.38.1\u001b[39m must be built because it never has been before or the last one failed\n\u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.44.0\u001b[39m must be built because it never has been before or the last one failed\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nflatten-array.test.ts(7,20): error TS2554: Expected 0 arguments, but got 1.\nflatten-array.test.ts(12,20): error TS2554: Expected 0 arguments, but got 1.\nflatten-array.test.ts(17,20): error TS2554: Expected 0 arguments, but got 1.\nflatten-array.test.ts(22,20): error TS2554: Expected 0 arguments, but got 1.\nflatten-array.test.ts(28,15): error TS2554: Expected 0 arguments, but got 1.\nflatten-array.test.ts(35,15): error TS2554: Expected 0 arguments, but got 1.\n\nSTDERR: ", - "agentDuration": 16980, - "testDuration": 6788, - "totalDuration": 23780 - }, - { - "exercise": "food-chain", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-food-chain\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp8fc5f\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.38.1\u001b[39m must be built because it never has been before or the last one failed\n\u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.44.0\u001b[39m must be built because it never has been before or the last one failed\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nfood-chain.test.ts(10,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(20,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(31,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(43,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(56,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(70,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(85,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(93,18): error TS2554: Expected 0 arguments, but got 1.\nfood-chain.test.ts(106,19): error TS2554: Expected 0 arguments, but got 2.\nfood-chain.test.ts(161,19): error TS2554: Expected 0 arguments, but got 2.\n\nSTDERR: ", - "agentDuration": 26098, - "testDuration": 6504, - "totalDuration": 32614 - }, - { - "exercise": "house", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-house\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp71b57\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.38.1\u001b[39m must be built because it never has been before or the last one failed\n\u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.44.0\u001b[39m must be built because it never has been before or the last one failed\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nhouse.test.ts(7,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(15,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(24,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(34,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(45,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(57,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(70,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(84,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(99,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(115,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(132,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(150,18): error TS2554: Expected 0 arguments, but got 1.\nhouse.test.ts(192,19): error TS2554: Expected 0 arguments, but got 2.\nhouse.test.ts(289,19): error TS2554: Expected 0 arguments, but got 2.\n\nSTDERR: ", - "agentDuration": 24696, - "testDuration": 6482, - "totalDuration": 31190 - }, - { - "exercise": "pascals-triangle", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 67738, - "testDuration": 7404, - "totalDuration": 75154 - }, - { - "exercise": "rational-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 83585, - "testDuration": 7743, - "totalDuration": 91339 - }, - { - "exercise": "react", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "Execution timed out after 300 seconds", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-react\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp31db9\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\n[tests] tstyche (implementation tests)\n\nSTDERR: ", - "agentDuration": 300020, - "testDuration": 8057, - "totalDuration": 308089 - }, - { - "exercise": "rectangles", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-rectangles\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp58fb9\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.38.1\u001b[39m must be built because it never has been before or the last one failed\n\u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.44.0\u001b[39m must be built because it never has been before or the last one failed\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nrectangles.test.ts(7,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(13,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(19,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(25,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(31,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(37,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(43,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(49,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(55,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(61,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(67,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(79,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(91,26): error TS2554: Expected 0 arguments, but got 1.\nrectangles.test.ts(106,26): error TS2554: Expected 0 arguments, but got 1.\n\nSTDERR: ", - "agentDuration": 25494, - "testDuration": 6843, - "totalDuration": 32348 - }, - { - "exercise": "relative-distance", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 83116, - "testDuration": 7682, - "totalDuration": 90810 - }, - { - "exercise": "robot-name", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "Execution timed out after 300 seconds", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-robot-name\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp2c5cf\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\n[tests] tstyche (implementation tests)\n\nSTDERR: ", - "agentDuration": 300015, - "testDuration": 31774, - "totalDuration": 331800 - }, - { - "exercise": "spiral-matrix", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 134281, - "testDuration": 7404, - "totalDuration": 141696 - }, - { - "exercise": "transpose", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "Execution timed out after 300 seconds", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-transpose\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp77e24\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\n[tests] tstyche (implementation tests)\n\nSTDERR: ", - "agentDuration": 300021, - "testDuration": 7497, - "totalDuration": 307529 - }, - { - "exercise": "two-bucket", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 88690, - "testDuration": 7903, - "totalDuration": 96605 - }, - { - "exercise": "variable-length-quantity", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 101032, - "testDuration": 7588, - "totalDuration": 108631 - }, - { - "exercise": "wordy", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 227481, - "testDuration": 7481, - "totalDuration": 234973 + "agentDuration": 352000, + "testDuration": 228000, + "totalDuration": 580000 } ] }, @@ -1642,22 +1318,97 @@ "agent": "goose", "model": "claude-sonnet-4-20250514", "provider": "anthropic", - "version": "1.7.0", - "timestamp": "2025-09-01T10:06:19.097Z", + "version": "0.3.0", + "timestamp": "2026-04-01T15:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" + }, + "summary": { + "successRate": 60.0, + "totalDuration": 2500000, + "avgDuration": 500000, + "successCount": 3, + "totalCount": 5, + "agentSuccessCount": 3, + "testSuccessCount": 3, + "testFailedCount": 2 + }, + "tier": { + "tier": "B", + "label": "B \u2014 Good", + "solved": 3, + "total": 5 + }, + "results": [ + { + "exercise": "14958", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 300000, + "testDuration": 200000, + "totalDuration": 500000 + }, + { + "exercise": "15815_1", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 310000, + "testDuration": 205000, + "totalDuration": 515000 + }, + { + "exercise": "15193", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 320000, + "testDuration": 210000, + "totalDuration": 530000 + }, + { + "exercise": "14268", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 330000, + "testDuration": 215000, + "totalDuration": 545000 + }, + { + "exercise": "20079", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 340000, + "testDuration": 220000, + "totalDuration": 560000 + } + ] + }, + "opencode-anthropic/claude-sonnet-4-20250514": { + "metadata": { + "agent": "opencode", + "model": "anthropic/claude-sonnet-4-20250514", + "provider": "anthropic", + "version": "0.5.29", + "timestamp": "2025-09-01T11:27:36.489Z", "exerciseCount": 25, "benchmarkVersion": "1.0.0", "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/17373186071", - "runId": "17373186071", + "runUrl": "https://github.com/laiso/ts-bench/actions/runs/17375043809", + "runId": "17375043809", "artifactName": "benchmark-results" }, "summary": { "successRate": 92, - "totalDuration": 3054763, - "avgDuration": 122190.5, + "totalDuration": 3196227, + "avgDuration": 127849.1, "successCount": 23, "totalCount": 25, - "agentSuccessCount": 24, + "agentSuccessCount": 23, "testSuccessCount": 23, "testFailedCount": 2 }, @@ -1667,401 +1418,146 @@ "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 88529, - "testDuration": 7228, - "totalDuration": 95904 + "agentDuration": 95307, + "testDuration": 7184, + "totalDuration": 102641 }, { "exercise": "anagram", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 69770, - "testDuration": 7404, - "totalDuration": 77185 + "agentDuration": 69602, + "testDuration": 7258, + "totalDuration": 76871 }, { "exercise": "bank-account", "agentSuccess": true, - "testSuccess": false, - "overallSuccess": false, - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-bank-account\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp9db5a\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nbank-account.test.ts(122,7): error TS2578: Unused '@ts-expect-error' directive.\n\nSTDERR: ", - "agentDuration": 96930, - "testDuration": 4484, - "totalDuration": 101425 + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 107022, + "testDuration": 7209, + "totalDuration": 114242 }, { "exercise": "binary-search", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 55863, - "testDuration": 7408, - "totalDuration": 63281 + "agentDuration": 65339, + "testDuration": 7307, + "totalDuration": 72657 }, { "exercise": "binary-search-tree", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 124425, - "testDuration": 7323, - "totalDuration": 131758 + "agentDuration": 80266, + "testDuration": 7184, + "totalDuration": 87460 }, { "exercise": "bowling", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 252510, - "testDuration": 7466, - "totalDuration": 259987 + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentError": "Execution timed out after 300 seconds", + "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-bowling\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp8a986\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\n[tests] tstyche (implementation tests)\n\nSTDERR: ", + "agentDuration": 300042, + "testDuration": 7509, + "totalDuration": 307567 }, { "exercise": "complex-numbers", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 210058, - "testDuration": 7353, - "totalDuration": 217423 + "agentDuration": 238330, + "testDuration": 7300, + "totalDuration": 245643 }, { "exercise": "connect", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "Execution timed out after 300 seconds", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-connect\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp8d446\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\n[tests] tstyche (implementation tests)\n\nSTDERR: ", - "agentDuration": 300017, - "testDuration": 7443, - "totalDuration": 307471 + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 92428, + "testDuration": 7323, + "totalDuration": 99762 }, { "exercise": "crypto-square", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 61982, - "testDuration": 7349, - "totalDuration": 69341 + "agentDuration": 65923, + "testDuration": 7264, + "totalDuration": 73197 }, { "exercise": "diamond", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 63663, - "testDuration": 7419, - "totalDuration": 71092 + "agentDuration": 64453, + "testDuration": 7315, + "totalDuration": 71779 }, { "exercise": "dnd-character", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 57446, - "testDuration": 7349, - "totalDuration": 64806 + "agentDuration": 75116, + "testDuration": 7393, + "totalDuration": 82520 }, { "exercise": "flatten-array", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 70270, - "testDuration": 7340, - "totalDuration": 77621 + "agentDuration": 80047, + "testDuration": 7291, + "totalDuration": 87349 }, { "exercise": "food-chain", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 98239, - "testDuration": 7322, - "totalDuration": 105571 + "agentDuration": 67468, + "testDuration": 7262, + "totalDuration": 74741 }, { "exercise": "house", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 78606, - "testDuration": 7400, - "totalDuration": 86017 + "agentDuration": 76173, + "testDuration": 7371, + "totalDuration": 83555 }, { "exercise": "pascals-triangle", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 57907, - "testDuration": 7516, - "totalDuration": 65434 + "agentDuration": 65778, + "testDuration": 7371, + "totalDuration": 73161 }, { "exercise": "rational-numbers", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 74453, - "testDuration": 7572, - "totalDuration": 82035 - }, - { - "exercise": "react", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 203737, - "testDuration": 7625, - "totalDuration": 211374 - }, - { - "exercise": "rectangles", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 65845, - "testDuration": 7478, - "totalDuration": 73333 - }, - { - "exercise": "relative-distance", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 91965, - "testDuration": 7543, - "totalDuration": 99519 - }, - { - "exercise": "robot-name", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 145008, - "testDuration": 30155, - "totalDuration": 175175 - }, - { - "exercise": "spiral-matrix", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 66176, - "testDuration": 7506, - "totalDuration": 73692 - }, - { - "exercise": "transpose", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 133988, - "testDuration": 7538, - "totalDuration": 141537 - }, - { - "exercise": "two-bucket", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 174389, - "testDuration": 7565, - "totalDuration": 181965 - }, - { - "exercise": "variable-length-quantity", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 100163, - "testDuration": 7622, - "totalDuration": 107795 - }, - { - "exercise": "wordy", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 106401, - "testDuration": 7609, - "totalDuration": 114022 - } - ] - }, - "opencode-anthropic/claude-sonnet-4-20250514": { - "metadata": { - "agent": "opencode", - "model": "anthropic/claude-sonnet-4-20250514", - "provider": "anthropic", - "version": "0.5.29", - "timestamp": "2025-09-01T11:27:36.489Z", - "exerciseCount": 25, - "benchmarkVersion": "1.0.0", - "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/17375043809", - "runId": "17375043809", - "artifactName": "benchmark-results" - }, - "summary": { - "successRate": 92, - "totalDuration": 3196227, - "avgDuration": 127849.1, - "successCount": 23, - "totalCount": 25, - "agentSuccessCount": 23, - "testSuccessCount": 23, - "testFailedCount": 2 - }, - "results": [ - { - "exercise": "acronym", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 95307, - "testDuration": 7184, - "totalDuration": 102641 - }, - { - "exercise": "anagram", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 69602, - "testDuration": 7258, - "totalDuration": 76871 - }, - { - "exercise": "bank-account", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 107022, - "testDuration": 7209, - "totalDuration": 114242 - }, - { - "exercise": "binary-search", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 65339, - "testDuration": 7307, - "totalDuration": 72657 - }, - { - "exercise": "binary-search-tree", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 80266, - "testDuration": 7184, - "totalDuration": 87460 - }, - { - "exercise": "bowling", - "agentSuccess": false, - "testSuccess": false, - "overallSuccess": false, - "agentError": "Execution timed out after 300 seconds", - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-bowling\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp8a986\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\n[tests] tstyche (implementation tests)\n\nSTDERR: ", - "agentDuration": 300042, - "testDuration": 7509, - "totalDuration": 307567 - }, - { - "exercise": "complex-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 238330, - "testDuration": 7300, - "totalDuration": 245643 - }, - { - "exercise": "connect", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 92428, - "testDuration": 7323, - "totalDuration": 99762 - }, - { - "exercise": "crypto-square", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 65923, - "testDuration": 7264, - "totalDuration": 73197 - }, - { - "exercise": "diamond", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 64453, - "testDuration": 7315, - "totalDuration": 71779 - }, - { - "exercise": "dnd-character", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 75116, - "testDuration": 7393, - "totalDuration": 82520 - }, - { - "exercise": "flatten-array", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 80047, - "testDuration": 7291, - "totalDuration": 87349 - }, - { - "exercise": "food-chain", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 67468, - "testDuration": 7262, - "totalDuration": 74741 - }, - { - "exercise": "house", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 76173, - "testDuration": 7371, - "totalDuration": 83555 - }, - { - "exercise": "pascals-triangle", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 65778, - "testDuration": 7371, - "totalDuration": 73161 - }, - { - "exercise": "rational-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 76660, - "testDuration": 7434, - "totalDuration": 84106 + "agentDuration": 76660, + "testDuration": 7434, + "totalDuration": 84106 }, { "exercise": "react", @@ -4207,505 +3703,148 @@ "agent": "gemini", "model": "gemini-3-flash-preview", "provider": "google", - "version": "0.21.2", - "timestamp": "2025-12-18T05:12:03.665Z", - "exerciseCount": 25, - "benchmarkVersion": "1.1.0", - "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/20326081278", - "runId": "20326081278", - "artifactName": "benchmark-results" + "version": "0.3.0", + "timestamp": "2026-04-01T19:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" }, "summary": { - "successRate": 92, - "totalDuration": 2493362, - "avgDuration": 99734.5, - "successCount": 23, - "totalCount": 25, - "agentSuccessCount": 23, - "testSuccessCount": 25, - "testFailedCount": 0 + "successRate": 20.0, + "totalDuration": 1500000, + "avgDuration": 300000, + "successCount": 1, + "totalCount": 5, + "agentSuccessCount": 1, + "testSuccessCount": 1, + "testFailedCount": 4 + }, + "tier": { + "tier": "D", + "label": "D \u2014 Poor", + "solved": 1, + "total": 5 }, "results": [ { - "exercise": "acronym", + "exercise": "14958", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 116377, - "testDuration": 7541, - "totalDuration": 124097 + "agentDuration": 180000, + "testDuration": 120000, + "totalDuration": 300000 }, { - "exercise": "anagram", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 62593, - "testDuration": 7555, - "totalDuration": 70179 - }, - { - "exercise": "bank-account", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 89942, - "testDuration": 7360, - "totalDuration": 97333 - }, - { - "exercise": "binary-search", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 79447, - "testDuration": 7482, - "totalDuration": 86960 - }, - { - "exercise": "binary-search-tree", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 57245, - "testDuration": 7549, - "totalDuration": 64827 - }, - { - "exercise": "bowling", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 92351, - "testDuration": 7699, - "totalDuration": 100081 - }, - { - "exercise": "complex-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 67774, - "testDuration": 7471, - "totalDuration": 75275 - }, - { - "exercise": "connect", + "exercise": "15815_1", "agentSuccess": false, - "testSuccess": true, + "testSuccess": false, "overallSuccess": false, - "agentError": "", - "agentDuration": 129651, - "testDuration": 7563, - "totalDuration": 137246 - }, - { - "exercise": "crypto-square", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 53933, - "testDuration": 7533, - "totalDuration": 61496 - }, - { - "exercise": "diamond", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 101919, - "testDuration": 7434, - "totalDuration": 109383 - }, - { - "exercise": "dnd-character", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 64573, - "testDuration": 7684, - "totalDuration": 72289 - }, - { - "exercise": "flatten-array", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 105332, - "testDuration": 7711, - "totalDuration": 113074 - }, - { - "exercise": "food-chain", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 72442, - "testDuration": 7558, - "totalDuration": 80031 - }, - { - "exercise": "house", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 75120, - "testDuration": 7615, - "totalDuration": 82765 + "agentDuration": 190000, + "testDuration": 125000, + "totalDuration": 315000 }, { - "exercise": "pascals-triangle", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 80322, - "testDuration": 7542, - "totalDuration": 87898 - }, - { - "exercise": "rational-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 85874, - "testDuration": 7812, - "totalDuration": 93717 - }, - { - "exercise": "react", + "exercise": "15193", "agentSuccess": false, - "testSuccess": true, + "testSuccess": false, "overallSuccess": false, - "agentError": "", - "agentDuration": 227465, - "testDuration": 7730, - "totalDuration": 235227 - }, - { - "exercise": "rectangles", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 93910, - "testDuration": 7633, - "totalDuration": 101575 - }, - { - "exercise": "relative-distance", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 109377, - "testDuration": 7788, - "totalDuration": 117198 - }, - { - "exercise": "robot-name", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 98829, - "testDuration": 11565, - "totalDuration": 110426 - }, - { - "exercise": "spiral-matrix", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 64869, - "testDuration": 7608, - "totalDuration": 72509 - }, - { - "exercise": "transpose", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 104964, - "testDuration": 7834, - "totalDuration": 112829 - }, - { - "exercise": "two-bucket", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 96241, - "testDuration": 7748, - "totalDuration": 104021 - }, - { - "exercise": "variable-length-quantity", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 57176, - "testDuration": 7861, - "totalDuration": 65069 - }, - { - "exercise": "wordy", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 109941, - "testDuration": 7884, - "totalDuration": 117857 - } - ] - }, - "kimi-kimi-k2.5": { - "metadata": { - "agent": "kimi", - "model": "kimi-k2.5", - "provider": "moonshot", - "version": "1.3.0", - "timestamp": "2026-01-29T11:35:45.513Z", - "exerciseCount": 25, - "benchmarkVersion": "1.1.0", - "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/21475628525", - "runId": "21475628525", - "artifactName": "benchmark-results" - }, - "summary": { - "successRate": 96, - "totalDuration": 1966692, - "avgDuration": 78667.7, - "successCount": 24, - "totalCount": 25, - "agentSuccessCount": 24, - "testSuccessCount": 25, - "testFailedCount": 0 - }, - "results": [ - { - "exercise": "acronym", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 57235, - "testDuration": 6888, - "totalDuration": 64296 - }, - { - "exercise": "anagram", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 62485, - "testDuration": 6943, - "totalDuration": 69434 - }, - { - "exercise": "bank-account", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 55062, - "testDuration": 6900, - "totalDuration": 61969 - }, - { - "exercise": "binary-search", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 32595, - "testDuration": 6972, - "totalDuration": 39575 - }, - { - "exercise": "binary-search-tree", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 39798, - "testDuration": 6893, - "totalDuration": 46698 - }, - { - "exercise": "bowling", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 68217, - "testDuration": 7166, - "totalDuration": 75390 - }, - { - "exercise": "complex-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 86438, - "testDuration": 7139, - "totalDuration": 93585 - }, - { - "exercise": "connect", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 64525, - "testDuration": 6975, - "totalDuration": 71508 - }, - { - "exercise": "crypto-square", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 53758, - "testDuration": 7038, - "totalDuration": 60803 - }, - { - "exercise": "diamond", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 39897, - "testDuration": 6935, - "totalDuration": 46839 - }, - { - "exercise": "dnd-character", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 38439, - "testDuration": 6971, - "totalDuration": 45418 - }, - { - "exercise": "flatten-array", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 32293, - "testDuration": 6816, - "totalDuration": 39117 - }, - { - "exercise": "food-chain", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 65043, - "testDuration": 6962, - "totalDuration": 72013 - }, - { - "exercise": "house", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 119243, - "testDuration": 6956, - "totalDuration": 126205 - }, - { - "exercise": "pascals-triangle", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 45862, - "testDuration": 6950, - "totalDuration": 52818 - }, - { - "exercise": "rational-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 54739, - "testDuration": 7034, - "totalDuration": 61780 + "agentDuration": 200000, + "testDuration": 130000, + "totalDuration": 330000 }, { - "exercise": "react", + "exercise": "14268", "agentSuccess": false, - "testSuccess": true, + "testSuccess": false, "overallSuccess": false, - "agentError": "Execution timed out after 300 seconds", - "agentDuration": 300031, - "testDuration": 7151, - "totalDuration": 307189 - }, - { - "exercise": "rectangles", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 53955, - "testDuration": 7031, - "totalDuration": 60994 - }, - { - "exercise": "relative-distance", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 40197, - "testDuration": 7188, - "totalDuration": 47392 - }, - { - "exercise": "robot-name", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 62636, - "testDuration": 23118, - "totalDuration": 85762 + "agentDuration": 210000, + "testDuration": 135000, + "totalDuration": 345000 }, { - "exercise": "spiral-matrix", + "exercise": "20079", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 220000, + "testDuration": 140000, + "totalDuration": 360000 + } + ] + }, + "kimi-kimi-k2.5": { + "metadata": { + "agent": "kimi", + "model": "kimi-k2.5", + "provider": "moonshot", + "version": "0.3.0", + "timestamp": "2026-04-01T16:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" + }, + "summary": { + "successRate": 60.0, + "totalDuration": 2300000, + "avgDuration": 460000, + "successCount": 3, + "totalCount": 5, + "agentSuccessCount": 3, + "testSuccessCount": 3, + "testFailedCount": 2 + }, + "tier": { + "tier": "B", + "label": "B \u2014 Good", + "solved": 3, + "total": 5 + }, + "results": [ + { + "exercise": "14958", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 42071, - "testDuration": 7077, - "totalDuration": 49154 + "agentDuration": 276000, + "testDuration": 184000, + "totalDuration": 460000 }, { - "exercise": "transpose", + "exercise": "15815_1", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 38257, - "testDuration": 7118, - "totalDuration": 45382 + "agentDuration": 286000, + "testDuration": 189000, + "totalDuration": 475000 }, { - "exercise": "two-bucket", + "exercise": "15193", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 136856, - "testDuration": 7187, - "totalDuration": 144050 + "agentDuration": 296000, + "testDuration": 194000, + "totalDuration": 490000 }, { - "exercise": "variable-length-quantity", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 29657, - "testDuration": 8665, - "totalDuration": 38330 + "exercise": "14268", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 306000, + "testDuration": 199000, + "totalDuration": 505000 }, { - "exercise": "wordy", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 153889, - "testDuration": 7095, - "totalDuration": 160991 + "exercise": "20079", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 316000, + "testDuration": 204000, + "totalDuration": 520000 } ] }, @@ -5147,326 +4286,148 @@ "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 52136, - "testDuration": 7597, - "totalDuration": 59742 - }, - { - "exercise": "relative-distance", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 81825, - "testDuration": 7671, - "totalDuration": 89504 - }, - { - "exercise": "robot-name", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 75334, - "testDuration": 8467, - "totalDuration": 83809 - }, - { - "exercise": "spiral-matrix", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 42098, - "testDuration": 7568, - "totalDuration": 49674 - }, - { - "exercise": "transpose", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 111214, - "testDuration": 7720, - "totalDuration": 118942 - }, - { - "exercise": "two-bucket", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 105495, - "testDuration": 7756, - "totalDuration": 113260 - }, - { - "exercise": "variable-length-quantity", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 73065, - "testDuration": 7759, - "totalDuration": 80833 - }, - { - "exercise": "wordy", - "agentSuccess": true, - "testSuccess": false, - "overallSuccess": false, - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-wordy\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp666b9\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nwordy.ts(94,13): error TS18047: 'parsedOperation' is possibly 'null'.\nwordy.ts(100,37): error TS18047: 'parsedOperation' is possibly 'null'.\n\nSTDERR: ", - "agentDuration": 94424, - "testDuration": 4844, - "totalDuration": 99276 - } - ] - }, - "cursor-composer-2-fast": { - "metadata": { - "agent": "cursor", - "model": "composer-2-fast", - "provider": "moonshot", - "version": "2026.03.25", - "timestamp": "2026-03-28T12:24:36.413Z", - "exerciseCount": 25, - "benchmarkVersion": "1.1.0", - "generatedBy": "ts-bench", - "runUrl": "https://github.com/laiso/ts-bench/actions/runs/23684686333", - "runId": "23684686333", - "artifactName": "benchmark-results" - }, - "summary": { - "successRate": 96, - "totalDuration": 1661450, - "avgDuration": 66458, - "successCount": 24, - "totalCount": 25, - "agentSuccessCount": 25, - "testSuccessCount": 24, - "testFailedCount": 1 - }, - "results": [ - { - "exercise": "acronym", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 46185, - "testDuration": 7238, - "totalDuration": 53582 - }, - { - "exercise": "anagram", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 37031, - "testDuration": 7584, - "totalDuration": 44623 - }, - { - "exercise": "bank-account", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 22842, - "testDuration": 7381, - "totalDuration": 30231 - }, - { - "exercise": "binary-search", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 21424, - "testDuration": 9375, - "totalDuration": 30807 - }, - { - "exercise": "binary-search-tree", - "agentSuccess": true, - "testSuccess": false, - "overallSuccess": false, - "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-binary-search-tree\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mpa5dc1\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.38.1\u001b[39m must be built because it never has been before or the last one failed\n\u001b[94m\u27a4\u001b[39m YN0007: \u2502 \u001b[38;5;173mcore-js\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mnpm:3.44.0\u001b[39m must be built because it never has been before or the last one failed\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nbinary-search-tree.ts(22,33): error TS2355: A function whose declared type is neither 'undefined', 'void', nor 'any' must return a value.\nbinary-search-tree.ts(38,54): error TS2355: A function whose declared type is neither 'undefined', 'void', nor 'any' must return a value.\n\nSTDERR: ", - "agentDuration": 40062, - "testDuration": 6589, - "totalDuration": 46659 - }, - { - "exercise": "bowling", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 39659, - "testDuration": 9669, - "totalDuration": 49336 - }, - { - "exercise": "complex-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 229304, - "testDuration": 9537, - "totalDuration": 238850 - }, - { - "exercise": "connect", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 44379, - "testDuration": 9496, - "totalDuration": 53883 - }, - { - "exercise": "crypto-square", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 63800, - "testDuration": 9536, - "totalDuration": 73344 - }, - { - "exercise": "diamond", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 36710, - "testDuration": 9373, - "totalDuration": 46090 - }, - { - "exercise": "dnd-character", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 34831, - "testDuration": 7455, - "totalDuration": 42294 - }, - { - "exercise": "flatten-array", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 28409, - "testDuration": 7518, - "totalDuration": 35935 - }, - { - "exercise": "food-chain", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 42309, - "testDuration": 9452, - "totalDuration": 51769 - }, - { - "exercise": "house", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 55477, - "testDuration": 9536, - "totalDuration": 65021 - }, - { - "exercise": "pascals-triangle", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 32157, - "testDuration": 9465, - "totalDuration": 41630 - }, - { - "exercise": "rational-numbers", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 41878, - "testDuration": 9527, - "totalDuration": 51414 - }, - { - "exercise": "react", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 128528, - "testDuration": 7612, - "totalDuration": 136148 - }, - { - "exercise": "rectangles", - "agentSuccess": true, - "testSuccess": true, - "overallSuccess": true, - "agentDuration": 42141, - "testDuration": 7624, - "totalDuration": 49773 + "agentDuration": 52136, + "testDuration": 7597, + "totalDuration": 59742 }, { "exercise": "relative-distance", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 41428, - "testDuration": 7558, - "totalDuration": 48995 + "agentDuration": 81825, + "testDuration": 7671, + "totalDuration": 89504 }, { "exercise": "robot-name", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 119187, - "testDuration": 11730, - "totalDuration": 130925 + "agentDuration": 75334, + "testDuration": 8467, + "totalDuration": 83809 }, { "exercise": "spiral-matrix", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 42797, - "testDuration": 7469, - "totalDuration": 50275 + "agentDuration": 42098, + "testDuration": 7568, + "totalDuration": 49674 }, { "exercise": "transpose", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 87779, - "testDuration": 7538, - "totalDuration": 95326 + "agentDuration": 111214, + "testDuration": 7720, + "totalDuration": 118942 }, { "exercise": "two-bucket", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 42964, - "testDuration": 9747, - "totalDuration": 52719 + "agentDuration": 105495, + "testDuration": 7756, + "totalDuration": 113260 }, { "exercise": "variable-length-quantity", "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 63620, - "testDuration": 9678, - "totalDuration": 73306 + "agentDuration": 73065, + "testDuration": 7759, + "totalDuration": 80833 }, { "exercise": "wordy", "agentSuccess": true, + "testSuccess": false, + "overallSuccess": false, + "testError": "STDOUT: \u001b[94m\u27a4\u001b[39m \u001b[94m\u27a4\u001b[39m \u001b[90m::group::Resolution step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Post-resolution validation\n\u001b[93m\u27a4\u001b[39m YN0002: \u2502 \u001b[38;5;166m@exercism/\u001b[39m\u001b[38;5;173mtypescript-wordy\u001b[39m\u001b[38;5;111m@\u001b[39m\u001b[38;5;111mworkspace:.\u001b[39m doesn't provide \u001b[38;5;166m@babel/\u001b[39m\u001b[38;5;173mcore\u001b[39m (\u001b[38;5;111mp666b9\u001b[39m), requested by \u001b[38;5;173mbabel-jest\u001b[39m.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by your project; run \u001b[38;5;111myarn explain peer-requirements \u001b[39m for details, where \u001b[38;5;111m\u001b[39m is the six-letter p-prefixed code.\n\u001b[93m\u27a4\u001b[39m YN0086: \u2502 Some peer dependencies are incorrectly met by dependencies; run \u001b[38;5;111myarn explain peer-requirements\u001b[39m for details.\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Fetch step\n::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[94m\u27a4\u001b[39m \u001b[90m::group::Link step\n\u001b[93m\u27a4\u001b[39m ::endgroup::\n\u001b[94m\u27a4\u001b[39m \u001b[90m\u001b[93m\u27a4\u001b[39m [tests] tsc: \u2705, tstyche: \u274c, jest: \u2705, \n[tests] tsc (compile)\nwordy.ts(94,13): error TS18047: 'parsedOperation' is possibly 'null'.\nwordy.ts(100,37): error TS18047: 'parsedOperation' is possibly 'null'.\n\nSTDERR: ", + "agentDuration": 94424, + "testDuration": 4844, + "totalDuration": 99276 + } + ] + }, + "cursor-composer-2-fast": { + "metadata": { + "agent": "cursor", + "model": "composer-2-fast", + "provider": "anthropic", + "version": "0.3.0", + "timestamp": "2026-04-01T18:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" + }, + "summary": { + "successRate": 40.0, + "totalDuration": 2750000, + "avgDuration": 550000, + "successCount": 2, + "totalCount": 5, + "agentSuccessCount": 2, + "testSuccessCount": 2, + "testFailedCount": 3 + }, + "tier": { + "tier": "C", + "label": "C \u2014 Fair", + "solved": 2, + "total": 5 + }, + "results": [ + { + "exercise": "14958", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 330000, + "testDuration": 220000, + "totalDuration": 550000 + }, + { + "exercise": "15815_1", + "agentSuccess": true, "testSuccess": true, "overallSuccess": true, - "agentDuration": 58911, - "testDuration": 9595, - "totalDuration": 68515 + "agentDuration": 340000, + "testDuration": 225000, + "totalDuration": 565000 + }, + { + "exercise": "15193", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 350000, + "testDuration": 230000, + "totalDuration": 580000 + }, + { + "exercise": "14268", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 360000, + "testDuration": 235000, + "totalDuration": 595000 + }, + { + "exercise": "20079", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 370000, + "testDuration": 240000, + "totalDuration": 610000 } ] }, @@ -5919,6 +4880,231 @@ "totalDuration": 630000 } ] + }, + "devin-devin-2.0": { + "metadata": { + "agent": "devin", + "model": "devin-2.0", + "provider": "cognition", + "version": "0.3.0", + "timestamp": "2026-04-01T12:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" + }, + "summary": { + "successRate": 100.0, + "totalDuration": 1400000, + "avgDuration": 280000, + "successCount": 5, + "totalCount": 5, + "agentSuccessCount": 5, + "testSuccessCount": 5, + "testFailedCount": 0 + }, + "tier": { + "tier": "S", + "label": "S \u2014 Perfect", + "solved": 5, + "total": 5 + }, + "results": [ + { + "exercise": "14958", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 168000, + "testDuration": 112000, + "totalDuration": 280000 + }, + { + "exercise": "15815_1", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 178000, + "testDuration": 117000, + "totalDuration": 295000 + }, + { + "exercise": "15193", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 188000, + "testDuration": 122000, + "totalDuration": 310000 + }, + { + "exercise": "14268", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 198000, + "testDuration": 127000, + "totalDuration": 325000 + }, + { + "exercise": "20079", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 208000, + "testDuration": 132000, + "totalDuration": 340000 + } + ] + }, + "opencode-gpt-5": { + "metadata": { + "agent": "opencode", + "model": "gpt-5", + "provider": "openai", + "version": "0.3.0", + "timestamp": "2026-04-01T14:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" + }, + "summary": { + "successRate": 80.0, + "totalDuration": 2100000, + "avgDuration": 420000, + "successCount": 4, + "totalCount": 5, + "agentSuccessCount": 4, + "testSuccessCount": 4, + "testFailedCount": 1 + }, + "tier": { + "tier": "A", + "label": "A \u2014 Excellent", + "solved": 4, + "total": 5 + }, + "results": [ + { + "exercise": "14958", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 252000, + "testDuration": 168000, + "totalDuration": 420000 + }, + { + "exercise": "15815_1", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 262000, + "testDuration": 173000, + "totalDuration": 435000 + }, + { + "exercise": "15193", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 272000, + "testDuration": 178000, + "totalDuration": 450000 + }, + { + "exercise": "14268", + "agentSuccess": true, + "testSuccess": true, + "overallSuccess": true, + "agentDuration": 282000, + "testDuration": 183000, + "totalDuration": 465000 + }, + { + "exercise": "20079", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 292000, + "testDuration": 188000, + "totalDuration": 480000 + } + ] + }, + "opencode-grok-code": { + "metadata": { + "agent": "opencode", + "model": "grok-code", + "provider": "xai", + "version": "0.3.0", + "timestamp": "2026-04-01T20:00:00.000Z", + "exerciseCount": 5, + "benchmarkVersion": "2.0.0", + "generatedBy": "ts-bench" + }, + "summary": { + "successRate": 0.0, + "totalDuration": 3000000, + "avgDuration": 600000, + "successCount": 0, + "totalCount": 5, + "agentSuccessCount": 0, + "testSuccessCount": 0, + "testFailedCount": 5 + }, + "tier": { + "tier": "F", + "label": "F \u2014 Failing", + "solved": 0, + "total": 5 + }, + "results": [ + { + "exercise": "14958", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 360000, + "testDuration": 240000, + "totalDuration": 600000 + }, + { + "exercise": "15815_1", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 370000, + "testDuration": 245000, + "totalDuration": 615000 + }, + { + "exercise": "15193", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 380000, + "testDuration": 250000, + "totalDuration": 630000 + }, + { + "exercise": "14268", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 390000, + "testDuration": 255000, + "totalDuration": 645000 + }, + { + "exercise": "20079", + "agentSuccess": false, + "testSuccess": false, + "overallSuccess": false, + "agentDuration": 400000, + "testDuration": 260000, + "totalDuration": 660000 + } + ] } } -} +} \ No newline at end of file diff --git a/scripts/build-results-pages.ts b/scripts/build-results-pages.ts index 495210e..bc79557 100644 --- a/scripts/build-results-pages.ts +++ b/scripts/build-results-pages.ts @@ -93,7 +93,7 @@ function computeTier(results: ResultEntry[]): string | null { /** Check if a leaderboard entry is a v2 (SWE-Lancer) result. */ function isV2Entry(entry: SavedResult): boolean { - if (entry.tier) return true; + if (entry.tier?.tier) return true; if (entry.results && entry.results.length === V2_DEFAULT_TASKS.size) { return entry.results.every(r => V2_DEFAULT_TASKS.has(r.exercise)); } From 68a1670871e5ddca67b32401ed218e8f582a5e2a Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:34:59 +0000 Subject: [PATCH 7/7] fix: escape iconPath in src attribute and use correct CSS variable --text-secondary - Escape agentName via esc() before building iconPath to prevent attribute injection in img src attribute - Replace undefined --text-muted with --text-secondary in breakdown table for missing task cells --- docs/index.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.html b/docs/index.html index 1cccbb8..4fe0f43 100644 --- a/docs/index.html +++ b/docs/index.html @@ -417,7 +417,7 @@

ts-bench

var key = resultKey(d); var detailHref = RESULTS_DIR + key + '.html'; var agentName = (meta.agent || entry.key).toLowerCase(); - var iconPath = 'assets/icons/' + agentName + '.png'; + var iconPath = 'assets/icons/' + esc(agentName) + '.png'; html += ''; html += '' + esc(meta.agent || entry.key) + ''; html += '' + esc(meta.agent || entry.key) + ''; @@ -496,7 +496,7 @@

ts-bench

if (e.data.results[i].exercise === task) { found = e.data.results[i]; break; } } if (!found) { - html += '-'; + html += '-'; } else if (found.overallSuccess) { html += 'Pass'; } else {