Merged · 32 commits
- `92e2e92` chore: ignore .worktrees/ directory (Apr 14, 2026)
- `f79d887` docs: add RESEARCH.md (C1-C5 improvement candidates) (Apr 14, 2026)
- `69c2cf8` docs(wcxb): add design spec and implementation plan with Task 0 findings (Apr 14, 2026)
- `3ca1fdc` feat(wcxb): vendor WCXB evaluate.py with attribution and unit tests (Apr 14, 2026)
- `f2ef571` test(wcxb): add synthetic fixtures (article/product/empty) (Apr 14, 2026)
- `87a33fa` feat(wcxb): add single-page trawl evaluation function (Apr 14, 2026)
- `e348310` feat(wcxb): add Trafilatura baseline path and snippet hit counts (Apr 14, 2026)
- `80b440a` feat(wcxb): add aggregation and markdown report rendering (Apr 14, 2026)
- `8c14106` feat(wcxb): add run_all orchestrator, argparse CLI, progress logging (Apr 14, 2026)
- `0b73bd6` feat(wcxb): add Trafilatura default-mode sanity field (Apr 14, 2026)
- `a320865` feat(wcxb): add fetch.py with pinned manifest for dev split (Apr 14, 2026)
- `2eee073` docs(wcxb): add README, gitignore data dir, register in CLAUDE.md (Apr 14, 2026)
- `449deea` docs: report external WCXB dev F1 in README (Apr 14, 2026)
- `e8015ff` docs(wcxb): fix sanity-check target from article-only 0.958 to dev-to… (Apr 14, 2026)
- `69d0358` docs(late-chunking): add brainstorm-approved design spec (Apr 14, 2026)
- `e4ba66a` docs(late-chunking): add implementation plan (7 tasks) (Apr 14, 2026)
- `38391fa` docs: mark C1 late-chunking as rejected with NO-GO summary, update re… (Apr 14, 2026)
- `78d8aeb` docs(passthrough): add raw passthrough design for JSON/XML responses (Apr 15, 2026)
- `4ed90d5` docs(passthrough): add implementation plan (9 TDD tasks) (Apr 15, 2026)
- `62bb80a` feat(pipeline): add content_type and truncated fields to PipelineResult (Apr 15, 2026)
- `7c54eb4` feat(passthrough): add URL and Content-Type detection predicates (Apr 15, 2026)
- `3d8d678` feat(passthrough): add httpx-based fetch with streaming + byte cap (Apr 15, 2026)
- `7631ecb` chore(docker): drop redundant playwright install, fix layer caching (Apr 15, 2026)
- `8246828` feat(playwright): capture response Content-Type on fetch (Apr 15, 2026)
- `6535d8b` feat(pipeline): short-circuit to raw passthrough for structured data … (Apr 15, 2026)
- `81529f6` feat(pipeline): detect passthrough via Playwright Content-Type post-c… (Apr 15, 2026)
- `d6d736e` test(mcp): exercise raw-passthrough path via stdio (Apr 15, 2026)
- `23c66de` docs: document TRAWL_PASSTHROUGH_MAX_BYTES and passthrough behaviour (Apr 15, 2026)
- `87019fa` Merge branch 'feat/passthrough' into develop (Apr 15, 2026)
- `5217941` refactor: genericize model defaults; hide profile_page without TRAWL_… (Apr 15, 2026)
- `621da62` docs(docker): inject runtime config via -e/compose, remove hardcoded ENV (Apr 15, 2026)
- `8c933c3` chore: bump version to 0.2.0 (Apr 15, 2026)
14 changes: 11 additions & 3 deletions .env.example
@@ -6,7 +6,7 @@
# ---- Embeddings (required for retrieval) ----
# bge-m3 served by llama-server with --embeddings
TRAWL_EMBED_URL=http://localhost:8081/v1
-TRAWL_EMBED_MODEL=bge-m3-Q8_0.gguf
+TRAWL_EMBED_MODEL=bge-m3

# ---- Cross-encoder reranker (optional; falls back to cosine-only) ----
# bge-reranker-v2-m3 served by llama-server with --reranking --pooling rank
@@ -16,11 +16,12 @@ TRAWL_RERANK_MODEL=bge-reranker-v2-m3
# ---- HyDE query expansion (optional; off by default) ----
# Small utility LLM (e.g. Gemma 4B)
TRAWL_HYDE_URL=http://localhost:8082/v1
-TRAWL_HYDE_MODEL=gemma-4-E4B-it-Q8_0.gguf
+TRAWL_HYDE_MODEL=gemma
# Pin to a specific llama-server slot for KV-cache reuse (optional)
# TRAWL_HYDE_SLOT=1

-# ---- VLM page profiling (optional; required only for profile_page) ----
+# ---- VLM page profiling (required only for profile_page) ----
+# When unset, the MCP server hides profile_page from its tool list.
TRAWL_VLM_URL=http://localhost:8080/v1
TRAWL_VLM_MODEL=gemma
TRAWL_VLM_TIMEOUT=120
@@ -34,3 +35,10 @@ TRAWL_VLM_MAX_TOKENS=2048
# ---- Benchmark only (benchmarks/run_benchmark.py) ----
# Get a free key at https://jina.ai/reader/
# JINA_API_KEY=

# ---- Raw passthrough (optional; off by default) ----
# Hard cap on raw-passthrough response size in bytes. When fetch_page
# receives JSON/XML/RSS/Atom, the body is returned as-is (no extraction,
# no chunking, no embedding) up to this many bytes. Default: 262144
# (256 KB ≈ 64K tokens — fits most local LLM context windows).
# TRAWL_PASSTHROUGH_MAX_BYTES=262144
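The byte cap described in the comment above can be sketched as a stream accumulator that stops at the limit and reports truncation. This is an illustrative sketch, not trawl's actual implementation; `cap_stream` and `DEFAULT_CAP` are assumed names:

```python
import os

DEFAULT_CAP = 262_144  # 256 KB, matching the documented default

def cap_stream(chunks, max_bytes=None):
    """Accumulate streamed byte chunks up to a hard cap.

    Returns (body, truncated): the capped body and whether any
    bytes past the cap were discarded.
    """
    cap = max_bytes if max_bytes is not None else int(
        os.environ.get("TRAWL_PASSTHROUGH_MAX_BYTES", DEFAULT_CAP)
    )
    buf = bytearray()
    truncated = False
    for chunk in chunks:
        remaining = cap - len(buf)
        if remaining <= 0:
            truncated = True
            break
        # Keep only the portion that fits under the cap.
        buf.extend(chunk[:remaining])
        if len(chunk) > remaining:
            truncated = True
            break
    return bytes(buf), truncated
```

A streaming HTTP client (the PR mentions httpx) would feed its chunk iterator into such a function, so the full body never needs to be buffered past the cap.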
6 changes: 6 additions & 0 deletions .gitignore
@@ -18,3 +18,9 @@ tests/results/*.json

# benchmark outputs
benchmarks/results/

# WCXB benchmark data (downloaded, not redistributed)
benchmarks/wcxb/data/

# git worktrees
.worktrees/
15 changes: 15 additions & 0 deletions CLAUDE.md
@@ -76,6 +76,11 @@ trawl directory. Humans should read `README.md` first, then
pin requests to a specific llama-server slot (via `id_slot`) to
avoid evicting other consumers' KV cache on shared servers with
prompt caching.
- **Raw passthrough** — JSON/XML/RSS/Atom responses are returned as-is
without extraction. URL suffixes (`.json`, `.xml`, `.rss`, `.atom`)
take an httpx fast path; suffix-less API endpoints are detected by
response `Content-Type`. Byte cap via `TRAWL_PASSTHROUGH_MAX_BYTES`
(default 256 KB).

## Quick Reference

@@ -117,6 +122,9 @@ from trawl import fetch_relevant
r = fetch_relevant('https://example.com/', 'what is this')
print(r.chunks)
"

# WCXB external extraction benchmark (one-shot)
python benchmarks/wcxb/fetch.py && python benchmarks/wcxb/run.py
```

## Architecture pointer
@@ -170,6 +178,12 @@ benchmarks/
run_benchmark.py trawl (base/profile/cached) vs Jina runner
profile_eval_cases.yaml 36 cases for VLM profile eval
profile_eval.py profile generation quality evaluator
wcxb/ external WCXB extraction benchmark (Phase 1)
fetch.py snapshot download + hash verify
run.py runner (trawl + Trafilatura baseline)
aggregate.py summary + report rendering
evaluate.py vendored WCXB word-F1 evaluator
manifest.json pinned SHA-256 manifest of dev split
results/ gitignored benchmark outputs

examples/
@@ -216,6 +230,7 @@ change them, run `tests/test_pipeline.py` before AND after.
| `pipeline.py retrieve_k multiplier` | `2` | Retrieves 2x candidates for reranking; fewer reduces rerank benefit, more adds latency |
| `profiles/mapper.py DEFAULT_MAX_CANDIDATES_PER_ANCHOR` | `5` | Enough headroom to find non-noise candidates after sidebar/nav filtering |
| `profiles/mapper.py NOISE_CLS_RE` | `nav\|sidebar\|toc\|...` | Noise region detection for anchor filtering; too broad catches content, too narrow misses sidebars |
| `fetchers/passthrough.py` | `PASSTHROUGH_MAX_BYTES` env default `262144` | 256 KB ≈ 64K tokens; weather-like API payloads fit, anything larger exceeds local LLM contexts |

## In / out of scope

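The two detection paths named in the CLAUDE.md raw-passthrough bullet above (URL-suffix fast path vs. post-fetch `Content-Type`) can be sketched as predicates. Function names and the exact media-type list are assumptions for illustration, not trawl's real `fetchers/passthrough.py` API:

```python
from urllib.parse import urlsplit

PASSTHROUGH_SUFFIXES = (".json", ".xml", ".rss", ".atom")
PASSTHROUGH_TYPES = {
    "application/json", "application/xml", "text/xml",
    "application/rss+xml", "application/atom+xml",
}

def is_passthrough_url(url: str) -> bool:
    """Fast path: the URL path suffix alone decides, no fetch needed."""
    return urlsplit(url).path.lower().endswith(PASSTHROUGH_SUFFIXES)

def is_passthrough_content_type(content_type: str) -> bool:
    """Post-fetch path for suffix-less API endpoints: match the media
    type, ignoring parameters like `; charset=utf-8`."""
    media_type = content_type.split(";")[0].strip().lower()
    return media_type in PASSTHROUGH_TYPES
```

Using `urlsplit(...).path` rather than the raw URL keeps query strings like `?format=full` from defeating the suffix check.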
42 changes: 28 additions & 14 deletions Dockerfile
@@ -5,25 +5,39 @@ FROM mcr.microsoft.com/playwright/python:v1.47.0-jammy
WORKDIR /app

# Install Python deps first so source changes do not invalidate the dep layer.
# Stub packages let `pip install -e .` resolve deps before real source is copied.
COPY pyproject.toml README.md ./
COPY src ./src
RUN mkdir -p src/trawl src/trawl_mcp && \
touch src/trawl/__init__.py src/trawl_mcp/__init__.py && \
pip install --no-cache-dir -e .

RUN pip install --no-cache-dir -e . && \
playwright install --with-deps chromium
# Real source — only this layer rebuilds on code changes.
COPY src ./src

# Playwright browsers are installed above under the default path.
# Chromium + runtime libs are already in the base image at /ms-playwright.
ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

# trawl expects these at runtime. Compose overrides these at service
# definition time to point at the host llama-servers.
ENV TRAWL_EMBED_URL=http://host.docker.internal:8081/v1
ENV TRAWL_EMBED_MODEL=bge-m3-Q8_0.gguf
ENV TRAWL_RERANK_URL=http://host.docker.internal:8083/v1
ENV TRAWL_RERANK_MODEL=bge-reranker-v2-m3
ENV TRAWL_HYDE_URL=http://host.docker.internal:8082/v1
ENV TRAWL_HYDE_MODEL=gemma-4-E4B-it-Q8_0.gguf
ENV TRAWL_VLM_URL=http://host.docker.internal:8080/v1
ENV TRAWL_VLM_MODEL=gemma
# trawl runtime config — inject via `docker run -e ...`, compose
# `environment:`, or `--env-file .env`. Not baked into the image so the
# same image works across local-dev, LAN llama-servers, and remote hosts.
# See .env.example for the full list.
#
# Required:
# TRAWL_EMBED_URL e.g. http://host.docker.internal:8081/v1
# TRAWL_EMBED_MODEL e.g. bge-m3
#
# Optional (feature degrades or is unused when absent):
# TRAWL_RERANK_URL / TRAWL_RERANK_MODEL — cross-encoder reranker;
# falls back to cosine-only
# TRAWL_HYDE_URL / TRAWL_HYDE_MODEL — HyDE query expansion (off by default)
# TRAWL_VLM_URL / TRAWL_VLM_MODEL — required for profile_page;
# unset = tool hidden from MCP list
# TRAWL_PASSTHROUGH_MAX_BYTES — default 262144 (256 KB)
# TRAWL_HYDE_SLOT / TRAWL_VLM_SLOT — llama-server slot pinning
#
# Profile/visit cache is persisted at /root/.cache/trawl via VOLUME below.
# Mount from host to retain state across container lifecycle:
# docker run -v ~/.cache/trawl:/root/.cache/trawl ...

EXPOSE 8765

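The runtime-injection pattern the Dockerfile comments above describe looks like this in practice; the image tag and port mapping are assumptions, not taken from the PR:

```shell
# Inject required config at run time instead of baking it into the image;
# mount the cache dir so profile/visit state survives container restarts.
docker run --rm -p 8765:8765 \
  -e TRAWL_EMBED_URL=http://host.docker.internal:8081/v1 \
  -e TRAWL_EMBED_MODEL=bge-m3 \
  -v ~/.cache/trawl:/root/.cache/trawl \
  trawl:0.2.0
```

Equivalently, `--env-file .env` passes the whole `.env.example`-style file at once, which keeps compose and plain `docker run` invocations in sync.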
15 changes: 15 additions & 0 deletions README.md
@@ -50,6 +50,21 @@ that matter for your query, at ~1k tokens of output.
trawl wins on every token-efficiency axis and runs entirely on your
own infrastructure. In exchange you pay a real cost elsewhere:

### External: WCXB dev (1,497 pages)

Beyond the internal 12-case parity matrix, trawl's extraction stage is
cross-validated against the [WCXB](https://github.com/Murrough-Foley/web-content-extraction-benchmark)
public benchmark (CC-BY-4.0, 1,497 dev pages across 7 page types).

| Extractor | F1 |
|-----------------------------------|--------|
| trawl (`html_to_markdown`) | 0.777 |
| Trafilatura (same environment) | 0.750 |

Per-page-type breakdown and error counts: see
[`benchmarks/wcxb/README.md`](benchmarks/wcxb/README.md) and run the
benchmark locally to regenerate.

### When *not* to use trawl

- **You want the whole page verbatim.** Selective retrieval is the
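For context on the README F1 table above: extraction benchmarks in the WCXB style score each page with word-level F1 between the extracted text and the gold text. A minimal sketch of that metric (this is illustrative, not the vendored `evaluate.py`):

```python
from collections import Counter

def word_f1(predicted: str, gold: str) -> float:
    """Word-level F1: harmonic mean of precision and recall over
    whitespace-tokenized word multisets."""
    pred = Counter(predicted.split())
    ref = Counter(gold.split())
    overlap = sum((pred & ref).values())  # multiset intersection
    if overlap == 0:
        return 0.0
    precision = overlap / sum(pred.values())
    recall = overlap / sum(ref.values())
    return 2 * precision * recall / (precision + recall)
```

Over a dataset, the per-page scores are averaged, which is why an extractor that over-extracts boilerplate (hurting precision) or drops body text (hurting recall) loses F1 even when the main article is found.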