rllm-org · jeffreysijuntan · Mar 17, 2026 · Mar 16, 2026 · Mar 17, 2026 · Mar 17, 2026
diff --git a/README.md b/README.md
diff --git a/cookbooks/geo3k/README.md b/cookbooks/geo3k/README.md
@@ -0,0 +1,97 @@
+# Geo3K Flow
+
+A VLM geometry problem solver for rLLM that trains on the [Geometry3K](https://huggingface.co/datasets/hiyouga/geometry3k) dataset using the **AgentFlow protocol**.
+
+## Overview
+
+Geo3K Flow is a single-turn VLM agent that receives a geometry problem with a diagram image and produces a step-by-step solution with a boxed final answer. It uses a plain `OpenAI` client with multimodal content blocks (base64-encoded images).
+
+During training, `config.base_url` points to the model gateway, which transparently captures token IDs and logprobs. During eval, it points directly to the model provider. The agent code is identical in both cases.
+
+## Architecture
+
+```
+AgentFlow.run(task, config)
+  │
+  └── Solver
+        └── OpenAI(base_url=config.base_url).chat.completions.create(
+                messages=[system_prompt, {images + question}]
+            )
+            → Trajectory(name="solver", steps=[Step(action=response)])
+  │
+  └── Episode(trajectories=[solver], artifacts={"answer": response})
+```
+
+The evaluator extracts `\boxed{}` from the response and grades it against the ground truth using symbolic math grading.
+
+## Installation
+
+```bash
+# From the rllm repo root
+uv pip install -e ".[tinker]"          # rllm + tinker backend
+uv pip install -e cookbooks/geo3k      # this cookbook
+```
+
+After installation, the agent and evaluator are discoverable by the CLI:
+
+```bash
+rllm agent list    # should show "geo3k" as a plugin
+```
+
+## Dataset
+
+Pull the Geometry3K dataset (one-time):
+
+```bash
+rllm dataset pull geo3k
+```
+
+## Training
+
+### Option 1: rllm CLI
+
+```bash
+rllm train geo3k \
+    --agent geo3k \
+    --evaluator geo3k_math \
+    --model Qwen/Qwen3-VL-30B-A3B-Instruct \
+    --lora-rank 32 \
+    --group-size 8 \
+    --epochs 3
+```
+
+### Option 2: Python API
+
+```bash
+python cookbooks/geo3k/train.py \
+    rllm/backend=tinker \
+    model.name=Qwen/Qwen3-VL-30B-A3B-Instruct \
+    model.lora_rank=32 \
+    training.group_size=8
+```
+
+Or use the provided script (wraps train.py with defaults):
+
+```bash
+bash cookbooks/geo3k/train.sh
+```
+
+## Eval
+
+```bash
+rllm eval geo3k \
+    --agent geo3k \
+    --evaluator geo3k_math \
+    --model Qwen/Qwen3-VL-30B-A3B-Instruct
+```
+
+## Files
+
+| File | Description |
+|------|-------------|
+| `geo3k_flow.py` | `geo3k_flow` — AgentFlow implementation (VLM single-turn solver) |
+| `evaluator.py` | `geo3k_evaluator` — math answer grading with `\boxed{}` extraction |
+| `train.py` | Python API training script (Hydra config) |
+| `train.sh` | Shell wrapper — calls `train.py` with default overrides |
+| `pyproject.toml` | Plugin metadata and entry points |
+| `test.py` | Unit tests for image handling and evaluation |
diff --git a/cookbooks/geo3k/evaluator.py b/cookbooks/geo3k/evaluator.py
@@ -0,0 +1,42 @@
+"""Geo3K evaluator: scores geometry answers using math grading."""
+
+from __future__ import annotations
+
+import rllm
+from rllm.experimental.eval.types import EvalOutput, Signal, _extract_agent_answer
+from rllm.types import Episode
+
+
+@rllm.evaluator
+def geo3k_evaluator(task: dict, episode: Episode) -> EvalOutput:
+    """Grade geometry answers by extracting the boxed answer and comparing to ground truth."""
+    from rllm.rewards.math_utils.utils import extract_answer, grade_answer_mathd, grade_answer_sympy
+
+    answer_text = _extract_agent_answer(episode)
+    model_answer = extract_answer(answer_text)
+
+    if model_answer is None:
+        return EvalOutput(
+            reward=0.0,
+            is_correct=False,
+            signals=[Signal(name="accuracy", value=0.0)],
+        )
+
+    ground_truth = task.get("ground_truth")
+    if ground_truth is None:
+        return EvalOutput(
+            reward=0.0,
+            is_correct=False,
+            signals=[Signal(name="accuracy", value=0.0)],
+        )
+
+    gt_str = str(ground_truth)
+    gt_extracted = extract_answer(gt_str) if "\\boxed" in gt_str else gt_str
+
+    is_correct = grade_answer_mathd(model_answer, gt_extracted) or grade_answer_sympy(model_answer, gt_extracted)
+    reward = 1.0 if is_correct else 0.0
+    return EvalOutput(
+        reward=reward,
+        is_correct=is_correct,
+        signals=[Signal(name="accuracy", value=reward)],
+    )
diff --git a/cookbooks/geo3k/geo3k_flow.py b/cookbooks/geo3k/geo3k_flow.py
@@ -0,0 +1,96 @@
+"""Geo3K AgentFlow — VLM geometry problem solver.
+
+A single-turn VLM agent that solves geometry problems from the Geometry3K
+dataset. Uses plain OpenAI client with multimodal content blocks — works
+identically for eval and training (the gateway handles trace capture).
+"""
+
+from __future__ import annotations
+
+import base64
+import logging
+
+from openai import OpenAI
+
+import rllm
+from rllm.experimental.eval.types import AgentConfig, Task
+from rllm.types import Episode, Trajectory
+
+logger = logging.getLogger(__name__)
+
+# System message sent as the first chat turn of every solver request; it asks
+# for step-by-step reasoning and a final answer in \boxed{} notation.
+SYSTEM_PROMPT = """\
+You are a math problem solver with vision capabilities. You are given a \
+geometry problem that includes a diagram.
+Solve the problem step by step, showing your reasoning clearly.
+Put your final answer in \\boxed{} notation.
+
+For example: The answer is \\boxed{42}."""
+
+
+@rllm.rollout
+def geo3k_flow(task: Task, config: AgentConfig) -> Episode:
+    """Single-turn VLM geometry solver."""
+    data = task.data
+    client = OpenAI(base_url=config.base_url, api_key="EMPTY")
+    question = data.get("question", "")
+    images = data.get("images", [])
+
+    user_content = _build_vlm_content(question, images) if images else question
+
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_content},
+    ]
+
+    response_text = ""
+    try:
+        response = client.chat.completions.create(
+            model=config.model,
+            messages=messages,
+            temperature=0.6,
+            max_tokens=2048,
+        )
+        response_text = response.choices[0].message.content or ""
+    except Exception as e:
+        logger.warning("LLM call failed: %s", e)
+
+    return Episode(
+        task=data,
+        trajectories=[Trajectory(name="solver", steps=[])],
+        artifacts={"answer": response_text},
+    )
+
+
+def _detect_mime(data: bytes) -> str:
+    if data[:4] == b"\x89PNG":
+        return "image/png"
+    if data[:3] == b"\xff\xd8\xff":
+        return "image/jpeg"
+    if len(data) >= 12 and data[:4] == b"RIFF" and data[8:12] == b"WEBP":
+        return "image/webp"
+    return "image/png"
+
+
+def _build_vlm_content(text: str, images: list) -> list[dict]:
+    """Build OpenAI multimodal content blocks from text + image data."""
+    content: list[dict] = []
+    for img in images:
+        if img is None:
+            continue
+        if isinstance(img, bytes):
+            mime = _detect_mime(img)
+            encoded = base64.b64encode(img).decode("utf-8")
+            data_uri = f"data:{mime};base64,{encoded}"
+        elif isinstance(img, str):
+            data_uri = img  # assume already a URI or URL
+        else:
+            # PIL Image — convert to bytes
+            import io
+
+            buf = io.BytesIO()
+            img.save(buf, format="PNG")
+            encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
+            data_uri = f"data:image/png;base64,{encoded}"
+        content.append({"type": "image_url", "image_url": {"url": data_uri}})
+    content.append({"type": "text", "text": text})
+    return content
diff --git a/cookbooks/geo3k/pyproject.toml b/cookbooks/geo3k/pyproject.toml
@@ -0,0 +1,20 @@
+[build-system]
+requires = ["setuptools>=64"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "geo3k-flow"
+version = "0.1.0"
+description = "Geo3K VLM geometry solver plugin for rLLM (AgentFlow protocol)"
+requires-python = ">=3.10"
+dependencies = ["rllm", "openai"]
+
+[tool.setuptools]
+py-modules = ["geo3k_flow", "evaluator"]
+
+# Plugin discovery: rllm CLI finds these by name.
+# Each value is "module:attribute" and must name an object the module defines.
+[project.entry-points."rllm.agents"]
+geo3k = "geo3k_flow:geo3k_flow"
+
+[project.entry-points."rllm.evaluators"]
+geo3k_math = "evaluator:geo3k_evaluator"
diff --git a/cookbooks/geo3k/test.py b/cookbooks/geo3k/test.py
@@ -0,0 +1,76 @@
+"""Tests for geo3k flow."""
+
+from evaluator import geo3k_evaluator
+from geo3k_flow import _build_vlm_content, _detect_mime
+
+from rllm.types import Episode, Step, Trajectory
+
+
+def test_detect_mime_png():
+    assert _detect_mime(b"\x89PNG\r\n\x1a\n") == "image/png"
+
+
+def test_detect_mime_jpeg():
+    assert _detect_mime(b"\xff\xd8\xff\xe0") == "image/jpeg"
+
+
+def test_build_vlm_content_with_bytes():
+    # Minimal PNG header
+    fake_png = b"\x89PNG" + b"\x00" * 20
+    content = _build_vlm_content("What is x?", [fake_png])
+    assert len(content) == 2
+    assert content[0]["type"] == "image_url"
+    assert content[0]["image_url"]["url"].startswith("data:image/png;base64,")
+    assert content[1]["type"] == "text"
+    assert content[1]["text"] == "What is x?"
+
+
+def test_build_vlm_content_no_images():
+    content = _build_vlm_content("What is x?", [])
+    assert len(content) == 1
+    assert content[0]["type"] == "text"
+
+
+def test_evaluator_correct():
+    task = {"question": "Find x", "ground_truth": "48"}
+
+    episode = Episode(
+        trajectories=[
+            Trajectory(name="solver", steps=[Step(action="The answer is \\boxed{48}")]),
+        ],
+        artifacts={"answer": "The answer is \\boxed{48}"},
+    )
+
+    result = geo3k_evaluator.evaluate(task, episode)
+    assert result.is_correct is True
+    assert result.reward == 1.0
+
+
+def test_evaluator_wrong():
+    task = {"question": "Find x", "ground_truth": "48"}
+
+    episode = Episode(
+        trajectories=[
+            Trajectory(name="solver", steps=[Step(action="The answer is \\boxed{24}")]),
+        ],
+        artifacts={"answer": "The answer is \\boxed{24}"},
+    )
+
+    result = geo3k_evaluator.evaluate(task, episode)
+    assert result.is_correct is False
+    assert result.reward == 0.0
+
+
+def test_evaluator_no_boxed():
+    task = {"question": "Find x", "ground_truth": "48"}
+
+    episode = Episode(
+        trajectories=[
+            Trajectory(name="solver", steps=[Step(action="I think 48")]),
+        ],
+        artifacts={"answer": "I think 48"},
+    )
+
+    result = geo3k_evaluator.evaluate(task, episode)
+    assert result.is_correct is False
+    assert result.reward == 0.0
diff --git a/cookbooks/geo3k/train.py b/cookbooks/geo3k/train.py
@@ -0,0 +1,39 @@
+"""Train geo3k VLM geometry solver using the Python API.
+
+Usage (from rllm repo root):
+    python cookbooks/geo3k/train.py
+
+Or with Hydra overrides:
+    python cookbooks/geo3k/train.py model.name=Qwen/Qwen3-VL-30B-A3B-Instruct training.group_size=4
+"""
+
+import hydra
+from evaluator import geo3k_evaluator
+from geo3k_flow import geo3k_flow
+from omegaconf import DictConfig
+
+from rllm.data.dataset import DatasetRegistry
+from rllm.experimental.unified_trainer import AgentTrainer
+
+
+@hydra.main(config_path="pkg://rllm.experimental.config", config_name="unified", version_base=None)
+def main(config: DictConfig):
+    train_dataset = DatasetRegistry.load_dataset("geo3k", "train")
+    test_dataset = DatasetRegistry.load_dataset("geo3k", "test")
+
+    if train_dataset is None:
+        raise RuntimeError("geo3k train split not found. Run: rllm dataset pull geo3k")
+
+    trainer = AgentTrainer(
+        backend="tinker",
+        agent_flow=geo3k_flow,
+        evaluator=geo3k_evaluator,
+        config=config,
+        train_dataset=train_dataset,
+        val_dataset=test_dataset,
+    )
+    trainer.train()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cookbooks/geo3k/train.sh b/cookbooks/geo3k/train.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Train geo3k VLM geometry solver via train.py with Hydra overrides.
+#
+# Prerequisites:
+#   1. Install rllm with tinker extras:  uv pip install -e ".[tinker]"
+#   2. Install this cookbook:             uv pip install -e cookbooks/geo3k
+#   3. Pull the dataset:                 rllm dataset pull geo3k
+
+set -euo pipefail
+
+python -u cookbooks/geo3k/train.py \
+    rllm/backend=tinker \
+    model.name=Qwen/Qwen3-VL-30B-A3B-Instruct \
+    model.lora_rank=32 \
+    training.group_size=8 \
+    rllm.trainer.total_epochs=3 \
+    rllm.trainer.test_freq=10 \
+    rllm.trainer.project_name=geo3k \
+    rllm.trainer.experiment_name=qwen3-vl-30b-instruct \
+    "$@"