Merged

77 commits
b55b3f6
Integrate SDK tracing with Google ADK
jeffreysijuntan Feb 25, 2026
1461878
add support for openai agents
jeffreysijuntan Feb 25, 2026
87963f6
add openai agents
jeffreysijuntan Feb 25, 2026
1fc5356
Integrate SDK tracing with Strands Agents
jeffreysijuntan Feb 26, 2026
5270f62
support rLLM SDK in unified trainer
jeffreysijuntan Feb 26, 2026
95c1c41
Merge origin/main into dev-sdk
jeffreysijuntan Feb 26, 2026
b1d6bc0
Optimize SDK trainer performance
jeffreysijuntan Feb 26, 2026
7eee3cd
unify data schema between SDK and rLLM
jeffreysijuntan Feb 26, 2026
86f0e85
Remove StepView/TrajectoryView aliases, use Step/Trajectory everywhere
jeffreysijuntan Feb 26, 2026
14f0571
deprecate some old examples
jeffreysijuntan Feb 26, 2026
62043c3
Add TinkerProxyManager to unify SDK proxy pipeline for Tinker backend
jeffreysijuntan Feb 26, 2026
e0a0e5b
Fix Strands math training returning null output by switching to LiteL…
jeffreysijuntan Feb 27, 2026
dc51fe4
Remove fake_stream from TinkerProxyManager config
jeffreysijuntan Feb 27, 2026
28a7b97
Add OpenAI Agents SDK math training example
jeffreysijuntan Feb 27, 2026
061dc72
Add math agent example with ADK
jeffreysijuntan Feb 27, 2026
09cc58f
Add sandboxed agent execution for rLLM SDK
jeffreysijuntan Feb 28, 2026
1fd74fc
Add AWS Bedrock AgentCore sandbox backend
jeffreysijuntan Feb 28, 2026
a239aa4
Improve sandbox reliability with retry logic, async waits, and connec…
jeffreysijuntan Feb 28, 2026
bdc7c33
moving some examples to archive
jeffreysijuntan Mar 1, 2026
1315e2f
Add rllm CLI with dataset management, eval runner, and setup commands
jeffreysijuntan Mar 1, 2026
11be31c
Add AgentFlow and Evaluator abstractions for eval framework
jeffreysijuntan Mar 1, 2026
1154d9f
Add 15 new benchmarks with agents, evaluators, and transform infrastr…
jeffreysijuntan Mar 1, 2026
045d62e
Consolidate CLI into `rllm model` command group with per-provider API…
jeffreysijuntan Mar 1, 2026
0bf7dcb
Add 11 new benchmarks, multilingual aggregation, and fix BFCL/HMMT/Lo…
jeffreysijuntan Mar 1, 2026
23b0746
Add 10 VLM benchmarks with image handling, agents, and transforms
jeffreysijuntan Mar 2, 2026
10b850e
Add `rllm train` CLI command with session-aware proxy tracing
jeffreysijuntan Mar 2, 2026
7e4ba19
Replace dual HTTP hop with direct TinkerProxy for tinker training
jeffreysijuntan Mar 2, 2026
d59d264
Lazy-load heavy imports for faster CLI startup
jeffreysijuntan Mar 2, 2026
32496f8
Add agent/evaluator plugin system with persistent registration, entry…
jeffreysijuntan Mar 2, 2026
8ab7d54
Inline image bytes via Arrow IPC for zero-loss VLM dataset pipeline
jeffreysijuntan Mar 3, 2026
195556b
Fix 7 broken dataset catalog entries and improve gated dataset errors
jeffreysijuntan Mar 3, 2026
6ea04fd
Add 6 VLM benchmarks: AI2D, OCRBench, CharXiv, CC-OCR, CountBenchQA, …
jeffreysijuntan Mar 3, 2026
08238b0
Add AIME 2025 and AIME 2026 math competition benchmarks
jeffreysijuntan Mar 3, 2026
97823f7
Add Geometry3K (geo3k) VLM benchmark for geometry problem solving wit…
jeffreysijuntan Mar 3, 2026
e552e73
Load image_processor for VLM models in Tinker backend to fix training
jeffreysijuntan Mar 3, 2026
3c70bf0
Add search agent and 4 web search benchmarks (browsecomp, seal0, wide…
jeffreysijuntan Mar 3, 2026
57a9b90
Fix EvalRunner thread pool bottleneck and add async AgentFlow support
jeffreysijuntan Mar 3, 2026
8c1cce3
Merge branch 'main' of https://github.com/rllm-org/rllm into dev-cli
jeffreysijuntan Mar 4, 2026
b71a093
Add --ui and --ui-url flags to `rllm train` CLI
jeffreysijuntan Mar 4, 2026
be2de68
Add sandboxed benchmark support with SandboxedAgentFlow, tool system,…
jeffreysijuntan Mar 4, 2026
599168e
Add SWE-bench agent plugin with mini-swe-agent-inspired scaffolding
jeffreysijuntan Mar 4, 2026
ddf717e
Fix advantage estimator return types and UILogger error handling
jeffreysijuntan Mar 4, 2026
042c59b
Add FrozenLake eval framework plugin with procedural dataset generation
jeffreysijuntan Mar 5, 2026
b271bab
Add VLM benchmarks for text recognition, document understanding, and …
jeffreysijuntan Mar 5, 2026
cee82e9
Add general-purpose ReAct agent plugin with TaskSpec-driven eval pipe…
jeffreysijuntan Mar 5, 2026
3eabe3f
Add `rllm init` command to scaffold new agent projects
jeffreysijuntan Mar 5, 2026
16e3a46
Replace 15 single-purpose agents with built-in multi-turn ReAct agent…
jeffreysijuntan Mar 6, 2026
b98d43e
Slim core dependencies for rllm 0.3.0 — move training/reward/tool dep…
jeffreysijuntan Mar 6, 2026
00dda85
feat(eval): add --ui and --ui-url flags to `rllm eval` CLI
Chanbinski Mar 6, 2026
d47c3d3
refactor(cli): simplify --ui/--ui-url flags to just --ui
Chanbinski Mar 6, 2026
725af2d
docs(ui): update rLLM UI descriptions and nav title
Chanbinski Mar 6, 2026
503f44b
feat(tracking): add non-blocking background worker to UILogger
Chanbinski Mar 6, 2026
9d636b3
Merge pull request #424 from Chanbinski/support-eval
jeffreysijuntan Mar 6, 2026
c555e7d
Merge branch 'main' of https://github.com/rllm-org/rllm into dev-cli
jeffreysijuntan Mar 6, 2026
2273bd4
feat(training): unify AgentFlow + Workflow via model gateway
jeffreysijuntan Mar 6, 2026
78458fa
feat(cli): add `rllm login` command for UI authentication
Chanbinski Mar 6, 2026
560fa0e
fix: avoid event loop blocking and support VLM multimodal content in …
jeffreysijuntan Mar 6, 2026
0f42892
Merge pull request #425 from Chanbinski/feat/login
jeffreysijuntan Mar 6, 2026
096511e
feat(eval): support async arun in AgentFlow contract
jeffreysijuntan Mar 6, 2026
5320824
feat: add SmolAgents, Strands, and LangGraph agent plugins with SDK i…
jeffreysijuntan Mar 6, 2026
3f80469
refactor: remove SDK/agent_run_func path from UnifiedTrainer
jeffreysijuntan Mar 7, 2026
136280b
fix: remove unused mock_at_cls variables in train command tests
jeffreysijuntan Mar 7, 2026
96ec517
fix: resolve ruff linting errors across codebase
jeffreysijuntan Mar 7, 2026
0a27d90
deprecate prior integrations
jeffreysijuntan Mar 7, 2026
e19bf4b
Merge branch 'main' of https://github.com/rllm-org/rllm into dev-cli
jeffreysijuntan Mar 11, 2026
790b055
style(cli): redesign banner and dataset list with Rich styling
jeffreysijuntan Mar 11, 2026
3612399
feat(eval): expand provider support with unified registry
jeffreysijuntan Mar 11, 2026
94754db
feat(cli): auto-enable UI logging when user is logged in
jeffreysijuntan Mar 11, 2026
fc06ea6
chore(init): keep only multi-turn ReAct agent template
jeffreysijuntan Mar 11, 2026
8bfa069
Merge branch 'main' of https://github.com/rllm-org/rllm into dev-cli
jeffreysijuntan Mar 12, 2026
df29654
refactor: rename plugins/ to agenthub/ and clean up examples
jeffreysijuntan Mar 12, 2026
e83d33b
feat(ui): progressive batched uploads, session URL, and registration …
Chanbinski Mar 12, 2026
ec80f2f
feat(ui): batch trajectory group uploads
Chanbinski Mar 12, 2026
b10d9b0
clean up dependencies
jeffreysijuntan Mar 12, 2026
9ac5f71
fix: use keyword arguments for Pydantic BaseModel constructors
jeffreysijuntan Mar 13, 2026
9797170
Merge pull request #440 from Chanbinski/feat/ui-logging-improvements
jeffreysijuntan Mar 13, 2026
ad921f1
fix precommit errors
jeffreysijuntan Mar 13, 2026
5 changes: 5 additions & 0 deletions agenthub/frozenlake_agent/agent/__init__.py
@@ -0,0 +1,5 @@
"""FrozenLake agent plugin for rLLM."""

from .agent import FrozenLakeAgentFlow, frozenlake_agent

__all__ = ["FrozenLakeAgentFlow", "frozenlake_agent"]
141 changes: 141 additions & 0 deletions agenthub/frozenlake_agent/agent/agent.py
@@ -0,0 +1,141 @@
"""FrozenLake AgentFlow — multi-turn grid navigation agent."""

from __future__ import annotations

import logging
import re

import openai

from rllm.experimental.eval.types import AgentConfig
from rllm.types import Episode, Step, Trajectory

from .env import ACTION_INVALID, FrozenLakeEnv

logger = logging.getLogger(__name__)

DIRECTION_MAP = {"left": 1, "down": 2, "right": 3, "up": 4}

SYSTEM_PROMPT = """\
You are a helpful assistant. You are walking on a frozen lake.

FrozenLake Quick Guide
Goal: Reach the goal (G). Player (P) and Goal (G) must overlap.

Symbols:
_ Frozen | O Hole | G Goal | P Player

Rules:
1. Avoid falling into holes (O).
2. Frozen tiles are slippery, so you may move perpendicular to your intended direction.

Valid Actions (separated by | ):
Up | Down | Left | Right

Rewards:
Fall into hole: 0
Reach goal: +1.0

You will be provided the current observation; please decide on the next Action.
You should show your thought process and then put the final action in ``` ```.
You should only output the NEXT ACTION at each iteration in the ``` ```. For example, if you want to move up, you should output ```Up```.
You should plan ahead and aim to reach the goal in the minimum number of steps.
You should be aware that frozen tiles can be slippery, but the chance is small and you should not overthink it.

Please show your thinking process and put the final action in ``` ```. In every turn, the final action MUST be one of Up, Down, Left, Right.
"""

DEFAULT_MAX_STEPS = 10


def _parse_action(response: str) -> int:
    """Extract a direction action from the model response.

    Looks for the last ```...``` block and maps its content to an action int.
    Returns ACTION_INVALID (0) if parsing fails.
    """
    matches = re.findall(r"```(.*?)```", response, re.DOTALL)
    if not matches:
        return ACTION_INVALID

    text = matches[-1].strip().lower()
    if text in DIRECTION_MAP:
        return DIRECTION_MAP[text]
    if text.isdigit() and int(text) in DIRECTION_MAP.values():
        return int(text)
    return ACTION_INVALID


class FrozenLakeAgentFlow:
    """AgentFlow implementation for the FrozenLake grid navigation task."""

    def run(self, task: dict, config: AgentConfig) -> Episode:
        seed = task.get("seed", 42)
        size = task.get("size", 4)
        p = task.get("p", 0.8)
        max_steps = task.get("max_steps", DEFAULT_MAX_STEPS)

        env = FrozenLakeEnv(size=size, p=p, seed=seed, max_steps=max_steps)
        obs = env.reset()

        client = openai.OpenAI(base_url=config.base_url, api_key="not-needed")

        messages: list[dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
        steps: list[Step] = []
        num_steps = 0

        for turn in range(max_steps):
            user_content = f"Current Observation ({turn}):\n{obs}\nYou have not achieved the goal, P has not reached G yet. Please give the next action."
            if turn > 0 and steps and not steps[-1].metadata.get("action_is_effective", True):
                user_content += "\nYour last response was invalid: your position did not change at all. Recheck your thinking process, the action you output, and the format of your response. Remember, you should only output the NEXT ACTION at each iteration in the ``` ```. For example, if you want to move up, you should output ```Up```."
            remaining = max_steps - turn
            user_content += f"\nThe maximum number of steps remaining is {remaining}."

            messages.append({"role": "user", "content": user_content})

            response = client.chat.completions.create(
                model=config.model,
                messages=messages,
                temperature=0.0,
            )
            assistant_text = response.choices[0].message.content or ""
            messages.append({"role": "assistant", "content": assistant_text})

            action = _parse_action(assistant_text)
            obs, reward, done, info = env.step(action)
            num_steps += 1

            steps.append(
                Step(
                    input=user_content,
                    output=assistant_text,
                    action=action,
                    reward=reward,
                    done=done,
                    metadata=info,
                )
            )

            if done:
                break

        success = env.success()
        task_id = task.get("task_id", f"frozenlake_s{seed}")

        trajectory = Trajectory(
            name="navigator",
            task=task,
            steps=steps,
            reward=1.0 if success else 0.0,
        )

        return Episode(
            id=f"{task_id}:0",
            task=task,
            trajectories=[trajectory],
            artifacts={"success": success, "num_steps": num_steps},
        )


# Module-level singleton for plugin entry point
frozenlake_agent = FrozenLakeAgentFlow()
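The action-extraction convention above ("last fenced block wins, case-insensitive, numeric ids also accepted") is worth seeing in isolation. The sketch below is a hypothetical standalone copy of the diff's `_parse_action` helper (renamed `parse_action`, with the same `DIRECTION_MAP` and `ACTION_INVALID` constants inlined) so its behavior can be checked without importing the plugin:

```python
import re

# Standalone mirror of agenthub/frozenlake_agent/agent/agent.py::_parse_action:
# take the LAST ```...``` block in a model reply and map it to an action id.
DIRECTION_MAP = {"left": 1, "down": 2, "right": 3, "up": 4}
ACTION_INVALID = 0


def parse_action(response: str) -> int:
    matches = re.findall(r"```(.*?)```", response, re.DOTALL)
    if not matches:
        return ACTION_INVALID
    text = matches[-1].strip().lower()
    if text in DIRECTION_MAP:
        return DIRECTION_MAP[text]
    # Numeric action ids like "2" are also accepted
    if text.isdigit() and int(text) in DIRECTION_MAP.values():
        return int(text)
    return ACTION_INVALID


print(parse_action("I will go ```Up```"))                 # → 4
print(parse_action("First ```Left```, then ```Down```"))  # last block wins → 2
print(parse_action("no fenced action here"))              # → 0
```

Taking the last fenced block rather than the first matters for chain-of-thought responses, where earlier blocks may quote candidate moves before the final decision.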
203 changes: 203 additions & 0 deletions agenthub/frozenlake_agent/agent/env.py
@@ -0,0 +1,203 @@
"""Lightweight FrozenLake environment — no gymnasium dependency.

Reimplements the core grid-world logic from the legacy
rllm/environments/frozenlake/frozenlake.py using only numpy.
"""

from __future__ import annotations

import numpy as np

# ---------------------------------------------------------------------------
# Map generation
# ---------------------------------------------------------------------------


def _is_valid(board: list[list[str]], max_steps: int) -> bool:
    """DFS check that a path from S to G exists within max_steps."""
    arr = np.array(board)
    start_r, start_c = np.where(arr == "S")
    frontier: list[tuple[int, int, int]] = [(int(start_r[0]), int(start_c[0]), 0)]
    discovered: set[tuple[int, int]] = set()
    size = len(board)

    while frontier:
        r, c, steps = frontier.pop()
        if steps > max_steps:
            continue
        if (r, c) in discovered:
            continue
        discovered.add((r, c))
        for dr, dc in [(1, 0), (0, 1), (-1, 0), (0, -1)]:
            nr, nc = r + dr, c + dc
            if 0 <= nr < size and 0 <= nc < size:
                if board[nr][nc] == "G":
                    return True
                if board[nr][nc] != "H":
                    frontier.append((nr, nc, steps + 1))
    return False


def generate_random_map(size: int = 8, p: float = 0.8, seed: int = 0, max_steps: int = 5) -> tuple[list[str], tuple[int, int]]:
    """Generate a random valid FrozenLake map.

    Args:
        size: Grid side length.
        p: Probability a tile is frozen (vs. a hole).
        seed: RNG seed for reproducibility.
        max_steps: Maximum steps for the path-validity check.

    Returns:
        (map_rows, goal_position) where map_rows is a list of strings
        like ``["SFFF", "FHFH", "FFFH", "HFFG"]`` and goal_position
        is ``(row, col)`` of G.
    """
    rng = np.random.RandomState(seed)
    p = min(1.0, p)

    while True:
        board = rng.choice(["F", "H"], (size, size), p=[p, 1 - p]).tolist()

        # Pick distinct start and goal positions
        while True:
            sr, sc = int(rng.randint(0, size)), int(rng.randint(0, size))
            gr, gc = int(rng.randint(0, size)), int(rng.randint(0, size))
            if (sr, sc) != (gr, gc):
                break

        board[sr][sc] = "S"
        board[gr][gc] = "G"

        if _is_valid(board, max_steps):
            return ["".join(row) for row in board], (gr, gc)


# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------

# Action constants
ACTION_INVALID = 0
ACTION_LEFT = 1
ACTION_DOWN = 2
ACTION_RIGHT = 3
ACTION_UP = 4

ACTION_LOOKUP = {0: "None", 1: "Left", 2: "Down", 3: "Right", 4: "Up"}

# Deltas: (row_delta, col_delta) for each action
_DELTAS = {
    ACTION_LEFT: (0, -1),
    ACTION_DOWN: (1, 0),
    ACTION_RIGHT: (0, 1),
    ACTION_UP: (-1, 0),
}

# Render symbols
_GRID_LOOKUP = {
    "P": " P \t",
    "F": " _ \t",
    "H": " O \t",
    "G": " G \t",
    "X": " X \t",  # player fell into a hole
    "V": " √ \t",  # player reached the goal
}


class FrozenLakeEnv:
    """Pure-Python FrozenLake grid-world environment."""

    def __init__(
        self,
        size: int = 4,
        p: float = 0.8,
        seed: int = 42,
        max_steps: int = 5,
        is_slippery: bool = False,
    ):
        self.size = size
        self.p = p
        self.seed = seed
        self.max_steps = max_steps
        self.is_slippery = is_slippery

        self._map: list[str] = []
        self._goal: tuple[int, int] = (0, 0)
        self._player: tuple[int, int] = (0, 0)
        self._done = False
        self.reset()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def reset(self) -> str:
        """Reset the environment and return the initial observation."""
        self._map, self._goal = generate_random_map(size=self.size, p=self.p, seed=self.seed, max_steps=self.max_steps)
        # Find the start position
        for r, row in enumerate(self._map):
            for c, ch in enumerate(row):
                if ch == "S":
                    self._player = (r, c)
                    break
        self._done = False
        return self.render()

    def step(self, action: int) -> tuple[str, float, bool, dict]:
        """Take an action and return (observation, reward, done, info).

        Actions: 1=Left, 2=Down, 3=Right, 4=Up. 0 is invalid (no-op).
        """
        if self._done:
            return self.render(), 0.0, True, {"action_is_effective": False}

        if action == ACTION_INVALID or action not in _DELTAS:
            return self.render(), 0.0, False, {"action_is_effective": False}

        prev = self._player
        dr, dc = _DELTAS[action]
        nr, nc = prev[0] + dr, prev[1] + dc

        # Boundary check
        if 0 <= nr < self.size and 0 <= nc < self.size:
            self._player = (nr, nc)

        tile = self._map[self._player[0]][self._player[1]]
        effective = self._player != prev

        if tile == "G":
            self._done = True
            return self.render(), 1.0, True, {"action_is_effective": effective}
        if tile == "H":
            self._done = True
            return self.render(), 0.0, True, {"action_is_effective": effective}

        return self.render(), 0.0, False, {"action_is_effective": effective}

    def render(self) -> str:
        """Render the grid as a text string (P=player, _=frozen, O=hole, G=goal)."""
        rows = []
        for r in range(self.size):
            cells = []
            for c in range(self.size):
                if (r, c) == self._player:
                    tile = self._map[r][c]
                    if tile == "H":
                        cells.append(_GRID_LOOKUP["X"])
                    elif tile == "G":
                        cells.append(_GRID_LOOKUP["V"])
                    else:
                        cells.append(_GRID_LOOKUP["P"])
                else:
                    ch = self._map[r][c]
                    # Replace the start marker with frozen
                    sym = "F" if ch == "S" else ch
                    cells.append(_GRID_LOOKUP[sym])
            rows.append("".join(cells))
        return "\n".join(rows)

    def finished(self) -> bool:
        return self._done

    def success(self) -> bool:
        return self._done and self._map[self._player[0]][self._player[1]] == "G"
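The map generator above rejects any random board that fails the step-bounded DFS reachability check, which is the part most worth sanity-checking. The sketch below is a hypothetical standalone mirror of `_is_valid` (renamed `is_valid`, taking the map as a list of row strings and dropping the numpy dependency), exercised on a known-solvable 4x4 board and on one where the goal is walled off by holes:

```python
# Standalone mirror of env.py::_is_valid: depth-first search from S, expanding
# only non-hole tiles, returning True as soon as a neighbor of a visited tile is G.
def is_valid(board: list[str], max_steps: int) -> bool:
    size = len(board)
    start = next((r, c) for r in range(size) for c in range(size) if board[r][c] == "S")
    frontier = [(start[0], start[1], 0)]  # (row, col, steps taken so far)
    seen: set[tuple[int, int]] = set()
    while frontier:
        r, c, steps = frontier.pop()
        if steps > max_steps or (r, c) in seen:
            continue
        seen.add((r, c))
        for dr, dc in [(1, 0), (0, 1), (-1, 0), (0, -1)]:
            nr, nc = r + dr, c + dc
            if 0 <= nr < size and 0 <= nc < size:
                if board[nr][nc] == "G":
                    return True
                if board[nr][nc] != "H":
                    frontier.append((nr, nc, steps + 1))
    return False


# The classic 4x4 map is solvable; walling G off with holes makes it unsolvable.
print(is_valid(["SFFF", "FHFH", "FFFH", "HFFG"], max_steps=20))  # True
print(is_valid(["SFFF", "FHFH", "FFHH", "HFHG"], max_steps=20))  # False
```

Note that because `seen` is keyed on position only, a tile first reached on a long DFS path is never revisited with a shorter step count, so the `max_steps` bound is conservative rather than exact; the generator compensates by simply drawing a fresh board on rejection.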
5 changes: 5 additions & 0 deletions agenthub/frozenlake_agent/eval/__init__.py
@@ -0,0 +1,5 @@
"""FrozenLake evaluator plugin for rLLM."""

from .evaluator import FrozenLakeEvaluator

__all__ = ["FrozenLakeEvaluator"]
25 changes: 25 additions & 0 deletions agenthub/frozenlake_agent/eval/evaluator.py
@@ -0,0 +1,25 @@
"""FrozenLake evaluator: scores episodes based on goal-reaching success."""

from __future__ import annotations

from rllm.experimental.eval.types import EvalOutput, Signal
from rllm.types import Episode


class FrozenLakeEvaluator:
    """Evaluator that checks whether the agent reached the goal."""

    def evaluate(self, task: dict, episode: Episode) -> EvalOutput:
        success = episode.artifacts.get("success", False)
        num_steps = episode.artifacts.get("num_steps", 0)

        reward = 1.0 if success else 0.0
        return EvalOutput(
            reward=reward,
            is_correct=bool(success),
            signals=[
                Signal("success", float(success)),
                Signal("num_steps", float(num_steps)),
            ],
            metadata={"num_steps": num_steps},
        )