
Commit da87d77

Active sampling simplified (#1143)

Authored by mnoukhov, root, and finbarrtimbers
* tmp
* initial conversion to reward in accumulate_inference_batches
* nearly working
* first test fixes
* running, just need to test reduced logging
* test scripts, tmp commit for integration test
* update tests
* intermediate commit
* fix accumulate_inference_batches inputs
* change model to actually solve
* refill filtered prompts; move weight sync directly after update; "episode" now refers to "training episode", not "generation episode" as previously
* fix test reward fn
* cleanup and move episode to later
* allow for not having time/reward metric
* always calculate advantage (becomes the same as reward when num_responses_per_prompt is 1; just because cursor keeps complaining)
* try to fix test
* fix ground truths and datasets; makes grpo and ppo reward functions the same
* fix test; we now return k repeats of a prompt, not just 1 in the batch
* active sampling in large tests
* Update open_instruct/grpo_fast.py (Co-authored-by: Finbarr Timbers <[email protected]>)
* Update open_instruct/grpo_fast.py (Co-authored-by: Finbarr Timbers <[email protected]>)
* cursor was right
* address comments
* nit
* 32b without active sampling
* repeat each fix

---------

Co-authored-by: root <[email protected]>
Co-authored-by: Finbarr Timbers <[email protected]>
1 parent e3f4341 commit da87d77


8 files changed (+347, −245 lines)


open_instruct/grpo_fast.py

Lines changed: 251 additions & 232 deletions
Large diffs are not rendered by default.
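
The grpo_fast.py diff itself is not rendered, but the updated tests further down show the new call shape: accumulate_inference_batches now receives the tokenizer and the reward function, computes rewards during accumulation, and returns reward metrics and batch statistics instead of prompt/response lengths. A rough sketch inferred only from those tests (argument values are placeholders, not the actual call site):

combined_result, batch, reward_metrics, batch_stats = grpo_fast.accumulate_inference_batches(
    inference_results_Q,
    pending_queries_map,
    args,
    generation_config=generation_config,
    num_prompts=num_prompts,
    model_dims=model_dims,
    tokenizer=tokenizer,
    reward_fn=reward_fn,
)
# Per the updated assertions, batch.queries now has one entry per sampled response
# (num_prompts * num_samples_per_prompt), not one per unique prompt.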

open_instruct/model_utils.py

Lines changed: 6 additions & 3 deletions
@@ -55,7 +55,9 @@ class Batch:
     ground_truths: list[list[int]]
     datasets: list[str]
     raw_queries: list[str] | None
+    decoded_responses: list[str] | None
     indices: list[int] | None
+    scores: list[float] | None

     def __getitem__(self, key: slice | int | list[int]) -> "Batch":
         """Enable indexing and slicing: batch[5], batch[start:end], or batch[[1,3,5]]."""
@@ -262,7 +264,8 @@ async def apply_verifiable_reward(
     reward_fn_mapping: dict[str, VerifierFunction],
     responses: list[torch.Tensor],
     decoded_responses: list[str],
-    batch: Batch,
+    ground_truths: list[float],
+    datasets: list[str],
     reward_mult: int = 10,
     queries: list[str] | None = None,
 ):
@@ -274,7 +277,7 @@ async def apply_verifiable_reward(
     task_metadata = []

     for i, (tok_prediction, prediction, ground_truth, dataset, query) in enumerate(
-        zip(responses, decoded_responses, batch.ground_truths, batch.datasets, queries)
+        zip(responses, decoded_responses, ground_truths, datasets, queries)
     ):
         # allow multiple ground truths and datasets for a single response

@@ -308,7 +311,7 @@ async def apply_verifiable_reward(
     # Execute all tasks in parallel
     if async_tasks:
         reward_results = await asyncio.gather(*async_tasks)
-        logger.info(f"Applied {len(reward_results)} ground truth rewards in parallel 🤗")
+        logger.debug(f"Applied {len(reward_results)} ground truth rewards in parallel 🤗")
     else:
         reward_results = []
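For context, the Batch dataclass now carries per-response decoded_responses and scores alongside the prompt-level fields. A minimal usage sketch, assuming the seven fields used in the updated tests are the complete constructor (values are illustrative only):

from open_instruct import model_utils

batch = model_utils.Batch(
    queries=[[101, 102], [103, 104]],            # tokenized prompts (illustrative)
    ground_truths=[[42], [7]],
    datasets=["gsm8k", "gsm8k"],                 # hypothetical verifier keys
    raw_queries=["What is 6 * 7?", "What is 3 + 4?"],
    indices=[0, 1],
    decoded_responses=["42", "7"],               # new field: decoded model outputs
    scores=[1.0, 0.0],                           # new field: per-response rewards
)

# __getitem__ supports int, slice, and list indexing, per its docstring.
first_prompt = batch[0]
subset = batch[[0, 1]]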
open_instruct/test_grpo_fast.py

Lines changed: 54 additions & 5 deletions
@@ -4,6 +4,7 @@
 import threading
 import time
 import unittest
+from typing import Any
 from unittest.mock import MagicMock, Mock

 import numpy as np
@@ -198,6 +199,26 @@ def create_mock_result(self, dataset_index, epoch_number, num_samples_per_prompt
             logprobs=[[0.0, 0.0, 0.0] for _ in range(total_responses)],
         )

+    def create_mock_tokenizer_and_reward_fn(self):
+        # Set up dummy tokenizer
+        tokenizer_name = "EleutherAI/pythia-14m"  # Using a small model for testing
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+        # Set up dummy reward fn that will guarantee nonzero std
+        async def reward_fn(
+            responses: list[torch.Tensor],
+            decoded_responses: list[str],
+            ground_truths: list[Any],
+            datasets: list[str],
+            finish_reasons: list[str],
+            infos: list[list[int]],
+            queries: list[str] | None = None,
+        ) -> (list[float], dict[str, Any]):
+            num_responses = len(responses)
+            return [i / num_responses for i in range(num_responses)], {"time/reward": 0.0}
+
+        return tokenizer, reward_fn
+
     def setup_and_split_batch(
         self, queries, ground_truths, datasets, raw_queries, indices, num_engines, training_step=1
     ):
@@ -212,7 +233,13 @@ def setup_and_split_batch(
         self._ray_queues.extend([param_prompt_Q, inference_results_Q])

         batch = model_utils.Batch(
-            queries=queries, ground_truths=ground_truths, datasets=datasets, raw_queries=raw_queries, indices=indices
+            queries=queries,
+            ground_truths=ground_truths,
+            datasets=datasets,
+            raw_queries=raw_queries,
+            indices=indices,
+            decoded_responses=None,
+            scores=None,
         )

         mock_generation_config = MagicMock()
@@ -558,6 +585,9 @@ def test_out_of_order_processing(self):
         # Create test data
         queries, ground_truths, datasets, raw_queries, indices = self.create_test_data(num_prompts)

+        # Create mock tokenizer and reward
+        tokenizer, reward_fn = self.create_mock_tokenizer_and_reward_fn()
+
         # Setup and split batch
         param_prompt_Q, inference_results_Q, pending_queries_map = self.setup_and_split_batch(
             queries, ground_truths, datasets, raw_queries, indices, num_engines
@@ -580,17 +610,19 @@
         mock_generation_config.n = num_samples_per_prompt

         mock_model_dims = self.create_mock_model_dims()
-        combined_result, batch, prompt_lengths, response_lengths = grpo_fast.accumulate_inference_batches(
+        combined_result, batch, reward_metrics, batch_stats = grpo_fast.accumulate_inference_batches(
             inference_results_Q,
             pending_queries_map,
             mock_args,
             generation_config=mock_generation_config,
             num_prompts=num_prompts,
             model_dims=mock_model_dims,
+            tokenizer=tokenizer,
+            reward_fn=reward_fn,
         )

         # Verify results work correctly even with out-of-order processing
-        self.assertEqual(len(batch.queries), num_prompts)
+        self.assertEqual(len(batch.queries), num_prompts * num_samples_per_prompt)
         self.assertEqual(len(combined_result.responses), num_prompts * num_samples_per_prompt)
         self.assertEqual(len(pending_queries_map), 0)

@@ -643,6 +675,9 @@ def test_accumulate_waits_for_all_engines(self):
         num_engines = 4
         num_prompts = 16

+        # Create mock tokenizer and reward
+        tokenizer, reward_fn = self.create_mock_tokenizer_and_reward_fn()
+
         # Setup with results from only 3 engines
         # Queue size must be large enough for all results being put before accumulation starts
         expected_results = 3 * (num_prompts // num_engines)  # 3 engines * 4 results each = 12
@@ -682,6 +717,8 @@ def run_accumulate():
                     generation_config=mock_generation_config,
                     num_prompts=num_prompts,
                     model_dims=mock_model_dims,
+                    tokenizer=tokenizer,
+                    reward_fn=reward_fn,
                 )
                 completed.set()
             except Exception:
@@ -717,7 +754,13 @@ def test_more_engines_than_queries(self):
         self._ray_queues.append(param_prompt_Q)

         batch = model_utils.Batch(
-            queries=queries, ground_truths=ground_truths, datasets=datasets, raw_queries=raw_queries, indices=indices
+            queries=queries,
+            ground_truths=ground_truths,
+            datasets=datasets,
+            raw_queries=raw_queries,
+            indices=indices,
+            decoded_responses=None,
+            scores=None,
         )

         mock_generation_config = MagicMock()
@@ -768,7 +811,13 @@ def test_uneven_distribution_no_empty_batches(self):
         self._ray_queues.append(param_prompt_Q)

         batch = model_utils.Batch(
-            queries=queries, ground_truths=ground_truths, datasets=datasets, raw_queries=raw_queries, indices=indices
+            queries=queries,
+            ground_truths=ground_truths,
+            datasets=datasets,
+            raw_queries=raw_queries,
+            indices=indices,
+            decoded_responses=None,
+            scores=None,
         )

         mock_generation_config = MagicMock()
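
For reference, a quick sketch (not part of the diff) of what the dummy reward_fn from create_mock_tokenizer_and_reward_fn returns: with four responses the scores are evenly spaced, which is what guarantees a nonzero within-group standard deviation.

import asyncio

# tokenizer, reward_fn = self.create_mock_tokenizer_and_reward_fn()  (inside a test case)
# The mock only inspects len(responses), so placeholder values are enough here.
scores, metrics = asyncio.run(
    reward_fn(
        responses=[None, None, None, None],
        decoded_responses=["a", "b", "c", "d"],
        ground_truths=[None] * 4,
        datasets=["gsm8k"] * 4,
        finish_reasons=["stop"] * 4,
        infos=[[]] * 4,
    )
)
assert scores == [0.0, 0.25, 0.5, 0.75]
assert metrics == {"time/reward": 0.0}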

open_instruct/utils.py

Lines changed: 27 additions & 0 deletions
@@ -43,6 +43,7 @@
 import sys
 import threading
 import time
+from collections import defaultdict
 from collections.abc import Iterable
 from concurrent import futures
 from ctypes import CDLL, POINTER, Structure, c_char_p, c_int, c_ulong, c_void_p
@@ -2366,3 +2367,29 @@ def check_calculation(
     )

     logger.warning(warning_message)
+
+
+def combine_reward_metrics(reward_metrics: list[dict[str, Any]]) -> dict[str, Any]:
+    """Assumes same number of metric_records in each dict in the list"""
+    buckets = defaultdict(list)
+    for metrics in reward_metrics:
+        for key, value in metrics.items():
+            buckets[key].append(value)
+
+    combined: dict[str, Any] = {}
+    for key, records in buckets.items():
+        sample_value = records[0]
+        if isinstance(sample_value, np.ndarray):
+            combined[key] = [x for value in records for x in value]
+        elif isinstance(sample_value, (list | tuple)):
+            concatenated: list[Any] = []
+            for value in records:
+                concatenated.extend(list(value))
+            combined[key] = concatenated
+        elif isinstance(sample_value, (int | float | bool | np.integer | np.floating)):
+            # combine and get average value
+            combined[key] = sum(value for value in records) / len(records) if len(records) > 0 else sample_value
+        else:
+            # Fallback: keep the latest value if aggregation strategy is unclear.
+            combined[key] = records[-1]
+    return combined
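
A brief usage sketch for the new helper (the objective/per_prompt_scores key is made up for illustration; time/reward appears in the mock reward function above): scalar values are averaged across the input dicts, while list, tuple, and array values are concatenated.

from open_instruct.utils import combine_reward_metrics

combined = combine_reward_metrics(
    [
        {"time/reward": 0.25, "objective/per_prompt_scores": [1.0, 0.0]},
        {"time/reward": 0.75, "objective/per_prompt_scores": [0.5, 1.0]},
    ]
)
# Scalars are averaged, lists are concatenated:
# {"time/reward": 0.5, "objective/per_prompt_scores": [1.0, 0.0, 0.5, 1.0]}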

scripts/train/debug/grpo_fast.sh

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,6 @@ uv run python open_instruct/grpo_fast.py \
     --num_samples_per_prompt_rollout 4 \
     --model_name_or_path Qwen/Qwen3-0.6B \
     --stop_strings "</answer>" \
-    --apply_r1_style_format_reward \
     --apply_verifiable_reward true \
     --temperature 0.7 \
     --ground_truths_key ground_truth \
@@ -36,4 +35,5 @@ uv run python open_instruct/grpo_fast.py \
     --single_gpu_mode \
     --push_to_hub false \
     --system_prompt_override_file scripts/train/debug/cute_debug_system_prompt.txt \
+    --active_sampling --async_steps 8
     # --with_tracking

scripts/train/debug/large_test_script.sh

Lines changed: 2 additions & 0 deletions
@@ -61,4 +61,6 @@ uv run python mason.py \
     --oe_eval_max_length 32768 \
     --oe_eval_tasks "codex_humanevalplus:0-shot-chat-v1::tulu-thinker,mbppplus:0-shot-chat::tulu-thinker,livecodebench_codegeneration::tulu-thinker" \
     --dataset_skip_cache True \
+    --active_sampling \
+    --async_steps 4 \
     --push_to_hub False

scripts/train/debug/single_gpu_integration_test.sh

Lines changed: 5 additions & 3 deletions
@@ -28,12 +28,12 @@ uv run python mason.py \
     --dataset_mixer_eval_list ai2-adapt-dev/rlvr_gsm8k_zs 16 \
     --dataset_mixer_eval_list_splits train \
     --max_prompt_token_length 512 \
-    --response_length 512 \
-    --pack_length 1024 \
+    --response_length 1024 \
+    --pack_length 2048 \
     --per_device_train_batch_size 1 \
     --num_unique_prompts_rollout 8 \
     --num_samples_per_prompt_rollout 4 \
-    --model_name_or_path Qwen/Qwen3-1.7B \
+    --model_name_or_path Qwen/Qwen2.5-0.5B \
     --stop_strings "</answer>" \
     --apply_r1_style_format_reward \
     --apply_verifiable_reward true \
@@ -55,4 +55,6 @@ uv run python mason.py \
     --vllm_enforce_eager \
     --gradient_checkpointing \
     --push_to_hub false \
+    --active_sampling \
+    --async_steps 8 \
     --single_gpu_mode

scripts/train/olmo3/32b_rl_smoke_test.sh

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ uv run python mason.py \
     --verbose False \
     --ground_truths_key ground_truth \
     --sft_messages_key messages \
-    --total_episodes 200_000 \
+    --total_episodes 10240 \
     --gather_whole_model False \
     --deepspeed_stage 3 \
     --num_learners_per_node 8 8 8 \

0 commit comments