Skip to content

Commit d754a3c

Browse files
committed
eta logging
1 parent f1e0063 commit d754a3c

File tree

1 file changed

+18
-0
lines changed

1 file changed

+18
-0
lines changed

open_instruct/grpo_fast.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3013,6 +3013,8 @@ def health_check_fn():
30133013
else:
30143014
num_total_tokens = 0
30153015

3016+
num_prompts_to_refill = 0
3017+
avg_step_time = 0.0
30163018
training_start_time = time.perf_counter() # Track overall training start time
30173019
for training_step in range(resume_training_step, args.num_training_steps + 1):
30183020
start_time = time.perf_counter()
@@ -3095,6 +3097,22 @@ def health_check_fn():
30953097
iter_dataloader,
30963098
)
30973099

3100+
logger.debug(f"[Main Thread] Triggered weight sync for step {training_step}")
3101+
weight_sync_trigger_event.set()
3102+
3103+
# Log the ETA (estimated time remaining), based on an exponential moving average of the per-step time
3104+
current_step_time = time.perf_counter() - start_time
3105+
if avg_step_time == 0:
3106+
avg_step_time = current_step_time
3107+
else:
3108+
avg_step_time = 0.1 * current_step_time + 0.9 * avg_step_time
3109+
3110+
remaining_steps = args.num_training_steps - training_step
3111+
eta_seconds = remaining_steps * avg_step_time
3112+
logger.info(
3113+
f"[Main Thread] ⏳ ETA to finish: {utils.format_eta(eta_seconds)} ({avg_step_time:.2f}s/step, remaining {remaining_steps} steps)"
3114+
)
3115+
30983116
# Checkpoint after one_training_step (or even if it was skipped)
30993117
# This ensures we checkpoint progress even if the exact checkpoint step has no data
31003118
if (

0 commit comments

Comments
 (0)