File tree · Expand file tree · Collapse file tree · 1 file changed: +18 -0 lines changed
Expand file tree · Collapse file tree · 1 file changed: +18 -0 lines changed
Original file line number | Diff line number | Diff line change
@@ -3013,6 +3013,8 @@ def health_check_fn():
30133013 else :
30143014 num_total_tokens = 0
30153015
3016+ num_prompts_to_refill = 0
3017+ avg_step_time = 0.0
30163018 training_start_time = time .perf_counter () # Track overall training start time
30173019 for training_step in range (resume_training_step , args .num_training_steps + 1 ):
30183020 start_time = time .perf_counter ()
@@ -3095,6 +3097,22 @@ def health_check_fn():
30953097 iter_dataloader ,
30963098 )
30973099
3100+ logger .debug (f"[Main Thread] Triggered weight sync for step { training_step } " )
3101+ weight_sync_trigger_event .set ()
3102+
3103+ # Print ETA (estimated time remaining)
3104+ current_step_time = time .perf_counter () - start_time
3105+ if avg_step_time == 0 :
3106+ avg_step_time = current_step_time
3107+ else :
3108+ avg_step_time = 0.1 * current_step_time + 0.9 * avg_step_time
3109+
3110+ remaining_steps = args .num_training_steps - training_step
3111+ eta_seconds = remaining_steps * avg_step_time
3112+ logger .info (
3113+ f"[Main Thread] ⏳ ETA to finish: { utils .format_eta (eta_seconds )} ({ avg_step_time :.2f} s/step, remaining { remaining_steps } steps)"
3114+ )
3115+
30983116 # Checkpoint after one_training_step (or even if it was skipped)
30993117 # This ensures we checkpoint progress even if the exact checkpoint step has no data
31003118 if (
You can’t perform that action at this time.
0 commit comments