open_instruct/grpo_fast.py (7 additions, 1 deletion)
@@ -354,6 +354,8 @@ class Args:
     """whether to offload parameters to CPU (reduces GPU memory usage)"""
     deepspeed_offload_optimizer: bool = False
     """whether to offload optimizer states to CPU (reduces GPU memory usage)"""
+    deepspeed_cpu_adam: bool = False
+    """whether to use the DeepSpeedCPUAdam optimizer"""
     gather_whole_model: bool = True
     """whether to gather the whole model to boardcast (not doable for 70B but can be faster for 8B)"""
     enable_queue_dashboard: bool = True
@@ -708,7 +710,11 @@ def load(self, path: str, map_location=None):
             optim_params = get_optimizer_grouped_parameters(self.policy, args.weight_decay)
         else:
             optim_params = self.policy.parameters()
-        self.optimizer = torch.optim.AdamW(optim_params, lr=args.learning_rate, fused=args.fused_optimizer)
+        if args.deepspeed_cpu_adam:
+            from deepspeed.ops.adam import DeepSpeedCPUAdam
+            self.optimizer = DeepSpeedCPUAdam(optim_params, lr=args.learning_rate)
+        else:
+            self.optimizer = torch.optim.AdamW(optim_params, lr=args.learning_rate, fused=args.fused_optimizer)
         num_scheduler_steps = args.num_training_steps * args.num_epochs * args.num_mini_batches
         warm_up_steps = args.warm_up_steps
         if args.warmup_ratio > 0.0: