
Commit f6e0d42

Fix aux loss scale when CP is enabled. (#2237)

1 parent: e2199af

File tree: 2 files changed (+3, −2 lines)
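
A likely reading of the one-line change: with context parallelism (CP), each CP rank computes the MoE auxiliary (load-balancing) loss on its own sequence slice, and the aux-loss gradient is averaged across the CP group during reduction, so without compensation the effective scale ends up a factor of cp_group_size too small relative to the non-CP case. Multiplying the scale by cp_group_size restores parity. A minimal sketch of the arithmetic, using a hypothetical standalone helper (the real code sets the scale on MoEAuxLossAutoScaler directly):

```python
def effective_aux_loss_scale(
    loss_scale: float,
    num_microbatches: int,
    cp_group_size: int,
    calculate_per_token_loss: bool,
) -> float:
    """Sketch of the aux-loss scale logic in forward_step_calc_loss."""
    if calculate_per_token_loss:
        # Per-token loss: normalization by token counts already accounts
        # for microbatches and CP slices, so the scale is used as-is.
        return loss_scale
    # Average over microbatches; multiply by the CP group size to undo the
    # implicit averaging of the aux-loss gradient across CP ranks.
    return loss_scale * cp_group_size / num_microbatches
```

With cp_group_size == 1 this reduces to the previous behavior, loss_scale / num_microbatches, so the fix is a no-op when CP is disabled.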

megatron/core/pipeline_parallel/schedules.py (1 addition, 1 deletion)

```diff
@@ -268,7 +268,7 @@ def forward_step_calc_loss(
         if config.calculate_per_token_loss:
             MoEAuxLossAutoScaler.set_loss_scale(loss_scale)
         else:
-            MoEAuxLossAutoScaler.set_loss_scale(loss_scale / num_microbatches)
+            MoEAuxLossAutoScaler.set_loss_scale(loss_scale * cp_group_size / num_microbatches)

     # Set the loss scale for Multi-Token Prediction (MTP) loss.
     if hasattr(config, 'mtp_num_layers') and config.mtp_num_layers is not None:
```
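
For context, MoEAuxLossAutoScaler attaches the auxiliary loss to the main output via a custom autograd function: the forward pass is an identity on the main tensor, and the backward pass injects the aux-loss gradient multiplied by a class-level scale, which is what set_loss_scale updates. A simplified sketch of that pattern (not the exact Megatron-LM implementation):

```python
import torch

class AuxLossAutoScaler(torch.autograd.Function):
    """Sketch of the aux-loss auto-scaler pattern: identity in forward,
    scaled aux-loss gradient injected in backward."""

    # Class-level scale, updated via set_loss_scale.
    main_loss_backward_scale: torch.Tensor = torch.tensor(1.0)

    @staticmethod
    def forward(ctx, output: torch.Tensor, aux_loss: torch.Tensor):
        ctx.save_for_backward(aux_loss)
        return output  # identity on the main path

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        (aux_loss,) = ctx.saved_tensors
        scale = AuxLossAutoScaler.main_loss_backward_scale
        # d(scale * aux_loss)/d(aux_loss) is just `scale`.
        return grad_output, torch.ones_like(aux_loss) * scale

    @staticmethod
    def set_loss_scale(scale: torch.Tensor):
        AuxLossAutoScaler.main_loss_backward_scale = scale
```

Because the scale is applied only in backward, any correction (such as the cp_group_size factor above) must be folded into the value passed to set_loss_scale before the backward pass runs.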

tests/unit_tests/transformer/moe/test_aux_loss.py (2 additions, 1 deletion)

```diff
@@ -331,7 +331,7 @@ def test_seq_aux_loss(self, tp_size, ep_size, cp_size):
         not torch.cuda.is_available() or not HAVE_ROUTER_FUSION,
         reason="CUDA or TE fused router ops not available",
     )
-    @pytest.mark.parametrize("aux_type", ["aux_loss", "seq_aux_loss"])
+    @pytest.mark.parametrize("aux_type", ["aux_loss", "seq_aux_loss", "global_aux_loss"])
     def test_aux_loss_fusion_equivalence(self, aux_type):
         # Compare fused vs unfused aux loss path to ensure numerical equivalence
         router_ref = self.new_router(
@@ -350,6 +350,7 @@ def test_aux_loss_fusion_equivalence(self, aux_type):
         loss_name_map = {
             "aux_loss": "load_balancing_loss",
             "seq_aux_loss": "seq_load_balancing_loss",
+            "global_aux_loss": "global_load_balancing_loss",
         }
         loss_name = loss_name_map[aux_type]
```
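
The test change extends the fused-vs-unfused equivalence check to the global aux loss: the parametrize list now covers all three aux-loss types, and each type is mapped to the name under which the router tracks its loss. The comparison itself boils down to a numerical-closeness assertion; a minimal sketch of that pattern, with hypothetical tensors standing in for the two paths:

```python
import torch

def assert_fused_matches_unfused(
    loss_fused: torch.Tensor, loss_unfused: torch.Tensor
) -> None:
    # A fused kernel must reproduce the reference (unfused) computation
    # within floating-point tolerance; assert_close raises otherwise.
    torch.testing.assert_close(loss_fused, loss_unfused, rtol=1e-5, atol=1e-6)

# Hypothetical usage: losses produced by the fused and reference router
# paths for the same input should agree.
assert_fused_matches_unfused(torch.tensor(0.1234), torch.tensor(0.1234))
```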
