Skip to content

Commit c972753

Browse files
Refactor LM losses v2 (#449)
Co-authored-by: oleksost <[email protected]>
1 parent c208b26 commit c972753

File tree

30 files changed

+1551
-1445
lines changed

30 files changed

+1551
-1445
lines changed

fast_llm/core/distributed.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,12 @@ def check_parallel_match(tensor: torch.Tensor, group: ProcessGroup | None, name:
7272
)
7373

7474

75-
def safe_barrier(group: ProcessGroup | None, value: int | str = 1, timeout: float | None = None) -> None:
75+
def safe_barrier(
76+
group: ProcessGroup | None, value: int | str = 1, timeout: float | None = None, device: torch.device | None = None
77+
) -> None:
7678
if group:
7779
hashed = hash(value) % 2**32
78-
out = allreduce_scalar(hashed, dtype=torch.int64, group=group, timeout=timeout)
80+
out = allreduce_scalar(hashed, dtype=torch.int64, group=group, timeout=timeout, device=device)
7981
if out != hashed * group.size():
8082
raise RuntimeError(f"Desync detected for barrier {value} ({out}!={hashed*group.size()})")
8183

@@ -86,9 +88,10 @@ def allreduce_scalar(
8688
group: torch.distributed.ProcessGroup | None = None,
8789
op=ReduceOp.SUM,
8890
timeout: float | None = None,
91+
device: torch.device | None = None,
8992
) -> float | int:
9093
if group:
91-
value = torch.full([1], value, dtype=dtype, device=torch.cuda.current_device())
94+
value = torch.full([1], value, dtype=dtype, device=torch.cuda.current_device() if device is None else device)
9295
with set_timeout(group, timeout):
9396
torch.distributed.all_reduce(value, op=op, group=group)
9497
return value.item()

fast_llm/engine/schedule/runner.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,9 @@ def _preprocess_data(
327327
self, context: BatchContext, data_iterator: typing.Iterator, preprocessed: bool
328328
) -> typing.Generator[None, None, None]:
329329
batch_config = context.schedule.batch_config
330-
grad_output = (1 if self._optimizer is None else self._optimizer.grad_scale) / batch_config.num_inputs
330+
grad_output = (
331+
self._optimizer.grad_scale / batch_config.num_inputs if context.schedule.phase.is_training else None
332+
)
331333
for micro_batch in range(batch_config.sequential_micro_batches):
332334
micro_batch_data = next(data_iterator)
333335
if not preprocessed:

fast_llm/functional/autograd.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,14 @@ def call(*args, **kwargs):
6060

6161
def grad_is_context(grad_output: torch.Tensor, context: torch.Tensor) -> torch.Tensor: # noqa
6262
return context
63+
64+
65+
class AuxiliaryLoss(torch.autograd.Function):
66+
@staticmethod
67+
def forward(ctx, input_: torch.Tensor, aux_loss: torch.Tensor, grad: float) -> torch.Tensor: # noqa
68+
ctx.grad = torch.full_like(aux_loss, grad)
69+
return input_
70+
71+
@staticmethod
72+
def backward(ctx, grad_output: torch.Tensor) -> tuple[torch.Tensor | None, ...]: # noqa
73+
return grad_output, ctx.grad, None

fast_llm/functional/config.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,16 +93,17 @@ def _set_activation_fn_map() -> None:
9393
MAX_DROPLESS_BLOCK_SIZE_ROW = 128
9494

9595

96-
class CrossEntropyImpl(str, enum.Enum):
96+
class EntropyLossImplementation(enum.StrEnum):
9797
auto = "auto"
9898
torch = "torch"
9999
fused = "fused"
100100
triton = "triton"
101101

102102

103-
class DistillationLossImpl(str, enum.Enum):
104-
reverse_kl = "reverse_kl"
103+
class EntropyLossType(enum.StrEnum):
105104
cross_entropy = "cross_entropy"
105+
forward_kl = "forward_kl"
106+
reverse_kl = "reverse_kl"
106107

107108

108109
class TargetFormat(enum.StrEnum):

0 commit comments

Comments (0)