 # under the Apache License Version 2.0, see <https://www.apache.org/licenses/>
 
 import math
-from typing import Callable, Literal, Optional, Sequence, Union
+from typing import Literal, Optional, Sequence, Union
 
 import torch
 import torch.nn as nn
@@ -281,7 +281,6 @@ def __init__(
         cond_dim: int,
         mlp_ratio: int = 1,
         activation: type[nn.Module] = nn.GELU,
-        gate_activation: Callable = lambda x: (x + 1.0),
     ):
         super().__init__()
 
@@ -302,8 +301,6 @@ def __init__(
             nn.Linear(hidden_features * mlp_ratio, hidden_features),
         )
 
-        self.gate_activation = gate_activation
-
     def forward(self, x: Tensor, cond: Tensor) -> Tensor:
         """
         Arguments:
@@ -315,7 +312,7 @@ def forward(self, x: Tensor, cond: Tensor) -> Tensor:
315312 """
316313
317314 shift_ , scale_ , gate_ = self .ada_ln (cond ).chunk (3 , dim = - 1 )
318- gate_ = self . gate_activation ( gate_ )
315+ gate_ = gate_ + 1.0 # Gate activation: `lambda x: x + 1`.
319316 y = (scale_ + 1 ) * x + shift_
320317 y = self .block (y )
321318 y = x + gate_ * y
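For reference, a minimal self-contained sketch of the adaptive-LayerNorm MLP block this hunk patches, with the gate activation hard-coded as `x + 1` exactly as above. The names (`AdaGateMLPSketch`, `ada_ln` as a single `nn.Linear`, `block`) are illustrative assumptions, not the layer layout of the patched file.

import torch
import torch.nn as nn
from torch import Tensor

class AdaGateMLPSketch(nn.Module):
    """Hypothetical sketch of an adaLN MLP block with a hard-coded `x + 1` gate."""

    def __init__(self, hidden_features: int, cond_dim: int, mlp_ratio: int = 1):
        super().__init__()
        # One projection of the conditioning into shift, scale and gate.
        self.ada_ln = nn.Linear(cond_dim, 3 * hidden_features)
        self.block = nn.Sequential(
            nn.Linear(hidden_features, hidden_features * mlp_ratio),
            nn.GELU(),
            nn.Linear(hidden_features * mlp_ratio, hidden_features),
        )

    def forward(self, x: Tensor, cond: Tensor) -> Tensor:
        shift_, scale_, gate_ = self.ada_ln(cond).chunk(3, dim=-1)
        gate_ = gate_ + 1.0  # Hard-coded gate activation, as in the patch.
        y = (scale_ + 1) * x + shift_
        y = self.block(y)
        return x + gate_ * y  # Gated residual connection.

# Usage: out = AdaGateMLPSketch(64, 32)(torch.randn(8, 64), torch.randn(8, 32))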
@@ -665,7 +662,6 @@ def __init__(
         num_heads: int,
         mlp_ratio: int = 2,
         activation: type[nn.Module] = nn.GELU,
-        gate_activation: Callable = lambda x: (x + 1.0),
     ):
         """Initialize dit transformer block.
 
@@ -675,7 +671,6 @@ def __init__(
             num_heads: number of attention heads
             mlp_ratio: ratio for mlp hidden dimension
             activation: activation function
-            gate_activation: activation function for the gate
         """
         super().__init__()
 
@@ -705,7 +700,6 @@ def __init__(
             activation(),
             nn.Linear(hidden_features * mlp_ratio, hidden_features),
         )
-        self.gate_activation = gate_activation
 
         # layer norms
         self.norm1 = nn.LayerNorm(hidden_features)
@@ -737,8 +731,8 @@ def forward(self, x: Tensor, cond: Tensor) -> Tensor:
         mlp_shift = mlp_shift.view(batch_size, 1, -1)
         mlp_gate = mlp_gate.view(batch_size, 1, -1)
 
-        attn_gate = self.gate_activation(attn_gate)
-        mlp_gate = self.gate_activation(mlp_gate)
+        attn_gate = attn_gate + 1.0  # Gate activation: `lambda x: x + 1`.
+        mlp_gate = mlp_gate + 1.0  # Gate activation: `lambda x: x + 1`.
 
         # attention with adaptive ln
         x_norm = self.norm1(x)
@@ -776,7 +770,6 @@ def __init__(
         num_heads: int,
         mlp_ratio: int = 4,
         activation: type[nn.Module] = nn.GELU,
-        gate_activation: Callable = lambda x: (x + 1.0),
     ):
         super().__init__()
 
@@ -817,7 +810,6 @@ def __init__(
         self.norm1 = nn.LayerNorm(hidden_features)
         self.norm2 = nn.LayerNorm(hidden_features)
         self.norm3 = nn.LayerNorm(hidden_features)
-        self.gate_activation = gate_activation
 
     def forward(
         self, x: Tensor, cross_attention_condition: Tensor, time_condition: Tensor
@@ -846,8 +838,8 @@ def forward(
         mlp_shift = mlp_shift.unsqueeze(1)
         mlp_gate = mlp_gate.unsqueeze(1)
 
-        attn_gate = self.gate_activation(attn_gate)
-        mlp_gate = self.gate_activation(mlp_gate)
+        attn_gate = attn_gate + 1.0  # Gate activation: `lambda x: x + 1`.
+        mlp_gate = mlp_gate + 1.0  # Gate activation: `lambda x: x + 1`.
 
         # self-attention with adaptive ln
         x_norm = self.norm1(x)
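The two transformer-block hunks above apply the same `x + 1` gate once per branch. Below is a minimal sketch of that per-branch gating pattern, assuming a single conditioning projection into six modulation tensors and a standard `nn.MultiheadAttention`; the class name `DiTGateSketch` and the exact layer layout are assumptions for illustration, not the patched file's API.

import torch
import torch.nn as nn
from torch import Tensor

class DiTGateSketch(nn.Module):
    """Hypothetical sketch of per-branch `x + 1` gating in an adaLN transformer block."""

    def __init__(self, hidden_features: int, cond_dim: int, num_heads: int = 4, mlp_ratio: int = 2):
        super().__init__()
        # Conditioning -> (shift, scale, gate) for the attention branch and the MLP branch.
        self.ada_ln = nn.Linear(cond_dim, 6 * hidden_features)
        self.attn = nn.MultiheadAttention(hidden_features, num_heads, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_features, hidden_features * mlp_ratio),
            nn.GELU(),
            nn.Linear(hidden_features * mlp_ratio, hidden_features),
        )
        self.norm1 = nn.LayerNorm(hidden_features)
        self.norm2 = nn.LayerNorm(hidden_features)

    def forward(self, x: Tensor, cond: Tensor) -> Tensor:
        batch_size = x.shape[0]
        mods = self.ada_ln(cond).chunk(6, dim=-1)
        # Broadcast the per-sample modulation over the sequence dimension.
        attn_shift, attn_scale, attn_gate, mlp_shift, mlp_scale, mlp_gate = (
            m.view(batch_size, 1, -1) for m in mods
        )
        attn_gate = attn_gate + 1.0  # Gate activation: `lambda x: x + 1`.
        mlp_gate = mlp_gate + 1.0  # Gate activation: `lambda x: x + 1`.

        # Attention branch with adaptive layer norm and gated residual.
        x_norm = (attn_scale + 1) * self.norm1(x) + attn_shift
        attn_out, _ = self.attn(x_norm, x_norm, x_norm, need_weights=False)
        x = x + attn_gate * attn_out

        # MLP branch, modulated and gated the same way.
        y = (mlp_scale + 1) * self.norm2(x) + mlp_shift
        return x + mlp_gate * self.mlp(y)

# Usage: out = DiTGateSketch(64, 32)(torch.randn(8, 16, 64), torch.randn(8, 32))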