Skip to content

Commit b37d6d1

Browse files
committed
add patches for Funnel
1 parent b6ac5b3 commit b37d6d1

File tree

5 files changed

+148
-25
lines changed

5 files changed

+148
-25
lines changed

_unittests/ut_torch_export_patches/test_patch_transformers.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
from onnx_diagnostic.torch_models.hghub.hub_api import get_cached_configuration
1919
from onnx_diagnostic.torch_export_patches import torch_export_patches
2020
from onnx_diagnostic.torch_export_patches.patch_inputs import use_dyn_not_str
21-
from onnx_diagnostic.torch_export_patches.patches.patch_transformers import patch_qwen2_5
21+
from onnx_diagnostic.torch_export_patches.patches.patch_transformers import (
22+
patch_qwen2_5,
23+
patch_funnel,
24+
)
2225
from onnx_diagnostic.export.api import to_onnx
2326

2427

@@ -787,6 +790,42 @@ def test_plug_multi_head_attention_qwen25_loopa24_float32(self):
787790
self.assertEqualArray(results.eager_outputs[0], results.onnx_outputs[0], atol=1e-5)
788791
self.assertLess(results.diffs[0]["abs"], 1e-5)
789792

793+
@unittest.skipIf(not patch_funnel, "Funnel not part of this transformers")
794+
def test_model_funnel(self):
795+
from onnx_diagnostic.torch_export_patches.patches.patch_transformers import (
796+
patched_FunnelAttentionStructure,
797+
patched_FunnelRelMultiheadAttention,
798+
)
799+
800+
pos = torch.tensor([0, 4, 5, 8], dtype=torch.long)
801+
stride = 2
802+
config = transformers.models.funnel.modeling_funnel.FunnelConfig()
803+
original = transformers.models.funnel.modeling_funnel.FunnelAttentionStructure(config)
804+
patched = patched_FunnelAttentionStructure()
805+
self.assertEqualArray(
806+
original.relative_pos(pos, stride=stride), patched.relative_pos(pos, stride=stride)
807+
)
808+
809+
rmha = transformers.models.funnel.modeling_funnel.FunnelRelMultiheadAttention(
810+
config, 2
811+
)
812+
patched = patched_FunnelRelMultiheadAttention()
813+
patched.config = config
814+
for att in ["block_index", "r_r_bias", "scale", "r_kernel"]:
815+
setattr(patched, att, getattr(rmha, att))
816+
inputs = dict(
817+
position_embeds=[
818+
[torch.rand((24, 768)), None],
819+
[torch.rand((12, 768)), torch.rand((24, 768))],
820+
[torch.rand((6, 768)), torch.rand((12, 768))],
821+
],
822+
q_head=torch.rand((2, 12, 12, 64)),
823+
context_len=12,
824+
)
825+
expected = rmha.relative_positional_attention(**inputs)
826+
got = patched.relative_positional_attention(**inputs)
827+
self.assertEqualArray(expected, got)
828+
790829

791830
if __name__ == "__main__":
792831
unittest.main(verbosity=2)

onnx_diagnostic/tasks/text_generation.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@ def reduce_model_config(config: Any) -> Dict[str, Any]:
4040
state_size=8 if config is None else getattr(config, "state_size", None),
4141
conv_kernel=4 if config is None else getattr(config, "conv_kernel", None),
4242
)
43+
elif config.__class__.__name__ == "FunnelConfig":
44+
# does not support num_hidden_layers
45+
kwargs = dict()
4346
else:
4447
kwargs = dict(
4548
head_dim=getattr(
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import torch
2+
3+
try:
4+
import transformers.models.funnel.modeling_funnel
5+
6+
patch_funnel = True
7+
except ImportError:
8+
patch_funnel = False
9+
10+
if patch_funnel:
11+
from transformers.models.funnel.modeling_funnel import _relative_shift_gather
12+
13+
class patched_FunnelAttentionStructure(torch.nn.Module):
14+
_PATCHES_ = ["relative_pos"]
15+
_PATCHED_CLASS_ = transformers.models.funnel.modeling_funnel.FunnelAttentionStructure
16+
17+
def relative_pos(
18+
self, pos: torch.Tensor, stride: int, pooled_pos=None, shift: int = 1
19+
) -> torch.Tensor:
20+
if pooled_pos is None:
21+
pooled_pos = pos
22+
ref_point = pooled_pos[0] - pos[0]
23+
# PATCHED
24+
num_remove = shift * pooled_pos.shape[0]
25+
max_dist = ref_point + num_remove * stride
26+
min_dist = pooled_pos[0] - pos[-1]
27+
return torch.arange(
28+
max_dist.to(torch.long),
29+
(min_dist - 1).to(torch.long),
30+
torch.tensor(-stride, dtype=torch.long),
31+
dtype=torch.long,
32+
device=pos.device,
33+
)
34+
35+
class patched_FunnelRelMultiheadAttention(torch.nn.Module):
36+
_PATCHES_ = ["relative_positional_attention"]
37+
_PATCHED_CLASS_ = (
38+
transformers.models.funnel.modeling_funnel.FunnelRelMultiheadAttention
39+
)
40+
41+
def relative_positional_attention(
42+
self, position_embeds, q_head, context_len, cls_mask=None
43+
):
44+
"""Relative attention score for the positional encodings"""
45+
# q_head has shape batch_size x sea_len x n_head x d_head
46+
if self.config.attention_type == "factorized":
47+
phi, pi, psi, omega = position_embeds
48+
# Shape n_head x d_head
49+
u = self.r_r_bias * self.scale
50+
# Shape d_model x n_head x d_head
51+
w_r = self.r_kernel
52+
53+
# Shape batch_size x sea_len x n_head x d_model
54+
q_r_attention = torch.einsum("binh,dnh->bind", q_head + u, w_r)
55+
q_r_attention_1 = q_r_attention * phi[:, None]
56+
q_r_attention_2 = q_r_attention * pi[:, None]
57+
58+
# Shape batch_size x n_head x seq_len x context_len
59+
positional_attn = torch.einsum(
60+
"bind,jd->bnij", q_r_attention_1, psi
61+
) + torch.einsum("bind,jd->bnij", q_r_attention_2, omega)
62+
else:
63+
shift = 2 if q_head.shape[1] != context_len else 1
64+
r = position_embeds[self.block_index][shift - 1]
65+
# Shape n_head x d_head
66+
v = self.r_r_bias * self.scale
67+
# Shape d_model x n_head x d_head
68+
w_r = self.r_kernel
69+
70+
# Shape max_rel_len x n_head x d_model
71+
r_head = torch.einsum("td,dnh->tnh", r, w_r)
72+
# Shape batch_size x n_head x seq_len x max_rel_len
73+
positional_attn = torch.einsum("binh,tnh->bnit", q_head + v, r_head)
74+
# Shape batch_size x n_head x seq_len x context_len
75+
positional_attn = _relative_shift_gather(positional_attn, context_len, shift)
76+
77+
if cls_mask is not None:
78+
# PATCHED
79+
positional_attn = positional_attn * cls_mask
80+
return positional_attn

onnx_diagnostic/torch_export_patches/patches/patch_transformers.py

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,37 @@
11
# transformers
22
from typing import List
33
from .patch_helper import _has_transformers
4-
54
from ._patch_transformers_attention import (
65
patched_sdpa_attention_forward,
76
patched_model_bart_eager_attention_forward,
87
patched_modeling_marian_eager_attention_forward,
98
)
9+
from ._patch_transformers_generation_mixin import patched_GenerationMixin
10+
from ._patch_transformers_causal_mask import patched_AttentionMaskConverter
11+
from ._patch_transformers_rotary_embedding import (
12+
patched__compute_dynamic_ntk_parameters,
13+
patched_dynamic_rope_update,
14+
patched_GemmaRotaryEmbedding,
15+
patched_LlamaRotaryEmbedding,
16+
patched_MistralRotaryEmbedding,
17+
patched_MixtralRotaryEmbedding,
18+
patched_PhiRotaryEmbedding,
19+
)
20+
from ._patch_transformers_idefics import patched_IdeficsEmbedding, patched_IdeficsAttention
21+
from ._patch_transformers_sam_mask_decoder import patched_SamMaskDecoder
22+
23+
# transformers dependant patches
1024

1125
from ._patch_transformers_cache_utils import patch_parse_processor_args
1226

1327
if patch_parse_processor_args:
1428
from ._patch_transformers_cache_utils import patched_parse_processor_args
15-
16-
from ._patch_transformers_causal_mask import patched_AttentionMaskConverter
17-
1829
from ._patch_transformers_dynamic_cache import patch_DynamicLayer, patch_DynamicCache
1930

2031
if patch_DynamicLayer:
2132
from ._patch_transformers_dynamic_cache import patched_DynamicLayer
2233
if patch_DynamicCache:
2334
from ._patch_transformers_dynamic_cache import patched_DynamicCache
24-
25-
from ._patch_transformers_generation_mixin import patched_GenerationMixin
26-
2735
from ._patch_transformers_masking_utils import patch_masking_utils
2836

2937
if patch_masking_utils:
@@ -33,15 +41,7 @@
3341
patched_sdpa_mask_recent_torch,
3442
)
3543

36-
from ._patch_transformers_rotary_embedding import (
37-
patched__compute_dynamic_ntk_parameters,
38-
patched_dynamic_rope_update,
39-
patched_GemmaRotaryEmbedding,
40-
patched_LlamaRotaryEmbedding,
41-
patched_MistralRotaryEmbedding,
42-
patched_MixtralRotaryEmbedding,
43-
patched_PhiRotaryEmbedding,
44-
)
44+
# transformers models dependant patches
4545

4646
if _has_transformers("4.51"):
4747
from ._patch_transformers_rotary_embedding import patched_Phi3RotaryEmbedding
@@ -54,16 +54,11 @@
5454
if _has_transformers("4.53"):
5555
from ._patch_transformers_rotary_embedding import patched_SmolLM3RotaryEmbedding
5656

57-
# Models
58-
5957
from ._patch_transformers_gemma3 import patch_gemma3
6058

6159
if patch_gemma3:
6260
from ._patch_transformers_gemma3 import patched_Gemma3Model
6361

64-
from ._patch_transformers_idefics import patched_IdeficsEmbedding, patched_IdeficsAttention
65-
66-
6762
from ._patch_transformers_qwen2 import patch_qwen2
6863

6964
if patch_qwen2:
@@ -80,14 +75,17 @@
8075
patched_Qwen2_5_VLModel,
8176
PLUGS as PLUGS_Qwen25,
8277
)
83-
8478
from ._patch_transformers_qwen3 import patch_qwen3
8579

8680
if patch_qwen3:
8781
from ._patch_transformers_qwen3 import patched_Qwen3MoeSparseMoeBlock
82+
from ._patch_transformers_funnel import patch_funnel
8883

89-
90-
from ._patch_transformers_sam_mask_decoder import patched_SamMaskDecoder
84+
if patch_funnel:
85+
from ._patch_transformers_funnel import (
86+
patched_FunnelAttentionStructure,
87+
patched_FunnelRelMultiheadAttention,
88+
)
9189

9290

9391
def get_transformers_plugs() -> List["EagerDirectReplacementWithOnnx"]: # noqa: F821

onnx_diagnostic/torch_models/hghub/hub_data.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
ConvNextV2Model,image-feature-extraction
3333
CosmosTransformer3DModel,image-to-video
3434
CvtModel,feature-extraction
35+
ClvpModelForConditionalGeneration,audio-feature-extraction
3536
DPTModel,image-feature-extraction
3637
Data2VecAudioModel,feature-extraction
3738
Data2VecTextModel,feature-extraction
@@ -49,6 +50,7 @@
4950
ElectraModel,feature-extraction
5051
EsmModel,feature-extraction
5152
FalconMambaForCausalLM,text-generation
53+
FunnelBaseModel,feature-extraction
5254
GLPNModel,image-feature-extraction
5355
GPT2LMHeadModel,text-generation
5456
GPTBigCodeModel,feature-extraction
@@ -63,6 +65,7 @@
6365
Glm4vMoeForConditionalGeneration,image-text-to-text
6466
GraniteForCausalLM,text-generation
6567
GroupViTModel,feature-extraction
68+
HeliumForCausalLM,text-generation
6669
HieraForImageClassification,image-classification
6770
HubertModel,feature-extraction
6871
IBertModel,feature-extraction

0 commit comments

Comments (0)