Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/Instruction/Supported-models-and-datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,10 @@
|[Qwen/Qwen3-VL-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B)|qwen3_vl_emb|qwen3_vl_emb|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Embedding-8B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)|
|[Qwen/Qwen3-VL-Reranker-2B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-2B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-2B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B)|
|[Qwen/Qwen3-VL-Reranker-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-8B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-8B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-8B)|
|[Qwen/Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-27B](https://huggingface.co/Qwen/Qwen3.5-27B)|
|[Qwen/Qwen3.5-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B)|
|[Qwen/Qwen3.5-35B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B-Base)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B-Base](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Base)|
|[Qwen/Qwen3.5-122B-A10B](https://modelscope.cn/models/Qwen/Qwen3.5-122B-A10B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-122B-A10B](https://huggingface.co/Qwen/Qwen3.5-122B-A10B)|
|[Qwen/Qwen3.5-397B-A17B](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B)|
|[Qwen/Qwen3.5-397B-A17B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B-FP8)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8)|
|[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)|
Expand Down
4 changes: 4 additions & 0 deletions docs/source_en/Instruction/Supported-models-and-datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,10 @@ The table below introduces the models integrated with ms-swift:
|[Qwen/Qwen3-VL-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B)|qwen3_vl_emb|qwen3_vl_emb|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Embedding-8B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)|
|[Qwen/Qwen3-VL-Reranker-2B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-2B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-2B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B)|
|[Qwen/Qwen3-VL-Reranker-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-8B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-8B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-8B)|
|[Qwen/Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-27B](https://huggingface.co/Qwen/Qwen3.5-27B)|
|[Qwen/Qwen3.5-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B)|
|[Qwen/Qwen3.5-35B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B-Base)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B-Base](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Base)|
|[Qwen/Qwen3.5-122B-A10B](https://modelscope.cn/models/Qwen/Qwen3.5-122B-A10B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-122B-A10B](https://huggingface.co/Qwen/Qwen3.5-122B-A10B)|
|[Qwen/Qwen3.5-397B-A17B](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B)|
|[Qwen/Qwen3.5-397B-A17B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B-FP8)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8)|
|[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)|
Expand Down
8 changes: 6 additions & 2 deletions swift/megatron/model/gpt_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1316,8 +1316,12 @@ def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: bool
to_mcore)
else:
hf_state_dict.update(self._set_mlp_state(mg_mlp, hf_state_dict, f'{hf_mlp_prefix}.', layer_idx, to_mcore))
self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict,
'post_attention_layernorm.weight', to_mcore)
if self.model_type == 'qwen3_5':
self._set_state_dict(mg_layer, 'pre_mlp_layernorm.weight', hf_state_dict,
'post_attention_layernorm.weight', to_mcore)
else:
self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict,
'post_attention_layernorm.weight', to_mcore)
return hf_state_dict

def _set_layer_state(self, mg_layer, hf_state_dict, hf_prefix: str, layer_idx: int, to_mcore: bool):
Expand Down
9 changes: 5 additions & 4 deletions swift/megatron/model/gpts/qwen3_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@
from megatron.core.extensions.transformer_engine import TEColumnParallelLinear, _get_extra_te_kwargs
from megatron.core.inference.contexts import BaseInferenceContext
from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_mtp_block_spec
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.models.huggingface import HuggingFaceModule as _HuggingFaceModule
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
reduce_scatter_to_sequence_parallel_region)
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
from megatron.core.utils import deprecate_inference_params, is_fa_min_version
Expand Down Expand Up @@ -533,9 +532,11 @@ def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
layer_spec.submodules.self_attention.module = Qwen3NextSelfAttention
# Replace ALL layernorms with Qwen3NextRMSNorm (Zero-Centered)
layer_spec.submodules.input_layernorm = layer_norm_impl
if hasattr(layer_spec.submodules,
'pre_mlp_layernorm') and layer_spec.submodules.pre_mlp_layernorm is not IdentityOp:
if hasattr(layer_spec.submodules, 'pre_mlp_layernorm'):
layer_spec.submodules.pre_mlp_layernorm = layer_norm_impl
# qwen3.5 dense
if config.hf_model_type == 'qwen3_5':
layer_spec.submodules.mlp.submodules.linear_fc1 = TEColumnParallelLinear
# Replace qk_layernorm if present
if hasattr(layer_spec.submodules.self_attention.submodules, 'q_layernorm'):
layer_spec.submodules.self_attention.submodules.q_layernorm = layer_norm_impl
Expand Down
7 changes: 6 additions & 1 deletion swift/model/models/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,9 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrain
ModelMeta(
MLLMModelType.qwen3_5_moe, [
ModelGroup([
Model('Qwen/Qwen3.5-35B-A3B', 'Qwen/Qwen3.5-35B-A3B'),
Model('Qwen/Qwen3.5-35B-A3B-Base', 'Qwen/Qwen3.5-35B-A3B-Base'),
Model('Qwen/Qwen3.5-122B-A10B', 'Qwen/Qwen3.5-122B-A10B'),
Model('Qwen/Qwen3.5-397B-A17B', 'Qwen/Qwen3.5-397B-A17B'),
Model('Qwen/Qwen3.5-397B-A17B-FP8', 'Qwen/Qwen3.5-397B-A17B-FP8'),
], TemplateType.qwen3_5),
Expand All @@ -1140,7 +1143,9 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrain
register_model(
ModelMeta(
MLLMModelType.qwen3_5, [
ModelGroup([], TemplateType.qwen3_5),
ModelGroup([
Model('Qwen/Qwen3.5-27B', 'Qwen/Qwen3.5-27B'),
], TemplateType.qwen3_5),
],
Qwen3_5Loader,
model_arch=ModelArch.qwen2_vl,
Expand Down
8 changes: 7 additions & 1 deletion tests/megatron/test_align/test_mllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ def test_llama4():
_test_model('LLM-Research/Llama-4-Scout-17B-16E-Instruct')


def test_qwen3_5():
    """Run the alignment check for the Qwen3.5 MoE checkpoint.

    Delegates to the module-level ``_test_model`` helper, which presumably
    compares Megatron and HF outputs for the given model id (verify against
    the helper's definition elsewhere in this file).
    """
    model_id = 'Qwen/Qwen3.5-35B-A3B'
    _test_model(model_id)
    # Dense variant intentionally left disabled:
    # _test_model('Qwen/Qwen3.5-27B')


if __name__ == '__main__':
# test_qwen2_5_vl()
# test_qwen2_vl()
Expand All @@ -114,10 +119,11 @@ def test_llama4():
# test_internvl3_5_hf()
# test_internvl3_5_moe_hf()
# test_glm4_5v()
test_glm4_6v_flash()
# test_glm4_6v_flash()
# test_ovis2_5()
# test_kimi_vl()
# test_qwen3_vl()
# test_qwen3_vl_moe()
# test_qwen3_omni()
# test_llama4()
test_qwen3_5()
Loading