Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/Instruction/Supported-models-and-datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,10 @@
|[Qwen/Qwen3-VL-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B)|qwen3_vl_emb|qwen3_vl_emb|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Embedding-8B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)|
|[Qwen/Qwen3-VL-Reranker-2B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-2B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-2B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B)|
|[Qwen/Qwen3-VL-Reranker-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-8B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-8B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-8B)|
|[Qwen/Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-27B](https://huggingface.co/Qwen/Qwen3.5-27B)|
|[Qwen/Qwen3.5-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B)|
|[Qwen/Qwen3.5-35B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B-Base)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B-Base](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Base)|
|[Qwen/Qwen3.5-122B-A10B](https://modelscope.cn/models/Qwen/Qwen3.5-122B-A10B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-122B-A10B](https://huggingface.co/Qwen/Qwen3.5-122B-A10B)|
|[Qwen/Qwen3.5-397B-A17B](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B)|
|[Qwen/Qwen3.5-397B-A17B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B-FP8)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8)|
|[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)|
Expand Down
4 changes: 4 additions & 0 deletions docs/source_en/Instruction/Supported-models-and-datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,10 @@ The table below introduces the models integrated with ms-swift:
|[Qwen/Qwen3-VL-Embedding-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Embedding-8B)|qwen3_vl_emb|qwen3_vl_emb|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Embedding-8B](https://huggingface.co/Qwen/Qwen3-VL-Embedding-8B)|
|[Qwen/Qwen3-VL-Reranker-2B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-2B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-2B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-2B)|
|[Qwen/Qwen3-VL-Reranker-8B](https://modelscope.cn/models/Qwen/Qwen3-VL-Reranker-8B)|qwen3_vl_reranker|qwen3_vl_reranker|transformers>=4.57, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3-VL-Reranker-8B](https://huggingface.co/Qwen/Qwen3-VL-Reranker-8B)|
|[Qwen/Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B)|qwen3_5|qwen3_5|transformers>=5.0.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-27B](https://huggingface.co/Qwen/Qwen3.5-27B)|
|[Qwen/Qwen3.5-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B](https://huggingface.co/Qwen/Qwen3.5-35B-A3B)|
|[Qwen/Qwen3.5-35B-A3B-Base](https://modelscope.cn/models/Qwen/Qwen3.5-35B-A3B-Base)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-35B-A3B-Base](https://huggingface.co/Qwen/Qwen3.5-35B-A3B-Base)|
|[Qwen/Qwen3.5-122B-A10B](https://modelscope.cn/models/Qwen/Qwen3.5-122B-A10B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-122B-A10B](https://huggingface.co/Qwen/Qwen3.5-122B-A10B)|
|[Qwen/Qwen3.5-397B-A17B](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B](https://huggingface.co/Qwen/Qwen3.5-397B-A17B)|
|[Qwen/Qwen3.5-397B-A17B-FP8](https://modelscope.cn/models/Qwen/Qwen3.5-397B-A17B-FP8)|qwen3_5_moe|qwen3_5|transformers>=5.2.0.dev, qwen_vl_utils>=0.0.14, decord|✔|vision, video|[Qwen/Qwen3.5-397B-A17B-FP8](https://huggingface.co/Qwen/Qwen3.5-397B-A17B-FP8)|
|[iic/gme-Qwen2-VL-2B-Instruct](https://modelscope.cn/models/iic/gme-Qwen2-VL-2B-Instruct)|qwen2_gme|qwen2_gme|-|✘|vision|[Alibaba-NLP/gme-Qwen2-VL-2B-Instruct](https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct)|
Expand Down
8 changes: 6 additions & 2 deletions swift/megatron/model/gpt_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1316,8 +1316,12 @@ def _set_layer_mlp(self, mg_layer, hf_state_dict, layer_idx: int, to_mcore: bool
to_mcore)
else:
hf_state_dict.update(self._set_mlp_state(mg_mlp, hf_state_dict, f'{hf_mlp_prefix}.', layer_idx, to_mcore))
self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict,
'post_attention_layernorm.weight', to_mcore)
if self.model_type == 'qwen3_5':
self._set_state_dict(mg_layer, 'pre_mlp_layernorm.weight', hf_state_dict,
'post_attention_layernorm.weight', to_mcore)
else:
self._set_state_dict(mg_layer, 'mlp.linear_fc1.layer_norm_weight', hf_state_dict,
'post_attention_layernorm.weight', to_mcore)
return hf_state_dict

def _set_layer_state(self, mg_layer, hf_state_dict, hf_prefix: str, layer_idx: int, to_mcore: bool):
Expand Down
9 changes: 5 additions & 4 deletions swift/megatron/model/gpts/qwen3_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@
from megatron.core.extensions.transformer_engine import TEColumnParallelLinear, _get_extra_te_kwargs
from megatron.core.inference.contexts import BaseInferenceContext
from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_mtp_block_spec
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.models.huggingface import HuggingFaceModule as _HuggingFaceModule
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.tensor_parallel import (gather_from_sequence_parallel_region,
reduce_scatter_to_sequence_parallel_region)
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.spec_utils import build_module
from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
from megatron.core.utils import deprecate_inference_params, is_fa_min_version
Expand Down Expand Up @@ -533,9 +532,11 @@ def get_transformer_layer_spec(self, vp_stage: Optional[int] = None):
layer_spec.submodules.self_attention.module = Qwen3NextSelfAttention
# Replace ALL layernorms with Qwen3NextRMSNorm (Zero-Centered)
layer_spec.submodules.input_layernorm = layer_norm_impl
if hasattr(layer_spec.submodules,
'pre_mlp_layernorm') and layer_spec.submodules.pre_mlp_layernorm is not IdentityOp:
if hasattr(layer_spec.submodules, 'pre_mlp_layernorm'):
layer_spec.submodules.pre_mlp_layernorm = layer_norm_impl
# qwen3.5 dense
if config.hf_model_type == 'qwen3_5':
layer_spec.submodules.mlp.submodules.linear_fc1 = TEColumnParallelLinear
# Replace qk_layernorm if present
if hasattr(layer_spec.submodules.self_attention.submodules, 'q_layernorm'):
layer_spec.submodules.self_attention.submodules.q_layernorm = layer_norm_impl
Expand Down
7 changes: 6 additions & 1 deletion swift/model/models/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,9 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrain
ModelMeta(
MLLMModelType.qwen3_5_moe, [
ModelGroup([
Model('Qwen/Qwen3.5-35B-A3B', 'Qwen/Qwen3.5-35B-A3B'),
Model('Qwen/Qwen3.5-35B-A3B-Base', 'Qwen/Qwen3.5-35B-A3B-Base'),
Model('Qwen/Qwen3.5-122B-A10B', 'Qwen/Qwen3.5-122B-A10B'),
Model('Qwen/Qwen3.5-397B-A17B', 'Qwen/Qwen3.5-397B-A17B'),
Model('Qwen/Qwen3.5-397B-A17B-FP8', 'Qwen/Qwen3.5-397B-A17B-FP8'),
], TemplateType.qwen3_5),
Expand All @@ -1140,7 +1143,9 @@ def get_model(self, model_dir: str, config, processor, model_kwargs) -> PreTrain
register_model(
ModelMeta(
MLLMModelType.qwen3_5, [
ModelGroup([], TemplateType.qwen3_5),
ModelGroup([
Model('Qwen/Qwen3.5-27B', 'Qwen/Qwen3.5-27B'),
], TemplateType.qwen3_5),
],
Qwen3_5Loader,
model_arch=ModelArch.qwen2_vl,
Expand Down
8 changes: 7 additions & 1 deletion tests/megatron/test_align/test_mllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ def test_llama4():
_test_model('LLM-Research/Llama-4-Scout-17B-16E-Instruct')


def test_qwen3_5():
    """Run the alignment check for the Qwen3.5 MoE checkpoint.

    Delegates to the module-level ``_test_model`` helper, which presumably
    compares Megatron and HF outputs for the given model id (verify against
    the helper's definition elsewhere in this file).
    """
    model_id = 'Qwen/Qwen3.5-35B-A3B'
    _test_model(model_id)
    # Dense variant intentionally left disabled:
    # _test_model('Qwen/Qwen3.5-27B')


if __name__ == '__main__':
# test_qwen2_5_vl()
# test_qwen2_vl()
Expand All @@ -114,10 +119,11 @@ def test_llama4():
# test_internvl3_5_hf()
# test_internvl3_5_moe_hf()
# test_glm4_5v()
test_glm4_6v_flash()
# test_glm4_6v_flash()
# test_ovis2_5()
# test_kimi_vl()
# test_qwen3_vl()
# test_qwen3_vl_moe()
# test_qwen3_omni()
# test_llama4()
test_qwen3_5()
Loading