1 parent 4cf7b1a commit 0b8b049
1 file changed
nemo_deploy/llm/inference/inference_base.py
@@ -229,6 +229,12 @@ def setup_megatron_model_and_tokenizer_for_inference(
     torch_distributed_init(dist_config)
     model_config, mlm_args = load_model_config(checkpoint_path)
 
+    # MLA models require cache_mla_latents=True for the dynamic inference backend.
+    # The checkpoint may have saved it as False (training default), but inference
+    # with the dynamic engine always needs it enabled.
+    if hasattr(model_config, "cache_mla_latents"):
+        model_config.cache_mla_latents = True
+
     # Convert attention_backend from string to enum if needed
     if hasattr(model_config, "attention_backend") and isinstance(model_config.attention_backend, str):
         if model_config.attention_backend == "AttnBackend.fused":
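
For context, the added guard behaves like the standalone sketch below. The cache_mla_latents attribute and the hasattr-based check come from the diff above; the force_mla_latent_cache helper name and the SimpleNamespace stand-ins for the object returned by load_model_config are hypothetical illustrations, not part of the codebase.

    from types import SimpleNamespace

    def force_mla_latent_cache(model_config):
        """Mirror the override added in this commit.

        MLA checkpoints may store cache_mla_latents=False (the training
        default), but the dynamic inference engine requires it to be True.
        Non-MLA configs lack the attribute and are left untouched.
        """
        if hasattr(model_config, "cache_mla_latents"):
            model_config.cache_mla_latents = True
        return model_config

    # Hypothetical configs standing in for load_model_config() results.
    mla_config = SimpleNamespace(cache_mla_latents=False)  # MLA checkpoint
    dense_config = SimpleNamespace()                       # non-MLA model

    force_mla_latent_cache(mla_config)
    assert mla_config.cache_mla_latents is True
    # A config without the attribute passes through unchanged.
    assert not hasattr(force_mla_latent_cache(dense_config), "cache_mla_latents")

The hasattr guard is what keeps the override safe for non-MLA checkpoints: models whose config never defines cache_mla_latents are skipped rather than having an unexpected attribute injected.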