Skip to content

Commit 0b8b049

Browse files
Fix moonlight deployment issue
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
1 parent 4cf7b1a commit 0b8b049

1 file changed

Lines changed: 6 additions & 0 deletions

File tree

nemo_deploy/llm/inference/inference_base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,12 @@ def setup_megatron_model_and_tokenizer_for_inference(
     torch_distributed_init(dist_config)
     model_config, mlm_args = load_model_config(checkpoint_path)

+    # MLA models require cache_mla_latents=True for the dynamic inference backend.
+    # The checkpoint may have saved it as False (training default), but inference
+    # with the dynamic engine always needs it enabled.
+    if hasattr(model_config, "cache_mla_latents"):
+        model_config.cache_mla_latents = True
+
     # Convert attention_backend from string to enum if needed
     if hasattr(model_config, "attention_backend") and isinstance(model_config.attention_backend, str):
         if model_config.attention_backend == "AttnBackend.fused":

0 commit comments

Comments (0)