Commit 4185aac

persist cudagraphs
1 parent c8903d9 commit 4185aac

3 files changed: +7 -3 lines changed

megatron/core/inference/contexts/dynamic_context.py

Lines changed: 2 additions & 0 deletions
@@ -270,6 +270,7 @@ def __init__(
         cuda_graph_mixed_prefill_count: Optional[int] = 16,
         metrics_writer: Optional['WandbModule'] = None,
         num_request_metadata: Optional[int] = None,
+        persist_cuda_graphs: Optional[bool] = False,
     ):
         super().__init__(materialize_only_last_token_logits=materialize_only_last_token_logits)

@@ -360,6 +361,7 @@ def __init__(

         # Unified memory.
         self.unified_memory_level = unified_memory_level
+        self.persist_cuda_graphs = persist_cuda_graphs
         if unified_memory_level > 0:
             try:
                 self.unified_memory_mempool = create_unified_mempool()
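
The context change is pure plumbing: the new keyword argument defaults to False and is stored alongside the UVM level so the engine can consult both. A minimal runnable sketch of that default-and-store pattern, using a toy stand-in rather than the real context class (whose constructor takes many more arguments):

    from typing import Optional

    class ToyContext:
        # Toy stand-in for the inference context; the real class takes many
        # more constructor arguments than shown here.
        def __init__(
            self,
            unified_memory_level: int = 0,
            persist_cuda_graphs: Optional[bool] = False,
        ):
            # Mirrors the commit: both knobs live on the context so the engine
            # can read them when deciding whether CUDA graphs survive suspend.
            self.unified_memory_level = unified_memory_level
            self.persist_cuda_graphs = persist_cuda_graphs

    ctx = ToyContext(persist_cuda_graphs=True)
    assert ctx.persist_cuda_graphs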

megatron/core/inference/engines/dynamic_engine.py

Lines changed: 4 additions & 3 deletions
@@ -167,6 +167,7 @@ def __init__(
         self.enable_chunked_prefill = enable_chunked_prefill
         self.inference_logging_step_interval = inference_logging_step_interval
         self.unified_memory_level = context.unified_memory_level
+        self.persist_cuda_graphs = context.persist_cuda_graphs

         if enable_cuda_graph is not None:
             self.cuda_graph_impl = "local" if enable_cuda_graph else "none"
@@ -552,9 +553,9 @@ def suspend(self):
         self.context.deallocate_all_tensors()

         # Delete cuda graphs when not using unified memory at all (level 0) and
-        # `rl-persist-cuda-graphs` is not passed. For UVM levels 1 and 2, the context's tensors
+        # `--rl-persist-cuda-graphs` is not passed. For UVM levels 1 and 2, the context's tensors
         # maintain static memory addresses, so the cuda graphs are re-used.
-        if self.unified_memory_level == 0 and not args.rl_persist_cuda_graphs:
+        if self.unified_memory_level == 0 and not self.persist_cuda_graphs:
             delete_cuda_graphs()

         # Maintain references to requests before reset.
@@ -596,7 +597,7 @@ def resume(self):
         # 0). For levels 1 and 2, the context's tensors maintain static
         # memory addresses, so the cuda graphs are re-used.
         capture_time = time.time()
-        if self.unified_memory_level == 0:
+        if self.unified_memory_level == 0 and not self.persist_cuda_graphs:
             self.create_cuda_graphs()
         capture_time = time.time() - capture_time
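
Two things change in the engine: suspend() now reads the flag from engine state (self.persist_cuda_graphs, copied off the context in __init__) rather than from an args object, and resume() now skips recapture when graphs were persisted, where it previously recaptured unconditionally at UVM level 0. A minimal sketch of the resulting gating predicate; the helper name and asserts are illustrative, not part of the diff:

    def should_drop_cuda_graphs(unified_memory_level: int,
                                persist_cuda_graphs: bool) -> bool:
        # Graphs are deleted on suspend (and recaptured on resume) only when
        # UVM is off (level 0) and persistence was not requested; at UVM
        # levels 1 and 2, tensor addresses stay static and graphs are reused.
        return unified_memory_level == 0 and not persist_cuda_graphs

    assert should_drop_cuda_graphs(0, False)      # old behavior: recapture each cycle
    assert not should_drop_cuda_graphs(0, True)   # new flag: graphs persist
    assert not should_drop_cuda_graphs(1, False)  # UVM already keeps graphs valid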

megatron/rl/inference/megatron.py

Lines changed: 1 addition & 0 deletions
@@ -136,6 +136,7 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen
         use_flashinfer_fused_rope=None,
         unified_memory_level=args.inference_dynamic_batching_unified_memory_level,
         metrics_writer=metrics_writer,
+        persist_cuda_graphs=args.rl_persist_cuda_graphs
     )

     inference_wrapped_model = GPTInferenceWrapper(model, args, inference_context)
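
Finally, the RL entry point forwards the CLI flag into the context at construction time. The flag definition itself is not part of this commit; a hypothetical argparse sketch of how --rl-persist-cuda-graphs could map onto args.rl_persist_cuda_graphs:

    import argparse

    # Hypothetical definition; the real flag is registered in Megatron's
    # argument setup, which this diff does not touch.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--rl-persist-cuda-graphs',
        action='store_true',
        help='Keep captured CUDA graphs alive across engine suspend/resume '
             'instead of recapturing them.',
    )

    args = parser.parse_args(['--rl-persist-cuda-graphs'])
    assert args.rl_persist_cuda_graphs  # argparse maps dashes to underscores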
