 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

-from megatron.core.inference.inference_client import InferenceClient
-from examples.inference.gpt.utils import add_common_inference_args
 import asyncio
-import torch.distributed as dist
-from examples.inference.gpt.gpt_dynamic_inference import get_model, get_inference_context, get_inference_controller, add_dynamic_inference_args
-from megatron.core.inference.inference_request import DynamicInferenceRequest
-from megatron.training import initialize_megatron
-import torch
-import os
-from megatron.training import get_args, get_tokenizer
-from megatron.core.inference.sampling_params import SamplingParams
-from examples.inference.gpt.utils import build_requests, build_dynamic_engine_setup_prefix, Request
-from megatron.core.inference.engines import DynamicInferenceEngine
+import json
+import os
 import time
+import torch
+import torch.distributed as dist
+from collections import defaultdict
 from tqdm import tqdm
 from typing import List
-import json
-from megatron.training.arguments import parse_args
+import warnings
+import logging
+
+from examples.inference.gpt.gpt_dynamic_inference import (
+    add_dynamic_inference_args,
+    get_inference_context,
+    get_inference_controller,
+    get_model,
+)
+from examples.inference.gpt.utils import (
+    Request,
+    build_dynamic_engine_setup_prefix,
+    build_requests,
+    add_common_inference_args
+)
+
 from megatron.core import parallel_state
+from megatron.core.inference.engines import DynamicInferenceEngine
+from megatron.core.inference.inference_client import InferenceClient
+from megatron.core.inference.inference_request import DynamicInferenceRequestRecord
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.utils import get_mamba_inference_state_config_from_model

-import logging
+from megatron.training import get_args, get_tokenizer, initialize_megatron
+from megatron.training.arguments import parse_args
+
+# pylint: disable=line-too-long

 logging.basicConfig(level=logging.INFO, force=True)

@@ -38,81 +53,150 @@ async def main(
 )
     # once you call engine.start_listening_to_data_parallel_coordinator,
     # the engine will start accepting requests from the data parallel coordinator.
-    # and processing them in an asyncio coroutine.
-    await engine.start_listening_to_data_parallel_coordinator(
-        inference_coordinator_port=port, launch_inference_coordinator=True
+    # and processing them in an asyncio coroutine.
+
+    await engine.start_listening_to_data_parallel_coordinator(
+        inference_coordinator_port=port,
+        launch_inference_coordinator=True,
+        verbose=True,
     )
-    # if you want to use your own inference coordinator -
+
+    # if you want to use your own inference coordinator -
     # 1. set launch_inference_coordinator to False
     # 2. setup a router socket at tcp://MASTER_ADDR:PORT
     # 3. wait for data parallel groups to establish connection (BasicInferenceCoordinator.__init__)
     # 4. look at InferenceCoordinator.start() to see how we can route requests from users <-> data parallel groups
-    # based on headers.
-    # 5. look at InferenceClient to see how we create requests with headers.
-    if dist.get_rank() == 0:
-        client = InferenceClient(port)  # submits requests to the inference coordinator
+    # based on headers.
+    # 5. look at InferenceClient to see how we create requests with headers.
+
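The five steps above describe wiring up a custom coordinator in place of the built-in one. A minimal sketch of the ROUTER-socket piece (step 2), assuming pyzmq; the actual frame and header layout expected by InferenceCoordinator and InferenceClient lives in those classes and is not reproduced here:

# Sketch only: binds a ROUTER socket and echoes frames back to the sender.
# A real coordinator would forward user requests to a data-parallel group and
# route replies back, keyed on the identity/header frames.
import zmq

def run_minimal_router(master_addr: str, port: int) -> None:
    ctx = zmq.Context()
    sock = ctx.socket(zmq.ROUTER)
    sock.bind(f"tcp://{master_addr}:{port}")
    while True:
        identity, *frames = sock.recv_multipart()  # ROUTER prepends the sender identity
        sock.send_multipart([identity, *frames])   # route back to that peer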
+    args = get_args()
+
+    # Test suspend/resume intervals.
+    if args.suspend_resume_interval is not None:
+        # Since the client doesn't directly call engine.async_step here, we test
+        # the suspend-resume system ~4 times.
+        suspend_resume_interval = max(1, len(requests) // 4)
+        suspend_idxs = set(range(
+            suspend_resume_interval,
+            len(requests) + 1,
+            suspend_resume_interval,
+        ))
+        resume_idxs = set(
+            min(len(requests), i + suspend_resume_interval // 2)
+            for i in suspend_idxs
+        )
+    else:
+        suspend_idxs = set()
+        resume_idxs = set()
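For concreteness, a worked version of the interval arithmetic above, using a made-up count of 100 requests:

# Worked example only (100 requests is an assumed value, not from the script).
n = 100
interval = max(1, n // 4)                                        # 25
suspend_idxs = set(range(interval, n + 1, interval))             # {25, 50, 75, 100}
resume_idxs = {min(n, i + interval // 2) for i in suspend_idxs}  # {37, 62, 87, 100}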
+
+    # Create client and run example.
+    if dist.get_rank() == 0:
+        client = InferenceClient(port)  # submits requests to the inference coordinator
         await client.start()
         base_arrival_time = time.time_ns() / 10**9
         for request in requests:
             request.time_arrival = request.time_offset + base_arrival_time
         futures = []
         num_requests_total = len(requests)
         num_requests_added = 0
-        #tbar = tqdm(total=num_requests_total)
+
         while True:
             current_time = time.time_ns() / 10**9
-            # Only add requests that have arrived at the current time.
-            while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time:
-                request = requests[num_requests_added]
-                # These add-request calls will queue up the request on a zmq socket and return
-                # instantaneously. They will return an asyncio future which can be awaited for
-                # request completion.
-                futures.append(client.add_request(request.prompt_text, request.sampling_params))
-                num_requests_added += 1
-                #tbar.update(1)
+            if args.incoming_requests_per_step is None:
+                # Only add requests that have arrived at the current time.
+                while num_requests_added < num_requests_total and requests[num_requests_added].time_arrival <= current_time:
+                    request = requests[num_requests_added]
+                    # These add-request calls will queue up the request on a zmq socket and return
+                    # instantaneously. They will return an asyncio future which can be awaited for
+                    # request completion.
+                    futures.append(client.add_request(request.prompt_text, request.sampling_params))
+                    num_requests_added += 1
+
+                    # Test suspend/resume.
+                    if num_requests_added in suspend_idxs:
+                        client.suspend_engines()
+                    if num_requests_added in resume_idxs:
+                        client.resume_engines()
+
+            else:
+                # Add deterministic number of requests (generally used for debugging).
+                for i in range(min(
+                    args.incoming_requests_per_step,
+                    num_requests_total - num_requests_added
+                )):
+                    # Change sampling parameters to force different generation lengths.
+                    request = requests[num_requests_added]
+                    n = request.sampling_params.num_tokens_to_generate
+                    request.sampling_params.num_tokens_to_generate = n + i
+                    futures.append(client.add_request(request.prompt_text, request.sampling_params))
+                    num_requests_added += 1
+
+                    # Test suspend/resume.
+                    if num_requests_added in suspend_idxs:
+                        client.suspend_engines()
+                    if num_requests_added in resume_idxs:
+                        client.resume_engines()
+
             if num_requests_added == num_requests_total:
                 break
-            # Relinquish control since there are no more requests to add at the moment. This allows the engine to run.
+            # Relinquish control since there are no more requests to add at the moment. This allows the engine to run.
             await asyncio.sleep(0)
-        # While we wait for the requests to complete, the engine runs in the background.
-        results: List[DynamicInferenceRequest] = await asyncio.gather(*futures)

+        # While we wait for the requests to complete, the engine runs in the background.
+        results: List[DynamicInferenceRequestRecord] = await asyncio.gather(*futures)

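The add_request calls above return asyncio futures immediately, and awaiting them with asyncio.gather is what lets the engine coroutine make progress in the background. A self-contained sketch of that submit-now/await-later pattern, independent of the Megatron client (all names below are illustrative only):

# Sketch only: futures resolved by a background task, mirroring how
# client.add_request() returns a future that completes once the coordinator
# reports the finished request.
import asyncio

async def _demo() -> None:
    loop = asyncio.get_running_loop()
    futures = [loop.create_future() for _ in range(3)]

    async def fake_engine() -> None:
        for i, fut in enumerate(futures):
            await asyncio.sleep(0.01)          # pretend generation happens here
            fut.set_result(f"generated-{i}")   # completion resolves the future

    task = asyncio.create_task(fake_engine())
    await asyncio.sleep(0)                     # yield control so the task can run
    print(await asyncio.gather(*futures))      # blocks until every future resolves
    await task

asyncio.run(_demo())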
     if dist.get_rank() == 0:
         # Write results to JSON. Primarily used for functional testing.
         if args.output_path:
             json_results = {}
+            throughputs = []

-            for req in results:
+            for record in results:
+                req = record.merge(engine.controller.tokenizer)
                 result_dict = {
                     "input_prompt": req.prompt,
                     "generated_text": req.generated_text.replace("\n", "\\n"),
                     "generated_tokens": req.generated_tokens,
-                    "latency": req.latency,  # InferenceClient populates this field in the returned future.
+                    "latency": req.latency,  # InferenceClient populates this field in the returned future.
                 }
                 if req.sampling_params["return_log_probs"]:
                     result_dict["logprobs"] = req.prompt_log_probs + req.generated_log_probs
+                throughput = len(req.generated_tokens) / req.latency
+                throughputs.append(throughput)
                 json_results[req.request_id] = result_dict
+            throughput_dict = {"throughput": throughputs}
+            if args.throughput_check_only:
+                json_results = throughput_dict
             with open(args.output_path, "w") as fp:
                 json.dump(json_results, fp, indent=4)
         else:
             print("Results:")
-            for req in results:
-                print(f"rid: {req.request_id}\nprompt: {req.prompt!r}\noutput: {req.generated_text!r}\n\n")
-
+            unique_prompt_map = defaultdict(list)
+            for record in results:
+                req = record.merge(engine.controller.tokenizer)
+                unique_prompt_map[req.prompt].append(req)
+            for idx, (prompt_text, reqs) in enumerate(unique_prompt_map.items()):
+                print(f"%d/%d. prompt '%s' ... [%d] output '%s'." % (
+                    idx,
+                    len(unique_prompt_map),
+                    prompt_text.replace("\n", "\\n"),
+                    len(reqs),
+                    reqs[0].generated_text.replace("\n", "\\n"),
+                ))
+
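For reference, the file written by the output-path branch above has roughly the following shape; every value below is invented for illustration, and a "logprobs" key appears only when return_log_probs is set.

# Illustrative only: approximate shape of the JSON written when args.output_path is set.
example_output = {
    "0": {                                    # keyed by request id
        "input_prompt": "Hello, my name is",
        "generated_text": "Hello, my name is ...",
        "generated_tokens": [318, 257, 1332],
        "latency": 0.42,
    },
}
# With args.throughput_check_only, the file instead holds a single entry such as
# {"throughput": [210.5, 198.7]} (generated tokens per second for each request).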
         # kill the engines and suspend the client
         client.stop_engines()
         client.stop()
-
+
     # once the stop signal eventually makes its way to each GPU, the engines will stop.
     await asyncio.gather(engine.engine_loop_task)

+
 if __name__ == "__main__":
-    # enable inference mode in the very beginning as some fp-8 optimizations
+    # enable inference mode in the very beginning as some fp-8 optimizations
     # check for it.
     with torch.inference_mode():
         initialize_megatron(
-            #parsed_args=args
             extra_args_provider=add_dynamic_inference_args,
             args_defaults={'no_load_rng': True, 'no_load_optim': True},
         )
@@ -131,17 +215,25 @@ async def main(
             top_p=args.top_p,
             return_log_probs=args.return_log_probs,
             num_tokens_to_generate=args.num_tokens_to_generate,
-            termination_id=args.termination_id if args.termination_id is not None else tokenizer.eod,
+            termination_id=(
+                args.termination_id if args.termination_id is not None else tokenizer.eod
+            ),
         )

         # Requests, context, controller.
         model = get_model()
-        requests = build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None
+        mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)
+        requests = (
+            build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None
+        )
+
+        context = get_inference_context(
+            None,
+            None,
+            calculate_max_sequence_length_from_requests=False,
+            mamba_inference_state_config=mamba_inference_state_config,
+        )

-        context = get_inference_context(None,
-            None,
-            calculate_max_sequence_length_from_requests=False)
-
         controller = get_inference_controller(model, context)

         # Inference engine.
@@ -150,17 +242,19 @@ async def main(
             context,
             enable_cuda_graph=args.cuda_graph_impl == "local",
             random_seed=args.seed,
-            enable_chunked_prefill=not args.disable_chunked_prefill
+            enable_chunked_prefill=not args.disable_chunked_prefill,
         )

-
         if dist.get_rank() == 0:
             setup_prefix = build_dynamic_engine_setup_prefix(args, model, context, requests)
             print("~~~")
             print(setup_prefix)
             print("~~~")
-
-        asyncio.run(main(engine,
-            requests,
-            args.inference_coordinator_port))

+        asyncio.run(
+            main(
+                engine,
+                requests,
+                args.inference_coordinator_port,
+            )
+        )