microsoft
diff --git a/‎llumnix_slurm_job.sh‎
Lines changed: 9 additions & 1 deletion b/‎llumnix_slurm_job.sh‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎run_tests.py‎
Lines changed: 197 additions & 56 deletions b/‎run_tests.py‎
Lines changed: 197 additions & 56 deletions
@@ -17,4 +17,12 @@ source .venv/bin/activate
 
 echo "HOST=$(hostname) JOBID=$SLURM_JOB_ID TASK=$SLURM_ARRAY_TASK_ID" >&2
 
-python3 run_tests.py --index "$SLURM_ARRAY_TASK_ID"
+# Run Llumnix plots and Llumnix vs LOR comparisons in parallel for this task index.
+# Plots stay Llumnix-only; compare runs paired Llumnix/LOR metrics (no plots).
+# NOTE(review): both modes write under simulator_output/<test name>; if a plots
+# test and a compare test share a name, the concurrent runs may race when
+# detecting their new run directory -- confirm the names are distinct.
+python3 run_tests.py --mode plots --index "$SLURM_ARRAY_TASK_ID" &
+PLOTS_PID=$!
+
+python3 run_tests.py --mode compare --index "$SLURM_ARRAY_TASK_ID" &
+COMPARE_PID=$!
+
+# `wait PID1 PID2` only returns the exit status of the LAST pid, so a failed
+# plots run would be reported as success. Wait on each job separately and
+# propagate any non-zero status so Slurm marks the array task as failed.
+STATUS=0
+wait "$PLOTS_PID" || STATUS=$?
+wait "$COMPARE_PID" || STATUS=$?
+exit "$STATUS"
@@ -1,47 +1,29 @@
 """
-Run all Llumnix latency scenarios, generate plots, and log results to Weights & Biases.
-
-For each scenario in vidur.metrics.latency_config.LATENCY_TESTS:
- 1) Execute the simulator with a scenario-specific output root.
- 2) Run latency_analysis to produce plots under <run_dir>/plots.
- 3) Log summaries + plots to wandb under a test-name namespace.
-
-Environment:
-  - Set WANDB_PROJECT / WANDB_ENTITY / WANDB_MODE as needed for logging.
-  - Metrics tracing must be enabled (already set in the base command).
-
-Look at the PRIORITY_DISTRIBUTION.md file and make plots (you need to change the variable 
---synthetic_request_generator_config_priority_distribution_type in the config 
-files used in the different tests (latency_config.py)) to compare the performance over different priority distributions.
-In addition, we can now change the amount of priority levels used in the tests (for example from 2 to 5).
-This can be changed in the config files used in the different tests by changing the variable
---synthetic_request_generator_config_num_priority_levels. Please keep the same structure in the config files
-(latency_config.py) as before, where we have several tests with a specific name, hardness, etc etc. Keep having descriptions
-for each test that explain what is being tested.
-
-Also make the plot compare the different TTFT and TBT of the different priority levels for each distribution.
+Run Llumnix/Llumlet plots or Llumnix vs LOR+vLLM metric comparisons.
+
+Two modes:
+  - plots: generate latency plots for Llumnix+Llumlet only (existing behavior).
+  - compare: run matched scenarios for Llumnix+Llumlet and LOR+vLLM, then compute
+             aggregate metrics + speedups via vidur.metrics.system_metrics.
 """
 
 from __future__ import annotations
 
+import argparse
 import os
 import subprocess
 from pathlib import Path
-from typing import Dict, List, Tuple, Optional
+from typing import Dict, List, Optional
 
 import pandas as pd
 import wandb
 
-from vidur.metrics.latency_config import LATENCY_TESTS
+from vidur.metrics.latency_config import LATENCY_TESTS_BY_SYSTEM, TEST_SCENARIO_MATRIX
 from vidur.metrics import latency_analysis as la
+from vidur.metrics import system_metrics as sm
 
-import sys
-
-# Support Slurm job-array mode: run a single test if --index is provided.
-if "--index" in sys.argv:
-    idx = int(sys.argv[sys.argv.index("--index") + 1])
-    from vidur.metrics.latency_config import LATENCY_TESTS as _ALL_TESTS
-    LATENCY_TESTS = [_ALL_TESTS[idx]]
+# System identifiers. They are used as keys into LATENCY_TESTS_BY_SYSTEM and
+# into each TEST_SCENARIO_MATRIX entry, and are passed to
+# system_metrics.compute_run_metrics to label a run.
+SYSTEM_LLUMNIX = "llumnix_llumlet"
+SYSTEM_LOR = "lor_vllm"
 
 
 def _run_command(cmd: str) -> None:
@@ -136,12 +118,110 @@ def _load_wandb_api_key(env_path: Path = Path(".env")) -> Optional[str]:
     return key or None
 
 
-def run_all_tests() -> None:
+def _parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
+    """Parse the command-line options for this script.
+
+    Args:
+        argv: Argument list to parse. When ``None`` (the default) argparse
+            falls back to ``sys.argv[1:]``, so existing ``_parse_args()``
+            callers are unaffected; passing a list allows programmatic use.
+
+    Returns:
+        Namespace with ``mode`` ("plots" or "compare", default "plots"),
+        ``index`` (int or None), ``latency_target`` (float or None) and
+        ``skip_plots`` (bool).
+    """
+    parser = argparse.ArgumentParser(
+        description="Run Llumnix plots or Llumnix vs LOR+vLLM comparisons."
+    )
+    parser.add_argument(
+        "--mode",
+        choices=["plots", "compare"],
+        default="plots",
+        help="plots: Llumnix-only plots. compare: run Llumnix+Llumlet vs LOR+vLLM comparisons.",
+    )
+    parser.add_argument(
+        "--index",
+        type=int,
+        default=None,
+        help="Optional scenario index (0-based) to run a single scenario.",
+    )
+    parser.add_argument(
+        "--latency-target",
+        type=float,
+        default=None,
+        help="Override latency target when computing cost-vs-latency metrics.",
+    )
+    parser.add_argument(
+        "--skip-plots",
+        action="store_true",
+        help="Skip latency plot generation (mostly useful in compare mode).",
+    )
+    return parser.parse_args(argv)
+
+
+def _select_tests(tests: List[dict], index: Optional[int]) -> List[dict]:
+    """Narrow the test list to a single entry when an index is supplied.
+
+    Returns ``tests`` unchanged when ``index`` is None, otherwise a one-element
+    list holding ``tests[index]``. Raises IndexError for an index outside
+    ``0 <= index < len(tests)`` (negative indices are rejected, not wrapped).
+    """
+    if index is not None:
+        total = len(tests)
+        if not (0 <= index < total):
+            raise IndexError(f"Index {index} out of range for {len(tests)} tests.")
+        return [tests[index]]
+    return tests
+
+
+def _extract_flag_value(cmd: str, flag: str) -> Optional[str]:
+    """Return the value given for a CLI flag inside a command string.
+
+    The command is split on whitespace and the first occurrence of the flag
+    wins. Both ``--flag value`` and ``--flag=value`` spellings are recognised.
+
+    Args:
+        cmd: Full command line as a single string.
+        flag: Exact flag name to look for, including leading dashes.
+
+    Returns:
+        The value token as a string, or None when the flag is absent or is the
+        last token with no value after it.
+    """
+    tokens = cmd.split()
+    # Require the "=" so that a longer flag sharing this prefix is not matched.
+    inline_prefix = f"{flag}="
+    for i, tok in enumerate(tokens):
+        if tok == flag:
+            if i + 1 < len(tokens):
+                return tokens[i + 1]
+        elif tok.startswith(inline_prefix):
+            return tok[len(inline_prefix):]
+    return None
+
+
+def _derive_compare_run_name(llumnix_cmd: str) -> str:
+    """Compose the wandb run name ``comparison_qps_<qps>_req_<num_requests>``.
+
+    Both values are read from the Llumnix command string; a flag that is
+    missing (or has an empty value) is rendered as ``unknown``.
+    """
+    flag_names = (
+        "--poisson_request_interval_generator_config_qps",
+        "--synthetic_request_generator_config_num_requests",
+    )
+    cleaned = []
+    for flag_name in flag_names:
+        raw = _extract_flag_value(llumnix_cmd, flag_name)
+        if not raw:
+            raw = "unknown"
+        # Drop surrounding whitespace first, then any stray commas.
+        cleaned.append(str(raw).strip().strip(","))
+    qps_clean, req_clean = cleaned
+    return f"comparison_qps_{qps_clean}_req_{req_clean}"
+
+
+def _execute_test(test: dict, generate_plots: bool, step: int, wandb_run=None) -> Path:
+    """Run one test command, optionally build plots, and log the result to wandb.
+
+    Args:
+        test: Test definition; reads ``name``, ``cmd`` and the optional
+            ``description`` key.
+        generate_plots: When true, run latency_analysis on the run directory
+            and collect the PNGs under ``<run_dir>/plots`` plus a summary.
+            When false, an empty plot list and empty summary are logged.
+        step: Step value forwarded to the wandb logging helper.
+        wandb_run: Run object forwarded to the wandb logging helper.
+
+    Returns:
+        The run directory reported by ``_find_new_run_dir`` for this execution.
+    """
+    name = test["name"]
+    description = test.get("description", "")
+    base_root = Path("simulator_output") / name
+
+    # Snapshot the directories that already exist BEFORE the run; the snapshot
+    # is handed to _find_new_run_dir together with the output root.
+    existing_dirs = set()
+    for entry in base_root.glob("*"):
+        if entry.is_dir():
+            existing_dirs.add(entry)
+    base_root.mkdir(parents=True, exist_ok=True)
+
+    cmd = f"{test['cmd']} --metrics_config_output_dir {base_root}"
+    _run_command(cmd)
+
+    run_dir = _find_new_run_dir(base_root, existing_dirs)
+    print(f"[info] Latest run dir for {name}: {run_dir}")
+
+    if generate_plots:
+        la.main(str(run_dir))
+        plot_files: List[Path] = sorted((run_dir / "plots").glob("*.png"))
+        run_summary: Dict[str, float] = _build_summary(run_dir)
+    else:
+        plot_files = []
+        run_summary = {}
+
+    _log_to_wandb(
+        wandb_run,
+        test_name=name,
+        description=description,
+        cmd=cmd,
+        run_dir=run_dir,
+        plots=plot_files,
+        summary=run_summary,
+        step=step,
+    )
+
+    return run_dir
+
+
+def run_llumnix_plots(args: argparse.Namespace) -> None:
     api_key = _load_wandb_api_key()
     if api_key:
         wandb.login(key=api_key)
 
-    for idx, test in enumerate(LATENCY_TESTS):
+    tests = _select_tests(LATENCY_TESTS_BY_SYSTEM[SYSTEM_LLUMNIX], args.index)
+
+    for idx, test in enumerate(tests):
         name = test["name"]
         desc = test.get("description", "")
         run_name = os.getenv("WANDB_RUN_NAME", name)
@@ -152,41 +232,102 @@ def run_all_tests() -> None:
             mode=os.getenv("WANDB_MODE", "online"),
             name=run_name,
             group=os.getenv("WANDB_GROUP"),
-            config={"test_name": name, "description": desc, "num_tests": len(LATENCY_TESTS)},
+            config={
+                "test_name": name,
+                "description": desc,
+                "num_tests": len(tests),
+                "system": SYSTEM_LLUMNIX,
+            },
         )
 
-        # Direct outputs for this scenario under simulator_output/<name>/...
-        base_root = Path("simulator_output") / name
-        before_dirs = {p for p in base_root.glob("*") if p.is_dir()}
-        base_root.mkdir(parents=True, exist_ok=True)
+        _execute_test(test, generate_plots=not args.skip_plots, step=idx, wandb_run=wandb_run)
 
-        cmd = f"{test['cmd']} --metrics_config_output_dir {base_root}"
-        _run_command(cmd)
+        if wandb_run:
+            wandb_run.finish()
 
-        run_dir = _find_new_run_dir(base_root, before_dirs)
-        print(f"[info] Latest run dir for {name}: {run_dir}")
 
-        # Generate plots
-        la.main(str(run_dir))
+def run_comparison(args: argparse.Namespace) -> None:
+    """Run paired Llumnix / LOR scenarios and record their comparison metrics.
+
+    For every selected scenario in TEST_SCENARIO_MATRIX that defines both
+    systems, this executes the two tests without plots, computes per-run
+    metrics and their comparison via vidur.metrics.system_metrics, prints and
+    logs the comparison to wandb, and finally writes all rows to
+    ``simulator_output/comparison_metrics.csv``.
+
+    Args:
+        args: Parsed CLI options; reads ``index`` and ``latency_target``.
+
+    Raises:
+        IndexError: If ``args.index`` is outside the range of scenarios.
+    """
+    api_key = _load_wandb_api_key()
+    if api_key:
+        wandb.login(key=api_key)
 
-        plots_dir = run_dir / "plots"
-        plots = sorted(p for p in plots_dir.glob("*.png"))
-        summary = _build_summary(run_dir)
+    # Sort by scenario id so that --index refers to a stable ordering.
+    scenario_items = sorted(TEST_SCENARIO_MATRIX.items())
+    if args.index is not None:
+        if args.index < 0 or args.index >= len(scenario_items):
+            raise IndexError(f"Index {args.index} out of range for {len(scenario_items)} scenarios.")
+        scenario_items = [scenario_items[args.index]]
+
+    rows = []
+    for step, (scenario_id, system_tests) in enumerate(scenario_items):
+        # A comparison needs both systems; skip incomplete scenarios.
+        if SYSTEM_LLUMNIX not in system_tests or SYSTEM_LOR not in system_tests:
+            print(f"[warn] Skipping scenario {scenario_id} because one system is missing.")
+            continue
+        llumnix_test = system_tests[SYSTEM_LLUMNIX]
+        lor_test = system_tests[SYSTEM_LOR]
+
+        # The run name is derived from the Llumnix command only.
+        run_name = _derive_compare_run_name(llumnix_test["cmd"])
+        wandb_run = wandb.init(
+            project=os.getenv("WANDB_PROJECT", "llumnix"),
+            entity=os.getenv("WANDB_ENTITY"),
+            mode=os.getenv("WANDB_MODE", "online"),
+            name=run_name,
+            group=os.getenv("WANDB_GROUP", "comparison"),
+            config={
+                "scenario": scenario_id,
+                "llumnix_command": llumnix_test["cmd"],
+                "lor_command": lor_test["cmd"],
+            },
+        )
+
+        # Both systems are executed sequentially, share one wandb run and the
+        # same step value, and never generate plots in this mode.
+        llumnix_run_dir = _execute_test(
+            llumnix_test, generate_plots=False, step=step, wandb_run=wandb_run
+        )
+        lor_run_dir = _execute_test(
+            lor_test, generate_plots=False, step=step, wandb_run=wandb_run
+        )
 
-        _log_to_wandb(
-            wandb_run,
-            test_name=name,
-            description=desc,
-            cmd=cmd,
-            run_dir=run_dir,
-            plots=plots,
-            summary=summary,
-            step=idx,
+        # Only the second element of compute_run_metrics' result is used here.
+        _, llumnix_metrics = sm.compute_run_metrics(
+            llumnix_run_dir, SYSTEM_LLUMNIX, llumnix_test["name"], latency_target=args.latency_target
+        )
+        _, lor_metrics = sm.compute_run_metrics(
+            lor_run_dir, SYSTEM_LOR, lor_test["name"], latency_target=args.latency_target
+        )
+        comparison = sm.compare_runs(llumnix_metrics, lor_metrics)
+
+        rows.append(
+            {
+                "scenario": scenario_id,
+                "llumnix_run_dir": str(llumnix_run_dir),
+                "lor_run_dir": str(lor_run_dir),
+                **comparison,
+            }
         )
 
+        print(f"[info] Scenario {scenario_id} speedups:")
+        for metric, value in comparison.items():
+            print(f"  {metric}: {value}")
+
         if wandb_run:
+            payload = {
+                "scenario": scenario_id,
+                "llumnix_run_dir": str(llumnix_run_dir),
+                "lor_run_dir": str(lor_run_dir),
+            }
+            # Metrics whose value is None are left out of the wandb payload.
+            payload.update({k: v for k, v in comparison.items() if v is not None})
+            wandb.log(payload, step=step)
             wandb_run.finish()
 
+    # NOTE(review): the CSV path is fixed and to_csv overwrites it, so separate
+    # invocations (e.g. one per --index) sharing a working directory will
+    # replace each other's file -- confirm this is intended.
+    if rows:
+        df = pd.DataFrame(rows)
+        output_path = Path("simulator_output") / "comparison_metrics.csv"
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        df.to_csv(output_path, index=False)
+        print(f"[info] Wrote comparison metrics to {output_path}")
+
 
 if __name__ == "__main__":
-    run_all_tests()
+    # Dispatch on --mode; "plots" is the argparse default, so any value other
+    # than "compare" runs the Llumnix-only plot pipeline.
+    args = _parse_args()
+    if args.mode == "compare":
+        run_comparison(args)
+    else:
+        run_llumnix_plots(args)