Skip to content

Commit 2465198

Browse files
committed
workload-replay: Faster hydration
1 parent b372e56 commit 2465198

File tree

2 files changed

+60

-9

lines changed

misc/python/materialize/workload_replay/executor.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,12 @@ def test(
110110
"queries": {"total": 0, "failed": 0, "slow": 0},
111111
"ingestions": {"total": 0, "failed": 0, "slow": 0},
112112
}
113+
original_cluster_sizes: dict[str, str] = {}
113114
if create_objects:
114115
start_time = time.time()
115-
run_create_objects_part_1(c, services, workload, verbose)
116+
original_cluster_sizes = run_create_objects_part_1(
117+
c, services, workload, verbose
118+
)
116119
if not early_initial_data:
117120
run_create_objects_part_2(c, services, workload, verbose)
118121
stats["object_creation"] = time.time() - start_time
@@ -202,12 +205,13 @@ def test(
202205
# otherwise frontiers haven't advanced yet and everything looks fresh.
203206
print("Waiting for freshness")
204207
time.sleep(10)
208+
prev_lagging: set[str] = set()
205209
while True:
206-
lagging: list[tuple[str, str]] = [
207-
(entry[0], entry[1])
210+
lagging: set[str] = {
211+
entry[0]
208212
for entry in c.sql_query(
209213
"""
210-
SELECT o.name, COALESCE(l.global_lag, INTERVAL '999 hours')::text
214+
SELECT o.name
211215
FROM mz_internal.mz_materialization_lag l
212216
JOIN mz_objects o ON o.id = l.object_id
213217
WHERE o.name NOT LIKE 'mz_%'
@@ -216,14 +220,25 @@ def test(
216220
ORDER BY l.global_lag DESC NULLS FIRST
217221
LIMIT 5;"""
218222
)
219-
]
223+
}
220224
if lagging:
221-
summary = ", ".join(f"{name} ({lag})" for name, lag in lagging)
222-
print(f" Lagging: {summary}")
225+
if lagging != prev_lagging:
226+
print(f" Lagging: {', '.join(sorted(lagging))}")
227+
prev_lagging = lagging
223228
time.sleep(5)
224229
else:
225230
break
226231
print("Freshness complete")
232+
233+
# Scale clusters back down to their original sizes.
234+
for name, original_size in original_cluster_sizes.items():
235+
print(f" Scaling down {name} back to {original_size}")
236+
c.sql(
237+
f"ALTER CLUSTER {name} SET (SIZE = '{original_size}')",
238+
user="mz_system",
239+
port=6877,
240+
)
241+
227242
if run_ingestions:
228243
print("Starting continuous ingestions")
229244
threads.extend(

misc/python/materialize/workload_replay/objects.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from __future__ import annotations
1515

16+
import os
1617
import re
1718
import time
1819
from textwrap import dedent
@@ -30,6 +31,7 @@
3031
setup_polaris_for_iceberg,
3132
)
3233
from materialize.mzcompose.services.sql_server import SqlServer
34+
from materialize.workload_replay.config import cluster_replica_sizes
3335
from materialize.workload_replay.util import (
3436
get_kafka_topic,
3537
get_mysql_reference_db_table,
@@ -41,8 +43,12 @@
4143

4244
def run_create_objects_part_1(
4345
c: Composition, services: set[str], workload: dict[str, Any], verbose: bool
44-
) -> None:
45-
"""Create clusters, databases, schemas, types, connections, and prepare sources."""
46+
) -> dict[str, str]:
47+
"""Create clusters, databases, schemas, types, connections, and prepare sources.
48+
49+
Returns a mapping of cluster name to original SIZE, so the caller can
50+
scale clusters back down after hydration completes.
51+
"""
4652
c.sql(
4753
"DROP CLUSTER IF EXISTS quickstart CASCADE",
4854
user="mz_system",
@@ -117,12 +123,40 @@ def run_create_objects_part_1(
117123
)
118124

119125
print("Creating clusters")
126+
# Create clusters at a large size for faster hydration. The original
127+
# sizes are returned so the caller can scale back down afterwards.
128+
# Pick the largest valid scale=1 size that fits on this machine.
129+
num_cpus = os.cpu_count() or 1
130+
best_workers = max(
131+
(
132+
cfg["workers"]
133+
for cfg in cluster_replica_sizes.values()
134+
if cfg.get("scale") == 1
135+
and isinstance(cfg.get("workers"), int)
136+
and cfg["workers"] <= num_cpus
137+
),
138+
default=1,
139+
)
140+
hydration_size = f"scale=1,workers={best_workers}"
141+
original_cluster_sizes: dict[str, str] = {}
120142
for name, cluster in workload["clusters"].items():
121143
if cluster["managed"]:
122144
# Need at least one replica for everything to hydrate
123145
create_sql = cluster["create_sql"].replace(
124146
"REPLICATION FACTOR = 0", "REPLICATION FACTOR = 1"
125147
)
148+
# Swap in the hydration size, remembering the original.
149+
size_match = re.search(r"SIZE\s*=\s*'([^']+)'", create_sql, re.IGNORECASE)
150+
if size_match:
151+
original_cluster_sizes[name] = size_match.group(1)
152+
create_sql = (
153+
create_sql[: size_match.start()]
154+
+ f"SIZE = '{hydration_size}'"
155+
+ create_sql[size_match.end() :]
156+
)
157+
print(
158+
f" {name}: creating at {hydration_size} (original: {size_match.group(1)})"
159+
)
126160
c.sql(create_sql, user="mz_system", port=6877, print_statement=verbose)
127161
else:
128162
raise ValueError("Handle unmanaged clusters")
@@ -541,6 +575,8 @@ def run_create_objects_part_1(
541575
flags=re.DOTALL | re.IGNORECASE,
542576
)
543577

578+
return original_cluster_sizes
579+
544580

545581
def run_create_objects_part_2(
546582
c: Composition, services: set[str], workload: dict[str, Any], verbose: bool

Comments (0)