Skip to content

Commit 0b9508d

Browse files
committed
workload-replay: Faster hydration & feature-benchmark StartupLoaded fix
1 parent b372e56 commit 0b9508d

File tree

6 files changed

+202
-128
lines changed

6 files changed

+202
-128
lines changed

ci/release-qualification/pipeline.template.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ steps:
210210
- common-ancestor
211211
- --skip-without-data-scale
212212
agents:
213-
queue: hetzner-x86-64-dedi-32cpu-128gb
213+
queue: hetzner-x86-64-dedi-16cpu-64gb
214214

215215
- group: SQLsmith
216216
key: sqlsmith

misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2205,6 +2205,8 @@ class StartupLoaded(Scenario):
22052205
"""Measure the time it takes to restart a populated Mz instance and have all the dataflows be ready to return something"""
22062206

22072207
SCALE = 1 # 10 objects of each kind
2208+
# Cannot scale to 100s of objects, so --size=+N will have no effect
2209+
FIXED_SCALE = True
22082210

22092211
def shared(self) -> Action:
22102212
return TdAction(

misc/python/materialize/workload_replay/column.py

Lines changed: 24 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from pg8000.native import literal
2323

2424
from materialize.workload_replay.util import (
25-
long_tail_choice,
2625
long_tail_float,
2726
long_tail_int,
2827
long_tail_text,
@@ -44,6 +43,8 @@ def __init__(
4443
if data_shape:
4544
assert typ in ("text", "bytea"), f"Can't create text shape for type {typ}"
4645

46+
self._years = list(range(2019, 2026))
47+
4748
self._hot_strings = [
4849
f"{name}_a",
4950
f"{name}_b",
@@ -56,6 +57,18 @@ def __init__(
5657
"NULL",
5758
]
5859

60+
def _random_date(self, rng: random.Random) -> str:
61+
"""Generate a random date string (day capped at 28 so it is valid in any month)."""
62+
year = rng.choice(self._years)
63+
return f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
64+
65+
def _random_datetime(self, rng: random.Random) -> str:
66+
"""Generate a random UTC datetime string: a random date plus a random time-of-day."""
67+
return (
68+
f"{self._random_date(rng)}"
69+
f"T{rng.randrange(0, 24):02}:{rng.randrange(0, 60):02}:{rng.randrange(0, 60):02}Z"
70+
)
71+
5972
def avro_type(self) -> str | list[str]:
6073
"""Return the Avro type for this column."""
6174
result = self.typ
@@ -100,12 +113,7 @@ def kafka_value(self, rng: random.Random) -> Any:
100113

101114
elif self.typ in ("text", "bytea"):
102115
if self.data_shape == "datetime":
103-
year = long_tail_choice(
104-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
105-
)
106-
return literal(
107-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
108-
)
116+
return literal(self._random_datetime(rng))
109117
elif self.data_shape:
110118
raise ValueError(f"Unhandled text shape {self.data_shape}")
111119
return literal(long_tail_text(self.chars, 100, self._hot_strings, rng=rng))
@@ -123,23 +131,15 @@ def kafka_value(self, rng: random.Random) -> Any:
123131
return json.dumps(result)
124132

125133
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
126-
now = 1700000000000 # doesn't need to be exact
127-
if rng.random() < 0.9:
128-
return now + long_tail_int(-86_400_000, 86_400_000, rng=rng)
129-
else:
130-
return rng.randrange(0, 9223372036854775807)
134+
# Epoch millis spread uniformly across 2019–2025
135+
# 2019-01-01 = 1546300800000, 2026-01-01 = 1767225600000
136+
return rng.randrange(1546300800000, 1767225600000)
131137

132138
elif self.typ == "mz_timestamp":
133-
year = long_tail_choice(
134-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
135-
)
136-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
139+
return literal(self._random_date(rng))
137140

138141
elif self.typ == "date":
139-
year = long_tail_choice(
140-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
141-
)
142-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
142+
return literal(self._random_date(rng))
143143

144144
elif self.typ == "time":
145145
if rng.random() < 0.8:
@@ -221,13 +221,7 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
221221

222222
elif self.typ in ("text", "bytea"):
223223
if self.data_shape == "datetime":
224-
year = long_tail_choice(
225-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
226-
)
227-
s = (
228-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
229-
f"T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
230-
)
224+
s = self._random_datetime(rng)
231225
return literal(s) if in_query else s
232226

233227
elif self.data_shape:
@@ -254,24 +248,15 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
254248
return json.dumps(obj)
255249

256250
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
257-
year = long_tail_choice(
258-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
259-
)
260-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
251+
s = self._random_date(rng)
261252
return literal(s) if in_query else s
262253

263254
elif self.typ == "mz_timestamp":
264-
year = long_tail_choice(
265-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
266-
)
267-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
255+
s = self._random_date(rng)
268256
return literal(s) if in_query else s
269257

270258
elif self.typ == "date":
271-
year = long_tail_choice(
272-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
273-
)
274-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
259+
s = self._random_date(rng)
275260
return literal(s) if in_query else s
276261

277262
elif self.typ == "time":

misc/python/materialize/workload_replay/executor.py

Lines changed: 114 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,43 @@
5353
from materialize.workload_replay.util import print_workload_stats, resolve_tag
5454

5555

56+
def wait_for_freshness(c: Composition) -> None:
57+
"""Wait for all user materializations to be caught up (fresh).
58+
59+
Uses local_lag (lag relative to source) rather than global_lag (lag
60+
relative to wall-clock) because with captured/historical data the source
61+
frontiers are stuck at capture time and global_lag will never shrink.
62+
Sleeps first so the system has time to start processing data;
63+
otherwise frontiers haven't advanced yet and everything looks fresh.
64+
"""
65+
print("Waiting for freshness")
66+
time.sleep(10)
67+
prev_lagging: set[str] = set()
68+
while True:
69+
lagging: set[str] = {
70+
entry[0]
71+
for entry in c.sql_query(
72+
"""
73+
SELECT o.name
74+
FROM mz_internal.mz_materialization_lag l
75+
JOIN mz_objects o ON o.id = l.object_id
76+
WHERE o.name NOT LIKE 'mz_%'
77+
AND o.id NOT IN (SELECT id FROM mz_sinks)
78+
AND (l.local_lag IS NULL OR l.local_lag > INTERVAL '10 seconds')
79+
ORDER BY l.local_lag DESC NULLS FIRST
80+
LIMIT 5;"""
81+
)
82+
}
83+
if lagging:
84+
if lagging != prev_lagging:
85+
print(f" Lagging: {', '.join(sorted(lagging))}")
86+
prev_lagging = lagging
87+
time.sleep(5)
88+
else:
89+
break
90+
print("Freshness complete")
91+
92+
5693
def test(
5794
c: Composition,
5895
workload: dict[str, Any],
@@ -110,23 +147,26 @@ def test(
110147
"queries": {"total": 0, "failed": 0, "slow": 0},
111148
"ingestions": {"total": 0, "failed": 0, "slow": 0},
112149
}
150+
original_cluster_sizes: dict[str, str] = {}
113151
if create_objects:
114152
start_time = time.time()
115-
run_create_objects_part_1(c, services, workload, verbose)
153+
original_cluster_sizes = run_create_objects_part_1(
154+
c, services, workload, verbose
155+
)
116156
if not early_initial_data:
117157
run_create_objects_part_2(c, services, workload, verbose)
118158
stats["object_creation"] = time.time() - start_time
119-
created_data = False
120-
if initial_data:
121-
print("Creating initial data")
122-
stats["initial_data"] = {"docker": [], "time": 0.0}
123-
stats_thread = PropagatingThread(
124-
target=docker_stats,
125-
name="docker-stats",
126-
args=(stats["initial_data"]["docker"], stop_event),
127-
)
128-
stats_thread.start()
129-
try:
159+
stats["initial_data"] = {"docker": [], "time": 0.0}
160+
stats_thread = PropagatingThread(
161+
target=docker_stats,
162+
name="docker-stats",
163+
args=(stats["initial_data"]["docker"], stop_event),
164+
)
165+
stats_thread.start()
166+
try:
167+
created_data = False
168+
if initial_data:
169+
print("Creating initial data")
130170
start_time = time.time()
131171
created_data = create_initial_data_external(
132172
c,
@@ -137,6 +177,7 @@ def test(
137177
if early_initial_data:
138178
obj_start = time.time()
139179
run_create_objects_part_2(c, services, workload, verbose)
180+
stats["initial_data"]["sources_created_at"] = time.time()
140181
stats["object_creation"] += time.time() - obj_start
141182
created_data_requiring_mz = create_initial_data_requiring_mz(
142183
c,
@@ -146,84 +187,71 @@ def test(
146187
)
147188
created_data = created_data or created_data_requiring_mz
148189
stats["initial_data"]["time"] = time.time() - start_time
149-
if not created_data:
150-
del stats["initial_data"]
151-
finally:
152-
stop_event.set()
153-
stats_thread.join()
154-
stop_event.clear()
155-
elif early_initial_data:
156-
start_time = time.time()
157-
run_create_objects_part_2(c, services, workload, verbose)
158-
stats["object_creation"] += time.time() - start_time
190+
elif early_initial_data:
191+
start_time = time.time()
192+
run_create_objects_part_2(c, services, workload, verbose)
193+
stats["object_creation"] += time.time() - start_time
159194

160-
# Wait for all user objects to hydrate before starting queries.
161-
print("Waiting for hydration")
162-
prev_not_hydrated: list[str] = []
163-
while True:
164-
not_hydrated: list[str] = [
165-
entry[0]
166-
for entry in c.sql_query(
167-
"""
168-
SELECT DISTINCT name
169-
FROM (
170-
SELECT o.name
171-
FROM mz_objects o
172-
JOIN mz_internal.mz_hydration_statuses h
173-
ON o.id = h.object_id
174-
WHERE NOT h.hydrated
175-
AND o.name NOT LIKE 'mz_%'
176-
AND o.id NOT IN (SELECT id FROM mz_sinks)
195+
# Wait for all user objects to hydrate before starting queries.
196+
print("Waiting for hydration")
197+
prev_not_hydrated: list[str] = []
198+
while True:
199+
not_hydrated: list[str] = [
200+
entry[0]
201+
for entry in c.sql_query(
202+
"""
203+
SELECT DISTINCT name
204+
FROM (
205+
SELECT o.name
206+
FROM mz_objects o
207+
JOIN mz_internal.mz_hydration_statuses h
208+
ON o.id = h.object_id
209+
WHERE NOT h.hydrated
210+
AND o.name NOT LIKE 'mz_%'
211+
AND o.id NOT IN (SELECT id FROM mz_sinks)
177212
178-
UNION ALL
213+
UNION ALL
179214
180-
SELECT o.name
181-
FROM mz_objects o
182-
JOIN mz_internal.mz_compute_hydration_statuses h
183-
ON o.id = h.object_id
184-
WHERE NOT h.hydrated
185-
AND o.name NOT LIKE 'mz_%'
186-
AND o.id NOT IN (SELECT id FROM mz_sinks)
187-
) x
188-
ORDER BY 1;"""
189-
)
190-
]
191-
if not_hydrated:
192-
if not_hydrated != prev_not_hydrated:
193-
print(f" Not yet hydrated: {', '.join(not_hydrated)}")
194-
prev_not_hydrated = not_hydrated
195-
time.sleep(1)
196-
else:
197-
break
198-
print("Hydration complete")
215+
SELECT o.name
216+
FROM mz_objects o
217+
JOIN mz_internal.mz_compute_hydration_statuses h
218+
ON o.id = h.object_id
219+
WHERE NOT h.hydrated
220+
AND o.name NOT LIKE 'mz_%'
221+
AND o.id NOT IN (SELECT id FROM mz_sinks)
222+
) x
223+
ORDER BY 1;"""
224+
)
225+
]
226+
if not_hydrated:
227+
if not_hydrated != prev_not_hydrated:
228+
print(f" Not yet hydrated: {', '.join(not_hydrated)}")
229+
prev_not_hydrated = not_hydrated
230+
time.sleep(1)
231+
else:
232+
break
233+
print("Hydration complete")
234+
235+
wait_for_freshness(c)
236+
237+
# Scale clusters back down to their original sizes.
238+
if original_cluster_sizes:
239+
print("Scaling clusters to original sizes")
240+
for name, original_size in original_cluster_sizes.items():
241+
c.sql(
242+
f"ALTER CLUSTER \"{name}\" SET (SIZE = '{original_size}')",
243+
user="mz_system",
244+
port=6877,
245+
)
246+
wait_for_freshness(c)
247+
finally:
248+
stop_event.set()
249+
stats_thread.join()
250+
stop_event.clear()
251+
252+
if not created_data:
253+
del stats["initial_data"]
199254

200-
# Wait for all user materializations to be caught up (fresh).
201-
# Sleep first so the system has time to start processing imported data;
202-
# otherwise frontiers haven't advanced yet and everything looks fresh.
203-
print("Waiting for freshness")
204-
time.sleep(10)
205-
while True:
206-
lagging: list[tuple[str, str]] = [
207-
(entry[0], entry[1])
208-
for entry in c.sql_query(
209-
"""
210-
SELECT o.name, COALESCE(l.global_lag, INTERVAL '999 hours')::text
211-
FROM mz_internal.mz_materialization_lag l
212-
JOIN mz_objects o ON o.id = l.object_id
213-
WHERE o.name NOT LIKE 'mz_%'
214-
AND o.id NOT IN (SELECT id FROM mz_sinks)
215-
AND (l.global_lag IS NULL OR l.global_lag > INTERVAL '10 seconds')
216-
ORDER BY l.global_lag DESC NULLS FIRST
217-
LIMIT 5;"""
218-
)
219-
]
220-
if lagging:
221-
summary = ", ".join(f"{name} ({lag})" for name, lag in lagging)
222-
print(f" Lagging: {summary}")
223-
time.sleep(5)
224-
else:
225-
break
226-
print("Freshness complete")
227255
if run_ingestions:
228256
print("Starting continuous ingestions")
229257
threads.extend(

0 commit comments

Comments
 (0)