Skip to content

Commit 0b9508d

Browse files
committed
workload-replay: Faster hydration & feature-benchmark StartupLoaded fix
1 parent b372e56 commit 0b9508d

File tree

6 files changed

+202
-128
lines changed

6 files changed

+202
-128
lines changed

ci/release-qualification/pipeline.template.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ steps:
210210
- common-ancestor
211211
- --skip-without-data-scale
212212
agents:
213-
queue: hetzner-x86-64-dedi-32cpu-128gb
213+
queue: hetzner-x86-64-dedi-16cpu-64gb
214214

215215
- group: SQLsmith
216216
key: sqlsmith

misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2205,6 +2205,8 @@ class StartupLoaded(Scenario):
22052205
"""Measure the time it takes to restart a populated Mz instance and have all the dataflows be ready to return something"""
22062206

22072207
SCALE = 1 # 10 objects of each kind
2208+
# Cannot scale to 100s of objects, so --size=+N will have no effect
2209+
FIXED_SCALE = True
22082210

22092211
def shared(self) -> Action:
22102212
return TdAction(

misc/python/materialize/workload_replay/column.py

Lines changed: 24 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from pg8000.native import literal
2323

2424
from materialize.workload_replay.util import (
25-
long_tail_choice,
2625
long_tail_float,
2726
long_tail_int,
2827
long_tail_text,
@@ -44,6 +43,8 @@ def __init__(
4443
if data_shape:
4544
assert typ in ("text", "bytea"), f"Can't create text shape for type {typ}"
4645

46+
self._years = list(range(2019, 2026))
47+
4748
self._hot_strings = [
4849
f"{name}_a",
4950
f"{name}_b",
@@ -56,6 +57,18 @@ def __init__(
5657
"NULL",
5758
]
5859

60+
def _random_date(self, rng: random.Random) -> str:
61+
"""Generate a random date string (day capped at 28 so it is valid in any month)."""
62+
year = rng.choice(self._years)
63+
return f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
64+
65+
def _random_datetime(self, rng: random.Random) -> str:
66+
"""Generate a random UTC datetime string: a random date plus a random time-of-day."""
67+
return (
68+
f"{self._random_date(rng)}"
69+
f"T{rng.randrange(0, 24):02}:{rng.randrange(0, 60):02}:{rng.randrange(0, 60):02}Z"
70+
)
71+
5972
def avro_type(self) -> str | list[str]:
6073
"""Return the Avro type for this column."""
6174
result = self.typ
@@ -100,12 +113,7 @@ def kafka_value(self, rng: random.Random) -> Any:
100113

101114
elif self.typ in ("text", "bytea"):
102115
if self.data_shape == "datetime":
103-
year = long_tail_choice(
104-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
105-
)
106-
return literal(
107-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
108-
)
116+
return literal(self._random_datetime(rng))
109117
elif self.data_shape:
110118
raise ValueError(f"Unhandled text shape {self.data_shape}")
111119
return literal(long_tail_text(self.chars, 100, self._hot_strings, rng=rng))
@@ -123,23 +131,15 @@ def kafka_value(self, rng: random.Random) -> Any:
123131
return json.dumps(result)
124132

125133
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
126-
now = 1700000000000 # doesn't need to be exact
127-
if rng.random() < 0.9:
128-
return now + long_tail_int(-86_400_000, 86_400_000, rng=rng)
129-
else:
130-
return rng.randrange(0, 9223372036854775807)
134+
# Epoch millis spread uniformly across 2019–2025
135+
# 2019-01-01 = 1546300800000, 2026-01-01 = 1767225600000
136+
return rng.randrange(1546300800000, 1767225600000)
131137

132138
elif self.typ == "mz_timestamp":
133-
year = long_tail_choice(
134-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
135-
)
136-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
139+
return literal(self._random_date(rng))
137140

138141
elif self.typ == "date":
139-
year = long_tail_choice(
140-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
141-
)
142-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
142+
return literal(self._random_date(rng))
143143

144144
elif self.typ == "time":
145145
if rng.random() < 0.8:
@@ -221,13 +221,7 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
221221

222222
elif self.typ in ("text", "bytea"):
223223
if self.data_shape == "datetime":
224-
year = long_tail_choice(
225-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
226-
)
227-
s = (
228-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
229-
f"T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
230-
)
224+
s = self._random_datetime(rng)
231225
return literal(s) if in_query else s
232226

233227
elif self.data_shape:
@@ -254,24 +248,15 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
254248
return json.dumps(obj)
255249

256250
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
257-
year = long_tail_choice(
258-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
259-
)
260-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
251+
s = self._random_date(rng)
261252
return literal(s) if in_query else s
262253

263254
elif self.typ == "mz_timestamp":
264-
year = long_tail_choice(
265-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
266-
)
267-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
255+
s = self._random_date(rng)
268256
return literal(s) if in_query else s
269257

270258
elif self.typ == "date":
271-
year = long_tail_choice(
272-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
273-
)
274-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
259+
s = self._random_date(rng)
275260
return literal(s) if in_query else s
276261

277262
elif self.typ == "time":

misc/python/materialize/workload_replay/executor.py

Lines changed: 114 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,43 @@
5353
from materialize.workload_replay.util import print_workload_stats, resolve_tag
5454

5555

56+
def wait_for_freshness(c: Composition) -> None:
57+
"""Wait for all user materializations to be caught up (fresh).
58+
59+
Uses local_lag (lag relative to source) rather than global_lag (lag
60+
relative to wall-clock) because with captured/historical data the source
61+
frontiers are stuck at capture time and global_lag will never shrink.
62+
Sleeps first so the system has time to start processing data;
63+
otherwise frontiers haven't advanced yet and everything looks fresh.
64+
"""
65+
print("Waiting for freshness")
66+
time.sleep(10)
67+
prev_lagging: set[str] = set()
68+
while True:
69+
lagging: set[str] = {
70+
entry[0]
71+
for entry in c.sql_query(
72+
"""
73+
SELECT o.name
74+
FROM mz_internal.mz_materialization_lag l
75+
JOIN mz_objects o ON o.id = l.object_id
76+
WHERE o.name NOT LIKE 'mz_%'
77+
AND o.id NOT IN (SELECT id FROM mz_sinks)
78+
AND (l.local_lag IS NULL OR l.local_lag > INTERVAL '10 seconds')
79+
ORDER BY l.local_lag DESC NULLS FIRST
80+
LIMIT 5;"""
81+
)
82+
}
83+
if lagging:
84+
if lagging != prev_lagging:
85+
print(f" Lagging: {', '.join(sorted(lagging))}")
86+
prev_lagging = lagging
87+
time.sleep(5)
88+
else:
89+
break
90+
print("Freshness complete")
91+
92+
5693
def test(
5794
c: Composition,
5895
workload: dict[str, Any],
@@ -110,23 +147,26 @@ def test(
110147
"queries": {"total": 0, "failed": 0, "slow": 0},
111148
"ingestions": {"total": 0, "failed": 0, "slow": 0},
112149
}
150+
original_cluster_sizes: dict[str, str] = {}
113151
if create_objects:
114152
start_time = time.time()
115-
run_create_objects_part_1(c, services, workload, verbose)
153+
original_cluster_sizes = run_create_objects_part_1(
154+
c, services, workload, verbose
155+
)
116156
if not early_initial_data:
117157
run_create_objects_part_2(c, services, workload, verbose)
118158
stats["object_creation"] = time.time() - start_time
119-
created_data = False
120-
if initial_data:
121-
print("Creating initial data")
122-
stats["initial_data"] = {"docker": [], "time": 0.0}
123-
stats_thread = PropagatingThread(
124-
target=docker_stats,
125-
name="docker-stats",
126-
args=(stats["initial_data"]["docker"], stop_event),
127-
)
128-
stats_thread.start()
129-
try:
159+
stats["initial_data"] = {"docker": [], "time": 0.0}
160+
stats_thread = PropagatingThread(
161+
target=docker_stats,
162+
name="docker-stats",
163+
args=(stats["initial_data"]["docker"], stop_event),
164+
)
165+
stats_thread.start()
166+
try:
167+
created_data = False
168+
if initial_data:
169+
print("Creating initial data")
130170
start_time = time.time()
131171
created_data = create_initial_data_external(
132172
c,
@@ -137,6 +177,7 @@ def test(
137177
if early_initial_data:
138178
obj_start = time.time()
139179
run_create_objects_part_2(c, services, workload, verbose)
180+
stats["initial_data"]["sources_created_at"] = time.time()
140181
stats["object_creation"] += time.time() - obj_start
141182
created_data_requiring_mz = create_initial_data_requiring_mz(
142183
c,
@@ -146,84 +187,71 @@ def test(
146187
)
147188
created_data = created_data or created_data_requiring_mz
148189
stats["initial_data"]["time"] = time.time() - start_time
149-
if not created_data:
150-
del stats["initial_data"]
151-
finally:
152-
stop_event.set()
153-
stats_thread.join()
154-
stop_event.clear()
155-
elif early_initial_data:
156-
start_time = time.time()
157-
run_create_objects_part_2(c, services, workload, verbose)
158-
stats["object_creation"] += time.time() - start_time
190+
elif early_initial_data:
191+
start_time = time.time()
192+
run_create_objects_part_2(c, services, workload, verbose)
193+
stats["object_creation"] += time.time() - start_time
159194

160-
# Wait for all user objects to hydrate before starting queries.
161-
print("Waiting for hydration")
162-
prev_not_hydrated: list[str] = []
163-
while True:
164-
not_hydrated: list[str] = [
165-
entry[0]
166-
for entry in c.sql_query(
167-
"""
168-
SELECT DISTINCT name
169-
FROM (
170-
SELECT o.name
171-
FROM mz_objects o
172-
JOIN mz_internal.mz_hydration_statuses h
173-
ON o.id = h.object_id
174-
WHERE NOT h.hydrated
175-
AND o.name NOT LIKE 'mz_%'
176-
AND o.id NOT IN (SELECT id FROM mz_sinks)
195+
# Wait for all user objects to hydrate before starting queries.
196+
print("Waiting for hydration")
197+
prev_not_hydrated: list[str] = []
198+
while True:
199+
not_hydrated: list[str] = [
200+
entry[0]
201+
for entry in c.sql_query(
202+
"""
203+
SELECT DISTINCT name
204+
FROM (
205+
SELECT o.name
206+
FROM mz_objects o
207+
JOIN mz_internal.mz_hydration_statuses h
208+
ON o.id = h.object_id
209+
WHERE NOT h.hydrated
210+
AND o.name NOT LIKE 'mz_%'
211+
AND o.id NOT IN (SELECT id FROM mz_sinks)
177212
178-
UNION ALL
213+
UNION ALL
179214
180-
SELECT o.name
181-
FROM mz_objects o
182-
JOIN mz_internal.mz_compute_hydration_statuses h
183-
ON o.id = h.object_id
184-
WHERE NOT h.hydrated
185-
AND o.name NOT LIKE 'mz_%'
186-
AND o.id NOT IN (SELECT id FROM mz_sinks)
187-
) x
188-
ORDER BY 1;"""
189-
)
190-
]
191-
if not_hydrated:
192-
if not_hydrated != prev_not_hydrated:
193-
print(f" Not yet hydrated: {', '.join(not_hydrated)}")
194-
prev_not_hydrated = not_hydrated
195-
time.sleep(1)
196-
else:
197-
break
198-
print("Hydration complete")
215+
SELECT o.name
216+
FROM mz_objects o
217+
JOIN mz_internal.mz_compute_hydration_statuses h
218+
ON o.id = h.object_id
219+
WHERE NOT h.hydrated
220+
AND o.name NOT LIKE 'mz_%'
221+
AND o.id NOT IN (SELECT id FROM mz_sinks)
222+
) x
223+
ORDER BY 1;"""
224+
)
225+
]
226+
if not_hydrated:
227+
if not_hydrated != prev_not_hydrated:
228+
print(f" Not yet hydrated: {', '.join(not_hydrated)}")
229+
prev_not_hydrated = not_hydrated
230+
time.sleep(1)
231+
else:
232+
break
233+
print("Hydration complete")
234+
235+
wait_for_freshness(c)
236+
237+
# Scale clusters back down to their original sizes.
238+
if original_cluster_sizes:
239+
print("Scaling clusters to original sizes")
240+
for name, original_size in original_cluster_sizes.items():
241+
c.sql(
242+
f"ALTER CLUSTER \"{name}\" SET (SIZE = '{original_size}')",
243+
user="mz_system",
244+
port=6877,
245+
)
246+
wait_for_freshness(c)
247+
finally:
248+
stop_event.set()
249+
stats_thread.join()
250+
stop_event.clear()
251+
252+
if not created_data:
253+
del stats["initial_data"]
199254

200-
# Wait for all user materializations to be caught up (fresh).
201-
# Sleep first so the system has time to start processing imported data;
202-
# otherwise frontiers haven't advanced yet and everything looks fresh.
203-
print("Waiting for freshness")
204-
time.sleep(10)
205-
while True:
206-
lagging: list[tuple[str, str]] = [
207-
(entry[0], entry[1])
208-
for entry in c.sql_query(
209-
"""
210-
SELECT o.name, COALESCE(l.global_lag, INTERVAL '999 hours')::text
211-
FROM mz_internal.mz_materialization_lag l
212-
JOIN mz_objects o ON o.id = l.object_id
213-
WHERE o.name NOT LIKE 'mz_%'
214-
AND o.id NOT IN (SELECT id FROM mz_sinks)
215-
AND (l.global_lag IS NULL OR l.global_lag > INTERVAL '10 seconds')
216-
ORDER BY l.global_lag DESC NULLS FIRST
217-
LIMIT 5;"""
218-
)
219-
]
220-
if lagging:
221-
summary = ", ".join(f"{name} ({lag})" for name, lag in lagging)
222-
print(f" Lagging: {summary}")
223-
time.sleep(5)
224-
else:
225-
break
226-
print("Freshness complete")
227255
if run_ingestions:
228256
print("Starting continuous ingestions")
229257
threads.extend(

0 commit comments

Comments
 (0)