Skip to content

Commit 8fed0d3

Browse files
committed
workload-replay: Improvements & feature-benchmark StartupLoaded fix
1 parent 26e1a33 · commit 8fed0d3

File tree

11 files changed

+679
-324
lines changed

11 files changed

+679
-324
lines changed

ci/nightly/pipeline.template.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -264,7 +264,7 @@ steps:
264264
- common-ancestor
265265

266266
- id: workload-replay
267-
label: "Workload Replay (1% initial data)"
267+
label: "Workload Replay (10% initial data)"
268268
depends_on: build-x86_64
269269
timeout_in_minutes: 240
270270
parallelism: 5
@@ -273,7 +273,7 @@ steps:
273273
composition: workload-replay
274274
run: benchmark
275275
args:
276-
- --factor-initial-data=0.01
276+
- --factor-initial-data=0.1
277277
- --compare-against
278278
- common-ancestor
279279
agents:

ci/release-qualification/pipeline.template.yml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ steps:
195195
- 1200
196196

197197
- id: long-workload-replay
198-
label: "Long Workload Replay (10% initial data)"
198+
label: "Long Workload Replay (100% initial data)"
199199
depends_on: build-x86_64
200200
timeout_in_minutes: 1200
201201
parallelism: 3
@@ -204,13 +204,13 @@ steps:
204204
composition: workload-replay
205205
run: benchmark
206206
args:
207-
- --factor-initial-data=0.1
207+
- --factor-initial-data=1
208208
- --runtime=3600
209209
- --compare-against
210210
- common-ancestor
211211
- --skip-without-data-scale
212212
agents:
213-
queue: hetzner-x86-64-dedi-32cpu-128gb
213+
queue: hetzner-x86-64-dedi-48cpu-192gb
214214

215215
- group: SQLsmith
216216
key: sqlsmith

misc/python/materialize/feature_benchmark/benchmark.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,9 @@ def create_scenario_instance(self) -> Scenario:
6565
elif float(self._scale) > 0:
6666
scale = float(self._scale)
6767

68+
if self._scenario_cls.MAX_SCALE is not None:
69+
scale = min(scale, self._scenario_cls.MAX_SCALE)
70+
6871
scenario_class = self._scenario_cls
6972
return scenario_class(
7073
scale=scale,

misc/python/materialize/feature_benchmark/scenario.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
class RootScenario:
2222
SCALE: float = 6
2323
FIXED_SCALE: bool = False # Will --scale=N have effect on the scenario
24+
MAX_SCALE: float | None = None # Cap scale to this value when set
2425
RELATIVE_THRESHOLD: dict[MeasurementType, float] = {
2526
MeasurementType.WALLCLOCK: 0.10,
2627
# Increased the other measurements since they are easy to regress now

misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2205,6 +2205,8 @@ class StartupLoaded(Scenario):
22052205
"""Measure the time it takes to restart a populated Mz instance and have all the dataflows be ready to return something"""
22062206

22072207
SCALE = 1 # 10 objects of each kind
2208+
# Can not scale to 100s of objects
2209+
MAX_SCALE = 1.5
22082210

22092211
def shared(self) -> Action:
22102212
return TdAction(

misc/python/materialize/workload_replay/column.py

Lines changed: 75 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
from pg8000.native import literal
2323

2424
from materialize.workload_replay.util import (
25-
long_tail_choice,
2625
long_tail_float,
2726
long_tail_int,
2827
long_tail_text,
@@ -41,8 +40,9 @@ def __init__(
4140
self.default = default
4241
self.chars = string.ascii_letters + string.digits
4342
self.data_shape = data_shape
44-
if data_shape:
45-
assert typ in ("text", "bytea"), f"Can't create text shape for type {typ}"
43+
44+
self._years = list(range(2019, 2026))
45+
self._seq_counter = 0
4646

4747
self._hot_strings = [
4848
f"{name}_a",
@@ -56,6 +56,34 @@ def __init__(
5656
"NULL",
5757
]
5858

59+
def _shaped_text(self, rng: random.Random) -> str | None:
60+
"""Generate text according to data_shape, or None if not applicable."""
61+
if self.data_shape == "datetime":
62+
return self._random_datetime(rng)
63+
elif self.data_shape == "random":
64+
length = rng.randrange(5, 40)
65+
return "".join(rng.choice(self.chars) for _ in range(length))
66+
elif self.data_shape == "uuid":
67+
return str(uuid.UUID(int=rng.getrandbits(128), version=4))
68+
elif self.data_shape == "sequential":
69+
self._seq_counter += 1
70+
return f"{self.name}_{self._seq_counter}"
71+
elif self.data_shape is not None:
72+
raise ValueError(f"Unhandled data_shape {self.data_shape!r}")
73+
return None
74+
75+
def _random_date(self, rng: random.Random) -> str:
76+
"""Generate a uniformly random date string."""
77+
year = rng.choice(self._years)
78+
return f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
79+
80+
def _random_datetime(self, rng: random.Random) -> str:
81+
"""Generate a uniformly random datetime string."""
82+
return (
83+
f"{self._random_date(rng)}"
84+
f"T{rng.randrange(0, 24):02}:{rng.randrange(0, 60):02}:{rng.randrange(0, 60):02}Z"
85+
)
86+
5987
def avro_type(self) -> str | list[str]:
6088
"""Return the Avro type for this column."""
6189
result = self.typ
@@ -99,18 +127,15 @@ def kafka_value(self, rng: random.Random) -> Any:
99127
return long_tail_float(-1_000_000_000.0, 1_000_000_000.0, rng=rng)
100128

101129
elif self.typ in ("text", "bytea"):
102-
if self.data_shape == "datetime":
103-
year = long_tail_choice(
104-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
105-
)
106-
return literal(
107-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
108-
)
109-
elif self.data_shape:
110-
raise ValueError(f"Unhandled text shape {self.data_shape}")
130+
shaped = self._shaped_text(rng)
131+
if shaped is not None:
132+
return literal(shaped)
111133
return literal(long_tail_text(self.chars, 100, self._hot_strings, rng=rng))
112134

113135
elif self.typ in ("character", "character varying"):
136+
shaped = self._shaped_text(rng)
137+
if shaped is not None:
138+
return literal(shaped)
114139
return literal(long_tail_text(self.chars, 10, self._hot_strings, rng=rng))
115140

116141
elif self.typ == "uuid":
@@ -123,23 +148,15 @@ def kafka_value(self, rng: random.Random) -> Any:
123148
return json.dumps(result)
124149

125150
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
126-
now = 1700000000000 # doesn't need to be exact
127-
if rng.random() < 0.9:
128-
return now + long_tail_int(-86_400_000, 86_400_000, rng=rng)
129-
else:
130-
return rng.randrange(0, 9223372036854775807)
151+
# Epoch millis spread uniformly across 2019–2025
152+
# 2019-01-01 = 1546300800000, 2026-01-01 = 1767225600000
153+
return rng.randrange(1546300800000, 1767225600000)
131154

132155
elif self.typ == "mz_timestamp":
133-
year = long_tail_choice(
134-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
135-
)
136-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
156+
return literal(self._random_date(rng))
137157

138158
elif self.typ == "date":
139-
year = long_tail_choice(
140-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
141-
)
142-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
159+
return literal(self._random_date(rng))
143160

144161
elif self.typ == "time":
145162
if rng.random() < 0.8:
@@ -150,19 +167,22 @@ def kafka_value(self, rng: random.Random) -> Any:
150167
)
151168

152169
elif self.typ == "int2range":
153-
a = str(long_tail_int(-32768, 32767, rng=rng))
154-
b = str(long_tail_int(-32768, 32767, rng=rng))
155-
return literal(f"[{a},{b})")
170+
a = long_tail_int(-32768, 32767, rng=rng)
171+
b = long_tail_int(-32768, 32767, rng=rng)
172+
lo, hi = min(a, b), max(a, b)
173+
return literal(f"[{lo},{hi})")
156174

157175
elif self.typ == "int4range":
158-
a = str(long_tail_int(-2147483648, 2147483647, rng=rng))
159-
b = str(long_tail_int(-2147483648, 2147483647, rng=rng))
160-
return literal(f"[{a},{b})")
176+
a = long_tail_int(-2147483648, 2147483647, rng=rng)
177+
b = long_tail_int(-2147483648, 2147483647, rng=rng)
178+
lo, hi = min(a, b), max(a, b)
179+
return literal(f"[{lo},{hi})")
161180

162181
elif self.typ == "int8range":
163-
a = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
164-
b = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
165-
return literal(f"[{a},{b})")
182+
a = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
183+
b = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
184+
lo, hi = min(a, b), max(a, b)
185+
return literal(f"[{lo},{hi})")
166186

167187
elif self.typ == "map":
168188
return {
@@ -220,23 +240,16 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
220240
return str(val) if in_query else val
221241

222242
elif self.typ in ("text", "bytea"):
223-
if self.data_shape == "datetime":
224-
year = long_tail_choice(
225-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
226-
)
227-
s = (
228-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
229-
f"T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
230-
)
231-
return literal(s) if in_query else s
232-
233-
elif self.data_shape:
234-
raise ValueError(f"Unhandled text shape {self.data_shape}")
235-
243+
shaped = self._shaped_text(rng)
244+
if shaped is not None:
245+
return literal(shaped) if in_query else shaped
236246
s = long_tail_text(self.chars, 100, self._hot_strings, rng=rng)
237247
return literal(s) if in_query else s
238248

239249
elif self.typ in ("character", "character varying"):
250+
shaped = self._shaped_text(rng)
251+
if shaped is not None:
252+
return literal(shaped) if in_query else shaped
240253
s = long_tail_text(self.chars, 10, self._hot_strings, rng=rng)
241254
return literal(s) if in_query else s
242255

@@ -254,24 +267,15 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
254267
return json.dumps(obj)
255268

256269
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
257-
year = long_tail_choice(
258-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
259-
)
260-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
270+
s = self._random_date(rng)
261271
return literal(s) if in_query else s
262272

263273
elif self.typ == "mz_timestamp":
264-
year = long_tail_choice(
265-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
266-
)
267-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
274+
s = self._random_date(rng)
268275
return literal(s) if in_query else s
269276

270277
elif self.typ == "date":
271-
year = long_tail_choice(
272-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
273-
)
274-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
278+
s = self._random_date(rng)
275279
return literal(s) if in_query else s
276280

277281
elif self.typ == "time":
@@ -288,21 +292,24 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
288292
return literal(s) if in_query else s
289293

290294
elif self.typ == "int2range":
291-
a = str(long_tail_int(-32768, 32767, rng=rng))
292-
b = str(long_tail_int(-32768, 32767, rng=rng))
293-
s = f"[{a},{b})"
295+
a = long_tail_int(-32768, 32767, rng=rng)
296+
b = long_tail_int(-32768, 32767, rng=rng)
297+
lo, hi = min(a, b), max(a, b)
298+
s = f"[{lo},{hi})"
294299
return literal(s) if in_query else s
295300

296301
elif self.typ == "int4range":
297-
a = str(long_tail_int(-2147483648, 2147483647, rng=rng))
298-
b = str(long_tail_int(-2147483648, 2147483647, rng=rng))
299-
s = f"[{a},{b})"
302+
a = long_tail_int(-2147483648, 2147483647, rng=rng)
303+
b = long_tail_int(-2147483648, 2147483647, rng=rng)
304+
lo, hi = min(a, b), max(a, b)
305+
s = f"[{lo},{hi})"
300306
return literal(s) if in_query else s
301307

302308
elif self.typ == "int8range":
303-
a = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
304-
b = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
305-
s = f"[{a},{b})"
309+
a = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
310+
b = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
311+
lo, hi = min(a, b), max(a, b)
312+
s = f"[{lo},{hi})"
306313
return literal(s) if in_query else s
307314

308315
elif self.typ == "map":

0 commit comments

Comments
 (0)