Skip to content

Commit f137adc

Browse files
committed
workload-replay: Improvements & feature-benchmark StartupLoaded fix
1 parent 26e1a33 commit f137adc

File tree

11 files changed

+637
-356
lines changed

11 files changed

+637
-356
lines changed

ci/nightly/pipeline.template.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -264,7 +264,7 @@ steps:
264264
- common-ancestor
265265

266266
- id: workload-replay
267-
label: "Workload Replay (1% initial data)"
267+
label: "Workload Replay (10% initial data)"
268268
depends_on: build-x86_64
269269
timeout_in_minutes: 240
270270
parallelism: 5
@@ -273,7 +273,7 @@ steps:
273273
composition: workload-replay
274274
run: benchmark
275275
args:
276-
- --factor-initial-data=0.01
276+
- --factor-initial-data=0.1
277277
- --compare-against
278278
- common-ancestor
279279
agents:

ci/release-qualification/pipeline.template.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,7 @@ steps:
195195
- 1200
196196

197197
- id: long-workload-replay
198-
label: "Long Workload Replay (10% initial data)"
198+
label: "Long Workload Replay (100% initial data)"
199199
depends_on: build-x86_64
200200
timeout_in_minutes: 1200
201201
parallelism: 3
@@ -204,7 +204,7 @@ steps:
204204
composition: workload-replay
205205
run: benchmark
206206
args:
207-
- --factor-initial-data=0.1
207+
- --factor-initial-data=1
208208
- --runtime=3600
209209
- --compare-against
210210
- common-ancestor

misc/python/materialize/feature_benchmark/benchmark.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,9 @@ def create_scenario_instance(self) -> Scenario:
6565
elif float(self._scale) > 0:
6666
scale = float(self._scale)
6767

68+
if self._scenario_cls.MAX_SCALE is not None:
69+
scale = min(scale, self._scenario_cls.MAX_SCALE)
70+
6871
scenario_class = self._scenario_cls
6972
return scenario_class(
7073
scale=scale,

misc/python/materialize/feature_benchmark/scenario.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
class RootScenario:
2222
SCALE: float = 6
2323
FIXED_SCALE: bool = False # Will --scale=N have effect on the scenario
24+
MAX_SCALE: float | None = None # Cap scale to this value when set
2425
RELATIVE_THRESHOLD: dict[MeasurementType, float] = {
2526
MeasurementType.WALLCLOCK: 0.10,
2627
# Increased the other measurements since they are easy to regress now

misc/python/materialize/feature_benchmark/scenarios/benchmark_main.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2205,6 +2205,8 @@ class StartupLoaded(Scenario):
22052205
"""Measure the time it takes to restart a populated Mz instance and have all the dataflows be ready to return something"""
22062206

22072207
SCALE = 1 # 10 objects of each kind
2208+
# Cannot scale to 100s of objects
2209+
MAX_SCALE = 1.5
22082210

22092211
def shared(self) -> Action:
22102212
return TdAction(

misc/python/materialize/workload_replay/column.py

Lines changed: 91 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -22,9 +22,9 @@
2222
from pg8000.native import literal
2323

2424
from materialize.workload_replay.util import (
25-
long_tail_choice,
2625
long_tail_float,
2726
long_tail_int,
27+
long_tail_rank,
2828
long_tail_text,
2929
)
3030

@@ -41,8 +41,9 @@ def __init__(
4141
self.default = default
4242
self.chars = string.ascii_letters + string.digits
4343
self.data_shape = data_shape
44-
if data_shape:
45-
assert typ in ("text", "bytea"), f"Can't create text shape for type {typ}"
44+
45+
self._years = list(range(2019, 2026))
46+
self._seq_counter = 0
4647

4748
self._hot_strings = [
4849
f"{name}_a",
@@ -56,6 +57,43 @@ def __init__(
5657
"NULL",
5758
]
5859

60+
def _shaped_text(self, rng: random.Random) -> str | None:
61+
"""Generate text according to data_shape, or None if not applicable."""
62+
if self.data_shape == "datetime":
63+
return self._random_datetime(rng)
64+
elif self.data_shape == "random":
65+
length = rng.randrange(5, 40)
66+
return "".join(rng.choice(self.chars) for _ in range(length))
67+
elif self.data_shape == "uuid":
68+
return str(uuid.UUID(int=rng.getrandbits(128), version=4))
69+
elif self.data_shape == "sequential":
70+
self._seq_counter += 1
71+
return f"{self.name}_{self._seq_counter}"
72+
elif self.data_shape == "zipfian":
73+
rank = long_tail_rank(n=10000, a=1.3, rng=rng)
74+
return f"{self.name}_{rank}"
75+
elif self.data_shape is not None and self.data_shape != "duration":
76+
raise ValueError(f"Unhandled data_shape {self.data_shape!r}")
77+
return None
78+
79+
def _shaped_float(self, rng: random.Random) -> float | None:
80+
"""Generate a float according to data_shape, or None if not applicable."""
81+
if self.data_shape == "duration":
82+
return round(rng.uniform(10.0, 1800.0), 2)
83+
return None
84+
85+
def _random_date(self, rng: random.Random) -> str:
86+
"""Generate a uniformly random date string."""
87+
year = rng.choice(self._years)
88+
return f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
89+
90+
def _random_datetime(self, rng: random.Random) -> str:
91+
"""Generate a uniformly random datetime string."""
92+
return (
93+
f"{self._random_date(rng)}"
94+
f"T{rng.randrange(0, 24):02}:{rng.randrange(0, 60):02}:{rng.randrange(0, 60):02}Z"
95+
)
96+
5997
def avro_type(self) -> str | list[str]:
6098
"""Return the Avro type for this column."""
6199
result = self.typ
@@ -96,21 +134,21 @@ def kafka_value(self, rng: random.Random) -> Any:
96134
return long_tail_int(0, 18446744073709551615, rng=rng)
97135

98136
elif self.typ in ("float", "double precision", "numeric"):
137+
shaped = self._shaped_float(rng)
138+
if shaped is not None:
139+
return shaped
99140
return long_tail_float(-1_000_000_000.0, 1_000_000_000.0, rng=rng)
100141

101142
elif self.typ in ("text", "bytea"):
102-
if self.data_shape == "datetime":
103-
year = long_tail_choice(
104-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
105-
)
106-
return literal(
107-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
108-
)
109-
elif self.data_shape:
110-
raise ValueError(f"Unhandled text shape {self.data_shape}")
143+
shaped = self._shaped_text(rng)
144+
if shaped is not None:
145+
return literal(shaped)
111146
return literal(long_tail_text(self.chars, 100, self._hot_strings, rng=rng))
112147

113148
elif self.typ in ("character", "character varying"):
149+
shaped = self._shaped_text(rng)
150+
if shaped is not None:
151+
return literal(shaped)
114152
return literal(long_tail_text(self.chars, 10, self._hot_strings, rng=rng))
115153

116154
elif self.typ == "uuid":
@@ -123,23 +161,15 @@ def kafka_value(self, rng: random.Random) -> Any:
123161
return json.dumps(result)
124162

125163
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
126-
now = 1700000000000 # doesn't need to be exact
127-
if rng.random() < 0.9:
128-
return now + long_tail_int(-86_400_000, 86_400_000, rng=rng)
129-
else:
130-
return rng.randrange(0, 9223372036854775807)
164+
# Epoch millis spread uniformly across 2019–2025
165+
# 2019-01-01 = 1546300800000, 2026-01-01 = 1767225600000
166+
return rng.randrange(1546300800000, 1767225600000)
131167

132168
elif self.typ == "mz_timestamp":
133-
year = long_tail_choice(
134-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
135-
)
136-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
169+
return literal(self._random_date(rng))
137170

138171
elif self.typ == "date":
139-
year = long_tail_choice(
140-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
141-
)
142-
return literal(f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}")
172+
return literal(self._random_date(rng))
143173

144174
elif self.typ == "time":
145175
if rng.random() < 0.8:
@@ -150,19 +180,22 @@ def kafka_value(self, rng: random.Random) -> Any:
150180
)
151181

152182
elif self.typ == "int2range":
153-
a = str(long_tail_int(-32768, 32767, rng=rng))
154-
b = str(long_tail_int(-32768, 32767, rng=rng))
155-
return literal(f"[{a},{b})")
183+
a = long_tail_int(-32768, 32767, rng=rng)
184+
b = long_tail_int(-32768, 32767, rng=rng)
185+
lo, hi = min(a, b), max(a, b)
186+
return literal(f"[{lo},{hi})")
156187

157188
elif self.typ == "int4range":
158-
a = str(long_tail_int(-2147483648, 2147483647, rng=rng))
159-
b = str(long_tail_int(-2147483648, 2147483647, rng=rng))
160-
return literal(f"[{a},{b})")
189+
a = long_tail_int(-2147483648, 2147483647, rng=rng)
190+
b = long_tail_int(-2147483648, 2147483647, rng=rng)
191+
lo, hi = min(a, b), max(a, b)
192+
return literal(f"[{lo},{hi})")
161193

162194
elif self.typ == "int8range":
163-
a = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
164-
b = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
165-
return literal(f"[{a},{b})")
195+
a = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
196+
b = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
197+
lo, hi = min(a, b), max(a, b)
198+
return literal(f"[{lo},{hi})")
166199

167200
elif self.typ == "map":
168201
return {
@@ -216,27 +249,23 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
216249
return str(val) if in_query else val
217250

218251
elif self.typ in ("float", "double precision", "numeric"):
252+
shaped = self._shaped_float(rng)
253+
if shaped is not None:
254+
return str(shaped) if in_query else shaped
219255
val = long_tail_float(-1_000_000_000.0, 1_000_000_000.0, rng=rng)
220256
return str(val) if in_query else val
221257

222258
elif self.typ in ("text", "bytea"):
223-
if self.data_shape == "datetime":
224-
year = long_tail_choice(
225-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
226-
)
227-
s = (
228-
f"{year}-{rng.randrange(1, 13):02}-{rng.randrange(1, 29):02}"
229-
f"T{rng.randrange(0, 23):02}:{rng.randrange(0, 59):02}:{rng.randrange(0, 59):02}Z"
230-
)
231-
return literal(s) if in_query else s
232-
233-
elif self.data_shape:
234-
raise ValueError(f"Unhandled text shape {self.data_shape}")
235-
259+
shaped = self._shaped_text(rng)
260+
if shaped is not None:
261+
return literal(shaped) if in_query else shaped
236262
s = long_tail_text(self.chars, 100, self._hot_strings, rng=rng)
237263
return literal(s) if in_query else s
238264

239265
elif self.typ in ("character", "character varying"):
266+
shaped = self._shaped_text(rng)
267+
if shaped is not None:
268+
return literal(shaped) if in_query else shaped
240269
s = long_tail_text(self.chars, 10, self._hot_strings, rng=rng)
241270
return literal(s) if in_query else s
242271

@@ -254,24 +283,15 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
254283
return json.dumps(obj)
255284

256285
elif self.typ in ("timestamp with time zone", "timestamp without time zone"):
257-
year = long_tail_choice(
258-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
259-
)
260-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
286+
s = self._random_date(rng)
261287
return literal(s) if in_query else s
262288

263289
elif self.typ == "mz_timestamp":
264-
year = long_tail_choice(
265-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
266-
)
267-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
290+
s = self._random_date(rng)
268291
return literal(s) if in_query else s
269292

270293
elif self.typ == "date":
271-
year = long_tail_choice(
272-
[2023, 2024, 2025, 2022, 2021, 2020, 2019], hot_prob=0.9, rng=rng
273-
)
274-
s = f"{year}-{rng.randrange(1, 13)}-{rng.randrange(1, 29)}"
294+
s = self._random_date(rng)
275295
return literal(s) if in_query else s
276296

277297
elif self.typ == "time":
@@ -288,21 +308,24 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
288308
return literal(s) if in_query else s
289309

290310
elif self.typ == "int2range":
291-
a = str(long_tail_int(-32768, 32767, rng=rng))
292-
b = str(long_tail_int(-32768, 32767, rng=rng))
293-
s = f"[{a},{b})"
311+
a = long_tail_int(-32768, 32767, rng=rng)
312+
b = long_tail_int(-32768, 32767, rng=rng)
313+
lo, hi = min(a, b), max(a, b)
314+
s = f"[{lo},{hi})"
294315
return literal(s) if in_query else s
295316

296317
elif self.typ == "int4range":
297-
a = str(long_tail_int(-2147483648, 2147483647, rng=rng))
298-
b = str(long_tail_int(-2147483648, 2147483647, rng=rng))
299-
s = f"[{a},{b})"
318+
a = long_tail_int(-2147483648, 2147483647, rng=rng)
319+
b = long_tail_int(-2147483648, 2147483647, rng=rng)
320+
lo, hi = min(a, b), max(a, b)
321+
s = f"[{lo},{hi})"
300322
return literal(s) if in_query else s
301323

302324
elif self.typ == "int8range":
303-
a = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
304-
b = str(long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng))
305-
s = f"[{a},{b})"
325+
a = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
326+
b = long_tail_int(-9223372036854775808, 9223372036854775807, rng=rng)
327+
lo, hi = min(a, b), max(a, b)
328+
s = f"[{lo},{hi})"
306329
return literal(s) if in_query else s
307330

308331
elif self.typ == "map":

0 commit comments

Comments
 (0)