2222from pg8000 .native import literal
2323
2424from materialize .workload_replay .util import (
25- long_tail_choice ,
2625 long_tail_float ,
2726 long_tail_int ,
27+ long_tail_rank ,
2828 long_tail_text ,
2929)
3030
@@ -41,8 +41,9 @@ def __init__(
4141 self .default = default
4242 self .chars = string .ascii_letters + string .digits
4343 self .data_shape = data_shape
44- if data_shape :
45- assert typ in ("text" , "bytea" ), f"Can't create text shape for type { typ } "
44+
45+ self ._years = list (range (2019 , 2026 ))
46+ self ._seq_counter = 0
4647
4748 self ._hot_strings = [
4849 f"{ name } _a" ,
@@ -56,6 +57,43 @@ def __init__(
5657 "NULL" ,
5758 ]
5859
60+ def _shaped_text (self , rng : random .Random ) -> str | None :
61+ """Generate text according to data_shape, or None if not applicable."""
62+ if self .data_shape == "datetime" :
63+ return self ._random_datetime (rng )
64+ elif self .data_shape == "random" :
65+ length = rng .randrange (5 , 40 )
66+ return "" .join (rng .choice (self .chars ) for _ in range (length ))
67+ elif self .data_shape == "uuid" :
68+ return str (uuid .UUID (int = rng .getrandbits (128 ), version = 4 ))
69+ elif self .data_shape == "sequential" :
70+ self ._seq_counter += 1
71+ return f"{ self .name } _{ self ._seq_counter } "
72+ elif self .data_shape == "zipfian" :
73+ rank = long_tail_rank (n = 10000 , a = 1.3 , rng = rng )
74+ return f"{ self .name } _{ rank } "
75+ elif self .data_shape is not None and self .data_shape != "duration" :
76+ raise ValueError (f"Unhandled data_shape { self .data_shape !r} " )
77+ return None
78+
79+ def _shaped_float (self , rng : random .Random ) -> float | None :
80+ """Generate a float according to data_shape, or None if not applicable."""
81+ if self .data_shape == "duration" :
82+ return round (rng .uniform (10.0 , 1800.0 ), 2 )
83+ return None
84+
def _random_date(self, rng: random.Random) -> str:
    """Return a random ``YYYY-MM-DD`` date string.

    The year is picked from ``self._years``, the month from 1-12 and the day
    from 1-28. Days 29-31 are never produced, so every result is a valid
    calendar date regardless of month.
    """
    # Draw order (year, month, day) determines how the rng stream is consumed.
    year = rng.choice(self._years)
    month = rng.randrange(1, 13)
    day = rng.randrange(1, 29)
    return f"{year}-{month:02}-{day:02}"
89+
def _random_datetime(self, rng: random.Random) -> str:
    """Return a random ``YYYY-MM-DDTHH:MM:SSZ`` datetime string.

    The date part comes from ``self._random_date``; hour, minute and second
    are then drawn from their full ranges (0-23, 0-59, 0-59).
    """
    # The date is generated before the time fields, matching rng draw order.
    date_part = self._random_date(rng)
    hour = rng.randrange(0, 24)
    minute = rng.randrange(0, 60)
    second = rng.randrange(0, 60)
    return f"{date_part}T{hour:02}:{minute:02}:{second:02}Z"
96+
5997 def avro_type (self ) -> str | list [str ]:
6098 """Return the Avro type for this column."""
6199 result = self .typ
@@ -96,21 +134,21 @@ def kafka_value(self, rng: random.Random) -> Any:
96134 return long_tail_int (0 , 18446744073709551615 , rng = rng )
97135
98136 elif self .typ in ("float" , "double precision" , "numeric" ):
137+ shaped = self ._shaped_float (rng )
138+ if shaped is not None :
139+ return shaped
99140 return long_tail_float (- 1_000_000_000.0 , 1_000_000_000.0 , rng = rng )
100141
101142 elif self .typ in ("text" , "bytea" ):
102- if self .data_shape == "datetime" :
103- year = long_tail_choice (
104- [2023 , 2024 , 2025 , 2022 , 2021 , 2020 , 2019 ], hot_prob = 0.9 , rng = rng
105- )
106- return literal (
107- f"{ year } -{ rng .randrange (1 , 13 ):02} -{ rng .randrange (1 , 29 ):02} T{ rng .randrange (0 , 23 ):02} :{ rng .randrange (0 , 59 ):02} :{ rng .randrange (0 , 59 ):02} Z"
108- )
109- elif self .data_shape :
110- raise ValueError (f"Unhandled text shape { self .data_shape } " )
143+ shaped = self ._shaped_text (rng )
144+ if shaped is not None :
145+ return literal (shaped )
111146 return literal (long_tail_text (self .chars , 100 , self ._hot_strings , rng = rng ))
112147
113148 elif self .typ in ("character" , "character varying" ):
149+ shaped = self ._shaped_text (rng )
150+ if shaped is not None :
151+ return literal (shaped )
114152 return literal (long_tail_text (self .chars , 10 , self ._hot_strings , rng = rng ))
115153
116154 elif self .typ == "uuid" :
@@ -123,23 +161,15 @@ def kafka_value(self, rng: random.Random) -> Any:
123161 return json .dumps (result )
124162
125163 elif self .typ in ("timestamp with time zone" , "timestamp without time zone" ):
126- now = 1700000000000 # doesn't need to be exact
127- if rng .random () < 0.9 :
128- return now + long_tail_int (- 86_400_000 , 86_400_000 , rng = rng )
129- else :
130- return rng .randrange (0 , 9223372036854775807 )
164+ # Epoch millis spread uniformly across 2019–2025
165+ # 2019-01-01 = 1546300800000, 2026-01-01 = 1767225600000
166+ return rng .randrange (1546300800000 , 1767225600000 )
131167
132168 elif self .typ == "mz_timestamp" :
133- year = long_tail_choice (
134- [2023 , 2024 , 2025 , 2022 , 2021 , 2020 , 2019 ], hot_prob = 0.9 , rng = rng
135- )
136- return literal (f"{ year } -{ rng .randrange (1 , 13 )} -{ rng .randrange (1 , 29 )} " )
169+ return literal (self ._random_date (rng ))
137170
138171 elif self .typ == "date" :
139- year = long_tail_choice (
140- [2023 , 2024 , 2025 , 2022 , 2021 , 2020 , 2019 ], hot_prob = 0.9 , rng = rng
141- )
142- return literal (f"{ year } -{ rng .randrange (1 , 13 )} -{ rng .randrange (1 , 29 )} " )
172+ return literal (self ._random_date (rng ))
143173
144174 elif self .typ == "time" :
145175 if rng .random () < 0.8 :
@@ -150,19 +180,22 @@ def kafka_value(self, rng: random.Random) -> Any:
150180 )
151181
152182 elif self .typ == "int2range" :
153- a = str (long_tail_int (- 32768 , 32767 , rng = rng ))
154- b = str (long_tail_int (- 32768 , 32767 , rng = rng ))
155- return literal (f"[{ a } ,{ b } )" )
183+ a = long_tail_int (- 32768 , 32767 , rng = rng )
184+ b = long_tail_int (- 32768 , 32767 , rng = rng )
185+ lo , hi = min (a , b ), max (a , b )
186+ return literal (f"[{ lo } ,{ hi } )" )
156187
157188 elif self .typ == "int4range" :
158- a = str (long_tail_int (- 2147483648 , 2147483647 , rng = rng ))
159- b = str (long_tail_int (- 2147483648 , 2147483647 , rng = rng ))
160- return literal (f"[{ a } ,{ b } )" )
189+ a = long_tail_int (- 2147483648 , 2147483647 , rng = rng )
190+ b = long_tail_int (- 2147483648 , 2147483647 , rng = rng )
191+ lo , hi = min (a , b ), max (a , b )
192+ return literal (f"[{ lo } ,{ hi } )" )
161193
162194 elif self .typ == "int8range" :
163- a = str (long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng ))
164- b = str (long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng ))
165- return literal (f"[{ a } ,{ b } )" )
195+ a = long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng )
196+ b = long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng )
197+ lo , hi = min (a , b ), max (a , b )
198+ return literal (f"[{ lo } ,{ hi } )" )
166199
167200 elif self .typ == "map" :
168201 return {
@@ -216,27 +249,23 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
216249 return str (val ) if in_query else val
217250
218251 elif self .typ in ("float" , "double precision" , "numeric" ):
252+ shaped = self ._shaped_float (rng )
253+ if shaped is not None :
254+ return str (shaped ) if in_query else shaped
219255 val = long_tail_float (- 1_000_000_000.0 , 1_000_000_000.0 , rng = rng )
220256 return str (val ) if in_query else val
221257
222258 elif self .typ in ("text" , "bytea" ):
223- if self .data_shape == "datetime" :
224- year = long_tail_choice (
225- [2023 , 2024 , 2025 , 2022 , 2021 , 2020 , 2019 ], hot_prob = 0.9 , rng = rng
226- )
227- s = (
228- f"{ year } -{ rng .randrange (1 , 13 ):02} -{ rng .randrange (1 , 29 ):02} "
229- f"T{ rng .randrange (0 , 23 ):02} :{ rng .randrange (0 , 59 ):02} :{ rng .randrange (0 , 59 ):02} Z"
230- )
231- return literal (s ) if in_query else s
232-
233- elif self .data_shape :
234- raise ValueError (f"Unhandled text shape { self .data_shape } " )
235-
259+ shaped = self ._shaped_text (rng )
260+ if shaped is not None :
261+ return literal (shaped ) if in_query else shaped
236262 s = long_tail_text (self .chars , 100 , self ._hot_strings , rng = rng )
237263 return literal (s ) if in_query else s
238264
239265 elif self .typ in ("character" , "character varying" ):
266+ shaped = self ._shaped_text (rng )
267+ if shaped is not None :
268+ return literal (shaped ) if in_query else shaped
240269 s = long_tail_text (self .chars , 10 , self ._hot_strings , rng = rng )
241270 return literal (s ) if in_query else s
242271
@@ -254,24 +283,15 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
254283 return json .dumps (obj )
255284
256285 elif self .typ in ("timestamp with time zone" , "timestamp without time zone" ):
257- year = long_tail_choice (
258- [2023 , 2024 , 2025 , 2022 , 2021 , 2020 , 2019 ], hot_prob = 0.9 , rng = rng
259- )
260- s = f"{ year } -{ rng .randrange (1 , 13 )} -{ rng .randrange (1 , 29 )} "
286+ s = self ._random_date (rng )
261287 return literal (s ) if in_query else s
262288
263289 elif self .typ == "mz_timestamp" :
264- year = long_tail_choice (
265- [2023 , 2024 , 2025 , 2022 , 2021 , 2020 , 2019 ], hot_prob = 0.9 , rng = rng
266- )
267- s = f"{ year } -{ rng .randrange (1 , 13 )} -{ rng .randrange (1 , 29 )} "
290+ s = self ._random_date (rng )
268291 return literal (s ) if in_query else s
269292
270293 elif self .typ == "date" :
271- year = long_tail_choice (
272- [2023 , 2024 , 2025 , 2022 , 2021 , 2020 , 2019 ], hot_prob = 0.9 , rng = rng
273- )
274- s = f"{ year } -{ rng .randrange (1 , 13 )} -{ rng .randrange (1 , 29 )} "
294+ s = self ._random_date (rng )
275295 return literal (s ) if in_query else s
276296
277297 elif self .typ == "time" :
@@ -288,21 +308,24 @@ def value(self, rng: random.Random, in_query: bool = True) -> Any:
288308 return literal (s ) if in_query else s
289309
290310 elif self .typ == "int2range" :
291- a = str (long_tail_int (- 32768 , 32767 , rng = rng ))
292- b = str (long_tail_int (- 32768 , 32767 , rng = rng ))
293- s = f"[{ a } ,{ b } )"
311+ a = long_tail_int (- 32768 , 32767 , rng = rng )
312+ b = long_tail_int (- 32768 , 32767 , rng = rng )
313+ lo , hi = min (a , b ), max (a , b )
314+ s = f"[{ lo } ,{ hi } )"
294315 return literal (s ) if in_query else s
295316
296317 elif self .typ == "int4range" :
297- a = str (long_tail_int (- 2147483648 , 2147483647 , rng = rng ))
298- b = str (long_tail_int (- 2147483648 , 2147483647 , rng = rng ))
299- s = f"[{ a } ,{ b } )"
318+ a = long_tail_int (- 2147483648 , 2147483647 , rng = rng )
319+ b = long_tail_int (- 2147483648 , 2147483647 , rng = rng )
320+ lo , hi = min (a , b ), max (a , b )
321+ s = f"[{ lo } ,{ hi } )"
300322 return literal (s ) if in_query else s
301323
302324 elif self .typ == "int8range" :
303- a = str (long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng ))
304- b = str (long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng ))
305- s = f"[{ a } ,{ b } )"
325+ a = long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng )
326+ b = long_tail_int (- 9223372036854775808 , 9223372036854775807 , rng = rng )
327+ lo , hi = min (a , b ), max (a , b )
328+ s = f"[{ lo } ,{ hi } )"
306329 return literal (s ) if in_query else s
307330
308331 elif self .typ == "map" :
0 commit comments