@@ -266,6 +266,94 @@ def test_abort_cleans_up_changelog_files(self):
266266 table_write .close ()
267267 table_commit .close ()
268268
def test_reject_changelog_producer_on_append_only_table(self):
    """Schema creation must fail for changelog producers when no primary keys are given.

    Builds a schema without primary keys and checks that
    ``Schema.from_pyarrow_schema`` raises ``ValueError`` for each of the
    'input', 'full-compaction' and 'lookup' changelog-producer modes.
    """
    append_schema = pa.schema([
        ('user_id', pa.int32()),
        ('item_id', pa.int64()),
        ('behavior', pa.string()),
        ('dt', pa.string()),
    ])
    for mode in ('input', 'full-compaction', 'lookup'):
        # subTest keeps iterating after a failure, so every offending mode
        # is reported instead of only the first one.
        with self.subTest(mode=mode):
            with self.assertRaises(
                    ValueError,
                    msg=f"Should reject changelog-producer={mode} without PKs"):
                Schema.from_pyarrow_schema(
                    append_schema,
                    partition_keys=['dt'],
                    options={'changelog-producer': mode, 'bucket': '1'},
                )
283+
def test_changelog_producer_none_allowed_on_append_only_table(self):
    """A schema without primary keys is accepted when changelog-producer is 'none'."""
    columns = [
        ('user_id', pa.int32()),
        ('item_id', pa.int64()),
        ('behavior', pa.string()),
        ('dt', pa.string()),
    ]
    table_options = {'changelog-producer': 'none', 'bucket': '1'}

    built = Schema.from_pyarrow_schema(
        pa.schema(columns),
        partition_keys=['dt'],
        options=table_options,
    )

    # Reaching this point without an exception is the real check; the
    # assertion just confirms a schema object came back.
    self.assertIsNotNone(built)
297+
def test_input_mode_changelog_uses_parquet_regardless_of_data_format(self):
    """Changelog files are parquet even when the data file format is orc.

    Creates a table with ``changelog-producer=input`` and
    ``file.format=orc`` (no ``changelog-file.format``), writes the sample
    data, commits, and then checks the file extensions found in the
    ``dt=p1/bucket-0`` directory: ``changelog-*`` files must end with
    ``.parquet`` and ``data-*`` files must end with ``.orc``.
    """
    table = self._create_table(
        'test_input_changelog_format',
        options={'changelog-producer': 'input', 'bucket': '1', 'file.format': 'orc'},
    )
    write_builder = table.new_batch_write_builder()
    table_write = write_builder.new_write()
    table_commit = write_builder.new_commit()

    # Close the writer and committer even if the commit or an assertion
    # fails, so a failing test does not leave them open.
    try:
        table_write.write_arrow(self._sample_data())
        table_commit.commit(table_write.prepare_commit())

        bucket_dir = os.path.join(
            self.warehouse, 'default.db', 'test_input_changelog_format', 'dt=p1', 'bucket-0')

        changelog_files = glob.glob(os.path.join(bucket_dir, 'changelog-*'))
        self.assertGreater(len(changelog_files), 0, "Should have changelog files")
        for f in changelog_files:
            self.assertTrue(f.endswith('.parquet'),
                            f"Changelog file should use parquet format by default, got {f} ")

        data_files = glob.glob(os.path.join(bucket_dir, 'data-*'))
        self.assertGreater(len(data_files), 0, "Should have data files")
        for f in data_files:
            self.assertTrue(f.endswith('.orc'),
                            f"Data file should use orc format, got {f} ")
    finally:
        table_write.close()
        table_commit.close()
326+
def test_input_mode_changelog_respects_changelog_file_format(self):
    """An explicit ``changelog-file.format`` controls the changelog file extension.

    Creates a table with ``changelog-producer=input``,
    ``file.format=parquet`` and ``changelog-file.format=orc``, writes the
    sample data, commits, and then checks the file extensions found in the
    ``dt=p1/bucket-0`` directory: ``changelog-*`` files must end with
    ``.orc`` and ``data-*`` files must end with ``.parquet``.
    """
    table = self._create_table(
        'test_input_cl_file_fmt',
        options={'changelog-producer': 'input', 'bucket': '1',
                 'file.format': 'parquet', 'changelog-file.format': 'orc'},
    )
    write_builder = table.new_batch_write_builder()
    table_write = write_builder.new_write()
    table_commit = write_builder.new_commit()

    # Close the writer and committer even if the commit or an assertion
    # fails, so a failing test does not leave them open.
    try:
        table_write.write_arrow(self._sample_data())
        table_commit.commit(table_write.prepare_commit())

        bucket_dir = os.path.join(
            self.warehouse, 'default.db', 'test_input_cl_file_fmt', 'dt=p1', 'bucket-0')

        changelog_files = glob.glob(os.path.join(bucket_dir, 'changelog-*'))
        self.assertGreater(len(changelog_files), 0, "Should have changelog files")
        for f in changelog_files:
            self.assertTrue(f.endswith('.orc'),
                            f"Changelog file should use orc format, got {f} ")

        data_files = glob.glob(os.path.join(bucket_dir, 'data-*'))
        self.assertGreater(len(data_files), 0, "Should have data files")
        for f in data_files:
            self.assertTrue(f.endswith('.parquet'),
                            f"Data file should use parquet format, got {f} ")
    finally:
        table_write.close()
        table_commit.close()
356+
269357
# Run this module's tests when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
0 commit comments