 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import pyarrow as pa
+import pyarrow.types as pa_types
 
 from pypaimon.schema.data_types import PyarrowFieldParser
 from pypaimon.snapshot.snapshot import BATCH_COMMIT_IDENTIFIER
     from ray.data import Dataset
 
 
+def _is_binary_type_compatible(input_type: pa.DataType, table_type: pa.DataType) -> bool:
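+    # binary() and large_binary() differ only in offset width (32-bit vs. 64-bit),
+    # so a mismatch in either direction is treated as compatible.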
+    if pa_types.is_binary(input_type) and pa_types.is_large_binary(table_type):
+        return True
+    if pa_types.is_large_binary(input_type) and pa_types.is_binary(table_type):
+        return True
+    return False
+
+
 class TableWrite:
     def __init__(self, table, commit_user):
         from pypaimon.table.file_store_table import FileStoreTable
@@ -44,8 +53,46 @@ def write_arrow(self, table: pa.Table):
         for batch in batches_iterator:
             self.write_arrow_batch(batch)
 
+    def _convert_binary_types(self, data: pa.RecordBatch) -> pa.RecordBatch:
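+        # Cast binary()/large_binary() columns to the exact binary width declared
+        # in the table schema; all other columns pass through untouched.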
+        write_cols = self.file_store_write.write_cols
+        table_schema = self.table_pyarrow_schema
+
+        converted_arrays = []
+        needs_conversion = False
+
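+        # Only consider columns that are actually being written and exist in the table schema.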
+        for i, field in enumerate(data.schema):
+            array = data.column(i)
+            expected_type = None
+
+            if write_cols is None or field.name in write_cols:
+                try:
+                    expected_type = table_schema.field(field.name).type
+                except KeyError:
+                    pass
+
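+            # Cast only when the sole difference is binary() vs. large_binary().
+            # Narrowing to binary() can fail once a column's data no longer fits
+            # the 32-bit offset limit (~2 GB per array).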
+            if expected_type and field.type != expected_type and _is_binary_type_compatible(field.type, expected_type):
+                try:
+                    array = pa.compute.cast(array, expected_type)
+                    needs_conversion = True
+                except (pa.ArrowInvalid, pa.ArrowCapacityError, ValueError) as e:
+                    direction = f"{field.type} to {expected_type}"
+                    raise ValueError(
+                        f"Failed to convert field '{field.name}' from {direction}. "
+                        f"If converting to binary(), ensure no value exceeds 2GB limit: {e}"
+                    ) from e
+
+            converted_arrays.append(array)
+
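+        # Rebuild the batch only if at least one column was cast, preserving
+        # field names and nullability from the input schema.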
+        if needs_conversion:
+            new_fields = [pa.field(field.name, arr.type, nullable=field.nullable)
+                          for field, arr in zip(data.schema, converted_arrays)]
+            return pa.RecordBatch.from_arrays(converted_arrays, schema=pa.schema(new_fields))
+
+        return data
+
     def write_arrow_batch(self, data: pa.RecordBatch):
         self._validate_pyarrow_schema(data.schema)
+        data = self._convert_binary_types(data)
         partitions, buckets = self.row_key_extractor.extract_partition_bucket_batch(data)
 
         partition_bucket_groups = defaultdict(list)
@@ -59,7 +106,7 @@ def write_arrow_batch(self, data: pa.RecordBatch):
 
     def write_pandas(self, dataframe):
         pa_schema = PyarrowFieldParser.from_paimon_schema(self.table.table_schema.fields)
-        record_batch = pa.RecordBatch.from_pandas(dataframe, schema=pa_schema)
+        record_batch = pa.RecordBatch.from_pandas(dataframe, schema=pa_schema, preserve_index=False)
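+        # preserve_index=False keeps the pandas index from being written as an extra column.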
         return self.write_arrow_batch(record_batch)
 
     def with_write_type(self, write_cols: List[str]):
@@ -81,6 +128,11 @@ def write_ray(
         """
         Write a Ray Dataset to Paimon table.
 
+        .. note::
+            Ray Data converts ``large_binary()`` to ``binary()`` when reading.
+            This method automatically converts ``binary()`` back to ``large_binary()``
+            to match the table schema.
+
         Args:
             dataset: Ray Dataset to write. This is a distributed data collection
                 from Ray Data (ray.data.Dataset).
@@ -102,11 +154,50 @@ def close(self):
         self.file_store_write.close()
 
     def _validate_pyarrow_schema(self, data_schema: pa.Schema):
-        if data_schema != self.table_pyarrow_schema and data_schema.names != self.file_store_write.write_cols:
-            raise ValueError(f"Input schema isn't consistent with table schema and write cols. "
-                             f"Input schema is: {data_schema} "
-                             f"Table schema is: {self.table_pyarrow_schema} "
-                             f"Write cols is: {self.file_store_write.write_cols}")
+        write_cols = self.file_store_write.write_cols
+
+        if write_cols is None:
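+            # No explicit write_cols: the input must cover the full table schema with
+            # matching names and order; types may differ only in binary width.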
+            if data_schema.names != self.table_pyarrow_schema.names:
+                raise ValueError(
+                    f"Input schema doesn't match table schema. "
+                    f"Field names and order must exactly match.\n"
+                    f"Input schema: {data_schema}\n"
+                    f"Table schema: {self.table_pyarrow_schema}"
+                )
+            for input_field, table_field in zip(data_schema, self.table_pyarrow_schema):
+                if input_field.type != table_field.type:
+                    if not _is_binary_type_compatible(input_field.type, table_field.type):
+                        raise ValueError(
+                            f"Input schema doesn't match table schema. "
+                            f"Field '{input_field.name}' type mismatch.\n"
+                            f"Input type: {input_field.type}\n"
+                            f"Table type: {table_field.type}\n"
+                            f"Input schema: {data_schema}\n"
+                            f"Table schema: {self.table_pyarrow_schema}"
+                        )
+        else:
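+            # Explicit write_cols: validate only the selected columns, in write_cols order.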
+            if list(data_schema.names) != write_cols:
+                raise ValueError(
+                    f"Input schema field names don't match write_cols. "
+                    f"Field names and order must match write_cols.\n"
+                    f"Input schema names: {list(data_schema.names)}\n"
+                    f"Write cols: {write_cols}"
+                )
+            table_field_map = {field.name: field for field in self.table_pyarrow_schema}
+            for field_name in write_cols:
+                if field_name not in table_field_map:
+                    raise ValueError(
+                        f"Field '{field_name}' in write_cols is not in table schema."
+                    )
+                input_field = data_schema.field(field_name)
+                table_field = table_field_map[field_name]
+                if input_field.type != table_field.type:
+                    if not _is_binary_type_compatible(input_field.type, table_field.type):
+                        raise ValueError(
+                            f"Field '{field_name}' type mismatch.\n"
+                            f"Input type: {input_field.type}\n"
+                            f"Table type: {table_field.type}"
+                        )
 
 
 class BatchTableWrite(TableWrite):
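
A minimal, standalone sketch of the cast that _convert_binary_types relies on (plain pyarrow, no Paimon or Ray dependencies; the array values are illustrative only):

import pyarrow as pa
import pyarrow.compute as pc

# Readers such as Ray Data may hand back binary() columns even when the table
# schema declares large_binary(); widening 32-bit offsets to 64-bit is always safe.
arr = pa.array([b"\x00\x01", b"payload"], type=pa.binary())
widened = pc.cast(arr, pa.large_binary())
assert widened.type == pa.large_binary()

# Narrowing back to binary() also round-trips here, but it can raise
# ArrowCapacityError once a column's data exceeds the 32-bit offset limit (~2 GB).
narrowed = pc.cast(widened, pa.binary())
assert narrowed.to_pylist() == arr.to_pylist()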