Commit 08099eb

Author: xiaohongbo (committed)

[python] optimize schema validation and support binary/large_binary type conversion

Add docs for Ray converting large_binary to binary; fix test case failure; clean up code.
1 parent 0525bc0 commit 08099eb

8 files changed: +181, -9 lines

docs/content/program-api/python-api.md

Lines changed: 4 additions & 0 deletions

@@ -195,6 +195,8 @@ record_batch = ...
 table_write.write_arrow_batch(record_batch)

 # 2.4 Write Ray Dataset (requires ray to be installed)
+
+**Note:** Ray Data converts `large_binary()` to `binary()` when reading. `write_ray()` automatically converts `binary()` back to `large_binary()` to match the table schema.
 import ray
 ray_dataset = ray.data.read_json("/path/to/data.jsonl")
 table_write.write_ray(ray_dataset, overwrite=False, concurrency=2)

@@ -471,6 +473,8 @@ df = ray_dataset.to_pandas()
 - `**read_args`: Additional kwargs passed to the datasource (e.g., `per_task_row_limit`
   in Ray 2.52.0+).

+**Note:** Ray Data converts `large_binary()` to `binary()` when reading. When writing back via `write_ray()`, the conversion is handled automatically.
+
 **Ray Block Size Configuration:**

 If you need to configure Ray's block size (e.g., when Paimon splits exceed Ray's default
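For orientation, here is a minimal round-trip sketch of the behavior described in the notes above. It assumes `table_read`, `table_write`, and `splits` are already set up as in the surrounding examples and that the table declares a `data` column as `large_binary()`:

# Read the Paimon table into Ray Data; 'data' surfaces as binary()
ray_dataset = table_read.to_ray(splits)

# Write it back; write_ray() casts binary() back to large_binary()
# so the batches match the table schema before they are committed
table_write.write_ray(ray_dataset, overwrite=False, concurrency=2)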

paimon-python/pypaimon/read/datasource.py

Lines changed: 4 additions & 0 deletions

@@ -50,6 +50,10 @@ class RayDatasource(Datasource):
     This datasource enables distributed parallel reading of Paimon table splits,
     allowing Ray to read multiple splits concurrently across the cluster.
+
+    .. note::
+        Ray Data converts ``large_binary()`` to ``binary()`` when reading.
+        When writing back via :meth:`TableWrite.write_ray`, the conversion is handled automatically.
     """

     def __init__(self, table_read: TableRead, splits: List[Split]):

paimon-python/pypaimon/read/table_read.py

Lines changed: 5 additions & 0 deletions

@@ -194,6 +194,11 @@ def to_ray(
         **read_args,
     ) -> "ray.data.dataset.Dataset":
         """Convert Paimon table data to Ray Dataset.
+
+        .. note::
+            Ray Data converts ``large_binary()`` to ``binary()`` when reading.
+            When writing back via :meth:`write_ray`, the conversion is handled automatically.
+
         Args:
             splits: List of splits to read from the Paimon table.
             ray_remote_args: Optional kwargs passed to :func:`ray.remote` in read tasks.
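To make the note concrete, a small check mirroring the test added in this commit: a column declared as `large_binary()` in the Paimon schema comes back as `binary()` after passing through Ray Data (`table_read` and `splits` are assumed to be set up as elsewhere on this page):

import pyarrow as pa
import pyarrow.types as pa_types

ray_dataset = table_read.to_ray(splits)
arrow_table = pa.Table.from_pandas(ray_dataset.to_pandas())

data_type = arrow_table.schema.field('data').type
assert pa_types.is_binary(data_type)            # large_binary() was read back as binary()
assert not pa_types.is_large_binary(data_type)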

paimon-python/pypaimon/tests/py36/rest_ao_read_write_test.py

Lines changed: 1 addition & 1 deletion

@@ -577,7 +577,7 @@ def test_write_wrong_schema(self):

         with self.assertRaises(ValueError) as e:
             table_write.write_arrow_batch(record_batch)
-        self.assertTrue(str(e.exception).startswith("Input schema isn't consistent with table schema and write cols."))
+        self.assertTrue(str(e.exception).startswith("Input schema doesn't match table schema."))

     def test_write_wide_table_large_data(self):
         logging.basicConfig(level=logging.INFO)

paimon-python/pypaimon/tests/ray_data_test.py

Lines changed: 68 additions & 0 deletions

@@ -22,6 +22,7 @@
 import shutil

 import pyarrow as pa
+import pyarrow.types as pa_types
 import ray

 from pypaimon import CatalogFactory, Schema

@@ -115,6 +116,66 @@ def test_basic_ray_data_read(self):
         self.assertIsNotNone(ray_dataset, "Ray dataset should not be None")
         self.assertEqual(ray_dataset.count(), 5, "Should have 5 rows")

+    def test_ray_data_read_with_blob(self):
+        pa_schema = pa.schema([
+            ('id', pa.int32()),
+            ('name', pa.string()),
+            ('data', pa.large_binary()),  # BLOB type in Paimon
+        ])
+
+        schema = Schema.from_pyarrow_schema(
+            pa_schema,
+            options={
+                'row-tracking.enabled': 'true',
+                'data-evolution.enabled': 'true',
+                'blob-field': 'data',
+            }
+        )
+        import time
+        table_name = f'default.test_ray_blob_{int(time.time() * 1000000)}'
+
+        self.catalog.create_table(table_name, schema, False)
+        table = self.catalog.get_table(table_name)
+
+        test_data = pa.Table.from_pydict({
+            'id': [1, 2, 3, 4, 5],
+            'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
+            'data': [b'data1', b'data2', b'data3', b'data4', b'data5'],
+        }, schema=pa_schema)
+
+        write_builder = table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        writer.write_arrow(test_data)
+        commit_messages = writer.prepare_commit()
+        commit = write_builder.new_commit()
+        commit.commit(commit_messages)
+        writer.close()
+
+        read_builder = table.new_read_builder()
+        table_read = read_builder.new_read()
+        table_scan = read_builder.new_scan()
+        splits = table_scan.plan().splits()
+
+        ray_dataset = table_read.to_ray(splits, override_num_blocks=2)
+
+        self.assertIsNotNone(ray_dataset, "Ray dataset should not be None")
+
+        df_check = ray_dataset.to_pandas()
+        ray_table_check = pa.Table.from_pandas(df_check)
+        ray_schema_check = ray_table_check.schema
+        ray_data_field = ray_schema_check.field('data')
+
+        self.assertTrue(
+            pa_types.is_binary(ray_data_field.type),
+            f"Ray Dataset should convert large_binary to binary when reading, "
+            f"but got {ray_data_field.type}"
+        )
+        self.assertFalse(
+            pa_types.is_large_binary(ray_data_field.type),
+            f"Ray Dataset should NOT have large_binary type after reading, "
+            f"but got {ray_data_field.type}"
+        )
+
         # Test basic operations
         sample_data = ray_dataset.take(3)
         self.assertEqual(len(sample_data), 3, "Should have 3 sample rows")

@@ -130,6 +191,13 @@ def test_basic_ray_data_read(self):
             ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
             "Name column should match"
         )
+
+        data_values = [bytes(d) if d is not None else None for d in df_sorted['data']]
+        self.assertEqual(
+            data_values,
+            [b'data1', b'data2', b'data3', b'data4', b'data5'],
+            "Data column should match"
+        )

     def test_basic_ray_data_write(self):
         """Test basic Ray Data write from PyPaimon table."""

paimon-python/pypaimon/tests/reader_base_test.py

Lines changed: 1 addition & 1 deletion

@@ -273,7 +273,7 @@ def test_write_wrong_schema(self):

         with self.assertRaises(ValueError) as e:
             table_write.write_arrow_batch(record_batch)
-        self.assertTrue(str(e.exception).startswith("Input schema isn't consistent with table schema and write cols."))
+        self.assertTrue(str(e.exception).startswith("Input schema doesn't match table schema."))

     def test_reader_iterator(self):
         read_builder = self.table.new_read_builder()

paimon-python/pypaimon/tests/rest/rest_read_write_test.py

Lines changed: 1 addition & 1 deletion

@@ -438,7 +438,7 @@ def test_write_wrong_schema(self):

         with self.assertRaises(ValueError) as e:
             table_write.write_arrow_batch(record_batch)
-        self.assertTrue(str(e.exception).startswith("Input schema isn't consistent with table schema and write cols."))
+        self.assertTrue(str(e.exception).startswith("Input schema doesn't match table schema."))

     def test_reader_iterator(self):
         read_builder = self.table.new_read_builder()

paimon-python/pypaimon/write/table_write.py

Lines changed: 97 additions & 6 deletions

@@ -19,6 +19,7 @@
 from typing import TYPE_CHECKING, Any, Dict, List, Optional

 import pyarrow as pa
+import pyarrow.types as pa_types

 from pypaimon.schema.data_types import PyarrowFieldParser
 from pypaimon.snapshot.snapshot import BATCH_COMMIT_IDENTIFIER

@@ -29,6 +30,14 @@
     from ray.data import Dataset


+def _is_binary_type_compatible(input_type: pa.DataType, table_type: pa.DataType) -> bool:
+    if pa_types.is_binary(input_type) and pa_types.is_large_binary(table_type):
+        return True
+    if pa_types.is_large_binary(input_type) and pa_types.is_binary(table_type):
+        return True
+    return False
+
+
 class TableWrite:
     def __init__(self, table, commit_user):
         from pypaimon.table.file_store_table import FileStoreTable

@@ -44,8 +53,46 @@ def write_arrow(self, table: pa.Table):
         for batch in batches_iterator:
             self.write_arrow_batch(batch)

+    def _convert_binary_types(self, data: pa.RecordBatch) -> pa.RecordBatch:
+        write_cols = self.file_store_write.write_cols
+        table_schema = self.table_pyarrow_schema
+
+        converted_arrays = []
+        needs_conversion = False
+
+        for i, field in enumerate(data.schema):
+            array = data.column(i)
+            expected_type = None
+
+            if write_cols is None or field.name in write_cols:
+                try:
+                    expected_type = table_schema.field(field.name).type
+                except KeyError:
+                    pass
+
+            if expected_type and field.type != expected_type and _is_binary_type_compatible(field.type, expected_type):
+                try:
+                    array = pa.compute.cast(array, expected_type)
+                    needs_conversion = True
+                except (pa.ArrowInvalid, pa.ArrowCapacityError, ValueError) as e:
+                    direction = f"{field.type} to {expected_type}"
+                    raise ValueError(
+                        f"Failed to convert field '{field.name}' from {direction}. "
+                        f"If converting to binary(), ensure no value exceeds 2GB limit: {e}"
+                    ) from e
+
+            converted_arrays.append(array)
+
+        if needs_conversion:
+            new_fields = [pa.field(field.name, arr.type, nullable=field.nullable)
+                          for field, arr in zip(data.schema, converted_arrays)]
+            return pa.RecordBatch.from_arrays(converted_arrays, schema=pa.schema(new_fields))
+
+        return data
+
     def write_arrow_batch(self, data: pa.RecordBatch):
         self._validate_pyarrow_schema(data.schema)
+        data = self._convert_binary_types(data)
         partitions, buckets = self.row_key_extractor.extract_partition_bucket_batch(data)

         partition_bucket_groups = defaultdict(list)

@@ -59,7 +106,7 @@ def write_arrow_batch(self, data: pa.RecordBatch):

     def write_pandas(self, dataframe):
         pa_schema = PyarrowFieldParser.from_paimon_schema(self.table.table_schema.fields)
-        record_batch = pa.RecordBatch.from_pandas(dataframe, schema=pa_schema)
+        record_batch = pa.RecordBatch.from_pandas(dataframe, schema=pa_schema, preserve_index=False)
         return self.write_arrow_batch(record_batch)

     def with_write_type(self, write_cols: List[str]):

@@ -81,6 +128,11 @@ def write_ray(
         """
         Write a Ray Dataset to Paimon table.

+        .. note::
+            Ray Data converts ``large_binary()`` to ``binary()`` when reading.
+            This method automatically converts ``binary()`` back to ``large_binary()``
+            to match the table schema.
+
         Args:
             dataset: Ray Dataset to write. This is a distributed data collection
                 from Ray Data (ray.data.Dataset).

@@ -102,11 +154,50 @@ def close(self):
         self.file_store_write.close()

     def _validate_pyarrow_schema(self, data_schema: pa.Schema):
-        if data_schema != self.table_pyarrow_schema and data_schema.names != self.file_store_write.write_cols:
-            raise ValueError(f"Input schema isn't consistent with table schema and write cols. "
-                             f"Input schema is: {data_schema} "
-                             f"Table schema is: {self.table_pyarrow_schema} "
-                             f"Write cols is: {self.file_store_write.write_cols}")
+        write_cols = self.file_store_write.write_cols
+
+        if write_cols is None:
+            if data_schema.names != self.table_pyarrow_schema.names:
+                raise ValueError(
+                    f"Input schema doesn't match table schema. "
+                    f"Field names and order must exactly match.\n"
+                    f"Input schema: {data_schema}\n"
+                    f"Table schema: {self.table_pyarrow_schema}"
+                )
+            for input_field, table_field in zip(data_schema, self.table_pyarrow_schema):
+                if input_field.type != table_field.type:
+                    if not _is_binary_type_compatible(input_field.type, table_field.type):
+                        raise ValueError(
+                            f"Input schema doesn't match table schema. "
+                            f"Field '{input_field.name}' type mismatch.\n"
+                            f"Input type: {input_field.type}\n"
+                            f"Table type: {table_field.type}\n"
+                            f"Input schema: {data_schema}\n"
+                            f"Table schema: {self.table_pyarrow_schema}"
+                        )
+        else:
+            if list(data_schema.names) != write_cols:
+                raise ValueError(
+                    f"Input schema field names don't match write_cols. "
+                    f"Field names and order must match write_cols.\n"
+                    f"Input schema names: {list(data_schema.names)}\n"
+                    f"Write cols: {write_cols}"
+                )
+            table_field_map = {field.name: field for field in self.table_pyarrow_schema}
+            for field_name in write_cols:
+                if field_name not in table_field_map:
+                    raise ValueError(
+                        f"Field '{field_name}' in write_cols is not in table schema."
+                    )
+                input_field = data_schema.field(field_name)
+                table_field = table_field_map[field_name]
+                if input_field.type != table_field.type:
+                    if not _is_binary_type_compatible(input_field.type, table_field.type):
+                        raise ValueError(
+                            f"Field '{field_name}' type mismatch.\n"
+                            f"Input type: {input_field.type}\n"
+                            f"Table type: {table_field.type}"
+                        )


 class BatchTableWrite(TableWrite):
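For reference, the cast that `_convert_binary_types` relies on can be exercised with plain PyArrow. `binary()` and `large_binary()` store the same values and differ only in offset width (32-bit vs 64-bit), so the conversion is lossless as long as the data fits within the 32-bit offsets of `binary()`, which is the 2GB caveat in the error message above. A minimal sketch:

import pyarrow as pa
import pyarrow.compute as pc

small = pa.array([b'data1', b'data2'], type=pa.binary())
large = pc.cast(small, pa.large_binary())   # binary() -> large_binary(), always safe
back = pc.cast(large, pa.binary())          # large_binary() -> binary(), may exceed 32-bit offsets

assert large.type == pa.large_binary()
assert back.equals(small)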
