Commit e92b27a

[core] Introduce vector-store for data-evolution table (#7240)
1 parent 79f70ea commit e92b27a

File tree: 34 files changed, +2096 −145 lines
Lines changed: 167 additions & 0 deletions
---
title: "Vector Storage"
weight: 7
type: docs
aliases:
- /append-table/vector-storage.html
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements.  See the NOTICE file
distributed with this work for additional information
regarding copyright ownership.  The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied.  See the License for the
specific language governing permissions and limitations
under the License.
-->

# Vector Storage

## Overview

With the explosive growth of AI scenarios, vector storage has become increasingly important.

Paimon provides storage optimizations designed specifically for vector data to meet the needs of these scenarios.

## Vector Data Type

Vector data comes in many forms, among which dense vectors are the most commonly used. They are typically represented as fixed-length, densely packed arrays, generally without `null` elements.

Paimon supports defining columns of type `VECTOR<t, n>`, which represents a fixed-length dense vector column, where:
- **`t`**: the element type of the vector. Seven primitive types are currently supported: `BOOLEAN`, `TINYINT`, `SMALLINT`, `INT`, `BIGINT`, `FLOAT`, `DOUBLE`;
- **`n`**: the vector dimension, a positive integer not exceeding `2,147,483,647`;
- **Null constraint**: a `VECTOR` column may be declared `NOT NULL` or left nullable (the default). However, if a `VECTOR` value itself is not `null`, its elements must not be `null`.

Compared to variable-length arrays, these properties make dense vectors more compact in storage and in memory, with benefits including:
- More natural semantic constraints, rejecting mismatched lengths, `null` elements, and similar anomalies at the storage layer;
- Better point-lookup performance, since no offset array needs to be stored or accessed;
- Closer alignment with the type representations of specialized vector engines, often avoiding memory copies and type conversions during queries.

Example: define a table with a `VECTOR` column using the Java API and write one row of data.
```java
public class CreateTableWithVector {

    public static void main(String[] args) throws Exception {
        // Schema
        Schema.Builder schemaBuilder = Schema.newBuilder();
        schemaBuilder.column("id", DataTypes.BIGINT());
        schemaBuilder.column("embed", DataTypes.VECTOR(3, DataTypes.FLOAT()));
        schemaBuilder.option(CoreOptions.FILE_FORMAT.key(), "lance");
        schemaBuilder.option(CoreOptions.FILE_COMPRESSION.key(), "none");
        Schema schema = schemaBuilder.build();

        // Create catalog
        String database = "default";
        String tempPath = System.getProperty("java.io.tmpdir") + UUID.randomUUID();
        Path warehouse = new Path(TraceableFileIO.SCHEME + "://" + tempPath);
        Identifier identifier = Identifier.create("default", "my_table");
        try (Catalog catalog = CatalogFactory.createCatalog(CatalogContext.create(warehouse))) {

            // Create table
            catalog.createDatabase(database, true);
            catalog.createTable(identifier, schema, true);
            FileStoreTable table = (FileStoreTable) catalog.getTable(identifier);

            // Write data
            BatchWriteBuilder builder = table.newBatchWriteBuilder();
            InternalVector vector = BinaryVector.fromPrimitiveArray(new float[] {1.0f, 2.0f, 3.0f});
            try (BatchTableWrite batchTableWrite = builder.newWrite()) {
                try (BatchTableCommit commit = builder.newCommit()) {
                    batchTableWrite.write(GenericRow.of(1L, vector));
                    commit.commit(batchTableWrite.prepareCommit());
                }
            }

            // Read data
            ReadBuilder readBuilder = table.newReadBuilder();
            TableScan.Plan plan = readBuilder.newScan().plan();
            try (RecordReader<InternalRow> reader = readBuilder.newRead().createReader(plan)) {
                reader.forEachRemaining(row -> {
                    float[] readVector = row.getVector(1).toFloatArray();
                    System.out.println(Arrays.toString(readVector));
                });
            }
        }
    }
}
```

**Notes**:
- Columns of `VECTOR` type cannot be used as primary key columns, partition columns, or for sorting.

## Engine-Level Representation

Since engine layers typically do not have a dedicated vector type, Paimon provides a separate configuration that converts the engine's `ARRAY` type to Paimon's `VECTOR` type, so that `VECTOR` columns can be declared from engine SQL.

Usage:
- **`'vector-field'`**: declares columns as `VECTOR` type; separate multiple columns with commas (`,`);
- **`'field.{field-name}.vector-dim'`**: declares the dimension of a vector column.

Example: define a table with `VECTOR` columns using Flink SQL.

```sql
CREATE TABLE IF NOT EXISTS ts_table (
    id BIGINT,
    embed1 ARRAY<FLOAT>,
    embed2 ARRAY<FLOAT>
) WITH (
    'file.format' = 'lance',
    'vector-field' = 'embed1,embed2',
    'field.embed1.vector-dim' = '128',
    'field.embed2.vector-dim' = '768'
);
```

**Notes**:
- When declaring `vector-field` columns, you must also provide the vector dimension; otherwise the CREATE TABLE statement will fail;
- Currently only Flink SQL supports this configuration; other engines have not implemented it yet.

## Specify File Format for Vector

When mapping the `VECTOR` type to the file format layer, the ideal storage layout is `FixedSizeList`. Currently this is supported only for certain file formats (such as `lance`) through the `paimon-arrow` integration. This means that to use the `VECTOR` type you would have to choose such a format via `file.format`, which applies globally and can be unfavorable for scalar and multimodal (Blob) columns in the same table.

Therefore, Paimon provides a way to store vector columns separately within Data Evolution tables.

Layout:

```
table/
├── bucket-0/
│   ├── data-uuid-0.parquet       # Contains id, name columns
│   ├── data-uuid-1.blob          # Contains blob data
│   ├── data-uuid-2.vector.lance  # Contains vector data in lance format
│   └── ...
├── manifest/
├── schema/
└── snapshot/
```

Usage:
- **`vector.file.format`**: stores `VECTOR` columns separately in the specified file format;
- **`vector.target-file-size`**: when vectors are stored separately, the target size of vector data files; defaults to `10 * 'target-file-size'`.

Example: store `VECTOR` columns separately using Flink SQL.

```sql
CREATE TABLE IF NOT EXISTS ts_table (
    id BIGINT,
    embed ARRAY<FLOAT>
) WITH (
    'file.format' = 'parquet',
    'vector.file.format' = 'lance',
    'vector-field' = 'embed',
    'field.embed.vector-dim' = '128',
    'row-tracking.enabled' = 'true',
    'data-evolution.enabled' = 'true'
);
```

**Notes**:
- If `vector.file.format` is the same as `file.format`, the data is stored together rather than separately;
- This is supported only for Append tables (not primary key tables), and requires both `row-tracking.enabled` and `data-evolution.enabled` to be set to `true`.
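
The first note above can be captured as a small predicate. This is a hedged illustration, not Paimon API; the class and method names are hypothetical:

```java
// Hypothetical sketch, not Paimon API: vector columns get their own files only
// when a distinct 'vector.file.format' is configured.
public class VectorStoreDecision {

    static boolean storedSeparately(String fileFormat, String vectorFileFormat) {
        // No 'vector.file.format' configured: vectors stay in the regular data files.
        if (vectorFileFormat == null) {
            return false;
        }
        // Same format as 'file.format': stored together, not separately.
        return !vectorFileFormat.equals(fileFormat);
    }

    public static void main(String[] args) {
        System.out.println(storedSeparately("parquet", "lance")); // true
        System.out.println(storedSeparately("lance", "lance"));   // false
    }
}
```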

docs/layouts/shortcodes/generated/core_configuration.html

Lines changed: 18 additions & 0 deletions
```diff
@@ -1506,6 +1506,24 @@
         <td>String</td>
         <td>The Variant shredding schema for writing.</td>
     </tr>
+    <tr>
+        <td><h5>vector-field</h5></td>
+        <td style="word-wrap: break-word;">(none)</td>
+        <td>String</td>
+        <td>Specifies column names that should be stored as vector type. This is used when you want to treat a ARRAY column as a VECTOR.</td>
+    </tr>
+    <tr>
+        <td><h5>vector.file.format</h5></td>
+        <td style="word-wrap: break-word;">(none)</td>
+        <td>String</td>
+        <td>Specify the vector store file format.</td>
+    </tr>
+    <tr>
+        <td><h5>vector.target-file-size</h5></td>
+        <td style="word-wrap: break-word;">(none)</td>
+        <td>MemorySize</td>
+        <td>Target size of a vector-store file. Default is 10 * TARGET_FILE_SIZE.</td>
+    </tr>
     <tr>
         <td><h5>visibility-callback.check-interval</h5></td>
         <td style="word-wrap: break-word;">10 s</td>
```

paimon-api/src/main/java/org/apache/paimon/CoreOptions.java

Lines changed: 52 additions & 0 deletions
```diff
@@ -2324,6 +2324,31 @@ public InlineElement getDescription() {
                     .withDescription(
                             "The interval for checking visibility when visibility-callback enabled.");
 
+    public static final ConfigOption<String> VECTOR_FILE_FORMAT =
+            key("vector.file.format")
+                    .stringType()
+                    .noDefaultValue()
+                    .withDescription("Specify the vector store file format.");
+
+    public static final ConfigOption<String> VECTOR_FIELD =
+            key("vector-field")
+                    .stringType()
+                    .noDefaultValue()
+                    .withDescription(
+                            "Specifies column names that should be stored as vector type. "
+                                    + "This is used when you want to treat a ARRAY column as a VECTOR.");
+
+    public static final ConfigOption<MemorySize> VECTOR_TARGET_FILE_SIZE =
+            key("vector.target-file-size")
+                    .memoryType()
+                    .noDefaultValue()
+                    .withDescription(
+                            Description.builder()
+                                    .text(
+                                            "Target size of a vector-store file."
+                                                    + " Default is 10 * TARGET_FILE_SIZE.")
+                                    .build());
+
     private final Options options;
 
     public CoreOptions(Map<String, String> options) {
```
```diff
@@ -3647,6 +3672,33 @@ public Duration visibilityCallbackCheckInterval() {
         return options.get(VISIBILITY_CALLBACK_CHECK_INTERVAL);
     }
 
+    public String vectorFileFormatString() {
+        return normalizeFileFormat(options.get(VECTOR_FILE_FORMAT));
+    }
+
+    public Set<String> vectorField() {
+        String vectorFields = options.get(CoreOptions.VECTOR_FIELD);
+        if (vectorFields == null || vectorFields.trim().isEmpty()) {
+            return Collections.emptySet();
+        }
+        return Arrays.stream(vectorFields.trim().split(",")).collect(Collectors.toSet());
+    }
+
+    public static Set<String> vectorField(Map<String, String> options) {
+        String vectorFields = options.getOrDefault(CoreOptions.VECTOR_FIELD.key(), null);
+        if (vectorFields == null || vectorFields.trim().isEmpty()) {
+            return Collections.emptySet();
+        }
+        return Arrays.stream(vectorFields.trim().split(",")).collect(Collectors.toSet());
+    }
+
+    public long vectorTargetFileSize() {
+        // Since vectors are large, it would be better to set a larger target size for vectors.
+        return options.getOptional(VECTOR_TARGET_FILE_SIZE)
+                .map(MemorySize::getBytes)
+                .orElse(10 * targetFileSize(false));
+    }
+
     /** Specifies the merge engine for table with primary key. */
     public enum MergeEngine implements DescribedEnum {
         DEDUPLICATE("deduplicate", "De-duplicate and keep the last row."),
```
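
To make the behavior of the two new accessors concrete, here is a self-contained sketch of the same logic outside the Paimon class (the class and method signatures here are mine, not Paimon's; note that, matching the diff, only the whole option string is trimmed, not individual comma-separated tokens):

```java
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;

public class VectorOptionsSketch {

    // Mirrors vectorField(): comma-separated 'vector-field' value -> set of column names.
    // Like the original, only the whole string is trimmed, not individual tokens.
    static Set<String> vectorField(String value) {
        if (value == null || value.trim().isEmpty()) {
            return Collections.emptySet();
        }
        return Arrays.stream(value.trim().split(",")).collect(Collectors.toSet());
    }

    // Mirrors vectorTargetFileSize(): explicit 'vector.target-file-size' if set,
    // otherwise 10 * target-file-size, since vector files benefit from being larger.
    static long vectorTargetFileSize(Long configuredBytes, long targetFileSizeBytes) {
        return configuredBytes != null ? configuredBytes : 10 * targetFileSizeBytes;
    }

    public static void main(String[] args) {
        System.out.println(vectorField("embed1,embed2").size()); // 2
        System.out.println(vectorTargetFileSize(null, 134217728L)); // 1342177280
    }
}
```

With a default 128 MiB `target-file-size`, vector files therefore roll at roughly 1.25 GiB unless overridden.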

paimon-common/src/main/java/org/apache/paimon/format/FileFormat.java

Lines changed: 8 additions & 0 deletions
```diff
@@ -107,6 +107,14 @@ public static FileFormat fileFormat(CoreOptions options) {
         return FileFormat.fromIdentifier(options.fileFormatString(), options.toConfiguration());
     }
 
+    public static FileFormat vectorFileFormat(CoreOptions options) {
+        String vectorFileFormat = options.vectorFileFormatString();
+        if (vectorFileFormat == null) {
+            return fileFormat(options);
+        }
+        return FileFormat.fromIdentifier(vectorFileFormat, options.toConfiguration());
+    }
+
     public static FileFormat manifestFormat(CoreOptions options) {
         return FileFormat.fromIdentifier(options.manifestFormatString(), options.toConfiguration());
     }
```

paimon-core/src/main/java/org/apache/paimon/append/AppendOnlyWriter.java

Lines changed: 43 additions & 4 deletions
```diff
@@ -41,6 +41,7 @@
 import org.apache.paimon.operation.BlobFileContext;
 import org.apache.paimon.options.MemorySize;
 import org.apache.paimon.reader.RecordReaderIterator;
+import org.apache.paimon.types.DataField;
 import org.apache.paimon.types.RowType;
 import org.apache.paimon.utils.BatchRecordWriter;
 import org.apache.paimon.utils.CommitIncrement;
@@ -52,15 +53,20 @@
 import org.apache.paimon.utils.SinkWriter.BufferedSinkWriter;
 import org.apache.paimon.utils.SinkWriter.DirectSinkWriter;
 import org.apache.paimon.utils.StatsCollectorFactories;
+import org.apache.paimon.utils.VectorStoreUtils;
 
 import javax.annotation.Nullable;
 
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
+import java.util.Set;
 import java.util.concurrent.ExecutionException;
 import java.util.function.Supplier;
+import java.util.stream.Collectors;
+
+import static org.apache.paimon.types.BlobType.fieldsInBlobFile;
 
 /**
  * A {@link RecordWriter} implementation that only accepts records which are always insert
@@ -71,8 +77,10 @@ public class AppendOnlyWriter implements BatchRecordWriter, MemoryOwner {
     private final FileIO fileIO;
     private final long schemaId;
     private final FileFormat fileFormat;
+    private final FileFormat vectorFileFormat;
     private final long targetFileSize;
     private final long blobTargetFileSize;
+    private final long vectorTargetFileSize;
     private final RowType writeSchema;
     @Nullable private final List<String> writeCols;
     private final DataFilePathFactory pathFactory;
@@ -103,8 +111,10 @@ public AppendOnlyWriter(
             @Nullable IOManager ioManager,
             long schemaId,
             FileFormat fileFormat,
+            FileFormat vectorFileFormat,
             long targetFileSize,
             long blobTargetFileSize,
+            long vectorTargetFileSize,
             RowType writeSchema,
             @Nullable List<String> writeCols,
             long maxSequenceNumber,
@@ -127,8 +137,10 @@ public AppendOnlyWriter(
         this.fileIO = fileIO;
         this.schemaId = schemaId;
         this.fileFormat = fileFormat;
+        this.vectorFileFormat = vectorFileFormat;
         this.targetFileSize = targetFileSize;
         this.blobTargetFileSize = blobTargetFileSize;
+        this.vectorTargetFileSize = vectorTargetFileSize;
         this.writeSchema = writeSchema;
         this.writeCols = writeCols;
         this.pathFactory = pathFactory;
@@ -302,13 +314,38 @@ public void toBufferedWriter() throws Exception {
     }
 
     private RollingFileWriter<InternalRow, DataFileMeta> createRollingRowWriter() {
-        if (blobContext != null) {
-            return new RollingBlobFileWriter(
+        boolean hasNormal, hasBlob, hasVectorStore;
+        {
+            hasBlob = (blobContext != null);
+
+            List<DataField> fieldsInVectorFile =
+                    VectorStoreUtils.fieldsInVectorFile(writeSchema, fileFormat, vectorFileFormat);
+            Set<String> vectorFieldNames =
+                    fieldsInVectorFile.stream().map(DataField::name).collect(Collectors.toSet());
+            hasVectorStore = !fieldsInVectorFile.isEmpty();
+
+            List<DataField> fieldsInBlobFile =
+                    hasBlob
+                            ? fieldsInBlobFile(writeSchema, blobContext.blobDescriptorFields())
+                            : Collections.emptyList();
+            Set<String> blobFieldNames =
+                    fieldsInBlobFile.stream().map(DataField::name).collect(Collectors.toSet());
+            hasNormal =
+                    writeSchema.getFields().stream()
+                            .anyMatch(
+                                    f ->
+                                            !blobFieldNames.contains(f.name())
+                                                    && !vectorFieldNames.contains(f.name()));
+        }
+        if (hasBlob || (hasNormal && hasVectorStore)) {
+            return new DataEvolutionRollingFileWriter(
                     fileIO,
                     schemaId,
                     fileFormat,
+                    vectorFileFormat,
                     targetFileSize,
                     blobTargetFileSize,
+                    vectorTargetFileSize,
                     writeSchema,
                     pathFactory,
                     seqNumCounterProvider,
@@ -319,11 +356,13 @@ private RollingFileWriter<InternalRow, DataFileMeta> createRollingRowWriter() {
                     statsDenseStore,
                     blobContext);
         }
+        FileFormat realFileFormat = hasNormal ? fileFormat : vectorFileFormat;
+        long realTargetFileSize = hasNormal ? targetFileSize : vectorTargetFileSize;
         return new RowDataRollingFileWriter(
                 fileIO,
                 schemaId,
-                fileFormat,
-                targetFileSize,
+                realFileFormat,
+                realTargetFileSize,
                 writeSchema,
                 pathFactory,
                 seqNumCounterProvider,
```