---
title: "PyPaimon Release 0.2.0"
type: release
version: pypaimon-0.2.0
weight: 91
---

# PyPaimon 0.2.0 Available

Dec 19, 2024 - Zelin Yu (yuzelin.yzl@gmail.com)

The Apache Paimon PMC officially announces the release of PyPaimon 0.2.0. Since 0.1.0 was never
released, this is the first published version.

## What is PyPaimon?

[PyPaimon](https://github.com/apache/paimon-python) is the Python SDK of Apache Paimon. It lets users
read data from Paimon tables with Python for data analysis, and write data back to Paimon tables.

## Version Overview

The first version of PyPaimon supports the following features:

1. Connect to a `Catalog`.
2. Get or create a table.
3. Batch read: filter and projection pushdown, and parallel reading of data into Apache Arrow, Pandas, DuckDB and Ray formats.
4. Batch write: insert into or overwrite a table with Apache Arrow and Pandas data.

The detailed documentation can be found at https://paimon.apache.org/docs/master/program-api/python-api/.

### Connect to Catalog

You can create a `Catalog` with options just like in SQL:

```python
from pypaimon.py4j import Catalog

catalog_options = {
    'warehouse': 'path/to/warehouse',
    'metastore': 'filesystem'
    # other options
}

catalog = Catalog.create(catalog_options)
```

You can connect to any `Catalog` implementation supported by the Java side. PyPaimon has built-in support for the `filesystem`, `jdbc` and `hive` catalogs.
If you want to connect to your own custom catalogs, you can add the dependency JARs as follows:

```python
import os
from pypaimon.py4j import constants

os.environ[constants.PYPAIMON_JAVA_CLASSPATH] = '/path/to/jars/*'
```

### Get or create table

You can get an existing table from the `Catalog` by its identifier:

```python
table = catalog.get_table('database_name.table_name')
```

You can also create a new table. The table fields are described by a `pyarrow.Schema`, and you can set primary keys,
partition keys, table options and a comment.

```python
import pyarrow as pa
from pypaimon import Schema

# field definitions
pa_schema = pa.schema([
    ('dt', pa.string()),
    ('hh', pa.string()),
    ('pk', pa.int64()),
    ('value', pa.string())
])
# table schema
schema = Schema(
    pa_schema=pa_schema,
    partition_keys=['dt', 'hh'],
    primary_keys=['dt', 'hh', 'pk'],
    options={'bucket': '2'},
    comment='my test table'
)

# create table
catalog.create_table(identifier='default.test_table', schema=schema, ignore_if_exists=False)
```

Then you can obtain read and write interfaces from the table.

## Batch Read

Assume that you already have the table `default.test_table` described in the previous section. Let's see how to read data from it.

```python
from pypaimon.py4j import Catalog

# set 'max-workers' (thread count) for parallel reading
catalog_options = {
    'warehouse': 'path/to/warehouse',
    'metastore': 'filesystem',
    'max-workers': '4'
}
catalog = Catalog.create(catalog_options)
table = catalog.get_table('default.test_table')

# use ReadBuilder to perform filter and projection pushdown
read_builder = table.new_read_builder()

# select partition: dt='2024-12-01', hh='12'
predicate_builder = read_builder.new_predicate_builder()
dt_predicate = predicate_builder.equal('dt', '2024-12-01')
hh_predicate = predicate_builder.equal('hh', '12')
partition_predicate = predicate_builder.and_([dt_predicate, hh_predicate])
read_builder = read_builder.with_filter(partition_predicate)

# select pk and value
read_builder = read_builder.with_projection(['pk', 'value'])

# plan splits
table_scan = read_builder.new_scan()
splits = table_scan.splits()

# read data into a pandas.DataFrame
table_read = read_builder.new_read()
df = table_read.to_pandas(splits)
```

Then you can do some analysis on the DataFrame with Python.
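
For instance, a grouping aggregation over the projected `pk` and `value` columns could look like this (the sample rows below are made up for illustration; in practice the DataFrame comes from reading the splits):

```python
import pandas as pd

# stand-in for the DataFrame returned by reading the splits;
# the real contents depend on your table (sample data only)
df = pd.DataFrame({
    'pk': [1, 2, 1, 3],
    'value': ['a', 'b', 'a', 'c'],
})

# count rows per primary key
counts = df.groupby('pk').size().sort_index()
print(counts.to_dict())  # prints {1: 2, 2: 1, 3: 1}
```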

## Batch Write

Assume that you already have the table `default.test_table` described in the previous section. Let's see how to write to or overwrite it.

First, assume that you have a DataFrame containing data for 2024-12-02, 12 o'clock, and you want to write it into the table.

```python
write_builder = table.new_batch_write_builder()
table_write = write_builder.new_write()
table_commit = write_builder.new_commit()

# you can write data many times before committing
dataframe = ...
table_write.write_pandas(dataframe)

commit_messages = table_write.prepare_commit()
table_commit.commit(commit_messages)

table_write.close()
table_commit.close()
```
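
As a hint of what `dataframe` could contain, here is a hypothetical Pandas DataFrame matching the table schema defined earlier (note that the partition columns `dt` and `hh` are strings; the rows are made up):

```python
import pandas as pd

# made-up rows for partition dt='2024-12-02', hh='12'
dataframe = pd.DataFrame({
    'dt': ['2024-12-02', '2024-12-02'],
    'hh': ['12', '12'],
    'pk': [1, 2],
    'value': ['apple', 'banana'],
})

# column names line up with the pyarrow schema: dt, hh, pk, value
print(list(dataframe.columns))  # prints ['dt', 'hh', 'pk', 'value']
```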

Now let's see how to overwrite the partition `dt='2024-12-02', hh='12'` with new data.
```python
write_builder = table.new_batch_write_builder()
# set the partition to overwrite
write_builder = write_builder.overwrite({'dt': '2024-12-02', 'hh': '12'})

table_write = write_builder.new_write()
table_commit = write_builder.new_commit()

# then write data
dataframe = ...
table_write.write_pandas(dataframe)

commit_messages = table_write.prepare_commit()
table_commit.commit(commit_messages)

table_write.close()
table_commit.close()
```

### Various data formats

PyPaimon supports reading data in the following formats: Pandas, Apache Arrow, DuckDB and Ray, and writing data in the following
formats: Pandas and Apache Arrow. Please refer to the [documentation](https://paimon.apache.org/docs/master/program-api/python-api/) for details.