Skip to content

Commit 55ad294

Browse files
infvgLakehouse Engine Bot
authored andcommitted
Fix iceberg min max statistics for decimal type when encoded as int32
Signed-off-by: Hazmi <[email protected]> Alchemy-item: (ID = 1203) Fix iceberg min max statistics for decimal type when encoded as int32 commit 1/1 - 0ac9930
1 parent ecbd357 commit 55ad294

2 files changed

Lines changed: 45 additions & 1 deletion

File tree

velox/dwio/parquet/writer/arrow/Statistics.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -559,7 +559,9 @@ TypedComparatorImpl<false, ByteArrayType>::getMinMax(
559559
template <typename T>
560560
std::string encodeDecimalToBigEndian(T value) {
561561
uint8_t buffer[sizeof(T)];
562-
if constexpr (std::is_same_v<T, int64_t>) {
562+
if constexpr (std::is_same_v<T, int32_t>) {
563+
*reinterpret_cast<int32_t*>(buffer) = ::arrow::bit_util::ToBigEndian(value);
564+
} else if constexpr (std::is_same_v<T, int64_t>) {
563565
*reinterpret_cast<int64_t*>(buffer) = ::arrow::bit_util::ToBigEndian(value);
564566
} else if constexpr (std::is_same_v<T, int128_t>) {
565567
*reinterpret_cast<int128_t*>(buffer) = DecimalUtil::bigEndian(value);
@@ -809,6 +811,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
809811
}
810812

811813
std::string MinValue() const override {
814+
if constexpr (std::is_same_v<T, int32_t>) {
815+
if (descr_->logicalType()->isDecimal()) {
816+
return encodeDecimalToBigEndian(min_);
817+
}
818+
}
812819
if constexpr (std::is_same_v<T, int64_t>) {
813820
if (descr_->logicalType()->isDecimal()) {
814821
return encodeDecimalToBigEndian(min_);
@@ -835,6 +842,11 @@ class TypedStatisticsImpl : public TypedStatistics<DType> {
835842
}
836843

837844
std::string MaxValue() const override {
845+
if constexpr (std::is_same_v<T, int32_t>) {
846+
if (descr_->logicalType()->isDecimal()) {
847+
return encodeDecimalToBigEndian(max_);
848+
}
849+
}
838850
if constexpr (std::is_same_v<T, int64_t>) {
839851
if (descr_->logicalType()->isDecimal()) {
840852
return encodeDecimalToBigEndian(max_);

velox/dwio/parquet/writer/arrow/tests/StatisticsTest.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2139,6 +2139,38 @@ TEST(IcebergStatistics, unboundedUpperBound) {
21392139
}
21402140
}
21412141

2142+
TEST(IcebergStatistics, maxValueWithNulls) {
2143+
const NodePtr node = PrimitiveNode::make(
2144+
"decimal_col",
2145+
Repetition::kOptional,
2146+
LogicalType::decimal(7, 2),
2147+
Type::kInt32);
2148+
ColumnDescriptor descr(node, 1, 1);
2149+
2150+
auto stats = makeStatistics<Int32Type>(&descr);
2151+
2152+
std::vector<int32_t> values = {19900, 20000};
2153+
stats->update(values.data(), values.size(), 1);
2154+
2155+
ASSERT_TRUE(stats->hasMinMax());
2156+
EXPECT_EQ(stats->min(), 19900);
2157+
EXPECT_EQ(stats->max(), 20000);
2158+
2159+
const auto maxValue = stats->MaxValue();
2160+
EXPECT_FALSE(maxValue.empty()) << "MaxValue() should not be empty";
2161+
2162+
int32_t decodedMax = ::arrow::bit_util::FromBigEndian(
2163+
*reinterpret_cast<const int32_t*>(maxValue.data()));
2164+
EXPECT_EQ(decodedMax, 20000) << "MaxValue() should return 20000";
2165+
2166+
const auto minValue = stats->MinValue();
2167+
EXPECT_FALSE(minValue.empty()) << "MinValue() should not be empty";
2168+
2169+
int32_t decodedMin = ::arrow::bit_util::FromBigEndian(
2170+
*reinterpret_cast<const int32_t*>(minValue.data()));
2171+
EXPECT_EQ(decodedMin, 19900) << "MinValue() should return 19900";
2172+
}
2173+
21422174
TEST(StatisticsComparison, withInt64) {
21432175
NodePtr Node =
21442176
PrimitiveNode::make("int_col", Repetition::kRequired, Type::kInt64);

0 commit comments

Comments
 (0)