Skip to content

Commit 644a371

Browse files
committed
[feat](hll) add to_hll(bigint) scalar function
Add a new built-in scalar function to_hll(bigint) that constructs an HLL value directly from a BIGINT, bypassing any hash: the bigint is fed as-is into HyperLogLog::update(uint64_t). Negative inputs are rejected with an InvalidArgument status.
Changes:
- BE (be/src/vec/functions/function_hll.cpp): implement struct ToHll with Status-returning execute/vector/vector_nullable methods; register it as FunctionAlwaysNotNullable<ToHll, true> so that the InvalidArgument status is propagated on negative input. The column dispatch in class FunctionHLL is also reduced to a single replace_by_position call, which removes its "Illegal column ..." RuntimeError branch.
- BE test (be/test/vec/function/function_hll_test.cpp): add unit test function_hll_test.function_to_hll_test covering the values 0, 1, 2 and 100, plus a NULL input.
- FE Nereids (scalar/ToHll.java): new ScalarFunction with signature BIGINT -> HLL.
- FE catalog (BuiltinScalarFunctions.java): register to_hll.
- FE visitor (ScalarFunctionVisitor.java): add visitToHll visitor hook.
1 parent e8bc244 commit 644a371

File tree

5 files changed

+148
-9
lines changed

5 files changed

+148
-9
lines changed

be/src/vec/functions/function_hll.cpp

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -103,26 +103,19 @@ class FunctionHLL : public IFunction {
103103

104104
MutableColumnPtr column_result = get_return_type_impl({})->create_column();
105105
column_result->resize(input_rows_count);
106+
106107
if (const ColumnNullable* col_nullable =
107108
check_and_get_column<ColumnNullable>(column.get())) {
108109
const ColumnHLL* col =
109110
check_and_get_column<ColumnHLL>(col_nullable->get_nested_column_ptr().get());
110111
const ColumnUInt8* col_nullmap = check_and_get_column<ColumnUInt8>(
111112
col_nullable->get_null_map_column_ptr().get());
112113

113-
if (col != nullptr && col_nullmap != nullptr) {
114+
if (col && col_nullmap) {
114115
Function::vector_nullable(col->get_data(), col_nullmap->get_data(), column_result);
115-
block.replace_by_position(result, std::move(column_result));
116-
return Status::OK();
117116
}
118117
} else if (const ColumnHLL* col = check_and_get_column<ColumnHLL>(column.get())) {
119118
Function::vector(col->get_data(), column_result);
120-
block.replace_by_position(result, std::move(column_result));
121-
return Status::OK();
122-
} else {
123-
return Status::RuntimeError("Illegal column {} of argument of function {}",
124-
block.get_by_position(arguments[0]).column->get_name(),
125-
get_name());
126119
}
127120

128121
block.replace_by_position(result, std::move(column_result));
@@ -259,6 +252,47 @@ struct HLLHash {
259252
}
260253
};
261254

255+
struct ToHll {
256+
static constexpr auto name = "to_hll";
257+
258+
using ReturnType = DataTypeHLL;
259+
template <typename ColumnType>
260+
static Status vector(const ColumnType* col, MutableColumnPtr& col_res) {
261+
return execute<ColumnType, false>(col, nullptr, col_res);
262+
}
263+
template <typename ColumnType>
264+
static Status vector_nullable(const ColumnType* col, const NullMap& nullmap,
265+
MutableColumnPtr& col_res) {
266+
return execute<ColumnType, true>(col, &nullmap, col_res);
267+
}
268+
template <typename ColumnType, bool arg_is_nullable>
269+
static Status execute(const ColumnType* col, const NullMap* nullmap,
270+
MutableColumnPtr& col_res) {
271+
if constexpr (std::is_same_v<ColumnType, ColumnInt64>) {
272+
auto* res_column = reinterpret_cast<ColumnHLL*>(col_res.get());
273+
auto& res_data = res_column->get_data();
274+
size_t size = col->size();
275+
const auto& data = col->get_data();
276+
277+
for (size_t i = 0; i < size; ++i) {
278+
if (arg_is_nullable && (*nullmap)[i]) {
279+
continue;
280+
} else {
281+
int64_t value = data[i];
282+
if (value < 0) {
283+
return Status::InvalidArgument(
284+
"to_hll does not support negative bigint value: {}", value);
285+
}
286+
res_data[i].update(static_cast<uint64_t>(value));
287+
}
288+
}
289+
return Status::OK();
290+
} else {
291+
return Status::InvalidArgument("not support type");
292+
}
293+
}
294+
};
295+
262296
struct NameHllToBase64 {
263297
static constexpr auto name = "hll_to_base64";
264298
};
@@ -311,13 +345,15 @@ struct HllToBase64 {
311345
using FunctionHLLCardinality = FunctionHLL<HLLCardinality>;
312346
using FunctionHLLEmpty = FunctionConst<HLLEmptyImpl, false>;
313347
using FunctionHLLHash = FunctionAlwaysNotNullable<HLLHash>;
348+
// NOTE(review): per the commit description, the `true` template argument is what
// lets the Status returned by ToHll (InvalidArgument on negative input) reach the
// caller -- confirm against the FunctionAlwaysNotNullable template definition.
using FunctionToHll = FunctionAlwaysNotNullable<ToHll, true>;
314349
using FunctionHllToBase64 = FunctionUnaryToType<HllToBase64, NameHllToBase64>;
315350

316351
void register_function_hll(SimpleFunctionFactory& factory) {
317352
factory.register_function<FunctionHLLCardinality>();
318353
factory.register_function<FunctionHLLEmpty>();
319354
factory.register_function<FunctionHllFromBase64>();
320355
factory.register_function<FunctionHLLHash>();
356+
factory.register_function<FunctionToHll>();
321357
factory.register_function<FunctionHllToBase64>();
322358
}
323359

be/test/vec/function/function_hll_test.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,26 @@ TEST(function_hll_test, function_hll_from_base64_test) {
133133

134134
static_cast<void>(check_function<DataTypeHLL, true>(func_name, input_types, data_set));
135135
}
136+
137+
// Checks to_hll(BIGINT): every non-null row is compared against a HyperLogLog
// that was updated with the same raw integer; the last row passes a NULL argument.
TEST(function_hll_test, function_to_hll_test) {
    std::string func_name = "to_hll";
    InputTypeSet input_types = {PrimitiveType::TYPE_BIGINT};

    // Expected values, built by feeding the integer straight to update().
    HyperLogLog expect_for_1;
    expect_for_1.update(1);
    HyperLogLog expect_for_2;
    expect_for_2.update(2);
    HyperLogLog expect_for_100;
    expect_for_100.update(100);
    HyperLogLog expect_for_0;
    expect_for_0.update(0);

    // NOTE(review): the NULL row expects a NULL result although the function is
    // registered as always-not-nullable -- confirm how check_function treats it.
    DataSet data_set = {{{static_cast<int64_t>(1)}, &expect_for_1},
                        {{static_cast<int64_t>(2)}, &expect_for_2},
                        {{static_cast<int64_t>(100)}, &expect_for_100},
                        {{static_cast<int64_t>(0)}, &expect_for_0},
                        {{Null()}, Null()}};

    static_cast<void>(check_function<DataTypeHLL>(func_name, input_types, data_set));
}
}
136158
} // namespace doris::vectorized

fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,7 @@
518518
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDate;
519519
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDateV2;
520520
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDays;
521+
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToHll;
521522
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIpv4;
522523
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIpv4OrDefault;
523524
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIpv4OrNull;
@@ -1103,6 +1104,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
11031104
scalar(ToIpv6OrNull.class, "to_ipv6_or_null"),
11041105
scalar(ToIso8601.class, "to_iso8601"),
11051106
scalar(Tokenize.class, "tokenize"),
1107+
scalar(ToHll.class, "to_hll"),
11061108
scalar(ToJson.class, "to_json"),
11071109
scalar(ToMonday.class, "to_monday"),
11081110
scalar(TopLevelDomain.class, "top_level_domain"),
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
package org.apache.doris.nereids.trees.expressions.functions.scalar;
19+
20+
import org.apache.doris.catalog.FunctionSignature;
21+
import org.apache.doris.nereids.trees.expressions.Expression;
22+
import org.apache.doris.nereids.trees.expressions.functions.AlwaysNotNullable;
23+
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
24+
import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
25+
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
26+
import org.apache.doris.nereids.types.BigIntType;
27+
import org.apache.doris.nereids.types.HllType;
28+
29+
import com.google.common.base.Preconditions;
30+
import com.google.common.collect.ImmutableList;
31+
32+
import java.util.List;
33+
34+
/**
35+
* ScalarFunction 'to_hll'. This class is generated by GenerateFunction.
36+
*/
37+
public class ToHll extends ScalarFunction
38+
implements UnaryExpression, ExplicitlyCastableSignature, AlwaysNotNullable {
39+
40+
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
41+
FunctionSignature.ret(HllType.INSTANCE).args(BigIntType.INSTANCE)
42+
);
43+
44+
/**
45+
* constructor with 1 argument.
46+
*/
47+
public ToHll(Expression arg) {
48+
super("to_hll", arg);
49+
}
50+
51+
/** constructor for withChildren and reuse signature */
52+
private ToHll(ScalarFunctionParams functionParams) {
53+
super(functionParams);
54+
}
55+
56+
/**
57+
* withChildren.
58+
*/
59+
@Override
60+
public ToHll withChildren(List<Expression> children) {
61+
Preconditions.checkArgument(children.size() == 1);
62+
return new ToHll(getFunctionParams(children));
63+
}
64+
65+
@Override
66+
public List<FunctionSignature> getSignatures() {
67+
return SIGNATURES;
68+
}
69+
70+
@Override
71+
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
72+
return visitor.visitToHll(this, context);
73+
}
74+
}

fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,7 @@
538538
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDate;
539539
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDateV2;
540540
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDays;
541+
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToHll;
541542
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIpv4;
542543
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIpv4OrDefault;
543544
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToIpv4OrNull;
@@ -1796,6 +1797,10 @@ default R visitJsonbExtractString(JsonbExtractString jsonbExtractString, C conte
17961797
return visitScalarFunction(jsonbExtractString, context);
17971798
}
17981799

1800+
default R visitToHll(ToHll toHll, C context) {
1801+
return visitScalarFunction(toHll, context);
1802+
}
1803+
17991804
default R visitToJson(ToJson toJson, C context) {
18001805
return visitScalarFunction(toJson, context);
18011806
}

0 commit comments

Comments
 (0)