Skip to content
This repository was archived by the owner on Apr 2, 2026. It is now read-only.

Commit 58a7bc6

Browse files
authored
Merge pull request #9 from reddit/v2.6.6-patched-251123-trace
Minimum Should Match Support -cherry pick from master
2 parents ea8f2e7 + a59b31f commit 58a7bc6

24 files changed

Lines changed: 2529 additions & 1583 deletions

client/milvusclient/read_example_test.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,3 +403,91 @@ func ExampleClient_HybridSearch() {
403403
log.Println("Scores: ", resultSet.Scores)
404404
}
405405
}
406+
407+
// ExampleClient_Search_textMatch builds a small collection with two analyzed
// text fields and their BM25 sparse-vector counterparts, then searches the text
// sparse field with a raw text query while filtering with
// text_match(..., minimum_should_match=2) and applying a "boost" rerank function.
// NOTE(review): there is no `// Output:` comment and nothing is printed, so
// `go test` only compiles this example and does not execute it.
func ExampleClient_Search_textMatch() {
408+
ctx, cancel := context.WithCancel(context.Background())
409+
defer cancel()
410+
// names of the collection, its two text fields, and the two sparse vector fields
411+
collectionName := "text_min_match"
412+
titleField := "title"
413+
textField := "document_text"
414+
titleSparse := "title_sparse_vector"
415+
textSparse := "text_sparse_vector"
416+
milvusAddr := "127.0.0.1:19530"
417+
// connect to the server at milvusAddr; a connection failure aborts the example
418+
cli, err := milvusclient.New(ctx, &milvusclient.ClientConfig{
419+
Address: milvusAddr,
420+
})
421+
if err != nil {
422+
log.Fatal("failed to connect to milvus server: ", err.Error())
423+
}
424+
defer cli.Close(ctx)
425+
// drop any collection left over under the same name; the returned error is discarded
426+
_ = cli.DropCollection(ctx, milvusclient.NewDropCollectionOption(collectionName))
427+
// schema: auto-ID int64 primary key, two VarChar fields with analyzer and match
// enabled, and two sparse vector fields declared as the outputs of BM25 functions
// whose inputs are those text fields
428+
schema := entity.NewSchema().
429+
WithField(entity.NewField().WithName("id").WithDataType(entity.FieldTypeInt64).WithIsPrimaryKey(true).WithIsAutoID(true)).
430+
WithField(entity.NewField().WithName(titleField).WithDataType(entity.FieldTypeVarChar).WithMaxLength(512).WithEnableAnalyzer(true).WithEnableMatch(true)).
431+
WithField(entity.NewField().WithName(textField).WithDataType(entity.FieldTypeVarChar).WithMaxLength(2048).WithEnableAnalyzer(true).WithEnableMatch(true)).
432+
WithField(entity.NewField().WithName(titleSparse).WithDataType(entity.FieldTypeSparseVector)).
433+
WithField(entity.NewField().WithName(textSparse).WithDataType(entity.FieldTypeSparseVector)).
434+
WithFunction(entity.NewFunction().WithName("title_bm25_func").WithType(entity.FunctionTypeBM25).WithInputFields(titleField).WithOutputFields(titleSparse)).
435+
WithFunction(entity.NewFunction().WithName("text_bm25_func").WithType(entity.FunctionTypeBM25).WithInputFields(textField).WithOutputFields(textSparse))
436+
// inverted indexes on the text fields, sparse inverted indexes using entity.BM25 on
// the sparse fields. NOTE(review): 0.2 is presumably the drop ratio — confirm.
437+
idxOpts := []milvusclient.CreateIndexOption{
438+
milvusclient.NewCreateIndexOption(collectionName, titleField, index.NewInvertedIndex()),
439+
milvusclient.NewCreateIndexOption(collectionName, textField, index.NewInvertedIndex()),
440+
milvusclient.NewCreateIndexOption(collectionName, titleSparse, index.NewSparseInvertedIndex(entity.BM25, 0.2)),
441+
milvusclient.NewCreateIndexOption(collectionName, textSparse, index.NewSparseInvertedIndex(entity.BM25, 0.2)),
442+
}
443+
// create the collection and pass the index options along with it
444+
err = cli.CreateCollection(ctx, milvusclient.NewCreateCollectionOption(collectionName, schema).WithIndexOptions(idxOpts...))
445+
if err != nil {
446+
log.Fatal("failed to create collection: ", err.Error())
447+
}
448+
// insert three sample rows; only the two text columns are supplied
449+
_, err = cli.Insert(ctx, milvusclient.NewColumnBasedInsertOption(collectionName).
450+
WithVarcharColumn(titleField, []string{
451+
"History of AI",
452+
"Alan Turing Biography",
453+
"Machine Learning Overview",
454+
}).
455+
WithVarcharColumn(textField, []string{
456+
"Artificial intelligence was founded in 1956 by computer scientists.",
457+
"Alan Turing proposed early concepts of AI and machine learning.",
458+
"Machine learning is a subset of artificial intelligence.",
459+
}))
460+
if err != nil {
461+
log.Fatal("failed to insert data: ", err.Error())
462+
}
463+
// load the collection before searching.
// NOTE(review): the error returned by task.Await is discarded below, so a failed
// load would presumably only surface later, at Search time — consider checking it.
464+
task, err := cli.LoadCollection(ctx, milvusclient.NewLoadCollectionOption(collectionName))
465+
if err != nil {
466+
log.Fatal("failed to load collection: ", err.Error())
467+
}
468+
_ = task.Await(ctx)
469+
// filter: text_match on either the title or the document text, each with minimum_should_match=2
470+
q := "artificial intelligence"
471+
expr := "text_match(" + titleField + ", \"" + q + "\", minimum_should_match=2) OR text_match(" + textField + ", \"" + q + "\", minimum_should_match=2)"
472+
// rerank function: reranker "boost", weight "2.0", with the title text_match as its filter
473+
boost := entity.NewFunction().
474+
WithName("title_boost").
475+
WithType(entity.FunctionTypeRerank).
476+
WithParam("reranker", "boost").
477+
WithParam("filter", "text_match("+titleField+", \""+q+"\", minimum_should_match=2)").
478+
WithParam("weight", "2.0")
479+
// search the text sparse field with the raw query text, applying the filter and the
// rerank function. NOTE(review): 5 is presumably the result limit — confirm.
480+
vectors := []entity.Vector{entity.Text(q)}
481+
rs, err := cli.Search(ctx, milvusclient.NewSearchOption(collectionName, 5, vectors).
482+
WithANNSField(textSparse).
483+
WithFilter(expr).
484+
WithOutputFields("id", titleField, textField).
485+
WithFunctionReranker(boost))
486+
if err != nil {
487+
log.Fatal("failed to search: ", err.Error())
488+
}
489+
// the result sets are only iterated; ResultCount is read and discarded, nothing is printed
490+
for _, r := range rs {
491+
_ = r.ResultCount
492+
}
493+
}

client/milvusclient/read_test.go

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,46 @@ func (s *ReadSuite) TestSearch() {
191191
})
192192
}
193193

194+
// TestSearch_TextMatch verifies that a text_match filter carrying the
195+
// minimum_should_match argument reaches the mocked Search call in the request DSL.
196+
func (s *ReadSuite) TestSearch_TextMatch() {
197+
ctx, cancel := context.WithCancel(context.Background())
198+
defer cancel()
199+
200+
s.Run("min_should_match_in_expr", func() {
201+
collectionName := fmt.Sprintf("coll_%s", s.randString(6))
202+
s.setupCache(collectionName, s.schema)
203+
204+
s.mock.EXPECT().Search(mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, sr *milvuspb.SearchRequest) (*milvuspb.SearchResults, error) {
205+
// ensure the request DSL keeps the text_match call and its minimum_should_match=2
// argument; the field names themselves are not asserted here
206+
s.Contains(sr.GetDsl(), "minimum_should_match=2")
207+
s.Contains(sr.GetDsl(), "text_match(")
208+
// reply with a successful single-hit result (ID 1, score 0.1)
return &milvuspb.SearchResults{
209+
Status: merr.Success(),
210+
Results: &schemapb.SearchResultData{
211+
NumQueries: 1,
212+
TopK: 1,
213+
FieldsData: []*schemapb.FieldData{
214+
s.getInt64FieldData("ID", []int64{1}),
215+
},
216+
Ids: &schemapb.IDs{IdField: &schemapb.IDs_IntId{IntId: &schemapb.LongArray{Data: []int64{1}}}},
217+
Scores: []float32{0.1},
218+
Topks: []int64{1},
219+
},
220+
}, nil
221+
}).Once()
222+
// build the same two-field text_match filter the example uses and send it through the client
223+
q := "artificial intelligence"
224+
expr := "text_match(title, \"" + q + "\", minimum_should_match=2) OR text_match(document_text, \"" + q + "\", minimum_should_match=2)"
225+
vectors := []entity.Vector{entity.Text(q)}
226+
_, err := s.client.Search(ctx, NewSearchOption(collectionName, 5, vectors).
227+
WithANNSField("text_sparse_vector").
228+
WithFilter(expr).
229+
WithOutputFields("ID"))
230+
s.NoError(err)
231+
})
232+
}
233+
194234
func (s *ReadSuite) TestHybridSearch() {
195235
ctx, cancel := context.WithCancel(context.Background())
196236
defer cancel()
@@ -233,7 +273,7 @@ func (s *ReadSuite) TestHybridSearch() {
233273
return rand.Float32()
234274
}))).WithFilter("ID > 100"), NewAnnRequest("vector", 10, entity.FloatVector(lo.RepeatBy(128, func(_ int) float32 {
235275
return rand.Float32()
236-
})))).WithConsistencyLevel(entity.ClStrong).WithPartitons(partitionName).WithReranker(NewRRFReranker()).WithOutputFields("*"))
276+
})))).WithConsistencyLevel(entity.ClStrong).WithPartitions(partitionName).WithReranker(NewRRFReranker()).WithOutputFields("*"))
237277
s.NoError(err)
238278
})
239279

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ require (
6565
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.23.0
6666
github.com/bits-and-blooms/bitset v1.10.0
6767
github.com/bytedance/mockey v1.2.14
68-
github.com/bytedance/sonic v1.13.2
68+
github.com/bytedance/sonic v1.14.0
6969
github.com/cenkalti/backoff/v4 v4.2.1
7070
github.com/cockroachdb/redact v1.1.3
7171
github.com/google/uuid v1.6.0
@@ -137,7 +137,7 @@ require (
137137
github.com/benesch/cgosymbolizer v0.0.0-20190515212042-bec6fe6e597b // indirect
138138
github.com/beorn7/perks v1.0.1 // indirect
139139
github.com/bytecodealliance/wasmtime-go v1.0.0 // indirect
140-
github.com/bytedance/sonic/loader v0.2.4 // indirect
140+
github.com/bytedance/sonic/loader v0.3.0 // indirect
141141
github.com/campoy/embedmd v1.0.0 // indirect
142142
github.com/cespare/xxhash/v2 v2.3.0 // indirect
143143
github.com/cilium/ebpf v0.11.0 // indirect

go.sum

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,11 +207,11 @@ github.com/bytecodealliance/wasmtime-go v1.0.0 h1:9u9gqaUiaJeN5IoD1L7egD8atOnTGy
207207
github.com/bytecodealliance/wasmtime-go v1.0.0/go.mod h1:jjlqQbWUfVSbehpErw3UoWFndBXRRMvfikYH6KsCwOg=
208208
github.com/bytedance/mockey v1.2.14 h1:KZaFgPdiUwW+jOWFieo3Lr7INM1P+6adO3hxZhDswY8=
209209
github.com/bytedance/mockey v1.2.14/go.mod h1:1BPHF9sol5R1ud/+0VEHGQq/+i2lN+GTsr3O2Q9IENY=
210-
github.com/bytedance/sonic v1.13.2 h1:8/H1FempDZqC4VqjptGo14QQlJx8VdZJegxs6wwfqpQ=
211-
github.com/bytedance/sonic v1.13.2/go.mod h1:o68xyaF9u2gvVBuGHPlUVCy+ZfmNNO5ETf1+KgkJhz4=
210+
github.com/bytedance/sonic v1.14.0 h1:/OfKt8HFw0kh2rj8N0F6C/qPGRESq0BbaNZgcNXXzQQ=
211+
github.com/bytedance/sonic v1.14.0/go.mod h1:WoEbx8WTcFJfzCe0hbmyTGrfjt8PzNEBdxlNUO24NhA=
212212
github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
213-
github.com/bytedance/sonic/loader v0.2.4 h1:ZWCw4stuXUsn1/+zQDqeE7JKP+QO47tz7QCNan80NzY=
214-
github.com/bytedance/sonic/loader v0.2.4/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI=
213+
github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZwvZJyqeA=
214+
github.com/bytedance/sonic/loader v0.3.0/go.mod h1:N8A3vUdtUebEY2/VQC0MyhYeKUFosQU6FxH2JmUe6VI=
215215
github.com/campoy/embedmd v1.0.0 h1:V4kI2qTJJLf4J29RzI/MAt2c3Bl4dQSYPuflzwFH2hY=
216216
github.com/campoy/embedmd v1.0.0/go.mod h1:oxyr9RCiSXg0M3VJ3ks0UGfp98BpSSGr0kpiX3MzVl8=
217217
github.com/casbin/casbin/v2 v2.0.0/go.mod h1:YcPU1XXisHhLzuxH9coDNf2FbKpjGlbCg3n9yuLkIJQ=

internal/core/src/exec/expression/UnaryExpr.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1792,10 +1792,18 @@ PhyUnaryRangeFilterExpr::ExecTextMatch() {
17921792
}
17931793
}
17941794

1795-
auto func = [op_type, slop](Index* index,
1796-
const std::string& query) -> TargetBitmap {
1795+
uint32_t min_should_match = 1; // default value
1796+
if (op_type == proto::plan::OpType::TextMatch &&
1797+
expr_->extra_values_.size() > 0) {
1798+
// min_should_match is stored in the first extra value
1799+
min_should_match = static_cast<uint32_t>(
1800+
GetValueFromProto<int64_t>(expr_->extra_values_[0]));
1801+
}
1802+
1803+
auto func = [op_type, slop, min_should_match](
1804+
Index* index, const std::string& query) -> TargetBitmap {
17971805
if (op_type == proto::plan::OpType::TextMatch) {
1798-
return index->MatchQuery(query);
1806+
return index->MatchQuery(query, min_should_match);
17991807
} else if (op_type == proto::plan::OpType::PhraseMatch) {
18001808
return index->PhraseMatchQuery(query, slop);
18011809
} else {

internal/core/src/index/TextMatchIndex.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,12 @@ TextMatchIndex::Load(const Config& config) {
153153
std::vector<std::string> file;
154154
file.push_back(*it);
155155
files_value.erase(it);
156-
auto index_datas =
156+
auto index_data =
157157
mem_file_manager_->LoadIndexToMemory(file, load_priority);
158158
BinarySet binary_set;
159-
AssembleIndexDatas(index_datas, binary_set);
160-
// clear index_datas to free memory early
161-
index_datas.clear();
159+
AssembleIndexDatas(index_data, binary_set);
160+
// clear index_data to free memory early
161+
index_data.clear();
162162
auto index_valid_data = binary_set.GetByName("index_null_offset");
163163
null_offset_.resize((size_t)index_valid_data->size / sizeof(size_t));
164164
memcpy(null_offset_.data(),
@@ -226,18 +226,18 @@ TextMatchIndex::AddTextsGrowing(size_t n,
226226
// schema_ may not be initialized so we need this `nullable` parameter
227227
void
228228
TextMatchIndex::BuildIndexFromFieldData(
229-
const std::vector<FieldDataPtr>& field_datas, bool nullable) {
229+
const std::vector<FieldDataPtr>& field_data, bool nullable) {
230230
int64_t offset = 0;
231231
if (nullable) {
232232
int64_t total = 0;
233-
for (const auto& data : field_datas) {
233+
for (const auto& data : field_data) {
234234
total += data->get_null_count();
235235
}
236236
{
237237
std::unique_lock<folly::SharedMutex> lock(mutex_);
238238
null_offset_.reserve(total);
239239
}
240-
for (const auto& data : field_datas) {
240+
for (const auto& data : field_data) {
241241
auto n = data->get_num_rows();
242242
for (int i = 0; i < n; i++) {
243243
if (!data->is_valid(i)) {
@@ -251,7 +251,7 @@ TextMatchIndex::BuildIndexFromFieldData(
251251
}
252252
}
253253
} else {
254-
for (const auto& data : field_datas) {
254+
for (const auto& data : field_data) {
255255
auto n = data->get_num_rows();
256256
wrapper_->add_data(
257257
static_cast<const std::string*>(data->Data()), n, offset);
@@ -302,17 +302,19 @@ TextMatchIndex::RegisterTokenizer(const char* tokenizer_name,
302302
}
303303

304304
TargetBitmap
305-
TextMatchIndex::MatchQuery(const std::string& query) {
305+
TextMatchIndex::MatchQuery(const std::string& query,
306+
uint32_t min_should_match) {
307+
tracer::AutoSpan span("TextMatchIndex::MatchQuery", tracer::GetRootSpan());
306308
if (shouldTriggerCommit()) {
307309
Commit();
308310
Reload();
309311
}
310312

311313
TargetBitmap bitset{static_cast<size_t>(Count())};
312-
// The count opeartion of tantivy may be get older cnt if the index is committed with new tantivy segment.
314+
// The count operation of tantivy may return a stale count if the index has been committed with a new tantivy segment.
313315
// So we cannot use the count operation to get the total count for bitmap.
314316
// Just use the maximum offset of hits to get the total count for bitmap here.
315-
wrapper_->match_query(query, &bitset);
317+
wrapper_->match_query(query, min_should_match, &bitset);
316318
return bitset;
317319
}
318320

@@ -324,7 +326,7 @@ TextMatchIndex::PhraseMatchQuery(const std::string& query, uint32_t slop) {
324326
}
325327

326328
TargetBitmap bitset{static_cast<size_t>(Count())};
327-
// The count opeartion of tantivy may be get older cnt if the index is committed with new tantivy segment.
329+
// The count operation of tantivy may return a stale count if the index has been committed with a new tantivy segment.
328330
// So we cannot use the count operation to get the total count for bitmap.
329331
// Just use the maximum offset of hits to get the total count for bitmap here.
330332
wrapper_->phrase_match_query(query, slop, &bitset);

internal/core/src/index/TextMatchIndex.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
6363
int64_t offset_begin);
6464

6565
void
66-
BuildIndexFromFieldData(const std::vector<FieldDataPtr>& field_datas,
66+
BuildIndexFromFieldData(const std::vector<FieldDataPtr>& field_data,
6767
bool nullable);
6868

6969
void
@@ -83,7 +83,7 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
8383
RegisterTokenizer(const char* tokenizer_name, const char* analyzer_params);
8484

8585
TargetBitmap
86-
MatchQuery(const std::string& query);
86+
MatchQuery(const std::string& query, uint32_t min_should_match);
8787

8888
TargetBitmap
8989
PhraseMatchQuery(const std::string& query, uint32_t slop);

internal/core/src/index/TextMatchIndexTest.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ TEST(TextMatch, Index) {
164164
index->Reload();
165165

166166
{
167-
auto res = index->MatchQuery("football");
167+
auto res = index->MatchQuery("football", 1);
168168
ASSERT_EQ(res.size(), 3);
169169
ASSERT_TRUE(res[0]);
170170
ASSERT_FALSE(res[1]);
@@ -177,11 +177,16 @@ TEST(TextMatch, Index) {
177177
ASSERT_TRUE(res2[0]);
178178
ASSERT_FALSE(res2[1]);
179179
ASSERT_TRUE(res2[2]);
180-
res = index->MatchQuery("nothing");
180+
res = index->MatchQuery("nothing", 1);
181181
ASSERT_EQ(res.size(), 3);
182182
ASSERT_FALSE(res[0]);
183183
ASSERT_FALSE(res[1]);
184184
ASSERT_FALSE(res[2]);
185+
auto res3 = index->MatchQuery("football pingpang cricket", 2);
186+
ASSERT_EQ(res3.size(), 3);
187+
ASSERT_TRUE(res3[0]);
188+
ASSERT_FALSE(res3[1]);
189+
ASSERT_FALSE(res3[2]);
185190
}
186191

187192
{

0 commit comments

Comments
 (0)