Skip to content
This repository was archived by the owner on Apr 2, 2026. It is now read-only.

Commit fd98dd4

Browse files
author
Amit Kumar
committed
Multi Field support -use single tantivy call for multiple fields
1 parent 473f72b commit fd98dd4

11 files changed

Lines changed: 828 additions & 24 deletions

File tree

internal/core/src/common/QueryInfo.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,30 @@ struct SearchInfo {
4646
std::optional<std::string> json_path_;
4747
std::optional<milvus::DataType> json_type_;
4848
bool strict_cast_{false};
49+
50+
// Multi-field TEXT_BM25 search support
51+
// Additional field IDs for multi-field search (primary field is in field_id_)
52+
std::vector<FieldId> additional_field_ids_;
53+
// Weights for each field (first weight is for field_id_, rest for additional_field_ids_)
54+
std::vector<float> bm25_weights_;
55+
// If true, use max aggregation; otherwise use weighted_sum (default)
56+
bool bm25_use_max_aggregation_{false};
57+
58+
// Helper to check if this is a multi-field search
59+
bool
60+
IsMultiFieldTextSearch() const {
61+
return !additional_field_ids_.empty();
62+
}
63+
64+
// Get all field IDs (primary + additional)
65+
std::vector<FieldId>
66+
GetAllTextFieldIds() const {
67+
std::vector<FieldId> all_ids;
68+
all_ids.push_back(field_id_);
69+
all_ids.insert(
70+
all_ids.end(), additional_field_ids_.begin(), additional_field_ids_.end());
71+
return all_ids;
72+
}
4973
};
5074

5175
using SearchInfoPtr = std::shared_ptr<SearchInfo>;

internal/core/src/index/TextMatchIndex.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,4 +428,92 @@ TextMatchIndex::BM25SearchQueryWithFilter(const std::string& query,
428428
return {std::move(seg_offsets), std::move(scores)};
429429
}
430430

431+
std::pair<std::vector<int64_t>, std::vector<float>>
432+
TextMatchIndex::BM25MultiFieldSearch(const std::vector<TextMatchIndex*>& indexes,
433+
const std::string& query,
434+
int64_t topk,
435+
const std::vector<float>& weights,
436+
bool use_max_aggregation) {
437+
tracer::AutoSpan span(
438+
"TextMatchIndex::BM25MultiFieldSearch", tracer::GetRootSpan());
439+
440+
if (indexes.empty() || weights.empty() || indexes.size() != weights.size()) {
441+
return {{}, {}};
442+
}
443+
444+
// Collect wrapper pointers - indexes are expected to be ready for reading
445+
std::vector<TantivyIndexWrapper*> wrappers;
446+
wrappers.reserve(indexes.size());
447+
for (auto* idx : indexes) {
448+
if (idx != nullptr && idx->wrapper_ != nullptr) {
449+
wrappers.push_back(idx->wrapper_.get());
450+
}
451+
}
452+
453+
if (wrappers.empty()) {
454+
return {{}, {}};
455+
}
456+
457+
auto aggregation = use_max_aggregation ? BM25AggregationType::Max
458+
: BM25AggregationType::WeightedSum;
459+
460+
auto [doc_ids, scores] = TantivyIndexWrapper::bm25_multi_field_search(
461+
wrappers, query, static_cast<uintptr_t>(topk), weights, aggregation);
462+
463+
// Convert uint32_t doc_ids to int64_t seg_offsets
464+
std::vector<int64_t> seg_offsets;
465+
seg_offsets.reserve(doc_ids.size());
466+
for (auto doc_id : doc_ids) {
467+
seg_offsets.push_back(static_cast<int64_t>(doc_id));
468+
}
469+
470+
return {std::move(seg_offsets), std::move(scores)};
471+
}
472+
473+
std::pair<std::vector<int64_t>, std::vector<float>>
474+
TextMatchIndex::BM25MultiFieldSearchWithFilter(
475+
const std::vector<TextMatchIndex*>& indexes,
476+
const std::string& query,
477+
int64_t topk,
478+
const std::vector<float>& weights,
479+
bool use_max_aggregation,
480+
const uint8_t* filter_bitset,
481+
size_t filter_bitset_len) {
482+
tracer::AutoSpan span(
483+
"TextMatchIndex::BM25MultiFieldSearchWithFilter", tracer::GetRootSpan());
484+
485+
if (indexes.empty() || weights.empty() || indexes.size() != weights.size()) {
486+
return {{}, {}};
487+
}
488+
489+
// Collect wrapper pointers - indexes are expected to be ready for reading
490+
std::vector<TantivyIndexWrapper*> wrappers;
491+
wrappers.reserve(indexes.size());
492+
for (auto* idx : indexes) {
493+
if (idx != nullptr && idx->wrapper_ != nullptr) {
494+
wrappers.push_back(idx->wrapper_.get());
495+
}
496+
}
497+
498+
if (wrappers.empty()) {
499+
return {{}, {}};
500+
}
501+
502+
auto aggregation = use_max_aggregation ? BM25AggregationType::Max
503+
: BM25AggregationType::WeightedSum;
504+
505+
auto [doc_ids, scores] = TantivyIndexWrapper::bm25_multi_field_search_with_filter(
506+
wrappers, query, static_cast<uintptr_t>(topk), weights, aggregation,
507+
filter_bitset, filter_bitset_len);
508+
509+
// Convert uint32_t doc_ids to int64_t seg_offsets
510+
std::vector<int64_t> seg_offsets;
511+
seg_offsets.reserve(doc_ids.size());
512+
for (auto doc_id : doc_ids) {
513+
seg_offsets.push_back(static_cast<int64_t>(doc_id));
514+
}
515+
516+
return {std::move(seg_offsets), std::move(scores)};
517+
}
518+
431519
} // namespace milvus::index

internal/core/src/index/TextMatchIndex.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,32 @@ class TextMatchIndex : public InvertedIndexTantivy<std::string> {
113113
const uint8_t* filter_bitset,
114114
size_t filter_bitset_len);
115115

116+
/// Multi-field BM25 search - queries multiple text indexes and aggregates scores.
117+
/// This enables native multi-field text search without requiring HybridSearch + Reranker.
118+
///
119+
/// @param indexes - Vector of TextMatchIndex pointers for each field
120+
/// @param query - Search query text
121+
/// @param topk - Number of top results per field (before aggregation)
122+
/// @param weights - Weight for each field (must match indexes length)
123+
/// @param use_max_aggregation - If true, use max; otherwise use weighted_sum
124+
/// @return (seg_offsets, scores) sorted by aggregated score
/// @note Returns a pair of empty vectors when `indexes` or `weights` is empty,
///       when their sizes differ, or when no entry of `indexes` is usable.
/// @note Declared static: the indexes to search are passed explicitly rather
///       than taken from `this`.
125+
static std::pair<std::vector<int64_t>, std::vector<float>>
126+
BM25MultiFieldSearch(const std::vector<TextMatchIndex*>& indexes,
127+
const std::string& query,
128+
int64_t topk,
129+
const std::vector<float>& weights,
130+
bool use_max_aggregation = false);
131+
132+
/// Multi-field BM25 search with filter bitset.
///
/// Takes the same `indexes`, `query`, `topk`, `weights` and
/// `use_max_aggregation` arguments as BM25MultiFieldSearch, plus the filter.
///
/// @param filter_bitset - Pointer to the filter bytes, forwarded to the
///        tantivy binding. NOTE(review): bit polarity (keep vs. exclude) is
///        not visible from this declaration - confirm against the binding.
/// @param filter_bitset_len - Length passed alongside `filter_bitset`
/// @return (seg_offsets, scores); both empty when `indexes` or `weights` is
///         empty, when their sizes differ, or when no index is usable.
133+
static std::pair<std::vector<int64_t>, std::vector<float>>
134+
BM25MultiFieldSearchWithFilter(const std::vector<TextMatchIndex*>& indexes,
135+
const std::string& query,
136+
int64_t topk,
137+
const std::vector<float>& weights,
138+
bool use_max_aggregation,
139+
const uint8_t* filter_bitset,
140+
size_t filter_bitset_len);
141+
116142
private:
117143
bool
118144
shouldTriggerCommit();

internal/core/src/query/PlanProto.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include <cstdint>
1717
#include <memory>
18+
#include <sstream>
1819
#include <string>
1920
#include <vector>
2021

@@ -130,6 +131,41 @@ ProtoParser::PlanNodeFromProto(const planpb::PlanNode& plan_node_proto) {
130131
}
131132
}
132133

134+
// Parse multi-field TEXT_BM25 parameters from search_params
135+
// bm25_field_ids: comma-separated additional field IDs (e.g., "101,102,103")
136+
// bm25_weights: comma-separated weights for all fields (e.g., "1.0,0.5,0.5")
137+
// bm25_agg: aggregation type ("weighted_sum" or "max")
138+
if (search_info.search_params_.contains("bm25_field_ids")) {
139+
std::string field_ids_str =
140+
search_info.search_params_["bm25_field_ids"].get<std::string>();
141+
std::stringstream ss(field_ids_str);
142+
std::string token;
143+
while (std::getline(ss, token, ',')) {
144+
if (!token.empty()) {
145+
search_info.additional_field_ids_.push_back(
146+
FieldId(std::stoll(token)));
147+
}
148+
}
149+
}
150+
151+
if (search_info.search_params_.contains("bm25_weights")) {
152+
std::string weights_str =
153+
search_info.search_params_["bm25_weights"].get<std::string>();
154+
std::stringstream ss(weights_str);
155+
std::string token;
156+
while (std::getline(ss, token, ',')) {
157+
if (!token.empty()) {
158+
search_info.bm25_weights_.push_back(std::stof(token));
159+
}
160+
}
161+
}
162+
163+
if (search_info.search_params_.contains("bm25_agg")) {
164+
std::string agg_type =
165+
search_info.search_params_["bm25_agg"].get<std::string>();
166+
search_info.bm25_use_max_aggregation_ = (agg_type == "max");
167+
}
168+
133169
return search_info;
134170
};
135171

internal/core/src/segcore/SegmentInterface.cpp

Lines changed: 82 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -503,35 +503,93 @@ SegmentInternalInterface::text_search(SearchInfo& search_info,
503503
SearchResult& output) const {
504504
tracer::AutoSpan span("text_search", tracer::GetRootSpan());
505505

506-
auto field_id = search_info.field_id_;
507506
auto topk = search_info.topk_;
508-
509-
// Get the text index for this field
510-
auto pinned_index = GetTextIndex(op_context, field_id);
511-
auto text_index = pinned_index.get();
512-
513507
std::vector<int64_t> seg_offsets;
514508
std::vector<float> scores;
515509

516-
// Use filtered BM25 search if there's a filter, otherwise use unfiltered
517-
// This is more efficient than searching and then filtering post-hoc
518-
// because Tantivy can skip filtered documents during search.
519-
if (!bitset.empty()) {
520-
// Pass the bitset to Tantivy for efficient filtered search
521-
auto [filtered_offsets, filtered_scores] =
522-
text_index->BM25SearchQueryWithFilter(
523-
query_text,
524-
topk,
525-
bitset.data(),
526-
bitset.size());
527-
seg_offsets = std::move(filtered_offsets);
528-
scores = std::move(filtered_scores);
510+
// Check if this is a multi-field TEXT_BM25 search
511+
if (search_info.IsMultiFieldTextSearch()) {
512+
// Multi-field search: gather all text indexes
513+
auto all_field_ids = search_info.GetAllTextFieldIds();
514+
515+
// Collect pinned indexes (to keep them alive during search)
516+
std::vector<PinWrapper<index::TextMatchIndex*>> pinned_indexes;
517+
std::vector<index::TextMatchIndex*> text_indexes;
518+
pinned_indexes.reserve(all_field_ids.size());
519+
text_indexes.reserve(all_field_ids.size());
520+
521+
for (const auto& fid : all_field_ids) {
522+
auto pinned = GetTextIndex(op_context, fid);
523+
text_indexes.push_back(pinned.get());
524+
pinned_indexes.push_back(std::move(pinned));
525+
}
526+
527+
// Get weights (use equal weights if not specified)
528+
std::vector<float> weights = search_info.bm25_weights_;
529+
if (weights.empty()) {
530+
weights.assign(all_field_ids.size(), 1.0f);
531+
}
532+
533+
// Ensure weights vector matches field count
534+
if (weights.size() != all_field_ids.size()) {
535+
throw SegcoreError(
536+
milvus::ErrorCode::FieldNotLoaded,
537+
fmt::format("BM25 weights count ({}) doesn't match field count ({})",
538+
weights.size(), all_field_ids.size()));
539+
}
540+
541+
// Use filtered or unfiltered multi-field search
542+
if (!bitset.empty()) {
543+
auto [filtered_offsets, filtered_scores] =
544+
index::TextMatchIndex::BM25MultiFieldSearchWithFilter(
545+
text_indexes,
546+
query_text,
547+
topk,
548+
weights,
549+
search_info.bm25_use_max_aggregation_,
550+
bitset.data(),
551+
bitset.size());
552+
seg_offsets = std::move(filtered_offsets);
553+
scores = std::move(filtered_scores);
554+
} else {
555+
auto [unfiltered_offsets, unfiltered_scores] =
556+
index::TextMatchIndex::BM25MultiFieldSearch(
557+
text_indexes,
558+
query_text,
559+
topk,
560+
weights,
561+
search_info.bm25_use_max_aggregation_);
562+
seg_offsets = std::move(unfiltered_offsets);
563+
scores = std::move(unfiltered_scores);
564+
}
529565
} else {
530-
// No filter, use regular BM25 search
531-
auto [unfiltered_offsets, unfiltered_scores] =
532-
text_index->BM25SearchQuery(query_text, topk);
533-
seg_offsets = std::move(unfiltered_offsets);
534-
scores = std::move(unfiltered_scores);
566+
// Single-field search (original path)
567+
auto field_id = search_info.field_id_;
568+
569+
// Get the text index for this field
570+
auto pinned_index = GetTextIndex(op_context, field_id);
571+
auto text_index = pinned_index.get();
572+
573+
// Use filtered BM25 search if there's a filter, otherwise use unfiltered
574+
// This is more efficient than searching and then filtering post-hoc
575+
// because Tantivy can skip filtered documents during search.
576+
if (!bitset.empty()) {
577+
// Pass the bitset to Tantivy for efficient filtered search
578+
auto [filtered_offsets, filtered_scores] =
579+
text_index->BM25SearchQueryWithFilter(
580+
query_text,
581+
topk,
582+
bitset.data(),
583+
bitset.size());
584+
seg_offsets = std::move(filtered_offsets);
585+
scores = std::move(filtered_scores);
586+
} else {
587+
// No filter, use regular BM25 search
588+
auto [unfiltered_offsets, unfiltered_scores] =
589+
text_index->BM25SearchQuery(query_text, topk);
590+
seg_offsets = std::move(unfiltered_offsets);
591+
scores = std::move(unfiltered_scores);
592+
}
535593
}
536594

537595
// Populate the search result

internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@
66
#include <ostream>
77
#include <new>
88

9+
/// BM25 aggregation type for multi-field search.
10+
/// 0 = WeightedSum (default), 1 = Max
///
/// Passed by value as the `aggregation` argument of
/// tantivy_bm25_multi_field_search and
/// tantivy_bm25_multi_field_search_with_filter.
/// NOTE(review): unlike TantivyDataType below, no explicit underlying type is
/// given here - confirm the representation matches the Rust side of the
/// binding before changing or adding enumerators.
11+
enum class BM25AggregationType {
12+
WeightedSum = 0,
13+
Max = 1,
14+
};
15+
916
enum class TantivyDataType : uint8_t {
1017
Text,
1118
Keyword,
@@ -346,6 +353,36 @@ RustResult tantivy_bm25_search_query_with_filter(void *ptr,
346353
uintptr_t filter_bitset_len,
347354
RustScoredSearchResult *result);
348355

356+
/// Performs multi-field BM25 search by querying multiple text indexes and aggregating results.
357+
/// This enables native multi-field text search without requiring HybridSearch + Reranker.
358+
///
359+
/// # Arguments
360+
/// * `readers` - Array of IndexReaderWrapper pointers for each field
361+
/// * `num_readers` - Number of readers
362+
/// * `query` - Search query text
363+
/// * `topk` - Number of top results per field (before aggregation)
364+
/// * `weights` - Weight for each field (array of floats, length must match num_readers)
365+
/// * `aggregation` - How to combine scores (0 = WeightedSum, 1 = Max)
366+
/// * `result` - Output result
///
/// `topk` is unsigned (`uintptr_t`); callers holding a signed count must
/// reject negative values before converting.
/// NOTE(review): who releases the data written to `result` is not visible
/// from this declaration - confirm against the binding's free functions.
367+
RustResult tantivy_bm25_multi_field_search(void *const *readers,
368+
uintptr_t num_readers,
369+
const char *query,
370+
uintptr_t topk,
371+
const float *weights,
372+
BM25AggregationType aggregation,
373+
RustScoredSearchResult *result);
374+
375+
/// Performs multi-field BM25 search with a filter bitset.
///
/// `readers`, `num_readers`, `query`, `topk`, `weights`, `aggregation` and
/// `result` have the same meaning as for tantivy_bm25_multi_field_search.
///
/// # Additional arguments
/// * `filter_bitset` - Pointer to the filter bytes
/// * `filter_bitset_len` - Length passed alongside `filter_bitset`
///
/// NOTE(review): bit polarity (keep vs. exclude) and whether the length is
/// in bytes or bits are not visible from this declaration - confirm against
/// the Rust implementation.
376+
RustResult tantivy_bm25_multi_field_search_with_filter(void *const *readers,
377+
uintptr_t num_readers,
378+
const char *query,
379+
uintptr_t topk,
380+
const float *weights,
381+
BM25AggregationType aggregation,
382+
const uint8_t *filter_bitset,
383+
uintptr_t filter_bitset_len,
384+
RustScoredSearchResult *result);
385+
349386
RustResult tantivy_create_index(const char *field_name,
350387
TantivyDataType data_type,
351388
const char *path,

0 commit comments

Comments
 (0)