// Copyright (c) 2023 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build vectors // +build vectors package bleve import ( "context" "encoding/json" "fmt" "sort" "github.com/blevesearch/bleve/v2/document" "github.com/blevesearch/bleve/v2/search" "github.com/blevesearch/bleve/v2/search/collector" "github.com/blevesearch/bleve/v2/search/query" index "github.com/blevesearch/bleve_index_api" ) const supportForVectorSearch = true type knnOperator string // Must be updated only at init var BleveMaxK = int64(10000) type SearchRequest struct { ClientContextID string `json:"client_context_id,omitempty"` Query query.Query `json:"query"` Size int `json:"size"` From int `json:"from"` Highlight *HighlightRequest `json:"highlight"` Fields []string `json:"fields"` Facets FacetsRequest `json:"facets"` Explain bool `json:"explain"` Sort search.SortOrder `json:"sort"` IncludeLocations bool `json:"includeLocations"` Score string `json:"score,omitempty"` SearchAfter []string `json:"search_after"` SearchBefore []string `json:"search_before"` KNN []*KNNRequest `json:"knn"` KNNOperator knnOperator `json:"knn_operator"` // PreSearchData will be a map that will be used // in the second phase of any 2-phase search, to provide additional // context to the second phase. This is useful in the case of index // aliases where the first phase will gather the PreSearchData from all // the indexes in the alias, and the second phase will use that // PreSearchData to perform the actual search. // The currently accepted map configuration is: // // "_knn_pre_search_data_key": []*search.DocumentMatch PreSearchData map[string]interface{} `json:"pre_search_data,omitempty"` sortFunc func(sort.Interface) } // Vector takes precedence over vectorBase64 in case both fields are given type KNNRequest struct { Field string `json:"field"` Vector []float32 `json:"vector"` VectorBase64 string `json:"vector_base64"` K int64 `json:"k"` Boost *query.Boost `json:"boost,omitempty"` // Search parameters for the field's vector index part of the segment. // Value of it depends on the field's backing vector index implementation. // // For Faiss IVF index, supported search params are: // - ivf_nprobe_pct : int // percentage of total clusters to search // - ivf_max_codes_pct : float // percentage of total vectors to visit to do a query (across all clusters) // // Consult go-faiss to know all supported search params Params json.RawMessage `json:"params"` // Filter query to use with kNN pre-filtering. // Supports pre-filtering with all existing types of query clauses. FilterQuery query.Query `json:"filter,omitempty"` } func (r *SearchRequest) AddKNN(field string, vector []float32, k int64, boost float64) { b := query.Boost(boost) r.KNN = append(r.KNN, &KNNRequest{ Field: field, Vector: vector, K: k, Boost: &b, }) } func (r *SearchRequest) AddKNNWithFilter(field string, vector []float32, k int64, boost float64, filterQuery query.Query) { b := query.Boost(boost) r.KNN = append(r.KNN, &KNNRequest{ Field: field, Vector: vector, K: k, Boost: &b, FilterQuery: filterQuery, }) } func (r *SearchRequest) AddKNNOperator(operator knnOperator) { r.KNNOperator = operator } // UnmarshalJSON deserializes a JSON representation of // a SearchRequest func (r *SearchRequest) UnmarshalJSON(input []byte) error { type tempKNNReq struct { Field string `json:"field"` Vector []float32 `json:"vector"` VectorBase64 string `json:"vector_base64"` K int64 `json:"k"` Boost *query.Boost `json:"boost,omitempty"` Params json.RawMessage `json:"params"` FilterQuery json.RawMessage `json:"filter,omitempty"` } var temp struct { Q json.RawMessage `json:"query"` Size *int `json:"size"` From int `json:"from"` Highlight *HighlightRequest `json:"highlight"` Fields []string `json:"fields"` Facets FacetsRequest `json:"facets"` Explain bool `json:"explain"` Sort []json.RawMessage `json:"sort"` IncludeLocations bool `json:"includeLocations"` Score string `json:"score"` SearchAfter []string `json:"search_after"` SearchBefore []string `json:"search_before"` KNN []*tempKNNReq `json:"knn"` KNNOperator knnOperator `json:"knn_operator"` PreSearchData json.RawMessage `json:"pre_search_data"` } err := json.Unmarshal(input, &temp) if err != nil { return err } if temp.Size == nil { r.Size = 10 } else { r.Size = *temp.Size } if temp.Sort == nil { r.Sort = search.SortOrder{&search.SortScore{Desc: true}} } else { r.Sort, err = search.ParseSortOrderJSON(temp.Sort) if err != nil { return err } } r.From = temp.From r.Explain = temp.Explain r.Highlight = temp.Highlight r.Fields = temp.Fields r.Facets = temp.Facets r.IncludeLocations = temp.IncludeLocations r.Score = temp.Score r.SearchAfter = temp.SearchAfter r.SearchBefore = temp.SearchBefore r.Query, err = query.ParseQuery(temp.Q) if err != nil { return err } if r.Size < 0 { r.Size = 10 } if r.From < 0 { r.From = 0 } r.KNN = make([]*KNNRequest, len(temp.KNN)) for i, knnReq := range temp.KNN { r.KNN[i] = &KNNRequest{} r.KNN[i].Field = temp.KNN[i].Field r.KNN[i].Vector = temp.KNN[i].Vector r.KNN[i].VectorBase64 = temp.KNN[i].VectorBase64 r.KNN[i].K = temp.KNN[i].K r.KNN[i].Boost = temp.KNN[i].Boost r.KNN[i].Params = temp.KNN[i].Params if len(knnReq.FilterQuery) == 0 { // Setting this to nil to avoid ParseQuery() setting it to a match none r.KNN[i].FilterQuery = nil } else { r.KNN[i].FilterQuery, err = query.ParseQuery(knnReq.FilterQuery) if err != nil { return err } } } r.KNNOperator = temp.KNNOperator if r.KNNOperator == "" { r.KNNOperator = knnOperatorOr } if temp.PreSearchData != nil { r.PreSearchData, err = query.ParsePreSearchData(temp.PreSearchData) if err != nil { return err } } return nil } // ----------------------------------------------------------------------------- func copySearchRequest(req *SearchRequest, preSearchData map[string]interface{}) *SearchRequest { rv := SearchRequest{ Query: req.Query, Size: req.Size + req.From, From: 0, Highlight: req.Highlight, Fields: req.Fields, Facets: req.Facets, Explain: req.Explain, Sort: req.Sort.Copy(), IncludeLocations: req.IncludeLocations, Score: req.Score, SearchAfter: req.SearchAfter, SearchBefore: req.SearchBefore, KNN: req.KNN, KNNOperator: req.KNNOperator, PreSearchData: preSearchData, } return &rv } var ( knnOperatorAnd = knnOperator("and") knnOperatorOr = knnOperator("or") ) func createKNNQuery(req *SearchRequest, knnFilterResults map[int]index.EligibleDocumentSelector) ( query.Query, []int64, int64, error) { if requestHasKNN(req) { // first perform validation err := validateKNN(req) if err != nil { return nil, nil, 0, err } var subQueries []query.Query kArray := make([]int64, 0, len(req.KNN)) sumOfK := int64(0) for i, knn := range req.KNN { // If it's a filtered kNN but has no eligible filter hits, then // do not run the kNN query. if selector, exists := knnFilterResults[i]; exists && selector == nil { continue } knnQuery := query.NewKNNQuery(knn.Vector) knnQuery.SetFieldVal(knn.Field) knnQuery.SetK(knn.K) knnQuery.SetBoost(knn.Boost.Value()) knnQuery.SetParams(knn.Params) if selector, exists := knnFilterResults[i]; exists { knnQuery.SetEligibleSelector(selector) } subQueries = append(subQueries, knnQuery) kArray = append(kArray, knn.K) sumOfK += knn.K } rv := query.NewDisjunctionQuery(subQueries) rv.RetrieveScoreBreakdown(true) return rv, kArray, sumOfK, nil } return nil, nil, 0, nil } func validateKNN(req *SearchRequest) error { for _, q := range req.KNN { if q == nil { return fmt.Errorf("knn query cannot be nil") } if len(q.Vector) == 0 && q.VectorBase64 != "" { // consider vector_base64 only if vector is not provided decodedVector, err := document.DecodeVector(q.VectorBase64) if err != nil { return err } q.Vector = decodedVector } if q.K <= 0 || len(q.Vector) == 0 { return fmt.Errorf("k must be greater than 0 and vector must be non-empty") } if q.K > BleveMaxK { return fmt.Errorf("k must be less than %d", BleveMaxK) } // since the DefaultField is not applicable for knn, // the field must be specified. if q.Field == "" { return fmt.Errorf("knn query field must be non-empty") } if vfq, ok := q.FilterQuery.(query.ValidatableQuery); ok { err := vfq.Validate() if err != nil { return fmt.Errorf("knn filter query is invalid: %v", err) } } } switch req.KNNOperator { case knnOperatorAnd, knnOperatorOr, "": // Valid cases, do nothing default: return fmt.Errorf("knn_operator must be either 'and' / 'or'") } return nil } func addSortAndFieldsToKNNHits(req *SearchRequest, knnHits []*search.DocumentMatch, reader index.IndexReader, name string) (err error) { requiredSortFields := req.Sort.RequiredFields() var dvReader index.DocValueReader var updateFieldVisitor index.DocValueVisitor if len(requiredSortFields) > 0 { dvReader, err = reader.DocValueReader(requiredSortFields) if err != nil { return err } updateFieldVisitor = func(field string, term []byte) { req.Sort.UpdateVisitor(field, term) } } for _, hit := range knnHits { if len(requiredSortFields) > 0 { err = dvReader.VisitDocValues(hit.IndexInternalID, updateFieldVisitor) if err != nil { return err } } req.Sort.Value(hit) err, _ = LoadAndHighlightFields(hit, req, "", reader, nil) if err != nil { return err } hit.Index = name } return nil } func (i *indexImpl) runKnnCollector(ctx context.Context, req *SearchRequest, reader index.IndexReader, preSearch bool) ([]*search.DocumentMatch, error) { // Maps the index of a KNN query in the request to its pre-filter result: // - If the KNN query is **not filtered**, the value will be `nil`. // - If the KNN query **is filtered**, the value will be an eligible document selector // that can be used to retrieve eligible documents. // - If there is an **empty entry** for a KNN query, it means no documents match // the filter query, and the KNN query can be skipped. knnFilterResults := make(map[int]index.EligibleDocumentSelector) for idx, knnReq := range req.KNN { filterQ := knnReq.FilterQuery if filterQ == nil || isMatchAllQuery(filterQ) { // When there is no filter query or the filter query is match_all, // all documents are eligible, and can be treated as unfiltered query. continue } else if isMatchNoneQuery(filterQ) { // If the filter query is match_none, then no documents match the filter query. knnFilterResults[idx] = nil continue } // Applies to all supported types of queries. filterSearcher, _ := filterQ.Searcher(ctx, reader, i.m, search.SearcherOptions{ Score: "none", // just want eligible hits --> don't compute scores if not needed }) // Using the index doc count to determine collector size since we do not // have an estimate of the number of eligible docs in the index yet. indexDocCount, err := i.DocCount() if err != nil { return nil, err } filterColl := collector.NewEligibleCollector(int(indexDocCount)) err = filterColl.Collect(ctx, filterSearcher, reader) if err != nil { return nil, err } knnFilterResults[idx] = filterColl.EligibleSelector() } // Add the filter hits when creating the kNN query KNNQuery, kArray, sumOfK, err := createKNNQuery(req, knnFilterResults) if err != nil { return nil, err } knnSearcher, err := KNNQuery.Searcher(ctx, reader, i.m, search.SearcherOptions{ Explain: req.Explain, }) if err != nil { return nil, err } knnCollector := collector.NewKNNCollector(kArray, sumOfK) err = knnCollector.Collect(ctx, knnSearcher, reader) if err != nil { return nil, err } knnHits := knnCollector.Results() if !preSearch { knnHits = finalizeKNNResults(req, knnHits) } // at this point, irrespective of whether it is a preSearch or not, // the knn hits are populated with Sort and Fields. // it must be ensured downstream that the Sort and Fields are not // re-evaluated, for these hits. // also add the index names to the hits, so that when early // exit takes place after the first phase, the hits will have // a valid value for Index. err = addSortAndFieldsToKNNHits(req, knnHits, reader, i.name) if err != nil { return nil, err } return knnHits, nil } func setKnnHitsInCollector(knnHits []*search.DocumentMatch, req *SearchRequest, coll *collector.TopNCollector) { if len(knnHits) > 0 { newScoreExplComputer := func(queryMatch *search.DocumentMatch, knnMatch *search.DocumentMatch) (float64, *search.Explanation) { totalScore := queryMatch.Score + knnMatch.Score if !req.Explain { // exit early as we don't need to compute the explanation return totalScore, nil } return totalScore, &search.Explanation{Value: totalScore, Message: "sum of:", Children: []*search.Explanation{queryMatch.Expl, knnMatch.Expl}} } coll.SetKNNHits(knnHits, search.ScoreExplCorrectionCallbackFunc(newScoreExplComputer)) } } func finalizeKNNResults(req *SearchRequest, knnHits []*search.DocumentMatch) []*search.DocumentMatch { // if the KNN operator is AND, then we need to filter out the hits that // do not have match the KNN queries. if req.KNNOperator == knnOperatorAnd { idx := 0 for _, hit := range knnHits { if len(hit.ScoreBreakdown) == len(req.KNN) { knnHits[idx] = hit idx++ } } knnHits = knnHits[:idx] } // fix the score using score breakdown now // if the score is none, then we need to set the score to 0.0 // if req.Explain is true, then we need to use the expl breakdown to // finalize the correct explanation. for _, hit := range knnHits { hit.Score = 0.0 if req.Score != "none" { for _, score := range hit.ScoreBreakdown { hit.Score += score } } if req.Explain { childrenExpl := make([]*search.Explanation, 0, len(hit.ScoreBreakdown)) for i := range hit.ScoreBreakdown { childrenExpl = append(childrenExpl, hit.Expl.Children[i]) } hit.Expl = &search.Explanation{Value: hit.Score, Message: "sum of:", Children: childrenExpl} } // we don't need the score breakdown anymore // so we can set it to nil hit.ScoreBreakdown = nil } return knnHits } // when we are setting KNN hits in the preSearchData, we need to make sure that // the KNN hit goes to the right index. This is because the KNN hits are // collected from all the indexes in the alias, but the preSearchData is // specific to each index. If alias A1 contains indexes I1 and I2 and // the KNN hits collected from both I1 and I2, and merged to get top K // hits, then the top K hits need to be distributed to I1 and I2, // so that the preSearchData for I1 contains the top K hits from I1 and // the preSearchData for I2 contains the top K hits from I2. func validateAndDistributeKNNHits(knnHits []*search.DocumentMatch, indexes []Index) (map[string][]*search.DocumentMatch, error) { // create a set of all the index names of this alias indexNames := make(map[string]struct{}, len(indexes)) for _, index := range indexes { indexNames[index.Name()] = struct{}{} } segregatedKnnHits := make(map[string][]*search.DocumentMatch) for _, hit := range knnHits { // for each hit, we need to perform a validation check to ensure that the stack // is still valid. // // if the stack is empty, then we have an inconsistency/abnormality // since any hit with an empty stack is supposed to land on a leaf index, // and not an alias. This cannot happen in normal circumstances. But // performing this check to be safe. Since we extract the stack top // in the following steps. if len(hit.IndexNames) == 0 { return nil, ErrorTwoPhaseSearchInconsistency } // since the stack is not empty, we need to check if the top of the stack // is a valid index name, of an index that is part of this alias. If not, // then we have an inconsistency that could be caused due to a topology // change. stackTopIdx := len(hit.IndexNames) - 1 top := hit.IndexNames[stackTopIdx] if _, exists := indexNames[top]; !exists { return nil, ErrorTwoPhaseSearchInconsistency } if stackTopIdx == 0 { // if the stack consists of only one index, then popping the top // would result in an empty slice, and handle this case by setting // indexNames to nil. So that the final search results will not // contain the indexNames field. hit.IndexNames = nil } else { hit.IndexNames = hit.IndexNames[:stackTopIdx] } segregatedKnnHits[top] = append(segregatedKnnHits[top], hit) } return segregatedKnnHits, nil } func requestHasKNN(req *SearchRequest) bool { return len(req.KNN) > 0 } // returns true if the search request contains a KNN request that can be // satisfied by just performing a preSearch, completely bypassing the // actual search. func isKNNrequestSatisfiedByPreSearch(req *SearchRequest) bool { // if req.Query is not match_none => then we need to go to phase 2 // to perform the actual query. if !isMatchNoneQuery(req.Query) { return false } // req.Query is a match_none query // // if request contains facets, we need to perform phase 2 to calculate // the facet result. Since documents were removed as part of the // merging process after phase 1, if the facet results were to be calculated // during phase 1, then they will be now be incorrect, since merging would // remove some documents. if req.Facets != nil { return false } // the request is a match_none query and does not contain any facets // so we can satisfy the request using just the preSearch result. return true } func constructKnnPreSearchData(mergedOut map[string]map[string]interface{}, preSearchResult *SearchResult, indexes []Index) (map[string]map[string]interface{}, error) { distributedHits, err := validateAndDistributeKNNHits([]*search.DocumentMatch(preSearchResult.Hits), indexes) if err != nil { return nil, err } for _, index := range indexes { mergedOut[index.Name()][search.KnnPreSearchDataKey] = distributedHits[index.Name()] } return mergedOut, nil } func addKnnToDummyRequest(dummyReq *SearchRequest, realReq *SearchRequest) { dummyReq.KNN = realReq.KNN dummyReq.KNNOperator = knnOperatorOr dummyReq.Explain = realReq.Explain dummyReq.Fields = realReq.Fields dummyReq.Sort = realReq.Sort } func newKnnPreSearchResultProcessor(req *SearchRequest) *knnPreSearchResultProcessor { kArray := make([]int64, len(req.KNN)) for i, knnReq := range req.KNN { kArray[i] = knnReq.K } knnStore := collector.GetNewKNNCollectorStore(kArray) return &knnPreSearchResultProcessor{ addFn: func(sr *SearchResult, indexName string) { for _, hit := range sr.Hits { // tag the hit with the index name, so that when the // final search result is constructed, the hit will have // a valid path to follow along the alias tree to reach // the index. hit.IndexNames = append(hit.IndexNames, indexName) knnStore.AddDocument(hit) } }, finalizeFn: func(sr *SearchResult) { // passing nil as the document fixup function, because we don't need to // fixup the document, since this was already done in the first phase, // hence error is always nil. // the merged knn hits are finalized and set in the search result. sr.Hits, _ = knnStore.Final(nil) }, } }