1
0
Fork 0
golang-github-blevesearch-b.../search/util.go
Daniel Baumann 982828099e
Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-19 00:20:02 +02:00

235 lines
7.6 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package search
import (
"context"
"github.com/blevesearch/geo/s2"
)
// MergeLocations merges a slice of FieldTermLocationMaps into a single
// map. The first element of locations is used as the accumulator and is
// mutated in place; the remaining maps are folded into it field by field
// via MergeTermLocationMaps. It returns nil for an empty input slice.
func MergeLocations(locations []FieldTermLocationMap) FieldTermLocationMap {
	// Guard: the original indexed locations[0] unconditionally and
	// would panic when called with an empty slice.
	if len(locations) == 0 {
		return nil
	}
	rv := locations[0]
	for i := 1; i < len(locations); i++ {
		for field, termLocationMap := range locations[i] {
			if existing, ok := rv[field]; ok {
				rv[field] = MergeTermLocationMaps(existing, termLocationMap)
			} else {
				rv[field] = termLocationMap
			}
		}
	}
	return rv
}
// MergeTermLocationMaps copies every entry of other into rv and returns rv.
// For a given term/document there cannot be different locations, so when
// the same term arrives from different clauses an overwrite is safe.
func MergeTermLocationMaps(rv, other TermLocationMap) TermLocationMap {
	for term, locations := range other {
		rv[term] = locations
	}
	return rv
}
// MergeFieldTermLocations appends a copy of every FieldTermLocation from
// each DocumentMatch onto dest and returns the (possibly re-allocated)
// slice. Capacity is grown at most once, up front, and each location's
// ArrayPositions slice is cloned so the result does not alias the matches.
func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation {
	// Pre-compute the final length so we reallocate at most once.
	total := len(dest)
	for _, match := range matches {
		total += len(match.FieldTermLocations)
	}
	if cap(dest) < total {
		grown := make([]FieldTermLocation, 0, total)
		dest = append(grown, dest...)
	}
	for _, match := range matches {
		for _, src := range match.FieldTermLocations {
			copied := FieldTermLocation{
				Field: src.Field,
				Term:  src.Term,
				Location: Location{
					Pos:   src.Location.Pos,
					Start: src.Location.Start,
					End:   src.Location.End,
					// Deep-copy so callers can't mutate the source match.
					ArrayPositions: append(ArrayPositions(nil), src.Location.ArrayPositions...),
				},
			}
			dest = append(dest, copied)
		}
	}
	return dest
}
// SearchIOStatsCallbackFunc is a callback taking a byte count; it is
// registered on the search context under SearchIOStatsCallbackKey.
// NOTE(review): presumably invoked with bytes read during search I/O —
// confirm at the call sites.
type SearchIOStatsCallbackFunc func(uint64)

// Implementation of SearchIncrementalCostCallbackFn should handle the following messages
// - add: increment the cost of a search operation
// (which can be specific to a query type as well)
// - abort: query was aborted due to a cancel of search's context (for eg),
// which can be handled differently as well
// - done: indicates that a search was complete and the tracked cost can be
// handled safely by the implementation.
type SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg,
	SearchQueryType, uint64)

type (
	// SearchIncrementalCostCallbackMsg identifies which lifecycle event
	// is being reported to the cost callback: AddM, AbortM, or DoneM.
	SearchIncrementalCostCallbackMsg uint
	// SearchQueryType identifies the category of query whose cost is
	// being reported: Term, Geo, Numeric, or GenericCost.
	SearchQueryType uint
)
// SearchQueryType flag values passed to the incremental cost callback.
const (
	Term = SearchQueryType(1 << iota)
	Geo
	Numeric
	// GenericCost covers costs not attributable to a specific query
	// type, e.g. doc values and stored fields section reads (see
	// RecordSearchCost).
	GenericCost
)

// SearchIncrementalCostCallbackMsg values (see the documentation on
// SearchIncrementalCostCallbackFn for how each should be handled).
const (
	// AddM: increment the tracked cost of a search operation.
	AddM = SearchIncrementalCostCallbackMsg(1 << iota)
	// AbortM: the search was aborted, e.g. its context was cancelled.
	AbortM
	// DoneM: the search completed; the tracked cost can be finalized.
	DoneM
)
// ContextKey is used to identify the context key in the context.Context
type ContextKey string

// String implements fmt.Stringer, exposing the key's underlying string.
func (key ContextKey) String() string {
	return string(key)
}
// Context keys under which search-related data and callbacks are stored
// on the context.Context flowing through a search.
const (
	SearchIncrementalCostKey ContextKey = "_search_incremental_cost_key"
	QueryTypeKey             ContextKey = "_query_type_key"
	FuzzyMatchPhraseKey      ContextKey = "_fuzzy_match_phrase_key"
	IncludeScoreBreakdownKey ContextKey = "_include_score_breakdown_key"
	// PreSearchKey indicates whether to perform a preliminary search to gather necessary
	// information which would be used in the actual search down the line.
	PreSearchKey ContextKey = "_presearch_key"
	// GetScoringModelCallbackKey is used to help the underlying searcher identify
	// which scoring mechanism to use based on index mapping.
	GetScoringModelCallbackKey ContextKey = "_get_scoring_model"
	// SearchIOStatsCallbackKey is used to register a SearchIOStatsCallbackFunc
	// through which the underlying searcher reports search I/O byte counts.
	// NOTE(review): the original comment was truncated mid-sentence; confirm
	// the exact semantics at the call sites.
	SearchIOStatsCallbackKey ContextKey = "_search_io_stats_callback_key"
	// GeoBufferPoolCallbackKey is used to register a GeoBufferPoolCallbackFunc
	// through which the underlying searcher obtains an s2.GeoBufferPool.
	// NOTE(review): the original comment was truncated mid-sentence; confirm.
	GeoBufferPoolCallbackKey ContextKey = "_geo_buffer_pool_callback_key"
	// SearchTypeKey is used to identify type of the search being performed.
	//
	// for consistent scoring in cases an index is partitioned/sharded (using an
	// index alias), GlobalScoring helps in aggregating the necessary stats across
	// all the child bleve indexes (shards/partitions) first before the actual search
	// is performed, such that the scoring involved using these stats would be at a
	// global level.
	SearchTypeKey ContextKey = "_search_type_key"
	// The following keys are used to invoke the callbacks at the start and end stages
	// of optimizing the disjunction/conjunction searcher creation.
	SearcherStartCallbackKey ContextKey = "_searcher_start_callback_key"
	SearcherEndCallbackKey   ContextKey = "_searcher_end_callback_key"
	// FieldTermSynonymMapKey is used to store and transport the synonym definitions data
	// to the actual search phase which would use the synonyms to perform the search.
	FieldTermSynonymMapKey ContextKey = "_field_term_synonym_map_key"
	// BM25StatsKey is used to store and transport the BM25 Data
	// to the actual search phase which would use it to perform the search.
	BM25StatsKey ContextKey = "_bm25_stats_key"
)
// RecordSearchCost reports the incremental cost (bytes) of a search
// operation to the SearchIncrementalCostCallbackFn registered on ctx
// under SearchIncrementalCostKey, if any. msg indicates the lifecycle
// event being reported (AddM/AbortM/DoneM). The query type is taken
// from ctx's QueryTypeKey, defaulting to GenericCost when absent.
func RecordSearchCost(ctx context.Context,
	msg SearchIncrementalCostCallbackMsg, bytes uint64,
) {
	if ctx == nil {
		return
	}
	queryType, ok := ctx.Value(QueryTypeKey).(SearchQueryType)
	if !ok {
		// for the cost of the non query type specific factors such as
		// doc values and stored fields section.
		queryType = GenericCost
	}
	// Comma-ok assertion: the original asserted the callback's type
	// unconditionally and would panic if a value of an unexpected type
	// were stored under the key; a mistyped value is now ignored.
	if cb, ok := ctx.Value(SearchIncrementalCostKey).(SearchIncrementalCostCallbackFn); ok {
		cb(msg, queryType, bytes)
	}
}
// Assigning the size of the largest buffer in the pool to 24KB and
// the smallest buffer to 24 bytes. The pools are used to read a
// sequence of vertices which are always 24 bytes each.
const (
	MaxGeoBufPoolSize = 24 * 1024
	MinGeoBufPoolSize = 24
)

// GeoBufferPoolCallbackFunc returns the s2.GeoBufferPool the underlying
// searcher should use; registered under GeoBufferPoolCallbackKey.
type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool
// *PreSearchDataKey are used to store the data gathered during the presearch phase
// which would be used in the actual search phase.
const (
	KnnPreSearchDataKey     = "_knn_pre_search_data_key"
	SynonymPreSearchDataKey = "_synonym_pre_search_data_key"
	BM25PreSearchDataKey    = "_bm25_pre_search_data_key"
)

// GlobalScoring is the SearchTypeKey value requesting globally consistent
// scoring across partitioned/sharded indexes (see SearchTypeKey).
const GlobalScoring = "_global_scoring"
type (
	// SearcherStartCallbackFn is invoked at the start of optimized
	// disjunction/conjunction searcher creation (see
	// SearcherStartCallbackKey). NOTE(review): size's exact meaning is
	// not visible here — confirm at the call sites.
	SearcherStartCallbackFn func(size uint64) error
	// SearcherEndCallbackFn is invoked at the end of optimized
	// disjunction/conjunction searcher creation (see
	// SearcherEndCallbackKey).
	SearcherEndCallbackFn func(size uint64) error
)

// GetScoringModelCallbackFn returns the name of the scoring model to use,
// based on index mapping (see GetScoringModelCallbackKey).
type GetScoringModelCallbackFn func() string

// ScoreExplCorrectionCallbackFunc produces a corrected score and
// explanation from a query match and its corresponding knn match.
type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation)
// FieldTermSynonymMap maps field -> term -> the synonyms of that term
// within the field.
type FieldTermSynonymMap map[string]map[string][]string

// MergeWith folds the entries of fts into f, creating any missing field
// entry and appending synonym lists for terms that already exist.
func (f FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) {
	for field, terms := range fts {
		target, ok := f[field]
		if !ok {
			// First time we see this field in the receiver.
			target = make(map[string][]string)
			f[field] = target
		}
		for term, synonyms := range terms {
			target[term] = append(target[term], synonyms...)
		}
	}
}
// BM25 specific multipliers which control the scoring of a document.
//
// BM25_b - controls the extent to which doc's field length normalize term frequency part of score
// BM25_k1 - controls the saturation of the score due to term frequency
// the default values are as per elastic search's implementation
// - https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html#bm25
// - https://www.elastic.co/blog/practical-bm25-part-3-considerations-for-picking-b-and-k1-in-elasticsearch
var (
	BM25_k1 float64 = 1.2
	BM25_b  float64 = 0.75
)

// BM25Stats carries index-wide statistics for BM25 scoring; transported
// on the context under BM25StatsKey (and via BM25PreSearchDataKey during
// presearch).
type BM25Stats struct {
	// DocCount is the document count used in BM25 computations.
	// NOTE(review): float64 rather than int — presumably to avoid
	// conversions during scoring math; confirm.
	DocCount float64 `json:"doc_count"`
	// FieldCardinality maps field name to a per-field count.
	// NOTE(review): exact semantics (unique terms vs. total term count)
	// are not visible here — confirm against the producer.
	FieldCardinality map[string]int `json:"field_cardinality"`
}