// Copyright (c) 2014 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package search import ( "context" "github.com/blevesearch/geo/s2" ) func MergeLocations(locations []FieldTermLocationMap) FieldTermLocationMap { rv := locations[0] for i := 1; i < len(locations); i++ { nextLocations := locations[i] for field, termLocationMap := range nextLocations { rvTermLocationMap, rvHasField := rv[field] if rvHasField { rv[field] = MergeTermLocationMaps(rvTermLocationMap, termLocationMap) } else { rv[field] = termLocationMap } } } return rv } func MergeTermLocationMaps(rv, other TermLocationMap) TermLocationMap { for term, locationMap := range other { // for a given term/document there cannot be different locations // if they came back from different clauses, overwrite is ok rv[term] = locationMap } return rv } func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation { n := len(dest) for _, dm := range matches { n += len(dm.FieldTermLocations) } if cap(dest) < n { dest = append(make([]FieldTermLocation, 0, n), dest...) 
} for _, dm := range matches { for _, ftl := range dm.FieldTermLocations { dest = append(dest, FieldTermLocation{ Field: ftl.Field, Term: ftl.Term, Location: Location{ Pos: ftl.Location.Pos, Start: ftl.Location.Start, End: ftl.Location.End, ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), }, }) } } return dest } type SearchIOStatsCallbackFunc func(uint64) // Implementation of SearchIncrementalCostCallbackFn should handle the following messages // - add: increment the cost of a search operation // (which can be specific to a query type as well) // - abort: query was aborted due to a cancel of search's context (for eg), // which can be handled differently as well // - done: indicates that a search was complete and the tracked cost can be // handled safely by the implementation. type SearchIncrementalCostCallbackFn func(SearchIncrementalCostCallbackMsg, SearchQueryType, uint64) type ( SearchIncrementalCostCallbackMsg uint SearchQueryType uint ) const ( Term = SearchQueryType(1 << iota) Geo Numeric GenericCost ) const ( AddM = SearchIncrementalCostCallbackMsg(1 << iota) AbortM DoneM ) // ContextKey is used to identify the context key in the context.Context type ContextKey string func (c ContextKey) String() string { return string(c) } const ( SearchIncrementalCostKey ContextKey = "_search_incremental_cost_key" QueryTypeKey ContextKey = "_query_type_key" FuzzyMatchPhraseKey ContextKey = "_fuzzy_match_phrase_key" IncludeScoreBreakdownKey ContextKey = "_include_score_breakdown_key" // PreSearchKey indicates whether to perform a preliminary search to gather necessary // information which would be used in the actual search down the line. PreSearchKey ContextKey = "_presearch_key" // GetScoringModelCallbackKey is used to help the underlying searcher identify // which scoring mechanism to use based on index mapping. 
GetScoringModelCallbackKey ContextKey = "_get_scoring_model" // SearchIOStatsCallbackKey is used to help the underlying searcher identify SearchIOStatsCallbackKey ContextKey = "_search_io_stats_callback_key" // GeoBufferPoolCallbackKey ContextKey is used to help the underlying searcher GeoBufferPoolCallbackKey ContextKey = "_geo_buffer_pool_callback_key" // SearchTypeKey is used to identify type of the search being performed. // // for consistent scoring in cases an index is partitioned/sharded (using an // index alias), GlobalScoring helps in aggregating the necessary stats across // all the child bleve indexes (shards/partitions) first before the actual search // is performed, such that the scoring involved using these stats would be at a // global level. SearchTypeKey ContextKey = "_search_type_key" // The following keys are used to invoke the callbacks at the start and end stages // of optimizing the disjunction/conjunction searcher creation. SearcherStartCallbackKey ContextKey = "_searcher_start_callback_key" SearcherEndCallbackKey ContextKey = "_searcher_end_callback_key" // FieldTermSynonymMapKey is used to store and transport the synonym definitions data // to the actual search phase which would use the synonyms to perform the search. FieldTermSynonymMapKey ContextKey = "_field_term_synonym_map_key" // BM25StatsKey is used to store and transport the BM25 Data // to the actual search phase which would use it to perform the search. BM25StatsKey ContextKey = "_bm25_stats_key" ) func RecordSearchCost(ctx context.Context, msg SearchIncrementalCostCallbackMsg, bytes uint64, ) { if ctx != nil { queryType, ok := ctx.Value(QueryTypeKey).(SearchQueryType) if !ok { // for the cost of the non query type specific factors such as // doc values and stored fields section. 
queryType = GenericCost } aggCallbackFn := ctx.Value(SearchIncrementalCostKey) if aggCallbackFn != nil { aggCallbackFn.(SearchIncrementalCostCallbackFn)(msg, queryType, bytes) } } } // Assigning the size of the largest buffer in the pool to 24KB and // the smallest buffer to 24 bytes. The pools are used to read a // sequence of vertices which are always 24 bytes each. const ( MaxGeoBufPoolSize = 24 * 1024 MinGeoBufPoolSize = 24 ) type GeoBufferPoolCallbackFunc func() *s2.GeoBufferPool // *PreSearchDataKey are used to store the data gathered during the presearch phase // which would be use in the actual search phase. const ( KnnPreSearchDataKey = "_knn_pre_search_data_key" SynonymPreSearchDataKey = "_synonym_pre_search_data_key" BM25PreSearchDataKey = "_bm25_pre_search_data_key" ) const GlobalScoring = "_global_scoring" type ( SearcherStartCallbackFn func(size uint64) error SearcherEndCallbackFn func(size uint64) error ) type GetScoringModelCallbackFn func() string type ScoreExplCorrectionCallbackFunc func(queryMatch *DocumentMatch, knnMatch *DocumentMatch) (float64, *Explanation) // field -> term -> synonyms type FieldTermSynonymMap map[string]map[string][]string func (f FieldTermSynonymMap) MergeWith(fts FieldTermSynonymMap) { for field, termSynonymMap := range fts { // Ensure the field exists in the receiver if _, exists := f[field]; !exists { f[field] = make(map[string][]string) } for term, synonyms := range termSynonymMap { // Append synonyms f[field][term] = append(f[field][term], synonyms...) } } } // BM25 specific multipliers which control the scoring of a document. 
//
// The default values are as per elastic search's implementation
//   - https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html#bm25
//   - https://www.elastic.co/blog/practical-bm25-part-3-considerations-for-picking-b-and-k1-in-elasticsearch

// BM25_k1 controls the saturation of the score due to term frequency.
var BM25_k1 float64 = 1.2

// BM25_b controls the extent to which doc's field length normalize term
// frequency part of score.
var BM25_b float64 = 0.75

// BM25Stats holds the statistics carried for BM25 scoring. It serializes to
// JSON under the keys "doc_count" and "field_cardinality".
type BM25Stats struct {
	// DocCount is a document count, kept as a float64.
	// NOTE(review): presumably the total number of documents - confirm.
	DocCount float64 `json:"doc_count"`
	// FieldCardinality maps a string key to an integer cardinality.
	// NOTE(review): presumably keyed by field name - confirm.
	FieldCardinality map[string]int `json:"field_cardinality"`
}