// Copyright (c) 2014 Couchbase, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package searcher
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math"
|
|
"reflect"
|
|
|
|
"github.com/blevesearch/bleve/v2/search"
|
|
"github.com/blevesearch/bleve/v2/size"
|
|
index "github.com/blevesearch/bleve_index_api"
|
|
)
|
|
|
|
var reflectStaticSizePhraseSearcher int
|
|
|
|
func init() {
|
|
var ps PhraseSearcher
|
|
reflectStaticSizePhraseSearcher = int(reflect.TypeOf(ps).Size())
|
|
}
|
|
|
|
// PhraseSearcher matches documents in which the configured terms occur in
// sequence. It wraps a conjunction searcher over the per-position term
// searchers and post-filters each candidate hit against positional
// (phrase) constraints.
type PhraseSearcher struct {
	mustSearcher search.Searcher       // conjunction over one searcher per phrase position
	queryNorm    float64               // normalization factor propagated to mustSearcher
	currMust     *search.DocumentMatch // current candidate awaiting phrase verification
	terms        [][]string            // outer index = position, inner = alternative terms
	path         phrasePath            // scratch path reused across findPhrasePaths calls
	paths        []phrasePath          // scratch results slice reused across calls
	locations    []search.Location     // scratch storage used when completing matches
	initialized  bool                  // set once the first candidate has been fetched
	// map a term to a list of fuzzy terms that match it
	fuzzyTermMatches map[string][]string
}
|
|
|
|
func (s *PhraseSearcher) Size() int {
|
|
sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr
|
|
|
|
if s.mustSearcher != nil {
|
|
sizeInBytes += s.mustSearcher.Size()
|
|
}
|
|
|
|
if s.currMust != nil {
|
|
sizeInBytes += s.currMust.Size()
|
|
}
|
|
|
|
for _, entry := range s.terms {
|
|
sizeInBytes += size.SizeOfSlice
|
|
for _, entry1 := range entry {
|
|
sizeInBytes += size.SizeOfString + len(entry1)
|
|
}
|
|
}
|
|
|
|
return sizeInBytes
|
|
}
|
|
|
|
func NewPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms []string,
|
|
fuzziness int, autoFuzzy bool, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) {
|
|
|
|
// turn flat terms []string into [][]string
|
|
mterms := make([][]string, len(terms))
|
|
for i, term := range terms {
|
|
mterms[i] = []string{term}
|
|
}
|
|
return NewMultiPhraseSearcher(ctx, indexReader, mterms, fuzziness, autoFuzzy, field, boost, options)
|
|
}
|
|
|
|
func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms [][]string,
|
|
fuzziness int, autoFuzzy bool, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) {
|
|
|
|
options.IncludeTermVectors = true
|
|
var termPositionSearchers []search.Searcher
|
|
var err error
|
|
var ts search.Searcher
|
|
// The following logic checks if fuzziness is enabled.
|
|
// Fuzziness is considered enabled if either:
|
|
// a. `fuzziness` is greater than 0, or
|
|
// b. `autoFuzzy` is set to true.
|
|
// if both conditions are true, `autoFuzzy` takes precedence.
|
|
// If enabled, a map will be created to store the matches for fuzzy terms.
|
|
fuzzinessEnabled := autoFuzzy || fuzziness > 0
|
|
var fuzzyTermMatches map[string][]string
|
|
if fuzzinessEnabled {
|
|
fuzzyTermMatches = make(map[string][]string)
|
|
ctx = context.WithValue(ctx, search.FuzzyMatchPhraseKey, fuzzyTermMatches)
|
|
}
|
|
// in case of fuzzy multi-phrase, phrase and match-phrase queries we hardcode the
|
|
// prefix length to 0, as setting a per word matching prefix length would not
|
|
// make sense from a user perspective.
|
|
for _, termPos := range terms {
|
|
if len(termPos) == 1 && termPos[0] != "" {
|
|
// single term
|
|
if fuzzinessEnabled {
|
|
// fuzzy
|
|
if autoFuzzy {
|
|
// auto fuzzy
|
|
ts, err = NewAutoFuzzySearcher(ctx, indexReader, termPos[0], 0, field, boost, options)
|
|
} else {
|
|
// non-auto fuzzy
|
|
ts, err = NewFuzzySearcher(ctx, indexReader, termPos[0], 0, fuzziness, field, boost, options)
|
|
}
|
|
} else {
|
|
// non-fuzzy
|
|
ts, err = NewTermSearcher(ctx, indexReader, termPos[0], field, boost, options)
|
|
}
|
|
if err != nil {
|
|
// close any searchers already opened
|
|
for _, ts := range termPositionSearchers {
|
|
_ = ts.Close()
|
|
}
|
|
return nil, fmt.Errorf("phrase searcher error building term searcher: %v", err)
|
|
}
|
|
termPositionSearchers = append(termPositionSearchers, ts)
|
|
} else if len(termPos) > 1 {
|
|
// multiple terms
|
|
var termSearchers []search.Searcher
|
|
for _, term := range termPos {
|
|
if term == "" {
|
|
continue
|
|
}
|
|
if fuzzinessEnabled {
|
|
// fuzzy
|
|
if autoFuzzy {
|
|
// auto fuzzy
|
|
ts, err = NewAutoFuzzySearcher(ctx, indexReader, term, 0, field, boost, options)
|
|
} else {
|
|
// non-auto fuzzy
|
|
ts, err = NewFuzzySearcher(ctx, indexReader, term, 0, fuzziness, field, boost, options)
|
|
}
|
|
} else {
|
|
// non-fuzzy
|
|
ts, err = NewTermSearcher(ctx, indexReader, term, field, boost, options)
|
|
}
|
|
if err != nil {
|
|
// close any searchers already opened
|
|
for _, ts := range termPositionSearchers {
|
|
_ = ts.Close()
|
|
}
|
|
return nil, fmt.Errorf("phrase searcher error building term searcher: %v", err)
|
|
}
|
|
termSearchers = append(termSearchers, ts)
|
|
}
|
|
disjunction, err := NewDisjunctionSearcher(ctx, indexReader, termSearchers, 1, options)
|
|
if err != nil {
|
|
// close any searchers already opened
|
|
for _, ts := range termPositionSearchers {
|
|
_ = ts.Close()
|
|
}
|
|
return nil, fmt.Errorf("phrase searcher error building term position disjunction searcher: %v", err)
|
|
}
|
|
termPositionSearchers = append(termPositionSearchers, disjunction)
|
|
}
|
|
}
|
|
|
|
if ctx != nil {
|
|
if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok {
|
|
if ts, exists := fts[field]; exists {
|
|
if fuzzinessEnabled {
|
|
for term, fuzzyTerms := range fuzzyTermMatches {
|
|
fuzzySynonymTerms := make([]string, 0, len(fuzzyTerms))
|
|
if s, found := ts[term]; found {
|
|
fuzzySynonymTerms = append(fuzzySynonymTerms, s...)
|
|
}
|
|
for _, fuzzyTerm := range fuzzyTerms {
|
|
if fuzzyTerm == term {
|
|
continue
|
|
}
|
|
if s, found := ts[fuzzyTerm]; found {
|
|
fuzzySynonymTerms = append(fuzzySynonymTerms, s...)
|
|
}
|
|
}
|
|
if len(fuzzySynonymTerms) > 0 {
|
|
fuzzyTermMatches[term] = append(fuzzyTermMatches[term], fuzzySynonymTerms...)
|
|
}
|
|
}
|
|
} else {
|
|
for _, termPos := range terms {
|
|
for _, term := range termPos {
|
|
if s, found := ts[term]; found {
|
|
if fuzzyTermMatches == nil {
|
|
fuzzyTermMatches = make(map[string][]string)
|
|
}
|
|
fuzzyTermMatches[term] = s
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
mustSearcher, err := NewConjunctionSearcher(ctx, indexReader, termPositionSearchers, options)
|
|
if err != nil {
|
|
// close any searchers already opened
|
|
for _, ts := range termPositionSearchers {
|
|
_ = ts.Close()
|
|
}
|
|
return nil, fmt.Errorf("phrase searcher error building conjunction searcher: %v", err)
|
|
}
|
|
|
|
// build our searcher
|
|
rv := PhraseSearcher{
|
|
mustSearcher: mustSearcher,
|
|
terms: terms,
|
|
fuzzyTermMatches: fuzzyTermMatches,
|
|
}
|
|
rv.computeQueryNorm()
|
|
return &rv, nil
|
|
}
|
|
|
|
func (s *PhraseSearcher) computeQueryNorm() {
|
|
// first calculate sum of squared weights
|
|
sumOfSquaredWeights := 0.0
|
|
if s.mustSearcher != nil {
|
|
sumOfSquaredWeights += s.mustSearcher.Weight()
|
|
}
|
|
|
|
// now compute query norm from this
|
|
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
|
// finally tell all the downstream searchers the norm
|
|
if s.mustSearcher != nil {
|
|
s.mustSearcher.SetQueryNorm(s.queryNorm)
|
|
}
|
|
}
|
|
|
|
func (s *PhraseSearcher) initSearchers(ctx *search.SearchContext) error {
|
|
err := s.advanceNextMust(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s.initialized = true
|
|
return nil
|
|
}
|
|
|
|
func (s *PhraseSearcher) advanceNextMust(ctx *search.SearchContext) error {
|
|
var err error
|
|
|
|
if s.mustSearcher != nil {
|
|
if s.currMust != nil {
|
|
ctx.DocumentMatchPool.Put(s.currMust)
|
|
}
|
|
s.currMust, err = s.mustSearcher.Next(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Weight returns the weight of the underlying conjunction searcher.
func (s *PhraseSearcher) Weight() float64 {
	return s.mustSearcher.Weight()
}
|
|
|
|
// SetQueryNorm forwards the query normalization factor to the underlying
// conjunction searcher.
func (s *PhraseSearcher) SetQueryNorm(qnorm float64) {
	s.mustSearcher.SetQueryNorm(qnorm)
}
|
|
|
|
func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
|
|
if !s.initialized {
|
|
err := s.initSearchers(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
for s.currMust != nil {
|
|
// check this match against phrase constraints
|
|
rv := s.checkCurrMustMatch(ctx)
|
|
|
|
// prepare for next iteration (either loop or subsequent call to Next())
|
|
err := s.advanceNextMust(ctx)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// if match satisfied phrase constraints return it as a hit
|
|
if rv != nil {
|
|
return rv, nil
|
|
}
|
|
}
|
|
|
|
return nil, nil
|
|
}
|
|
|
|
// checkCurrMustMatch is solely concerned with determining if the DocumentMatch
// pointed to by s.currMust (which satisfies the pre-condition searcher)
// also satisfies the phrase constraints. if so, it returns a DocumentMatch
// for this document, otherwise nil
func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch {
	// materialize the candidate's locations, reusing our scratch slice
	s.locations = s.currMust.Complete(s.locations)

	// detach the locations map so it is not carried on the returned match
	locations := s.currMust.Locations
	s.currMust.Locations = nil

	ftls := s.currMust.FieldTermLocations

	// typically we would expect there to only actually be results in
	// one field, but we allow for this to not be the case
	// but, we note that phrase constraints can only be satisfied within
	// a single field, so we can check them each independently
	for field, tlm := range locations {
		ftls = s.checkCurrMustMatchField(ctx, field, tlm, ftls)
	}

	if len(ftls) > 0 {
		// return match; clear s.currMust so advanceNextMust will not
		// recycle the returned match back into the pool
		rv := s.currMust
		s.currMust = nil
		rv.FieldTermLocations = ftls
		return rv
	}

	return nil
}
|
|
|
|
// checkCurrMustMatchField is solely concerned with determining if one
// particular field within the currMust DocumentMatch Locations
// satisfies the phrase constraints (possibly more than once). if so,
// the matching field term locations are appended to the provided
// slice
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
	field string, tlm search.TermLocationMap,
	ftls []search.FieldTermLocation) []search.FieldTermLocation {
	// lazily allocate the scratch path, sized for one part per phrase position
	if s.path == nil {
		s.path = make(phrasePath, 0, len(s.terms))
	}
	var tlmPtr *search.TermLocationMap = &tlm
	if s.fuzzyTermMatches != nil {
		// if fuzzy search, we need to expand the tlm to include all the fuzzy matches
		// Example - term is "foo" and fuzzy matches are "foo", "fool", "food"
		// the non expanded tlm will be:
		// foo -> Locations[foo]
		// fool -> Locations[fool]
		// food -> Locations[food]
		// the expanded tlm will be:
		// foo -> [Locations[foo], Locations[fool], Locations[food]]
		expandedTlm := make(search.TermLocationMap)
		s.expandFuzzyMatches(tlm, expandedTlm)
		tlmPtr = &expandedTlm
	}
	// reuse the scratch slices (s.path[:0], s.paths[:0]) to avoid per-call
	// allocations; slop is 0: phrase terms must be strictly adjacent
	s.paths = findPhrasePaths(0, nil, s.terms, *tlmPtr, s.path[:0], 0, s.paths[:0])
	// flatten every matched path into field term locations
	for _, p := range s.paths {
		for _, pp := range p {
			ftls = append(ftls, search.FieldTermLocation{
				Field: field,
				Term:  pp.term,
				Location: search.Location{
					Pos:            pp.loc.Pos,
					Start:          pp.loc.Start,
					End:            pp.loc.End,
					ArrayPositions: pp.loc.ArrayPositions,
				},
			})
		}
	}
	return ftls
}
|
|
|
|
func (s *PhraseSearcher) expandFuzzyMatches(tlm search.TermLocationMap, expandedTlm search.TermLocationMap) {
|
|
for term, fuzzyMatches := range s.fuzzyTermMatches {
|
|
locations := tlm[term]
|
|
for _, fuzzyMatch := range fuzzyMatches {
|
|
if fuzzyMatch == term {
|
|
continue
|
|
}
|
|
locations = append(locations, tlm[fuzzyMatch]...)
|
|
}
|
|
expandedTlm[term] = locations
|
|
}
|
|
}
|
|
|
|
// phrasePart pairs a term with one specific location of that term
// within a document.
type phrasePart struct {
	term string
	loc  *search.Location
}
|
|
|
|
// String renders the part as "[term location]" for debugging.
func (p *phrasePart) String() string {
	return fmt.Sprintf("[%s %v]", p.term, p.loc)
}
|
|
|
|
// phrasePath is an ordered sequence of term/location pairs representing one
// way the phrase constraints are satisfied within a document.
type phrasePath []phrasePart
|
|
|
func (p phrasePath) MergeInto(in search.TermLocationMap) {
|
|
for _, pp := range p {
|
|
in[pp.term] = append(in[pp.term], pp.loc)
|
|
}
|
|
}
|
|
|
|
func (p phrasePath) String() string {
|
|
rv := "["
|
|
for i, pp := range p {
|
|
if i > 0 {
|
|
rv += ", "
|
|
}
|
|
rv += pp.String()
|
|
}
|
|
rv += "]"
|
|
return rv
|
|
}
|
|
|
|
// findPhrasePaths is a function to identify phrase matches from a set
// of known term locations. it recursive so care must be taken with
// arguments and return values.
//
// prevPos - the previous location, 0 on first invocation
//
// ap - array positions of the first candidate phrase part to
// which further recursive phrase parts must match,
// nil on initial invocation or when there are no array positions
//
// phraseTerms - slice containing the phrase terms,
// may contain empty string as placeholder (don't care)
//
// tlm - the Term Location Map containing all relevant term locations
//
// p - the current path being explored (appended to in recursive calls)
// this is the primary state being built during the traversal
//
// remainingSlop - amount of sloppiness that's allowed, which is the
// sum of the editDistances from each matching phrase part, where 0 means no
// sloppiness allowed (all editDistances must be 0), decremented during recursion
//
// rv - the final result being appended to by all the recursive calls
//
// returns slice of paths, or nil if invocation did not find any successful paths
func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string,
	tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath {
	// no more terms
	if len(phraseTerms) < 1 {
		// snapshot or copy the recursively built phrasePath p and
		// append it to the rv, also optimizing by checking if next
		// phrasePath item in the rv (which we're about to overwrite)
		// is available for reuse
		var pcopy phrasePath
		if len(rv) < cap(rv) {
			// reuse the spare slot's backing storage (truncated to empty)
			pcopy = rv[:len(rv)+1][len(rv)][:0]
		}
		return append(rv, append(pcopy, p...))
	}

	// car/cdr: current position's alternatives and the remaining positions
	car := phraseTerms[0]
	cdr := phraseTerms[1:]

	// empty term is treated as match (continue)
	if len(car) == 0 || (len(car) == 1 && car[0] == "") {
		nextPos := prevPos + 1
		if prevPos == 0 {
			// if prevPos was 0, don't set it to 1 (as thats not a real abs pos)
			nextPos = 0 // don't advance nextPos if prevPos was 0
		}
		return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop, rv)
	}

	// locations for this term
	for _, carTerm := range car {
		locations := tlm[carTerm]
	LOCATIONS_LOOP:
		for _, loc := range locations {
			if prevPos != 0 && !loc.ArrayPositions.Equals(ap) {
				// if the array positions are wrong, can't match, try next location
				continue
			}

			// compute distance from previous phrase term
			dist := 0
			if prevPos != 0 {
				dist = editDistance(prevPos+1, loc.Pos)
			}

			// if enough slop remaining, continue recursively
			if prevPos == 0 || (remainingSlop-dist) >= 0 {
				// skip if we've already used this term+loc already
				for _, ppart := range p {
					if ppart.term == carTerm && ppart.loc == loc {
						continue LOCATIONS_LOOP
					}
				}

				// this location works, add it to the path (but not for empty term)
				px := append(p, phrasePart{term: carTerm, loc: loc})
				rv = findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv)
			}
		}
	}
	return rv
}
|
|
|
|
// editDistance returns the absolute difference between two positions,
// computed via wrapping unsigned subtraction followed by negation of a
// negative signed result.
func editDistance(p1, p2 uint64) int {
	delta := int(p1 - p2)
	if delta >= 0 {
		return delta
	}
	return -delta
}
|
|
|
|
// Advance positions the searcher at or beyond the document with the given
// internal ID and returns the first phrase match from that point onward.
func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) {
	if !s.initialized {
		err := s.initSearchers(ctx)
		if err != nil {
			return nil, err
		}
	}
	if s.currMust != nil {
		if s.currMust.IndexInternalID.Compare(ID) >= 0 {
			// current candidate is already at or past the target;
			// just continue from here
			return s.Next(ctx)
		}
		// current candidate falls before the target; recycle it before
		// advancing the downstream searcher
		ctx.DocumentMatchPool.Put(s.currMust)
	}
	if s.currMust == nil {
		// no current candidate (e.g. downstream searcher exhausted)
		return nil, nil
	}
	var err error
	s.currMust, err = s.mustSearcher.Advance(ctx, ID)
	if err != nil {
		return nil, err
	}
	return s.Next(ctx)
}
|
|
|
|
// Count returns an upper bound on the number of matches: the count of the
// underlying conjunction searcher, which may include documents that fail
// the phrase constraints.
func (s *PhraseSearcher) Count() uint64 {
	// for now return a worst case
	return s.mustSearcher.Count()
}
|
|
|
|
func (s *PhraseSearcher) Close() error {
|
|
if s.mustSearcher != nil {
|
|
err := s.mustSearcher.Close()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Min returns 0; this searcher imposes no minimum match requirement.
func (s *PhraseSearcher) Min() int {
	return 0
}
|
|
|
|
// DocumentMatchPoolSize returns the downstream searcher's pool requirement
// plus one extra slot for the candidate held in currMust.
func (s *PhraseSearcher) DocumentMatchPoolSize() int {
	return s.mustSearcher.DocumentMatchPoolSize() + 1
}
|