golang-github-blevesearch-b.../search/searcher/search_fuzzy.go

//  Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package searcher

import (
	"context"
	"fmt"
	"strings"

	"github.com/blevesearch/bleve/v2/search"
	index "github.com/blevesearch/bleve_index_api"
)

var MaxFuzziness = 2

// AutoFuzzinessHighThreshold is the threshold for the term length
// above which the fuzziness is set to MaxFuzziness when the fuzziness
// mode is set to AutoFuzziness.
var AutoFuzzinessHighThreshold = 5

// AutoFuzzinessLowThreshold is the threshold for the term length
// below which the fuzziness is set to zero when the fuzziness mode
// is set to AutoFuzziness.
// For terms with length between AutoFuzzinessLowThreshold and
// AutoFuzzinessHighThreshold, the fuzziness is set to
// MaxFuzziness - 1.
var AutoFuzzinessLowThreshold = 2

func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string,
	prefix, fuzziness int, field string, boost float64,
	options search.SearcherOptions) (search.Searcher, error) {

	if fuzziness > MaxFuzziness {
		return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
	}

	if fuzziness < 0 {
		return nil, fmt.Errorf("invalid fuzziness, negative")
	}
	if fuzziness == 0 {
		// no fuzziness, just do a term search
		// check if the call is made from a phrase searcher
		// and if so, add the term to the fuzzy term matches
		// since the fuzzy candidate terms are not collected
		// for a term search, and the only candidate term is
		// the term itself
		if ctx != nil {
			fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
			if fuzzyTermMatches != nil {
				fuzzyTermMatches.(map[string][]string)[term] = []string{term}
			}
		}
		return NewTermSearcher(ctx, indexReader, term, field, boost, options)
	}

	// Note: we don't byte slice the term for a prefix because of runes.
	prefixTerm := ""
	for i, r := range term {
		if i < prefix {
			prefixTerm += string(r)
		} else {
			break
		}
	}
	fuzzyCandidates, err := findFuzzyCandidateTerms(ctx, indexReader, term, fuzziness,
		field, prefixTerm)
	if err != nil {
		return nil, err
	}

	var candidates []string
	var editDistances []uint8
	var dictBytesRead uint64
	if fuzzyCandidates != nil {
		candidates = fuzzyCandidates.candidates
		editDistances = fuzzyCandidates.editDistances
		dictBytesRead = fuzzyCandidates.bytesRead
	}

	if ctx != nil {
		reportIOStats(ctx, dictBytesRead)
		search.RecordSearchCost(ctx, search.AddM, dictBytesRead)
		fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
		if fuzzyTermMatches != nil {
			fuzzyTermMatches.(map[string][]string)[term] = candidates
		}
	}
	// check if the candidates are empty or have one term which is the term itself
	if len(candidates) == 0 || (len(candidates) == 1 && candidates[0] == term) {
		if ctx != nil {
			fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
			if fuzzyTermMatches != nil {
				fuzzyTermMatches.(map[string][]string)[term] = []string{term}
			}
		}
		return NewTermSearcher(ctx, indexReader, term, field, boost, options)
	}

	return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
		boost, editDistances, options, true)
}

func GetAutoFuzziness(term string) int {
	termLength := len(term)
	if termLength > AutoFuzzinessHighThreshold {
		return MaxFuzziness
	} else if termLength > AutoFuzzinessLowThreshold {
		return MaxFuzziness - 1
	}
	return 0
}

func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string,
	prefix int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) {
	return NewFuzzySearcher(ctx, indexReader, term, prefix, GetAutoFuzziness(term), field, boost, options)
}

type fuzzyCandidates struct {
	candidates    []string
	editDistances []uint8
	bytesRead     uint64
}

func reportIOStats(ctx context.Context, bytesRead uint64) {
	// The fuzzy, regexp like queries essentially load a dictionary,
	// which potentially incurs a cost that must be accounted by
	// using the callback to report the value.
	if ctx != nil {
		statsCallbackFn := ctx.Value(search.SearchIOStatsCallbackKey)
		if statsCallbackFn != nil {
			statsCallbackFn.(search.SearchIOStatsCallbackFunc)(bytesRead)
		}
	}
}

func findFuzzyCandidateTerms(ctx context.Context, indexReader index.IndexReader, term string,
	fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) {
	rv = &fuzzyCandidates{
		candidates:    make([]string, 0),
		editDistances: make([]uint8, 0),
	}

	// in case of advanced reader implementations directly call
	// the levenshtein automaton based iterator to collect the
	// candidate terms
	if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
		termSet := make(map[string]struct{})
		addCandidateTerm := func(term string, editDistance uint8) error {
			if _, exists := termSet[term]; !exists {
				termSet[term] = struct{}{}
				rv.candidates = append(rv.candidates, term)
				rv.editDistances = append(rv.editDistances, editDistance)
				if tooManyClauses(len(rv.candidates)) {
					return tooManyClausesErr(field, len(rv.candidates))
				}
			}
			return nil
		}
		fieldDict, a, err := ir.FieldDictFuzzyAutomaton(field, term, fuzziness, prefixTerm)
		if err != nil {
			return nil, err
		}
		defer func() {
			if cerr := fieldDict.Close(); cerr != nil && err == nil {
				err = cerr
			}
		}()
		tfd, err := fieldDict.Next()
		for err == nil && tfd != nil {
			err = addCandidateTerm(tfd.Term, tfd.EditDistance)
			if err != nil {
				return nil, err
			}
			tfd, err = fieldDict.Next()
		}
		if err != nil {
			return nil, err
		}
		if ctx != nil {
			if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok {
				if ts, exists := fts[field]; exists {
					for term := range ts {
						if _, exists := termSet[term]; exists {
							continue
						}
						if !strings.HasPrefix(term, prefixTerm) {
							continue
						}
						match, editDistance := a.MatchAndDistance(term)
						if match {
							err = addCandidateTerm(term, editDistance)
							if err != nil {
								return nil, err
							}
						}
					}
				}
			}
		}
		rv.bytesRead = fieldDict.BytesRead()
		return rv, nil
	}

	var fieldDict index.FieldDict
	if len(prefixTerm) > 0 {
		fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
	} else {
		fieldDict, err = indexReader.FieldDict(field)
	}
	if err != nil {
		return nil, err
	}
	defer func() {
		if cerr := fieldDict.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	// enumerate terms and check levenshtein distance
	var reuse []int
	tfd, err := fieldDict.Next()
	for err == nil && tfd != nil {
		var ld int
		var exceeded bool
		ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
		if !exceeded && ld <= fuzziness {
			rv.candidates = append(rv.candidates, tfd.Term)
			rv.editDistances = append(rv.editDistances, uint8(ld))
			if tooManyClauses(len(rv.candidates)) {
				return nil, tooManyClausesErr(field, len(rv.candidates))
			}
		}
		tfd, err = fieldDict.Next()
	}

	rv.bytesRead = fieldDict.BytesRead()
	return rv, err
}