// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package searcher

import (
	"context"
	"fmt"
	"strings"

	"github.com/blevesearch/bleve/v2/search"
	index "github.com/blevesearch/bleve_index_api"
)
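
// MaxFuzziness is the largest edit distance accepted by NewFuzzySearcher;
// larger values are rejected with an error.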
var MaxFuzziness = 2

// AutoFuzzinessHighThreshold is the threshold for the term length
// above which the fuzziness is set to MaxFuzziness when the fuzziness
// mode is set to AutoFuzziness.
var AutoFuzzinessHighThreshold = 5

// AutoFuzzinessLowThreshold is the threshold for the term length
// below which the fuzziness is set to zero when the fuzziness mode
// is set to AutoFuzziness.
// For terms with length between AutoFuzzinessLowThreshold and
// AutoFuzzinessHighThreshold, the fuzziness is set to
// MaxFuzziness - 1.
var AutoFuzzinessLowThreshold = 2
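
// NewFuzzySearcher returns a searcher that matches terms within the given
// edit distance (fuzziness) of term, optionally sharing a prefix of the
// given length with it. A fuzziness of 0 falls back to a plain term search;
// otherwise candidate terms are collected from the field dictionary and
// searched with a boosted multi-term searcher.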
func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string,
	prefix, fuzziness int, field string, boost float64,
	options search.SearcherOptions) (search.Searcher, error) {
	if fuzziness > MaxFuzziness {
		return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
	}

	if fuzziness < 0 {
		return nil, fmt.Errorf("invalid fuzziness, negative")
	}

	if fuzziness == 0 {
		// no fuzziness, just do a term search
		// check if the call is made from a phrase searcher
		// and if so, add the term to the fuzzy term matches
		// since the fuzzy candidate terms are not collected
		// for a term search, and the only candidate term is
		// the term itself
		if ctx != nil {
			fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
			if fuzzyTermMatches != nil {
				fuzzyTermMatches.(map[string][]string)[term] = []string{term}
			}
		}
		return NewTermSearcher(ctx, indexReader, term, field, boost, options)
	}

	// Note: we don't byte slice the term for a prefix because of runes.
	prefixTerm := ""
	for i, r := range term {
		if i < prefix {
			prefixTerm += string(r)
		} else {
			break
		}
	}

	fuzzyCandidates, err := findFuzzyCandidateTerms(ctx, indexReader, term, fuzziness,
		field, prefixTerm)
	if err != nil {
		return nil, err
	}

	var candidates []string
	var editDistances []uint8
	var dictBytesRead uint64
	if fuzzyCandidates != nil {
		candidates = fuzzyCandidates.candidates
		editDistances = fuzzyCandidates.editDistances
		dictBytesRead = fuzzyCandidates.bytesRead
	}
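
	// account for the dictionary bytes read while collecting candidates and,
	// if the call came from a phrase searcher, record the candidate terms
	// against the original term.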
	if ctx != nil {
		reportIOStats(ctx, dictBytesRead)
		search.RecordSearchCost(ctx, search.AddM, dictBytesRead)
		fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
		if fuzzyTermMatches != nil {
			fuzzyTermMatches.(map[string][]string)[term] = candidates
		}
	}

	// check if the candidates are empty or have one term which is the term itself
	if len(candidates) == 0 || (len(candidates) == 1 && candidates[0] == term) {
		if ctx != nil {
			fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
			if fuzzyTermMatches != nil {
				fuzzyTermMatches.(map[string][]string)[term] = []string{term}
			}
		}
		return NewTermSearcher(ctx, indexReader, term, field, boost, options)
	}

	return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
		boost, editDistances, options, true)
}
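
// GetAutoFuzziness picks a fuzziness from the byte length of the term:
// 0 for terms no longer than AutoFuzzinessLowThreshold, MaxFuzziness-1 for
// terms no longer than AutoFuzzinessHighThreshold, and MaxFuzziness beyond
// that. For example, with the default thresholds:
//
//	GetAutoFuzziness("go")        // 0
//	GetAutoFuzziness("bleve")     // 1
//	GetAutoFuzziness("couchbase") // 2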
func GetAutoFuzziness(term string) int {
	termLength := len(term)
	if termLength > AutoFuzzinessHighThreshold {
		return MaxFuzziness
	} else if termLength > AutoFuzzinessLowThreshold {
		return MaxFuzziness - 1
	}
	return 0
}
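
// NewAutoFuzzySearcher is a convenience wrapper around NewFuzzySearcher that
// derives the fuzziness from the term length using GetAutoFuzziness.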
func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string,
	prefix int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) {
	return NewFuzzySearcher(ctx, indexReader, term, prefix, GetAutoFuzziness(term), field, boost, options)
}
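
// fuzzyCandidates holds the dictionary terms matched by a fuzzy query, their
// edit distances from the query term (parallel to candidates), and the number
// of dictionary bytes read while collecting them.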
type fuzzyCandidates struct {
	candidates    []string
	editDistances []uint8
	bytesRead     uint64
}
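
// reportIOStats forwards the number of dictionary bytes read to the
// SearchIOStatsCallbackFunc stored on the context, if one is present.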
func reportIOStats(ctx context.Context, bytesRead uint64) {
	// The fuzzy, regexp like queries essentially load a dictionary,
	// which potentially incurs a cost that must be accounted by
	// using the callback to report the value.
	if ctx != nil {
		statsCallbackFn := ctx.Value(search.SearchIOStatsCallbackKey)
		if statsCallbackFn != nil {
			statsCallbackFn.(search.SearchIOStatsCallbackFunc)(bytesRead)
		}
	}
}
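
// findFuzzyCandidateTerms enumerates the field dictionary and returns every
// term within the requested edit distance of term (optionally restricted to
// prefixTerm). Readers implementing index.IndexReaderFuzzy are queried via a
// Levenshtein automaton; otherwise the dictionary is scanned linearly and
// each term's edit distance is computed directly.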
func findFuzzyCandidateTerms(ctx context.Context, indexReader index.IndexReader, term string,
	fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) {
	rv = &fuzzyCandidates{
		candidates:    make([]string, 0),
		editDistances: make([]uint8, 0),
	}

	// in case of advanced reader implementations directly call
	// the levenshtein automaton based iterator to collect the
	// candidate terms
	if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
		termSet := make(map[string]struct{})
		addCandidateTerm := func(term string, editDistance uint8) error {
			if _, exists := termSet[term]; !exists {
				termSet[term] = struct{}{}
				rv.candidates = append(rv.candidates, term)
				rv.editDistances = append(rv.editDistances, editDistance)
				if tooManyClauses(len(rv.candidates)) {
					return tooManyClausesErr(field, len(rv.candidates))
				}
			}
			return nil
		}
		fieldDict, a, err := ir.FieldDictFuzzyAutomaton(field, term, fuzziness, prefixTerm)
		if err != nil {
			return nil, err
		}
		defer func() {
			if cerr := fieldDict.Close(); cerr != nil && err == nil {
				err = cerr
			}
		}()
		tfd, err := fieldDict.Next()
		for err == nil && tfd != nil {
			err = addCandidateTerm(tfd.Term, tfd.EditDistance)
			if err != nil {
				return nil, err
			}
			tfd, err = fieldDict.Next()
		}
		if err != nil {
			return nil, err
		}
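
		// also test any synonyms registered for this field against the
		// automaton so that matching synonyms become fuzzy candidates too.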
		if ctx != nil {
			if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok {
				if ts, exists := fts[field]; exists {
					for term := range ts {
						if _, exists := termSet[term]; exists {
							continue
						}
						if !strings.HasPrefix(term, prefixTerm) {
							continue
						}
						match, editDistance := a.MatchAndDistance(term)
						if match {
							err = addCandidateTerm(term, editDistance)
							if err != nil {
								return nil, err
							}
						}
					}
				}
			}
		}
		rv.bytesRead = fieldDict.BytesRead()
		return rv, nil
	}

	var fieldDict index.FieldDict
	if len(prefixTerm) > 0 {
		fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
	} else {
		fieldDict, err = indexReader.FieldDict(field)
	}
	if err != nil {
		return nil, err
	}
	defer func() {
		if cerr := fieldDict.Close(); cerr != nil && err == nil {
			err = cerr
		}
	}()

	// enumerate terms and check levenshtein distance
	var reuse []int
	tfd, err := fieldDict.Next()
	for err == nil && tfd != nil {
		var ld int
		var exceeded bool
		ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
		if !exceeded && ld <= fuzziness {
			rv.candidates = append(rv.candidates, tfd.Term)
			rv.editDistances = append(rv.editDistances, uint8(ld))
			if tooManyClauses(len(rv.candidates)) {
				return nil, tooManyClausesErr(field, len(rv.candidates))
			}
		}
		tfd, err = fieldDict.Next()
	}
	rv.bytesRead = fieldDict.BytesRead()
	return rv, err
}