Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-19 00:20:02 +02:00 · 2025-05-19 00:20:02 +02:00 · 982828099e
commit 982828099e
parent c71cb8b61d
783 changed files with 150650 additions and 0 deletions
--- a/search/searcher/search_fuzzy.go
+++ b/search/searcher/search_fuzzy.go
@ -0,0 +1,250 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package searcher
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"github.com/blevesearch/bleve/v2/search"
+	index "github.com/blevesearch/bleve_index_api"
+)
+
+var MaxFuzziness = 2
+
+// AutoFuzzinessHighThreshold is the threshold for the term length
+// above which the fuzziness is set to MaxFuzziness when the fuzziness
+// mode is set to AutoFuzziness.
+var AutoFuzzinessHighThreshold = 5
+
+// AutoFuzzinessLowThreshold is the threshold for the term length
+// below which the fuzziness is set to zero when the fuzziness mode
+// is set to AutoFuzziness.
+// For terms with length between AutoFuzzinessLowThreshold and
+// AutoFuzzinessHighThreshold, the fuzziness is set to
+// MaxFuzziness - 1.
+var AutoFuzzinessLowThreshold = 2
+
+func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string,
+	prefix, fuzziness int, field string, boost float64,
+	options search.SearcherOptions) (search.Searcher, error) {
+
+	if fuzziness > MaxFuzziness {
+		return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
+	}
+
+	if fuzziness < 0 {
+		return nil, fmt.Errorf("invalid fuzziness, negative")
+	}
+	if fuzziness == 0 {
+		// no fuzziness, just do a term search
+		// check if the call is made from a phrase searcher
+		// and if so, add the term to the fuzzy term matches
+		// since the fuzzy candidate terms are not collected
+		// for a term search, and the only candidate term is
+		// the term itself
+		if ctx != nil {
+			fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
+			if fuzzyTermMatches != nil {
+				fuzzyTermMatches.(map[string][]string)[term] = []string{term}
+			}
+		}
+		return NewTermSearcher(ctx, indexReader, term, field, boost, options)
+	}
+
+	// Note: we don't byte slice the term for a prefix because of runes.
+	prefixTerm := ""
+	for i, r := range term {
+		if i < prefix {
+			prefixTerm += string(r)
+		} else {
+			break
+		}
+	}
+	fuzzyCandidates, err := findFuzzyCandidateTerms(ctx, indexReader, term, fuzziness,
+		field, prefixTerm)
+	if err != nil {
+		return nil, err
+	}
+
+	var candidates []string
+	var editDistances []uint8
+	var dictBytesRead uint64
+	if fuzzyCandidates != nil {
+		candidates = fuzzyCandidates.candidates
+		editDistances = fuzzyCandidates.editDistances
+		dictBytesRead = fuzzyCandidates.bytesRead
+	}
+
+	if ctx != nil {
+		reportIOStats(ctx, dictBytesRead)
+		search.RecordSearchCost(ctx, search.AddM, dictBytesRead)
+		fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
+		if fuzzyTermMatches != nil {
+			fuzzyTermMatches.(map[string][]string)[term] = candidates
+		}
+	}
+	// check if the candidates are empty or have one term which is the term itself
+	if len(candidates) == 0 || (len(candidates) == 1 && candidates[0] == term) {
+		if ctx != nil {
+			fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
+			if fuzzyTermMatches != nil {
+				fuzzyTermMatches.(map[string][]string)[term] = []string{term}
+			}
+		}
+		return NewTermSearcher(ctx, indexReader, term, field, boost, options)
+	}
+
+	return NewMultiTermSearcherBoosted(ctx, indexReader, candidates, field,
+		boost, editDistances, options, true)
+}
+
+func GetAutoFuzziness(term string) int {
+	termLength := len(term)
+	if termLength > AutoFuzzinessHighThreshold {
+		return MaxFuzziness
+	} else if termLength > AutoFuzzinessLowThreshold {
+		return MaxFuzziness - 1
+	}
+	return 0
+}
+
+func NewAutoFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term string,
+	prefix int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) {
+	return NewFuzzySearcher(ctx, indexReader, term, prefix, GetAutoFuzziness(term), field, boost, options)
+}
+
+type fuzzyCandidates struct {
+	candidates    []string
+	editDistances []uint8
+	bytesRead     uint64
+}
+
+func reportIOStats(ctx context.Context, bytesRead uint64) {
+	// The fuzzy, regexp like queries essentially load a dictionary,
+	// which potentially incurs a cost that must be accounted by
+	// using the callback to report the value.
+	if ctx != nil {
+		statsCallbackFn := ctx.Value(search.SearchIOStatsCallbackKey)
+		if statsCallbackFn != nil {
+			statsCallbackFn.(search.SearchIOStatsCallbackFunc)(bytesRead)
+		}
+	}
+}
+
+func findFuzzyCandidateTerms(ctx context.Context, indexReader index.IndexReader, term string,
+	fuzziness int, field, prefixTerm string) (rv *fuzzyCandidates, err error) {
+	rv = &fuzzyCandidates{
+		candidates:    make([]string, 0),
+		editDistances: make([]uint8, 0),
+	}
+
+	// in case of advanced reader implementations directly call
+	// the levenshtein automaton based iterator to collect the
+	// candidate terms
+	if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
+		termSet := make(map[string]struct{})
+		addCandidateTerm := func(term string, editDistance uint8) error {
+			if _, exists := termSet[term]; !exists {
+				termSet[term] = struct{}{}
+				rv.candidates = append(rv.candidates, term)
+				rv.editDistances = append(rv.editDistances, editDistance)
+				if tooManyClauses(len(rv.candidates)) {
+					return tooManyClausesErr(field, len(rv.candidates))
+				}
+			}
+			return nil
+		}
+		fieldDict, a, err := ir.FieldDictFuzzyAutomaton(field, term, fuzziness, prefixTerm)
+		if err != nil {
+			return nil, err
+		}
+		defer func() {
+			if cerr := fieldDict.Close(); cerr != nil && err == nil {
+				err = cerr
+			}
+		}()
+		tfd, err := fieldDict.Next()
+		for err == nil && tfd != nil {
+			err = addCandidateTerm(tfd.Term, tfd.EditDistance)
+			if err != nil {
+				return nil, err
+			}
+			tfd, err = fieldDict.Next()
+		}
+		if err != nil {
+			return nil, err
+		}
+		if ctx != nil {
+			if fts, ok := ctx.Value(search.FieldTermSynonymMapKey).(search.FieldTermSynonymMap); ok {
+				if ts, exists := fts[field]; exists {
+					for term := range ts {
+						if _, exists := termSet[term]; exists {
+							continue
+						}
+						if !strings.HasPrefix(term, prefixTerm) {
+							continue
+						}
+						match, editDistance := a.MatchAndDistance(term)
+						if match {
+							err = addCandidateTerm(term, editDistance)
+							if err != nil {
+								return nil, err
+							}
+						}
+					}
+				}
+			}
+		}
+		rv.bytesRead = fieldDict.BytesRead()
+		return rv, nil
+	}
+
+	var fieldDict index.FieldDict
+	if len(prefixTerm) > 0 {
+		fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
+	} else {
+		fieldDict, err = indexReader.FieldDict(field)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer func() {
+		if cerr := fieldDict.Close(); cerr != nil && err == nil {
+			err = cerr
+		}
+	}()
+
+	// enumerate terms and check levenshtein distance
+	var reuse []int
+	tfd, err := fieldDict.Next()
+	for err == nil && tfd != nil {
+		var ld int
+		var exceeded bool
+		ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
+		if !exceeded && ld <= fuzziness {
+			rv.candidates = append(rv.candidates, tfd.Term)
+			rv.editDistances = append(rv.editDistances, uint8(ld))
+			if tooManyClauses(len(rv.candidates)) {
+				return nil, tooManyClausesErr(field, len(rv.candidates))
+			}
+		}
+		tfd, err = fieldDict.Next()
+	}
+
+	rv.bytesRead = fieldDict.BytesRead()
+	return rv, err
+}