1
0
Fork 0
golang-github-blevesearch-b.../search/scorer/scorer_term.go
Daniel Baumann 982828099e
Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-19 00:20:02 +02:00

276 lines
8.3 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorer
import (
"fmt"
"math"
"reflect"
"github.com/blevesearch/bleve/v2/search"
"github.com/blevesearch/bleve/v2/size"
index "github.com/blevesearch/bleve_index_api"
)
var reflectStaticSizeTermQueryScorer int
func init() {
var tqs TermQueryScorer
reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size())
}
type TermQueryScorer struct {
queryTerm string
queryField string
queryBoost float64
docTerm uint64 // number of documents containing the term
docTotal uint64 // total number of documents in the index
avgDocLength float64
idf float64
options search.SearcherOptions
idfExplanation *search.Explanation
includeScore bool
queryNorm float64
queryWeight float64
queryWeightExplanation *search.Explanation
}
func (s *TermQueryScorer) Size() int {
sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr +
len(s.queryTerm) + len(s.queryField)
if s.idfExplanation != nil {
sizeInBytes += s.idfExplanation.Size()
}
if s.queryWeightExplanation != nil {
sizeInBytes += s.queryWeightExplanation.Size()
}
return sizeInBytes
}
func (s *TermQueryScorer) computeIDF(avgDocLength float64, docTotal, docTerm uint64) float64 {
var rv float64
if avgDocLength > 0 {
// avgDocLength is set only for bm25 scoring
rv = math.Log(1 + (float64(docTotal)-float64(docTerm)+0.5)/
(float64(docTerm)+0.5))
} else {
rv = 1.0 + math.Log(float64(docTotal)/
float64(docTerm+1.0))
}
return rv
}
// queryTerm - the specific term being scored by this scorer object
// queryField - the field in which the term is being searched
// queryBoost - the boost value for the query term
// docTotal - total number of documents in the index
// docTerm - number of documents containing the term
// avgDocLength - average document length in the index
// options - search options such as explain scoring, include the location of the term etc.
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal,
docTerm uint64, avgDocLength float64, options search.SearcherOptions) *TermQueryScorer {
rv := TermQueryScorer{
queryTerm: string(queryTerm),
queryField: queryField,
queryBoost: queryBoost,
docTerm: docTerm,
docTotal: docTotal,
avgDocLength: avgDocLength,
options: options,
queryWeight: 1.0,
includeScore: options.Score != "none",
}
rv.idf = rv.computeIDF(avgDocLength, docTotal, docTerm)
if options.Explain {
rv.idfExplanation = &search.Explanation{
Value: rv.idf,
Message: fmt.Sprintf("idf(docFreq=%d, maxDocs=%d)", docTerm, docTotal),
}
}
return &rv
}
func (s *TermQueryScorer) Weight() float64 {
sum := s.queryBoost * s.idf
return sum * sum
}
func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
s.queryNorm = qnorm
// update the query weight
s.queryWeight = s.queryBoost * s.idf * s.queryNorm
if s.options.Explain {
childrenExplanations := make([]*search.Explanation, 3)
childrenExplanations[0] = &search.Explanation{
Value: s.queryBoost,
Message: "boost",
}
childrenExplanations[1] = s.idfExplanation
childrenExplanations[2] = &search.Explanation{
Value: s.queryNorm,
Message: "queryNorm",
}
s.queryWeightExplanation = &search.Explanation{
Value: s.queryWeight,
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost),
Children: childrenExplanations,
}
}
}
func (s *TermQueryScorer) docScore(tf, norm float64) (score float64, model string) {
if s.avgDocLength > 0 {
// bm25 scoring
// using the posting's norm value to recompute the field length for the doc num
fieldLength := 1 / (norm * norm)
score = s.idf * (tf * search.BM25_k1) /
(tf + search.BM25_k1*(1-search.BM25_b+(search.BM25_b*fieldLength/s.avgDocLength)))
model = index.BM25Scoring
} else {
// tf-idf scoring by default
score = tf * norm * s.idf
model = index.DefaultScoringModel
}
return score, model
}
func (s *TermQueryScorer) scoreExplanation(tf float64, termMatch *index.TermFieldDoc) []*search.Explanation {
var rv []*search.Explanation
if s.avgDocLength > 0 {
fieldLength := 1 / (termMatch.Norm * termMatch.Norm)
fieldNormVal := 1 - search.BM25_b + (search.BM25_b * fieldLength / s.avgDocLength)
fieldNormalizeExplanation := &search.Explanation{
Value: fieldNormVal,
Message: fmt.Sprintf("fieldNorm(field=%s), b=%f, fieldLength=%f, avgFieldLength=%f)",
s.queryField, search.BM25_b, fieldLength, s.avgDocLength),
}
saturationExplanation := &search.Explanation{
Value: search.BM25_k1 / (tf + search.BM25_k1*fieldNormVal),
Message: fmt.Sprintf("saturation(term:%s), k1=%f/(tf=%f + k1*fieldNorm=%f))",
termMatch.Term, search.BM25_k1, tf, fieldNormVal),
Children: []*search.Explanation{fieldNormalizeExplanation},
}
rv = make([]*search.Explanation, 3)
rv[0] = &search.Explanation{
Value: tf,
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
}
rv[1] = saturationExplanation
rv[2] = s.idfExplanation
} else {
rv = make([]*search.Explanation, 3)
rv[0] = &search.Explanation{
Value: tf,
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
}
rv[1] = &search.Explanation{
Value: termMatch.Norm,
Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID),
}
rv[2] = s.idfExplanation
}
return rv
}
func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.TermFieldDoc) *search.DocumentMatch {
rv := ctx.DocumentMatchPool.Get()
// perform any score computations only when needed
if s.includeScore || s.options.Explain {
var scoreExplanation *search.Explanation
var tf float64
if termMatch.Freq < MaxSqrtCache {
tf = SqrtCache[int(termMatch.Freq)]
} else {
tf = math.Sqrt(float64(termMatch.Freq))
}
score, scoringModel := s.docScore(tf, termMatch.Norm)
if s.options.Explain {
childrenExplanations := s.scoreExplanation(tf, termMatch)
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), as per %s model, "+
"product of:", s.queryField, s.queryTerm, termMatch.ID, scoringModel),
Children: childrenExplanations,
}
}
// if the query weight isn't 1, multiply
if s.queryWeight != 1.0 {
score = score * s.queryWeight
if s.options.Explain {
childExplanations := make([]*search.Explanation, 2)
childExplanations[0] = s.queryWeightExplanation
childExplanations[1] = scoreExplanation
scoreExplanation = &search.Explanation{
Value: score,
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
Children: childExplanations,
}
}
}
if s.includeScore {
rv.Score = score
}
if s.options.Explain {
rv.Expl = scoreExplanation
}
}
rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...)
if len(termMatch.Vectors) > 0 {
if cap(rv.FieldTermLocations) < len(termMatch.Vectors) {
rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors))
}
for _, v := range termMatch.Vectors {
var ap search.ArrayPositions
if len(v.ArrayPositions) > 0 {
n := len(rv.FieldTermLocations)
if n < cap(rv.FieldTermLocations) { // reuse ap slice if available
ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0]
}
ap = append(ap, v.ArrayPositions...)
}
rv.FieldTermLocations =
append(rv.FieldTermLocations, search.FieldTermLocation{
Field: v.Field,
Term: s.queryTerm,
Location: search.Location{
Pos: v.Pos,
Start: v.Start,
End: v.End,
ArrayPositions: ap,
},
})
}
}
return rv
}