golang-github-blevesearch-b.../search/scorer/scorer_term.go

//  Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scorer

import (
	"fmt"
	"math"
	"reflect"

	"github.com/blevesearch/bleve/v2/search"
	"github.com/blevesearch/bleve/v2/size"
	index "github.com/blevesearch/bleve_index_api"
)

var reflectStaticSizeTermQueryScorer int

func init() {
	var tqs TermQueryScorer
	reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size())
}

type TermQueryScorer struct {
	queryTerm              string
	queryField             string
	queryBoost             float64
	docTerm                uint64 // number of documents containing the term
	docTotal               uint64 // total number of documents in the index
	avgDocLength           float64
	idf                    float64
	options                search.SearcherOptions
	idfExplanation         *search.Explanation
	includeScore           bool
	queryNorm              float64
	queryWeight            float64
	queryWeightExplanation *search.Explanation
}

func (s *TermQueryScorer) Size() int {
	sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr +
		len(s.queryTerm) + len(s.queryField)

	if s.idfExplanation != nil {
		sizeInBytes += s.idfExplanation.Size()
	}

	if s.queryWeightExplanation != nil {
		sizeInBytes += s.queryWeightExplanation.Size()
	}

	return sizeInBytes
}

func (s *TermQueryScorer) computeIDF(avgDocLength float64, docTotal, docTerm uint64) float64 {
	var rv float64
	if avgDocLength > 0 {
		// avgDocLength is set only for bm25 scoring
		rv = math.Log(1 + (float64(docTotal)-float64(docTerm)+0.5)/
			(float64(docTerm)+0.5))
	} else {
		rv = 1.0 + math.Log(float64(docTotal)/
			float64(docTerm+1.0))
	}

	return rv
}

// queryTerm - the specific term being scored by this scorer object
// queryField - the field in which the term is being searched
// queryBoost - the boost value for the query term
// docTotal - total number of documents in the index
// docTerm - number of documents containing the term
// avgDocLength - average document length in the index
// options - search options such as explain scoring, include the location of the term etc.
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal,
	docTerm uint64, avgDocLength float64, options search.SearcherOptions) *TermQueryScorer {

	rv := TermQueryScorer{
		queryTerm:    string(queryTerm),
		queryField:   queryField,
		queryBoost:   queryBoost,
		docTerm:      docTerm,
		docTotal:     docTotal,
		avgDocLength: avgDocLength,
		options:      options,
		queryWeight:  1.0,
		includeScore: options.Score != "none",
	}

	rv.idf = rv.computeIDF(avgDocLength, docTotal, docTerm)
	if options.Explain {
		rv.idfExplanation = &search.Explanation{
			Value:   rv.idf,
			Message: fmt.Sprintf("idf(docFreq=%d, maxDocs=%d)", docTerm, docTotal),
		}
	}

	return &rv
}

func (s *TermQueryScorer) Weight() float64 {
	sum := s.queryBoost * s.idf
	return sum * sum
}

func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
	s.queryNorm = qnorm

	// update the query weight
	s.queryWeight = s.queryBoost * s.idf * s.queryNorm

	if s.options.Explain {
		childrenExplanations := make([]*search.Explanation, 3)
		childrenExplanations[0] = &search.Explanation{
			Value:   s.queryBoost,
			Message: "boost",
		}
		childrenExplanations[1] = s.idfExplanation
		childrenExplanations[2] = &search.Explanation{
			Value:   s.queryNorm,
			Message: "queryNorm",
		}
		s.queryWeightExplanation = &search.Explanation{
			Value:    s.queryWeight,
			Message:  fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost),
			Children: childrenExplanations,
		}
	}
}

func (s *TermQueryScorer) docScore(tf, norm float64) (score float64, model string) {
	if s.avgDocLength > 0 {
		// bm25 scoring
		// using the posting's norm value to recompute the field length for the doc num
		fieldLength := 1 / (norm * norm)

		score = s.idf * (tf * search.BM25_k1) /
			(tf + search.BM25_k1*(1-search.BM25_b+(search.BM25_b*fieldLength/s.avgDocLength)))
		model = index.BM25Scoring
	} else {
		// tf-idf scoring by default
		score = tf * norm * s.idf
		model = index.DefaultScoringModel
	}
	return score, model
}

func (s *TermQueryScorer) scoreExplanation(tf float64, termMatch *index.TermFieldDoc) []*search.Explanation {
	var rv []*search.Explanation
	if s.avgDocLength > 0 {
		fieldLength := 1 / (termMatch.Norm * termMatch.Norm)
		fieldNormVal := 1 - search.BM25_b + (search.BM25_b * fieldLength / s.avgDocLength)
		fieldNormalizeExplanation := &search.Explanation{
			Value: fieldNormVal,
			Message: fmt.Sprintf("fieldNorm(field=%s), b=%f, fieldLength=%f, avgFieldLength=%f)",
				s.queryField, search.BM25_b, fieldLength, s.avgDocLength),
		}

		saturationExplanation := &search.Explanation{
			Value: search.BM25_k1 / (tf + search.BM25_k1*fieldNormVal),
			Message: fmt.Sprintf("saturation(term:%s), k1=%f/(tf=%f + k1*fieldNorm=%f))",
				termMatch.Term, search.BM25_k1, tf, fieldNormVal),
			Children: []*search.Explanation{fieldNormalizeExplanation},
		}

		rv = make([]*search.Explanation, 3)
		rv[0] = &search.Explanation{
			Value:   tf,
			Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
		}
		rv[1] = saturationExplanation
		rv[2] = s.idfExplanation
	} else {
		rv = make([]*search.Explanation, 3)
		rv[0] = &search.Explanation{
			Value:   tf,
			Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
		}
		rv[1] = &search.Explanation{
			Value:   termMatch.Norm,
			Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID),
		}
		rv[2] = s.idfExplanation
	}
	return rv
}

func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.TermFieldDoc) *search.DocumentMatch {
	rv := ctx.DocumentMatchPool.Get()
	// perform any score computations only when needed
	if s.includeScore || s.options.Explain {
		var scoreExplanation *search.Explanation
		var tf float64
		if termMatch.Freq < MaxSqrtCache {
			tf = SqrtCache[int(termMatch.Freq)]
		} else {
			tf = math.Sqrt(float64(termMatch.Freq))
		}

		score, scoringModel := s.docScore(tf, termMatch.Norm)
		if s.options.Explain {
			childrenExplanations := s.scoreExplanation(tf, termMatch)
			scoreExplanation = &search.Explanation{
				Value: score,
				Message: fmt.Sprintf("fieldWeight(%s:%s in %s), as per %s model, "+
					"product of:", s.queryField, s.queryTerm, termMatch.ID, scoringModel),
				Children: childrenExplanations,
			}
		}

		// if the query weight isn't 1, multiply
		if s.queryWeight != 1.0 {
			score = score * s.queryWeight
			if s.options.Explain {
				childExplanations := make([]*search.Explanation, 2)
				childExplanations[0] = s.queryWeightExplanation
				childExplanations[1] = scoreExplanation
				scoreExplanation = &search.Explanation{
					Value:    score,
					Message:  fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
					Children: childExplanations,
				}
			}
		}

		if s.includeScore {
			rv.Score = score
		}

		if s.options.Explain {
			rv.Expl = scoreExplanation
		}
	}

	rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...)

	if len(termMatch.Vectors) > 0 {
		if cap(rv.FieldTermLocations) < len(termMatch.Vectors) {
			rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors))
		}

		for _, v := range termMatch.Vectors {
			var ap search.ArrayPositions
			if len(v.ArrayPositions) > 0 {
				n := len(rv.FieldTermLocations)
				if n < cap(rv.FieldTermLocations) { // reuse ap slice if available
					ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0]
				}
				ap = append(ap, v.ArrayPositions...)
			}
			rv.FieldTermLocations =
				append(rv.FieldTermLocations, search.FieldTermLocation{
					Field: v.Field,
					Term:  s.queryTerm,
					Location: search.Location{
						Pos:            v.Pos,
						Start:          v.Start,
						End:            v.End,
						ArrayPositions: ap,
					},
				})
		}
	}
	return rv
}