golang-github-blevesearch-b.../search/collector/topn.go

//  Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
	"context"
	"reflect"
	"strconv"
	"time"

	"github.com/blevesearch/bleve/v2/numeric"
	"github.com/blevesearch/bleve/v2/search"
	"github.com/blevesearch/bleve/v2/size"
	index "github.com/blevesearch/bleve_index_api"
)

var reflectStaticSizeTopNCollector int

func init() {
	var coll TopNCollector
	reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size())
}

type collectorStore interface {
	// Add the document, and if the new store size exceeds the provided size
	// the last element is removed and returned.  If the size has not been
	// exceeded, nil is returned.
	AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch

	Final(skip int, fixup collectorFixup) (search.DocumentMatchCollection, error)

	// Provide access the internal heap implementation
	Internal() search.DocumentMatchCollection
}

// PreAllocSizeSkipCap will cap preallocation to this amount when
// size+skip exceeds this value
var PreAllocSizeSkipCap = 1000

type collectorCompare func(i, j *search.DocumentMatch) int

type collectorFixup func(d *search.DocumentMatch) error

// TopNCollector collects the top N hits, optionally skipping some results
type TopNCollector struct {
	size          int
	skip          int
	total         uint64
	bytesRead     uint64
	maxScore      float64
	took          time.Duration
	sort          search.SortOrder
	results       search.DocumentMatchCollection
	facetsBuilder *search.FacetsBuilder

	store collectorStore

	needDocIds    bool
	neededFields  []string
	cachedScoring []bool
	cachedDesc    []bool

	lowestMatchOutsideResults *search.DocumentMatch
	updateFieldVisitor        index.DocValueVisitor
	dvReader                  index.DocValueReader
	searchAfter               *search.DocumentMatch

	knnHits             map[string]*search.DocumentMatch
	computeNewScoreExpl search.ScoreExplCorrectionCallbackFunc
}

// CheckDoneEvery controls how frequently we check the context deadline
const CheckDoneEvery = uint64(1024)

// NewTopNCollector builds a collector to find the top 'size' hits
// skipping over the first 'skip' hits
// ordering hits by the provided sort order
func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
	return newTopNCollector(size, skip, sort)
}

// NewTopNCollectorAfter builds a collector to find the top 'size' hits
// skipping over the first 'skip' hits
// ordering hits by the provided sort order
func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector {
	rv := newTopNCollector(size, 0, sort)
	rv.searchAfter = createSearchAfterDocument(sort, after)
	return rv
}

func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector {
	hc := &TopNCollector{size: size, skip: skip, sort: sort}

	hc.store = getOptimalCollectorStore(size, skip, func(i, j *search.DocumentMatch) int {
		return hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, i, j)
	})

	// these lookups traverse an interface, so do once up-front
	if sort.RequiresDocID() {
		hc.needDocIds = true
	}
	hc.neededFields = sort.RequiredFields()
	hc.cachedScoring = sort.CacheIsScore()
	hc.cachedDesc = sort.CacheDescending()

	return hc
}

func createSearchAfterDocument(sort search.SortOrder, after []string) *search.DocumentMatch {
	rv := &search.DocumentMatch{
		Sort: after,
	}
	for pos, ss := range sort {
		if ss.RequiresDocID() {
			rv.ID = after[pos]
		}
		if ss.RequiresScoring() {
			if score, err := strconv.ParseFloat(after[pos], 64); err == nil {
				rv.Score = score
			}
		}
	}
	return rv
}

// Filter document matches based on the SearchAfter field in the SearchRequest.
func FilterHitsBySearchAfter(hits []*search.DocumentMatch, sort search.SortOrder, after []string) []*search.DocumentMatch {
	if len(hits) == 0 {
		return hits
	}
	// create a search after document
	searchAfter := createSearchAfterDocument(sort, after)
	// filter the hits
	idx := 0
	cachedScoring := sort.CacheIsScore()
	cachedDesc := sort.CacheDescending()
	for _, hit := range hits {
		if sort.Compare(cachedScoring, cachedDesc, hit, searchAfter) > 0 {
			hits[idx] = hit
			idx++
		}
	}
	return hits[:idx]
}

func getOptimalCollectorStore(size, skip int, comparator collectorCompare) collectorStore {
	// pre-allocate space on the store to avoid reslicing
	// unless the size + skip is too large, then cap it
	// everything should still work, just reslices as necessary
	backingSize := size + skip + 1
	if size+skip > PreAllocSizeSkipCap {
		backingSize = PreAllocSizeSkipCap + 1
	}

	if size+skip > 10 {
		return newStoreHeap(backingSize, comparator)
	} else {
		return newStoreSlice(backingSize, comparator)
	}
}

func (hc *TopNCollector) Size() int {
	sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr

	if hc.facetsBuilder != nil {
		sizeInBytes += hc.facetsBuilder.Size()
	}

	for _, entry := range hc.neededFields {
		sizeInBytes += len(entry) + size.SizeOfString
	}

	sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc)

	return sizeInBytes
}

// Collect goes to the index to find the matching documents
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
	startTime := time.Now()
	var err error
	var next *search.DocumentMatch

	// pre-allocate enough space in the DocumentMatchPool
	// unless the size + skip is too large, then cap it
	// everything should still work, just allocates DocumentMatches on demand
	backingSize := hc.size + hc.skip + 1
	if hc.size+hc.skip > PreAllocSizeSkipCap {
		backingSize = PreAllocSizeSkipCap + 1
	}
	searchContext := &search.SearchContext{
		DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
		Collector:         hc,
		IndexReader:       reader,
	}

	hc.dvReader, err = reader.DocValueReader(hc.neededFields)
	if err != nil {
		return err
	}

	hc.updateFieldVisitor = func(field string, term []byte) {
		if hc.facetsBuilder != nil {
			hc.facetsBuilder.UpdateVisitor(field, term)
		}
		hc.sort.UpdateVisitor(field, term)
	}

	dmHandlerMaker := MakeTopNDocumentMatchHandler
	if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil {
		dmHandlerMaker = cv.(search.MakeDocumentMatchHandler)
	}
	// use the application given builder for making the custom document match
	// handler and perform callbacks/invocations on the newly made handler.
	dmHandler, loadID, err := dmHandlerMaker(searchContext)
	if err != nil {
		return err
	}

	hc.needDocIds = hc.needDocIds || loadID
	select {
	case <-ctx.Done():
		search.RecordSearchCost(ctx, search.AbortM, 0)
		return ctx.Err()
	default:
		next, err = searcher.Next(searchContext)
	}
	for err == nil && next != nil {
		if hc.total%CheckDoneEvery == 0 {
			select {
			case <-ctx.Done():
				search.RecordSearchCost(ctx, search.AbortM, 0)
				return ctx.Err()
			default:
			}
		}

		err = hc.adjustDocumentMatch(searchContext, reader, next)
		if err != nil {
			break
		}

		err = hc.prepareDocumentMatch(searchContext, reader, next, false)
		if err != nil {
			break
		}

		err = dmHandler(next)
		if err != nil {
			break
		}

		next, err = searcher.Next(searchContext)
	}
	if err != nil {
		return err
	}
	if hc.knnHits != nil {
		// we may have some knn hits left that did not match any of the top N tf-idf hits
		// we need to add them to the collector store to consider them as well.
		for _, knnDoc := range hc.knnHits {
			err = hc.prepareDocumentMatch(searchContext, reader, knnDoc, true)
			if err != nil {
				return err
			}
			err = dmHandler(knnDoc)
			if err != nil {
				return err
			}
		}
	}

	statsCallbackFn := ctx.Value(search.SearchIOStatsCallbackKey)
	if statsCallbackFn != nil {
		// hc.bytesRead corresponds to the
		// total bytes read as part of docValues being read every hit
		// which must be accounted by invoking the callback.
		statsCallbackFn.(search.SearchIOStatsCallbackFunc)(hc.bytesRead)

		search.RecordSearchCost(ctx, search.AddM, hc.bytesRead)
	}

	// help finalize/flush the results in case
	// of custom document match handlers.
	err = dmHandler(nil)
	if err != nil {
		return err
	}

	// compute search duration
	hc.took = time.Since(startTime)

	// finalize actual results
	err = hc.finalizeResults(reader)
	if err != nil {
		return err
	}
	return nil
}

var sortByScoreOpt = []string{"_score"}

func (hc *TopNCollector) adjustDocumentMatch(ctx *search.SearchContext,
	reader index.IndexReader, d *search.DocumentMatch) (err error) {
	if hc.knnHits != nil {
		d.ID, err = reader.ExternalID(d.IndexInternalID)
		if err != nil {
			return err
		}
		if knnHit, ok := hc.knnHits[d.ID]; ok {
			d.Score, d.Expl = hc.computeNewScoreExpl(d, knnHit)
			delete(hc.knnHits, d.ID)
		}
	}
	return nil
}

func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
	reader index.IndexReader, d *search.DocumentMatch, isKnnDoc bool) (err error) {

	// visit field terms for features that require it (sort, facets)
	if !isKnnDoc && len(hc.neededFields) > 0 {
		err = hc.visitFieldTerms(reader, d, hc.updateFieldVisitor)
		if err != nil {
			return err
		}
	} else if isKnnDoc && hc.facetsBuilder != nil {
		// we need to visit the field terms for the knn document
		// only for those fields that are required for faceting
		// and not for sorting. This is because the knn document's
		// sort value is already computed in the knn collector.
		err = hc.visitFieldTerms(reader, d, func(field string, term []byte) {
			if hc.facetsBuilder != nil {
				hc.facetsBuilder.UpdateVisitor(field, term)
			}
		})
		if err != nil {
			return err
		}
	}

	// increment total hits
	hc.total++
	d.HitNumber = hc.total

	// update max score
	if d.Score > hc.maxScore {
		hc.maxScore = d.Score
	}
	// early exit as the document match had its sort value calculated in the knn
	// collector itself
	if isKnnDoc {
		return nil
	}

	// see if we need to load ID (at this early stage, for example to sort on it)
	if hc.needDocIds && d.ID == "" {
		d.ID, err = reader.ExternalID(d.IndexInternalID)
		if err != nil {
			return err
		}
	}

	// compute this hits sort value
	if len(hc.sort) == 1 && hc.cachedScoring[0] {
		d.Sort = sortByScoreOpt
	} else {
		hc.sort.Value(d)
	}

	return nil
}

func MakeTopNDocumentMatchHandler(
	ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) {
	var hc *TopNCollector
	var ok bool
	if hc, ok = ctx.Collector.(*TopNCollector); ok {
		return func(d *search.DocumentMatch) error {
			if d == nil {
				return nil
			}

			// support search after based pagination,
			// if this hit is <= the search after sort key
			// we should skip it
			if hc.searchAfter != nil {
				// exact sort order matches use hit number to break tie
				// but we want to allow for exact match, so we pretend
				hc.searchAfter.HitNumber = d.HitNumber
				if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.searchAfter) <= 0 {
					ctx.DocumentMatchPool.Put(d)
					return nil
				}
			}

			// optimization, we track lowest sorting hit already removed from heap
			// with this one comparison, we can avoid all heap operations if
			// this hit would have been added and then immediately removed
			if hc.lowestMatchOutsideResults != nil {
				cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d,
					hc.lowestMatchOutsideResults)
				if cmp >= 0 {
					// this hit can't possibly be in the result set, so avoid heap ops
					ctx.DocumentMatchPool.Put(d)
					return nil
				}
			}

			removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip)
			if removed != nil {
				if hc.lowestMatchOutsideResults == nil {
					hc.lowestMatchOutsideResults = removed
				} else {
					cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc,
						removed, hc.lowestMatchOutsideResults)
					if cmp < 0 {
						tmp := hc.lowestMatchOutsideResults
						hc.lowestMatchOutsideResults = removed
						ctx.DocumentMatchPool.Put(tmp)
					}
				}
			}
			return nil
		}, false, nil
	}
	return nil, false, nil
}

// visitFieldTerms is responsible for visiting the field terms of the
// search hit, and passing visited terms to the sort and facet builder
func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch, v index.DocValueVisitor) error {
	if hc.facetsBuilder != nil {
		hc.facetsBuilder.StartDoc()
	}
	if d.ID != "" && d.IndexInternalID == nil {
		// this document may have been sent over as preSearchData and
		// we need to look up the internal id to visit the doc values for it
		var err error
		d.IndexInternalID, err = reader.InternalID(d.ID)
		if err != nil {
			return err
		}
	}

	err := hc.dvReader.VisitDocValues(d.IndexInternalID, v)
	if hc.facetsBuilder != nil {
		hc.facetsBuilder.EndDoc()
	}

	hc.bytesRead += hc.dvReader.BytesRead()

	return err
}

// SetFacetsBuilder registers a facet builder for this collector
func (hc *TopNCollector) SetFacetsBuilder(facetsBuilder *search.FacetsBuilder) {
	hc.facetsBuilder = facetsBuilder
	fieldsRequiredForFaceting := facetsBuilder.RequiredFields()
	// for each of these fields, append only if not already there in hc.neededFields.
	for _, field := range fieldsRequiredForFaceting {
		found := false
		for _, neededField := range hc.neededFields {
			if field == neededField {
				found = true
				break
			}
		}
		if !found {
			hc.neededFields = append(hc.neededFields, field)
		}
	}
}

// finalizeResults starts with the heap containing the final top size+skip
// it now throws away the results to be skipped
// and does final doc id lookup (if necessary)
func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
	var err error
	hc.results, err = hc.store.Final(hc.skip, func(doc *search.DocumentMatch) error {
		if doc.ID == "" {
			// look up the id since we need it for lookup
			var err error
			doc.ID, err = r.ExternalID(doc.IndexInternalID)
			if err != nil {
				return err
			}
		}
		doc.Complete(nil)
		return nil
	})
	if err != nil {
		return err
	}

	// Decode geo sort keys back to its distance values
	for i, so := range hc.sort {
		if _, ok := so.(*search.SortGeoDistance); ok {
			for _, dm := range hc.results {
				// The string is a int64 bit representation of a float64 distance
				distInt, err := numeric.PrefixCoded(dm.Sort[i]).Int64()
				if err != nil {
					return err
				}
				dm.Sort[i] = strconv.FormatFloat(numeric.Int64ToFloat64(distInt), 'f', -1, 64)
			}
		}
	}
	return err
}

// Results returns the collected hits
func (hc *TopNCollector) Results() search.DocumentMatchCollection {
	return hc.results
}

// Total returns the total number of hits
func (hc *TopNCollector) Total() uint64 {
	return hc.total
}

// MaxScore returns the maximum score seen across all the hits
func (hc *TopNCollector) MaxScore() float64 {
	return hc.maxScore
}

// Took returns the time spent collecting hits
func (hc *TopNCollector) Took() time.Duration {
	return hc.took
}

// FacetResults returns the computed facets results
func (hc *TopNCollector) FacetResults() search.FacetResults {
	if hc.facetsBuilder != nil {
		return hc.facetsBuilder.Results()
	}
	return nil
}

func (hc *TopNCollector) SetKNNHits(knnHits search.DocumentMatchCollection, newScoreExplComputer search.ScoreExplCorrectionCallbackFunc) {
	hc.knnHits = make(map[string]*search.DocumentMatch, len(knnHits))
	for _, hit := range knnHits {
		hc.knnHits[hit.ID] = hit
	}
	hc.computeNewScoreExpl = newScoreExplComputer
}