// index/scorch/persister.go (blevesearch/bleve, upstream version 2.5.1)
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package scorch
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"io"
"log"
"math"
"os"
"path/filepath"
"slices"
"sort"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/RoaringBitmap/roaring/v2"
"github.com/blevesearch/bleve/v2/util"
index "github.com/blevesearch/bleve_index_api"
segment "github.com/blevesearch/scorch_segment_api/v2"
bolt "go.etcd.io/bbolt"
)
// DefaultPersisterNapTimeMSec is kept at zero as this helps in direct
// persistence of segments with the default safe batch option.
// If the default safe batch option results in a high number of
// files on disk, then users may initialise this configuration parameter
// with higher values so that the persister will nap a bit within its
// work loop to favour better in-memory merging of segments, resulting
// in fewer segment files on disk. But that may come with an indexing
// performance overhead.
// Unsafe batch users are advised to override this to a higher value
// for better performance, especially with high data density.
var DefaultPersisterNapTimeMSec int = 0 // ms
// DefaultPersisterNapUnderNumFiles helps in controlling the pace of the
// persister. At times of slow merger progress with heavy file merging
// operations, it's better to pace down the persister to let the merger
// catch up within a range defined by this parameter.
// Fewer files on disk (as per the merge plan) would result in keeping the
// file handle usage under limit, faster disk merging and a healthier index.
// It's been observed that such a loosely sync'ed introducer-persister-merger
// trio results in better overall performance.
var DefaultPersisterNapUnderNumFiles int = 1000
var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64
type persisterOptions struct {
// PersisterNapTimeMSec controls the wait/delay injected into the
// persistence workloop to improve the chances for
// a healthier and heavier in-memory merging
PersisterNapTimeMSec int
// If PersisterNapTimeMSec > 0 and the number of files on disk is less
// than PersisterNapUnderNumFiles, then the persister will sleep
// PersisterNapTimeMSec amount of time to improve the chances for
// a healthier and heavier in-memory merging
PersisterNapUnderNumFiles int
// MemoryPressurePauseThreshold gives the persister leeway to prudently
// perform the in-memory merge of segments in a memory pressure
// situation. The config value is an upper threshold on the number of
// paused application threads. The default value is a very high number,
// to always favour the merging of in-memory segments.
MemoryPressurePauseThreshold uint64
// NumPersisterWorkers decides the number of parallel workers that will
// perform the in-memory merge of segments followed by a flush operation.
NumPersisterWorkers int
// MaxSizeInMemoryMergePerWorker is the maximum size of data that a single
// persister worker is allowed to work on
MaxSizeInMemoryMergePerWorker int
}
type notificationChan chan struct{}
func (s *Scorch) persisterLoop() {
defer func() {
if r := recover(); r != nil {
s.fireAsyncError(&AsyncPanicError{
Source: "persister",
Path: s.path,
})
}
s.asyncTasks.Done()
}()
var persistWatchers []*epochWatcher
var lastPersistedEpoch, lastMergedEpoch uint64
var ew *epochWatcher
var unpersistedCallbacks []index.BatchCallback
po, err := s.parsePersisterOptions()
if err != nil {
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
return
}
OUTER:
for {
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)
select {
case <-s.closeCh:
break OUTER
case ew = <-s.persisterNotifier:
persistWatchers = append(persistWatchers, ew)
default:
}
if ew != nil && ew.epoch > lastMergedEpoch {
lastMergedEpoch = ew.epoch
}
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
lastMergedEpoch, persistWatchers, po)
var ourSnapshot *IndexSnapshot
var ourPersisted []chan error
var ourPersistedCallbacks []index.BatchCallback
// check to see if there is a new snapshot to persist
s.rootLock.Lock()
if s.root != nil && s.root.epoch > lastPersistedEpoch {
ourSnapshot = s.root
ourSnapshot.AddRef()
ourPersisted = s.rootPersisted
s.rootPersisted = nil
ourPersistedCallbacks = s.persistedCallbacks
s.persistedCallbacks = nil
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
}
s.rootLock.Unlock()
if ourSnapshot != nil {
startTime := time.Now()
err := s.persistSnapshot(ourSnapshot, po)
for _, ch := range ourPersisted {
if err != nil {
ch <- err
}
close(ch)
}
if err != nil {
atomic.StoreUint64(&s.iStats.persistEpoch, 0)
if err == segment.ErrClosed {
// index has been closed
_ = ourSnapshot.DecRef()
break OUTER
}
// save this current snapshot's persistedCallbacks, to invoke during
// the retry attempt
unpersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
_ = ourSnapshot.DecRef()
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
continue OUTER
}
if unpersistedCallbacks != nil {
// in the event of this being a retry attempt for persisting a snapshot
// that had earlier failed, prepend the persistedCallbacks associated
// with earlier segment(s) to the latest persistedCallbacks
ourPersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...)
unpersistedCallbacks = nil
}
for i := range ourPersistedCallbacks {
ourPersistedCallbacks[i](err)
}
atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch)
lastPersistedEpoch = ourSnapshot.epoch
for _, ew := range persistWatchers {
close(ew.notifyCh)
}
persistWatchers = nil
_ = ourSnapshot.DecRef()
changed := false
s.rootLock.RLock()
if s.root != nil && s.root.epoch != lastPersistedEpoch {
changed = true
}
s.rootLock.RUnlock()
s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
if changed {
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
continue OUTER
}
}
// tell the introducer we're waiting for changes
w := &epochWatcher{
epoch: lastPersistedEpoch,
notifyCh: make(notificationChan, 1),
}
select {
case <-s.closeCh:
break OUTER
case s.introducerNotifier <- w:
}
if ok := s.fireEvent(EventKindPurgerCheck, 0); ok {
s.removeOldData() // might as well cleanup while waiting
}
atomic.AddUint64(&s.stats.TotPersistLoopWait, 1)
select {
case <-s.closeCh:
break OUTER
case <-w.notifyCh:
// woken up, next loop should pick up work
atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1)
case ew = <-s.persisterNotifier:
// if the watchers are already caught up then let them wait,
// else let them continue catching up
persistWatchers = append(persistWatchers, ew)
}
atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1)
}
}
func notifyMergeWatchers(lastPersistedEpoch uint64,
persistWatchers []*epochWatcher,
) []*epochWatcher {
var watchersNext []*epochWatcher
for _, w := range persistWatchers {
if w.epoch < lastPersistedEpoch {
close(w.notifyCh)
} else {
watchersNext = append(watchersNext, w)
}
}
return watchersNext
}
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64,
lastMergedEpoch uint64, persistWatchers []*epochWatcher,
po *persisterOptions,
) (uint64, []*epochWatcher) {
// First, let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
// Check the merger lag by counting the segment files on disk.
numFilesOnDisk, _, _ := s.diskFileStats(nil)
// On finding fewer files on disk, the persister takes a short pause
// so that sufficient in-memory segments pile up for the next
// merge-and-persist iteration.
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
po.PersisterNapTimeMSec > 0 && s.NumEventsBlocking() == 0 {
select {
case <-s.closeCh:
case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)):
atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1)
case ew := <-s.persisterNotifier:
// unblock the merger in the meantime
persistWatchers = append(persistWatchers, ew)
lastMergedEpoch = ew.epoch
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1)
}
return lastMergedEpoch, persistWatchers
}
// Finding too many files on disk could be due to two reasons.
// 1. Too many older snapshots awaiting cleanup.
// 2. The merger could be lagging behind on merging the disk files.
if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) {
if ok := s.fireEvent(EventKindPurgerCheck, 0); ok {
s.removeOldData()
}
numFilesOnDisk, _, _ = s.diskFileStats(nil)
}
// The persister pauses until the merger catches up, to bring the segment
// file count under the threshold.
// But if there is memory pressure, then skip this sleep maneuver.
OUTER:
for po.PersisterNapUnderNumFiles > 0 &&
numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
lastMergedEpoch < lastPersistedEpoch {
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)
select {
case <-s.closeCh:
break OUTER
case ew := <-s.persisterNotifier:
persistWatchers = append(persistWatchers, ew)
lastMergedEpoch = ew.epoch
}
atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1)
// let the watchers proceed if they lag behind
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
numFilesOnDisk, _, _ = s.diskFileStats(nil)
}
return lastMergedEpoch, persistWatchers
}
func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) {
po := persisterOptions{
PersisterNapTimeMSec: DefaultPersisterNapTimeMSec,
PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles,
MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold,
NumPersisterWorkers: DefaultNumPersisterWorkers,
MaxSizeInMemoryMergePerWorker: DefaultMaxSizeInMemoryMergePerWorker,
}
if v, ok := s.config["scorchPersisterOptions"]; ok {
b, err := util.MarshalJSON(v)
if err != nil {
return &po, err
}
err = util.UnmarshalJSON(b, &po)
if err != nil {
return &po, err
}
}
return &po, nil
}
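// An illustrative sketch of supplying these options (the config shape below
// is an assumption; only the "scorchPersisterOptions" key and the field
// names are taken from the code above):
//
//	config := map[string]interface{}{
//		"scorchPersisterOptions": map[string]interface{}{
//			"persisterNapTimeMSec":      2000,
//			"persisterNapUnderNumFiles": 1000,
//			"numPersisterWorkers":       1,
//		},
//	}
//
// The map is round-tripped through JSON into persisterOptions, so keys are
// matched against the struct field names with encoding/json semantics
// (case-insensitively).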
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot,
po *persisterOptions,
) error {
// Perform in-memory segment merging only when the memory pressure is
// below the configured threshold, else the persister performs the
// direct persistence of segments.
if s.NumEventsBlocking() < po.MemoryPressurePauseThreshold {
persisted, err := s.persistSnapshotMaybeMerge(snapshot, po)
if err != nil {
return err
}
if persisted {
return nil
}
}
return s.persistSnapshotDirect(snapshot, nil)
}
// DefaultMinSegmentsForInMemoryMerge represents the default number of
// in-memory zap segments that persistSnapshotMaybeMerge() needs to
// see in an IndexSnapshot before it decides to merge and persist
// those segments
var DefaultMinSegmentsForInMemoryMerge = 2
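// flushable is a unit of work for a single persister worker: the in-memory
// segments to be merged, their deletion bitmaps, their indexes within the
// source snapshot's segment list, and their total document count.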
type flushable struct {
segments []segment.Segment
drops []*roaring.Bitmap
sbIdxs []int
totDocs uint64
}
// DefaultNumPersisterWorkers is the number of workers which, in parallel,
// perform an in-memory merge of the segments followed by a flush operation.
var DefaultNumPersisterWorkers = 1
// DefaultMaxSizeInMemoryMergePerWorker is the maximum size of data on which
// a single worker performs the in-memory merge operation.
var DefaultMaxSizeInMemoryMergePerWorker = 0
func legacyFlushBehaviour(maxSizeInMemoryMergePerWorker, numPersisterWorkers int) bool {
// DefaultMaxSizeInMemoryMergePerWorker = 0 is a special value to preserve the legacy
// one-shot in-memory merge + flush behaviour.
return maxSizeInMemoryMergePerWorker == 0 && numPersisterWorkers == 1
}
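// For example, with the defaults (DefaultMaxSizeInMemoryMergePerWorker = 0,
// DefaultNumPersisterWorkers = 1), legacyFlushBehaviour returns true and all
// eligible in-memory segments are merged and flushed in one shot.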
// persistSnapshotMaybeMerge examines the snapshot and might merge and
// persist the in-memory zap segments if there are enough of them
func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot, po *persisterOptions) (
bool, error) {
// collect the in-memory zap segments (SegmentBase instances)
var sbs []segment.Segment
var sbsDrops []*roaring.Bitmap
var sbsIndexes []int
var oldSegIdxs []int
flushSet := make([]*flushable, 0)
var totSize int
var numSegsToFlushOut int
var totDocs uint64
// legacy behaviour of merging + flushing all in-memory segments in one shot
if legacyFlushBehaviour(po.MaxSizeInMemoryMergePerWorker, po.NumPersisterWorkers) {
val := &flushable{
segments: make([]segment.Segment, 0),
drops: make([]*roaring.Bitmap, 0),
sbIdxs: make([]int, 0),
totDocs: totDocs,
}
for i, snapshot := range snapshot.segment {
if _, ok := snapshot.segment.(segment.PersistedSegment); !ok {
val.segments = append(val.segments, snapshot.segment)
val.drops = append(val.drops, snapshot.deleted)
val.sbIdxs = append(val.sbIdxs, i)
oldSegIdxs = append(oldSegIdxs, i)
val.totDocs += snapshot.segment.Count()
numSegsToFlushOut++
}
}
flushSet = append(flushSet, val)
} else {
// constructs a flushSet where each flushable object contains a set of segments
// to be merged and flushed out to disk.
for i, snapshot := range snapshot.segment {
if totSize >= po.MaxSizeInMemoryMergePerWorker &&
len(sbs) >= DefaultMinSegmentsForInMemoryMerge {
numSegsToFlushOut += len(sbs)
val := &flushable{
segments: slices.Clone(sbs),
drops: slices.Clone(sbsDrops),
sbIdxs: slices.Clone(sbsIndexes),
totDocs: totDocs,
}
flushSet = append(flushSet, val)
oldSegIdxs = append(oldSegIdxs, sbsIndexes...)
sbs, sbsDrops, sbsIndexes = sbs[:0], sbsDrops[:0], sbsIndexes[:0]
totSize, totDocs = 0, 0
}
if len(flushSet) >= int(po.NumPersisterWorkers) {
break
}
if _, ok := snapshot.segment.(segment.PersistedSegment); !ok {
sbs = append(sbs, snapshot.segment)
sbsDrops = append(sbsDrops, snapshot.deleted)
sbsIndexes = append(sbsIndexes, i)
totDocs += snapshot.segment.Count()
totSize += snapshot.segment.Size()
}
}
// if there were too few segments, just merge them all within a single worker
if len(flushSet) < po.NumPersisterWorkers {
numSegsToFlushOut += len(sbs)
val := &flushable{
segments: slices.Clone(sbs),
drops: slices.Clone(sbsDrops),
sbIdxs: slices.Clone(sbsIndexes),
totDocs: totDocs,
}
flushSet = append(flushSet, val)
oldSegIdxs = append(oldSegIdxs, sbsIndexes...)
}
}
if numSegsToFlushOut < DefaultMinSegmentsForInMemoryMerge {
return false, nil
}
// the newSnapshot at this point contains the newly created file segments,
// and has already been introduced into the root.
newSnapshot, newSegmentIDs, err := s.mergeAndPersistInMemorySegments(snapshot, flushSet)
if err != nil {
return false, err
}
if newSnapshot == nil {
return false, nil
}
defer func() {
_ = newSnapshot.DecRef()
}()
mergedSegmentIDs := map[uint64]struct{}{}
for _, idx := range oldSegIdxs {
mergedSegmentIDs[snapshot.segment[idx].id] = struct{}{}
}
newMergedSegmentIDs := make(map[uint64]struct{}, len(newSegmentIDs))
for _, id := range newSegmentIDs {
newMergedSegmentIDs[id] = struct{}{}
}
// construct a snapshot that's logically equivalent to the input
// snapshot, but with merged segments replaced by the new segment
equiv := &IndexSnapshot{
parent: snapshot.parent,
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)),
internal: snapshot.internal,
epoch: snapshot.epoch,
creator: "persistSnapshotMaybeMerge",
}
// track the segments that haven't participated in the in-memory merge;
// they won't be flushed out to disk yet, but in the next cycle they will
// be merged in-memory and then flushed out - this keeps the number of
// on-disk files within limit.
exclude := make(map[uint64]struct{})
// copy to the equiv the segments that weren't replaced
for _, segment := range snapshot.segment {
if _, wasMerged := mergedSegmentIDs[segment.id]; !wasMerged {
equiv.segment = append(equiv.segment, segment)
exclude[segment.id] = struct{}{}
}
}
// append to the equiv the newly merged segments
for _, segment := range newSnapshot.segment {
if _, ok := newMergedSegmentIDs[segment.id]; ok {
equiv.segment = append(equiv.segment, &SegmentSnapshot{
id: segment.id,
segment: segment.segment,
deleted: nil, // nil since merging handled deletions
stats: nil,
})
}
}
err = s.persistSnapshotDirect(equiv, exclude)
if err != nil {
return false, err
}
return true, nil
}
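// An illustrative walk-through of the flushSet construction above (the
// numbers are hypothetical): with NumPersisterWorkers = 2 and
// MaxSizeInMemoryMergePerWorker = 64 MiB, in-memory segments accumulate into
// a group until it holds at least 64 MiB and at least
// DefaultMinSegmentsForInMemoryMerge segments, at which point the group is
// cut for one worker; whatever remains once fewer than NumPersisterWorkers
// groups have been cut is merged and flushed by a final worker.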
func copyToDirectory(srcPath string, d index.Directory) (int64, error) {
if d == nil {
return 0, nil
}
dest, err := d.GetWriter(filepath.Join("store", filepath.Base(srcPath)))
if err != nil {
return 0, fmt.Errorf("GetWriter err: %v", err)
}
sourceFileStat, err := os.Stat(srcPath)
if err != nil {
return 0, err
}
if !sourceFileStat.Mode().IsRegular() {
return 0, fmt.Errorf("%s is not a regular file", srcPath)
}
source, err := os.Open(srcPath)
if err != nil {
return 0, err
}
defer source.Close()
defer dest.Close()
return io.Copy(dest, source)
}
func persistToDirectory(seg segment.UnpersistedSegment, d index.Directory,
path string,
) error {
if d == nil {
return seg.Persist(path)
}
sg, ok := seg.(io.WriterTo)
if !ok {
return fmt.Errorf("no io.WriterTo segment implementation found")
}
w, err := d.GetWriter(filepath.Join("store", filepath.Base(path)))
if err != nil {
return err
}
_, err = sg.WriteTo(w)
w.Close()
return err
}
func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string,
segPlugin SegmentPlugin, exclude map[uint64]struct{}, d index.Directory) (
[]string, map[uint64]string, error) {
snapshotsBucket, err := tx.CreateBucketIfNotExists(boltSnapshotsBucket)
if err != nil {
return nil, nil, err
}
newSnapshotKey := encodeUvarintAscending(nil, snapshot.epoch)
snapshotBucket, err := snapshotsBucket.CreateBucketIfNotExists(newSnapshotKey)
if err != nil {
return nil, nil, err
}
// persist meta values
metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey)
if err != nil {
return nil, nil, err
}
err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(segPlugin.Type()))
if err != nil {
return nil, nil, err
}
buf := make([]byte, binary.MaxVarintLen32)
binary.BigEndian.PutUint32(buf, segPlugin.Version())
err = metaBucket.Put(boltMetaDataSegmentVersionKey, buf)
if err != nil {
return nil, nil, err
}
// Storing the timestamp at which the current indexSnapshot
// was persisted, useful when you want to spread the
// numSnapshotsToKeep over time rather than over consecutive
// epochs.
currTimeStamp := time.Now()
timeStampBinary, err := currTimeStamp.MarshalText()
if err != nil {
return nil, nil, err
}
err = metaBucket.Put(boltMetaDataTimeStamp, timeStampBinary)
if err != nil {
return nil, nil, err
}
// persist internal values
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
if err != nil {
return nil, nil, err
}
// TODO optimize writing these in order?
for k, v := range snapshot.internal {
err = internalBucket.Put([]byte(k), v)
if err != nil {
return nil, nil, err
}
}
if snapshot.parent != nil {
val := make([]byte, 8)
bytesWritten := atomic.LoadUint64(&snapshot.parent.stats.TotBytesWrittenAtIndexTime)
binary.LittleEndian.PutUint64(val, bytesWritten)
err = internalBucket.Put(TotBytesWrittenKey, val)
if err != nil {
return nil, nil, err
}
}
filenames := make([]string, 0, len(snapshot.segment))
newSegmentPaths := make(map[uint64]string, len(snapshot.segment))
// first ensure that each segment in this snapshot has been persisted
for _, segmentSnapshot := range snapshot.segment {
snapshotSegmentKey := encodeUvarintAscending(nil, segmentSnapshot.id)
snapshotSegmentBucket, err := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey)
if err != nil {
return nil, nil, err
}
switch seg := segmentSnapshot.segment.(type) {
case segment.PersistedSegment:
segPath := seg.Path()
_, err = copyToDirectory(segPath, d)
if err != nil {
return nil, nil, fmt.Errorf("segment: %s copy err: %v", segPath, err)
}
filename := filepath.Base(segPath)
err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
if err != nil {
return nil, nil, err
}
filenames = append(filenames, filename)
case segment.UnpersistedSegment:
// need to persist this to disk if it's not part of the exclude list (which
// restricts which in-memory segments are to be persisted to disk)
if _, ok := exclude[segmentSnapshot.id]; !ok {
filename := zapFileName(segmentSnapshot.id)
path := filepath.Join(path, filename)
err := persistToDirectory(seg, d, path)
if err != nil {
return nil, nil, fmt.Errorf("segment: %s persist err: %v", path, err)
}
newSegmentPaths[segmentSnapshot.id] = path
err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename))
if err != nil {
return nil, nil, err
}
filenames = append(filenames, filename)
}
default:
return nil, nil, fmt.Errorf("unknown segment type: %T", seg)
}
// store current deleted bits
var roaringBuf bytes.Buffer
if segmentSnapshot.deleted != nil {
_, err = segmentSnapshot.deleted.WriteTo(&roaringBuf)
if err != nil {
return nil, nil, fmt.Errorf("error persisting roaring bytes: %v", err)
}
err = snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes())
if err != nil {
return nil, nil, err
}
}
// store segment stats
if segmentSnapshot.stats != nil {
b, err := json.Marshal(segmentSnapshot.stats.Fetch())
if err != nil {
return nil, nil, err
}
err = snapshotSegmentBucket.Put(boltStatsKey, b)
if err != nil {
return nil, nil, err
}
}
}
return filenames, newSegmentPaths, nil
}
func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot, exclude map[uint64]struct{}) (err error) {
// start a write transaction
tx, err := s.rootBolt.Begin(true)
if err != nil {
return err
}
// defer rollback on error
defer func() {
if err != nil {
_ = tx.Rollback()
}
}()
filenames, newSegmentPaths, err := prepareBoltSnapshot(snapshot, tx, s.path, s.segPlugin, exclude, nil)
if err != nil {
return err
}
// we need to swap in a new root only when we've persisted 1 or
// more segments -- whereby the new root would have 1-for-1
// replacements of in-memory segments with file-based segments
//
// other cases like updates to internal values only, and/or when
// there are only deletions, are already covered and persisted by
// the newly populated boltdb snapshotBucket above
if len(newSegmentPaths) > 0 {
// now try to open all the new snapshots
newSegments := make(map[uint64]segment.Segment, len(newSegmentPaths))
defer func() {
for _, s := range newSegments {
if s != nil {
// cleanup segments that were opened but not
// swapped into the new root
_ = s.Close()
}
}
}()
for segmentID, path := range newSegmentPaths {
newSegments[segmentID], err = s.segPlugin.Open(path)
if err != nil {
return fmt.Errorf("error opening new segment at %s, %v", path, err)
}
}
persist := &persistIntroduction{
persisted: newSegments,
applied: make(notificationChan),
}
select {
case <-s.closeCh:
return segment.ErrClosed
case s.persists <- persist:
}
select {
case <-s.closeCh:
return segment.ErrClosed
case <-persist.applied:
}
}
err = tx.Commit()
if err != nil {
return err
}
err = s.rootBolt.Sync()
if err != nil {
return err
}
// allow files to become eligible for removal after commit, such
// as file segments from snapshots that came from the merger
s.rootLock.Lock()
for _, filename := range filenames {
delete(s.ineligibleForRemoval, filename)
}
s.rootLock.Unlock()
return nil
}
func zapFileName(epoch uint64) string {
return fmt.Sprintf("%012x.zap", epoch)
}
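// For example, zapFileName(74) yields "00000000004a.zap"; the fixed-width,
// zero-padded hex name keeps lexical file ordering consistent with numeric
// segment id ordering.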
// bolt snapshot code
var (
boltSnapshotsBucket = []byte{'s'}
boltPathKey = []byte{'p'}
boltDeletedKey = []byte{'d'}
boltInternalKey = []byte{'i'}
boltMetaDataKey = []byte{'m'}
boltMetaDataSegmentTypeKey = []byte("type")
boltMetaDataSegmentVersionKey = []byte("version")
boltMetaDataTimeStamp = []byte("timeStamp")
boltStatsKey = []byte("stats")
TotBytesWrittenKey = []byte("TotBytesWritten")
)
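// A sketch of the resulting bolt layout, as written by prepareBoltSnapshot
// and read back by loadSnapshot/loadSegment:
//
//	"s" (snapshots bucket)
//	└── <epoch, uvarint-ascending>
//	    ├── "m" (metadata): "type", "version", "timeStamp"
//	    ├── "i" (internal): application key/values (+ "TotBytesWritten")
//	    └── <segment id, uvarint-ascending>
//	        ├── "p": segment file name
//	        ├── "d": roaring bitmap of deleted doc numbers (if any)
//	        └── "stats": JSON-encoded field stats (if any)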
func (s *Scorch) loadFromBolt() error {
err := s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
foundRoot := false
c := snapshots.Cursor()
for k, _ := c.Last(); k != nil; k, _ = c.Prev() {
_, snapshotEpoch, err := decodeUvarintAscending(k)
if err != nil {
log.Printf("unable to parse segment epoch %x, continuing", k)
continue
}
if foundRoot {
s.AddEligibleForRemoval(snapshotEpoch)
continue
}
snapshot := snapshots.Bucket(k)
if snapshot == nil {
log.Printf("snapshot key, but bucket missing %x, continuing", k)
s.AddEligibleForRemoval(snapshotEpoch)
continue
}
indexSnapshot, err := s.loadSnapshot(snapshot)
if err != nil {
log.Printf("unable to load snapshot, %v, continuing", err)
s.AddEligibleForRemoval(snapshotEpoch)
continue
}
indexSnapshot.epoch = snapshotEpoch
// set the nextSegmentID
s.nextSegmentID, err = s.maxSegmentIDOnDisk()
if err != nil {
return err
}
s.nextSegmentID++
s.rootLock.Lock()
s.nextSnapshotEpoch = snapshotEpoch + 1
rootPrev := s.root
s.root = indexSnapshot
s.rootLock.Unlock()
if rootPrev != nil {
_ = rootPrev.DecRef()
}
foundRoot = true
}
return nil
})
if err != nil {
return err
}
persistedSnapshots, err := s.rootBoltSnapshotMetaData()
if err != nil {
return err
}
s.checkPoints = persistedSnapshots
return nil
}
// LoadSnapshot loads the segment with the specified epoch
// NOTE: this is currently ONLY intended to be used by the command-line tool
func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
err = s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
snapshotKey := encodeUvarintAscending(nil, epoch)
snapshot := snapshots.Bucket(snapshotKey)
if snapshot == nil {
return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch)
}
rv, err = s.loadSnapshot(snapshot)
return err
})
if err != nil {
return nil, err
}
return rv, nil
}
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
rv := &IndexSnapshot{
parent: s,
internal: make(map[string][]byte),
refs: 1,
creator: "loadSnapshot",
}
// first we look for the meta-data bucket, this will tell us
// which segment type/version was used for this snapshot
// all operations for this scorch will use this type/version
metaBucket := snapshot.Bucket(boltMetaDataKey)
if metaBucket == nil {
_ = rv.DecRef()
return nil, fmt.Errorf("meta-data bucket missing")
}
segmentType := string(metaBucket.Get(boltMetaDataSegmentTypeKey))
segmentVersion := binary.BigEndian.Uint32(
metaBucket.Get(boltMetaDataSegmentVersionKey))
err := s.loadSegmentPlugin(segmentType, segmentVersion)
if err != nil {
_ = rv.DecRef()
return nil, fmt.Errorf(
"unable to load correct segment wrapper: %v", err)
}
var running uint64
c := snapshot.Cursor()
for k, _ := c.First(); k != nil; k, _ = c.Next() {
if k[0] == boltInternalKey[0] {
internalBucket := snapshot.Bucket(k)
if internalBucket == nil {
_ = rv.DecRef()
return nil, fmt.Errorf("internal bucket missing")
}
err := internalBucket.ForEach(func(key []byte, val []byte) error {
copiedVal := append([]byte(nil), val...)
rv.internal[string(key)] = copiedVal
return nil
})
if err != nil {
_ = rv.DecRef()
return nil, err
}
} else if k[0] != boltMetaDataKey[0] {
segmentBucket := snapshot.Bucket(k)
if segmentBucket == nil {
_ = rv.DecRef()
return nil, fmt.Errorf("segment key, but bucket missing % x", k)
}
segmentSnapshot, err := s.loadSegment(segmentBucket)
if err != nil {
_ = rv.DecRef()
return nil, fmt.Errorf("failed to load segment: %v", err)
}
_, segmentSnapshot.id, err = decodeUvarintAscending(k)
if err != nil {
_ = rv.DecRef()
return nil, fmt.Errorf("failed to decode segment id: %v", err)
}
rv.segment = append(rv.segment, segmentSnapshot)
rv.offsets = append(rv.offsets, running)
running += segmentSnapshot.segment.Count()
}
}
return rv, nil
}
func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, error) {
pathBytes := segmentBucket.Get(boltPathKey)
if pathBytes == nil {
return nil, fmt.Errorf("segment path missing")
}
segmentPath := s.path + string(os.PathSeparator) + string(pathBytes)
segment, err := s.segPlugin.Open(segmentPath)
if err != nil {
return nil, fmt.Errorf("error opening bolt segment: %v", err)
}
rv := &SegmentSnapshot{
segment: segment,
cachedDocs: &cachedDocs{cache: nil},
cachedMeta: &cachedMeta{meta: nil},
}
deletedBytes := segmentBucket.Get(boltDeletedKey)
if deletedBytes != nil {
deletedBitmap := roaring.NewBitmap()
r := bytes.NewReader(deletedBytes)
_, err := deletedBitmap.ReadFrom(r)
if err != nil {
_ = segment.Close()
return nil, fmt.Errorf("error reading deleted bytes: %v", err)
}
if !deletedBitmap.IsEmpty() {
rv.deleted = deletedBitmap
}
}
statBytes := segmentBucket.Get(boltStatsKey)
if statBytes != nil {
var statsMap map[string]map[string]uint64
err := json.Unmarshal(statBytes, &statsMap)
if err != nil {
_ = segment.Close()
return nil, fmt.Errorf("error reading stat bytes: %v", err)
}
rv.stats = &fieldStats{statMap: statsMap}
}
return rv, nil
}
func (s *Scorch) removeOldData() {
removed, err := s.removeOldBoltSnapshots()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err))
}
atomic.AddUint64(&s.stats.TotSnapshotsRemovedFromMetaStore, uint64(removed))
err = s.removeOldZapFiles()
if err != nil {
s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err))
}
}
// NumSnapshotsToKeep represents how many recent, old snapshots to
// keep around per Scorch instance. Useful for apps that require
// rollback'ability.
var NumSnapshotsToKeep = 1
// RollbackSamplingInterval controls how far back we are looking
// in the history to get the rollback points.
// For example, a value of 10 minutes ensures that the
// protected snapshots (NumSnapshotsToKeep = 3) are:
//
// the very latest snapshot(ie the current one),
// the snapshot that was persisted 10 minutes before the current one,
// the snapshot that was persisted 20 minutes before the current one
//
// By default however, the timeseries way of protecting snapshots is
// disabled, and we protect the latest three contiguous snapshots
var RollbackSamplingInterval = 0 * time.Minute
// RollbackRetentionFactor controls what portion of the earlier rollback
// points to retain during an infrequent/sparse mutation scenario
var RollbackRetentionFactor = float64(0.5)
func getTimeSeriesSnapshots(maxDataPoints int, interval time.Duration,
snapshots []*snapshotMetaData,
) (int, map[uint64]time.Time) {
if interval == 0 {
return len(snapshots), map[uint64]time.Time{}
}
// the map containing the time series of protected snapshots, each of
// which is separated by rollbackSamplingInterval
rv := make(map[uint64]time.Time)
// ptr tracks the latest point admitted into the time series; the
// traversal starts from the oldest snapshot, at the end of the list
ptr := len(snapshots) - 1
rv[snapshots[ptr].epoch] = snapshots[ptr].timeStamp
numSnapshotsProtected := 1
// traverse the list in reverse order, from older timestamps to newer ones.
for i := ptr - 1; i >= 0; i-- {
// If snapshots[i] is more than RollbackSamplingInterval (the interval of
// our time series, compared in minutes) newer than the current point,
// then snapshots[i+1] is the latest snapshot still within the interval;
// add its epoch to rv and make it the new current point.
if snapshots[i].timeStamp.Sub(snapshots[ptr].timeStamp).Minutes() >
interval.Minutes() {
if _, ok := rv[snapshots[i+1].epoch]; !ok {
rv[snapshots[i+1].epoch] = snapshots[i+1].timeStamp
ptr = i + 1
numSnapshotsProtected++
}
} else if snapshots[i].timeStamp.Sub(snapshots[ptr].timeStamp).Minutes() ==
interval.Minutes() {
if _, ok := rv[snapshots[i].epoch]; !ok {
rv[snapshots[i].epoch] = snapshots[i].timeStamp
ptr = i
numSnapshotsProtected++
}
}
if numSnapshotsProtected >= maxDataPoints {
break
}
}
return ptr, rv
}
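// For instance (hypothetical timestamps): with interval = 10 minutes and
// snapshots stamped exactly 10 minutes apart, every snapshot becomes a data
// point of the series until maxDataPoints is reached; with irregular
// spacing, the latest snapshot still within the interval of the current
// point is admitted and becomes the new point.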
// getProtectedSnapshots fetches the epochs to keep, chosen on a timestamp
// basis. It tries to get numSnapshotsToKeep snapshots, each of which is
// separated by a time duration of rollbackSamplingInterval.
func getProtectedSnapshots(rollbackSamplingInterval time.Duration,
numSnapshotsToKeep int,
persistedSnapshots []*snapshotMetaData,
) map[uint64]time.Time {
// keep numSnapshotsToKeep - 1 worth of time series snapshots, because we
// must always preserve the very latest snapshot in bolt as well, to avoid
// accidental deletes of bolt entries and cleanups by the purger code.
lastPoint, protectedEpochs := getTimeSeriesSnapshots(numSnapshotsToKeep-1,
rollbackSamplingInterval, persistedSnapshots)
if len(protectedEpochs) < numSnapshotsToKeep {
numSnapshotsNeeded := numSnapshotsToKeep - len(protectedEpochs)
// protect the newest contiguous snapshots, stopping at the last point
// admitted into the time series
for i := 0; i < numSnapshotsNeeded && i < lastPoint; i++ {
protectedEpochs[persistedSnapshots[i].epoch] = persistedSnapshots[i].timeStamp
}
}
return protectedEpochs
}
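// newCheckPoints flattens the protected snapshots map into a list of
// snapshotMetaData, sorted newest-first by timestamp.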
func newCheckPoints(snapshots map[uint64]time.Time) []*snapshotMetaData {
rv := make([]*snapshotMetaData, 0)
keys := make([]uint64, 0, len(snapshots))
for k := range snapshots {
keys = append(keys, k)
}
sort.SliceStable(keys, func(i, j int) bool {
return snapshots[keys[i]].Sub(snapshots[keys[j]]) > 0
})
for _, key := range keys {
rv = append(rv, &snapshotMetaData{
epoch: key,
timeStamp: snapshots[key],
})
}
return rv
}
// Removes enough snapshots from the rootBolt so that the
// s.eligibleForRemoval stays under the NumSnapshotsToKeep policy.
func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
persistedSnapshots, err := s.rootBoltSnapshotMetaData()
if err != nil {
return 0, err
}
if len(persistedSnapshots) <= s.numSnapshotsToKeep {
// we need to keep everything
return 0, nil
}
protectedSnapshots := getProtectedSnapshots(s.rollbackSamplingInterval,
s.numSnapshotsToKeep, persistedSnapshots)
var epochsToRemove []uint64
var newEligible []uint64
s.rootLock.Lock()
for _, epoch := range s.eligibleForRemoval {
if _, ok := protectedSnapshots[epoch]; ok {
// protected
newEligible = append(newEligible, epoch)
} else {
epochsToRemove = append(epochsToRemove, epoch)
}
}
s.eligibleForRemoval = newEligible
s.rootLock.Unlock()
s.checkPoints = newCheckPoints(protectedSnapshots)
if len(epochsToRemove) == 0 {
return 0, nil
}
tx, err := s.rootBolt.Begin(true)
if err != nil {
return 0, err
}
defer func() {
if err == nil {
err = tx.Commit()
} else {
_ = tx.Rollback()
}
if err == nil {
err = s.rootBolt.Sync()
}
}()
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return 0, nil
}
for _, epochToRemove := range epochsToRemove {
k := encodeUvarintAscending(nil, epochToRemove)
err = snapshots.DeleteBucket(k)
if err == bolt.ErrBucketNotFound {
err = nil
}
if err == nil {
numRemoved++
}
}
return numRemoved, err
}
func (s *Scorch) maxSegmentIDOnDisk() (uint64, error) {
files, err := os.ReadDir(s.path)
if err != nil {
return 0, err
}
var rv uint64
for _, f := range files {
fname := f.Name()
if filepath.Ext(fname) == ".zap" {
prefix := strings.TrimSuffix(fname, ".zap")
id, err2 := strconv.ParseUint(prefix, 16, 64)
if err2 != nil {
return 0, err2
}
if id > rv {
rv = id
}
}
}
return rv, err
}
// Removes any *.zap files which aren't listed in the rootBolt.
func (s *Scorch) removeOldZapFiles() error {
liveFileNames, err := s.loadZapFileNames()
if err != nil {
return err
}
files, err := os.ReadDir(s.path)
if err != nil {
return err
}
s.rootLock.RLock()
for _, f := range files {
fname := f.Name()
if filepath.Ext(fname) == ".zap" {
if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] && (s.copyScheduled[fname] <= 0) {
err := os.Remove(s.path + string(os.PathSeparator) + fname)
if err != nil {
log.Printf("got err removing file: %s, err: %v", fname, err)
}
}
}
}
s.rootLock.RUnlock()
return nil
}
// In a sparse mutation scenario, it can so happen that all protected
// snapshots are older than the numSnapshotsToKeep * rollbackSamplingInterval
// duration. This results in all of them being purged from the boltDB,
// and the next iteration of removeOldData() would end up protecting only
// the latest contiguous snapshots, which is a poor pattern for rollback
// checkpoints. Hence, in such a scenario, we try to retain at most a
// retentionFactor portion worth of old snapshots using the following function.
func getBoundaryCheckPoint(retentionFactor float64,
checkPoints []*snapshotMetaData, timeStamp time.Time,
) time.Time {
if checkPoints != nil {
boundary := checkPoints[int(math.Floor(float64(len(checkPoints))*
retentionFactor))]
if timeStamp.Sub(boundary.timeStamp) > 0 {
// return the extended boundary which will dictate the older snapshots
// to be retained
return boundary.timeStamp
}
}
return timeStamp
}
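// For example (hypothetical values): with retentionFactor = 0.5 and four
// checkpoints stamped, newest first, at 12:30, 12:20, 12:10 and 12:00, the
// boundary is checkPoints[int(math.Floor(4*0.5))] = checkPoints[2], i.e.
// 12:10; an expiring snapshot stamped 12:15 is newer than that boundary, so
// 12:10 is returned and snapshots back to 12:10 remain retained.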
type snapshotMetaData struct {
epoch uint64
timeStamp time.Time
}
func (s *Scorch) rootBoltSnapshotMetaData() ([]*snapshotMetaData, error) {
var rv []*snapshotMetaData
currTime := time.Now()
// including the very latest snapshot there should be n snapshots, so the
// oldest one would be at tc - (n-1) * d;
// e.g. for n = 3 the checkpoints preserved should be tc, tc - d, tc - 2d
expirationDuration := time.Duration(s.numSnapshotsToKeep-1) * s.rollbackSamplingInterval
err := s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
sc := snapshots.Cursor()
var found bool
// traversal order - latest -> oldest epoch
for sk, _ := sc.Last(); sk != nil; sk, _ = sc.Prev() {
_, snapshotEpoch, err := decodeUvarintAscending(sk)
if err != nil {
continue
}
if expirationDuration == 0 {
rv = append(rv, &snapshotMetaData{
epoch: snapshotEpoch,
})
continue
}
snapshot := snapshots.Bucket(sk)
if snapshot == nil {
continue
}
metaBucket := snapshot.Bucket(boltMetaDataKey)
if metaBucket == nil {
continue
}
timeStampBytes := metaBucket.Get(boltMetaDataTimeStamp)
var timeStamp time.Time
err = timeStamp.UnmarshalText(timeStampBytes)
if err != nil {
continue
}
// Don't keep snapshots older than the
// expiration duration ((numSnapshotsToKeep - 1) *
// rollbackSamplingInterval, by default)
if currTime.Sub(timeStamp) <= expirationDuration {
rv = append(rv, &snapshotMetaData{
epoch: snapshotEpoch,
timeStamp: timeStamp,
})
} else {
if !found {
found = true
boundary := getBoundaryCheckPoint(s.rollbackRetentionFactor,
s.checkPoints, timeStamp)
expirationDuration = currTime.Sub(boundary)
continue
}
k := encodeUvarintAscending(nil, snapshotEpoch)
err = snapshots.DeleteBucket(k)
if err == bolt.ErrBucketNotFound {
err = nil
}
}
}
return nil
})
return rv, err
}
func (s *Scorch) RootBoltSnapshotEpochs() ([]uint64, error) {
var rv []uint64
err := s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
sc := snapshots.Cursor()
for sk, _ := sc.Last(); sk != nil; sk, _ = sc.Prev() {
_, snapshotEpoch, err := decodeUvarintAscending(sk)
if err != nil {
continue
}
rv = append(rv, snapshotEpoch)
}
return nil
})
return rv, err
}
// Returns the *.zap file names that are listed in the rootBolt.
func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) {
rv := map[string]struct{}{}
err := s.rootBolt.View(func(tx *bolt.Tx) error {
snapshots := tx.Bucket(boltSnapshotsBucket)
if snapshots == nil {
return nil
}
sc := snapshots.Cursor()
for sk, _ := sc.First(); sk != nil; sk, _ = sc.Next() {
snapshot := snapshots.Bucket(sk)
if snapshot == nil {
continue
}
segc := snapshot.Cursor()
for segk, _ := segc.First(); segk != nil; segk, _ = segc.Next() {
if segk[0] == boltInternalKey[0] {
continue
}
segmentBucket := snapshot.Bucket(segk)
if segmentBucket == nil {
continue
}
pathBytes := segmentBucket.Get(boltPathKey)
if pathBytes == nil {
continue
}
pathString := string(pathBytes)
rv[pathString] = struct{}{}
}
}
return nil
})
return rv, err
}