1
0
Fork 0
golang-github-blevesearch-b.../mapping/index.go
Daniel Baumann 982828099e
Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-19 00:20:02 +02:00

573 lines
17 KiB
Go

// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mapping
import (
"encoding/json"
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/analyzer/standard"
"github.com/blevesearch/bleve/v2/analysis/datetime/optional"
"github.com/blevesearch/bleve/v2/document"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/util"
index "github.com/blevesearch/bleve_index_api"
)
// MappingJSONStrict controls how IndexMappingImpl.UnmarshalJSON treats
// unrecognized top-level keys: when true they are reported as an error,
// when false (the default) they are ignored.
var MappingJSONStrict = false

// Default values assigned to a new or freshly unmarshaled index mapping.
const (
	defaultTypeField      = "_type"
	defaultType           = "_default"
	defaultField          = "_all"
	defaultAnalyzer       = standard.Name
	defaultDateTimeParser = optional.Name
)
// An IndexMappingImpl controls how objects are placed
// into an index.
// First the type of the object is determined.
// Once the type is known, the appropriate
// DocumentMapping is selected by the type.
// If no mapping was determined for that type,
// a DefaultMapping will be used.
type IndexMappingImpl struct {
// TypeMapping holds the document mappings keyed by document type name.
TypeMapping map[string]*DocumentMapping `json:"types,omitempty"`
// DefaultMapping is used when TypeMapping has no (non-nil) entry for a
// document's type.
DefaultMapping *DocumentMapping `json:"default_mapping"`
// TypeField is the property path looked up in a document to find its
// type when the document does not implement a classifier interface.
TypeField string `json:"type_field"`
// DefaultType is the type assigned when no type could be determined.
DefaultType string `json:"default_type"`
// DefaultAnalyzer is the analyzer name returned by AnalyzerNameForPath
// when nothing more specific is found.
DefaultAnalyzer string `json:"default_analyzer"`
// DefaultDateTimeParser is the parser name used by DateTimeParserNamed
// when it is called with an empty name.
DefaultDateTimeParser string `json:"default_datetime_parser"`
// DefaultSynonymSource is the synonym source name returned by
// SynonymSourceForPath when nothing more specific is found. It is only
// validated when non-empty.
DefaultSynonymSource string `json:"default_synonym_source,omitempty"`
// ScoringModel must be empty or a key of index.SupportedScoringModels
// for Validate to succeed.
ScoringModel string `json:"scoring_model,omitempty"`
// DefaultField is the value returned by DefaultSearchField.
DefaultField string `json:"default_field"`
// StoreDynamic, IndexDynamic and DocValuesDynamic are initialized from
// the package-level values of the same names.
// NOTE(review): presumably these govern how dynamically discovered
// fields are stored, indexed and given doc values — confirm against
// DocumentMapping.
StoreDynamic bool `json:"store_dynamic"`
IndexDynamic bool `json:"index_dynamic"`
DocValuesDynamic bool `json:"docvalues_dynamic"`
// CustomAnalysis records the configuration of every custom analysis
// component added through the AddCustom*/AddSynonymSource methods.
CustomAnalysis *customAnalysis `json:"analysis,omitempty"`
// cache is the registry cache that custom components are defined in and
// named components are looked up from. It is unexported and therefore
// not part of the JSON form.
cache *registry.Cache
}
// AddCustomCharFilter registers a custom char filter under the given name.
// The definition is first handed to the mapping's registry cache; only when
// the cache accepts it is the config recorded in CustomAnalysis.CharFilters.
// Any error reported by the cache is returned unchanged.
func (im *IndexMappingImpl) AddCustomCharFilter(name string, config map[string]interface{}) error {
	if _, defineErr := im.cache.DefineCharFilter(name, config); defineErr != nil {
		return defineErr
	}
	im.CustomAnalysis.CharFilters[name] = config
	return nil
}
// AddCustomTokenizer registers a custom tokenizer under the given name.
// The definition is first handed to the mapping's registry cache; only when
// the cache accepts it is the config recorded in CustomAnalysis.Tokenizers.
// Any error reported by the cache is returned unchanged.
func (im *IndexMappingImpl) AddCustomTokenizer(name string, config map[string]interface{}) error {
	if _, defineErr := im.cache.DefineTokenizer(name, config); defineErr != nil {
		return defineErr
	}
	im.CustomAnalysis.Tokenizers[name] = config
	return nil
}
// AddCustomTokenMap registers a custom token map under the given name.
// The definition is first handed to the mapping's registry cache; only when
// the cache accepts it is the config recorded in CustomAnalysis.TokenMaps.
// Any error reported by the cache is returned unchanged.
func (im *IndexMappingImpl) AddCustomTokenMap(name string, config map[string]interface{}) error {
	if _, defineErr := im.cache.DefineTokenMap(name, config); defineErr != nil {
		return defineErr
	}
	im.CustomAnalysis.TokenMaps[name] = config
	return nil
}
// AddCustomTokenFilter registers a custom token filter under the given name.
// The definition is first handed to the mapping's registry cache; only when
// the cache accepts it is the config recorded in CustomAnalysis.TokenFilters.
// Any error reported by the cache is returned unchanged.
func (im *IndexMappingImpl) AddCustomTokenFilter(name string, config map[string]interface{}) error {
	if _, defineErr := im.cache.DefineTokenFilter(name, config); defineErr != nil {
		return defineErr
	}
	im.CustomAnalysis.TokenFilters[name] = config
	return nil
}
// AddCustomAnalyzer defines a custom analyzer for use in this mapping. The
// config map must contain a "type" string entry, which is used to resolve the
// analyzer constructor. The constructor is invoked with the remaining entries
// and the returned analyzer is registered in the IndexMapping.
//
// bleve comes with predefined analyzers, like
// github.com/blevesearch/bleve/v2/analysis/analyzer/custom. They are
// available only if their package is imported by client code. To achieve
// this, use their metadata to fill configuration entries:
//
//	import (
//	    "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
//	    "github.com/blevesearch/bleve/v2/analysis/char/html"
//	    "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
//	    "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
//	)
//
//	m := bleve.NewIndexMapping()
//	err := m.AddCustomAnalyzer("html", map[string]interface{}{
//	    "type": custom.Name,
//	    "char_filters": []string{
//	        html.Name,
//	    },
//	    "tokenizer": unicode.Name,
//	    "token_filters": []string{
//	        lowercase.Name,
//	        ...
//	    },
//	})
//
// The config is recorded in CustomAnalysis.Analyzers only after the registry
// cache has accepted the definition; a cache error is returned unchanged.
func (im *IndexMappingImpl) AddCustomAnalyzer(name string, config map[string]interface{}) error {
	if _, defineErr := im.cache.DefineAnalyzer(name, config); defineErr != nil {
		return defineErr
	}
	im.CustomAnalysis.Analyzers[name] = config
	return nil
}
// AddCustomDateTimeParser registers a custom date time parser under the given
// name. The definition is first handed to the mapping's registry cache; only
// when the cache accepts it is the config recorded in
// CustomAnalysis.DateTimeParsers. Any error reported by the cache is returned
// unchanged.
func (im *IndexMappingImpl) AddCustomDateTimeParser(name string, config map[string]interface{}) error {
	if _, defineErr := im.cache.DefineDateTimeParser(name, config); defineErr != nil {
		return defineErr
	}
	im.CustomAnalysis.DateTimeParsers[name] = config
	return nil
}
// AddSynonymSource registers a synonym source under the given name. The
// definition is first handed to the mapping's registry cache; only when the
// cache accepts it is the config recorded in CustomAnalysis.SynonymSources.
// Any error reported by the cache is returned unchanged.
func (im *IndexMappingImpl) AddSynonymSource(name string, config map[string]interface{}) error {
	if _, defineErr := im.cache.DefineSynonymSource(name, config); defineErr != nil {
		return defineErr
	}
	im.CustomAnalysis.SynonymSources[name] = config
	return nil
}
// NewIndexMapping returns an IndexMappingImpl configured entirely with the
// package defaults: an empty set of type mappings, a fresh default document
// mapping, the default type field, type, analyzer, date time parser and
// search field, the package-level dynamic flags, an empty custom analysis
// section and a new registry cache.
func NewIndexMapping() *IndexMappingImpl {
	im := new(IndexMappingImpl)
	im.TypeMapping = make(map[string]*DocumentMapping)
	im.DefaultMapping = NewDocumentMapping()
	im.TypeField = defaultTypeField
	im.DefaultType = defaultType
	im.DefaultAnalyzer = defaultAnalyzer
	im.DefaultDateTimeParser = defaultDateTimeParser
	im.DefaultField = defaultField
	im.IndexDynamic = IndexDynamic
	im.StoreDynamic = StoreDynamic
	im.DocValuesDynamic = DocValuesDynamic
	im.CustomAnalysis = newCustomAnalysis()
	im.cache = registry.NewCache()
	return im
}
// Validate checks that the mapping is usable and returns the first problem
// found. In order, it verifies that:
//   - the default analyzer and default date time parser can be obtained
//     from the registry cache,
//   - the default synonym source, when one is set, can be obtained too,
//   - the default document mapping and every type mapping validate,
//   - the scoring model is either empty or listed in
//     index.SupportedScoringModels.
func (im *IndexMappingImpl) Validate() error {
	if _, err := im.cache.AnalyzerNamed(im.DefaultAnalyzer); err != nil {
		return err
	}
	if _, err := im.cache.DateTimeParserNamed(im.DefaultDateTimeParser); err != nil {
		return err
	}
	if im.DefaultSynonymSource != "" {
		if _, err := im.cache.SynonymSourceNamed(im.DefaultSynonymSource); err != nil {
			return err
		}
	}

	// the same alias context map is handed to every document mapping
	aliasCtx := make(map[string]*FieldMapping)
	if err := im.DefaultMapping.Validate(im.cache, "", aliasCtx); err != nil {
		return err
	}
	for _, dm := range im.TypeMapping {
		if err := dm.Validate(im.cache, "", aliasCtx); err != nil {
			return err
		}
	}

	// an empty scoring model is always acceptable
	if im.ScoringModel == "" {
		return nil
	}
	if _, supported := index.SupportedScoringModels[im.ScoringModel]; !supported {
		return fmt.Errorf("unsupported scoring model: %s", im.ScoringModel)
	}
	return nil
}
// AddDocumentMapping sets a custom document mapping for the specified type,
// replacing any mapping previously stored under that type name.
//
// TypeMapping is allocated on first use, so the call does not panic on a
// mapping whose TypeMapping is nil (for example a zero-value
// IndexMappingImpl).
func (im *IndexMappingImpl) AddDocumentMapping(doctype string, dm *DocumentMapping) {
	if im.TypeMapping == nil {
		// writing to a nil map panics; create it lazily instead
		im.TypeMapping = make(map[string]*DocumentMapping)
	}
	im.TypeMapping[doctype] = dm
}
// mappingForType returns the document mapping stored for docType, or the
// default mapping when there is no entry (or a nil entry) for that type.
func (im *IndexMappingImpl) mappingForType(docType string) *DocumentMapping {
	if dm := im.TypeMapping[docType]; dm != nil {
		return dm
	}
	return im.DefaultMapping
}
// UnmarshalJSON offers custom unmarshaling with optional strict validation.
//
// Every field is first reset to the value NewIndexMapping would assign, so
// keys omitted from data keep their defaults. Each recognized top-level key
// is then decoded into its field. Unrecognized keys are collected and, when
// MappingJSONStrict is true, reported as an error. Finally the custom
// analysis components are registered with the mapping.
//
// A JSON null for "analysis", "default_mapping" or "types" decodes to a nil
// pointer or map. Those fields are restored to their defaults afterwards so
// that later use of the mapping neither dereferences nil nor writes to a nil
// map.
//
// The first decoding or registration error encountered is returned.
func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error {
	var tmp map[string]json.RawMessage
	if err := util.UnmarshalJSON(data, &tmp); err != nil {
		return err
	}

	// set defaults for fields which might have been omitted
	im.cache = registry.NewCache()
	im.CustomAnalysis = newCustomAnalysis()
	im.TypeField = defaultTypeField
	im.DefaultType = defaultType
	im.DefaultAnalyzer = defaultAnalyzer
	im.DefaultDateTimeParser = defaultDateTimeParser
	im.DefaultField = defaultField
	im.DefaultMapping = NewDocumentMapping()
	im.TypeMapping = make(map[string]*DocumentMapping)
	im.StoreDynamic = StoreDynamic
	im.IndexDynamic = IndexDynamic
	im.DocValuesDynamic = DocValuesDynamic

	// decoding destination for every recognized top-level key
	targets := map[string]interface{}{
		"analysis":                &im.CustomAnalysis,
		"type_field":              &im.TypeField,
		"default_type":            &im.DefaultType,
		"default_analyzer":        &im.DefaultAnalyzer,
		"default_datetime_parser": &im.DefaultDateTimeParser,
		"default_synonym_source":  &im.DefaultSynonymSource,
		"default_field":           &im.DefaultField,
		"default_mapping":         &im.DefaultMapping,
		"types":                   &im.TypeMapping,
		"store_dynamic":           &im.StoreDynamic,
		"index_dynamic":           &im.IndexDynamic,
		"docvalues_dynamic":       &im.DocValuesDynamic,
		"scoring_model":           &im.ScoringModel,
	}

	var invalidKeys []string
	for k, v := range tmp {
		target, known := targets[k]
		if !known {
			invalidKeys = append(invalidKeys, k)
			continue
		}
		if err := util.UnmarshalJSON(v, target); err != nil {
			return err
		}
	}

	if MappingJSONStrict && len(invalidKeys) > 0 {
		return fmt.Errorf("index mapping contains invalid keys: %v", invalidKeys)
	}

	// an explicit JSON null replaces the defaults above with nil; put the
	// defaults back so the mapping stays usable
	if im.CustomAnalysis == nil {
		im.CustomAnalysis = newCustomAnalysis()
	}
	if im.DefaultMapping == nil {
		im.DefaultMapping = NewDocumentMapping()
	}
	if im.TypeMapping == nil {
		im.TypeMapping = make(map[string]*DocumentMapping)
	}

	if err := im.CustomAnalysis.registerAll(im); err != nil {
		return err
	}
	return nil
}
// determineType works out the type name of data. It tries, in order:
// the bleveClassifier interface, the Classifier interface, a string found at
// the mapping's TypeField path inside data, and finally the DefaultType.
func (im *IndexMappingImpl) determineType(data interface{}) string {
	// classifier interfaces take precedence, bleveClassifier first
	switch c := data.(type) {
	case bleveClassifier:
		return c.BleveType()
	case Classifier:
		return c.Type()
	}

	// otherwise look for a type value inside the object itself
	if typ, found := mustString(lookupPropertyPath(data, im.TypeField)); found {
		return typ
	}

	return im.DefaultType
}
// MapDocument populates doc from data using the document mapping selected
// for data's type. When that mapping is disabled nothing is added to doc.
// Otherwise data is walked, a composite "_all" field is added unless the
// mapping for "_all" exists and is disabled, and doc is marked as indexed.
// The returned error is always nil.
func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
	docMapping := im.mappingForType(im.determineType(data))
	if !docMapping.Enabled {
		return nil
	}

	wctx := im.newWalkContext(doc, docMapping)
	docMapping.walkDocument(data, []string{}, []uint64{}, wctx)

	// add the composite field unless "_all" was explicitly disabled; only
	// the first result of the lookup is needed here
	allMapping, _ := docMapping.documentMappingForPath("_all")
	if allMapping == nil || allMapping.Enabled {
		allField := document.NewCompositeFieldWithIndexingOptions(
			"_all", true, []string{}, wctx.excludedFromAll,
			index.IndexField|index.IncludeTermVectors)
		doc.AddField(allField)
	}

	doc.SetIndexed()
	return nil
}
// MapSynonymDocument adds one synonym field to doc for every synonym source
// of this mapping whose collection equals the given collection. Each field
// is named after its synonym source and uses that source's analyzer. An
// error is returned when a matching source names an analyzer that cannot be
// obtained, or when visiting the synonym sources fails.
func (im *IndexMappingImpl) MapSynonymDocument(doc *document.Document, collection string, input []string, synonyms []string) error {
	addSynonymField := func(name string, item analysis.SynonymSource) error {
		// sources belonging to other collections are skipped
		if item.Collection() != collection {
			return nil
		}
		analyzer := im.AnalyzerNamed(item.Analyzer())
		if analyzer == nil {
			return fmt.Errorf("unknown analyzer named: %s", item.Analyzer())
		}
		// the field carries the name of the synonym source
		doc.AddField(document.NewSynonymField(name, analyzer, input, synonyms))
		return nil
	}
	return im.SynonymSourceVisitor(addSynonymField)
}
// walkContext bundles the values handed to DocumentMapping.walkDocument when
// MapDocument walks a single document.
type walkContext struct {
// doc is the document being populated.
doc *document.Document
// im is the index mapping that created this context.
im *IndexMappingImpl
// dm is the document mapping selected for the document.
dm *DocumentMapping
// excludedFromAll starts out as ["_id"] and is passed to the constructor
// of the composite "_all" field.
// NOTE(review): presumably the names of fields to leave out of "_all" —
// confirm against walkDocument and the composite field constructor.
excludedFromAll []string
}
// newWalkContext builds the walk context for mapping doc with dm. The
// exclusion list for the "_all" field starts out containing only "_id".
func (im *IndexMappingImpl) newWalkContext(doc *document.Document, dm *DocumentMapping) *walkContext {
	wc := new(walkContext)
	wc.doc, wc.im, wc.dm = doc, im, dm
	wc.excludedFromAll = []string{"_id"}
	return wc
}
// AnalyzerNameForPath finds the best analyzer name for a field when only its
// path is known. The lookup proceeds from most to least specific:
//
//  1. an analyzer found for the path by any type mapping,
//  2. the analyzer of the first field mapped at the path in the default
//     mapping,
//  3. the default analyzer name of any enabled type mapping for the path,
//  4. the default analyzer name of the default mapping, when enabled,
//  5. the index mapping's DefaultAnalyzer.
//
// Type mappings are visited in map iteration order, so when several of them
// yield a name for the same path the one returned is not deterministic.
func (im *IndexMappingImpl) AnalyzerNameForPath(path string) string {
	// explicit field-level analyzers in the type mappings
	for _, dm := range im.TypeMapping {
		if name := dm.analyzerNameForPath(path); name != "" {
			return name
		}
	}

	// explicit field-level analyzer in the default mapping
	if pm, _ := im.DefaultMapping.documentMappingForPath(path); pm != nil && len(pm.Fields) > 0 {
		if explicit := pm.Fields[0].Analyzer; explicit != "" {
			return explicit
		}
	}

	// default analyzers declared along the path in the type mappings
	decoded := decodePath(path)
	for _, dm := range im.TypeMapping {
		if !dm.Enabled {
			continue
		}
		if name := dm.defaultAnalyzerName(decoded); name != "" {
			return name
		}
	}

	// default analyzer declared along the path in the default mapping
	if im.DefaultMapping.Enabled {
		if name := im.DefaultMapping.defaultAnalyzerName(decoded); name != "" {
			return name
		}
	}

	return im.DefaultAnalyzer
}
// AnalyzerNamed returns the analyzer registered under name in the mapping's
// registry cache. When the cache reports an error, the name is logged and
// nil is returned.
func (im *IndexMappingImpl) AnalyzerNamed(name string) analysis.Analyzer {
	found, err := im.cache.AnalyzerNamed(name)
	if err == nil {
		return found
	}
	logger.Printf("error using analyzer named: %s", name)
	return nil
}
// DateTimeParserNamed returns the date time parser registered under name in
// the mapping's registry cache. An empty name selects the mapping's
// DefaultDateTimeParser. When the cache reports an error, the name is logged
// and nil is returned.
func (im *IndexMappingImpl) DateTimeParserNamed(name string) analysis.DateTimeParser {
	if name == "" {
		name = im.DefaultDateTimeParser
	}
	found, err := im.cache.DateTimeParserNamed(name)
	if err == nil {
		return found
	}
	logger.Printf("error using datetime parser named: %s", name)
	return nil
}
// AnalyzeText runs text through the analyzer registered under analyzerName
// and returns the resulting token stream. When the analyzer cannot be
// obtained from the registry cache, the cache's error is returned with a nil
// stream.
func (im *IndexMappingImpl) AnalyzeText(analyzerName string, text []byte) (analysis.TokenStream, error) {
	analyzer, lookupErr := im.cache.AnalyzerNamed(analyzerName)
	if lookupErr != nil {
		return nil, lookupErr
	}
	tokens := analyzer.Analyze(text)
	return tokens, nil
}
// FieldAnalyzer returns the name of the analyzer used on a field. It
// delegates to AnalyzerNameForPath, treating field as the path.
func (im *IndexMappingImpl) FieldAnalyzer(field string) string {
	analyzerName := im.AnalyzerNameForPath(field)
	return analyzerName
}
// FieldMappingForPath returns a copy of the field mapping found at path.
// The type mappings are consulted first, then the default mapping; when none
// of them describes the path, a zero-value FieldMapping is returned.
func (im *IndexMappingImpl) FieldMappingForPath(path string) FieldMapping {
	// ranging over a nil map performs no iterations, so no nil check is needed
	for _, dm := range im.TypeMapping {
		if found := dm.fieldDescribedByPath(path); found != nil {
			return *found
		}
	}

	if found := im.DefaultMapping.fieldDescribedByPath(path); found != nil {
		return *found
	}

	return FieldMapping{}
}
// DefaultSearchField returns the mapping's DefaultField. It is a thin
// accessor that exists to satisfy an interface.
func (im *IndexMappingImpl) DefaultSearchField() string {
	field := im.DefaultField
	return field
}
// SynonymSourceNamed returns the synonym source registered under name in the
// mapping's registry cache. When the cache reports an error, the name is
// logged and nil is returned.
func (im *IndexMappingImpl) SynonymSourceNamed(name string) analysis.SynonymSource {
	found, err := im.cache.SynonymSourceNamed(name)
	if err == nil {
		return found
	}
	logger.Printf("error using synonym source named: %s", name)
	return nil
}
// SynonymSourceForPath finds the synonym source name for a field when only
// its path is known. The lookup proceeds from most to least specific:
//
//  1. a synonym source found for the path by any type mapping,
//  2. the synonym source of the first field mapped at the path in the
//     default mapping,
//  3. the default synonym source of any enabled type mapping for the path,
//  4. the default synonym source of the default mapping, when enabled,
//  5. the index mapping's DefaultSynonymSource.
//
// Type mappings are visited in map iteration order, so when several of them
// yield a name for the same path the one returned is not deterministic.
func (im *IndexMappingImpl) SynonymSourceForPath(path string) string {
	// explicit field-level synonym sources in the type mappings
	for _, dm := range im.TypeMapping {
		if source := dm.synonymSourceForPath(path); source != "" {
			return source
		}
	}

	// explicit field-level synonym source in the default mapping
	if pm, _ := im.DefaultMapping.documentMappingForPath(path); pm != nil && len(pm.Fields) > 0 {
		if explicit := pm.Fields[0].SynonymSource; explicit != "" {
			return explicit
		}
	}

	// default synonym sources declared along the path in the type mappings
	decoded := decodePath(path)
	for _, dm := range im.TypeMapping {
		if !dm.Enabled {
			continue
		}
		if source := dm.defaultSynonymSource(decoded); source != "" {
			return source
		}
	}

	// default synonym source declared along the path in the default mapping
	if im.DefaultMapping.Enabled {
		if source := im.DefaultMapping.defaultSynonymSource(decoded); source != "" {
			return source
		}
	}

	return im.DefaultSynonymSource
}
// SynonymCount returns the number of synonym sources recorded in the
// mapping's CustomAnalysis section.
func (im *IndexMappingImpl) SynonymCount() int {
	sources := im.CustomAnalysis.SynonymSources
	return len(sources)
}
// SynonymSourceVisitor hands visitor to the registry cache's synonym sources
// so it can iterate over them. An error reported by that visit is returned
// unchanged; otherwise the result is nil.
func (im *IndexMappingImpl) SynonymSourceVisitor(visitor analysis.SynonymSourceVisitor) error {
	if visitErr := im.cache.SynonymSources.VisitSynonymSources(visitor); visitErr != nil {
		return visitErr
	}
	return nil
}