Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
272
mapping/mapping_vectors.go
Normal file
272
mapping/mapping_vectors.go
Normal file
|
@ -0,0 +1,272 @@
|
|||
// Copyright (c) 2023 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build vectors
|
||||
// +build vectors
|
||||
|
||||
package mapping
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/document"
|
||||
"github.com/blevesearch/bleve/v2/util"
|
||||
index "github.com/blevesearch/bleve_index_api"
|
||||
faiss "github.com/blevesearch/go-faiss"
|
||||
)
|
||||
|
||||
// MinVectorDims and MaxVectorDims are the smallest and largest number of
// dimensions a vector field is allowed to declare.
//
// NOTE: these values must be set/updated at process init() only.
var (
	// MinVectorDims is the minimum allowed dimension of a vector field.
	MinVectorDims = 1
	// MaxVectorDims is the maximum allowed dimension of a vector field.
	MaxVectorDims = 4096
)
|
||||
|
||||
func NewVectorFieldMapping() *FieldMapping {
|
||||
return &FieldMapping{
|
||||
Type: "vector",
|
||||
Store: false,
|
||||
Index: true,
|
||||
IncludeInAll: false,
|
||||
DocValues: false,
|
||||
SkipFreqNorm: true,
|
||||
}
|
||||
}
|
||||
|
||||
func NewVectorBase64FieldMapping() *FieldMapping {
|
||||
return &FieldMapping{
|
||||
Type: "vector_base64",
|
||||
Store: false,
|
||||
Index: true,
|
||||
IncludeInAll: false,
|
||||
DocValues: false,
|
||||
SkipFreqNorm: true,
|
||||
}
|
||||
}
|
||||
|
||||
// validate and process a flat vector
|
||||
func processFlatVector(vecV reflect.Value, dims int) ([]float32, bool) {
|
||||
if vecV.Len() != dims {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
rv := make([]float32, dims)
|
||||
for i := 0; i < vecV.Len(); i++ {
|
||||
item := vecV.Index(i)
|
||||
if !item.CanInterface() {
|
||||
return nil, false
|
||||
}
|
||||
itemI := item.Interface()
|
||||
itemFloat, ok := util.ExtractNumericValFloat32(itemI)
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
rv[i] = itemFloat
|
||||
}
|
||||
|
||||
return rv, true
|
||||
}
|
||||
|
||||
// validate and process a vector
|
||||
// max supported depth of nesting is 2 ([][]float32)
|
||||
func processVector(vecI interface{}, dims int) ([]float32, bool) {
|
||||
vecV := reflect.ValueOf(vecI)
|
||||
if !vecV.IsValid() || vecV.Kind() != reflect.Slice || vecV.Len() == 0 {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// Let's examine the first element (head) of the vector.
|
||||
// If head is a slice, then vector is nested, otherwise flat.
|
||||
head := vecV.Index(0)
|
||||
if !head.CanInterface() {
|
||||
return nil, false
|
||||
}
|
||||
headI := head.Interface()
|
||||
headV := reflect.ValueOf(headI)
|
||||
if !headV.IsValid() {
|
||||
return nil, false
|
||||
}
|
||||
if headV.Kind() != reflect.Slice { // vector is flat
|
||||
return processFlatVector(vecV, dims)
|
||||
}
|
||||
|
||||
// # process nested vector
|
||||
|
||||
// pre-allocate memory for the flattened vector
|
||||
// so that we can use copy() later
|
||||
rv := make([]float32, dims*vecV.Len())
|
||||
|
||||
for i := 0; i < vecV.Len(); i++ {
|
||||
subVec := vecV.Index(i)
|
||||
if !subVec.CanInterface() {
|
||||
return nil, false
|
||||
}
|
||||
subVecI := subVec.Interface()
|
||||
subVecV := reflect.ValueOf(subVecI)
|
||||
if !subVecV.IsValid() {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
if subVecV.Kind() != reflect.Slice {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
flatVector, ok := processFlatVector(subVecV, dims)
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
copy(rv[i*dims:(i+1)*dims], flatVector)
|
||||
}
|
||||
|
||||
return rv, true
|
||||
}
|
||||
|
||||
func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
|
||||
pathString string, path []string, indexes []uint64, context *walkContext) bool {
|
||||
vector, ok := processVector(propertyMightBeVector, fm.Dims)
|
||||
// Don't add field to document if vector is invalid
|
||||
if !ok {
|
||||
return false
|
||||
}
|
||||
// normalize raw vector if similarity is cosine
|
||||
if fm.Similarity == index.CosineSimilarity {
|
||||
vector = NormalizeVector(vector)
|
||||
}
|
||||
|
||||
fieldName := getFieldName(pathString, path, fm)
|
||||
options := fm.Options()
|
||||
field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, vector,
|
||||
fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options)
|
||||
context.doc.AddField(field)
|
||||
|
||||
// "_all" composite field is not applicable for vector field
|
||||
context.excludedFromAll = append(context.excludedFromAll, fieldName)
|
||||
return true
|
||||
}
|
||||
|
||||
func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interface{},
|
||||
pathString string, path []string, indexes []uint64, context *walkContext) {
|
||||
encodedString, ok := propertyMightBeVectorBase64.(string)
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
decodedVector, err := document.DecodeVector(encodedString)
|
||||
if err != nil || len(decodedVector) != fm.Dims {
|
||||
return
|
||||
}
|
||||
// normalize raw vector if similarity is cosine
|
||||
if fm.Similarity == index.CosineSimilarity {
|
||||
decodedVector = NormalizeVector(decodedVector)
|
||||
}
|
||||
|
||||
fieldName := getFieldName(pathString, path, fm)
|
||||
options := fm.Options()
|
||||
field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector,
|
||||
fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options)
|
||||
context.doc.AddField(field)
|
||||
|
||||
// "_all" composite field is not applicable for vector_base64 field
|
||||
context.excludedFromAll = append(context.excludedFromAll, fieldName)
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------------
|
||||
// document validation functions
|
||||
|
||||
func validateFieldMapping(field *FieldMapping, parentName string,
|
||||
fieldAliasCtx map[string]*FieldMapping) error {
|
||||
switch field.Type {
|
||||
case "vector", "vector_base64":
|
||||
return validateVectorFieldAlias(field, parentName, fieldAliasCtx)
|
||||
default: // non-vector field
|
||||
return validateFieldType(field)
|
||||
}
|
||||
}
|
||||
|
||||
func validateVectorFieldAlias(field *FieldMapping, parentName string,
|
||||
fieldAliasCtx map[string]*FieldMapping) error {
|
||||
|
||||
if field.Name == "" {
|
||||
field.Name = parentName
|
||||
}
|
||||
|
||||
if field.Similarity == "" {
|
||||
field.Similarity = index.DefaultVectorSimilarityMetric
|
||||
}
|
||||
|
||||
if field.VectorIndexOptimizedFor == "" {
|
||||
field.VectorIndexOptimizedFor = index.DefaultIndexOptimization
|
||||
}
|
||||
if _, exists := index.SupportedVectorIndexOptimizations[field.VectorIndexOptimizedFor]; !exists {
|
||||
// if an unsupported config is provided, override to default
|
||||
field.VectorIndexOptimizedFor = index.DefaultIndexOptimization
|
||||
}
|
||||
|
||||
// following fields are not applicable for vector
|
||||
// thus, we set them to default values
|
||||
field.IncludeInAll = false
|
||||
field.IncludeTermVectors = false
|
||||
field.Store = false
|
||||
field.DocValues = false
|
||||
field.SkipFreqNorm = true
|
||||
|
||||
// # If alias is present, validate the field options as per the alias
|
||||
// note: reading from a nil map is safe
|
||||
if fieldAlias, ok := fieldAliasCtx[field.Name]; ok {
|
||||
if field.Dims != fieldAlias.Dims {
|
||||
return fmt.Errorf("field: '%s', invalid alias "+
|
||||
"(different dimensions %d and %d)", fieldAlias.Name, field.Dims,
|
||||
fieldAlias.Dims)
|
||||
}
|
||||
|
||||
if field.Similarity != fieldAlias.Similarity {
|
||||
return fmt.Errorf("field: '%s', invalid alias "+
|
||||
"(different similarity values %s and %s)", fieldAlias.Name,
|
||||
field.Similarity, fieldAlias.Similarity)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// # Validate field options
|
||||
|
||||
if field.Dims < MinVectorDims || field.Dims > MaxVectorDims {
|
||||
return fmt.Errorf("field: '%s', invalid vector dimension: %d,"+
|
||||
" value should be in range (%d, %d)", field.Name, field.Dims,
|
||||
MinVectorDims, MaxVectorDims)
|
||||
}
|
||||
|
||||
if _, ok := index.SupportedVectorSimilarityMetrics[field.Similarity]; !ok {
|
||||
return fmt.Errorf("field: '%s', invalid similarity "+
|
||||
"metric: '%s', valid metrics are: %+v", field.Name, field.Similarity,
|
||||
reflect.ValueOf(index.SupportedVectorSimilarityMetrics).MapKeys())
|
||||
}
|
||||
|
||||
if fieldAliasCtx != nil { // writing to a nil map is unsafe
|
||||
fieldAliasCtx[field.Name] = field
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func NormalizeVector(vec []float32) []float32 {
|
||||
// make a copy of the vector to avoid modifying the original
|
||||
// vector in-place
|
||||
vecCopy := make([]float32, len(vec))
|
||||
copy(vecCopy, vec)
|
||||
// normalize the vector copy using in-place normalization provided by faiss
|
||||
return faiss.NormalizeVector(vecCopy)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue