272 lines
7.6 KiB
Go
272 lines
7.6 KiB
Go
// Copyright (c) 2023 Couchbase, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
//go:build vectors
|
|
// +build vectors
|
|
|
|
package mapping
|
|
|
|
import (
|
|
"fmt"
|
|
"reflect"
|
|
|
|
"github.com/blevesearch/bleve/v2/document"
|
|
"github.com/blevesearch/bleve/v2/util"
|
|
index "github.com/blevesearch/bleve_index_api"
|
|
faiss "github.com/blevesearch/go-faiss"
|
|
)
|
|
|
|
// Min and Max allowed dimensions for a vector field;
|
|
// p.s must be set/updated at process init() _only_
|
|
var (
|
|
MinVectorDims = 1
|
|
MaxVectorDims = 4096
|
|
)
|
|
|
|
func NewVectorFieldMapping() *FieldMapping {
|
|
return &FieldMapping{
|
|
Type: "vector",
|
|
Store: false,
|
|
Index: true,
|
|
IncludeInAll: false,
|
|
DocValues: false,
|
|
SkipFreqNorm: true,
|
|
}
|
|
}
|
|
|
|
func NewVectorBase64FieldMapping() *FieldMapping {
|
|
return &FieldMapping{
|
|
Type: "vector_base64",
|
|
Store: false,
|
|
Index: true,
|
|
IncludeInAll: false,
|
|
DocValues: false,
|
|
SkipFreqNorm: true,
|
|
}
|
|
}
|
|
|
|
// validate and process a flat vector
|
|
func processFlatVector(vecV reflect.Value, dims int) ([]float32, bool) {
|
|
if vecV.Len() != dims {
|
|
return nil, false
|
|
}
|
|
|
|
rv := make([]float32, dims)
|
|
for i := 0; i < vecV.Len(); i++ {
|
|
item := vecV.Index(i)
|
|
if !item.CanInterface() {
|
|
return nil, false
|
|
}
|
|
itemI := item.Interface()
|
|
itemFloat, ok := util.ExtractNumericValFloat32(itemI)
|
|
if !ok {
|
|
return nil, false
|
|
}
|
|
rv[i] = itemFloat
|
|
}
|
|
|
|
return rv, true
|
|
}
|
|
|
|
// validate and process a vector
|
|
// max supported depth of nesting is 2 ([][]float32)
|
|
func processVector(vecI interface{}, dims int) ([]float32, bool) {
|
|
vecV := reflect.ValueOf(vecI)
|
|
if !vecV.IsValid() || vecV.Kind() != reflect.Slice || vecV.Len() == 0 {
|
|
return nil, false
|
|
}
|
|
|
|
// Let's examine the first element (head) of the vector.
|
|
// If head is a slice, then vector is nested, otherwise flat.
|
|
head := vecV.Index(0)
|
|
if !head.CanInterface() {
|
|
return nil, false
|
|
}
|
|
headI := head.Interface()
|
|
headV := reflect.ValueOf(headI)
|
|
if !headV.IsValid() {
|
|
return nil, false
|
|
}
|
|
if headV.Kind() != reflect.Slice { // vector is flat
|
|
return processFlatVector(vecV, dims)
|
|
}
|
|
|
|
// # process nested vector
|
|
|
|
// pre-allocate memory for the flattened vector
|
|
// so that we can use copy() later
|
|
rv := make([]float32, dims*vecV.Len())
|
|
|
|
for i := 0; i < vecV.Len(); i++ {
|
|
subVec := vecV.Index(i)
|
|
if !subVec.CanInterface() {
|
|
return nil, false
|
|
}
|
|
subVecI := subVec.Interface()
|
|
subVecV := reflect.ValueOf(subVecI)
|
|
if !subVecV.IsValid() {
|
|
return nil, false
|
|
}
|
|
|
|
if subVecV.Kind() != reflect.Slice {
|
|
return nil, false
|
|
}
|
|
|
|
flatVector, ok := processFlatVector(subVecV, dims)
|
|
if !ok {
|
|
return nil, false
|
|
}
|
|
|
|
copy(rv[i*dims:(i+1)*dims], flatVector)
|
|
}
|
|
|
|
return rv, true
|
|
}
|
|
|
|
func (fm *FieldMapping) processVector(propertyMightBeVector interface{},
|
|
pathString string, path []string, indexes []uint64, context *walkContext) bool {
|
|
vector, ok := processVector(propertyMightBeVector, fm.Dims)
|
|
// Don't add field to document if vector is invalid
|
|
if !ok {
|
|
return false
|
|
}
|
|
// normalize raw vector if similarity is cosine
|
|
if fm.Similarity == index.CosineSimilarity {
|
|
vector = NormalizeVector(vector)
|
|
}
|
|
|
|
fieldName := getFieldName(pathString, path, fm)
|
|
options := fm.Options()
|
|
field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, vector,
|
|
fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options)
|
|
context.doc.AddField(field)
|
|
|
|
// "_all" composite field is not applicable for vector field
|
|
context.excludedFromAll = append(context.excludedFromAll, fieldName)
|
|
return true
|
|
}
|
|
|
|
func (fm *FieldMapping) processVectorBase64(propertyMightBeVectorBase64 interface{},
|
|
pathString string, path []string, indexes []uint64, context *walkContext) {
|
|
encodedString, ok := propertyMightBeVectorBase64.(string)
|
|
if !ok {
|
|
return
|
|
}
|
|
|
|
decodedVector, err := document.DecodeVector(encodedString)
|
|
if err != nil || len(decodedVector) != fm.Dims {
|
|
return
|
|
}
|
|
// normalize raw vector if similarity is cosine
|
|
if fm.Similarity == index.CosineSimilarity {
|
|
decodedVector = NormalizeVector(decodedVector)
|
|
}
|
|
|
|
fieldName := getFieldName(pathString, path, fm)
|
|
options := fm.Options()
|
|
field := document.NewVectorFieldWithIndexingOptions(fieldName, indexes, decodedVector,
|
|
fm.Dims, fm.Similarity, fm.VectorIndexOptimizedFor, options)
|
|
context.doc.AddField(field)
|
|
|
|
// "_all" composite field is not applicable for vector_base64 field
|
|
context.excludedFromAll = append(context.excludedFromAll, fieldName)
|
|
}
|
|
|
|
// -----------------------------------------------------------------------------
|
|
// document validation functions
|
|
|
|
func validateFieldMapping(field *FieldMapping, parentName string,
|
|
fieldAliasCtx map[string]*FieldMapping) error {
|
|
switch field.Type {
|
|
case "vector", "vector_base64":
|
|
return validateVectorFieldAlias(field, parentName, fieldAliasCtx)
|
|
default: // non-vector field
|
|
return validateFieldType(field)
|
|
}
|
|
}
|
|
|
|
func validateVectorFieldAlias(field *FieldMapping, parentName string,
|
|
fieldAliasCtx map[string]*FieldMapping) error {
|
|
|
|
if field.Name == "" {
|
|
field.Name = parentName
|
|
}
|
|
|
|
if field.Similarity == "" {
|
|
field.Similarity = index.DefaultVectorSimilarityMetric
|
|
}
|
|
|
|
if field.VectorIndexOptimizedFor == "" {
|
|
field.VectorIndexOptimizedFor = index.DefaultIndexOptimization
|
|
}
|
|
if _, exists := index.SupportedVectorIndexOptimizations[field.VectorIndexOptimizedFor]; !exists {
|
|
// if an unsupported config is provided, override to default
|
|
field.VectorIndexOptimizedFor = index.DefaultIndexOptimization
|
|
}
|
|
|
|
// following fields are not applicable for vector
|
|
// thus, we set them to default values
|
|
field.IncludeInAll = false
|
|
field.IncludeTermVectors = false
|
|
field.Store = false
|
|
field.DocValues = false
|
|
field.SkipFreqNorm = true
|
|
|
|
// # If alias is present, validate the field options as per the alias
|
|
// note: reading from a nil map is safe
|
|
if fieldAlias, ok := fieldAliasCtx[field.Name]; ok {
|
|
if field.Dims != fieldAlias.Dims {
|
|
return fmt.Errorf("field: '%s', invalid alias "+
|
|
"(different dimensions %d and %d)", fieldAlias.Name, field.Dims,
|
|
fieldAlias.Dims)
|
|
}
|
|
|
|
if field.Similarity != fieldAlias.Similarity {
|
|
return fmt.Errorf("field: '%s', invalid alias "+
|
|
"(different similarity values %s and %s)", fieldAlias.Name,
|
|
field.Similarity, fieldAlias.Similarity)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// # Validate field options
|
|
|
|
if field.Dims < MinVectorDims || field.Dims > MaxVectorDims {
|
|
return fmt.Errorf("field: '%s', invalid vector dimension: %d,"+
|
|
" value should be in range (%d, %d)", field.Name, field.Dims,
|
|
MinVectorDims, MaxVectorDims)
|
|
}
|
|
|
|
if _, ok := index.SupportedVectorSimilarityMetrics[field.Similarity]; !ok {
|
|
return fmt.Errorf("field: '%s', invalid similarity "+
|
|
"metric: '%s', valid metrics are: %+v", field.Name, field.Similarity,
|
|
reflect.ValueOf(index.SupportedVectorSimilarityMetrics).MapKeys())
|
|
}
|
|
|
|
if fieldAliasCtx != nil { // writing to a nil map is unsafe
|
|
fieldAliasCtx[field.Name] = field
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func NormalizeVector(vec []float32) []float32 {
|
|
// make a copy of the vector to avoid modifying the original
|
|
// vector in-place
|
|
vecCopy := make([]float32, len(vec))
|
|
copy(vecCopy, vec)
|
|
// normalize the vector copy using in-place normalization provided by faiss
|
|
return faiss.NormalizeVector(vecCopy)
|
|
}
|