1
0
Fork 0

Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions

View file

@ -0,0 +1,95 @@
package hierarchy
import (
"bytes"
"fmt"
"math"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the name used to register this token filter in the registry.
const Name = "hierarchy"
// HierarchyFilter is a token filter that emits one token per hierarchy
// level: given "a/b/c" with delimiter "/" it produces "a", "a/b", "a/b/c".
type HierarchyFilter struct {
	maxLevels  int    // maximum number of hierarchy levels to emit
	delimiter  []byte // byte sequence separating levels (e.g. "/")
	splitInput bool   // if true, split each incoming token on delimiter first
}
// NewHierarchyFilter creates a HierarchyFilter that joins levels with
// delimiter, emits at most maxLevels tokens, and, when splitInput is
// true, splits each incoming token on delimiter before building levels.
func NewHierarchyFilter(delimiter []byte, maxLevels int, splitInput bool) *HierarchyFilter {
	f := &HierarchyFilter{}
	f.delimiter = delimiter
	f.maxLevels = maxLevels
	f.splitInput = splitInput
	return f
}
// Filter emits one token per accumulated hierarchy level: for input
// "a/b/c" (delimiter "/") it produces "a", "a/b", "a/b/c". When
// splitInput is true each incoming token is first split on the
// delimiter; otherwise each whole token becomes one level. Output stops
// as soon as maxLevels levels have been emitted.
func (s *HierarchyFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	// maxLevels defaults to a huge sentinel when "max" is absent from the
	// config (see HierarchyFilterConstructor). Using it directly as a slice
	// capacity makes make() panic ("makeslice: cap out of range"), so bound
	// the preallocation hint by the input length instead.
	capHint := s.maxLevels
	if capHint > len(input) {
		capHint = len(input)
	}
	rv := make(analysis.TokenStream, 0, capHint)

	var soFar [][]byte
	for _, token := range input {
		if s.splitInput {
			parts := bytes.Split(token.Term, s.delimiter)
			for _, part := range parts {
				soFar, rv = s.buildToken(rv, soFar, part)
				if len(soFar) >= s.maxLevels {
					return rv
				}
			}
		} else {
			soFar, rv = s.buildToken(rv, soFar, token.Term)
			if len(soFar) >= s.maxLevels {
				return rv
			}
		}
	}
	return rv
}
// buildToken appends segment to the accumulated prefix, joins the prefix
// with the filter's delimiter, and appends the joined term to out as a
// new Shingle token. It returns the updated prefix and token stream.
func (s *HierarchyFilter) buildToken(out analysis.TokenStream, prefix [][]byte, segment []byte) (
	[][]byte, analysis.TokenStream) {
	prefix = append(prefix, segment)
	joined := bytes.Join(prefix, s.delimiter)
	tok := &analysis.Token{
		Type:     analysis.Shingle,
		Term:     joined,
		Start:    0,
		End:      len(joined),
		Position: 1,
	}
	out = append(out, tok)
	return prefix, out
}
// HierarchyFilterConstructor builds a HierarchyFilter from config.
// Recognized keys:
//
//	delimiter   (string, required) separator between hierarchy levels
//	max         (number, optional) maximum levels to emit; default unlimited
//	split_input (bool, optional)   split incoming tokens on delimiter; default true
func HierarchyFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	// math.MaxInt rather than math.MaxInt64: the untyped constant MaxInt64
	// does not fit in int on 32-bit platforms and fails to compile there.
	max := math.MaxInt
	if maxVal, ok := config["max"].(float64); ok {
		max = int(maxVal)
	}

	splitInput := true
	if splitInputVal, ok := config["split_input"].(bool); ok {
		splitInput = splitInputVal
	}

	delimiter, ok := config["delimiter"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify delimiter")
	}

	return NewHierarchyFilter([]byte(delimiter), max, splitInput), nil
}
// init registers the hierarchy token filter under Name; registration
// failure is a programmer error, so it panics.
func init() {
	if err := registry.RegisterTokenFilter(Name, HierarchyFilterConstructor); err != nil {
		panic(err)
	}
}

View file

@ -0,0 +1,229 @@
package hierarchy
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
// TestHierarchyFilter exercises the hierarchy filter over a table of
// cases covering: pre-joined vs. pre-split input, the max-levels limit,
// and splitInput on/off. Expected Start/End offsets are byte offsets
// into the joined term (not the original input), matching buildToken.
func TestHierarchyFilter(t *testing.T) {
	tests := []struct {
		name       string
		delimiter  string
		max        int
		splitInput bool
		input      analysis.TokenStream
		output     analysis.TokenStream
	}{
		{
			// One joined token is split and expanded into all three levels.
			name: "single token a/b/c, delimiter /",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a/b/c"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("a"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      1,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      3,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b/c"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      5,
					Position: 1,
				},
			},
			delimiter:  "/",
			max:        10,
			splitInput: true,
		},
		{
			// Already-tokenized input accumulates across tokens into the
			// same three levels as the joined form above.
			name: "multiple tokens already split a b c, delimiter /",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("b"),
				},
				&analysis.Token{
					Term: []byte("c"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("a"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      1,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      3,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b/c"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      5,
					Position: 1,
				},
			},
			delimiter:  "/",
			max:        10,
			splitInput: true,
		},
		{
			// max=2 truncates the output after the second level.
			name: "single token a/b/c, delimiter /, limit 2",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a/b/c"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("a"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      1,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      3,
					Position: 1,
				},
			},
			delimiter:  "/",
			max:        2,
			splitInput: true,
		},
		{
			// Same max=2 truncation with pre-split input tokens.
			name: "multiple tokens already split a b c, delimiter /, limit 2",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("b"),
				},
				&analysis.Token{
					Term: []byte("c"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("a"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      1,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      3,
					Position: 1,
				},
			},
			delimiter:  "/",
			max:        2,
			splitInput: true,
		},
		{
			// splitInput=false: the joined token is one single level and
			// passes through unexpanded.
			name: "single token a/b/c, delimiter /, no split",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a/b/c"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("a/b/c"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      5,
					Position: 1,
				},
			},
			delimiter:  "/",
			max:        10,
			splitInput: false,
		},
		{
			// splitInput=false with pre-split input: each token is a level,
			// so the output matches the split case.
			name: "multiple tokens already split a b c, delimiter /, no split",
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("b"),
				},
				&analysis.Token{
					Term: []byte("c"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("a"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      1,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      3,
					Position: 1,
				},
				&analysis.Token{
					Term:     []byte("a/b/c"),
					Type:     analysis.Shingle,
					Start:    0,
					End:      5,
					Position: 1,
				},
			},
			delimiter:  "/",
			max:        10,
			splitInput: false,
		},
	}
	for _, test := range tests {
		test := test // capture range variable for the subtest closure (pre-Go 1.22 idiom)
		t.Run(test.name, func(t *testing.T) {
			filter := NewHierarchyFilter([]byte(test.delimiter), test.max, test.splitInput)
			actual := filter.Filter(test.input)
			if !reflect.DeepEqual(actual, test.output) {
				t.Errorf("expected %s, got %s", test.output, actual)
			}
		})
	}
}