Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
95
analysis/token/hierarchy/hierarchy.go
Normal file
95
analysis/token/hierarchy/hierarchy.go
Normal file
|
@ -0,0 +1,95 @@
|
|||
package hierarchy
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the name under which this token filter is registered.
const Name = "hierarchy"
|
||||
|
||||
// HierarchyFilter is a token filter that emits one token per hierarchy
// level, each token being the delimiter-joined path from the root down to
// that level (e.g. "a", "a/b", "a/b/c" for input "a/b/c" with delimiter "/").
type HierarchyFilter struct {
	maxLevels  int    // maximum number of hierarchy levels (and tokens) to emit
	delimiter  []byte // byte sequence separating levels in the term
	splitInput bool   // if true, split each incoming term on delimiter first
}
|
||||
|
||||
func NewHierarchyFilter(delimiter []byte, maxLevels int, splitInput bool) *HierarchyFilter {
|
||||
return &HierarchyFilter{
|
||||
maxLevels: maxLevels,
|
||||
delimiter: delimiter,
|
||||
splitInput: splitInput,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *HierarchyFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0, s.maxLevels)
|
||||
|
||||
var soFar [][]byte
|
||||
for _, token := range input {
|
||||
if s.splitInput {
|
||||
parts := bytes.Split(token.Term, s.delimiter)
|
||||
for _, part := range parts {
|
||||
soFar, rv = s.buildToken(rv, soFar, part)
|
||||
if len(soFar) >= s.maxLevels {
|
||||
return rv
|
||||
}
|
||||
}
|
||||
} else {
|
||||
soFar, rv = s.buildToken(rv, soFar, token.Term)
|
||||
if len(soFar) >= s.maxLevels {
|
||||
return rv
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *HierarchyFilter) buildToken(tokenStream analysis.TokenStream, soFar [][]byte, part []byte) (
|
||||
[][]byte, analysis.TokenStream) {
|
||||
|
||||
soFar = append(soFar, part)
|
||||
term := bytes.Join(soFar, s.delimiter)
|
||||
|
||||
tokenStream = append(tokenStream, &analysis.Token{
|
||||
Type: analysis.Shingle,
|
||||
Term: term,
|
||||
Start: 0,
|
||||
End: len(term),
|
||||
Position: 1,
|
||||
})
|
||||
|
||||
return soFar, tokenStream
|
||||
}
|
||||
|
||||
func HierarchyFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
max := math.MaxInt64
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if ok {
|
||||
max = int(maxVal)
|
||||
}
|
||||
|
||||
splitInput := true
|
||||
splitInputVal, ok := config["split_input"].(bool)
|
||||
if ok {
|
||||
splitInput = splitInputVal
|
||||
}
|
||||
|
||||
delimiter, ok := config["delimiter"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify delimiter")
|
||||
}
|
||||
|
||||
return NewHierarchyFilter([]byte(delimiter), max, splitInput), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, HierarchyFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
229
analysis/token/hierarchy/hierarchy_test.go
Normal file
229
analysis/token/hierarchy/hierarchy_test.go
Normal file
|
@ -0,0 +1,229 @@
|
|||
package hierarchy
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestHierarchyFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
delimiter string
|
||||
max int
|
||||
splitInput bool
|
||||
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
name: "single token a/b/c, delimiter /",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: true,
|
||||
},
|
||||
{
|
||||
name: "multiple tokens already split a b c, delimiter /",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: true,
|
||||
},
|
||||
{
|
||||
name: "single token a/b/c, delimiter /, limit 2",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 2,
|
||||
splitInput: true,
|
||||
},
|
||||
{
|
||||
name: "multiple tokens already split a b c, delimiter /, limit 2",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 2,
|
||||
splitInput: true,
|
||||
},
|
||||
|
||||
{
|
||||
name: "single token a/b/c, delimiter /, no split",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: false,
|
||||
},
|
||||
{
|
||||
name: "multiple tokens already split a b c, delimiter /, no split",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
filter := NewHierarchyFilter([]byte(test.delimiter), test.max, test.splitInput)
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue