
Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions


@@ -0,0 +1,57 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package apostrophe
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "apostrophe"
const RightSingleQuotationMark = "’"
const Apostrophe = "'"
const Apostrophes = Apostrophe + RightSingleQuotationMark
type ApostropheFilter struct{}
func NewApostropheFilter() *ApostropheFilter {
return &ApostropheFilter{}
}
func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
if firstApostrophe >= 0 {
// found an apostrophe
token.Term = token.Term[0:firstApostrophe]
}
}
return input
}
func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewApostropheFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ApostropheFilterConstructor)
if err != nil {
panic(err)
}
}
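
For orientation, a minimal usage sketch, assuming a standard Go module with bleve v2 available; the import path is inferred from the repository layout:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/apostrophe"
)

func main() {
	// The filter truncates each term at its first apostrophe (ASCII ' or U+2019).
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("Türkiye'de")},
	}
	out := apostrophe.NewApostropheFilter().Filter(stream)
	fmt.Println(string(out[0].Term)) // prints: Türkiye
}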


@@ -0,0 +1,99 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package apostrophe
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestApostropheFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Türkiye'de"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Türkiye"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("2003'te"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("2003"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Van"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Van"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Gölü'nü"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Gölü"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("gördüm"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("gördüm"),
},
},
},
}
for _, test := range tests {
apostropheFilter := NewApostropheFilter()
actual := apostropheFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}


@@ -0,0 +1,81 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "camelCase"
// CamelCaseFilter splits a given token into a set of tokens where each resulting token
// falls into one of the following classes:
// 1. Upper case followed by lower case letters.
// Terminated by a number, an upper case letter, or a non alpha-numeric symbol.
// 2. Upper case followed by upper case letters.
// Terminated by a number, an upper case followed by a lower case letter, or a non alpha-numeric symbol.
// 3. Lower case followed by lower case letters.
// Terminated by a number, an upper case letter, or a non alpha-numeric symbol.
// 4. Number followed by numbers.
// Terminated by a letter or a non alpha-numeric symbol.
// 5. Non alpha-numeric symbol followed by non alpha-numeric symbols.
// Terminated by a number or a letter.
//
// It does a one-time sequential pass over an input token, from left to right.
// The scan is greedy and generates the longest substring that fits into one of the classes.
//
// See the test file for examples of classes and their parsings.
type CamelCaseFilter struct{}
func NewCamelCaseFilter() *CamelCaseFilter {
return &CamelCaseFilter{}
}
func (f *CamelCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
nextPosition := 1
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
p := NewParser(runeCount, nextPosition, token.Start)
for i := 0; i < runeCount; i++ {
if i+1 >= runeCount {
p.Push(runes[i], nil)
} else {
p.Push(runes[i], &runes[i+1])
}
}
rv = append(rv, p.FlushTokens()...)
nextPosition = p.NextPosition()
}
return rv
}
func CamelCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewCamelCaseFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, CamelCaseFilterConstructor)
if err != nil {
panic(err)
}
}
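
A small hypothetical driver showing the greedy, class-based splitting described above (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
)

func main() {
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("GOLangIsGREAT123")},
	}
	// Upper-upper runs, capitalized words, and digit runs each become a token.
	for _, tok := range camelcase.NewCamelCaseFilter().Filter(stream) {
		fmt.Printf("%s ", tok.Term) // GO Lang Is GREAT 123
	}
}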


@@ -0,0 +1,95 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestCamelCaseFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: tokenStream(""),
output: tokenStream(""),
},
{
input: tokenStream("a"),
output: tokenStream("a"),
},
{
input: tokenStream("...aMACMac123macILoveGolang"),
output: tokenStream("...", "a", "MAC", "Mac", "123", "mac", "I", "Love", "Golang"),
},
{
input: tokenStream("Lang"),
output: tokenStream("Lang"),
},
{
input: tokenStream("GLang"),
output: tokenStream("G", "Lang"),
},
{
input: tokenStream("GOLang"),
output: tokenStream("GO", "Lang"),
},
{
input: tokenStream("GOOLang"),
output: tokenStream("GOO", "Lang"),
},
{
input: tokenStream("1234"),
output: tokenStream("1234"),
},
{
input: tokenStream("starbucks"),
output: tokenStream("starbucks"),
},
{
input: tokenStream("Starbucks TVSamsungIsGREAT000"),
output: tokenStream("Starbucks", " ", "TV", "Samsung", "Is", "GREAT", "000"),
},
}
for _, test := range tests {
ccFilter := NewCamelCaseFilter()
actual := ccFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s \n\n got %s", test.output, actual)
}
}
}
func tokenStream(termStrs ...string) analysis.TokenStream {
tokenStream := make([]*analysis.Token, len(termStrs))
index := 0
for i, termStr := range termStrs {
tokenStream[i] = &analysis.Token{
Term: []byte(termStr),
Position: i + 1,
Start: index,
End: index + len(termStr),
}
index += len(termStr)
}
return analysis.TokenStream(tokenStream)
}


@@ -0,0 +1,109 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"github.com/blevesearch/bleve/v2/analysis"
)
func (p *Parser) buildTokenFromTerm(buffer []rune) *analysis.Token {
term := analysis.BuildTermFromRunes(buffer)
token := &analysis.Token{
Term: term,
Position: p.position,
Start: p.index,
End: p.index + len(term),
}
p.position++
p.index += len(term)
return token
}
// Parser accepts a symbol and passes it to the current state (representing a class).
// The state can accept it (and accumulate it). Otherwise, the parser creates a new state that
// starts with the pushed symbol.
//
// Parser accumulates a new resulting token every time it switches state.
// Use FlushTokens() to get the results after the last symbol was pushed.
type Parser struct {
bufferLen int
buffer []rune
current State
tokens []*analysis.Token
position int
index int
}
func NewParser(length, position, index int) *Parser {
return &Parser{
bufferLen: length,
buffer: make([]rune, 0, length),
tokens: make([]*analysis.Token, 0, length),
position: position,
index: index,
}
}
func (p *Parser) Push(sym rune, peek *rune) {
if p.current == nil {
// the start of parsing
p.current = p.NewState(sym)
p.buffer = append(p.buffer, sym)
} else if p.current.Member(sym, peek) {
// same state, just accumulate
p.buffer = append(p.buffer, sym)
} else {
// the old state is no more, thus convert the buffer
p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
// let the new state begin
p.current = p.NewState(sym)
p.buffer = make([]rune, 0, p.bufferLen)
p.buffer = append(p.buffer, sym)
}
}
// Note: states must have distinct starting symbols.
func (p *Parser) NewState(sym rune) State {
var found State
found = &LowerCaseState{}
if found.StartSym(sym) {
return found
}
found = &UpperCaseState{}
if found.StartSym(sym) {
return found
}
found = &NumberCaseState{}
if found.StartSym(sym) {
return found
}
return &NonAlphaNumericCaseState{}
}
func (p *Parser) FlushTokens() []*analysis.Token {
p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
return p.tokens
}
func (p *Parser) NextPosition() int {
return p.position
}
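
To make the push/flush protocol concrete, a hypothetical driver that mirrors what CamelCaseFilter does internally:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
)

func main() {
	runes := []rune("GoLang")
	// position 1 and byte index 0, as for the first token of a stream
	p := camelcase.NewParser(len(runes), 1, 0)
	for i := 0; i < len(runes); i++ {
		if i+1 < len(runes) {
			p.Push(runes[i], &runes[i+1])
		} else {
			p.Push(runes[i], nil) // the last symbol has no peek
		}
	}
	for _, tok := range p.FlushTokens() {
		fmt.Println(string(tok.Term)) // Go, then Lang
	}
}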


@@ -0,0 +1,87 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"unicode"
)
// States codify the classes that the parser recognizes.
type State interface {
// is _sym_ the start character
StartSym(sym rune) bool
// is _sym_ a member of a class.
// peek, the next sym on the tape, can also be used to determine a class.
Member(sym rune, peek *rune) bool
}
type LowerCaseState struct{}
func (s *LowerCaseState) Member(sym rune, peek *rune) bool {
return unicode.IsLower(sym)
}
func (s *LowerCaseState) StartSym(sym rune) bool {
return s.Member(sym, nil)
}
type UpperCaseState struct {
startedCollecting bool // denotes that the start character has been read
collectingUpper bool // denotes if this is a class of all upper case letters
}
func (s *UpperCaseState) Member(sym rune, peek *rune) bool {
if !(unicode.IsLower(sym) || unicode.IsUpper(sym)) {
return false
}
if peek != nil && unicode.IsUpper(sym) && unicode.IsLower(*peek) {
return false
}
if !s.startedCollecting {
// now we have to determine if upper-case letters are collected.
s.startedCollecting = true
s.collectingUpper = unicode.IsUpper(sym)
return true
}
return s.collectingUpper == unicode.IsUpper(sym)
}
func (s *UpperCaseState) StartSym(sym rune) bool {
return unicode.IsUpper(sym)
}
type NumberCaseState struct{}
func (s *NumberCaseState) Member(sym rune, peek *rune) bool {
return unicode.IsNumber(sym)
}
func (s *NumberCaseState) StartSym(sym rune) bool {
return s.Member(sym, nil)
}
type NonAlphaNumericCaseState struct{}
func (s *NonAlphaNumericCaseState) Member(sym rune, peek *rune) bool {
return !unicode.IsLower(sym) && !unicode.IsUpper(sym) && !unicode.IsNumber(sym)
}
func (s *NonAlphaNumericCaseState) StartSym(sym rune) bool {
return s.Member(sym, nil)
}


@@ -0,0 +1,144 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package compound
import (
"bytes"
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "dict_compound"
const defaultMinWordSize = 5
const defaultMinSubWordSize = 2
const defaultMaxSubWordSize = 15
const defaultOnlyLongestMatch = false
type DictionaryCompoundFilter struct {
dict analysis.TokenMap
minWordSize int
minSubWordSize int
maxSubWordSize int
onlyLongestMatch bool
}
func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
return &DictionaryCompoundFilter{
dict: dict,
minWordSize: minWordSize,
minSubWordSize: minSubWordSize,
maxSubWordSize: maxSubWordSize,
onlyLongestMatch: onlyLongestMatch,
}
}
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
rv = append(rv, token)
tokenLen := utf8.RuneCount(token.Term)
if tokenLen >= f.minWordSize {
newtokens := f.decompose(token)
for _, newtoken := range newtokens {
rv = append(rv, newtoken)
}
}
}
return rv
}
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
runes := bytes.Runes(token.Term)
rv := make([]*analysis.Token, 0)
rlen := len(runes)
for i := 0; i <= (rlen - f.minSubWordSize); i++ {
var longestMatchToken *analysis.Token
for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
if i+j > rlen {
break
}
_, inDict := f.dict[string(runes[i:i+j])]
if inDict {
newtoken := analysis.Token{
Term: []byte(string(runes[i : i+j])),
Position: token.Position,
Start: token.Start + i,
End: token.Start + i + j,
Type: token.Type,
KeyWord: token.KeyWord,
}
if f.onlyLongestMatch {
if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
longestMatchToken = &newtoken
}
} else {
rv = append(rv, &newtoken)
}
}
}
if f.onlyLongestMatch && longestMatchToken != nil {
rv = append(rv, longestMatchToken)
}
}
return rv
}
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
minWordSize := defaultMinWordSize
minSubWordSize := defaultMinSubWordSize
maxSubWordSize := defaultMaxSubWordSize
onlyLongestMatch := defaultOnlyLongestMatch
minVal, ok := config["min_word_size"].(float64)
if ok {
minWordSize = int(minVal)
}
minSubVal, ok := config["min_subword_size"].(float64)
if ok {
minSubWordSize = int(minSubVal)
}
maxSubVal, ok := config["max_subword_size"].(float64)
if ok {
maxSubWordSize = int(maxSubVal)
}
onlyVal, ok := config["only_longest_match"].(bool)
if ok {
onlyLongestMatch = onlyVal
}
dictTokenMapName, ok := config["dict_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify dict_token_map")
}
dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building dict compound words filter: %v", err)
}
return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
if err != nil {
panic(err)
}
}
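
A minimal sketch constructing the filter directly rather than through the registry; the dictionary contents are illustrative, and the import path is inferred from the repository layout:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/compound"
)

func main() {
	dict := analysis.NewTokenMap()
	dict.AddToken("soft")
	dict.AddToken("ball")
	// defaults: min word 5 runes, subwords between 2 and 15 runes, all matches kept
	f := compound.NewDictionaryCompoundFilter(dict, 5, 2, 15, false)
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("softball"), Start: 0, End: 8, Position: 1},
	}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // softball, soft, ball
	}
}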


@@ -0,0 +1,187 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package compound
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
"github.com/blevesearch/bleve/v2/registry"
)
func TestDictionaryCompoundFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("i"),
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("like"),
Start: 2,
End: 6,
Position: 2,
},
&analysis.Token{
Term: []byte("to"),
Start: 7,
End: 9,
Position: 3,
},
&analysis.Token{
Term: []byte("play"),
Start: 10,
End: 14,
Position: 4,
},
&analysis.Token{
Term: []byte("softball"),
Start: 15,
End: 23,
Position: 5,
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("i"),
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("like"),
Start: 2,
End: 6,
Position: 2,
},
&analysis.Token{
Term: []byte("to"),
Start: 7,
End: 9,
Position: 3,
},
&analysis.Token{
Term: []byte("play"),
Start: 10,
End: 14,
Position: 4,
},
&analysis.Token{
Term: []byte("softball"),
Start: 15,
End: 23,
Position: 5,
},
&analysis.Token{
Term: []byte("soft"),
Start: 15,
End: 19,
Position: 5,
},
&analysis.Token{
Term: []byte("ball"),
Start: 19,
End: 23,
Position: 5,
},
}
cache := registry.NewCache()
dictListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"factor", "soft", "ball", "team"},
}
_, err := cache.DefineTokenMap("dict_test", dictListConfig)
if err != nil {
t.Fatal(err)
}
dictConfig := map[string]interface{}{
"type": "dict_compound",
"dict_token_map": "dict_test",
}
dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
if err != nil {
t.Fatal(err)
}
outputTokenStream := dictFilter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func TestDictionaryCompoundFilterLongestMatch(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("softestball"),
Start: 0,
End: 11,
Position: 1,
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("softestball"),
Start: 0,
End: 11,
Position: 1,
},
&analysis.Token{
Term: []byte("softest"),
Start: 0,
End: 7,
Position: 1,
},
&analysis.Token{
Term: []byte("ball"),
Start: 7,
End: 11,
Position: 1,
},
}
cache := registry.NewCache()
dictListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"soft", "softest", "ball"},
}
_, err := cache.DefineTokenMap("dict_test", dictListConfig)
if err != nil {
t.Fatal(err)
}
dictConfig := map[string]interface{}{
"type": "dict_compound",
"dict_token_map": "dict_test",
"only_longest_match": true,
}
dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
if err != nil {
t.Fatal(err)
}
outputTokenStream := dictFilter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}


@@ -0,0 +1,118 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package edgengram
import (
"bytes"
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "edge_ngram"
type Side bool
const BACK Side = true
const FRONT Side = false
type EdgeNgramFilter struct {
back Side
minLength int
maxLength int
}
func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
return &EdgeNgramFilter{
back: side,
minLength: minLength,
maxLength: maxLength,
}
}
func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
if s.back {
i := runeCount
// index of the starting rune for this token
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
// build an ngram of this size starting at i
if i-ngramSize >= 0 {
ngramTerm := analysis.BuildTermFromRunes(runes[i-ngramSize : i])
token := analysis.Token{
Position: token.Position,
Start: token.Start,
End: token.End,
Type: token.Type,
Term: ngramTerm,
}
rv = append(rv, &token)
}
}
} else {
i := 0
// index of the starting rune for this token
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
// build an ngram of this size starting at i
if i+ngramSize <= runeCount {
ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
token := analysis.Token{
Position: token.Position,
Start: token.Start,
End: token.End,
Type: token.Type,
Term: ngramTerm,
}
rv = append(rv, &token)
}
}
}
}
return rv
}
func EdgeNgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
side := FRONT
back, ok := config["back"].(bool)
if ok && back {
side = BACK
}
minVal, ok := config["min"].(float64)
if !ok {
return nil, fmt.Errorf("must specify min")
}
min := int(minVal)
maxVal, ok := config["max"].(float64)
if !ok {
return nil, fmt.Errorf("must specify max")
}
max := int(maxVal)
return NewEdgeNgramFilter(side, min, max), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, EdgeNgramFilterConstructor)
if err != nil {
panic(err)
}
}
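
A short hypothetical example of front edge n-grams (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/edgengram"
)

func main() {
	// n-grams of length 1 to 3, anchored at the front of each term
	f := edgengram.NewEdgeNgramFilter(edgengram.FRONT, 1, 3)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("abcde")}}
	for _, tok := range f.Filter(stream) {
		fmt.Printf("%s ", tok.Term) // a ab abc
	}
}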


@@ -0,0 +1,189 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package edgengram
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestEdgeNgramFilter(t *testing.T) {
tests := []struct {
side Side
min int
max int
input analysis.TokenStream
output analysis.TokenStream
}{
{
side: FRONT,
min: 1,
max: 1,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
},
},
{
side: BACK,
min: 1,
max: 1,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("e"),
},
},
},
{
side: FRONT,
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("abc"),
},
},
},
{
side: BACK,
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("e"),
},
&analysis.Token{
Term: []byte("de"),
},
&analysis.Token{
Term: []byte("cde"),
},
},
},
{
side: FRONT,
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
&analysis.Token{
Term: []byte("vwxyz"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("abc"),
},
&analysis.Token{
Term: []byte("v"),
},
&analysis.Token{
Term: []byte("vw"),
},
&analysis.Token{
Term: []byte("vwx"),
},
},
},
{
side: BACK,
min: 3,
max: 5,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Beryl"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ryl"),
},
&analysis.Token{
Term: []byte("eryl"),
},
&analysis.Token{
Term: []byte("Beryl"),
},
},
},
{
side: FRONT,
min: 3,
max: 5,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Beryl"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Ber"),
},
&analysis.Token{
Term: []byte("Bery"),
},
&analysis.Token{
Term: []byte("Beryl"),
},
},
},
}
for _, test := range tests {
edgeNgramFilter := NewEdgeNgramFilter(test.side, test.min, test.max)
actual := edgeNgramFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}


@@ -0,0 +1,77 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package elision
import (
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "elision"
const RightSingleQuotationMark = '’'
const Apostrophe = '\''
type ElisionFilter struct {
articles analysis.TokenMap
}
func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
return &ElisionFilter{
articles: articles,
}
}
func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := token.Term
for i := 0; i < len(term); {
r, size := utf8.DecodeRune(term[i:])
if r == Apostrophe || r == RightSingleQuotationMark {
// see if the prefix matches one of the articles
prefix := term[0:i]
_, articleMatch := s.articles[string(prefix)]
if articleMatch {
token.Term = term[i+size:]
break
}
}
i += size
}
}
return input
}
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMapName, ok := config["articles_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify articles_token_map")
}
articlesTokenMap, err := cache.TokenMapNamed(articlesTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return NewElisionFilter(articlesTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ElisionFilterConstructor)
if err != nil {
panic(err)
}
}
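
A minimal sketch, assuming a one-entry article map and the repository's import layout:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/elision"
)

func main() {
	articles := analysis.NewTokenMap()
	articles.AddToken("l") // French l', as in l'avion
	f := elision.NewElisionFilter(articles)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("l'avion")}}
	fmt.Println(string(f.Filter(stream)[0].Term)) // prints: avion
}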


@@ -0,0 +1,85 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package elision
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
"github.com/blevesearch/bleve/v2/registry"
)
func TestElisionFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ar" + string(Apostrophe) + "word"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("word"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("word"),
},
},
},
}
cache := registry.NewCache()
articleListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"ar"},
}
_, err := cache.DefineTokenMap("articles_test", articleListConfig)
if err != nil {
t.Fatal(err)
}
elisionConfig := map[string]interface{}{
"type": "elision",
"articles_token_map": "articles_test",
}
elisionFilter, err := cache.DefineTokenFilter("elision_test", elisionConfig)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}


@@ -0,0 +1,95 @@
package hierarchy
import (
"bytes"
"fmt"
"math"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "hierarchy"
type HierarchyFilter struct {
maxLevels int
delimiter []byte
splitInput bool
}
func NewHierarchyFilter(delimiter []byte, maxLevels int, splitInput bool) *HierarchyFilter {
return &HierarchyFilter{
maxLevels: maxLevels,
delimiter: delimiter,
splitInput: splitInput,
}
}
func (s *HierarchyFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, s.maxLevels)
var soFar [][]byte
for _, token := range input {
if s.splitInput {
parts := bytes.Split(token.Term, s.delimiter)
for _, part := range parts {
soFar, rv = s.buildToken(rv, soFar, part)
if len(soFar) >= s.maxLevels {
return rv
}
}
} else {
soFar, rv = s.buildToken(rv, soFar, token.Term)
if len(soFar) >= s.maxLevels {
return rv
}
}
}
return rv
}
func (s *HierarchyFilter) buildToken(tokenStream analysis.TokenStream, soFar [][]byte, part []byte) (
[][]byte, analysis.TokenStream) {
soFar = append(soFar, part)
term := bytes.Join(soFar, s.delimiter)
tokenStream = append(tokenStream, &analysis.Token{
Type: analysis.Shingle,
Term: term,
Start: 0,
End: len(term),
Position: 1,
})
return soFar, tokenStream
}
func HierarchyFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
max := math.MaxInt64
maxVal, ok := config["max"].(float64)
if ok {
max = int(maxVal)
}
splitInput := true
splitInputVal, ok := config["split_input"].(bool)
if ok {
splitInput = splitInputVal
}
delimiter, ok := config["delimiter"].(string)
if !ok {
return nil, fmt.Errorf("must specify delimiter")
}
return NewHierarchyFilter([]byte(delimiter), max, splitInput), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, HierarchyFilterConstructor)
if err != nil {
panic(err)
}
}
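
A short hypothetical example of the split-input mode (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/hierarchy"
)

func main() {
	// delimiter "/", at most 10 levels, split each incoming term on the delimiter
	f := hierarchy.NewHierarchyFilter([]byte("/"), 10, true)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("a/b/c")}}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // a, then a/b, then a/b/c
	}
}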


@@ -0,0 +1,229 @@
package hierarchy
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestHierarchyFilter(t *testing.T) {
tests := []struct {
name string
delimiter string
max int
splitInput bool
input analysis.TokenStream
output analysis.TokenStream
}{
{
name: "single token a/b/c, delimiter /",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: true,
},
{
name: "multiple tokens already split a b c, delimiter /",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: true,
},
{
name: "single token a/b/c, delimiter /, limit 2",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
},
delimiter: "/",
max: 2,
splitInput: true,
},
{
name: "multiple tokens already split a b c, delimiter /, limit 2",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
},
delimiter: "/",
max: 2,
splitInput: true,
},
{
name: "single token a/b/c, delimiter /, no split",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: false,
},
{
name: "multiple tokens already split a b c, delimiter /, no split",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: false,
},
}
for _, test := range tests {
test := test
t.Run(test.name, func(t *testing.T) {
filter := NewHierarchyFilter([]byte(test.delimiter), test.max, test.splitInput)
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
})
}
}


@@ -0,0 +1,63 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package keyword
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "keyword_marker"
type KeyWordMarkerFilter struct {
keyWords analysis.TokenMap
}
func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter {
return &KeyWordMarkerFilter{
keyWords: keyWords,
}
}
func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
_, isKeyWord := f.keyWords[string(token.Term)]
if isKeyWord {
token.KeyWord = true
}
}
return input
}
func KeyWordMarkerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
keywordsTokenMapName, ok := config["keywords_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify keywords_token_map")
}
keywordsTokenMap, err := cache.TokenMapNamed(keywordsTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building keyword marker filter: %v", err)
}
return NewKeyWordMarkerFilter(keywordsTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, KeyWordMarkerFilterConstructor)
if err != nil {
panic(err)
}
}
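
A minimal sketch marking one keyword (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/keyword"
)

func main() {
	keywords := analysis.NewTokenMap()
	keywords.AddToken("walking")
	f := keyword.NewKeyWordMarkerFilter(keywords)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("walking")}}
	// downstream filters (e.g. stemmers) skip tokens with KeyWord set
	fmt.Println(f.Filter(stream)[0].KeyWord) // true
}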


@@ -0,0 +1,73 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package keyword
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestKeyWordMarkerFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
KeyWord: true,
},
}
keyWordsMap := analysis.NewTokenMap()
keyWordsMap.AddToken("walk")
keyWordsMap.AddToken("park")
filter := NewKeyWordMarkerFilter(keyWordsMap)
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}


@@ -0,0 +1,80 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package length
import (
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "length"
type LengthFilter struct {
min int
max int
}
func NewLengthFilter(min, max int) *LengthFilter {
return &LengthFilter{
min: min,
max: max,
}
}
func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
wordLen := utf8.RuneCount(token.Term)
if f.min > 0 && f.min > wordLen {
continue
}
if f.max > 0 && f.max < wordLen {
continue
}
rv = append(rv, token)
}
return rv
}
func LengthFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
min := 0
max := 0
minVal, ok := config["min"].(float64)
if ok {
min = int(minVal)
}
maxVal, ok := config["max"].(float64)
if ok {
max = int(maxVal)
}
if min == max && max == 0 {
return nil, fmt.Errorf("either min or max must be non-zero")
}
return NewLengthFilter(min, max), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, LengthFilterConstructor)
if err != nil {
panic(err)
}
}
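
A short hypothetical example keeping only mid-length terms (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/length"
)

func main() {
	f := length.NewLengthFilter(3, 5) // keep terms of 3 to 5 runes
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("a")},
		&analysis.Token{Term: []byte("two")},
		&analysis.Token{Term: []byte("eleven")},
	}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // only: two
	}
}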


@@ -0,0 +1,99 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package length
import (
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestLengthFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("1"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
}
lengthFilter := NewLengthFilter(3, 4)
outputTokenStream := lengthFilter.Filter(inputTokenStream)
if len(outputTokenStream) != 1 {
t.Fatalf("expected 1 output token")
}
if string(outputTokenStream[0].Term) != "two" {
t.Errorf("expected term `two`, got `%s`", outputTokenStream[0].Term)
}
}
func TestLengthFilterNoMax(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("1"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
}
lengthFilter := NewLengthFilter(3, -1)
outputTokenStream := lengthFilter.Filter(inputTokenStream)
if len(outputTokenStream) != 2 {
t.Fatalf("expected 2 output tokens")
}
if string(outputTokenStream[0].Term) != "two" {
t.Errorf("expected term `two`, got `%s`", outputTokenStream[0].Term)
}
if string(outputTokenStream[1].Term) != "three" {
t.Errorf("expected term `three`, got `%s`", outputTokenStream[1].Term)
}
}
func TestLengthFilterNoMin(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("1"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
}
lengthFilter := NewLengthFilter(-1, 4)
outputTokenStream := lengthFilter.Filter(inputTokenStream)
if len(outputTokenStream) != 2 {
t.Fatalf("expected 2 output tokens")
}
if string(outputTokenStream[0].Term) != "1" {
t.Errorf("expected term `1`, got `%s`", outputTokenStream[0].Term)
}
if string(outputTokenStream[1].Term) != "two" {
t.Errorf("expected term `two`, got `%s`", outputTokenStream[1].Term)
}
}


@@ -0,0 +1,108 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package lowercase implements a TokenFilter which converts
// tokens to lower case according to unicode rules.
package lowercase
import (
"bytes"
"unicode"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the name used to register LowerCaseFilter in the bleve registry
const Name = "to_lower"
type LowerCaseFilter struct {
}
func NewLowerCaseFilter() *LowerCaseFilter {
return &LowerCaseFilter{}
}
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = toLowerDeferredCopy(token.Term)
}
return input
}
func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewLowerCaseFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
if err != nil {
panic(err)
}
}
// toLowerDeferredCopy works exactly like
// bytes.ToLower(), except that it reuses (overwrites)
// the original byte slice when possible.
// NOTE: because it is possible that the lower-case form
// of a rune has a different utf-8 encoded length, in
// those cases a new byte slice is allocated.
func toLowerDeferredCopy(s []byte) []byte {
j := 0
for i := 0; i < len(s); {
wid := 1
r := rune(s[i])
if r >= utf8.RuneSelf {
r, wid = utf8.DecodeRune(s[i:])
}
l := unicode.ToLower(r)
// If the rune is already lowercased, just move to the
// next rune.
if l == r {
i += wid
j += wid
continue
}
// Handle the Unicode edge case where a Greek Σ at the
// end of a word must lowercase to the final form ς
// rather than σ.
if l == 'σ' && i+2 == len(s) {
l = 'ς'
}
lwid := utf8.RuneLen(l)
if lwid > wid {
// utf-8 encoded replacement is wider
// for now, punt and defer
// to bytes.ToLower() for the remainder
// only known to happen with chars
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
rest := bytes.ToLower(s[i:])
rv := make([]byte, j+len(rest))
copy(rv[:j], s[:j])
copy(rv[j:], rest)
return rv
} else {
utf8.EncodeRune(s[j:], l)
}
i += wid
j += lwid
}
return s[:j]
}
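
A small sketch exercising both special cases discussed above, the final Greek sigma and the runes whose lower-case forms are wider in UTF-8 (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
)

func main() {
	f := lowercase.NewLowerCaseFilter()
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("ὈΔΥΣΣ")}, // word-final Σ becomes ς
		&analysis.Token{Term: []byte("ȺȾ")},    // lower forms ⱥⱦ force a fresh allocation
	}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // ὀδυσς, then ⱥⱦ
	}
}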


@@ -0,0 +1,166 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package lowercase
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestLowerCaseFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("ONE"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("ThReE"),
},
&analysis.Token{
Term: []byte("steven's"),
},
// these characters are chosen in particular
// because the utf-8 encoding of the lower-case
// version has a different length
// Rune İ(304) width 2 - Lower i(105) width 1
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
&analysis.Token{
Term: []byte("ὈΔΥΣΣ"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("one"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
&analysis.Token{
Term: []byte("steven's"),
},
&analysis.Token{
Term: []byte("iⱥⱦcat"),
},
&analysis.Token{
Term: []byte("ⱥⱦcat"),
},
&analysis.Token{
Term: []byte("ὀδυσς"),
},
}
filter := NewLowerCaseFilter()
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
t.Errorf("expected %s got %s", expectedTokenStream[0].Term, outputTokenStream[0].Term)
}
}
func BenchmarkLowerCaseFilter(b *testing.B) {
input := analysis.TokenStream{
&analysis.Token{
Term: []byte("A"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("expanding"),
},
&analysis.Token{
Term: []byte("vapor"),
},
&analysis.Token{
Term: []byte("explosion"),
},
&analysis.Token{
Term: []byte("caused"),
},
&analysis.Token{
Term: []byte("by"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("rupture"),
},
&analysis.Token{
Term: []byte("of"),
},
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("vessel"),
},
&analysis.Token{
Term: []byte("containing"),
},
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("pressurized"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("above"),
},
&analysis.Token{
Term: []byte("its"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("point"),
},
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
}
filter := NewLowerCaseFilter()
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(input)
}
}


@@ -0,0 +1,116 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ngram
import (
"bytes"
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "ngram"
type NgramFilter struct {
minLength int
maxLength int
}
func NewNgramFilter(minLength, maxLength int) *NgramFilter {
return &NgramFilter{
minLength: minLength,
maxLength: maxLength,
}
}
func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
for i := 0; i < runeCount; i++ {
// index of the starting rune for this token
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
// build an ngram of this size starting at i
if i+ngramSize <= runeCount {
ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
token := analysis.Token{
Position: token.Position,
Start: token.Start,
End: token.End,
Type: token.Type,
Term: ngramTerm,
}
rv = append(rv, &token)
}
}
}
}
return rv
}
func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
minVal, ok := config["min"]
if !ok {
return nil, fmt.Errorf("must specify min")
}
min, err := convertToInt(minVal)
if err != nil {
return nil, err
}
maxVal, ok := config["max"]
if !ok {
return nil, fmt.Errorf("must specify max")
}
max, err := convertToInt(maxVal)
if err != nil {
return nil, err
}
return NewNgramFilter(min, max), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, NgramFilterConstructor)
if err != nil {
panic(err)
}
}
// Expects either an int or a float64 value
func convertToInt(val interface{}) (int, error) {
var intVal int
var floatVal float64
var ok bool
intVal, ok = val.(int)
if ok {
return intVal, nil
}
floatVal, ok = val.(float64)
if ok {
return int(floatVal), nil
}
return 0, fmt.Errorf("failed to convert to int value")
}
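
A short hypothetical example; note that, per convertToInt above, the constructor accepts min/max as either int or float64 (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/ngram"
)

func main() {
	// mixed numeric types, as might arrive from JSON-decoded config
	f, err := ngram.NgramFilterConstructor(map[string]interface{}{
		"min": 2,
		"max": float64(2),
	}, nil)
	if err != nil {
		panic(err)
	}
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("abcd")}}
	for _, tok := range f.Filter(stream) {
		fmt.Printf("%s ", tok.Term) // ab bc cd
	}
}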


@@ -0,0 +1,192 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ngram
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestNgramFilter(t *testing.T) {
tests := []struct {
min int
max int
input analysis.TokenStream
output analysis.TokenStream
}{
{
min: 1,
max: 1,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
&analysis.Token{
Term: []byte("d"),
},
&analysis.Token{
Term: []byte("e"),
},
},
},
{
min: 2,
max: 2,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("bc"),
},
&analysis.Token{
Term: []byte("cd"),
},
&analysis.Token{
Term: []byte("de"),
},
},
},
{
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("abc"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("bc"),
},
&analysis.Token{
Term: []byte("bcd"),
},
&analysis.Token{
Term: []byte("c"),
},
&analysis.Token{
Term: []byte("cd"),
},
&analysis.Token{
Term: []byte("cde"),
},
&analysis.Token{
Term: []byte("d"),
},
&analysis.Token{
Term: []byte("de"),
},
&analysis.Token{
Term: []byte("e"),
},
},
},
}
for _, test := range tests {
ngramFilter := NewNgramFilter(test.min, test.max)
actual := ngramFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}
func TestConversionInt(t *testing.T) {
config := map[string]interface{}{
"type": Name,
"min": 3,
"max": 8,
}
f, err := NgramFilterConstructor(config, nil)
if err != nil {
t.Errorf("Failed to construct the ngram filter: %v", err)
}
ngram := f.(*NgramFilter)
if ngram.minLength != 3 || ngram.maxLength != 8 {
t.Errorf("Failed to construct the bounds. Got %v and %v.", ngram.minLength, ngram.maxLength)
}
}
func TestConversionFloat(t *testing.T) {
config := map[string]interface{}{
"type": Name,
"min": float64(3),
"max": float64(8),
}
f, err := NgramFilterConstructor(config, nil)
if err != nil {
t.Errorf("Failed to construct the ngram filter: %v", err)
}
ngram := f.(*NgramFilter)
if ngram.minLength != 3 || ngram.maxLength != 8 {
t.Errorf("Failed to construct the bounds. Got %v and %v.", ngram.minLength, ngram.maxLength)
}
}
func TestBadConversion(t *testing.T) {
config := map[string]interface{}{
"type": Name,
"min": "3",
}
_, err := NgramFilterConstructor(config, nil)
if err == nil {
t.Errorf("Expected conversion error.")
}
if err.Error() != "failed to convert to int value" {
t.Errorf("Wrong error recevied. Got %v.", err)
}
}


@@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package porter
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/go-porterstemmer"
)
const Name = "stemmer_porter"
type PorterStemmer struct {
}
func NewPorterStemmer() *PorterStemmer {
return &PorterStemmer{}
}
func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// if it is not a protected keyword, stem it
if !token.KeyWord {
termRunes := bytes.Runes(token.Term)
stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes)
token.Term = analysis.BuildTermFromRunes(stemmedRunes)
}
}
return input
}
func PorterStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPorterStemmer(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, PorterStemmerConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,115 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package porter
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestPorterStemmer(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
// a term which does stem, but does not change length
&analysis.Token{
Term: []byte("marty"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
&analysis.Token{
Term: []byte("marti"),
},
}
filter := NewPorterStemmer()
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func BenchmarkPorterStemmer(b *testing.B) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
}
filter := NewPorterStemmer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(inputTokenStream)
}
}

@@ -0,0 +1,78 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package reverse
import (
"unicode"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the name used to register ReverseFilter in the bleve registry
const Name = "reverse"
type ReverseFilter struct {
}
func NewReverseFilter() *ReverseFilter {
return &ReverseFilter{}
}
func (f *ReverseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = reverse(token.Term)
}
return input
}
func ReverseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewReverseFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ReverseFilterConstructor)
if err != nil {
panic(err)
}
}
// reverse returns a new byte slice holding the runes of s in reverse
// order; combining marks remain attached to the runes they modify.
func reverse(s []byte) []byte {
cursorIn := 0
inputRunes := []rune(string(s))
cursorOut := len(s)
output := make([]byte, len(s))
for i := 0; i < len(inputRunes); {
wid := utf8.RuneLen(inputRunes[i])
i++
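// absorb any combining marks (Mn, Me, Mc) that follow the base
// rune so they stay attached to it in the reversed output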
for i < len(inputRunes) {
r := inputRunes[i]
if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Me, r) || unicode.Is(unicode.Mc, r) {
wid += utf8.RuneLen(r)
i++
} else {
break
}
}
copy(output[cursorOut-wid:cursorOut], s[cursorIn:cursorIn+wid])
cursorIn += wid
cursorOut -= wid
}
return output
}

@@ -0,0 +1,184 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package reverse
import (
"bytes"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestReverseFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{},
&analysis.Token{
Term: []byte("one"),
},
&analysis.Token{
Term: []byte("TWo"),
},
&analysis.Token{
Term: []byte("thRee"),
},
&analysis.Token{
Term: []byte("four's"),
},
&analysis.Token{
Term: []byte("what's this in reverse"),
},
&analysis.Token{
Term: []byte("œ∑´®†"),
},
&analysis.Token{
Term: []byte("İȺȾCAT÷≥≤µ123"),
},
&analysis.Token{
Term: []byte("!@#$%^&*()"),
},
&analysis.Token{
Term: []byte("cafés"),
},
&analysis.Token{
Term: []byte("¿Dónde estás?"),
},
&analysis.Token{
Term: []byte("Me gustaría una cerveza."),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{},
&analysis.Token{
Term: []byte("eno"),
},
&analysis.Token{
Term: []byte("oWT"),
},
&analysis.Token{
Term: []byte("eeRht"),
},
&analysis.Token{
Term: []byte("s'ruof"),
},
&analysis.Token{
Term: []byte("esrever ni siht s'tahw"),
},
&analysis.Token{
Term: []byte("†®´∑œ"),
},
&analysis.Token{
Term: []byte("321µ≤≥÷TACȾȺİ"),
},
&analysis.Token{
Term: []byte(")(*&^%$#@!"),
},
&analysis.Token{
Term: []byte("séfac"),
},
&analysis.Token{
Term: []byte("?sátse ednóD¿"),
},
&analysis.Token{
Term: []byte(".azevrec anu aíratsug eM"),
},
}
filter := NewReverseFilter()
outputTokenStream := filter.Filter(inputTokenStream)
for i := 0; i < len(expectedTokenStream); i++ {
if !bytes.Equal(outputTokenStream[i].Term, expectedTokenStream[i].Term) {
t.Errorf("[%d] expected %s got %s",
i+1, expectedTokenStream[i].Term, outputTokenStream[i].Term)
}
}
}
func BenchmarkReverseFilter(b *testing.B) {
input := analysis.TokenStream{
&analysis.Token{
Term: []byte("A"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("expanding"),
},
&analysis.Token{
Term: []byte("vapor"),
},
&analysis.Token{
Term: []byte("explosion"),
},
&analysis.Token{
Term: []byte("caused"),
},
&analysis.Token{
Term: []byte("by"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("rupture"),
},
&analysis.Token{
Term: []byte("of"),
},
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("vessel"),
},
&analysis.Token{
Term: []byte("containing"),
},
&analysis.Token{
Term: []byte("pressurized"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("above"),
},
&analysis.Token{
Term: []byte("its"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("point"),
},
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("Me gustaría una cerveza."),
},
}
filter := NewReverseFilter()
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(input)
}
}

@@ -0,0 +1,172 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package shingle
import (
"container/ring"
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "shingle"
type ShingleFilter struct {
min int
max int
outputOriginal bool
tokenSeparator string
fill string
}
func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
return &ShingleFilter{
min: min,
max: max,
outputOriginal: outputOriginal,
tokenSeparator: sep,
fill: fill,
}
}
func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
ring := ring.New(s.max)
itemsInRing := 0
currentPosition := 0
for _, token := range input {
if s.outputOriginal {
rv = append(rv, token)
}
// if there are gaps, insert filler tokens
offset := token.Position - currentPosition
for offset > 1 {
fillerToken := analysis.Token{
Position: 0,
Start: -1,
End: -1,
Type: analysis.AlphaNumeric,
Term: []byte(s.fill),
}
ring.Value = &fillerToken
if itemsInRing < s.max {
itemsInRing++
}
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
ring = ring.Next()
offset--
}
currentPosition = token.Position
ring.Value = token
if itemsInRing < s.max {
itemsInRing++
}
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
ring = ring.Next()
}
return rv
}
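// shingleCurrentRingState emits one shingle for each size from min to
// max that the ring currently holds enough tokens for, each shingle
// ending at the most recently added token.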
func (s *ShingleFilter) shingleCurrentRingState(ring *ring.Ring, itemsInRing int) analysis.TokenStream {
rv := make(analysis.TokenStream, 0)
for shingleN := s.min; shingleN <= s.max; shingleN++ {
// if there are enough items in the ring
// to produce a shingle of this size
if itemsInRing >= shingleN {
thisShingleRing := ring.Move(-(shingleN - 1))
shingledBytes := make([]byte, 0)
pos := 0
start := -1
end := 0
for i := 0; i < shingleN; i++ {
if i != 0 {
shingledBytes = append(shingledBytes, []byte(s.tokenSeparator)...)
}
curr := thisShingleRing.Value.(*analysis.Token)
if pos == 0 && curr.Position != 0 {
pos = curr.Position
}
if start == -1 && curr.Start != -1 {
start = curr.Start
}
if curr.End != -1 {
end = curr.End
}
shingledBytes = append(shingledBytes, curr.Term...)
thisShingleRing = thisShingleRing.Next()
}
token := analysis.Token{
Type: analysis.Shingle,
Term: shingledBytes,
}
if pos != 0 {
token.Position = pos
}
if start != -1 {
token.Start = start
}
if end != -1 {
token.End = end
}
rv = append(rv, &token)
}
}
return rv
}
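// A minimal sketch of a config accepted by ShingleFilterConstructor
// (key names taken from the checks below; "min" and "max" are required,
// the others fall back to the defaults shown):
//
// map[string]interface{}{
// "type": Name,
// "min": 2.0,
// "max": 3.0,
// "output_original": false,
// "separator": " ",
// "filler": "_",
// }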
func ShingleFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
minVal, ok := config["min"].(float64)
if !ok {
return nil, fmt.Errorf("must specify min")
}
min := int(minVal)
maxVal, ok := config["max"].(float64)
if !ok {
return nil, fmt.Errorf("must specify max")
}
max := int(maxVal)
outputOriginal := false
outVal, ok := config["output_original"].(bool)
if ok {
outputOriginal = outVal
}
sep := " "
sepVal, ok := config["separator"].(string)
if ok {
sep = sepVal
}
fill := "_"
fillVal, ok := config["filler"].(string)
if ok {
fill = fillVal
}
return NewShingleFilter(min, max, outputOriginal, sep, fill), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ShingleFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,416 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package shingle
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestShingleFilter(t *testing.T) {
tests := []struct {
min int
max int
outputOriginal bool
separator string
filler string
input analysis.TokenStream
output analysis.TokenStream
}{
{
min: 2,
max: 2,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
},
},
{
min: 3,
max: 3,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown fox"),
Type: analysis.Shingle,
},
},
},
{
min: 2,
max: 3,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("the quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown fox"),
Type: analysis.Shingle,
},
},
},
{
min: 3,
max: 3,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ugly"),
Position: 1,
},
&analysis.Token{
Term: []byte("quick"),
Position: 3,
},
&analysis.Token{
Term: []byte("brown"),
Position: 4,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ugly _ quick"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("_ quick brown"),
Type: analysis.Shingle,
Position: 3,
},
},
},
{
min: 1,
max: 5,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Position: 1,
},
&analysis.Token{
Term: []byte("text"),
Position: 2,
},
// token 3 removed by stop filter
&analysis.Token{
Term: []byte("see"),
Position: 4,
},
&analysis.Token{
Term: []byte("shingles"),
Position: 5,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("text"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("_"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("text _"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text _"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("see"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("_ see"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("text _ see"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text _ see"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("shingles"),
Type: analysis.Shingle,
Position: 5,
},
&analysis.Token{
Term: []byte("see shingles"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("_ see shingles"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("text _ see shingles"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text _ see shingles"),
Type: analysis.Shingle,
Position: 1,
},
},
},
{
min: 2,
max: 2,
outputOriginal: true,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("fox"),
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
},
},
}
for _, test := range tests {
shingleFilter := NewShingleFilter(test.min, test.max, test.outputOriginal, test.separator, test.filler)
actual := shingleFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}
// TestShingleFilterBug431 tests that the shingle filter is in fact stateless
// by using the same filter instance twice and ensuring we do not get
// contaminated output
func TestShingleFilterBug431(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("sad"),
},
&analysis.Token{
Term: []byte("dirty"),
},
&analysis.Token{
Term: []byte("sock"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a sad"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("sad dirty"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("dirty sock"),
Type: analysis.Shingle,
},
},
},
}
shingleFilter := NewShingleFilter(2, 2, false, " ", "_")
for _, test := range tests {
actual := shingleFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}

@@ -0,0 +1,62 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowball"
)
const Name = "stemmer_snowball"
type SnowballStemmer struct {
language string
}
func NewSnowballStemmer(language string) *SnowballStemmer {
return &SnowballStemmer{
language: language,
}
}
func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// if it is not a protected keyword, stem it
if !token.KeyWord {
stemmed, _ := snowball.Stem(string(token.Term), s.language, true)
token.Term = []byte(stemmed)
}
}
return input
}
func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
language, ok := config["language"].(string)
if !ok {
return nil, fmt.Errorf("must specify language")
}
return NewSnowballStemmer(language), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, SnowballStemmerConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,115 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestSnowballStemmer(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
// a term which does stem, but does not change length
&analysis.Token{
Term: []byte("marty"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
&analysis.Token{
Term: []byte("marti"),
},
}
filter := NewSnowballStemmer("english")
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func BenchmarkSnowballStemmer(b *testing.B) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
}
filter := NewSnowballStemmer("english")
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(inputTokenStream)
}
}

@@ -0,0 +1,73 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package stop implements a TokenFilter removing tokens found in
// a TokenMap.
//
// Its constructor takes the following arguments:
//
// "stop_token_map" (string): the name of the token map identifying tokens to
// remove.
package stop
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "stop_tokens"
type StopTokensFilter struct {
stopTokens analysis.TokenMap
}
func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
return &StopTokensFilter{
stopTokens: stopTokens,
}
}
func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
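// filter in place: copy each surviving token forward over the
// input stream and truncate, avoiding a second allocation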
j := 0
for _, token := range input {
_, isStopToken := f.stopTokens[string(token.Term)]
if !isStopToken {
input[j] = token
j++
}
}
return input[:j]
}
func StopTokensFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
stopTokenMapName, ok := config["stop_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify stop_token_map")
}
stopTokenMap, err := cache.TokenMapNamed(stopTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building stop words filter: %v", err)
}
return NewStopTokensFilter(stopTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, StopTokensFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,124 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stop
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
"github.com/blevesearch/bleve/v2/registry"
)
func TestStopWordsFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("park"),
},
}
cache := registry.NewCache()
stopListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"a", "in", "the"},
}
_, err := cache.DefineTokenMap("stop_test", stopListConfig)
if err != nil {
t.Fatal(err)
}
stopConfig := map[string]interface{}{
"type": "stop_tokens",
"stop_token_map": "stop_test",
}
stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
if err != nil {
t.Fatal(err)
}
outputTokenStream := stopFilter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func BenchmarkStopWordsFilter(b *testing.B) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
},
}
cache := registry.NewCache()
stopListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"a", "in", "the"},
}
_, err := cache.DefineTokenMap("stop_test", stopListConfig)
if err != nil {
b.Fatal(err)
}
stopConfig := map[string]interface{}{
"type": "stop_tokens",
"stop_token_map": "stop_test",
}
stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
stopFilter.Filter(inputTokenStream)
}
}

@@ -0,0 +1,62 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package truncate
import (
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "truncate_token"
type TruncateTokenFilter struct {
length int
}
func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
return &TruncateTokenFilter{
length: length,
}
}
func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
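// measure length in runes, not bytes, so multi-byte UTF-8
// characters are never cut in half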
wordLen := utf8.RuneCount(token.Term)
if wordLen > s.length {
token.Term = analysis.TruncateRunes(token.Term, wordLen-s.length)
}
}
return input
}
func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
lenVal, ok := config["length"].(float64)
if !ok {
return nil, fmt.Errorf("must specify length")
}
length := int(lenVal)
return NewTruncateTokenFilter(length), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, TruncateTokenFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,79 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package truncate
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestTruncateTokenFilter(t *testing.T) {
tests := []struct {
length int
input analysis.TokenStream
output analysis.TokenStream
}{
{
length: 5,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcdefgh"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
},
{
length: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こんにちは世界"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こんに"),
},
},
},
{
length: 10,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("แยกคำภาษาไ"),
},
},
},
}
for _, test := range tests {
truncateTokenFilter := NewTruncateTokenFilter(test.length)
actual := truncateTokenFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,82 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unicodenorm
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"golang.org/x/text/unicode/norm"
)
const Name = "normalize_unicode"
const NFC = "nfc"
const NFD = "nfd"
const NFKC = "nfkc"
const NFKD = "nfkd"
var forms = map[string]norm.Form{
NFC: norm.NFC,
NFD: norm.NFD,
NFKC: norm.NFKC,
NFKD: norm.NFKD,
}
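// UnicodeNormalizeFilter rewrites each token's term into the configured
// Unicode normalization form (NFC, NFD, NFKC or NFKD).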
type UnicodeNormalizeFilter struct {
form norm.Form
}
func NewUnicodeNormalizeFilter(formName string) (*UnicodeNormalizeFilter, error) {
form, ok := forms[formName]
if !ok {
return nil, fmt.Errorf("no form named %s", formName)
}
return &UnicodeNormalizeFilter{
form: form,
}, nil
}
func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
filter, err := NewUnicodeNormalizeFilter(formName)
if err != nil {
panic(err)
}
return filter
}
func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = s.form.Bytes(token.Term)
}
return input
}
func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
formVal, ok := config["form"].(string)
if !ok {
return nil, fmt.Errorf("must specify form")
}
form := formVal
return NewUnicodeNormalizeFilter(form)
}
func init() {
err := registry.RegisterTokenFilter(Name, UnicodeNormalizeFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,162 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unicodenorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
// The following tests come from the Lucene test cases for the CJK
// width filter, which is the basis for using unicode normalization
// as a substitute for it.
func TestUnicodeNormalization(t *testing.T) {
tests := []struct {
formName string
input analysis.TokenStream
output analysis.TokenStream
}{
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Ｔｅｓｔ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Test"),
},
},
},
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("１２３４"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("1234"),
},
},
},
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("カタカナ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("カタカナ"),
},
},
},
{
formName: NFKC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ヴィッツ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ヴィッツ"),
},
},
},
{
formName: NFKC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("パナソニック"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("パナソニック"),
},
},
},
{
formName: NFD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u212B"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0041\u030A"),
},
},
},
{
formName: NFC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u212B"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u00C5"),
},
},
},
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\uFB01"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0066\u0069"),
},
},
},
{
formName: NFKC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\uFB01"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0066\u0069"),
},
},
},
}
for _, test := range tests {
filter := MustNewUnicodeNormalizeFilter(test.formName)
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
t.Errorf("expected %#v, got %#v", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,56 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unique
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "unique"
// UniqueTermFilter retains only the tokens which mark the first occurrence of
// a term. Tokens whose term appears in a preceding token are dropped.
type UniqueTermFilter struct{}
func NewUniqueTermFilter() *UniqueTermFilter {
return &UniqueTermFilter{}
}
func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
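// len(input)/4 is only a sizing hint for the set of seen terms;
// the map grows as needed when there are more distinct terms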
encounteredTerms := make(map[string]struct{}, len(input)/4)
j := 0
for _, token := range input {
term := string(token.Term)
if _, ok := encounteredTerms[term]; ok {
continue
}
encounteredTerms[term] = struct{}{}
input[j] = token
j++
}
return input[:j]
}
func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewUniqueTermFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,84 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unique
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestUniqueTermFilter(t *testing.T) {
var tests = []struct {
input analysis.TokenStream
// expected indices of input which should be included in the output. We
// use indices instead of another TokenStream, since position/start/end
// should be preserved.
expectedIndices []int
}{
{
input: tokenStream(),
expectedIndices: []int{},
},
{
input: tokenStream("a"),
expectedIndices: []int{0},
},
{
input: tokenStream("each", "term", "in", "this", "sentence", "is", "unique"),
expectedIndices: []int{0, 1, 2, 3, 4, 5, 6},
},
{
input: tokenStream("Lui", "è", "alto", "e", "lei", "è", "bassa"),
expectedIndices: []int{0, 1, 2, 3, 4, 6},
},
{
input: tokenStream("a", "a", "A", "a", "a", "A"),
expectedIndices: []int{0, 2},
},
}
uniqueTermFilter := NewUniqueTermFilter()
for _, test := range tests {
expected := subStream(test.input, test.expectedIndices)
actual := uniqueTermFilter.Filter(test.input)
if !reflect.DeepEqual(actual, expected) {
t.Errorf("expected %s \n\n got %s", expected, actual)
}
}
}
func tokenStream(termStrs ...string) analysis.TokenStream {
tokenStream := make([]*analysis.Token, len(termStrs))
index := 0
for i, termStr := range termStrs {
tokenStream[i] = &analysis.Token{
Term: []byte(termStr),
Position: i + 1,
Start: index,
End: index + len(termStr),
}
index += len(termStr)
}
return analysis.TokenStream(tokenStream)
}
func subStream(stream analysis.TokenStream, indices []int) analysis.TokenStream {
result := make(analysis.TokenStream, len(indices))
for i, index := range indices {
result[i] = stream[index]
}
return result
}