Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
57
analysis/token/apostrophe/apostrophe.go
Normal file
57
analysis/token/apostrophe/apostrophe.go
Normal file
|
@ -0,0 +1,57 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package apostrophe
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the identifier under which this filter is registered.
const Name = "apostrophe"

// RightSingleQuotationMark is U+2019, commonly typeset as an apostrophe.
const RightSingleQuotationMark = "’"

// Apostrophe is the plain ASCII apostrophe.
const Apostrophe = "'"

// Apostrophes is the character set treated as apostrophes by the filter.
const Apostrophes = Apostrophe + RightSingleQuotationMark

// ApostropheFilter truncates each token at its first apostrophe
// (e.g. Turkish "Türkiye'de" -> "Türkiye"). It is stateless.
type ApostropheFilter struct{}
|
||||
|
||||
func NewApostropheFilter() *ApostropheFilter {
|
||||
return &ApostropheFilter{}
|
||||
}
|
||||
|
||||
func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
|
||||
if firstApostrophe >= 0 {
|
||||
// found an apostrophe
|
||||
token.Term = token.Term[0:firstApostrophe]
|
||||
}
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewApostropheFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, ApostropheFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
99
analysis/token/apostrophe/apostrophe_test.go
Normal file
99
analysis/token/apostrophe/apostrophe_test.go
Normal file
|
@ -0,0 +1,99 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package apostrophe
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestApostropheFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Türkiye'de"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Türkiye"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("2003'te"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("2003"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Van"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Van"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Gölü'nü"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Gölü"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("gördüm"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("gördüm"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
apostropheFilter := NewApostropheFilter()
|
||||
actual := apostropheFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
81
analysis/token/camelcase/camelcase.go
Normal file
81
analysis/token/camelcase/camelcase.go
Normal file
|
@ -0,0 +1,81 @@
|
|||
// Copyright (c) 2016 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package camelcase
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the identifier under which this filter is registered.
const Name = "camelCase"

// CamelCaseFilter splits a given token into a set of tokens where each resulting token
// falls into one the following classes:
// 1. Upper case followed by lower case letters.
// Terminated by a number, an upper case letter, and a non alpha-numeric symbol.
// 2. Upper case followed by upper case letters.
// Terminated by a number, an upper case followed by a lower case letter, and a non alpha-numeric symbol.
// 3. Lower case followed by lower case letters.
// Terminated by a number, an upper case letter, and a non alpha-numeric symbol.
// 4. Number followed by numbers.
// Terminated by a letter, and a non alpha-numeric symbol.
// 5. Non alpha-numeric symbol followed by non alpha-numeric symbols.
// Terminated by a number, and a letter.
//
// It does a one-time sequential pass over an input token, from left to right.
// The scan is greedy and generates the longest substring that fits into one of the classes.
//
// See the test file for examples of classes and their parsings.
type CamelCaseFilter struct{}
|
||||
|
||||
func NewCamelCaseFilter() *CamelCaseFilter {
|
||||
return &CamelCaseFilter{}
|
||||
}
|
||||
|
||||
func (f *CamelCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0, len(input))
|
||||
|
||||
nextPosition := 1
|
||||
for _, token := range input {
|
||||
runeCount := utf8.RuneCount(token.Term)
|
||||
runes := bytes.Runes(token.Term)
|
||||
|
||||
p := NewParser(runeCount, nextPosition, token.Start)
|
||||
for i := 0; i < runeCount; i++ {
|
||||
if i+1 >= runeCount {
|
||||
p.Push(runes[i], nil)
|
||||
} else {
|
||||
p.Push(runes[i], &runes[i+1])
|
||||
}
|
||||
}
|
||||
rv = append(rv, p.FlushTokens()...)
|
||||
nextPosition = p.NextPosition()
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func CamelCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewCamelCaseFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, CamelCaseFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
95
analysis/token/camelcase/camelcase_test.go
Normal file
95
analysis/token/camelcase/camelcase_test.go
Normal file
|
@ -0,0 +1,95 @@
|
|||
// Copyright (c) 2016 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package camelcase
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
// TestCamelCaseFilter checks the class-splitting behavior of
// CamelCaseFilter on representative inputs: empty/single-rune terms,
// mixed upper/lower/number/symbol runs, and the upper-run-followed-by-
// capitalized-word boundary (e.g. "GOLang" -> "GO", "Lang").
func TestCamelCaseFilter(t *testing.T) {

	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input:  tokenStream(""),
			output: tokenStream(""),
		},
		{
			input:  tokenStream("a"),
			output: tokenStream("a"),
		},

		{
			// exercises all five classes in one term
			input:  tokenStream("...aMACMac123macILoveGolang"),
			output: tokenStream("...", "a", "MAC", "Mac", "123", "mac", "I", "Love", "Golang"),
		},
		{
			input:  tokenStream("Lang"),
			output: tokenStream("Lang"),
		},
		{
			input:  tokenStream("GLang"),
			output: tokenStream("G", "Lang"),
		},
		{
			input:  tokenStream("GOLang"),
			output: tokenStream("GO", "Lang"),
		},
		{
			input:  tokenStream("GOOLang"),
			output: tokenStream("GOO", "Lang"),
		},
		{
			input:  tokenStream("1234"),
			output: tokenStream("1234"),
		},
		{
			input:  tokenStream("starbucks"),
			output: tokenStream("starbucks"),
		},
		{
			// space is a non-alphanumeric class and becomes its own token
			input:  tokenStream("Starbucks TVSamsungIsGREAT000"),
			output: tokenStream("Starbucks", " ", "TV", "Samsung", "Is", "GREAT", "000"),
		},
	}

	for _, test := range tests {
		ccFilter := NewCamelCaseFilter()
		actual := ccFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s \n\n got %s", test.output, actual)
		}
	}
}
|
||||
|
||||
func tokenStream(termStrs ...string) analysis.TokenStream {
|
||||
tokenStream := make([]*analysis.Token, len(termStrs))
|
||||
index := 0
|
||||
for i, termStr := range termStrs {
|
||||
tokenStream[i] = &analysis.Token{
|
||||
Term: []byte(termStr),
|
||||
Position: i + 1,
|
||||
Start: index,
|
||||
End: index + len(termStr),
|
||||
}
|
||||
index += len(termStr)
|
||||
}
|
||||
return analysis.TokenStream(tokenStream)
|
||||
}
|
109
analysis/token/camelcase/parser.go
Normal file
109
analysis/token/camelcase/parser.go
Normal file
|
@ -0,0 +1,109 @@
|
|||
// Copyright (c) 2016 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package camelcase
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func (p *Parser) buildTokenFromTerm(buffer []rune) *analysis.Token {
|
||||
term := analysis.BuildTermFromRunes(buffer)
|
||||
token := &analysis.Token{
|
||||
Term: term,
|
||||
Position: p.position,
|
||||
Start: p.index,
|
||||
End: p.index + len(term),
|
||||
}
|
||||
p.position++
|
||||
p.index += len(term)
|
||||
return token
|
||||
}
|
||||
|
||||
// Parser accepts a symbol and passes it to the current state (representing a class).
// The state can accept it (and accumulate it). Otherwise, the parser creates a new state that
// starts with the pushed symbol.
//
// Parser accumulates a new resulting token every time it switches state.
// Use FlushTokens() to get the results after the last symbol was pushed.
type Parser struct {
	bufferLen int               // capacity hint: rune count of the input token
	buffer    []rune            // runes accumulated for the current class
	current   State             // class being collected; nil before the first Push
	tokens    []*analysis.Token // tokens emitted so far
	position  int               // position assigned to the next emitted token
	index     int               // offset assigned to the next emitted token (advances by term byte length)
}
|
||||
|
||||
func NewParser(length, position, index int) *Parser {
|
||||
return &Parser{
|
||||
bufferLen: length,
|
||||
buffer: make([]rune, 0, length),
|
||||
tokens: make([]*analysis.Token, 0, length),
|
||||
position: position,
|
||||
index: index,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *Parser) Push(sym rune, peek *rune) {
|
||||
if p.current == nil {
|
||||
// the start of parsing
|
||||
p.current = p.NewState(sym)
|
||||
p.buffer = append(p.buffer, sym)
|
||||
|
||||
} else if p.current.Member(sym, peek) {
|
||||
// same state, just accumulate
|
||||
p.buffer = append(p.buffer, sym)
|
||||
|
||||
} else {
|
||||
// the old state is no more, thus convert the buffer
|
||||
p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
|
||||
|
||||
// let the new state begin
|
||||
p.current = p.NewState(sym)
|
||||
p.buffer = make([]rune, 0, p.bufferLen)
|
||||
p.buffer = append(p.buffer, sym)
|
||||
}
|
||||
}
|
||||
|
||||
// Note. States have to have different starting symbols.
|
||||
func (p *Parser) NewState(sym rune) State {
|
||||
var found State
|
||||
|
||||
found = &LowerCaseState{}
|
||||
if found.StartSym(sym) {
|
||||
return found
|
||||
}
|
||||
|
||||
found = &UpperCaseState{}
|
||||
if found.StartSym(sym) {
|
||||
return found
|
||||
}
|
||||
|
||||
found = &NumberCaseState{}
|
||||
if found.StartSym(sym) {
|
||||
return found
|
||||
}
|
||||
|
||||
return &NonAlphaNumericCaseState{}
|
||||
}
|
||||
|
||||
func (p *Parser) FlushTokens() []*analysis.Token {
|
||||
p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
|
||||
return p.tokens
|
||||
}
|
||||
|
||||
// NextPosition returns the position the next emitted token would get,
// letting a caller chain parsers across consecutive input tokens.
func (p *Parser) NextPosition() int {
	return p.position
}
|
87
analysis/token/camelcase/states.go
Normal file
87
analysis/token/camelcase/states.go
Normal file
|
@ -0,0 +1,87 @@
|
|||
// Copyright (c) 2016 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package camelcase
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// States codify the classes that the parser recognizes.
type State interface {
	// StartSym reports whether sym may start this class.
	StartSym(sym rune) bool

	// Member reports whether sym belongs to this class.
	// peek, the next sym on the tape (nil at end of input), can also be
	// used to determine membership.
	Member(sym rune, peek *rune) bool
}
|
||||
|
||||
// LowerCaseState is the class of consecutive lower-case letters.
type LowerCaseState struct{}

// Member reports whether sym is a lower-case letter; peek is unused.
func (s *LowerCaseState) Member(sym rune, peek *rune) bool {
	return unicode.IsLower(sym)
}

// StartSym reports whether sym can start a lower-case run.
func (s *LowerCaseState) StartSym(sym rune) bool {
	return unicode.IsLower(sym)
}
|
||||
|
||||
// UpperCaseState is the class started by an upper-case letter. It can be
// either one upper followed by lowers ("Mac") or a run of uppers ("MAC");
// which of the two is decided by the first symbol actually collected.
type UpperCaseState struct {
	startedCollecting bool // the first symbol has been read
	collectingUpper   bool // the run being collected is all upper case
}

// Member reports whether sym continues this class. An upper-case sym
// whose peek is lower case terminates the class (it starts the next
// capitalized word, e.g. the "L" in "GOLang").
func (s *UpperCaseState) Member(sym rune, peek *rune) bool {
	isUpper := unicode.IsUpper(sym)
	if !isUpper && !unicode.IsLower(sym) {
		// non-letters never belong here
		return false
	}

	if isUpper && peek != nil && unicode.IsLower(*peek) {
		// upper followed by lower belongs to the next word
		return false
	}

	if s.startedCollecting {
		// once the run's case is fixed, sym must match it
		return s.collectingUpper == isUpper
	}

	// first collected symbol fixes whether this is an all-upper run
	s.startedCollecting = true
	s.collectingUpper = isUpper
	return true
}

// StartSym reports whether sym can start this class.
func (s *UpperCaseState) StartSym(sym rune) bool {
	return unicode.IsUpper(sym)
}
|
||||
|
||||
// NumberCaseState is the class of consecutive numeric characters.
type NumberCaseState struct{}

// Member reports whether sym is numeric; peek is unused.
func (s *NumberCaseState) Member(sym rune, peek *rune) bool {
	return unicode.IsNumber(sym)
}

// StartSym reports whether sym can start a numeric run.
func (s *NumberCaseState) StartSym(sym rune) bool {
	return unicode.IsNumber(sym)
}
|
||||
|
||||
// NonAlphaNumericCaseState is the catch-all class: everything that is
// neither a letter nor a number.
type NonAlphaNumericCaseState struct{}

// Member reports whether sym is non-alphanumeric; peek is unused.
func (s *NonAlphaNumericCaseState) Member(sym rune, peek *rune) bool {
	return !(unicode.IsLower(sym) || unicode.IsUpper(sym) || unicode.IsNumber(sym))
}

// StartSym reports whether sym can start a non-alphanumeric run.
func (s *NonAlphaNumericCaseState) StartSym(sym rune) bool {
	return s.Member(sym, nil)
}
|
144
analysis/token/compound/dict.go
Normal file
144
analysis/token/compound/dict.go
Normal file
|
@ -0,0 +1,144 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package compound
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the identifier under which this filter is registered.
const Name = "dict_compound"

// Defaults applied by the constructor when the config omits a setting.
const defaultMinWordSize = 5
const defaultMinSubWordSize = 2
const defaultMaxSubWordSize = 15
const defaultOnlyLongestMatch = false
|
||||
|
||||
// DictionaryCompoundFilter keeps every input token and additionally emits
// the dictionary sub-words found inside it (e.g. "softball" -> "soft", "ball").
type DictionaryCompoundFilter struct {
	dict             analysis.TokenMap // set of known sub-words
	minWordSize      int               // only tokens at least this many runes long are decomposed
	minSubWordSize   int               // smallest sub-word window tried
	maxSubWordSize   int               // largest sub-word window tried
	onlyLongestMatch bool              // keep only the longest match per start offset
}
|
||||
|
||||
func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
|
||||
return &DictionaryCompoundFilter{
|
||||
dict: dict,
|
||||
minWordSize: minWordSize,
|
||||
minSubWordSize: minSubWordSize,
|
||||
maxSubWordSize: maxSubWordSize,
|
||||
onlyLongestMatch: onlyLongestMatch,
|
||||
}
|
||||
}
|
||||
|
||||
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0, len(input))
|
||||
|
||||
for _, token := range input {
|
||||
rv = append(rv, token)
|
||||
tokenLen := utf8.RuneCount(token.Term)
|
||||
if tokenLen >= f.minWordSize {
|
||||
newtokens := f.decompose(token)
|
||||
for _, newtoken := range newtokens {
|
||||
rv = append(rv, newtoken)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
// decompose scans every rune window [i, i+j) of token.Term, for window
// sizes minSubWordSize..maxSubWordSize, and returns a token for each
// window found in the dictionary. With onlyLongestMatch set, only the
// longest matching window per start offset i is returned.
//
// NOTE(review): Start/End on the emitted tokens add rune counts (i, j)
// to token.Start; if token.Start is a byte offset and the term contains
// multi-byte runes these may not line up — confirm intended semantics.
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
	runes := bytes.Runes(token.Term)
	rv := make([]*analysis.Token, 0)
	rlen := len(runes)
	for i := 0; i <= (rlen - f.minSubWordSize); i++ {
		// best match starting at offset i (used only with onlyLongestMatch)
		var longestMatchToken *analysis.Token
		for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
			if i+j > rlen {
				// window runs past the end of the term; larger j only worse
				break
			}
			_, inDict := f.dict[string(runes[i:i+j])]
			if inDict {
				newtoken := analysis.Token{
					Term:     []byte(string(runes[i : i+j])),
					Position: token.Position,
					Start:    token.Start + i,
					End:      token.Start + i + j,
					Type:     token.Type,
					KeyWord:  token.KeyWord,
				}
				if f.onlyLongestMatch {
					// keep only the longest dictionary hit for this i
					if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
						longestMatchToken = &newtoken
					}
				} else {
					rv = append(rv, &newtoken)
				}
			}
		}
		if f.onlyLongestMatch && longestMatchToken != nil {
			rv = append(rv, longestMatchToken)
		}
	}
	return rv
}
|
||||
|
||||
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
|
||||
minWordSize := defaultMinWordSize
|
||||
minSubWordSize := defaultMinSubWordSize
|
||||
maxSubWordSize := defaultMaxSubWordSize
|
||||
onlyLongestMatch := defaultOnlyLongestMatch
|
||||
|
||||
minVal, ok := config["min_word_size"].(float64)
|
||||
if ok {
|
||||
minWordSize = int(minVal)
|
||||
}
|
||||
minSubVal, ok := config["min_subword_size"].(float64)
|
||||
if ok {
|
||||
minSubWordSize = int(minSubVal)
|
||||
}
|
||||
maxSubVal, ok := config["max_subword_size"].(float64)
|
||||
if ok {
|
||||
maxSubWordSize = int(maxSubVal)
|
||||
}
|
||||
onlyVal, ok := config["only_longest_match"].(bool)
|
||||
if ok {
|
||||
onlyLongestMatch = onlyVal
|
||||
}
|
||||
|
||||
dictTokenMapName, ok := config["dict_token_map"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify dict_token_map")
|
||||
}
|
||||
dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building dict compound words filter: %v", err)
|
||||
}
|
||||
return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
187
analysis/token/compound/dict_test.go
Normal file
187
analysis/token/compound/dict_test.go
Normal file
|
@ -0,0 +1,187 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package compound
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestStopWordsFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("i"),
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("like"),
|
||||
Start: 2,
|
||||
End: 6,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("to"),
|
||||
Start: 7,
|
||||
End: 9,
|
||||
Position: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("play"),
|
||||
Start: 10,
|
||||
End: 14,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("softball"),
|
||||
Start: 15,
|
||||
End: 23,
|
||||
Position: 5,
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("i"),
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("like"),
|
||||
Start: 2,
|
||||
End: 6,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("to"),
|
||||
Start: 7,
|
||||
End: 9,
|
||||
Position: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("play"),
|
||||
Start: 10,
|
||||
End: 14,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("softball"),
|
||||
Start: 15,
|
||||
End: 23,
|
||||
Position: 5,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("soft"),
|
||||
Start: 15,
|
||||
End: 19,
|
||||
Position: 5,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ball"),
|
||||
Start: 19,
|
||||
End: 23,
|
||||
Position: 5,
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
dictListConfig := map[string]interface{}{
|
||||
"type": tokenmap.Name,
|
||||
"tokens": []interface{}{"factor", "soft", "ball", "team"},
|
||||
}
|
||||
_, err := cache.DefineTokenMap("dict_test", dictListConfig)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
dictConfig := map[string]interface{}{
|
||||
"type": "dict_compound",
|
||||
"dict_token_map": "dict_test",
|
||||
}
|
||||
dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ouputTokenStream := dictFilter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStopWordsFilterLongestMatch(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("softestball"),
|
||||
Start: 0,
|
||||
End: 11,
|
||||
Position: 1,
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("softestball"),
|
||||
Start: 0,
|
||||
End: 11,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("softest"),
|
||||
Start: 0,
|
||||
End: 7,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ball"),
|
||||
Start: 7,
|
||||
End: 11,
|
||||
Position: 1,
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
dictListConfig := map[string]interface{}{
|
||||
"type": tokenmap.Name,
|
||||
"tokens": []interface{}{"soft", "softest", "ball"},
|
||||
}
|
||||
_, err := cache.DefineTokenMap("dict_test", dictListConfig)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
dictConfig := map[string]interface{}{
|
||||
"type": "dict_compound",
|
||||
"dict_token_map": "dict_test",
|
||||
"only_longest_match": true,
|
||||
}
|
||||
dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
ouputTokenStream := dictFilter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||
}
|
||||
}
|
118
analysis/token/edgengram/edgengram.go
Normal file
118
analysis/token/edgengram/edgengram.go
Normal file
|
@ -0,0 +1,118 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package edgengram
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the identifier under which this filter is registered.
const Name = "edge_ngram"

// Side selects which end of the token ngrams are anchored at.
type Side bool

// BACK anchors ngrams at the end of the token; FRONT at the beginning.
const BACK Side = true
const FRONT Side = false

// EdgeNgramFilter emits, for each input token, the edge ngrams of sizes
// minLength..maxLength anchored at the configured side.
type EdgeNgramFilter struct {
	back      Side // which edge to anchor at
	minLength int  // smallest ngram size emitted
	maxLength int  // largest ngram size emitted
}
|
||||
|
||||
func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
|
||||
return &EdgeNgramFilter{
|
||||
back: side,
|
||||
minLength: minLength,
|
||||
maxLength: maxLength,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0, len(input))
|
||||
|
||||
for _, token := range input {
|
||||
runeCount := utf8.RuneCount(token.Term)
|
||||
runes := bytes.Runes(token.Term)
|
||||
if s.back {
|
||||
i := runeCount
|
||||
// index of the starting rune for this token
|
||||
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
|
||||
// build an ngram of this size starting at i
|
||||
if i-ngramSize >= 0 {
|
||||
ngramTerm := analysis.BuildTermFromRunes(runes[i-ngramSize : i])
|
||||
token := analysis.Token{
|
||||
Position: token.Position,
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Type: token.Type,
|
||||
Term: ngramTerm,
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
i := 0
|
||||
// index of the starting rune for this token
|
||||
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
|
||||
// build an ngram of this size starting at i
|
||||
if i+ngramSize <= runeCount {
|
||||
ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
|
||||
token := analysis.Token{
|
||||
Position: token.Position,
|
||||
Start: token.Start,
|
||||
End: token.End,
|
||||
Type: token.Type,
|
||||
Term: ngramTerm,
|
||||
}
|
||||
rv = append(rv, &token)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func EdgeNgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
side := FRONT
|
||||
back, ok := config["back"].(bool)
|
||||
if ok && back {
|
||||
side = BACK
|
||||
}
|
||||
minVal, ok := config["min"].(float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify min")
|
||||
}
|
||||
min := int(minVal)
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify max")
|
||||
}
|
||||
max := int(maxVal)
|
||||
|
||||
return NewEdgeNgramFilter(side, min, max), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, EdgeNgramFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
189
analysis/token/edgengram/edgengram_test.go
Normal file
189
analysis/token/edgengram/edgengram_test.go
Normal file
|
@ -0,0 +1,189 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package edgengram
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestEdgeNgramFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
side Side
|
||||
min int
|
||||
max int
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
side: FRONT,
|
||||
min: 1,
|
||||
max: 1,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
side: BACK,
|
||||
min: 1,
|
||||
max: 1,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("e"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
side: FRONT,
|
||||
min: 1,
|
||||
max: 3,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ab"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
side: BACK,
|
||||
min: 1,
|
||||
max: 3,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("e"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("de"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cde"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
side: FRONT,
|
||||
min: 1,
|
||||
max: 3,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vwxyz"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ab"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("v"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vw"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vwx"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
side: BACK,
|
||||
min: 3,
|
||||
max: 5,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Beryl"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ryl"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("eryl"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("Beryl"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
side: FRONT,
|
||||
min: 3,
|
||||
max: 5,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Beryl"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Ber"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("Bery"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("Beryl"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
edgeNgramFilter := NewEdgeNgramFilter(test.side, test.min, test.max)
|
||||
actual := edgeNgramFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
77
analysis/token/elision/elision.go
Normal file
77
analysis/token/elision/elision.go
Normal file
|
@ -0,0 +1,77 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package elision
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "elision"
|
||||
|
||||
const RightSingleQuotationMark = '’'
|
||||
const Apostrophe = '\''
|
||||
|
||||
type ElisionFilter struct {
|
||||
articles analysis.TokenMap
|
||||
}
|
||||
|
||||
func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
|
||||
return &ElisionFilter{
|
||||
articles: articles,
|
||||
}
|
||||
}
|
||||
|
||||
func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := token.Term
|
||||
for i := 0; i < len(term); {
|
||||
r, size := utf8.DecodeRune(term[i:])
|
||||
if r == Apostrophe || r == RightSingleQuotationMark {
|
||||
// see if the prefix matches one of the articles
|
||||
prefix := term[0:i]
|
||||
_, articleMatch := s.articles[string(prefix)]
|
||||
if articleMatch {
|
||||
token.Term = term[i+size:]
|
||||
break
|
||||
}
|
||||
}
|
||||
i += size
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
articlesTokenMapName, ok := config["articles_token_map"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify articles_token_map")
|
||||
}
|
||||
articlesTokenMap, err := cache.TokenMapNamed(articlesTokenMapName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building elision filter: %v", err)
|
||||
}
|
||||
return NewElisionFilter(articlesTokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, ElisionFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
85
analysis/token/elision/elision_test.go
Normal file
85
analysis/token/elision/elision_test.go
Normal file
|
@ -0,0 +1,85 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package elision
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestElisionFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ar" + string(Apostrophe) + "word"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("word"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("word"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
|
||||
articleListConfig := map[string]interface{}{
|
||||
"type": tokenmap.Name,
|
||||
"tokens": []interface{}{"ar"},
|
||||
}
|
||||
_, err := cache.DefineTokenMap("articles_test", articleListConfig)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
elisionConfig := map[string]interface{}{
|
||||
"type": "elision",
|
||||
"articles_token_map": "articles_test",
|
||||
}
|
||||
elisionFilter, err := cache.DefineTokenFilter("elision_test", elisionConfig)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
|
||||
actual := elisionFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
95
analysis/token/hierarchy/hierarchy.go
Normal file
95
analysis/token/hierarchy/hierarchy.go
Normal file
|
@ -0,0 +1,95 @@
|
|||
package hierarchy
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "hierarchy"

// HierarchyFilter emits one token per level of a delimited hierarchy,
// each containing the joined path from the root down to that level
// (e.g. "a/b/c" -> "a", "a/b", "a/b/c").
type HierarchyFilter struct {
	maxLevels  int    // stop once this many levels have been emitted
	delimiter  []byte // byte sequence separating hierarchy levels
	splitInput bool   // when true, split each incoming term on delimiter first
}

// NewHierarchyFilter returns a HierarchyFilter with the given delimiter,
// level cap, and input-splitting behavior.
func NewHierarchyFilter(delimiter []byte, maxLevels int, splitInput bool) *HierarchyFilter {
	return &HierarchyFilter{
		delimiter:  delimiter,
		maxLevels:  maxLevels,
		splitInput: splitInput,
	}
}
|
||||
|
||||
func (s *HierarchyFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0, s.maxLevels)
|
||||
|
||||
var soFar [][]byte
|
||||
for _, token := range input {
|
||||
if s.splitInput {
|
||||
parts := bytes.Split(token.Term, s.delimiter)
|
||||
for _, part := range parts {
|
||||
soFar, rv = s.buildToken(rv, soFar, part)
|
||||
if len(soFar) >= s.maxLevels {
|
||||
return rv
|
||||
}
|
||||
}
|
||||
} else {
|
||||
soFar, rv = s.buildToken(rv, soFar, token.Term)
|
||||
if len(soFar) >= s.maxLevels {
|
||||
return rv
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *HierarchyFilter) buildToken(tokenStream analysis.TokenStream, soFar [][]byte, part []byte) (
|
||||
[][]byte, analysis.TokenStream) {
|
||||
|
||||
soFar = append(soFar, part)
|
||||
term := bytes.Join(soFar, s.delimiter)
|
||||
|
||||
tokenStream = append(tokenStream, &analysis.Token{
|
||||
Type: analysis.Shingle,
|
||||
Term: term,
|
||||
Start: 0,
|
||||
End: len(term),
|
||||
Position: 1,
|
||||
})
|
||||
|
||||
return soFar, tokenStream
|
||||
}
|
||||
|
||||
func HierarchyFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
max := math.MaxInt64
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if ok {
|
||||
max = int(maxVal)
|
||||
}
|
||||
|
||||
splitInput := true
|
||||
splitInputVal, ok := config["split_input"].(bool)
|
||||
if ok {
|
||||
splitInput = splitInputVal
|
||||
}
|
||||
|
||||
delimiter, ok := config["delimiter"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify delimiter")
|
||||
}
|
||||
|
||||
return NewHierarchyFilter([]byte(delimiter), max, splitInput), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, HierarchyFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
229
analysis/token/hierarchy/hierarchy_test.go
Normal file
229
analysis/token/hierarchy/hierarchy_test.go
Normal file
|
@ -0,0 +1,229 @@
|
|||
package hierarchy
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestHierarchyFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
delimiter string
|
||||
max int
|
||||
splitInput bool
|
||||
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
name: "single token a/b/c, delimiter /",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: true,
|
||||
},
|
||||
{
|
||||
name: "multiple tokens already split a b c, delimiter /",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: true,
|
||||
},
|
||||
{
|
||||
name: "single token a/b/c, delimiter /, limit 2",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 2,
|
||||
splitInput: true,
|
||||
},
|
||||
{
|
||||
name: "multiple tokens already split a b c, delimiter /, limit 2",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 2,
|
||||
splitInput: true,
|
||||
},
|
||||
|
||||
{
|
||||
name: "single token a/b/c, delimiter /, no split",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: false,
|
||||
},
|
||||
{
|
||||
name: "multiple tokens already split a b c, delimiter /, no split",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 1,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a/b/c"),
|
||||
Type: analysis.Shingle,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
delimiter: "/",
|
||||
max: 10,
|
||||
splitInput: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
test := test
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
filter := NewHierarchyFilter([]byte(test.delimiter), test.max, test.splitInput)
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
}
|
63
analysis/token/keyword/keyword.go
Normal file
63
analysis/token/keyword/keyword.go
Normal file
|
@ -0,0 +1,63 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package keyword
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "keyword_marker"
|
||||
|
||||
type KeyWordMarkerFilter struct {
|
||||
keyWords analysis.TokenMap
|
||||
}
|
||||
|
||||
func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter {
|
||||
return &KeyWordMarkerFilter{
|
||||
keyWords: keyWords,
|
||||
}
|
||||
}
|
||||
|
||||
func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
_, isKeyWord := f.keyWords[string(token.Term)]
|
||||
if isKeyWord {
|
||||
token.KeyWord = true
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func KeyWordMarkerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
keywordsTokenMapName, ok := config["keywords_token_map"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify keywords_token_map")
|
||||
}
|
||||
keywordsTokenMap, err := cache.TokenMapNamed(keywordsTokenMapName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building keyword marker filter: %v", err)
|
||||
}
|
||||
return NewKeyWordMarkerFilter(keywordsTokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, KeyWordMarkerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
73
analysis/token/keyword/keyword_test.go
Normal file
73
analysis/token/keyword/keyword_test.go
Normal file
|
@ -0,0 +1,73 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package keyword
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestKeyWordMarkerFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("in"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("park"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("in"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("park"),
|
||||
KeyWord: true,
|
||||
},
|
||||
}
|
||||
|
||||
keyWordsMap := analysis.NewTokenMap()
|
||||
keyWordsMap.AddToken("walk")
|
||||
keyWordsMap.AddToken("park")
|
||||
|
||||
filter := NewKeyWordMarkerFilter(keyWordsMap)
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream[0].KeyWord, ouputTokenStream[0].KeyWord)
|
||||
}
|
||||
}
|
80
analysis/token/length/length.go
Normal file
80
analysis/token/length/length.go
Normal file
|
@ -0,0 +1,80 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package length
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "length"
|
||||
|
||||
type LengthFilter struct {
|
||||
min int
|
||||
max int
|
||||
}
|
||||
|
||||
func NewLengthFilter(min, max int) *LengthFilter {
|
||||
return &LengthFilter{
|
||||
min: min,
|
||||
max: max,
|
||||
}
|
||||
}
|
||||
|
||||
func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
rv := make(analysis.TokenStream, 0, len(input))
|
||||
|
||||
for _, token := range input {
|
||||
wordLen := utf8.RuneCount(token.Term)
|
||||
if f.min > 0 && f.min > wordLen {
|
||||
continue
|
||||
}
|
||||
if f.max > 0 && f.max < wordLen {
|
||||
continue
|
||||
}
|
||||
rv = append(rv, token)
|
||||
}
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func LengthFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
min := 0
|
||||
max := 0
|
||||
|
||||
minVal, ok := config["min"].(float64)
|
||||
if ok {
|
||||
min = int(minVal)
|
||||
}
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if ok {
|
||||
max = int(maxVal)
|
||||
}
|
||||
if min == max && max == 0 {
|
||||
return nil, fmt.Errorf("either min or max must be non-zero")
|
||||
}
|
||||
|
||||
return NewLengthFilter(min, max), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, LengthFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
99
analysis/token/length/length_test.go
Normal file
99
analysis/token/length/length_test.go
Normal file
|
@ -0,0 +1,99 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package length
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestLengthFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
}
|
||||
|
||||
lengthFilter := NewLengthFilter(3, 4)
|
||||
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||
if len(ouputTokenStream) != 1 {
|
||||
t.Fatalf("expected 1 output token")
|
||||
}
|
||||
if string(ouputTokenStream[0].Term) != "two" {
|
||||
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLengthFilterNoMax(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
}
|
||||
|
||||
lengthFilter := NewLengthFilter(3, -1)
|
||||
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||
if len(ouputTokenStream) != 2 {
|
||||
t.Fatalf("expected 2 output token")
|
||||
}
|
||||
if string(ouputTokenStream[0].Term) != "two" {
|
||||
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
if string(ouputTokenStream[1].Term) != "three" {
|
||||
t.Errorf("expected term `three`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLengthFilterNoMin(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("1"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
}
|
||||
|
||||
lengthFilter := NewLengthFilter(-1, 4)
|
||||
ouputTokenStream := lengthFilter.Filter(inputTokenStream)
|
||||
if len(ouputTokenStream) != 2 {
|
||||
t.Fatalf("expected 2 output token")
|
||||
}
|
||||
if string(ouputTokenStream[0].Term) != "1" {
|
||||
t.Errorf("expected term `1`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
if string(ouputTokenStream[1].Term) != "two" {
|
||||
t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term)
|
||||
}
|
||||
}
|
108
analysis/token/lowercase/lowercase.go
Normal file
108
analysis/token/lowercase/lowercase.go
Normal file
|
@ -0,0 +1,108 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Package lowercase implements a TokenFilter which converts
|
||||
// tokens to lower case according to unicode rules.
|
||||
package lowercase
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the name used to register LowerCaseFilter in the bleve registry
|
||||
const Name = "to_lower"
|
||||
|
||||
type LowerCaseFilter struct {
|
||||
}
|
||||
|
||||
func NewLowerCaseFilter() *LowerCaseFilter {
|
||||
return &LowerCaseFilter{}
|
||||
}
|
||||
|
||||
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
token.Term = toLowerDeferredCopy(token.Term)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewLowerCaseFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// toLowerDeferredCopy will function exactly like
// bytes.ToLower() only it will reuse (overwrite)
// the original byte array when possible
// NOTE: because its possible that the lower-case
// form of a rune has a different utf-8 encoded
// length, in these cases a new byte array is allocated
func toLowerDeferredCopy(s []byte) []byte {
	out := 0 // write cursor; trails the read cursor when runes shrink
	for in := 0; in < len(s); {
		width := 1
		r := rune(s[in])
		if r >= utf8.RuneSelf {
			r, width = utf8.DecodeRune(s[in:])
		}

		lower := unicode.ToLower(r)

		// Already lower case: nothing to rewrite; advance both cursors.
		if lower == r {
			in += width
			out += width
			continue
		}

		// Unicode edge case: a greek capital sigma at the end of the
		// slice maps to the final form 'ς' rather than 'σ'.
		if lower == 'σ' && in+2 == len(s) {
			lower = 'ς'
		}

		lowerWidth := utf8.RuneLen(lower)
		if lowerWidth > width {
			// The lower-case encoding is wider than the original, so an
			// in-place rewrite is impossible; punt and defer to
			// bytes.ToLower() for the remainder.
			// Only known to happen with chars
			// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
			// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
			rest := bytes.ToLower(s[in:])
			rv := make([]byte, out+len(rest))
			copy(rv[:out], s[:out])
			copy(rv[out:], rest)
			return rv
		}
		// Equal or narrower encoding: safe to write in place at out <= in.
		utf8.EncodeRune(s[out:], lower)
		in += width
		out += lowerWidth
	}
	return s[:out]
}
|
166
analysis/token/lowercase/lowercase_test.go
Normal file
166
analysis/token/lowercase/lowercase_test.go
Normal file
|
@ -0,0 +1,166 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package lowercase
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestLowerCaseFilter(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ONE"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ThReE"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("steven's"),
|
||||
},
|
||||
// these characters are chosen in particular
|
||||
// because the utf-8 encoding of the lower-case
|
||||
// version has a different length
|
||||
// Rune İ(304) width 2 - Lower i(105) width 1
|
||||
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
|
||||
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
|
||||
&analysis.Token{
|
||||
Term: []byte("İȺȾCAT"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ȺȾCAT"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ὈΔΥΣΣ"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("one"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("two"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("three"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("steven's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("iⱥⱦcat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ⱥⱦcat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ὀδυσς"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewLowerCaseFilter()
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
|
||||
t.Errorf("expected %s got %s", expectedTokenStream[0].Term, ouputTokenStream[0].Term)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkLowerCaseFilter(b *testing.B) {
|
||||
input := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("A"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("boiling"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("liquid"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("expanding"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vapor"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("explosion"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("caused"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("by"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("rupture"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("of"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vessel"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("containing"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("pressurized"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("liquid"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("above"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("its"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("boiling"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("point"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("İȺȾCAT"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ȺȾCAT"),
|
||||
},
|
||||
}
|
||||
filter := NewLowerCaseFilter()
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
filter.Filter(input)
|
||||
}
|
||||
}
|
116
analysis/token/ngram/ngram.go
Normal file
116
analysis/token/ngram/ngram.go
Normal file
|
@ -0,0 +1,116 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ngram
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "ngram"
|
||||
|
||||
// NgramFilter is a token filter that emits all character n-grams of
// each incoming token whose length (in runes) falls between minLength
// and maxLength, inclusive.
type NgramFilter struct {
	minLength int // smallest n-gram size to emit, measured in runes
	maxLength int // largest n-gram size to emit, measured in runes
}

// NewNgramFilter returns an NgramFilter emitting n-grams sized
// between minLength and maxLength runes, inclusive.
func NewNgramFilter(minLength, maxLength int) *NgramFilter {
	return &NgramFilter{
		minLength: minLength,
		maxLength: maxLength,
	}
}
|
||||
|
||||
// Filter emits every n-gram, sized between s.minLength and
// s.maxLength runes, of every input token. N-grams are generated at
// each rune position of the original term, and each generated token
// inherits the position, offsets, and type of the token it derives
// from. Tokens shorter than minLength runes produce no output.
func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	// len(input) is only a lower bound: each token can expand into
	// many n-grams, so rv may grow beyond this initial capacity
	rv := make(analysis.TokenStream, 0, len(input))

	for _, token := range input {
		runeCount := utf8.RuneCount(token.Term)
		runes := bytes.Runes(token.Term)
		for i := 0; i < runeCount; i++ {
			// index of the starting rune for this token
			for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
				// build an ngram of this size starting at i
				if i+ngramSize <= runeCount {
					ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
					token := analysis.Token{
						Position: token.Position,
						Start:    token.Start,
						End:      token.End,
						Type:     token.Type,
						Term:     ngramTerm,
					}
					rv = append(rv, &token)
				}
			}
		}
	}

	return rv
}
|
||||
|
||||
// NgramFilterConstructor builds an NgramFilter from a configuration
// map. The "min" and "max" entries are required and may be supplied
// as int or float64 (JSON numbers decode as float64).
func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	minVal, ok := config["min"]
	if !ok {
		return nil, fmt.Errorf("must specify min")
	}

	min, err := convertToInt(minVal)
	if err != nil {
		return nil, err
	}

	maxVal, ok := config["max"]
	if !ok {
		return nil, fmt.Errorf("must specify max")
	}

	max, err := convertToInt(maxVal)
	if err != nil {
		return nil, err
	}

	return NewNgramFilter(min, max), nil
}

// init registers the filter in the bleve registry under Name.
func init() {
	err := registry.RegisterTokenFilter(Name, NgramFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
||||
|
||||
// convertToInt converts a configuration value to an int. It accepts
// either an int or a float64 (the type JSON numbers decode to),
// truncating the latter; any other dynamic type is an error.
func convertToInt(val interface{}) (int, error) {
	// a type switch replaces the two sequential assertions and makes
	// the accepted set of types explicit in one place
	switch v := val.(type) {
	case int:
		return v, nil
	case float64:
		return int(v), nil
	}
	return 0, fmt.Errorf("failed to convert to int value")
}
|
192
analysis/token/ngram/ngram_test.go
Normal file
192
analysis/token/ngram/ngram_test.go
Normal file
|
@ -0,0 +1,192 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ngram
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestNgramFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
min int
|
||||
max int
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
min: 1,
|
||||
max: 1,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("d"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("e"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 2,
|
||||
max: 2,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ab"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("bc"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cd"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("de"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 1,
|
||||
max: 3,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abcde"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ab"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("b"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("bc"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("bcd"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cd"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cde"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("d"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("de"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("e"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
ngramFilter := NewNgramFilter(test.min, test.max)
|
||||
actual := ngramFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestConversionInt(t *testing.T) {
|
||||
config := map[string]interface{}{
|
||||
"type": Name,
|
||||
"min": 3,
|
||||
"max": 8,
|
||||
}
|
||||
|
||||
f, err := NgramFilterConstructor(config, nil)
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Failed to construct the ngram filter: %v", err)
|
||||
}
|
||||
|
||||
ngram := f.(*NgramFilter)
|
||||
if ngram.minLength != 3 && ngram.maxLength != 8 {
|
||||
t.Errorf("Failed to construct the bounds. Got %v and %v.", ngram.minLength, ngram.maxLength)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConversionFloat(t *testing.T) {
|
||||
config := map[string]interface{}{
|
||||
"type": Name,
|
||||
"min": float64(3),
|
||||
"max": float64(8),
|
||||
}
|
||||
|
||||
f, err := NgramFilterConstructor(config, nil)
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Failed to construct the ngram filter: %v", err)
|
||||
}
|
||||
|
||||
ngram := f.(*NgramFilter)
|
||||
if ngram.minLength != 3 && ngram.maxLength != 8 {
|
||||
t.Errorf("Failed to construct the bounds. Got %v and %v.", ngram.minLength, ngram.maxLength)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBadConversion(t *testing.T) {
|
||||
config := map[string]interface{}{
|
||||
"type": Name,
|
||||
"min": "3",
|
||||
}
|
||||
|
||||
_, err := NgramFilterConstructor(config, nil)
|
||||
|
||||
if err == nil {
|
||||
t.Errorf("Expected conversion error.")
|
||||
}
|
||||
|
||||
if err.Error() != "failed to convert to int value" {
|
||||
t.Errorf("Wrong error recevied. Got %v.", err)
|
||||
}
|
||||
}
|
56
analysis/token/porter/porter.go
Normal file
56
analysis/token/porter/porter.go
Normal file
|
@ -0,0 +1,56 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package porter
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/go-porterstemmer"
|
||||
)
|
||||
|
||||
const Name = "stemmer_porter"
|
||||
|
||||
// PorterStemmer is a token filter that reduces English terms to their
// Porter stems using the go-porterstemmer library.
type PorterStemmer struct {
}

// NewPorterStemmer returns a new PorterStemmer token filter.
func NewPorterStemmer() *PorterStemmer {
	return &PorterStemmer{}
}

// Filter stems each non-keyword token in place and returns the same
// stream. Tokens marked KeyWord are left untouched, which lets an
// upstream keyword-marker filter protect specific terms. Stemming is
// applied without lower-casing, so input is expected to be
// lower-cased already by an earlier filter.
func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		// if it is not a protected keyword, stem it
		if !token.KeyWord {
			termRunes := bytes.Runes(token.Term)
			stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes)
			token.Term = analysis.BuildTermFromRunes(stemmedRunes)
		}
	}
	return input
}

// PorterStemmerConstructor builds a PorterStemmer; it takes no
// configuration options.
func PorterStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewPorterStemmer(), nil
}

// init registers the filter in the bleve registry under Name.
func init() {
	err := registry.RegisterTokenFilter(Name, PorterStemmerConstructor)
	if err != nil {
		panic(err)
	}
}
|
115
analysis/token/porter/porter_test.go
Normal file
115
analysis/token/porter/porter_test.go
Normal file
|
@ -0,0 +1,115 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package porter
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestPorterStemmer(t *testing.T) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
// a term which does stem, but does not change length
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talk"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("busi"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marti"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewPorterStemmer()
|
||||
ouputTokenStream := filter.Filter(inputTokenStream)
|
||||
if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
|
||||
t.Errorf("expected %#v got %#v", expectedTokenStream[3], ouputTokenStream[3])
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkPorterStemmer(b *testing.B) {
|
||||
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("walking"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("talked"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("business"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("protected"),
|
||||
KeyWord: true,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("done"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewPorterStemmer()
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
filter.Filter(inputTokenStream)
|
||||
}
|
||||
|
||||
}
|
78
analysis/token/reverse/reverse.go
Normal file
78
analysis/token/reverse/reverse.go
Normal file
|
@ -0,0 +1,78 @@
|
|||
// Copyright (c) 2019 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package reverse
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the name used to register ReverseFilter in the bleve registry
|
||||
const Name = "reverse"
|
||||
|
||||
// ReverseFilter is a token filter that reverses the characters of
// each token term.
type ReverseFilter struct {
}

// NewReverseFilter returns a new ReverseFilter.
func NewReverseFilter() *ReverseFilter {
	return &ReverseFilter{}
}

// Filter reverses every token term in the stream in place and
// returns the same stream.
func (f *ReverseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		token.Term = reverse(token.Term)
	}
	return input
}

// ReverseFilterConstructor builds a ReverseFilter; it takes no
// configuration options.
func ReverseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewReverseFilter(), nil
}

// init registers the filter in the bleve registry under Name.
func init() {
	err := registry.RegisterTokenFilter(Name, ReverseFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
||||
|
||||
// reverse returns a new byte slice containing the runes of s in
// reverse order. Combining marks (Unicode categories Mn, Me, Mc)
// stay attached to the rune they follow, so visually-composed
// characters are moved as a single unit.
//
// The input is decoded in place with utf8.DecodeRune rather than via
// []rune(string(s)), which avoids two intermediate allocations and
// also handles invalid UTF-8 safely: each invalid byte is treated as
// a width-1 unit, whereas the []rune conversion substituted U+FFFD
// (encoded width 3) and mis-counted byte widths.
func reverse(s []byte) []byte {
	output := make([]byte, len(s))
	cursorIn := 0
	cursorOut := len(s)
	for cursorIn < len(s) {
		// width of the unit to move: one rune ...
		_, wid := utf8.DecodeRune(s[cursorIn:])
		// ... plus any combining marks immediately following it
		for cursorIn+wid < len(s) {
			r, rwid := utf8.DecodeRune(s[cursorIn+wid:])
			if !unicode.Is(unicode.Mn, r) && !unicode.Is(unicode.Me, r) && !unicode.Is(unicode.Mc, r) {
				break
			}
			wid += rwid
		}
		copy(output[cursorOut-wid:cursorOut], s[cursorIn:cursorIn+wid])
		cursorIn += wid
		cursorOut -= wid
	}

	return output
}
|
184
analysis/token/reverse/reverse_test.go
Normal file
184
analysis/token/reverse/reverse_test.go
Normal file
|
@ -0,0 +1,184 @@
|
|||
// Copyright (c) 2019 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package reverse
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestReverseFilter(t *testing.T) {
|
||||
inputTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{},
|
||||
&analysis.Token{
|
||||
Term: []byte("one"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("TWo"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("thRee"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("four's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("what's this in reverse"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("œ∑´®†"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("İȺȾCAT÷≥≤µ123"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("!@#$%^&*()"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cafés"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("¿Dónde estás?"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("Me gustaría una cerveza."),
|
||||
},
|
||||
}
|
||||
|
||||
expectedTokenStream := analysis.TokenStream{
|
||||
&analysis.Token{},
|
||||
&analysis.Token{
|
||||
Term: []byte("eno"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("oWT"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("eeRht"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("s'ruof"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("esrever ni siht s'tahw"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("†®´∑œ"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("321µ≤≥÷TACȾȺİ"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte(")(*&^%$#@!"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("séfac"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("?sátse ednóD¿"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte(".azevrec anu aíratsug eM"),
|
||||
},
|
||||
}
|
||||
|
||||
filter := NewReverseFilter()
|
||||
outputTokenStream := filter.Filter(inputTokenStream)
|
||||
for i := 0; i < len(expectedTokenStream); i++ {
|
||||
if !bytes.Equal(outputTokenStream[i].Term, expectedTokenStream[i].Term) {
|
||||
t.Errorf("[%d] expected %s got %s",
|
||||
i+1, expectedTokenStream[i].Term, outputTokenStream[i].Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkReverseFilter(b *testing.B) {
|
||||
input := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("A"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("boiling"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("liquid"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("expanding"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vapor"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("explosion"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("caused"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("by"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("rupture"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("of"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("a"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vessel"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("containing"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("pressurized"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("liquid"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("above"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("its"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("boiling"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("point"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("İȺȾCAT"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("Me gustaría una cerveza."),
|
||||
},
|
||||
}
|
||||
filter := NewReverseFilter()
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
filter.Filter(input)
|
||||
}
|
||||
}
|
172
analysis/token/shingle/shingle.go
Normal file
172
analysis/token/shingle/shingle.go
Normal file
|
@ -0,0 +1,172 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package shingle
|
||||
|
||||
import (
|
||||
"container/ring"
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const Name = "shingle"
|
||||
|
||||
// ShingleFilter is a token filter that produces shingles (word
// n-grams) of adjacent tokens, sized between min and max tokens and
// joined by tokenSeparator. Gaps in token positions (e.g. left by an
// upstream stop-word filter) are represented by the fill string.
type ShingleFilter struct {
	min            int    // smallest shingle size, in tokens
	max            int    // largest shingle size, in tokens
	outputOriginal bool   // also emit the original, unshingled tokens
	tokenSeparator string // string joining the tokens of a shingle
	fill           string // filler term standing in for position gaps
}

// NewShingleFilter returns a ShingleFilter emitting shingles of min
// to max tokens, joined by sep, with fill used as the placeholder
// for position gaps. When outputOriginal is true the original tokens
// are emitted alongside the shingles.
func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
	return &ShingleFilter{
		min:            min,
		max:            max,
		outputOriginal: outputOriginal,
		tokenSeparator: sep,
		fill:           fill,
	}
}
|
||||
|
||||
// Filter produces shingles of the input stream. A ring buffer of
// capacity s.max holds the most recent tokens; after each token is
// pushed, every shingle of size s.min..s.max ending at that token is
// emitted. Position gaps between consecutive input tokens are padded
// with filler tokens first, so shingles reflect the original spacing.
func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0, len(input))

	ring := ring.New(s.max)
	itemsInRing := 0
	currentPosition := 0
	for _, token := range input {
		if s.outputOriginal {
			rv = append(rv, token)
		}

		// if there are gaps, insert filler tokens
		offset := token.Position - currentPosition
		for offset > 1 {
			// Position 0 / offsets -1 mark this as a filler so that
			// shingleCurrentRingState never uses it for pos/start/end
			fillerToken := analysis.Token{
				Position: 0,
				Start:    -1,
				End:      -1,
				Type:     analysis.AlphaNumeric,
				Term:     []byte(s.fill),
			}
			ring.Value = &fillerToken
			if itemsInRing < s.max {
				itemsInRing++
			}
			rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
			ring = ring.Next()
			offset--
		}
		currentPosition = token.Position

		ring.Value = token
		if itemsInRing < s.max {
			itemsInRing++
		}
		rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
		ring = ring.Next()
	}

	return rv
}
|
||||
|
||||
// shingleCurrentRingState emits one shingle per size in s.min..s.max
// that can be formed from the itemsInRing most recently pushed
// tokens, each shingle ending at the ring's current token. Filler
// tokens (Position 0, offsets -1) contribute their term text but
// never define a shingle's position or start offset.
func (s *ShingleFilter) shingleCurrentRingState(ring *ring.Ring, itemsInRing int) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	for shingleN := s.min; shingleN <= s.max; shingleN++ {
		// if there are enough items in the ring
		// to produce a shingle of this size
		if itemsInRing >= shingleN {
			// step back shingleN-1 slots to the shingle's first token
			thisShingleRing := ring.Move(-(shingleN - 1))
			shingledBytes := make([]byte, 0)
			pos := 0
			start := -1
			end := 0
			for i := 0; i < shingleN; i++ {
				if i != 0 {
					shingledBytes = append(shingledBytes, []byte(s.tokenSeparator)...)
				}
				curr := thisShingleRing.Value.(*analysis.Token)
				// position/start come from the first non-filler token;
				// end tracks the last non-filler token seen
				if pos == 0 && curr.Position != 0 {
					pos = curr.Position
				}
				if start == -1 && curr.Start != -1 {
					start = curr.Start
				}
				if curr.End != -1 {
					end = curr.End
				}
				shingledBytes = append(shingledBytes, curr.Term...)
				thisShingleRing = thisShingleRing.Next()
			}
			token := analysis.Token{
				Type: analysis.Shingle,
				Term: shingledBytes,
			}
			if pos != 0 {
				token.Position = pos
			}
			if start != -1 {
				token.Start = start
			}
			if end != -1 {
				token.End = end
			}
			rv = append(rv, &token)
		}
	}
	return rv
}
|
||||
|
||||
func ShingleFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
minVal, ok := config["min"].(float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify min")
|
||||
}
|
||||
min := int(minVal)
|
||||
maxVal, ok := config["max"].(float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify max")
|
||||
}
|
||||
max := int(maxVal)
|
||||
|
||||
outputOriginal := false
|
||||
outVal, ok := config["output_original"].(bool)
|
||||
if ok {
|
||||
outputOriginal = outVal
|
||||
}
|
||||
|
||||
sep := " "
|
||||
sepVal, ok := config["separator"].(string)
|
||||
if ok {
|
||||
sep = sepVal
|
||||
}
|
||||
|
||||
fill := "_"
|
||||
fillVal, ok := config["filler"].(string)
|
||||
if ok {
|
||||
fill = fillVal
|
||||
}
|
||||
|
||||
return NewShingleFilter(min, max, outputOriginal, sep, fill), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, ShingleFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
416
analysis/token/shingle/shingle_test.go
Normal file
416
analysis/token/shingle/shingle_test.go
Normal file
|
@ -0,0 +1,416 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package shingle
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestShingleFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
min int
|
||||
max int
|
||||
outputOriginal bool
|
||||
separator string
|
||||
filler string
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
min: 2,
|
||||
max: 2,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 3,
|
||||
max: 3,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 2,
|
||||
max: 3,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 3,
|
||||
max: 3,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ugly"),
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
Position: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
Position: 4,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ugly _ quick"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_ quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 1,
|
||||
max: 5,
|
||||
outputOriginal: false,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text"),
|
||||
Position: 2,
|
||||
},
|
||||
// token 3 removed by stop filter
|
||||
&analysis.Token{
|
||||
Term: []byte("see"),
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("shingles"),
|
||||
Position: 5,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text _"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text _"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_ see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text _ see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text _ see"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 5,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("_ see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("text _ see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 2,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test text _ see shingles"),
|
||||
Type: analysis.Shingle,
|
||||
Position: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
min: 2,
|
||||
max: 2,
|
||||
outputOriginal: true,
|
||||
separator: " ",
|
||||
filler: "_",
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("the"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("the quick"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("quick brown"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("fox"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("brown fox"),
|
||||
Type: analysis.Shingle,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
shingleFilter := NewShingleFilter(test.min, test.max, test.outputOriginal, test.separator, test.filler)
|
||||
actual := shingleFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestShingleFilterBug431 tests that the shingle filter is in fact stateless
// by using the same filter instance twice and ensuring we do not get
// contaminated output from the first run leaking into the second.
func TestShingleFilterBug431(t *testing.T) {

	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// first stream: bigrams of "the quick brown fox"
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the"),
				},
				&analysis.Token{
					Term: []byte("quick"),
				},
				&analysis.Token{
					Term: []byte("brown"),
				},
				&analysis.Token{
					Term: []byte("fox"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("the quick"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("quick brown"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("brown fox"),
					Type: analysis.Shingle,
				},
			},
		},
		// second stream: must not contain any terms from the first run
		// (e.g. "fox a") if the filter is truly stateless
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a"),
				},
				&analysis.Token{
					Term: []byte("sad"),
				},
				&analysis.Token{
					Term: []byte("dirty"),
				},
				&analysis.Token{
					Term: []byte("sock"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("a sad"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("sad dirty"),
					Type: analysis.Shingle,
				},
				&analysis.Token{
					Term: []byte("dirty sock"),
					Type: analysis.Shingle,
				},
			},
		},
	}

	// deliberately a single shared instance across both test cases
	shingleFilter := NewShingleFilter(2, 2, false, " ", "_")
	for _, test := range tests {
		actual := shingleFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output, actual)
		}
	}

}
|
62
analysis/token/snowball/snowball.go
Normal file
62
analysis/token/snowball/snowball.go
Normal file
|
@ -0,0 +1,62 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package snowball
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowball"
|
||||
)
|
||||
|
||||
// Name is the registry identifier for the snowball stemmer token filter.
const Name = "stemmer_snowball"

// SnowballStemmer is a token filter that reduces each token's term to
// its stem using the blevesearch/snowball library for one language.
type SnowballStemmer struct {
	// language names the snowball stemming algorithm, e.g. "english".
	language string
}

// NewSnowballStemmer returns a stemmer for the given snowball language name.
func NewSnowballStemmer(language string) *SnowballStemmer {
	return &SnowballStemmer{
		language: language,
	}
}
|
||||
|
||||
func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if it is not a protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
stemmed, _ := snowball.Stem(string(token.Term), s.language, true)
|
||||
token.Term = []byte(stemmed)
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
language, ok := config["language"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify language")
|
||||
}
|
||||
return NewSnowballStemmer(language), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, SnowballStemmerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
115
analysis/token/snowball/snowball_test.go
Normal file
115
analysis/token/snowball/snowball_test.go
Normal file
|
@ -0,0 +1,115 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package snowball
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
// TestSnowballStemmer verifies english snowball stemming of a small
// stream, including that KeyWord-protected tokens are left unstemmed.
func TestSnowballStemmer(t *testing.T) {

	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walking"),
		},
		&analysis.Token{
			Term: []byte("talked"),
		},
		&analysis.Token{
			Term: []byte("business"),
		},
		// protected keyword: must pass through unchanged
		&analysis.Token{
			Term:    []byte("protected"),
			KeyWord: true,
		},
		&analysis.Token{
			Term: []byte("cat"),
		},
		&analysis.Token{
			Term: []byte("done"),
		},
		// a term which does stem, but does not change length
		&analysis.Token{
			Term: []byte("marty"),
		},
	}

	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("talk"),
		},
		&analysis.Token{
			Term: []byte("busi"),
		},
		&analysis.Token{
			Term:    []byte("protected"),
			KeyWord: true,
		},
		&analysis.Token{
			Term: []byte("cat"),
		},
		&analysis.Token{
			Term: []byte("done"),
		},
		&analysis.Token{
			Term: []byte("marti"),
		},
	}

	filter := NewSnowballStemmer("english")
	ouputTokenStream := filter.Filter(inputTokenStream)
	if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
		t.Errorf("expected %#v got %#v", expectedTokenStream[3], ouputTokenStream[3])
	}
}
|
||||
|
||||
// BenchmarkSnowballStemmer measures english stemming throughput over a
// fixed six-token stream.
func BenchmarkSnowballStemmer(b *testing.B) {

	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walking"),
		},
		&analysis.Token{
			Term: []byte("talked"),
		},
		&analysis.Token{
			Term: []byte("business"),
		},
		&analysis.Token{
			Term:    []byte("protected"),
			KeyWord: true,
		},
		&analysis.Token{
			Term: []byte("cat"),
		},
		&analysis.Token{
			Term: []byte("done"),
		},
	}

	filter := NewSnowballStemmer("english")
	// exclude stream/filter construction from the timed region
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		filter.Filter(inputTokenStream)
	}

}
|
73
analysis/token/stop/stop.go
Normal file
73
analysis/token/stop/stop.go
Normal file
|
@ -0,0 +1,73 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Package stop implements a TokenFilter removing tokens found in
|
||||
// a TokenMap.
|
||||
//
|
||||
// It constructor takes the following arguments:
|
||||
//
|
||||
// "stop_token_map" (string): the name of the token map identifying tokens to
|
||||
// remove.
|
||||
package stop
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the registry identifier for the stop token filter.
const Name = "stop_tokens"

// StopTokensFilter removes tokens whose terms appear in a token map.
type StopTokensFilter struct {
	// stopTokens is the set of terms to drop from the stream.
	stopTokens analysis.TokenMap
}

// NewStopTokensFilter returns a filter that drops tokens found in stopTokens.
func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
	return &StopTokensFilter{
		stopTokens: stopTokens,
	}
}
|
||||
|
||||
func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
j := 0
|
||||
for _, token := range input {
|
||||
_, isStopToken := f.stopTokens[string(token.Term)]
|
||||
if !isStopToken {
|
||||
input[j] = token
|
||||
j++
|
||||
}
|
||||
}
|
||||
|
||||
return input[:j]
|
||||
}
|
||||
|
||||
func StopTokensFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
stopTokenMapName, ok := config["stop_token_map"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify stop_token_map")
|
||||
}
|
||||
stopTokenMap, err := cache.TokenMapNamed(stopTokenMapName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building stop words filter: %v", err)
|
||||
}
|
||||
return NewStopTokensFilter(stopTokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, StopTokensFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
124
analysis/token/stop/stop_test.go
Normal file
124
analysis/token/stop/stop_test.go
Normal file
|
@ -0,0 +1,124 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stop
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// TestStopWordsFilter builds a stop filter via the registry (exercising
// both the token map and filter constructors) and verifies that the
// configured stop words are removed from the stream.
func TestStopWordsFilter(t *testing.T) {

	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("a"),
		},
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("in"),
		},
		&analysis.Token{
			Term: []byte("the"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	// "a", "in" and "the" are stop words, so only these two remain
	expectedTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	cache := registry.NewCache()
	stopListConfig := map[string]interface{}{
		"type":   tokenmap.Name,
		"tokens": []interface{}{"a", "in", "the"},
	}
	_, err := cache.DefineTokenMap("stop_test", stopListConfig)
	if err != nil {
		t.Fatal(err)
	}

	stopConfig := map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	}
	stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
	if err != nil {
		t.Fatal(err)
	}

	ouputTokenStream := stopFilter.Filter(inputTokenStream)
	if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) {
		t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream)
	}
}
|
||||
|
||||
// BenchmarkStopWordsFilter measures stop-word removal throughput over a
// fixed five-token stream with three stop words.
func BenchmarkStopWordsFilter(b *testing.B) {

	inputTokenStream := analysis.TokenStream{
		&analysis.Token{
			Term: []byte("a"),
		},
		&analysis.Token{
			Term: []byte("walk"),
		},
		&analysis.Token{
			Term: []byte("in"),
		},
		&analysis.Token{
			Term: []byte("the"),
		},
		&analysis.Token{
			Term: []byte("park"),
		},
	}

	cache := registry.NewCache()
	stopListConfig := map[string]interface{}{
		"type":   tokenmap.Name,
		"tokens": []interface{}{"a", "in", "the"},
	}
	_, err := cache.DefineTokenMap("stop_test", stopListConfig)
	if err != nil {
		b.Fatal(err)
	}

	stopConfig := map[string]interface{}{
		"type":           "stop_tokens",
		"stop_token_map": "stop_test",
	}
	stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
	if err != nil {
		b.Fatal(err)
	}
	// exclude registry setup from the timed region
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		stopFilter.Filter(inputTokenStream)
	}

}
|
62
analysis/token/truncate/truncate.go
Normal file
62
analysis/token/truncate/truncate.go
Normal file
|
@ -0,0 +1,62 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package truncate
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the registry identifier for the truncate token filter.
const Name = "truncate_token"

// TruncateTokenFilter shortens each token's term to at most a fixed
// number of runes.
type TruncateTokenFilter struct {
	// length is the maximum number of runes to keep per term.
	length int
}

// NewTruncateTokenFilter returns a filter that truncates terms to
// length runes.
func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
	return &TruncateTokenFilter{
		length: length,
	}
}
|
||||
|
||||
func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
wordLen := utf8.RuneCount(token.Term)
|
||||
if wordLen > s.length {
|
||||
token.Term = analysis.TruncateRunes(token.Term, wordLen-s.length)
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
lenVal, ok := config["length"].(float64)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify length")
|
||||
}
|
||||
length := int(lenVal)
|
||||
|
||||
return NewTruncateTokenFilter(length), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, TruncateTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
79
analysis/token/truncate/truncate_test.go
Normal file
79
analysis/token/truncate/truncate_test.go
Normal file
|
@ -0,0 +1,79 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package truncate
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
// TestTruncateTokenFilter verifies rune-based truncation for ASCII,
// Japanese and Thai terms, ensuring multi-byte characters are counted
// as single runes rather than bytes.
func TestTruncateTokenFilter(t *testing.T) {

	tests := []struct {
		length int
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// ASCII: one byte per rune
		{
			length: 5,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abcdefgh"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("abcde"),
				},
			},
		},
		// Japanese: three bytes per rune
		{
			length: 3,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("こんにちは世界"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("こんに"),
				},
			},
		},
		// Thai: multi-byte runes including combining marks
		{
			length: 10,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("แยกคำภาษาไ"),
				},
			},
		},
	}

	for _, test := range tests {
		truncateTokenFilter := NewTruncateTokenFilter(test.length)
		actual := truncateTokenFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
|
82
analysis/token/unicodenorm/unicodenorm.go
Normal file
82
analysis/token/unicodenorm/unicodenorm.go
Normal file
|
@ -0,0 +1,82 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package unicodenorm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
// Name is the registry identifier for the unicode normalization filter.
const Name = "normalize_unicode"

// Supported normalization form names, as accepted in filter configs.
const NFC = "nfc"
const NFD = "nfd"
const NFKC = "nfkc"
const NFKD = "nfkd"

// forms maps config form names to golang.org/x/text normalization forms.
var forms = map[string]norm.Form{
	NFC:  norm.NFC,
	NFD:  norm.NFD,
	NFKC: norm.NFKC,
	NFKD: norm.NFKD,
}

// UnicodeNormalizeFilter applies a Unicode normalization form to each
// token's term.
type UnicodeNormalizeFilter struct {
	form norm.Form
}

// NewUnicodeNormalizeFilter returns a filter for the named form
// (nfc, nfd, nfkc or nfkd), or an error for an unknown name.
func NewUnicodeNormalizeFilter(formName string) (*UnicodeNormalizeFilter, error) {
	form, ok := forms[formName]
	if !ok {
		return nil, fmt.Errorf("no form named %s", formName)
	}
	return &UnicodeNormalizeFilter{
		form: form,
	}, nil
}
|
||||
|
||||
func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
|
||||
filter, err := NewUnicodeNormalizeFilter(formName)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return filter
|
||||
}
|
||||
|
||||
func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
token.Term = s.form.Bytes(token.Term)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
formVal, ok := config["form"].(string)
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("must specify form")
|
||||
}
|
||||
form := formVal
|
||||
return NewUnicodeNormalizeFilter(form)
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, UnicodeNormalizeFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
162
analysis/token/unicodenorm/unicodenorm_test.go
Normal file
162
analysis/token/unicodenorm/unicodenorm_test.go
Normal file
|
@ -0,0 +1,162 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package unicodenorm
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
// the following tests come from the lucene
// test cases for CJK width filter
// which is our basis for using this
// as a substitute for that
func TestUnicodeNormalization(t *testing.T) {

	tests := []struct {
		formName string
		input    analysis.TokenStream
		output   analysis.TokenStream
	}{
		// plain ASCII is unchanged by any form
		{
			formName: NFKD,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("Test"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("1234"),
				},
			},
		},
		{
			formName: NFKD,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("1234"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("1234"),
				},
			},
		},
		{
			formName: NFKD,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("カタカナ"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("カタカナ"),
				},
			},
		},
		{
			formName: NFKC,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ヴィッツ"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("ヴィッツ"),
				},
			},
		},
		{
			formName: NFKC,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("パナソニック"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("パナソニック"),
				},
			},
		},
		// U+212B ANGSTROM SIGN decomposes under NFD
		{
			formName: NFD,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u212B"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0041\u030A"),
				},
			},
		},
		// ... and composes to U+00C5 under NFC
		{
			formName: NFC,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u212B"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u00C5"),
				},
			},
		},
		// U+FB01 LATIN SMALL LIGATURE FI expands under compatibility forms
		{
			formName: NFKD,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\uFB01"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0066\u0069"),
				},
			},
		},
		{
			formName: NFKC,
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\uFB01"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0066\u0069"),
				},
			},
		},
	}

	for _, test := range tests {
		filter := MustNewUnicodeNormalizeFilter(test.formName)
		actual := filter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			// report both human-readable and byte-level views
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
			t.Errorf("expected %#v, got %#v", test.output[0].Term, actual[0].Term)
		}
	}
}
|
56
analysis/token/unique/unique.go
Normal file
56
analysis/token/unique/unique.go
Normal file
|
@ -0,0 +1,56 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package unique
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the registry identifier for the unique term filter.
const Name = "unique"

// UniqueTermFilter retains only the tokens which mark the first occurrence of
// a term. Tokens whose term appears in a preceding token are dropped.
type UniqueTermFilter struct{}

// NewUniqueTermFilter returns a filter that deduplicates terms in a stream.
func NewUniqueTermFilter() *UniqueTermFilter {
	return &UniqueTermFilter{}
}
|
||||
|
||||
func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
encounteredTerms := make(map[string]struct{}, len(input)/4)
|
||||
j := 0
|
||||
for _, token := range input {
|
||||
term := string(token.Term)
|
||||
if _, ok := encounteredTerms[term]; ok {
|
||||
continue
|
||||
}
|
||||
encounteredTerms[term] = struct{}{}
|
||||
input[j] = token
|
||||
j++
|
||||
}
|
||||
return input[:j]
|
||||
}
|
||||
|
||||
// UniqueTermFilterConstructor builds a UniqueTermFilter; the filter
// takes no configuration.
func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewUniqueTermFilter(), nil
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
84
analysis/token/unique/unique_test.go
Normal file
84
analysis/token/unique/unique_test.go
Normal file
|
@ -0,0 +1,84 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package unique
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
// TestUniqueTermFilter verifies that only the first occurrence of each
// term survives, comparing against sub-streams of the input so that
// position/start/end metadata is preserved on the kept tokens.
func TestUniqueTermFilter(t *testing.T) {
	var tests = []struct {
		input analysis.TokenStream
		// expected indices of input which should be included in the output. We
		// use indices instead of another TokenStream, since position/start/end
		// should be preserved.
		expectedIndices []int
	}{
		{
			input:           tokenStream(),
			expectedIndices: []int{},
		},
		{
			input:           tokenStream("a"),
			expectedIndices: []int{0},
		},
		{
			input:           tokenStream("each", "term", "in", "this", "sentence", "is", "unique"),
			expectedIndices: []int{0, 1, 2, 3, 4, 5, 6},
		},
		// repeated "è" at index 5 is dropped
		{
			input:           tokenStream("Lui", "è", "alto", "e", "lei", "è", "bassa"),
			expectedIndices: []int{0, 1, 2, 3, 4, 6},
		},
		// dedup is case-sensitive: "a" and "A" are distinct terms
		{
			input:           tokenStream("a", "a", "A", "a", "a", "A"),
			expectedIndices: []int{0, 2},
		},
	}
	uniqueTermFilter := NewUniqueTermFilter()
	for _, test := range tests {
		expected := subStream(test.input, test.expectedIndices)
		actual := uniqueTermFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, expected) {
			t.Errorf("expected %s \n\n got %s", expected, actual)
		}
	}
}
|
||||
|
||||
func tokenStream(termStrs ...string) analysis.TokenStream {
|
||||
tokenStream := make([]*analysis.Token, len(termStrs))
|
||||
index := 0
|
||||
for i, termStr := range termStrs {
|
||||
tokenStream[i] = &analysis.Token{
|
||||
Term: []byte(termStr),
|
||||
Position: i + 1,
|
||||
Start: index,
|
||||
End: index + len(termStr),
|
||||
}
|
||||
index += len(termStr)
|
||||
}
|
||||
return analysis.TokenStream(tokenStream)
|
||||
}
|
||||
|
||||
func subStream(stream analysis.TokenStream, indices []int) analysis.TokenStream {
|
||||
result := make(analysis.TokenStream, len(indices))
|
||||
for i, index := range indices {
|
||||
result[i] = stream[index]
|
||||
}
|
||||
return result
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue