Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions

@@ -0,0 +1,144 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package exception implements a Tokenizer which extracts pieces matched by a
// regular expression from the input data, delegates the rest to another
// tokenizer, then inserts the extracted parts back into the token stream.
// Use it to preserve sequences that a regular tokenizer would alter or
// remove.
//
// Its constructor takes the following arguments:
//
// "exceptions" ([]string): one or more Go regular expressions matching the
// sequences to preserve. Multiple expressions are combined with "|".
//
// "tokenizer" (string): the name of the tokenizer processing the data not
// matched by "exceptions".
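//
// For example, a URL-preserving configuration might look like this (a sketch
// based on the configuration exercised in the tests below):
//
//	map[string]interface{}{
//	    "type":       "exception",
//	    "tokenizer":  "unicode",
//	    "exceptions": []interface{}{`[hH][tT][tT][pP][sS]?://(\S)*`},
//	}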
package exception

import (
	"fmt"
	"regexp"
	"strings"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const Name = "exception"

type ExceptionsTokenizer struct {
	exception *regexp.Regexp
	remaining analysis.Tokenizer
}

func NewExceptionsTokenizer(exception *regexp.Regexp, remaining analysis.Tokenizer) *ExceptionsTokenizer {
	return &ExceptionsTokenizer{
		exception: exception,
		remaining: remaining,
	}
}
func (t *ExceptionsTokenizer) Tokenize(input []byte) analysis.TokenStream {
	rv := make(analysis.TokenStream, 0)
	matches := t.exception.FindAllIndex(input, -1)
	currInput := 0
	lastPos := 0
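	// currInput is the byte offset up to which the input has been consumed;
	// lastPos is the position of the last token emitted so far, used to keep
	// positions increasing across the interleaved token sources.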
	for _, match := range matches {
		start := match[0]
		end := match[1]
		if start > currInput {
			// need to defer to remaining for unprocessed section
			intermediate := t.remaining.Tokenize(input[currInput:start])
			// add intermediate tokens to our result stream
			for _, token := range intermediate {
				// adjust token offsets
				token.Position += lastPos
				token.Start += currInput
				token.End += currInput
				rv = append(rv, token)
			}
			lastPos += len(intermediate)
			currInput = start
		}

		// create single token with this regexp match
		token := &analysis.Token{
			Term:     input[start:end],
			Start:    start,
			End:      end,
			Position: lastPos + 1,
		}
		rv = append(rv, token)
		lastPos++
		currInput = end
	}

	if currInput < len(input) {
		// need to defer to remaining for unprocessed section
		intermediate := t.remaining.Tokenize(input[currInput:])
		// add intermediate tokens to our result stream
		for _, token := range intermediate {
			// adjust token offsets
			token.Position += lastPos
			token.Start += currInput
			token.End += currInput
			rv = append(rv, token)
		}
	}

	return rv
}
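
// ExceptionsTokenizerConstructor builds an ExceptionsTokenizer from an
// analyzer configuration. The "exceptions" patterns may arrive as
// []interface{} (when the config was decoded from JSON) or as []string
// (when built in Go), so both shapes are accepted below.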
func ExceptionsTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	exceptions := []string{}
	iexceptions, ok := config["exceptions"].([]interface{})
	if ok {
		for _, exception := range iexceptions {
			exception, ok := exception.(string)
			if ok {
				exceptions = append(exceptions, exception)
			}
		}
	}
	aexceptions, ok := config["exceptions"].([]string)
	if ok {
		exceptions = append(exceptions, aexceptions...)
	}
	if len(exceptions) == 0 {
		return nil, fmt.Errorf("no pattern found in 'exceptions' property")
	}
	exceptionPattern := strings.Join(exceptions, "|")
	r, err := regexp.Compile(exceptionPattern)
	if err != nil {
		return nil, fmt.Errorf("unable to build regexp tokenizer: %v", err)
	}
	remainingName, ok := config["tokenizer"].(string)
	if !ok {
		return nil, fmt.Errorf("must specify tokenizer for remaining input")
	}
	remaining, err := cache.TokenizerNamed(remainingName)
	if err != nil {
		return nil, err
	}
	return NewExceptionsTokenizer(r, remaining), nil
}

func init() {
	err := registry.RegisterTokenizer(Name, ExceptionsTokenizerConstructor)
	if err != nil {
		panic(err)
	}
}
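
For reference, a minimal standalone sketch of driving this tokenizer through the registry, mirroring the calls made in the test file below. The program, the "url_aware" tokenizer name, and the single URL pattern are illustrative assumptions, not part of the upstream code.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis/tokenizer/exception" // registers "exception"
	_ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode" // registers "unicode"
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	// "url_aware" is an arbitrary name for this custom tokenizer definition.
	tokenizer, err := cache.DefineTokenizer("url_aware", map[string]interface{}{
		"type":       exception.Name,
		"tokenizer":  "unicode",
		"exceptions": []interface{}{`[hH][tT][tT][pP][sS]?://(\S)*`},
	})
	if err != nil {
		panic(err)
	}
	// URLs survive as single tokens; everything else is tokenized by "unicode".
	for _, tok := range tokenizer.Tokenize([]byte("test http://blevesearch.com/ words")) {
		fmt.Printf("%d: %q [%d,%d)\n", tok.Position, tok.Term, tok.Start, tok.End)
	}
}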

@@ -0,0 +1,171 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package exception

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
	_ "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
	"github.com/blevesearch/bleve/v2/registry"
)
func TestExceptionsTokenizer(t *testing.T) {
	tests := []struct {
		config map[string]interface{}
		input  []byte
		result analysis.TokenStream
	}{
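		// Note: Position is 1-based and counts tokens; Start/End are byte
		// offsets into the input, so the preserved URL and email matches
		// keep their original offsets.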
		{
			input: []byte("test http://blevesearch.com/ words"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("test"),
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("http://blevesearch.com/"),
					Position: 2,
					Start:    5,
					End:      28,
				},
				&analysis.Token{
					Term:     []byte("words"),
					Position: 3,
					Start:    29,
					End:      34,
				},
			},
		},
		{
			input: []byte("what ftp://blevesearch.com/ songs"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("what"),
					Position: 1,
					Start:    0,
					End:      4,
				},
				&analysis.Token{
					Term:     []byte("ftp://blevesearch.com/"),
					Position: 2,
					Start:    5,
					End:      27,
				},
				&analysis.Token{
					Term:     []byte("songs"),
					Position: 3,
					Start:    28,
					End:      33,
				},
			},
		},
		{
			input: []byte("please email marty@couchbase.com the URL https://blevesearch.com/"),
			config: map[string]interface{}{
				"type":      "exception",
				"tokenizer": "unicode",
				"exceptions": []interface{}{
					`[hH][tT][tT][pP][sS]?://(\S)*`,
					`[fF][iI][lL][eE]://(\S)*`,
					`[fF][tT][pP]://(\S)*`,
					`\S+@\S+`,
				},
			},
			result: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("please"),
					Position: 1,
					Start:    0,
					End:      6,
				},
				&analysis.Token{
					Term:     []byte("email"),
					Position: 2,
					Start:    7,
					End:      12,
				},
				&analysis.Token{
					Term:     []byte("marty@couchbase.com"),
					Position: 3,
					Start:    13,
					End:      32,
				},
				&analysis.Token{
					Term:     []byte("the"),
					Position: 4,
					Start:    33,
					End:      36,
				},
				&analysis.Token{
					Term:     []byte("URL"),
					Position: 5,
					Start:    37,
					End:      40,
				},
				&analysis.Token{
					Term:     []byte("https://blevesearch.com/"),
					Position: 6,
					Start:    41,
					End:      65,
				},
			},
		},
	}

	for _, test := range tests {
		// build the requested exception tokenizer through the registry
		// (it could also be constructed directly with NewExceptionsTokenizer)
		cache := registry.NewCache()
		tokenizer, err := cache.DefineTokenizer("custom", test.config)
		if err != nil {
			t.Fatal(err)
		}

		actual := tokenizer.Tokenize(test.input)
		if !reflect.DeepEqual(actual, test.result) {
			t.Errorf("expected %v, got %v", test.result, actual)
		}
	}
}