Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-19 00:20:02 +02:00 · 2025-05-19 00:20:02 +02:00 · 982828099e
commit 982828099e
parent c71cb8b61d
783 changed files with 150650 additions and 0 deletions
--- a/analysis/tokenizer/regexp/regexp.go
+++ b/analysis/tokenizer/regexp/regexp.go
@ -0,0 +1,87 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+	"fmt"
+	"regexp"
+	"strconv"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+const Name = "regexp" // Name is the key under which init registers this tokenizer's constructor.
+
+var IdeographRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`) // IdeographRegexp matches one Han, Hangul, Hiragana or Katakana character; detectTokenType uses it to flag ideographic terms.
+
+type RegexpTokenizer struct { // RegexpTokenizer produces tokens from the matches of a compiled regular expression.
+	r *regexp.Regexp // pattern whose non-empty matches Tokenize turns into tokens
+}
+
+// NewRegexpTokenizer returns a RegexpTokenizer that tokenizes input using
+// the matches of r.
+func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer {
+	tokenizer := new(RegexpTokenizer)
+	tokenizer.r = r
+	return tokenizer
+}
+
+// Tokenize returns one token for every non-empty match of the tokenizer's
+// regular expression in input, in match order.
+//
+// Each token's Term is a sub-slice of input (not a copy), Start and End are
+// the byte offsets of the match, and Type is chosen by detectTokenType.
+// Position is 1-based and counts only the tokens actually emitted, so the
+// zero-width matches that patterns such as `\w*` produce between separators
+// never leave gaps in the position sequence.
+func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream {
+	matches := rt.r.FindAllIndex(input, -1)
+	rv := make(analysis.TokenStream, 0, len(matches))
+	for _, match := range matches {
+		start, end := match[0], match[1]
+		if end <= start {
+			// A zero-width match carries no term; skip it without
+			// consuming a position.
+			continue
+		}
+		matchBytes := input[start:end]
+		rv = append(rv, &analysis.Token{
+			Term:     matchBytes,
+			Start:    start,
+			End:      end,
+			Position: len(rv) + 1, // number of tokens emitted so far, plus one
+			Type:     detectTokenType(matchBytes),
+		})
+	}
+	return rv
+}
+
+// RegexpTokenizerConstructor builds a RegexpTokenizer from a configuration
+// map, in the form expected by registry.RegisterTokenizer.
+//
+// config must hold the pattern as a string under the "regexp" key. An error
+// is returned when that entry is absent or is not a string, or when the
+// pattern fails to compile. cache is not used.
+func RegexpTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
+	pattern, ok := config["regexp"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify regexp")
+	}
+	r, err := regexp.Compile(pattern)
+	if err != nil {
+		// %w renders the same text as %v but keeps the compile error in the
+		// chain so callers can inspect it with errors.Is / errors.As.
+		return nil, fmt.Errorf("unable to build regexp tokenizer: %w", err)
+	}
+	return NewRegexpTokenizer(r), nil
+}
+
+// init registers RegexpTokenizerConstructor under Name and panics if the
+// registry reports an error.
+func init() {
+	if err := registry.RegisterTokenizer(Name, RegexpTokenizerConstructor); err != nil {
+		panic(err)
+	}
+}
+
+// detectTokenType classifies a term: Ideographic when it contains a
+// character matched by IdeographRegexp, Numeric when the whole term parses
+// with strconv.ParseFloat, and AlphaNumeric otherwise.
+//
+// Note that ParseFloat also accepts spellings such as "NaN" and "Inf", so
+// those terms are reported as Numeric.
+func detectTokenType(termBytes []byte) analysis.TokenType {
+	if IdeographRegexp.Match(termBytes) {
+		return analysis.Ideographic
+	}
+	if _, parseErr := strconv.ParseFloat(string(termBytes), 64); parseErr != nil {
+		return analysis.AlphaNumeric
+	}
+	return analysis.Numeric
+}
--- a/analysis/tokenizer/regexp/regexp_test.go
+++ b/analysis/tokenizer/regexp/regexp_test.go
@ -0,0 +1,166 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package regexp
+
+import (
+	"reflect"
+	"regexp"
+	"testing"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+)
+
+// TestBoundary tokenizes Latin and Japanese text with a pattern that matches
+// either a single CJK character or a run of word characters, and compares
+// the resulting stream (terms, byte offsets, positions and types) with the
+// expected one. An empty input must yield an empty stream.
+func TestBoundary(t *testing.T) {
+	pattern := regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}|\w+`)
+
+	cases := []struct {
+		input  []byte
+		output analysis.TokenStream
+	}{
+		{
+			input: []byte("Hello World."),
+			output: analysis.TokenStream{
+				{Term: []byte("Hello"), Start: 0, End: 5, Position: 1, Type: analysis.AlphaNumeric},
+				{Term: []byte("World"), Start: 6, End: 11, Position: 2, Type: analysis.AlphaNumeric},
+			},
+		},
+		{
+			// Each character is three bytes in UTF-8, hence the offsets.
+			input: []byte("こんにちは世界"),
+			output: analysis.TokenStream{
+				{Term: []byte("こ"), Start: 0, End: 3, Position: 1, Type: analysis.Ideographic},
+				{Term: []byte("ん"), Start: 3, End: 6, Position: 2, Type: analysis.Ideographic},
+				{Term: []byte("に"), Start: 6, End: 9, Position: 3, Type: analysis.Ideographic},
+				{Term: []byte("ち"), Start: 9, End: 12, Position: 4, Type: analysis.Ideographic},
+				{Term: []byte("は"), Start: 12, End: 15, Position: 5, Type: analysis.Ideographic},
+				{Term: []byte("世"), Start: 15, End: 18, Position: 6, Type: analysis.Ideographic},
+				{Term: []byte("界"), Start: 18, End: 21, Position: 7, Type: analysis.Ideographic},
+			},
+		},
+		{
+			input:  []byte(""),
+			output: analysis.TokenStream{},
+		},
+	}
+
+	tokenizer := NewRegexpTokenizer(pattern)
+	for _, tc := range cases {
+		got := tokenizer.Tokenize(tc.input)
+		if reflect.DeepEqual(got, tc.output) {
+			continue
+		}
+		t.Errorf("Expected %v, got %v for %s", tc.output, got, string(tc.input))
+	}
+}
+
+// TestBugProducingEmptyTokens uses a pattern that can match the empty
+// string and checks that only the non-empty matches show up as tokens.
+func TestBugProducingEmptyTokens(t *testing.T) {
+	pattern := regexp.MustCompile(`[0-9a-zA-Z_]*`)
+
+	cases := []struct {
+		input  []byte
+		output analysis.TokenStream
+	}{
+		{
+			input: []byte("Chatha Edwards Sr."),
+			output: analysis.TokenStream{
+				{Term: []byte("Chatha"), Start: 0, End: 6, Position: 1, Type: analysis.AlphaNumeric},
+				{Term: []byte("Edwards"), Start: 7, End: 14, Position: 2, Type: analysis.AlphaNumeric},
+				{Term: []byte("Sr"), Start: 15, End: 17, Position: 3, Type: analysis.AlphaNumeric},
+			},
+		},
+	}
+
+	tokenizer := NewRegexpTokenizer(pattern)
+	for _, tc := range cases {
+		got := tokenizer.Tokenize(tc.input)
+		if reflect.DeepEqual(got, tc.output) {
+			continue
+		}
+		t.Errorf("Expected %v, got %v for %s", tc.output, got, string(tc.input))
+	}
+}