Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
parent c71cb8b61d
commit 982828099e

783 changed files with 150650 additions and 0 deletions
analysis/tokenizer/character/character.go (new file, 76 lines)
@@ -0,0 +1,76 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package character

import (
    "unicode/utf8"

    "github.com/blevesearch/bleve/v2/analysis"
)

type IsTokenRune func(r rune) bool

type CharacterTokenizer struct {
    isTokenRun IsTokenRune
}

func NewCharacterTokenizer(f IsTokenRune) *CharacterTokenizer {
    return &CharacterTokenizer{
        isTokenRun: f,
    }
}

func (c *CharacterTokenizer) Tokenize(input []byte) analysis.TokenStream {

    rv := make(analysis.TokenStream, 0, 1024)

    offset := 0
    start := 0
    end := 0
    count := 0
    for currRune, size := utf8.DecodeRune(input[offset:]); currRune != utf8.RuneError; currRune, size = utf8.DecodeRune(input[offset:]) {
        isToken := c.isTokenRun(currRune)
        if isToken {
            end = offset + size
        } else {
            if end-start > 0 {
                // build token
                rv = append(rv, &analysis.Token{
                    Term:     input[start:end],
                    Start:    start,
                    End:      end,
                    Position: count + 1,
                    Type:     analysis.AlphaNumeric,
                })
                count++
            }
            start = offset + size
            end = start
        }
        offset += size
    }
    // if we ended in the middle of a token, finish it
    if end-start > 0 {
        // build token
        rv = append(rv, &analysis.Token{
            Term:     input[start:end],
            Start:    start,
            End:      end,
            Position: count + 1,
            Type:     analysis.AlphaNumeric,
        })
    }
    return rv
}
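For context on what this new file does: the tokenizer walks the input one rune at a time, extending the current token while the supplied predicate returns true and emitting the accumulated span on the first non-matching rune, with a final flush at end of input. Below is a minimal usage sketch; it is not part of the diff, and the main package and sample string are illustrative assumptions.

package main

import (
    "fmt"
    "unicode"

    "github.com/blevesearch/bleve/v2/analysis/tokenizer/character"
)

func main() {
    // Letters form tokens; every other rune acts as a separator.
    tokenizer := character.NewCharacterTokenizer(unicode.IsLetter)
    for _, token := range tokenizer.Tokenize([]byte("Hello World.")) {
        // Prints: Hello [0,5) pos 1, then World [6,11) pos 2.
        fmt.Printf("%s [%d,%d) pos %d\n", token.Term, token.Start, token.End, token.Position)
    }
}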
analysis/tokenizer/character/character_test.go (new file, 84 lines)
@@ -0,0 +1,84 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package character

import (
    "reflect"
    "testing"
    "unicode"

    "github.com/blevesearch/bleve/v2/analysis"
)

func TestCharacterTokenizer(t *testing.T) {
    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        {
            []byte("Hello World."),
            analysis.TokenStream{
                {
                    Start:    0,
                    End:      5,
                    Term:     []byte("Hello"),
                    Position: 1,
                    Type:     analysis.AlphaNumeric,
                },
                {
                    Start:    6,
                    End:      11,
                    Term:     []byte("World"),
                    Position: 2,
                    Type:     analysis.AlphaNumeric,
                },
            },
        },
        {
            []byte("dominique@mcdiabetes.com"),
            analysis.TokenStream{
                {
                    Start:    0,
                    End:      9,
                    Term:     []byte("dominique"),
                    Position: 1,
                    Type:     analysis.AlphaNumeric,
                },
                {
                    Start:    10,
                    End:      20,
                    Term:     []byte("mcdiabetes"),
                    Position: 2,
                    Type:     analysis.AlphaNumeric,
                },
                {
                    Start:    21,
                    End:      24,
                    Term:     []byte("com"),
                    Position: 3,
                    Type:     analysis.AlphaNumeric,
                },
            },
        },
    }

    tokenizer := NewCharacterTokenizer(unicode.IsLetter)
    for _, test := range tests {
        actual := tokenizer.Tokenize(test.input)
        if !reflect.DeepEqual(actual, test.output) {
            t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
        }
    }
}
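One subtlety in the Tokenize loop above: utf8.DecodeRune returns utf8.RuneError both at end of input (size 0) and for an invalid byte (size 1), so tokenization silently stops at the first invalid UTF-8 sequence. A hedged sketch of a test that would document this behavior follows; the test name and input bytes are illustrative, not part of the upstream file, and it assumes placement alongside the test above (whose imports already cover it).

func TestCharacterTokenizerStopsAtInvalidUTF8(t *testing.T) {
    tokenizer := NewCharacterTokenizer(unicode.IsLetter)
    // 0xff is never valid UTF-8, so the decode loop exits after "abc"
    // and "def" is dropped rather than emitted as a second token.
    actual := tokenizer.Tokenize([]byte("abc\xffdef"))
    if len(actual) != 1 || string(actual[0].Term) != "abc" {
        t.Errorf("expected single token 'abc', got %v", actual)
    }
}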