Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>

parent c71cb8b61d
commit 982828099e
783 changed files with 150650 additions and 0 deletions
64  analysis/lang/ckb/analyzer_ckb.go  Normal file
@@ -0,0 +1,64 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
	"github.com/blevesearch/bleve/v2/registry"
)

const AnalyzerName = "ckb"

func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
	unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}
	normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
	if err != nil {
		return nil, err
	}
	toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
	if err != nil {
		return nil, err
	}
	stopCkbFilter, err := cache.TokenFilterNamed(StopName)
	if err != nil {
		return nil, err
	}
	stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
	if err != nil {
		return nil, err
	}
	rv := analysis.DefaultAnalyzer{
		Tokenizer: unicodeTokenizer,
		TokenFilters: []analysis.TokenFilter{
			normCkbFilter,
			toLowerFilter,
			stopCkbFilter,
			stemmerCkbFilter,
		},
	}
	return &rv, nil
}

func init() {
	err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
	if err != nil {
		panic(err)
	}
}
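For reference, here is a minimal sketch of how the registered "ckb" analyzer could be wired into an index from application code. It assumes bleve v2's standard index-mapping API (bleve.NewIndexMapping, bleve.NewMemOnly, bleve.NewMatchQuery); the document ID and field content are hypothetical example data.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
	// blank import runs this package's init(), registering "ckb" globally
	_ "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	// route all text fields through the Sorani analyzer registered above
	mapping := bleve.NewIndexMapping()
	mapping.DefaultAnalyzer = "ckb"

	// in-memory index for the sketch; a real application would persist
	// with bleve.New(path, mapping)
	index, err := bleve.NewMemOnly(mapping)
	if err != nil {
		panic(err)
	}

	// hypothetical document: the analyzer should reduce "پیاوە" to "پیاو"
	if err := index.Index("doc1", map[string]interface{}{"body": "ئەم پیاوە"}); err != nil {
		panic(err)
	}

	// querying for the stem should therefore match the document
	res, err := index.Search(bleve.NewSearchRequest(bleve.NewMatchQuery("پیاو")))
	if err != nil {
		panic(err)
	}
	fmt.Println(res.Total) // expected: 1
}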
77  analysis/lang/ckb/analyzer_ckb_test.go  Normal file
@@ -0,0 +1,77 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

func TestSoraniAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop word removal
		{
			input: []byte("ئەم پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 2,
					Start:    7,
					End:      17,
				},
			},
		},
		{
			input: []byte("پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("پیاو"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
121  analysis/lang/ckb/sorani_normalize.go  Normal file
@@ -0,0 +1,121 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"bytes"
	"unicode"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const NormalizeName = "normalize_ckb"

const (
	Yeh        = '\u064A'
	DotlessYeh = '\u0649'
	FarsiYeh   = '\u06CC'

	Kaf   = '\u0643'
	Keheh = '\u06A9'

	Heh            = '\u0647'
	Ae             = '\u06D5'
	Zwnj           = '\u200C'
	HehDoachashmee = '\u06BE'
	TehMarbuta     = '\u0629'

	Reh       = '\u0631'
	Rreh      = '\u0695'
	RrehAbove = '\u0692'

	Tatweel  = '\u0640'
	Fathatan = '\u064B'
	Dammatan = '\u064C'
	Kasratan = '\u064D'
	Fatha    = '\u064E'
	Damma    = '\u064F'
	Kasra    = '\u0650'
	Shadda   = '\u0651'
	Sukun    = '\u0652'
)

type SoraniNormalizeFilter struct {
}

func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
	return &SoraniNormalizeFilter{}
}

func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		term := normalize(token.Term)
		token.Term = term
	}
	return input
}

func normalize(input []byte) []byte {
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		case Yeh, DotlessYeh:
			runes[i] = FarsiYeh
		case Kaf:
			runes[i] = Keheh
		case Zwnj:
			if i > 0 && runes[i-1] == Heh {
				runes[i-1] = Ae
			}
			runes = analysis.DeleteRune(runes, i)
			i--
		case Heh:
			if i == len(runes)-1 {
				runes[i] = Ae
			}
		case TehMarbuta:
			runes[i] = Ae
		case HehDoachashmee:
			runes[i] = Heh
		case Reh:
			if i == 0 {
				runes[i] = Rreh
			}
		case RrehAbove:
			runes[i] = Rreh
		case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
			runes = analysis.DeleteRune(runes, i)
			i--
		default:
			if unicode.In(runes[i], unicode.Cf) {
				runes = analysis.DeleteRune(runes, i)
				i--
			}
		}
	}
	return analysis.BuildTermFromRunes(runes)
}

func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewSoraniNormalizeFilter(), nil
}

func init() {
	err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
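To make the folding rules in normalize concrete, a small standalone sketch that applies the exported filter directly; the two-rune token below is an illustrative input, not taken from the package's tests.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	filter := ckb.NewSoraniNormalizeFilter()
	// Arabic Kaf (U+0643) folds to Keheh (U+06A9), and a word-final
	// Heh (U+0647) folds to Ae (U+06D5), per the switch cases above
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("\u0643\u0647")},
	}
	out := filter.Filter(in)
	fmt.Printf("% x\n", out[0].Term) // UTF-8 bytes of "\u06A9\u06D5"
}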
323  analysis/lang/ckb/sorani_normalize_test.go  Normal file
@@ -0,0 +1,323 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
)

func TestSoraniNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// test Y
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064A"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0649"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
		},
		// test K
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0643"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06A9"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06A9"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06A9"),
				},
			},
		},
		// test H
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u200C"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06D5"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u200C\u06A9"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06D5\u06A9"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06BE"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0629"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06D5"),
				},
			},
		},
		// test final H
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u0647\u0647"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u0647\u06D5"),
				},
			},
		},
		// test RR
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0692"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0695"),
				},
			},
		},
		// test initial RR
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0631\u0631\u0631"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0695\u0631\u0631"),
				},
			},
		},
		// test remove
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0640"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064B"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064C"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064D"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064E"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064F"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0650"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0651"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0652"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u200C"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		// empty
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
	}

	soraniNormalizeFilter := NewSoraniNormalizeFilter()
	for _, test := range tests {
		actual := soraniNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}
151  analysis/lang/ckb/sorani_stemmer_filter.go  Normal file
@@ -0,0 +1,151 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"bytes"
	"unicode/utf8"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const StemmerName = "stemmer_ckb"

type SoraniStemmerFilter struct {
}

func NewSoraniStemmerFilter() *SoraniStemmerFilter {
	return &SoraniStemmerFilter{}
}

func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		// if not protected keyword, stem it
		if !token.KeyWord {
			stemmed := stem(token.Term)
			token.Term = stemmed
		}
	}
	return input
}

func stem(input []byte) []byte {
	inputLen := utf8.RuneCount(input)

	// postposition
	if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
		input = truncateRunes(input, 2)
		inputLen = utf8.RuneCount(input)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
		input = truncateRunes(input, 1)
		inputLen = utf8.RuneCount(input)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
		input = truncateRunes(input, 3)
		inputLen = utf8.RuneCount(input)
	}

	// possessive pronoun
	if inputLen > 6 &&
		(bytes.HasSuffix(input, []byte("مان")) ||
			bytes.HasSuffix(input, []byte("یان")) ||
			bytes.HasSuffix(input, []byte("تان"))) {
		input = truncateRunes(input, 3)
		inputLen = utf8.RuneCount(input)
	}

	// indefinite singular ezafe
	if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
		return truncateRunes(input, 3)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
		return truncateRunes(input, 4)
	}

	if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
		// indefinite singular
		return truncateRunes(input, 2)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
		// indefinite singular
		return truncateRunes(input, 3)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
		// definite singular
		return truncateRunes(input, 3)
	} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
		// definite singular
		return truncateRunes(input, 2)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
		// definite plural
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
		// definite plural
		return truncateRunes(input, 3)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
		// indefinite plural ezafe
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
		// indefinite plural ezafe
		return truncateRunes(input, 3)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
		// indefinite plural
		return truncateRunes(input, 3)
	} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
		// indefinite plural
		return truncateRunes(input, 2)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
		// demonstrative plural
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
		// demonstrative plural
		return truncateRunes(input, 3)
	} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
		// demonstrative singular
		return truncateRunes(input, 2)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
		// demonstrative singular
		return truncateRunes(input, 1)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
		// absolute singular ezafe
		return truncateRunes(input, 1)
	}
	return input
}

func truncateRunes(input []byte, num int) []byte {
	runes := bytes.Runes(input)
	runes = runes[:len(runes)-num]
	out := buildTermFromRunes(runes)
	return out
}

func buildTermFromRunes(runes []rune) []byte {
	rv := make([]byte, 0, len(runes)*4)
	for _, r := range runes {
		runeBytes := make([]byte, utf8.RuneLen(r))
		utf8.EncodeRune(runeBytes, r)
		rv = append(rv, runeBytes...)
	}
	return rv
}

func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewSoraniStemmerFilter(), nil
}

func init() {
	err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
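A quick illustration of the suffix stripping, using a word that also appears in the tests below (پیاوەکان, definite plural in -ەکان); this is a sketch relying only on the package's exported constructor.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	stemmer := ckb.NewSoraniStemmerFilter()
	// 8 runes ending in the 4-rune definite-plural suffix ەکان, so the
	// inputLen > 7 branch fires and truncateRunes strips it
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("پیاوەکان")},
	}
	out := stemmer.Filter(in)
	fmt.Printf("%s\n", out[0].Term) // پیاو
	// in the full analyzer chain the normalize filter runs first, so
	// variant codepoints are folded before the suffix comparisons
}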
299  analysis/lang/ckb/sorani_stemmer_filter_test.go  Normal file
@@ -0,0 +1,299 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
)

func TestSoraniStemmerFilter(t *testing.T) {

	// in order to match the lucene tests
	// we will test with an analyzer, not just the stemmer
	analyzer := analysis.DefaultAnalyzer{
		Tokenizer: single.NewSingleTokenTokenizer(),
		TokenFilters: []analysis.TokenFilter{
			NewSoraniNormalizeFilter(),
			NewSoraniStemmerFilter(),
		},
	}

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{ // -ek
			input: []byte("پیاوێک"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -yek
			input: []byte("دەرگایەک"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -aka
			input: []byte("پیاوەكە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -ka
			input: []byte("دەرگاكە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -a
			input: []byte("کتاویە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("کتاوی"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -ya
			input: []byte("دەرگایە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -An
			input: []byte("پیاوان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -yAn
			input: []byte("دەرگایان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -akAn
			input: []byte("پیاوەکان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -kAn
			input: []byte("دەرگاکان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -Ana
			input: []byte("پیاوانە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -yAna
			input: []byte("دەرگایانە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{ // Ezafe singular
			input: []byte("هۆتیلی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // Ezafe indefinite
			input: []byte("هۆتیلێکی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // Ezafe plural
			input: []byte("هۆتیلانی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -awa
			input: []byte("دوورەوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دوور"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -dA
			input: []byte("نیوەشەودا"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("نیوەشەو"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{ // -A
			input: []byte("سۆرانا"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("سۆران"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -mAn
			input: []byte("پارەمان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -tAn
			input: []byte("پارەتان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -yAn
			input: []byte("پارەیان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // empty
			input: []byte(""),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte(""),
					Position: 1,
					Start:    0,
					End:      0,
				},
			},
		},
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("for input %s(% x)", test.input, test.input)
			t.Errorf("\texpected:")
			for _, token := range test.output {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
			t.Errorf("\tactual:")
			for _, token := range actual {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
		}
	}
}
36  analysis/lang/ckb/stop_filter_ckb.go  Normal file
@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/stop"
	"github.com/blevesearch/bleve/v2/registry"
)

func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	tokenMap, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop.NewStopTokensFilter(tokenMap), nil
}

func init() {
	err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
	if err != nil {
		panic(err)
	}
}
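The constructor above resolves the "stop_ckb" token map through the registry cache, so the filter can be fetched by name, mirroring the cache pattern used in analyzer_ckb_test.go. A sketch with illustrative tokens:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	stopFilter, err := cache.TokenFilterNamed(ckb.StopName)
	if err != nil {
		panic(err)
	}

	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("و")},    // "and": a stop word
		&analysis.Token{Term: []byte("پیاو")}, // not a stop word
	}
	out := stopFilter.Filter(in)
	fmt.Println(len(out)) // expected: 1, only پیاو survives
}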
163  analysis/lang/ckb/stop_words_ckb.go  Normal file
@@ -0,0 +1,163 @@
package ckb

import (
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const StopName = "stop_ckb"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string

var SoraniStopWords = []byte(`# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
`)

func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	rv := analysis.NewTokenMap()
	err := rv.LoadBytes(SoraniStopWords)
	return rv, err
}

func init() {
	err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
	if err != nil {
		panic(err)
	}
}
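Assuming analysis.TokenMap in bleve v2 is a plain map keyed by term (as its use in TokenMapConstructor suggests), loading the word list and probing membership is direct; the two probe words below are chosen for illustration.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	tm := analysis.NewTokenMap()
	if err := tm.LoadBytes(ckb.SoraniStopWords); err != nil {
		panic(err)
	}
	// "و" (and) is in the list above; "پیاو" (man) is not
	fmt.Println(tm["و"], tm["پیاو"]) // expected: true false
}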