Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>

parent c71cb8b61d
commit 982828099e
783 changed files with 150650 additions and 0 deletions
64  analysis/lang/ckb/analyzer_ckb.go  Normal file
@@ -0,0 +1,64 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
	"github.com/blevesearch/bleve/v2/registry"
)

const AnalyzerName = "ckb"

func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
	unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}
	normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
	if err != nil {
		return nil, err
	}
	toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
	if err != nil {
		return nil, err
	}
	stopCkbFilter, err := cache.TokenFilterNamed(StopName)
	if err != nil {
		return nil, err
	}
	stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
	if err != nil {
		return nil, err
	}
	rv := analysis.DefaultAnalyzer{
		Tokenizer: unicodeTokenizer,
		TokenFilters: []analysis.TokenFilter{
			normCkbFilter,
			toLowerFilter,
			stopCkbFilter,
			stemmerCkbFilter,
		},
	}
	return &rv, nil
}

func init() {
	err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
	if err != nil {
		panic(err)
	}
}
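For reference, here is a minimal sketch of how the registered "ckb" analyzer could be wired into an index from application code. It assumes bleve v2's standard index-mapping API (bleve.NewIndexMapping, bleve.NewMemOnly, bleve.NewMatchQuery); the document ID and field content are hypothetical example data.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
	// blank import runs this package's init(), registering "ckb" globally
	_ "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	// route all text fields through the Sorani analyzer registered above
	mapping := bleve.NewIndexMapping()
	mapping.DefaultAnalyzer = "ckb"

	// in-memory index for the sketch; a real application would persist
	// with bleve.New(path, mapping)
	index, err := bleve.NewMemOnly(mapping)
	if err != nil {
		panic(err)
	}

	// hypothetical document: the analyzer should reduce "پیاوە" to "پیاو"
	if err := index.Index("doc1", map[string]interface{}{"body": "ئەم پیاوە"}); err != nil {
		panic(err)
	}

	// querying for the stem should therefore match the document
	res, err := index.Search(bleve.NewSearchRequest(bleve.NewMatchQuery("پیاو")))
	if err != nil {
		panic(err)
	}
	fmt.Println(res.Total) // expected: 1
}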
77  analysis/lang/ckb/analyzer_ckb_test.go  Normal file
@@ -0,0 +1,77 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

func TestSoraniAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// stop word removal
		{
			input: []byte("ئەم پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 2,
					Start:    7,
					End:      17,
				},
			},
		},
		{
			input: []byte("پیاوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      10,
				},
			},
		},
		{
			input: []byte("پیاو"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      8,
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}
121  analysis/lang/ckb/sorani_normalize.go  Normal file
@@ -0,0 +1,121 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"bytes"
	"unicode"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const NormalizeName = "normalize_ckb"

const (
	Yeh        = '\u064A'
	DotlessYeh = '\u0649'
	FarsiYeh   = '\u06CC'

	Kaf   = '\u0643'
	Keheh = '\u06A9'

	Heh            = '\u0647'
	Ae             = '\u06D5'
	Zwnj           = '\u200C'
	HehDoachashmee = '\u06BE'
	TehMarbuta     = '\u0629'

	Reh       = '\u0631'
	Rreh      = '\u0695'
	RrehAbove = '\u0692'

	Tatweel  = '\u0640'
	Fathatan = '\u064B'
	Dammatan = '\u064C'
	Kasratan = '\u064D'
	Fatha    = '\u064E'
	Damma    = '\u064F'
	Kasra    = '\u0650'
	Shadda   = '\u0651'
	Sukun    = '\u0652'
)

type SoraniNormalizeFilter struct {
}

func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
	return &SoraniNormalizeFilter{}
}

func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		term := normalize(token.Term)
		token.Term = term
	}
	return input
}

func normalize(input []byte) []byte {
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		case Yeh, DotlessYeh:
			runes[i] = FarsiYeh
		case Kaf:
			runes[i] = Keheh
		case Zwnj:
			if i > 0 && runes[i-1] == Heh {
				runes[i-1] = Ae
			}
			runes = analysis.DeleteRune(runes, i)
			i--
		case Heh:
			if i == len(runes)-1 {
				runes[i] = Ae
			}
		case TehMarbuta:
			runes[i] = Ae
		case HehDoachashmee:
			runes[i] = Heh
		case Reh:
			if i == 0 {
				runes[i] = Rreh
			}
		case RrehAbove:
			runes[i] = Rreh
		case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
			runes = analysis.DeleteRune(runes, i)
			i--
		default:
			if unicode.In(runes[i], unicode.Cf) {
				runes = analysis.DeleteRune(runes, i)
				i--
			}
		}
	}
	return analysis.BuildTermFromRunes(runes)
}

func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewSoraniNormalizeFilter(), nil
}

func init() {
	err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
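To make the folding rules in normalize concrete, a small standalone sketch that applies the exported filter directly; the two-rune token below is an illustrative input, not taken from the package's tests.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	filter := ckb.NewSoraniNormalizeFilter()
	// Arabic Kaf (U+0643) folds to Keheh (U+06A9), and a word-final
	// Heh (U+0647) folds to Ae (U+06D5), per the switch cases above
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("\u0643\u0647")},
	}
	out := filter.Filter(in)
	fmt.Printf("% x\n", out[0].Term) // UTF-8 bytes of "\u06A9\u06D5"
}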
323  analysis/lang/ckb/sorani_normalize_test.go  Normal file
@@ -0,0 +1,323 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
)

func TestSoraniNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// test Y
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064A"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0649"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06CC"),
				},
			},
		},
		// test K
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0643"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06A9"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06A9"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06A9"),
				},
			},
		},
		// test H
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u200C"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06D5"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u200C\u06A9"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06D5\u06A9"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06BE"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647"),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0629"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u06D5"),
				},
			},
		},
		// test final H
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u0647\u0647"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0647\u0647\u06D5"),
				},
			},
		},
		// test RR
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0692"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0695"),
				},
			},
		},
		// test initial RR
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0631\u0631\u0631"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0695\u0631\u0631"),
				},
			},
		},
		// test remove
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0640"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064B"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064C"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064D"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064E"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u064F"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0650"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0651"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u0652"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("\u200C"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
		// empty
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
	}

	soraniNormalizeFilter := NewSoraniNormalizeFilter()
	for _, test := range tests {
		actual := soraniNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}
151  analysis/lang/ckb/sorani_stemmer_filter.go  Normal file
@@ -0,0 +1,151 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"bytes"
	"unicode/utf8"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const StemmerName = "stemmer_ckb"

type SoraniStemmerFilter struct {
}

func NewSoraniStemmerFilter() *SoraniStemmerFilter {
	return &SoraniStemmerFilter{}
}

func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		// if not protected keyword, stem it
		if !token.KeyWord {
			stemmed := stem(token.Term)
			token.Term = stemmed
		}
	}
	return input
}

func stem(input []byte) []byte {
	inputLen := utf8.RuneCount(input)

	// postposition
	if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
		input = truncateRunes(input, 2)
		inputLen = utf8.RuneCount(input)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
		input = truncateRunes(input, 1)
		inputLen = utf8.RuneCount(input)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
		input = truncateRunes(input, 3)
		inputLen = utf8.RuneCount(input)
	}

	// possessive pronoun
	if inputLen > 6 &&
		(bytes.HasSuffix(input, []byte("مان")) ||
			bytes.HasSuffix(input, []byte("یان")) ||
			bytes.HasSuffix(input, []byte("تان"))) {
		input = truncateRunes(input, 3)
		inputLen = utf8.RuneCount(input)
	}

	// indefinite singular ezafe
	if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
		return truncateRunes(input, 3)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
		return truncateRunes(input, 4)
	}

	if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
		// indefinite singular
		return truncateRunes(input, 2)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
		// indefinite singular
		return truncateRunes(input, 3)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
		// definite singular
		return truncateRunes(input, 3)
	} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
		// definite singular
		return truncateRunes(input, 2)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
		// definite plural
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
		// definite plural
		return truncateRunes(input, 3)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
		// indefinite plural ezafe
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
		// indefinite plural ezafe
		return truncateRunes(input, 3)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
		// indefinite plural
		return truncateRunes(input, 3)
	} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
		// indefinite plural
		return truncateRunes(input, 2)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
		// demonstrative plural
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
		// demonstrative plural
		return truncateRunes(input, 3)
	} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
		// demonstrative singular
		return truncateRunes(input, 2)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
		// demonstrative singular
		return truncateRunes(input, 1)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
		// absolute singular ezafe
		return truncateRunes(input, 1)
	}
	return input
}

func truncateRunes(input []byte, num int) []byte {
	runes := bytes.Runes(input)
	runes = runes[:len(runes)-num]
	out := buildTermFromRunes(runes)
	return out
}

func buildTermFromRunes(runes []rune) []byte {
	rv := make([]byte, 0, len(runes)*4)
	for _, r := range runes {
		runeBytes := make([]byte, utf8.RuneLen(r))
		utf8.EncodeRune(runeBytes, r)
		rv = append(rv, runeBytes...)
	}
	return rv
}

func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewSoraniStemmerFilter(), nil
}

func init() {
	err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
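A quick illustration of the suffix stripping, using a word that also appears in the tests below (پیاوەکان, definite plural in -ەکان); this is a sketch relying only on the package's exported constructor.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	stemmer := ckb.NewSoraniStemmerFilter()
	// 8 runes ending in the 4-rune definite-plural suffix ەکان, so the
	// inputLen > 7 branch fires and truncateRunes strips it
	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("پیاوەکان")},
	}
	out := stemmer.Filter(in)
	fmt.Printf("%s\n", out[0].Term) // پیاو
	// in the full analyzer chain the normalize filter runs first, so
	// variant codepoints are folded before the suffix comparisons
}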
299  analysis/lang/ckb/sorani_stemmer_filter_test.go  Normal file
@@ -0,0 +1,299 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
)

func TestSoraniStemmerFilter(t *testing.T) {

	// in order to match the lucene tests
	// we will test with an analyzer, not just the stemmer
	analyzer := analysis.DefaultAnalyzer{
		Tokenizer: single.NewSingleTokenTokenizer(),
		TokenFilters: []analysis.TokenFilter{
			NewSoraniNormalizeFilter(),
			NewSoraniStemmerFilter(),
		},
	}

	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{ // -ek
			input: []byte("پیاوێک"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -yek
			input: []byte("دەرگایەک"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -aka
			input: []byte("پیاوەكە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -ka
			input: []byte("دەرگاكە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -a
			input: []byte("کتاویە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("کتاوی"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -ya
			input: []byte("دەرگایە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -An
			input: []byte("پیاوان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -yAn
			input: []byte("دەرگایان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -akAn
			input: []byte("پیاوەکان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -kAn
			input: []byte("دەرگاکان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -Ana
			input: []byte("پیاوانە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پیاو"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -yAna
			input: []byte("دەرگایانە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دەرگا"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{ // Ezafe singular
			input: []byte("هۆتیلی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // Ezafe indefinite
			input: []byte("هۆتیلێکی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // Ezafe plural
			input: []byte("هۆتیلانی"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("هۆتیل"),
					Position: 1,
					Start:    0,
					End:      16,
				},
			},
		},
		{ // -awa
			input: []byte("دوورەوە"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("دوور"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -dA
			input: []byte("نیوەشەودا"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("نیوەشەو"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{ // -A
			input: []byte("سۆرانا"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("سۆران"),
					Position: 1,
					Start:    0,
					End:      12,
				},
			},
		},
		{ // -mAn
			input: []byte("پارەمان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -tAn
			input: []byte("پارەتان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // -yAn
			input: []byte("پارەیان"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("پارە"),
					Position: 1,
					Start:    0,
					End:      14,
				},
			},
		},
		{ // empty
			input: []byte(""),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte(""),
					Position: 1,
					Start:    0,
					End:      0,
				},
			},
		},
	}

	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("for input %s(% x)", test.input, test.input)
			t.Errorf("\texpected:")
			for _, token := range test.output {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
			t.Errorf("\tactual:")
			for _, token := range actual {
				t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
			}
		}
	}
}
36  analysis/lang/ckb/stop_filter_ckb.go  Normal file
@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ckb

import (
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/stop"
	"github.com/blevesearch/bleve/v2/registry"
)

func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	tokenMap, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop.NewStopTokensFilter(tokenMap), nil
}

func init() {
	err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
	if err != nil {
		panic(err)
	}
}
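The constructor above resolves the "stop_ckb" token map through the registry cache, so the filter can be fetched by name, mirroring the cache pattern used in analyzer_ckb_test.go. A sketch with illustrative tokens:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	stopFilter, err := cache.TokenFilterNamed(ckb.StopName)
	if err != nil {
		panic(err)
	}

	in := analysis.TokenStream{
		&analysis.Token{Term: []byte("و")},    // "and": a stop word
		&analysis.Token{Term: []byte("پیاو")}, // not a stop word
	}
	out := stopFilter.Filter(in)
	fmt.Println(len(out)) // expected: 1, only پیاو survives
}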
163  analysis/lang/ckb/stop_words_ckb.go  Normal file
@@ -0,0 +1,163 @@
package ckb

import (
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

const StopName = "stop_ckb"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string

var SoraniStopWords = []byte(`# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc

# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر

# demonstratives
# that
ئەو
# this
ئەم

# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان

# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
`)

func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	rv := analysis.NewTokenMap()
	err := rv.LoadBytes(SoraniStopWords)
	return rv, err
}

func init() {
	err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
	if err != nil {
		panic(err)
	}
}
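Assuming analysis.TokenMap in bleve v2 is a plain map keyed by term (as its use in TokenMapConstructor suggests), loading the word list and probing membership is direct; the two probe words below are chosen for illustration.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	ckb "github.com/blevesearch/bleve/v2/analysis/lang/ckb"
)

func main() {
	tm := analysis.NewTokenMap()
	if err := tm.LoadBytes(ckb.SoraniStopWords); err != nil {
		panic(err)
	}
	// "و" (and) is in the list above; "پیاو" (man) is not
	fmt.Println(tm["و"], tm["پیاو"]) // expected: true false
}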