Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
68
analysis/lang/ar/analyzer_ar.go
Normal file
68
analysis/lang/ar/analyzer_ar.go
Normal file
|
@ -0,0 +1,68 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
// AnalyzerName is the name under which the Arabic analyzer is registered.
const AnalyzerName = "ar"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeFilter := unicodenorm.MustNewUnicodeNormalizeFilter(unicodenorm.NFKC)
|
||||
stopArFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeArFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerArFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
normalizeFilter,
|
||||
stopArFilter,
|
||||
normalizeArFilter,
|
||||
stemmerArFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
184
analysis/lang/ar/analyzer_ar_test.go
Normal file
184
analysis/lang/ar/analyzer_ar_test.go
Normal file
|
@ -0,0 +1,184 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestArabicAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("كبير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كبير"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
// feminine marker
|
||||
{
|
||||
input: []byte("كبيرة"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كبير"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("مشروب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مشروب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
// plural -at
|
||||
{
|
||||
input: []byte("مشروبات"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مشروب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
// plural -in
|
||||
{
|
||||
input: []byte("أمريكيين"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("امريك"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
// singular with bare alif
|
||||
{
|
||||
input: []byte("امريكي"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("امريك"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("كتاب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتاب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
// definite article
|
||||
{
|
||||
input: []byte("الكتاب"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كتاب"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("ما ملكت أيمانكم"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ملكت"),
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ايمانكم"),
|
||||
Position: 3,
|
||||
Start: 14,
|
||||
End: 28,
|
||||
},
|
||||
},
|
||||
},
|
||||
// stopwords
|
||||
{
|
||||
input: []byte("الذين ملكت أيمانكم"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ملكت"),
|
||||
Position: 2,
|
||||
Start: 11,
|
||||
End: 19,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ايمانكم"),
|
||||
Position: 3,
|
||||
Start: 20,
|
||||
End: 34,
|
||||
},
|
||||
},
|
||||
},
|
||||
// presentation form normalization
|
||||
{
|
||||
input: []byte("ﺍﻟﺴﻼﻢ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 15,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
88
analysis/lang/ar/arabic_normalize.go
Normal file
88
analysis/lang/ar/arabic_normalize.go
Normal file
|
@ -0,0 +1,88 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// NormalizeName is the name under which the Arabic normalization token
// filter is registered.
const NormalizeName = "normalize_ar"
|
||||
|
||||
// Unicode code points used by the Arabic normalizer, named after the
// characters they represent. The final group (Tatweel through Sukun) are
// the elongation mark and harakat diacritics that normalization removes.
const (
	Alef           = '\u0627'
	AlefMadda      = '\u0622'
	AlefHamzaAbove = '\u0623'
	AlefHamzaBelow = '\u0625'
	Yeh            = '\u064A'
	DotlessYeh     = '\u0649'
	TehMarbuta     = '\u0629'
	Heh            = '\u0647'
	Tatweel        = '\u0640'
	Fathatan       = '\u064B'
	Dammatan       = '\u064C'
	Kasratan       = '\u064D'
	Fatha          = '\u064E'
	Damma          = '\u064F'
	Kasra          = '\u0650'
	Shadda         = '\u0651'
	Sukun          = '\u0652'
)
|
||||
|
||||
type ArabicNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
|
||||
return &ArabicNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
|
||||
runes[i] = Alef
|
||||
case DotlessYeh:
|
||||
runes[i] = Yeh
|
||||
case TehMarbuta:
|
||||
runes[i] = Heh
|
||||
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewArabicNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
234
analysis/lang/ar/arabic_normalize_test.go
Normal file
234
analysis/lang/ar/arabic_normalize_test.go
Normal file
|
@ -0,0 +1,234 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestArabicNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// AlifMadda
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("آجن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اجن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifHamzaAbove
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("أحمد"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("احمد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifHamzaBelow
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("إعاذ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اعاذ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AlifMaksura
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بنى"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بني"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// TehMarbuta
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فاطمة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فاطمه"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Tatweel
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرـــــت"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرت"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Fatha
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مَبنا"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("مبنا"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Kasra
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("علِي"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("علي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Damma
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بُوات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بوات"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Fathatan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولداً"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدا"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Kasratan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدٍ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Dammatan
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولدٌ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ولد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Sukun
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نلْسون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نلسون"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Shaddah
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هتميّ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هتمي"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
arabicNormalizeFilter := NewArabicNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := arabicNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
121
analysis/lang/ar/stemmer_ar.go
Normal file
121
analysis/lang/ar/stemmer_ar.go
Normal file
|
@ -0,0 +1,121 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StemmerName is the name under which the Arabic light stemmer filter is
// registered.
const StemmerName = "stemmer_ar"

// Affix tables obtained from org.apache.lucene.analysis.ar.ArabicStemmer.

// prefixes are tried in order; at most one is stripped per term (see stem).
var prefixes = [][]rune{
	[]rune("ال"),
	[]rune("وال"),
	[]rune("بال"),
	[]rune("كال"),
	[]rune("فال"),
	[]rune("لل"),
	[]rune("و"),
}

// suffixes are tried in order; every one that still matches is stripped.
var suffixes = [][]rune{
	[]rune("ها"),
	[]rune("ان"),
	[]rune("ات"),
	[]rune("ون"),
	[]rune("ين"),
	[]rune("يه"),
	[]rune("ية"),
	[]rune("ه"),
	[]rune("ة"),
	[]rune("ي"),
}
|
||||
|
||||
type ArabicStemmerFilter struct{}
|
||||
|
||||
func NewArabicStemmerFilter() *ArabicStemmerFilter {
|
||||
return &ArabicStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := stem(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
// canStemPrefix reports whether prefix may be stripped from input: the
// input must start with prefix, stripping must leave at least 2 runes, and
// the single-rune wa- prefix is only stripped from terms of 4+ runes.
func canStemPrefix(input, prefix []rune) bool {
	// Wa- prefix requires at least 3 characters.
	if len(prefix) == 1 && len(input) < 4 {
		return false
	}
	// Other prefixes require only 2.
	if len(input)-len(prefix) < 2 {
		return false
	}
	for i, r := range prefix {
		if input[i] != r {
			return false
		}
	}
	return true
}
|
||||
|
||||
// canStemSuffix reports whether suffix may be stripped from input: the
// input must end with suffix and at least 2 runes must remain afterwards.
func canStemSuffix(input, suffix []rune) bool {
	// All suffixes require at least 2 characters after stemming.
	stemEnd := len(input) - len(suffix)
	if stemEnd < 2 {
		return false
	}
	for i, r := range suffix {
		if input[stemEnd+i] != r {
			return false
		}
	}
	return true
}
|
||||
|
||||
func stem(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
// Strip a single prefix.
|
||||
for _, p := range prefixes {
|
||||
if canStemPrefix(runes, p) {
|
||||
runes = runes[len(p):]
|
||||
break
|
||||
}
|
||||
}
|
||||
// Strip off multiple suffixes, in their order in the suffixes array.
|
||||
for _, s := range suffixes {
|
||||
if canStemSuffix(runes, s) {
|
||||
runes = runes[:len(runes)-len(s)]
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewArabicStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
397
analysis/lang/ar/stemmer_ar_test.go
Normal file
397
analysis/lang/ar/stemmer_ar_test.go
Normal file
|
@ -0,0 +1,397 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestArabicStemmerFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// AlPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("والحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// BalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// KalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("كالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// FalPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("فالحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// LlPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("للاخر"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("اخر"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WaPrefix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وحسن"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("حسن"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AhSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زوجها"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("زوج"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدان"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// AtSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// WnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YnSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدين"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YhSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهديه"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YpSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدية"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// HSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهده"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// PSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// YSuffix
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدي"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ComboPrefSuf
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وساهدون"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// ComboSuf
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهدهات"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ساهد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Shouldn't Stem
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الو"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الو"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// NonArabic
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("English"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("English"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("السلام"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلامة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("السلامة"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سلام"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("الوصل"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("وصل"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("والصل"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("صل"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
arabicStemmerFilter := NewArabicStemmerFilter()
|
||||
for _, test := range tests {
|
||||
actual := arabicStemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/ar/stop_filter_ar.go
Normal file
36
analysis/lang/ar/stop_filter_ar.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ar
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
152
analysis/lang/ar/stop_words_ar.go
Normal file
152
analysis/lang/ar/stop_words_ar.go
Normal file
|
@ -0,0 +1,152 @@
|
|||
package ar
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ar"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
|
||||
# This means that when modifying this list, you might need to add some
|
||||
# redundant entries, for example containing forms with both أ and ا
|
||||
من
|
||||
ومن
|
||||
منها
|
||||
منه
|
||||
في
|
||||
وفي
|
||||
فيها
|
||||
فيه
|
||||
و
|
||||
ف
|
||||
ثم
|
||||
او
|
||||
أو
|
||||
ب
|
||||
بها
|
||||
به
|
||||
ا
|
||||
أ
|
||||
اى
|
||||
اي
|
||||
أي
|
||||
أى
|
||||
لا
|
||||
ولا
|
||||
الا
|
||||
ألا
|
||||
إلا
|
||||
لكن
|
||||
ما
|
||||
وما
|
||||
كما
|
||||
فما
|
||||
عن
|
||||
مع
|
||||
اذا
|
||||
إذا
|
||||
ان
|
||||
أن
|
||||
إن
|
||||
انها
|
||||
أنها
|
||||
إنها
|
||||
انه
|
||||
أنه
|
||||
إنه
|
||||
بان
|
||||
بأن
|
||||
فان
|
||||
فأن
|
||||
وان
|
||||
وأن
|
||||
وإن
|
||||
التى
|
||||
التي
|
||||
الذى
|
||||
الذي
|
||||
الذين
|
||||
الى
|
||||
الي
|
||||
إلى
|
||||
إلي
|
||||
على
|
||||
عليها
|
||||
عليه
|
||||
اما
|
||||
أما
|
||||
إما
|
||||
ايضا
|
||||
أيضا
|
||||
كل
|
||||
وكل
|
||||
لم
|
||||
ولم
|
||||
لن
|
||||
ولن
|
||||
هى
|
||||
هي
|
||||
هو
|
||||
وهى
|
||||
وهي
|
||||
وهو
|
||||
فهى
|
||||
فهي
|
||||
فهو
|
||||
انت
|
||||
أنت
|
||||
لك
|
||||
لها
|
||||
له
|
||||
هذه
|
||||
هذا
|
||||
تلك
|
||||
ذلك
|
||||
هناك
|
||||
كانت
|
||||
كان
|
||||
يكون
|
||||
تكون
|
||||
وكانت
|
||||
وكان
|
||||
غير
|
||||
بعض
|
||||
قد
|
||||
نحو
|
||||
بين
|
||||
بينما
|
||||
منذ
|
||||
ضمن
|
||||
حيث
|
||||
الان
|
||||
الآن
|
||||
خلال
|
||||
بعد
|
||||
قبل
|
||||
حتى
|
||||
عند
|
||||
عندما
|
||||
لدى
|
||||
جميع
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(ArabicStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/bg/stop_filter_bg.go
Normal file
36
analysis/lang/bg/stop_filter_bg.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package bg
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
220
analysis/lang/bg/stop_words_bg.go
Normal file
220
analysis/lang/bg/stop_words_bg.go
Normal file
|
@ -0,0 +1,220 @@
|
|||
package bg
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_bg"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
а
|
||||
аз
|
||||
ако
|
||||
ала
|
||||
бе
|
||||
без
|
||||
беше
|
||||
би
|
||||
бил
|
||||
била
|
||||
били
|
||||
било
|
||||
близо
|
||||
бъдат
|
||||
бъде
|
||||
бяха
|
||||
в
|
||||
вас
|
||||
ваш
|
||||
ваша
|
||||
вероятно
|
||||
вече
|
||||
взема
|
||||
ви
|
||||
вие
|
||||
винаги
|
||||
все
|
||||
всеки
|
||||
всички
|
||||
всичко
|
||||
всяка
|
||||
във
|
||||
въпреки
|
||||
върху
|
||||
г
|
||||
ги
|
||||
главно
|
||||
го
|
||||
д
|
||||
да
|
||||
дали
|
||||
до
|
||||
докато
|
||||
докога
|
||||
дори
|
||||
досега
|
||||
доста
|
||||
е
|
||||
едва
|
||||
един
|
||||
ето
|
||||
за
|
||||
зад
|
||||
заедно
|
||||
заради
|
||||
засега
|
||||
затова
|
||||
защо
|
||||
защото
|
||||
и
|
||||
из
|
||||
или
|
||||
им
|
||||
има
|
||||
имат
|
||||
иска
|
||||
й
|
||||
каза
|
||||
как
|
||||
каква
|
||||
какво
|
||||
както
|
||||
какъв
|
||||
като
|
||||
кога
|
||||
когато
|
||||
което
|
||||
които
|
||||
кой
|
||||
който
|
||||
колко
|
||||
която
|
||||
къде
|
||||
където
|
||||
към
|
||||
ли
|
||||
м
|
||||
ме
|
||||
между
|
||||
мен
|
||||
ми
|
||||
мнозина
|
||||
мога
|
||||
могат
|
||||
може
|
||||
моля
|
||||
момента
|
||||
му
|
||||
н
|
||||
на
|
||||
над
|
||||
назад
|
||||
най
|
||||
направи
|
||||
напред
|
||||
например
|
||||
нас
|
||||
не
|
||||
него
|
||||
нея
|
||||
ни
|
||||
ние
|
||||
никой
|
||||
нито
|
||||
но
|
||||
някои
|
||||
някой
|
||||
няма
|
||||
обаче
|
||||
около
|
||||
освен
|
||||
особено
|
||||
от
|
||||
отгоре
|
||||
отново
|
||||
още
|
||||
пак
|
||||
по
|
||||
повече
|
||||
повечето
|
||||
под
|
||||
поне
|
||||
поради
|
||||
после
|
||||
почти
|
||||
прави
|
||||
пред
|
||||
преди
|
||||
през
|
||||
при
|
||||
пък
|
||||
първо
|
||||
с
|
||||
са
|
||||
само
|
||||
се
|
||||
сега
|
||||
си
|
||||
скоро
|
||||
след
|
||||
сме
|
||||
според
|
||||
сред
|
||||
срещу
|
||||
сте
|
||||
съм
|
||||
със
|
||||
също
|
||||
т
|
||||
тази
|
||||
така
|
||||
такива
|
||||
такъв
|
||||
там
|
||||
твой
|
||||
те
|
||||
тези
|
||||
ти
|
||||
тн
|
||||
то
|
||||
това
|
||||
тогава
|
||||
този
|
||||
той
|
||||
толкова
|
||||
точно
|
||||
трябва
|
||||
тук
|
||||
тъй
|
||||
тя
|
||||
тях
|
||||
у
|
||||
харесва
|
||||
ч
|
||||
че
|
||||
често
|
||||
чрез
|
||||
ще
|
||||
щом
|
||||
я
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(BulgarianStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
33
analysis/lang/ca/articles_ca.go
Normal file
33
analysis/lang/ca/articles_ca.go
Normal file
|
@ -0,0 +1,33 @@
|
|||
package ca
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const ArticlesName = "articles_ca"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||
|
||||
// CatalanArticles lists the single-letter Catalan contracted articles
// (the part before an apostrophe, e.g. the "l" in "l'Institut") that
// the elision filter strips from tokens.
var CatalanArticles = []byte(`
d
l
m
n
s
t
`)
|
||||
|
||||
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(CatalanArticles)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
40
analysis/lang/ca/elision_ca.go
Normal file
40
analysis/lang/ca/elision_ca.go
Normal file
|
@ -0,0 +1,40 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ca
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/elision"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const ElisionName = "elision_ca"
|
||||
|
||||
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building elision filter: %v", err)
|
||||
}
|
||||
return elision.NewElisionFilter(articlesTokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
61
analysis/lang/ca/elision_ca_test.go
Normal file
61
analysis/lang/ca/elision_ca_test.go
Normal file
|
@ -0,0 +1,61 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ca
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestFrenchElision(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("l'Institut"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("d'Estudis"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Institut"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("Estudis"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := elisionFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/ca/stop_filter_ca.go
Normal file
36
analysis/lang/ca/stop_filter_ca.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ca
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
247
analysis/lang/ca/stop_words_ca.go
Normal file
247
analysis/lang/ca/stop_words_ca.go
Normal file
|
@ -0,0 +1,247 @@
|
|||
package ca
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ca"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var CatalanStopWords = []byte(`# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
|
||||
a
|
||||
abans
|
||||
ací
|
||||
ah
|
||||
així
|
||||
això
|
||||
al
|
||||
als
|
||||
aleshores
|
||||
algun
|
||||
alguna
|
||||
algunes
|
||||
alguns
|
||||
alhora
|
||||
allà
|
||||
allí
|
||||
allò
|
||||
altra
|
||||
altre
|
||||
altres
|
||||
amb
|
||||
ambdós
|
||||
ambdues
|
||||
apa
|
||||
aquell
|
||||
aquella
|
||||
aquelles
|
||||
aquells
|
||||
aquest
|
||||
aquesta
|
||||
aquestes
|
||||
aquests
|
||||
aquí
|
||||
baix
|
||||
cada
|
||||
cadascú
|
||||
cadascuna
|
||||
cadascunes
|
||||
cadascuns
|
||||
com
|
||||
contra
|
||||
d'un
|
||||
d'una
|
||||
d'unes
|
||||
d'uns
|
||||
dalt
|
||||
de
|
||||
del
|
||||
dels
|
||||
des
|
||||
després
|
||||
dins
|
||||
dintre
|
||||
donat
|
||||
doncs
|
||||
durant
|
||||
e
|
||||
eh
|
||||
el
|
||||
els
|
||||
em
|
||||
en
|
||||
encara
|
||||
ens
|
||||
entre
|
||||
érem
|
||||
eren
|
||||
éreu
|
||||
es
|
||||
és
|
||||
esta
|
||||
està
|
||||
estàvem
|
||||
estaven
|
||||
estàveu
|
||||
esteu
|
||||
et
|
||||
etc
|
||||
ets
|
||||
fins
|
||||
fora
|
||||
gairebé
|
||||
ha
|
||||
han
|
||||
has
|
||||
havia
|
||||
he
|
||||
hem
|
||||
heu
|
||||
hi
|
||||
ho
|
||||
i
|
||||
igual
|
||||
iguals
|
||||
ja
|
||||
l'hi
|
||||
la
|
||||
les
|
||||
li
|
||||
li'n
|
||||
llavors
|
||||
m'he
|
||||
ma
|
||||
mal
|
||||
malgrat
|
||||
mateix
|
||||
mateixa
|
||||
mateixes
|
||||
mateixos
|
||||
me
|
||||
mentre
|
||||
més
|
||||
meu
|
||||
meus
|
||||
meva
|
||||
meves
|
||||
molt
|
||||
molta
|
||||
moltes
|
||||
molts
|
||||
mon
|
||||
mons
|
||||
n'he
|
||||
n'hi
|
||||
ne
|
||||
ni
|
||||
no
|
||||
nogensmenys
|
||||
només
|
||||
nosaltres
|
||||
nostra
|
||||
nostre
|
||||
nostres
|
||||
o
|
||||
oh
|
||||
oi
|
||||
on
|
||||
pas
|
||||
pel
|
||||
pels
|
||||
per
|
||||
però
|
||||
perquè
|
||||
poc
|
||||
poca
|
||||
pocs
|
||||
poques
|
||||
potser
|
||||
propi
|
||||
qual
|
||||
quals
|
||||
quan
|
||||
quant
|
||||
que
|
||||
què
|
||||
quelcom
|
||||
qui
|
||||
quin
|
||||
quina
|
||||
quines
|
||||
quins
|
||||
s'ha
|
||||
s'han
|
||||
sa
|
||||
semblant
|
||||
semblants
|
||||
ses
|
||||
seu
|
||||
seus
|
||||
seva
|
||||
seva
|
||||
seves
|
||||
si
|
||||
sobre
|
||||
sobretot
|
||||
sóc
|
||||
solament
|
||||
sols
|
||||
son
|
||||
són
|
||||
sons
|
||||
sota
|
||||
sou
|
||||
t'ha
|
||||
t'han
|
||||
t'he
|
||||
ta
|
||||
tal
|
||||
també
|
||||
tampoc
|
||||
tan
|
||||
tant
|
||||
tanta
|
||||
tantes
|
||||
teu
|
||||
teus
|
||||
teva
|
||||
teves
|
||||
ton
|
||||
tons
|
||||
tot
|
||||
tota
|
||||
totes
|
||||
tots
|
||||
un
|
||||
una
|
||||
unes
|
||||
uns
|
||||
us
|
||||
va
|
||||
vaig
|
||||
vam
|
||||
van
|
||||
vas
|
||||
veu
|
||||
vosaltres
|
||||
vostra
|
||||
vostre
|
||||
vostres
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(CatalanStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
60
analysis/lang/cjk/analyzer_cjk.go
Normal file
60
analysis/lang/cjk/analyzer_cjk.go
Normal file
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "cjk"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
widthFilter, err := cache.TokenFilterNamed(WidthName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
bigramFilter, err := cache.TokenFilterNamed(BigramName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
widthFilter,
|
||||
toLowerFilter,
|
||||
bigramFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
642
analysis/lang/cjk/analyzer_cjk_test.go
Normal file
642
analysis/lang/cjk/analyzer_cjk_test.go
Normal file
|
@ -0,0 +1,642 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestCJKAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("こんにちは世界"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一二三四五六七八九十"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一二"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("二三"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("三四"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("四五"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("五六"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("六七"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("七八"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("八九"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("九十"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 二三四 五六七八九 十"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("二三"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("三四"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 7,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("五六"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 14,
|
||||
End: 20,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("六七"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 17,
|
||||
End: 23,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("七八"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 20,
|
||||
End: 26,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("八九"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 23,
|
||||
End: 29,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("十"),
|
||||
Type: analysis.Single,
|
||||
Position: 8,
|
||||
Start: 30,
|
||||
End: 33,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("abc defgh ijklmn opqrstu vwxy z"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("defgh"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ijklmn"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 10,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("opqrstu"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 4,
|
||||
Start: 17,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("vwxy"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 5,
|
||||
Start: 25,
|
||||
End: 29,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("z"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 30,
|
||||
End: 31,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あい"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あい "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("test"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("test "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいtest"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 6,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("testあい "),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいうえおabcかきくけこ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("いう"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("うえ"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("えお"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("abc"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 5,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("かき"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("きく"),
|
||||
Type: analysis.Double,
|
||||
Position: 7,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("くけ"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("けこ"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 27,
|
||||
End: 33,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("あいうえおabんcかきくけ こ"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("あい"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("いう"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("うえ"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("えお"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ab"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 5,
|
||||
Start: 15,
|
||||
End: 17,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 6,
|
||||
Start: 17,
|
||||
End: 20,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("c"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 7,
|
||||
Start: 20,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("かき"),
|
||||
Type: analysis.Double,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 27,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("きく"),
|
||||
Type: analysis.Double,
|
||||
Position: 9,
|
||||
Start: 24,
|
||||
End: 30,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("くけ"),
|
||||
Type: analysis.Double,
|
||||
Position: 10,
|
||||
Start: 27,
|
||||
End: 33,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 11,
|
||||
Start: 34,
|
||||
End: 37,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 روبرت موير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("روبرت"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 14,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("موير"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 15,
|
||||
End: 23,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一 رُوبرت موير"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("رُوبرت"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("موير"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 3,
|
||||
Start: 17,
|
||||
End: 25,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("𩬅艱鍟䇹愯瀛"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("𩬅艱"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("艱鍟"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 4,
|
||||
End: 10,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("鍟䇹"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 7,
|
||||
End: 13,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("䇹愯"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 10,
|
||||
End: 16,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("愯瀛"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 13,
|
||||
End: 19,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("一丁丂"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("一丁"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("丁丂"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
for _, test := range tests {
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCJKAnalyzer(b *testing.B) {
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
analyzer.Analyze(bleveWikiArticleJapanese)
|
||||
}
|
||||
}
|
||||
|
||||
// bleveWikiArticleJapanese is a Japanese-language text fixture
// (a Wikipedia-style article) used as the input for
// BenchmarkCJKAnalyzer.
var bleveWikiArticleJapanese = []byte(`加圧容器に貯蔵されている液体物質は、その時の気液平衡状態にあるが、火災により容器が加熱されていると容器内の液体は、その物質の大気圧のもとでの沸点より十分に高い温度まで加熱され、圧力も高くなる。この状態で容器が破裂すると容器内部の圧力は瞬間的に大気圧にまで低下する。
この時に容器内の平衡状態が破られ、液体は突沸し、気体になることで爆発現象を起こす。液化石油ガスなどでは、さらに拡散して空気と混ざったガスが自由空間蒸気雲爆発を起こす。液化石油ガスなどの常温常圧で気体になる物を高い圧力で液化して収納している容器、あるいは、そのような液体を輸送するためのパイプラインや配管などが火災などによって破壊されたときに起きる。
ブリーブという現象が明らかになったのは、フランス・リヨンの郊外にあるフェザンという町のフェザン製油所(ウニオン・ド・ゼネラル・ド・ペトロール)で大規模な爆発火災事故が発生したときだと言われている。
中身の液体が高温高圧の水である場合には「水蒸気爆発」と呼ばれる。`)
|
210
analysis/lang/cjk/cjk_bigram.go
Normal file
210
analysis/lang/cjk/cjk_bigram.go
Normal file
|
@ -0,0 +1,210 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"container/ring"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// BigramName is the registry name of this token filter.
const BigramName = "cjk_bigram"

// CJKBigramFilter combines runs of adjacent ideographic tokens into
// overlapping bigrams, optionally also emitting the constituent unigrams.
type CJKBigramFilter struct {
	// outputUnigram, when true, causes each single-rune token to be emitted
	// in addition to the bigrams formed from it
	outputUnigram bool
}

// NewCJKBigramFilter returns a CJKBigramFilter; outputUnigram controls
// whether unigrams are emitted alongside the bigrams.
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
	return &CJKBigramFilter{
		outputUnigram: outputUnigram,
	}
}
|
||||
|
||||
// Filter splits each ideographic token into its runes and emits overlapping
// bigrams (Double tokens) for adjacent runes, optionally preceded by the
// corresponding unigrams (Single tokens). Non-ideographic tokens pass
// through unchanged. A two-slot ring buffers the most recent single-rune
// tokens so each adjacent pair can be joined.
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	r := ring.New(2)
	itemsInRing := 0
	pos := 1       // position counter for the intermediate single-rune tokens
	outputPos := 1 // position counter for tokens actually emitted

	rv := make(analysis.TokenStream, 0, len(input))

	for _, tokout := range input {
		if tokout.Type == analysis.Ideographic {
			runes := bytes.Runes(tokout.Term)
			sofar := 0 // byte offset of the current rune within tokout.Term
			for _, run := range runes {
				rlen := utf8.RuneLen(run)
				// build a single-rune token spanning exactly this rune's bytes
				token := &analysis.Token{
					Term:     tokout.Term[sofar : sofar+rlen],
					Start:    tokout.Start + sofar,
					End:      tokout.Start + sofar + rlen,
					Position: pos,
					Type:     tokout.Type,
					KeyWord:  tokout.KeyWord,
				}
				pos++
				sofar += rlen
				if itemsInRing > 0 {
					// if items already buffered
					// check to see if this is aligned
					curr := r.Value.(*analysis.Token)
					if token.Start-curr.End != 0 {
						// not aligned flush
						flushToken := s.flush(r, &itemsInRing, outputPos)
						if flushToken != nil {
							outputPos++
							rv = append(rv, flushToken)
						}
					}
				}
				// now we can add this token to the buffer
				r = r.Next()
				r.Value = token
				if itemsInRing < 2 {
					itemsInRing++
				}
				builtUnigram := false
				if itemsInRing > 1 && s.outputUnigram {
					unigram := s.buildUnigram(r, &itemsInRing, outputPos)
					if unigram != nil {
						builtUnigram = true
						rv = append(rv, unigram)
					}
				}
				bigramToken := s.outputBigram(r, &itemsInRing, outputPos)
				if bigramToken != nil {
					rv = append(rv, bigramToken)
					outputPos++
				}

				// prev token should be removed if unigram was built
				if builtUnigram {
					itemsInRing--
				}
			}

		} else {
			// flush anything already buffered
			flushToken := s.flush(r, &itemsInRing, outputPos)
			if flushToken != nil {
				rv = append(rv, flushToken)
				outputPos++
			}
			// output this token as is
			tokout.Position = outputPos
			rv = append(rv, tokout)
			outputPos++
		}
	}

	// deal with possible trailing unigram
	// (buildUnigram returns nil for an empty ring, so the extra
	// s.outputUnigram condition is harmless when nothing is buffered)
	if itemsInRing == 1 || s.outputUnigram {
		if itemsInRing == 2 {
			r = r.Next()
		}
		unigram := s.buildUnigram(r, &itemsInRing, outputPos)
		if unigram != nil {
			rv = append(rv, unigram)
		}
	}
	return rv
}
|
||||
|
||||
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
|
||||
var rv *analysis.Token
|
||||
if *itemsInRing == 1 {
|
||||
rv = s.buildUnigram(r, itemsInRing, pos)
|
||||
}
|
||||
r.Value = nil
|
||||
*itemsInRing = 0
|
||||
|
||||
return rv
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
|
||||
if *itemsInRing == 2 {
|
||||
thisShingleRing := r.Move(-1)
|
||||
shingledBytes := make([]byte, 0)
|
||||
|
||||
// do first token
|
||||
prev := thisShingleRing.Value.(*analysis.Token)
|
||||
shingledBytes = append(shingledBytes, prev.Term...)
|
||||
|
||||
// do second token
|
||||
thisShingleRing = thisShingleRing.Next()
|
||||
curr := thisShingleRing.Value.(*analysis.Token)
|
||||
shingledBytes = append(shingledBytes, curr.Term...)
|
||||
|
||||
token := analysis.Token{
|
||||
Type: analysis.Double,
|
||||
Term: shingledBytes,
|
||||
Position: pos,
|
||||
Start: prev.Start,
|
||||
End: curr.End,
|
||||
}
|
||||
return &token
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
|
||||
switch *itemsInRing {
|
||||
case 2:
|
||||
thisShingleRing := r.Move(-1)
|
||||
// do first token
|
||||
prev := thisShingleRing.Value.(*analysis.Token)
|
||||
token := analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: prev.Term,
|
||||
Position: pos,
|
||||
Start: prev.Start,
|
||||
End: prev.End,
|
||||
}
|
||||
return &token
|
||||
case 1:
|
||||
// do first token
|
||||
prev := r.Value.(*analysis.Token)
|
||||
token := analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: prev.Term,
|
||||
Position: pos,
|
||||
Start: prev.Start,
|
||||
End: prev.End,
|
||||
}
|
||||
return &token
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
outputUnigram := false
|
||||
outVal, ok := config["output_unigram"].(bool)
|
||||
if ok {
|
||||
outputUnigram = outVal
|
||||
}
|
||||
return NewCJKBigramFilter(outputUnigram), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
848
analysis/lang/cjk/cjk_bigram_test.go
Normal file
848
analysis/lang/cjk/cjk_bigram_test.go
Normal file
|
@ -0,0 +1,848 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"container/ring"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
// Helper function to create a token
|
||||
func makeToken(term string, start, end, pos int) *analysis.Token {
|
||||
return &analysis.Token{
|
||||
Term: []byte(term),
|
||||
Start: start,
|
||||
End: end,
|
||||
Position: pos, // Note: buildUnigram uses the 'pos' argument, not the token's original pos
|
||||
Type: analysis.Ideographic,
|
||||
}
|
||||
}
|
||||
|
||||
func TestCJKBigramFilter_buildUnigram(t *testing.T) {
|
||||
filter := NewCJKBigramFilter(false)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
ringSetup func() (*ring.Ring, int) // Function to set up the ring and itemsInRing
|
||||
inputPos int // Position to pass to buildUnigram
|
||||
expectToken *analysis.Token
|
||||
}{
|
||||
{
|
||||
name: "itemsInRing == 2",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("一", 0, 3, 1) // Original pos 1
|
||||
token2 := makeToken("二", 3, 6, 2) // Original pos 2
|
||||
r.Value = token1
|
||||
r = r.Next()
|
||||
r.Value = token2
|
||||
// r currently points to token2, r.Move(-1) points to token1
|
||||
return r, 2
|
||||
},
|
||||
inputPos: 10, // Expected output position
|
||||
expectToken: &analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: []byte("一"),
|
||||
Position: 10, // Should use inputPos
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "itemsInRing == 1 (ring points to the single item)",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("三", 6, 9, 3)
|
||||
r.Value = token1
|
||||
// r points to token1
|
||||
return r, 1
|
||||
},
|
||||
inputPos: 11,
|
||||
expectToken: &analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: []byte("三"),
|
||||
Position: 11, // Should use inputPos
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "itemsInRing == 1 (ring points to nil, next is the single item)",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("四", 9, 12, 4)
|
||||
r = r.Next() // r points to nil initially
|
||||
r.Value = token1
|
||||
// r points to token1
|
||||
return r, 1
|
||||
},
|
||||
inputPos: 12,
|
||||
expectToken: &analysis.Token{
|
||||
Type: analysis.Single,
|
||||
Term: []byte("四"),
|
||||
Position: 12, // Should use inputPos
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "itemsInRing == 0",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
// Ring is empty
|
||||
return r, 0
|
||||
},
|
||||
inputPos: 13,
|
||||
expectToken: nil, // Expect nil when itemsInRing is not 1 or 2
|
||||
},
|
||||
{
|
||||
name: "itemsInRing > 2 (should behave like 0)",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("五", 12, 15, 5)
|
||||
token2 := makeToken("六", 15, 18, 6)
|
||||
r.Value = token1
|
||||
r = r.Next()
|
||||
r.Value = token2
|
||||
// Simulate incorrect itemsInRing count
|
||||
return r, 3
|
||||
},
|
||||
inputPos: 14,
|
||||
expectToken: nil, // Expect nil when itemsInRing is not 1 or 2
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
ringPtr, itemsInRing := tt.ringSetup()
|
||||
itemsInRingCopy := itemsInRing // Pass a pointer to a copy
|
||||
|
||||
gotToken := filter.buildUnigram(ringPtr, &itemsInRingCopy, tt.inputPos)
|
||||
|
||||
if !reflect.DeepEqual(gotToken, tt.expectToken) {
|
||||
t.Errorf("buildUnigram() got = %v, want %v", gotToken, tt.expectToken)
|
||||
}
|
||||
|
||||
// Check if itemsInRing was modified (it shouldn't be by buildUnigram)
|
||||
if itemsInRingCopy != itemsInRing {
|
||||
t.Errorf("buildUnigram() modified itemsInRing, got = %d, want %d", itemsInRingCopy, itemsInRing)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCJKBigramFilter_outputBigram(t *testing.T) {
|
||||
// Create a filter instance (outputUnigram value doesn't matter for outputBigram)
|
||||
filter := NewCJKBigramFilter(false)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
ringSetup func() (*ring.Ring, int) // Function to set up the ring and itemsInRing
|
||||
inputPos int // Position to pass to outputBigram
|
||||
expectToken *analysis.Token
|
||||
}{
|
||||
{
|
||||
name: "itemsInRing == 2",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("一", 0, 3, 1) // Original pos 1
|
||||
token2 := makeToken("二", 3, 6, 2) // Original pos 2
|
||||
r.Value = token1
|
||||
r = r.Next()
|
||||
r.Value = token2
|
||||
// r currently points to token2, r.Move(-1) points to token1
|
||||
return r, 2
|
||||
},
|
||||
inputPos: 10, // Expected output position
|
||||
expectToken: &analysis.Token{
|
||||
Type: analysis.Double,
|
||||
Term: []byte("一二"), // Combined term
|
||||
Position: 10, // Should use inputPos
|
||||
Start: 0, // Start of first token
|
||||
End: 6, // End of second token
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "itemsInRing == 2 with different terms",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("你好", 0, 6, 1)
|
||||
token2 := makeToken("世界", 6, 12, 2)
|
||||
r.Value = token1
|
||||
r = r.Next()
|
||||
r.Value = token2
|
||||
return r, 2
|
||||
},
|
||||
inputPos: 5,
|
||||
expectToken: &analysis.Token{
|
||||
Type: analysis.Double,
|
||||
Term: []byte("你好世界"),
|
||||
Position: 5,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "itemsInRing == 1",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("三", 6, 9, 3)
|
||||
r.Value = token1
|
||||
return r, 1
|
||||
},
|
||||
inputPos: 11,
|
||||
expectToken: nil, // Expect nil when itemsInRing is not 2
|
||||
},
|
||||
{
|
||||
name: "itemsInRing == 0",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
// Ring is empty
|
||||
return r, 0
|
||||
},
|
||||
inputPos: 13,
|
||||
expectToken: nil, // Expect nil when itemsInRing is not 2
|
||||
},
|
||||
{
|
||||
name: "itemsInRing > 2 (should behave like 0)",
|
||||
ringSetup: func() (*ring.Ring, int) {
|
||||
r := ring.New(2)
|
||||
token1 := makeToken("五", 12, 15, 5)
|
||||
token2 := makeToken("六", 15, 18, 6)
|
||||
r.Value = token1
|
||||
r = r.Next()
|
||||
r.Value = token2
|
||||
// Simulate incorrect itemsInRing count
|
||||
return r, 3
|
||||
},
|
||||
inputPos: 14,
|
||||
expectToken: nil, // Expect nil when itemsInRing is not 2
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
ringPtr, itemsInRing := tt.ringSetup()
|
||||
itemsInRingCopy := itemsInRing // Pass a pointer to a copy
|
||||
|
||||
gotToken := filter.outputBigram(ringPtr, &itemsInRingCopy, tt.inputPos)
|
||||
|
||||
if !reflect.DeepEqual(gotToken, tt.expectToken) {
|
||||
t.Errorf("outputBigram() got = %v, want %v", gotToken, tt.expectToken)
|
||||
}
|
||||
|
||||
// Check if itemsInRing was modified (it shouldn't be by outputBigram)
|
||||
if itemsInRingCopy != itemsInRing {
|
||||
t.Errorf("outputBigram() modified itemsInRing, got = %d, want %d", itemsInRingCopy, itemsInRing)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestCJKBigramFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
outputUnigram bool
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// first test that non-adjacent terms are not combined
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 2,
|
||||
Start: 5,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: true,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Single,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Single,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Single,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は世"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Single,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 18,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 15,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Single,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
// Assuming that `、` is removed by unicode tokenizer from `こんにちは、世界`
|
||||
outputUnigram: true,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Single,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Single,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Single,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Single,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Single,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Single,
|
||||
Position: 6,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Single,
|
||||
Position: 7,
|
||||
Start: 21,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こ"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 3,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ん"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("に"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ち"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("は"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 6,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 7,
|
||||
Start: 18,
|
||||
End: 21,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("界"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 8,
|
||||
Start: 21,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("こん"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("んに"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("にち"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ちは"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("cat"),
|
||||
Type: analysis.AlphaNumeric,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("世界"),
|
||||
Type: analysis.Double,
|
||||
Position: 6,
|
||||
Start: 18,
|
||||
End: 24,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
outputUnigram: false,
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("パイプライン"),
|
||||
Type: analysis.Ideographic,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("パイ"),
|
||||
Type: analysis.Double,
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("イプ"),
|
||||
Type: analysis.Double,
|
||||
Position: 2,
|
||||
Start: 3,
|
||||
End: 9,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("プラ"),
|
||||
Type: analysis.Double,
|
||||
Position: 3,
|
||||
Start: 6,
|
||||
End: 12,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("ライ"),
|
||||
Type: analysis.Double,
|
||||
Position: 4,
|
||||
Start: 9,
|
||||
End: 15,
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("イン"),
|
||||
Type: analysis.Double,
|
||||
Position: 5,
|
||||
Start: 12,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram)
|
||||
actual := cjkBigramFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
104
analysis/lang/cjk/cjk_width.go
Normal file
104
analysis/lang/cjk/cjk_width.go
Normal file
|
@ -0,0 +1,104 @@
|
|||
// Copyright (c) 2016 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const WidthName = "cjk_width"
|
||||
|
||||
// CJKWidthFilter normalizes character width within tokens: fullwidth ASCII
// variants are folded to plain ASCII and halfwidth Katakana variants to
// their standard fullwidth forms.
type CJKWidthFilter struct{}

// NewCJKWidthFilter returns a new CJKWidthFilter; it is stateless.
func NewCJKWidthFilter() *CJKWidthFilter {
	return &CJKWidthFilter{}
}
|
||||
|
||||
// Filter rewrites each token's term in place: fullwidth ASCII (U+FF01..
// U+FF5E) is shifted down to ASCII, and halfwidth Katakana (U+FF65..U+FF9F)
// is mapped to standard Katakana, merging halfwidth voiced/semi-voiced
// sound marks into the preceding rune where a combined form exists.
func (s *CJKWidthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		runeCount := utf8.RuneCount(token.Term)
		runes := bytes.Runes(token.Term)
		for i := 0; i < runeCount; i++ {
			ch := runes[i]
			if ch >= 0xFF01 && ch <= 0xFF5E {
				// fullwidth ASCII variants
				runes[i] -= 0xFEE0
			} else if ch >= 0xFF65 && ch <= 0xFF9F {
				// halfwidth Katakana variants
				if (ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(runes, i, ch) {
					// the sound mark was merged into the previous rune:
					// remove it and step back so the loop re-checks the
					// rune now occupying this index
					runes = analysis.DeleteRune(runes, i)
					i--
					runeCount = len(runes)
				} else {
					runes[i] = kanaNorm[ch-0xFF65]
				}
			}
		}
		token.Term = analysis.BuildTermFromRunes(runes)
	}

	return input
}
|
||||
|
||||
// kanaNorm maps each halfwidth Katakana rune (indexed by ch-0xFF65, for
// ch in U+FF65..U+FF9F) to its standard fullwidth equivalent; the last two
// entries are the combining voiced/semi-voiced sound marks.
var kanaNorm = []rune{
	0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
	0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
	0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
	0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
	0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
	0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
	0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A,
}
|
||||
|
||||
// kanaCombineVoiced maps a Katakana rune (indexed by rune-0x30A6) to the
// codepoint delta producing its voiced (dakuten) form; zero means the rune
// has no voiced form and combine leaves it unchanged.
var kanaCombineVoiced = []rune{
	78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
	0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
	0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
}

// kanaCombineHalfVoiced is the analogous delta table for the semi-voiced
// (handakuten) mark; non-zero entries appear to cover only the ha-row
// — confirm against the Unicode Katakana chart.
var kanaCombineHalfVoiced = []rune{
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
	0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}
|
||||
|
||||
// combine attempts to merge a halfwidth voiced (U+FF9E) or semi-voiced
// (U+FF9F) sound mark r into the preceding rune, mutating text[pos-1] in
// place via the delta tables. It reports whether the preceding rune was
// actually changed, i.e. whether the caller should drop the mark.
func combine(text []rune, pos int, r rune) bool {
	prev := text[pos-1]
	if prev >= 0x30A6 && prev <= 0x30FD {
		// a zero delta leaves prev unchanged, so the != comparison below
		// doubles as the "no combined form exists" check
		if r == 0xFF9F {
			text[pos-1] += kanaCombineHalfVoiced[prev-0x30A6]
		} else {
			text[pos-1] += kanaCombineVoiced[prev-0x30A6]
		}
		return text[pos-1] != prev
	}
	return false
}
|
||||
|
||||
// CJKWidthFilterConstructor builds a CJKWidthFilter for the registry; the
// filter takes no configuration, so config and cache are ignored.
func CJKWidthFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewCJKWidthFilter(), nil
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(WidthName, CJKWidthFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
93
analysis/lang/cjk/cjk_width_test.go
Normal file
93
analysis/lang/cjk/cjk_width_test.go
Normal file
|
@ -0,0 +1,93 @@
|
|||
// Copyright (c) 2016 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cjk
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestCJKWidthFilter(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Test"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("1234"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Test"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("1234"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("カタカナ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("カタカナ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ヴィッツ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ヴィッツ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("パナソニック"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("パナソニック"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
cjkWidthFilter := NewCJKWidthFilter()
|
||||
actual := cjkWidthFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
64
analysis/lang/ckb/analyzer_ckb.go
Normal file
64
analysis/lang/ckb/analyzer_ckb.go
Normal file
|
@ -0,0 +1,64 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "ckb"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopCkbFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
normCkbFilter,
|
||||
toLowerFilter,
|
||||
stopCkbFilter,
|
||||
stemmerCkbFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
77
analysis/lang/ckb/analyzer_ckb_test.go
Normal file
77
analysis/lang/ckb/analyzer_ckb_test.go
Normal file
|
@ -0,0 +1,77 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestSoraniAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stop word removal
|
||||
{
|
||||
input: []byte("ئەم پیاوە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 2,
|
||||
Start: 7,
|
||||
End: 17,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("پیاوە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("پیاو"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
121
analysis/lang/ckb/sorani_normalize.go
Normal file
121
analysis/lang/ckb/sorani_normalize.go
Normal file
|
@ -0,0 +1,121 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_ckb"
|
||||
|
||||
// Unicode code points used by the Sorani normalizer, named after their
// Unicode character names in the Arabic block.
const (
	// Yeh and dotless Yeh are folded to Farsi Yeh.
	Yeh        = '\u064A'
	DotlessYeh = '\u0649'
	FarsiYeh   = '\u06CC'

	// Arabic Kaf is folded to Keheh.
	Kaf   = '\u0643'
	Keheh = '\u06A9'

	// Heh-related characters; Ae (U+06D5) is the normalized final form.
	// Zwnj is the zero-width non-joiner formatting character.
	Heh            = '\u0647'
	Ae             = '\u06D5'
	Zwnj           = '\u200C'
	HehDoachashmee = '\u06BE'
	TehMarbuta     = '\u0629'

	// Reh variants; word-initial Reh and RrehAbove are folded to Rreh.
	Reh       = '\u0631'
	Rreh      = '\u0695'
	RrehAbove = '\u0692'

	// Tatweel and the Arabic harakat (short-vowel diacritics) below are
	// all removed by normalize.
	Tatweel  = '\u0640'
	Fathatan = '\u064B'
	Dammatan = '\u064C'
	Kasratan = '\u064D'
	Fatha    = '\u064E'
	Damma    = '\u064F'
	Kasra    = '\u0650'
	Shadda   = '\u0651'
	Sukun    = '\u0652'
)
|
||||
|
||||
// SoraniNormalizeFilter performs orthographic normalization of Sorani
// Kurdish terms: character folding plus removal of diacritics and
// formatting characters (see normalize). It is stateless.
type SoraniNormalizeFilter struct {
}

// NewSoraniNormalizeFilter returns a new normalization filter.
func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
	return &SoraniNormalizeFilter{}
}
|
||||
|
||||
func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
// normalize applies Sorani character folding to input and returns the
// rebuilt term. Visible rules:
//   - Yeh / dotless Yeh            -> Farsi Yeh
//   - Kaf                          -> Keheh
//   - Heh before ZWNJ, word-final Heh, Teh Marbuta -> Ae
//   - Heh Doachashmee              -> Heh
//   - word-initial Reh, RrehAbove  -> Rreh
//   - tatweel, harakat, and any remaining Unicode format characters
//     (category Cf) are deleted.
func normalize(input []byte) []byte {
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		case Yeh, DotlessYeh:
			runes[i] = FarsiYeh
		case Kaf:
			runes[i] = Keheh
		case Zwnj:
			// A ZWNJ after Heh marks a final-form Heh: fold that Heh to
			// Ae, then delete the ZWNJ itself.
			if i > 0 && runes[i-1] == Heh {
				runes[i-1] = Ae
			}
			runes = analysis.DeleteRune(runes, i)
			// Step back so the rune shifted into slot i is re-examined.
			i--
		case Heh:
			// Only a Heh in final position folds to Ae.
			if i == len(runes)-1 {
				runes[i] = Ae
			}
		case TehMarbuta:
			runes[i] = Ae
		case HehDoachashmee:
			runes[i] = Heh
		case Reh:
			// Only a Reh in initial position folds to Rreh.
			if i == 0 {
				runes[i] = Rreh
			}
		case RrehAbove:
			runes[i] = Rreh
		case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
			runes = analysis.DeleteRune(runes, i)
			i--
		default:
			// Strip any other Unicode format character (e.g. directional
			// marks, other joiners).
			if unicode.In(runes[i], unicode.Cf) {
				runes = analysis.DeleteRune(runes, i)
				i--
			}
		}
	}
	return analysis.BuildTermFromRunes(runes)
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSoraniNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
323
analysis/lang/ckb/sorani_normalize_test.go
Normal file
323
analysis/lang/ckb/sorani_normalize_test.go
Normal file
|
@ -0,0 +1,323 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestSoraniNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// test Y
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064A"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0649"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06CC"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test K
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0643"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06A9"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06A9"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06A9"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test H
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u200C"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06D5"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u200C\u06A9"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06D5\u06A9"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06BE"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0629"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u06D5"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test final H
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u0647\u0647"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0647\u0647\u06D5"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test RR
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0692"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0695"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test initial RR
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0631\u0631\u0631"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0695\u0631\u0631"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test remove
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0640"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064B"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064C"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064D"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064E"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u064F"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0650"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0651"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u0652"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("\u200C"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
soraniNormalizeFilter := NewSoraniNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := soraniNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
151
analysis/lang/ckb/sorani_stemmer_filter.go
Normal file
151
analysis/lang/ckb/sorani_stemmer_filter.go
Normal file
|
@ -0,0 +1,151 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_ckb"
|
||||
|
||||
// SoraniStemmerFilter strips Sorani Kurdish inflectional suffixes from
// tokens (see stem). It is stateless.
type SoraniStemmerFilter struct {
}

// NewSoraniStemmerFilter returns a new stemmer filter.
func NewSoraniStemmerFilter() *SoraniStemmerFilter {
	return &SoraniStemmerFilter{}
}
|
||||
|
||||
func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if not protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
stemmed := stem(token.Term)
|
||||
token.Term = stemmed
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
// stem removes Sorani inflectional suffixes from input: first an optional
// postposition, then an optional possessive pronoun, then at most one
// number/definiteness/ezafe ending. All length guards count runes (via
// utf8.RuneCount), not bytes, so short stems are never over-truncated.
// Suffix literals are UTF-8 and bytes.HasSuffix compares bytes, which is
// safe because UTF-8 suffix matches align on rune boundaries.
func stem(input []byte) []byte {
	inputLen := utf8.RuneCount(input)

	// postposition
	if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
		input = truncateRunes(input, 2)
		inputLen = utf8.RuneCount(input)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
		// NOTE(review): removes only 1 rune for a 2-rune suffix — looks
		// intentional (keeps the final ن) but worth confirming upstream.
		input = truncateRunes(input, 1)
		inputLen = utf8.RuneCount(input)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
		input = truncateRunes(input, 3)
		inputLen = utf8.RuneCount(input)
	}

	// possessive pronoun
	if inputLen > 6 &&
		(bytes.HasSuffix(input, []byte("مان")) ||
			bytes.HasSuffix(input, []byte("یان")) ||
			bytes.HasSuffix(input, []byte("تان"))) {
		input = truncateRunes(input, 3)
		inputLen = utf8.RuneCount(input)
	}

	// indefinite singular ezafe
	// The remaining checks are mutually exclusive and ordered so longer,
	// more specific suffixes are tried before their shorter substrings.
	if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
		return truncateRunes(input, 3)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
		return truncateRunes(input, 4)
	}

	if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
		// indefinite singular
		return truncateRunes(input, 2)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
		// indefinite singular
		return truncateRunes(input, 3)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
		// definite singular
		return truncateRunes(input, 3)
	} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
		// definite singular
		return truncateRunes(input, 2)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
		// definite plural
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
		// definite plural
		return truncateRunes(input, 3)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
		// indefinite plural ezafe
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
		// indefinite plural ezafe
		return truncateRunes(input, 3)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
		// indefinite plural
		return truncateRunes(input, 3)
	} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
		// indefinite plural
		return truncateRunes(input, 2)
	} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
		// demonstrative plural
		return truncateRunes(input, 4)
	} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
		// demonstrative plural
		return truncateRunes(input, 3)
	} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
		// demonstrative singular
		return truncateRunes(input, 2)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
		// demonstrative singular
		return truncateRunes(input, 1)
	} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
		// absolute singular ezafe
		return truncateRunes(input, 1)
	}
	// No suffix matched; return the input unchanged.
	return input
}
|
||||
|
||||
// truncateRunes returns input with its final num runes removed. It decodes
// to runes first so that multi-byte UTF-8 suffixes are removed whole
// rather than byte-wise.
func truncateRunes(input []byte, num int) []byte {
	runes := bytes.Runes(input)
	runes = runes[:len(runes)-num]
	return buildTermFromRunes(runes)
}

// buildTermFromRunes re-encodes runes as a UTF-8 byte slice. The buffer is
// pre-sized to the worst case (4 bytes per rune) and each rune is encoded
// directly into it via utf8.AppendRune, avoiding the per-rune scratch
// slice the naive EncodeRune approach would allocate.
func buildTermFromRunes(runes []rune) []byte {
	rv := make([]byte, 0, len(runes)*4)
	for _, r := range runes {
		rv = utf8.AppendRune(rv, r)
	}
	return rv
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSoraniStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
299
analysis/lang/ckb/sorani_stemmer_filter_test.go
Normal file
299
analysis/lang/ckb/sorani_stemmer_filter_test.go
Normal file
|
@ -0,0 +1,299 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
|
||||
)
|
||||
|
||||
func TestSoraniStemmerFilter(t *testing.T) {
|
||||
|
||||
// in order to match the lucene tests
|
||||
// we will test with an analyzer, not just the stemmer
|
||||
analyzer := analysis.DefaultAnalyzer{
|
||||
Tokenizer: single.NewSingleTokenTokenizer(),
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
NewSoraniNormalizeFilter(),
|
||||
NewSoraniStemmerFilter(),
|
||||
},
|
||||
}
|
||||
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{ // -ek
|
||||
input: []byte("پیاوێک"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yek
|
||||
input: []byte("دەرگایەک"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -aka
|
||||
input: []byte("پیاوەكە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -ka
|
||||
input: []byte("دەرگاكە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -a
|
||||
input: []byte("کتاویە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("کتاوی"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -ya
|
||||
input: []byte("دەرگایە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -An
|
||||
input: []byte("پیاوان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yAn
|
||||
input: []byte("دەرگایان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -akAn
|
||||
input: []byte("پیاوەکان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -kAn
|
||||
input: []byte("دەرگاکان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -Ana
|
||||
input: []byte("پیاوانە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پیاو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yAna
|
||||
input: []byte("دەرگایانە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دەرگا"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // Ezafe singular
|
||||
input: []byte("هۆتیلی"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هۆتیل"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // Ezafe indefinite
|
||||
input: []byte("هۆتیلێکی"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هۆتیل"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // Ezafe plural
|
||||
input: []byte("هۆتیلانی"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("هۆتیل"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 16,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -awa
|
||||
input: []byte("دوورەوە"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("دوور"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -dA
|
||||
input: []byte("نیوەشەودا"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("نیوەشەو"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -A
|
||||
input: []byte("سۆرانا"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("سۆران"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 12,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -mAn
|
||||
input: []byte("پارەمان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پارە"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -tAn
|
||||
input: []byte("پارەتان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پارە"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // -yAn
|
||||
input: []byte("پارەیان"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("پارە"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{ // empty
|
||||
input: []byte(""),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("for input %s(% x)", test.input, test.input)
|
||||
t.Errorf("\texpected:")
|
||||
for _, token := range test.output {
|
||||
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
|
||||
}
|
||||
t.Errorf("\tactual:")
|
||||
for _, token := range actual {
|
||||
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/ckb/stop_filter_ckb.go
Normal file
36
analysis/lang/ckb/stop_filter_ckb.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ckb
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
163
analysis/lang/ckb/stop_words_ckb.go
Normal file
163
analysis/lang/ckb/stop_words_ckb.go
Normal file
|
@ -0,0 +1,163 @@
|
|||
package ckb
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ckb"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var SoraniStopWords = []byte(`# set of kurdish stopwords
|
||||
# note these have been normalized with our scheme (e represented with U+06D5, etc)
|
||||
# constructed from:
|
||||
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
|
||||
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
|
||||
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
|
||||
|
||||
# and
|
||||
و
|
||||
# which
|
||||
کە
|
||||
# of
|
||||
ی
|
||||
# made/did
|
||||
کرد
|
||||
# that/which
|
||||
ئەوەی
|
||||
# on/head
|
||||
سەر
|
||||
# two
|
||||
دوو
|
||||
# also
|
||||
هەروەها
|
||||
# from/that
|
||||
لەو
|
||||
# makes/does
|
||||
دەکات
|
||||
# some
|
||||
چەند
|
||||
# every
|
||||
هەر
|
||||
|
||||
# demonstratives
|
||||
# that
|
||||
ئەو
|
||||
# this
|
||||
ئەم
|
||||
|
||||
# personal pronouns
|
||||
# I
|
||||
من
|
||||
# we
|
||||
ئێمە
|
||||
# you
|
||||
تۆ
|
||||
# you
|
||||
ئێوە
|
||||
# he/she/it
|
||||
ئەو
|
||||
# they
|
||||
ئەوان
|
||||
|
||||
# prepositions
|
||||
# to/with/by
|
||||
بە
|
||||
پێ
|
||||
# without
|
||||
بەبێ
|
||||
# along with/while/during
|
||||
بەدەم
|
||||
# in the opinion of
|
||||
بەلای
|
||||
# according to
|
||||
بەپێی
|
||||
# before
|
||||
بەرلە
|
||||
# in the direction of
|
||||
بەرەوی
|
||||
# in front of/toward
|
||||
بەرەوە
|
||||
# before/in the face of
|
||||
بەردەم
|
||||
# without
|
||||
بێ
|
||||
# except for
|
||||
بێجگە
|
||||
# for
|
||||
بۆ
|
||||
# on/in
|
||||
دە
|
||||
تێ
|
||||
# with
|
||||
دەگەڵ
|
||||
# after
|
||||
دوای
|
||||
# except for/aside from
|
||||
جگە
|
||||
# in/from
|
||||
لە
|
||||
لێ
|
||||
# in front of/before/because of
|
||||
لەبەر
|
||||
# between/among
|
||||
لەبەینی
|
||||
# concerning/about
|
||||
لەبابەت
|
||||
# concerning
|
||||
لەبارەی
|
||||
# instead of
|
||||
لەباتی
|
||||
# beside
|
||||
لەبن
|
||||
# instead of
|
||||
لەبرێتی
|
||||
# behind
|
||||
لەدەم
|
||||
# with/together with
|
||||
لەگەڵ
|
||||
# by
|
||||
لەلایەن
|
||||
# within
|
||||
لەناو
|
||||
# between/among
|
||||
لەنێو
|
||||
# for the sake of
|
||||
لەپێناوی
|
||||
# with respect to
|
||||
لەرەوی
|
||||
# by means of/for
|
||||
لەرێ
|
||||
# for the sake of
|
||||
لەرێگا
|
||||
# on/on top of/according to
|
||||
لەسەر
|
||||
# under
|
||||
لەژێر
|
||||
# between/among
|
||||
ناو
|
||||
# between/among
|
||||
نێوان
|
||||
# after
|
||||
پاش
|
||||
# before
|
||||
پێش
|
||||
# like
|
||||
وەک
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(SoraniStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/cs/stop_filter_cs.go
Normal file
36
analysis/lang/cs/stop_filter_cs.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package cs
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
199
analysis/lang/cs/stop_words_cs.go
Normal file
199
analysis/lang/cs/stop_words_cs.go
Normal file
|
@ -0,0 +1,199 @@
|
|||
package cs
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_cs"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var CzechStopWords = []byte(`a
|
||||
s
|
||||
k
|
||||
o
|
||||
i
|
||||
u
|
||||
v
|
||||
z
|
||||
dnes
|
||||
cz
|
||||
tímto
|
||||
budeš
|
||||
budem
|
||||
byli
|
||||
jseš
|
||||
můj
|
||||
svým
|
||||
ta
|
||||
tomto
|
||||
tohle
|
||||
tuto
|
||||
tyto
|
||||
jej
|
||||
zda
|
||||
proč
|
||||
máte
|
||||
tato
|
||||
kam
|
||||
tohoto
|
||||
kdo
|
||||
kteří
|
||||
mi
|
||||
nám
|
||||
tom
|
||||
tomuto
|
||||
mít
|
||||
nic
|
||||
proto
|
||||
kterou
|
||||
byla
|
||||
toho
|
||||
protože
|
||||
asi
|
||||
ho
|
||||
naši
|
||||
napište
|
||||
re
|
||||
což
|
||||
tím
|
||||
takže
|
||||
svých
|
||||
její
|
||||
svými
|
||||
jste
|
||||
aj
|
||||
tu
|
||||
tedy
|
||||
teto
|
||||
bylo
|
||||
kde
|
||||
ke
|
||||
pravé
|
||||
ji
|
||||
nad
|
||||
nejsou
|
||||
či
|
||||
pod
|
||||
téma
|
||||
mezi
|
||||
přes
|
||||
ty
|
||||
pak
|
||||
vám
|
||||
ani
|
||||
když
|
||||
však
|
||||
neg
|
||||
jsem
|
||||
tento
|
||||
článku
|
||||
články
|
||||
aby
|
||||
jsme
|
||||
před
|
||||
pta
|
||||
jejich
|
||||
byl
|
||||
ještě
|
||||
až
|
||||
bez
|
||||
také
|
||||
pouze
|
||||
první
|
||||
vaše
|
||||
která
|
||||
nás
|
||||
nový
|
||||
tipy
|
||||
pokud
|
||||
může
|
||||
strana
|
||||
jeho
|
||||
své
|
||||
jiné
|
||||
zprávy
|
||||
nové
|
||||
není
|
||||
vás
|
||||
jen
|
||||
podle
|
||||
zde
|
||||
už
|
||||
být
|
||||
více
|
||||
bude
|
||||
již
|
||||
než
|
||||
který
|
||||
by
|
||||
které
|
||||
co
|
||||
nebo
|
||||
ten
|
||||
tak
|
||||
má
|
||||
při
|
||||
od
|
||||
po
|
||||
jsou
|
||||
jak
|
||||
další
|
||||
ale
|
||||
si
|
||||
se
|
||||
ve
|
||||
to
|
||||
jako
|
||||
za
|
||||
zpět
|
||||
ze
|
||||
do
|
||||
pro
|
||||
je
|
||||
na
|
||||
atd
|
||||
atp
|
||||
jakmile
|
||||
přičemž
|
||||
já
|
||||
on
|
||||
ona
|
||||
ono
|
||||
oni
|
||||
ony
|
||||
my
|
||||
vy
|
||||
jí
|
||||
ji
|
||||
mě
|
||||
mne
|
||||
jemu
|
||||
tomu
|
||||
těm
|
||||
těmu
|
||||
němu
|
||||
němuž
|
||||
jehož
|
||||
jíž
|
||||
jelikož
|
||||
jež
|
||||
jakož
|
||||
načež
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(CzechStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
59
analysis/lang/da/analyzer_da.go
Normal file
59
analysis/lang/da/analyzer_da.go
Normal file
|
@ -0,0 +1,59 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "da"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopDaFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerDaFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopDaFilter,
|
||||
stemmerDaFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
71
analysis/lang/da/analyzer_da_test.go
Normal file
71
analysis/lang/da/analyzer_da_test.go
Normal file
|
@ -0,0 +1,71 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestDanishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("undersøg"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("undersøg"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 9,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("undersøgelse"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("undersøg"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 13,
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("på"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
52
analysis/lang/da/stemmer_da.go
Normal file
52
analysis/lang/da/stemmer_da.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/danish"
|
||||
)
|
||||
|
||||
const SnowballStemmerName = "stemmer_da_snowball"
|
||||
|
||||
type DanishStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewDanishStemmerFilter() *DanishStemmerFilter {
|
||||
return &DanishStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *DanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
env := snowballstem.NewEnv(string(token.Term))
|
||||
danish.Stem(env)
|
||||
token.Term = []byte(env.Current())
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func DanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewDanishStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(SnowballStemmerName, DanishStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/da/stop_filter_da.go
Normal file
36
analysis/lang/da/stop_filter_da.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package da
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
137
analysis/lang/da/stop_words_da.go
Normal file
137
analysis/lang/da/stop_words_da.go
Normal file
|
@ -0,0 +1,137 @@
|
|||
package da
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_da"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
|
||||
og | and
|
||||
i | in
|
||||
jeg | I
|
||||
det | that (dem. pronoun)/it (pers. pronoun)
|
||||
at | that (in front of a sentence)/to (with infinitive)
|
||||
en | a/an
|
||||
den | it (pers. pronoun)/that (dem. pronoun)
|
||||
til | to/at/for/until/against/by/of/into, more
|
||||
er | present tense of "to be"
|
||||
som | who, as
|
||||
på | on/upon/in/on/at/to/after/of/with/for, on
|
||||
de | they
|
||||
med | with/by/in, along
|
||||
han | he
|
||||
af | of/by/from/off/for/in/with/on, off
|
||||
for | at/for/to/from/by/of/ago, in front/before, because
|
||||
ikke | not
|
||||
der | who/which, there/those
|
||||
var | past tense of "to be"
|
||||
mig | me/myself
|
||||
sig | oneself/himself/herself/itself/themselves
|
||||
men | but
|
||||
et | a/an/one, one (number), someone/somebody/one
|
||||
har | present tense of "to have"
|
||||
om | round/about/for/in/a, about/around/down, if
|
||||
vi | we
|
||||
min | my
|
||||
havde | past tense of "to have"
|
||||
ham | him
|
||||
hun | she
|
||||
nu | now
|
||||
over | over/above/across/by/beyond/past/on/about, over/past
|
||||
da | then, when/as/since
|
||||
fra | from/off/since, off, since
|
||||
du | you
|
||||
ud | out
|
||||
sin | his/her/its/one's
|
||||
dem | them
|
||||
os | us/ourselves
|
||||
op | up
|
||||
man | you/one
|
||||
hans | his
|
||||
hvor | where
|
||||
eller | or
|
||||
hvad | what
|
||||
skal | must/shall etc.
|
||||
selv | myself/youself/herself/ourselves etc., even
|
||||
her | here
|
||||
alle | all/everyone/everybody etc.
|
||||
vil | will (verb)
|
||||
blev | past tense of "to stay/to remain/to get/to become"
|
||||
kunne | could
|
||||
ind | in
|
||||
når | when
|
||||
være | present tense of "to be"
|
||||
dog | however/yet/after all
|
||||
noget | something
|
||||
ville | would
|
||||
jo | you know/you see (adv), yes
|
||||
deres | their/theirs
|
||||
efter | after/behind/according to/for/by/from, later/afterwards
|
||||
ned | down
|
||||
skulle | should
|
||||
denne | this
|
||||
end | than
|
||||
dette | this
|
||||
mit | my/mine
|
||||
også | also
|
||||
under | under/beneath/below/during, below/underneath
|
||||
have | have
|
||||
dig | you
|
||||
anden | other
|
||||
hende | her
|
||||
mine | my
|
||||
alt | everything
|
||||
meget | much/very, plenty of
|
||||
sit | his, her, its, one's
|
||||
sine | his, her, its, one's
|
||||
vor | our
|
||||
mod | against
|
||||
disse | these
|
||||
hvis | if
|
||||
din | your/yours
|
||||
nogle | some
|
||||
hos | by/at
|
||||
blive | be/become
|
||||
mange | many
|
||||
ad | by/through
|
||||
bliver | present tense of "to be/to become"
|
||||
hendes | her/hers
|
||||
været | be
|
||||
thi | for (conj)
|
||||
jer | you
|
||||
sådan | such, like this/like that
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(DanishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
64
analysis/lang/de/analyzer_de.go
Normal file
64
analysis/lang/de/analyzer_de.go
Normal file
|
@ -0,0 +1,64 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const AnalyzerName = "de"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopDeFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
lightStemmerDeFilter, err := cache.TokenFilterNamed(LightStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopDeFilter,
|
||||
normalizeDeFilter,
|
||||
lightStemmerDeFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
155
analysis/lang/de/analyzer_de_test.go
Normal file
155
analysis/lang/de/analyzer_de_test.go
Normal file
|
@ -0,0 +1,155 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestGermanAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: []byte("Tisch"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tisch"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Tische"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tisch"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Tischen"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tisch"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
// german specials
|
||||
{
|
||||
input: []byte("Schaltflächen"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("schaltflach"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Schaltflaechen"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("schaltflach"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 14,
|
||||
},
|
||||
},
|
||||
},
|
||||
// tests added by marty to increase coverage
|
||||
{
|
||||
input: []byte("Blechern"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("blech"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Klecks"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("kleck"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Mindestens"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("mindest"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Kugelfest"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("kugelf"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 9,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Baldigst"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baldig"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
98
analysis/lang/de/german_normalize.go
Normal file
98
analysis/lang/de/german_normalize.go
Normal file
|
@ -0,0 +1,98 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_de"
|
||||
|
||||
const (
|
||||
N = 0 /* ordinary state */
|
||||
V = 1 /* stops 'u' from entering umlaut state */
|
||||
U = 2 /* umlaut state, allows e-deletion */
|
||||
)
|
||||
|
||||
type GermanNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
|
||||
return &GermanNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
state := N
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case 'a', 'o':
|
||||
state = U
|
||||
case 'u':
|
||||
if state == N {
|
||||
state = U
|
||||
} else {
|
||||
state = V
|
||||
}
|
||||
case 'e':
|
||||
if state == U {
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
state = V
|
||||
case 'i', 'q', 'y':
|
||||
state = V
|
||||
case 'ä':
|
||||
runes[i] = 'a'
|
||||
state = V
|
||||
case 'ö':
|
||||
runes[i] = 'o'
|
||||
state = V
|
||||
case 'ü':
|
||||
runes[i] = 'u'
|
||||
state = V
|
||||
case 'ß':
|
||||
runes[i] = 's'
|
||||
i++
|
||||
runes = analysis.InsertRune(runes, i, 's')
|
||||
state = N
|
||||
default:
|
||||
state = N
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewGermanNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
103
analysis/lang/de/german_normalize_test.go
Normal file
103
analysis/lang/de/german_normalize_test.go
Normal file
|
@ -0,0 +1,103 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestGermanNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// Tests that a/o/u + e is equivalent to the umlaut form
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflächen"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflachen"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflaechen"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Schaltflachen"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Tests the specific heuristic that ue is not folded after a vowel or q.
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("dauer"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("dauer"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// Tests german specific folding of sharp-s
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("weißbier"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("weissbier"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
germanNormalizeFilter := NewGermanNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := germanNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
119
analysis/lang/de/light_stemmer_de.go
Normal file
119
analysis/lang/de/light_stemmer_de.go
Normal file
|
@ -0,0 +1,119 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const LightStemmerName = "stemmer_de_light"
|
||||
|
||||
type GermanLightStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewGermanLightStemmerFilter() *GermanLightStemmerFilter {
|
||||
return &GermanLightStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *GermanLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
runes := bytes.Runes(token.Term)
|
||||
runes = stem(runes)
|
||||
token.Term = analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
// stem applies the light German stemming algorithm to a single
// (already lower-cased) term: accented vowels are folded onto their
// base letters, then two suffix-stripping passes run in sequence.
func stem(input []rune) []rune {
	// Fold accented and umlauted vowels onto their plain equivalents.
	for i, r := range input {
		switch r {
		case 'ä', 'à', 'á', 'â':
			input[i] = 'a'
		case 'ö', 'ò', 'ó', 'ô':
			input[i] = 'o'
		case 'ï', 'ì', 'í', 'î':
			input[i] = 'i'
		case 'ü', 'ù', 'ú', 'û':
			input[i] = 'u'
		}
	}
	return step2(step1(input))
}

// stEnding reports whether ch is one of the consonants after which a
// trailing "s"/"st" suffix may be stripped.
func stEnding(ch rune) bool {
	switch ch {
	case 'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't':
		return true
	default:
		return false
	}
}

// step1 strips the first layer of inflectional suffixes: "ern",
// "em"/"en"/"er"/"es", a final "e", or a final "s" after a permitted
// consonant. Minimum-length guards keep very short terms intact.
func step1(s []rune) []rune {
	n := len(s)
	switch {
	case n > 5 && s[n-3] == 'e' && s[n-2] == 'r' && s[n-1] == 'n':
		return s[:n-3]
	case n > 4 && s[n-2] == 'e' &&
		(s[n-1] == 'm' || s[n-1] == 'n' || s[n-1] == 'r' || s[n-1] == 's'):
		return s[:n-2]
	case n > 3 && s[n-1] == 'e':
		return s[:n-1]
	case n > 3 && s[n-1] == 's' && stEnding(s[n-2]):
		return s[:n-1]
	}
	return s
}

// step2 strips the second layer of suffixes: "est", "er"/"en", or
// "st" after a permitted consonant, again with minimum-length guards.
func step2(s []rune) []rune {
	n := len(s)
	switch {
	case n > 5 && s[n-3] == 'e' && s[n-2] == 's' && s[n-1] == 't':
		return s[:n-3]
	case n > 4 && s[n-2] == 'e' && (s[n-1] == 'r' || s[n-1] == 'n'):
		return s[:n-2]
	case n > 4 && s[n-2] == 's' && s[n-1] == 't' && stEnding(s[n-3]):
		return s[:n-2]
	}
	return s
}
|
||||
|
||||
func GermanLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewGermanLightStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(LightStemmerName, GermanLightStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
52
analysis/lang/de/stemmer_de_snowball.go
Normal file
52
analysis/lang/de/stemmer_de_snowball.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/german"
|
||||
)
|
||||
|
||||
const SnowballStemmerName = "stemmer_de_snowball"
|
||||
|
||||
type GermanStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewGermanStemmerFilter() *GermanStemmerFilter {
|
||||
return &GermanStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *GermanStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
env := snowballstem.NewEnv(string(token.Term))
|
||||
german.Stem(env)
|
||||
token.Term = []byte(env.Current())
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func GermanStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewGermanStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(SnowballStemmerName, GermanStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
91
analysis/lang/de/stemmer_de_test.go
Normal file
91
analysis/lang/de/stemmer_de_test.go
Normal file
|
@ -0,0 +1,91 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestSnowballGermanStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abzuschrecken"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abzuschreck"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abzuwarten"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("abzuwart"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("zwirnfabrik"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("zwirnfabr"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("zyniker"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("zynik"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/de/stop_filter_de.go
Normal file
36
analysis/lang/de/stop_filter_de.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package de
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
321
analysis/lang/de/stop_words_de.go
Normal file
321
analysis/lang/de/stop_words_de.go
Normal file
|
@ -0,0 +1,321 @@
|
|||
package de
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_de"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| The number of forms in this list is reduced significantly by passing it
|
||||
| through the German stemmer.
|
||||
|
||||
|
||||
aber | but
|
||||
|
||||
alle | all
|
||||
allem
|
||||
allen
|
||||
aller
|
||||
alles
|
||||
|
||||
als | than, as
|
||||
also | so
|
||||
am | an + dem
|
||||
an | at
|
||||
|
||||
ander | other
|
||||
andere
|
||||
anderem
|
||||
anderen
|
||||
anderer
|
||||
anderes
|
||||
anderm
|
||||
andern
|
||||
anderr
|
||||
anders
|
||||
|
||||
auch | also
|
||||
auf | on
|
||||
aus | out of
|
||||
bei | by
|
||||
bin | am
|
||||
bis | until
|
||||
bist | art
|
||||
da | there
|
||||
damit | with it
|
||||
dann | then
|
||||
|
||||
der | the
|
||||
den
|
||||
des
|
||||
dem
|
||||
die
|
||||
das
|
||||
|
||||
daß | that
|
||||
|
||||
derselbe | the same
|
||||
derselben
|
||||
denselben
|
||||
desselben
|
||||
demselben
|
||||
dieselbe
|
||||
dieselben
|
||||
dasselbe
|
||||
|
||||
dazu | to that
|
||||
|
||||
dein | thy
|
||||
deine
|
||||
deinem
|
||||
deinen
|
||||
deiner
|
||||
deines
|
||||
|
||||
denn | because
|
||||
|
||||
derer | of those
|
||||
dessen | of him
|
||||
|
||||
dich | thee
|
||||
dir | to thee
|
||||
du | thou
|
||||
|
||||
dies | this
|
||||
diese
|
||||
diesem
|
||||
diesen
|
||||
dieser
|
||||
dieses
|
||||
|
||||
|
||||
doch | (several meanings)
|
||||
dort | (over) there
|
||||
|
||||
|
||||
durch | through
|
||||
|
||||
ein | a
|
||||
eine
|
||||
einem
|
||||
einen
|
||||
einer
|
||||
eines
|
||||
|
||||
einig | some
|
||||
einige
|
||||
einigem
|
||||
einigen
|
||||
einiger
|
||||
einiges
|
||||
|
||||
einmal | once
|
||||
|
||||
er | he
|
||||
ihn | him
|
||||
ihm | to him
|
||||
|
||||
es | it
|
||||
etwas | something
|
||||
|
||||
euer | your
|
||||
eure
|
||||
eurem
|
||||
euren
|
||||
eurer
|
||||
eures
|
||||
|
||||
für | for
|
||||
gegen | towards
|
||||
gewesen | p.p. of sein
|
||||
hab | have
|
||||
habe | have
|
||||
haben | have
|
||||
hat | has
|
||||
hatte | had
|
||||
hatten | had
|
||||
hier | here
|
||||
hin | there
|
||||
hinter | behind
|
||||
|
||||
ich | I
|
||||
mich | me
|
||||
mir | to me
|
||||
|
||||
|
||||
ihr | you, to her
|
||||
ihre
|
||||
ihrem
|
||||
ihren
|
||||
ihrer
|
||||
ihres
|
||||
euch | to you
|
||||
|
||||
im | in + dem
|
||||
in | in
|
||||
indem | while
|
||||
ins | in + das
|
||||
ist | is
|
||||
|
||||
jede | each, every
|
||||
jedem
|
||||
jeden
|
||||
jeder
|
||||
jedes
|
||||
|
||||
jene | that
|
||||
jenem
|
||||
jenen
|
||||
jener
|
||||
jenes
|
||||
|
||||
jetzt | now
|
||||
kann | can
|
||||
|
||||
kein | no
|
||||
keine
|
||||
keinem
|
||||
keinen
|
||||
keiner
|
||||
keines
|
||||
|
||||
können | can
|
||||
könnte | could
|
||||
machen | do
|
||||
man | one
|
||||
|
||||
manche | some, many a
|
||||
manchem
|
||||
manchen
|
||||
mancher
|
||||
manches
|
||||
|
||||
mein | my
|
||||
meine
|
||||
meinem
|
||||
meinen
|
||||
meiner
|
||||
meines
|
||||
|
||||
mit | with
|
||||
muss | must
|
||||
musste | had to
|
||||
nach | to(wards)
|
||||
nicht | not
|
||||
nichts | nothing
|
||||
noch | still, yet
|
||||
nun | now
|
||||
nur | only
|
||||
ob | whether
|
||||
oder | or
|
||||
ohne | without
|
||||
sehr | very
|
||||
|
||||
sein | his
|
||||
seine
|
||||
seinem
|
||||
seinen
|
||||
seiner
|
||||
seines
|
||||
|
||||
selbst | self
|
||||
sich | herself
|
||||
|
||||
sie | they, she
|
||||
ihnen | to them
|
||||
|
||||
sind | are
|
||||
so | so
|
||||
|
||||
solche | such
|
||||
solchem
|
||||
solchen
|
||||
solcher
|
||||
solches
|
||||
|
||||
soll | shall
|
||||
sollte | should
|
||||
sondern | but
|
||||
sonst | else
|
||||
über | over
|
||||
um | about, around
|
||||
und | and
|
||||
|
||||
uns | us
|
||||
unse
|
||||
unsem
|
||||
unsen
|
||||
unser
|
||||
unses
|
||||
|
||||
unter | under
|
||||
viel | much
|
||||
vom | von + dem
|
||||
von | from
|
||||
vor | before
|
||||
während | while
|
||||
war | was
|
||||
waren | were
|
||||
warst | wast
|
||||
was | what
|
||||
weg | away, off
|
||||
weil | because
|
||||
weiter | further
|
||||
|
||||
welche | which
|
||||
welchem
|
||||
welchen
|
||||
welcher
|
||||
welches
|
||||
|
||||
wenn | when
|
||||
werde | will
|
||||
werden | will
|
||||
wie | how
|
||||
wieder | again
|
||||
will | want
|
||||
wir | we
|
||||
wird | will
|
||||
wirst | willst
|
||||
wo | where
|
||||
wollen | want
|
||||
wollte | wanted
|
||||
würde | would
|
||||
würden | would
|
||||
zu | to
|
||||
zum | zu + dem
|
||||
zur | zu + der
|
||||
zwar | indeed
|
||||
zwischen | between
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(GermanStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/el/stop_filter_el.go
Normal file
36
analysis/lang/el/stop_filter_el.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package el
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
105
analysis/lang/el/stop_words_el.go
Normal file
105
analysis/lang/el/stop_words_el.go
Normal file
|
@ -0,0 +1,105 @@
|
|||
package el
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_el"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var GreekStopWords = []byte(`# Lucene Greek Stopwords list
|
||||
# Note: by default this file is used after GreekLowerCaseFilter,
|
||||
# so when modifying this file use 'σ' instead of 'ς'
|
||||
ο
|
||||
η
|
||||
το
|
||||
οι
|
||||
τα
|
||||
του
|
||||
τησ
|
||||
των
|
||||
τον
|
||||
την
|
||||
και
|
||||
κι
|
||||
κ
|
||||
ειμαι
|
||||
εισαι
|
||||
ειναι
|
||||
ειμαστε
|
||||
ειστε
|
||||
στο
|
||||
στον
|
||||
στη
|
||||
στην
|
||||
μα
|
||||
αλλα
|
||||
απο
|
||||
για
|
||||
προσ
|
||||
με
|
||||
σε
|
||||
ωσ
|
||||
παρα
|
||||
αντι
|
||||
κατα
|
||||
μετα
|
||||
θα
|
||||
να
|
||||
δε
|
||||
δεν
|
||||
μη
|
||||
μην
|
||||
επι
|
||||
ενω
|
||||
εαν
|
||||
αν
|
||||
τοτε
|
||||
που
|
||||
πωσ
|
||||
ποιοσ
|
||||
ποια
|
||||
ποιο
|
||||
ποιοι
|
||||
ποιεσ
|
||||
ποιων
|
||||
ποιουσ
|
||||
αυτοσ
|
||||
αυτη
|
||||
αυτο
|
||||
αυτοι
|
||||
αυτων
|
||||
αυτουσ
|
||||
αυτεσ
|
||||
αυτα
|
||||
εκεινοσ
|
||||
εκεινη
|
||||
εκεινο
|
||||
εκεινοι
|
||||
εκεινεσ
|
||||
εκεινα
|
||||
εκεινων
|
||||
εκεινουσ
|
||||
οπωσ
|
||||
ομωσ
|
||||
ισωσ
|
||||
οσο
|
||||
οτι
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(GreekStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
73
analysis/lang/en/analyzer_en.go
Normal file
73
analysis/lang/en/analyzer_en.go
Normal file
|
@ -0,0 +1,73 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Package en implements an analyzer with reasonable defaults for processing
|
||||
// English text.
|
||||
//
|
||||
// It strips possessive suffixes ('s), transforms tokens to lower case,
|
||||
// removes stopwords from a built-in list, and applies porter stemming.
|
||||
//
|
||||
// The built-in stopwords list is defined in EnglishStopWords.
|
||||
package en
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/porter"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "en"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
possEnFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopEnFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerEnFilter, err := cache.TokenFilterNamed(porter.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
possEnFilter,
|
||||
toLowerFilter,
|
||||
stopEnFilter,
|
||||
stemmerEnFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
105
analysis/lang/en/analyzer_en_test.go
Normal file
105
analysis/lang/en/analyzer_en_test.go
Normal file
|
@ -0,0 +1,105 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestEnglishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("books"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("book"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("book"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("book"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word removal
|
||||
{
|
||||
input: []byte("the"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
// possessive removal
|
||||
{
|
||||
input: []byte("steven's"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("steven"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("steven\u2019s"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("steven"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("steven\uFF07s"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("steven"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 10,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
177
analysis/lang/en/plural_stemmer.go
Normal file
177
analysis/lang/en/plural_stemmer.go
Normal file
|
@ -0,0 +1,177 @@
|
|||
/*
|
||||
This code was ported from the Open Search Project
|
||||
https://github.com/opensearch-project/OpenSearch/blob/main/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java
|
||||
The algorithm itself was created by Mark Harwood
|
||||
https://github.com/markharwood
|
||||
*/
|
||||
|
||||
/*
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*
|
||||
* The OpenSearch Contributors require contributions made to
|
||||
* this file be licensed under the Apache-2.0 license or a
|
||||
* compatible open source license.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const PluralStemmerName = "stemmer_en_plural"
|
||||
|
||||
type EnglishPluralStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewEnglishPluralStemmerFilter() *EnglishPluralStemmerFilter {
|
||||
return &EnglishPluralStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *EnglishPluralStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
token.Term = []byte(stem(string(token.Term)))
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewEnglishPluralStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
// Words ending in oes that retain the e when stemmed.
var oesExceptions = []string{"shoes", "canoes", "oboes"}

// Words ending in ches that retain the e when stemmed.
var chesExceptions = []string{
	"cliches",
	"avalanches",
	"mustaches",
	"moustaches",
	"quiches",
	"headaches",
	"heartaches",
	"porsches",
	"tranches",
	"caches",
}

// stem lowercases word and strips a plural suffix:
//   - words shorter than 3 runes, or not ending in 's', are unchanged;
//   - -us / -ss endings are unchanged (not plurals);
//   - -ies -> -y for words longer than 4 (spies->spy, but pies->pie);
//   - -xes / -oes / -shes / -sses / -ches drop "es" (with exception
//     lists for -oes and -ches that keep the 'e');
//   - otherwise the trailing 's' is dropped.
func stem(word string) string {
	runes := []rune(strings.ToLower(word))
	n := len(runes)

	if n < 3 || runes[n-1] != 's' {
		return string(runes)
	}

	switch runes[n-2] {
	case 'u', 's':
		// -us / -ss: not a plural, keep as-is.
		return string(runes)
	case 'e':
		// ies -> y, only on strings > 4 so spies->spy but pies->pie.
		if n > 4 && runes[n-3] == 'i' {
			runes[n-3] = 'y'
			return string(runes[:n-2])
		}
		if n > 3 {
			// xes (but >1 prefix so boxes->box while axes->axe).
			if n > 4 && runes[n-3] == 'x' {
				return string(runes[:n-2])
			}
			// oes
			if runes[n-3] == 'o' {
				if isException(runes, oesExceptions) {
					// only remove the s
					return string(runes[:n-1])
				}
				// remove the es
				return string(runes[:n-2])
			}
			if n > 4 {
				// shes / sses
				if runes[n-4] == 's' && (runes[n-3] == 'h' || runes[n-3] == 's') {
					return string(runes[:n-2])
				}
				// ches
				if runes[n-4] == 'c' && runes[n-3] == 'h' {
					if isException(runes, chesExceptions) {
						// only remove the s
						return string(runes[:n-1])
					}
					// remove the es
					return string(runes[:n-2])
				}
			}
		}
	}
	// Default: strip the trailing 's'.
	return string(runes[:n-1])
}

// isException reports whether word ends with (or is a suffix-match of)
// any entry in exceptions, comparing runes from the ends of both until
// either runs out.
func isException(word []rune, exceptions []string) bool {
	for _, exception := range exceptions {
		exc := []rune(exception)
		i, j := len(exc)-1, len(word)-1
		matched := true
		for i >= 0 && j >= 0 {
			if exc[i] != word[j] {
				matched = false
				break
			}
			i--
			j--
		}
		if matched {
			return true
		}
	}
	return false
}
|
46
analysis/lang/en/plural_stemmer_test.go
Normal file
46
analysis/lang/en/plural_stemmer_test.go
Normal file
|
@ -0,0 +1,46 @@
|
|||
package en
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestEnglishPluralStemmer(t *testing.T) {
|
||||
data := []struct {
|
||||
In, Out string
|
||||
}{
|
||||
{"dresses", "dress"},
|
||||
{"dress", "dress"},
|
||||
{"axes", "axe"},
|
||||
{"ad", "ad"},
|
||||
{"ads", "ad"},
|
||||
{"gas", "ga"},
|
||||
{"sass", "sass"},
|
||||
{"berries", "berry"},
|
||||
{"dresses", "dress"},
|
||||
{"spies", "spy"},
|
||||
{"shoes", "shoe"},
|
||||
{"headaches", "headache"},
|
||||
{"computer", "computer"},
|
||||
{"dressing", "dressing"},
|
||||
{"clothes", "clothe"},
|
||||
{"DRESSES", "dress"},
|
||||
{"frog", "frog"},
|
||||
{"dress", "dress"},
|
||||
{"runs", "run"},
|
||||
{"pies", "pie"},
|
||||
{"foxes", "fox"},
|
||||
{"axes", "axe"},
|
||||
{"foes", "fo"},
|
||||
{"dishes", "dish"},
|
||||
{"snitches", "snitch"},
|
||||
{"cliches", "cliche"},
|
||||
{"forests", "forest"},
|
||||
{"yes", "ye"},
|
||||
}
|
||||
|
||||
for _, datum := range data {
|
||||
stemmed := stem(datum.In)
|
||||
|
||||
if stemmed != datum.Out {
|
||||
t.Errorf("expected %v but got %v", datum.Out, stemmed)
|
||||
}
|
||||
}
|
||||
}
|
70
analysis/lang/en/possessive_filter_en.go
Normal file
70
analysis/lang/en/possessive_filter_en.go
Normal file
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// PossessiveName is the name PossessiveFilter is registered as
|
||||
// in the bleve registry.
|
||||
const PossessiveName = "possessive_en"
|
||||
|
||||
const rightSingleQuotationMark = '’'
|
||||
const apostrophe = '\''
|
||||
const fullWidthApostrophe = '''
|
||||
|
||||
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
|
||||
|
||||
// PossessiveFilter implements a TokenFilter which
|
||||
// strips the English possessive suffix ('s) from tokens.
|
||||
// It handle a variety of apostrophe types, is case-insensitive
|
||||
// and doesn't distinguish between possessive and contraction.
|
||||
// (ie "She's So Rad" becomes "She So Rad")
|
||||
type PossessiveFilter struct {
|
||||
}
|
||||
|
||||
func NewPossessiveFilter() *PossessiveFilter {
|
||||
return &PossessiveFilter{}
|
||||
}
|
||||
|
||||
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term)
|
||||
if lastRune == 's' || lastRune == 'S' {
|
||||
nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize])
|
||||
if nextLastRune == rightSingleQuotationMark ||
|
||||
nextLastRune == apostrophe ||
|
||||
nextLastRune == fullWidthApostrophe {
|
||||
token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize]
|
||||
}
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func PossessiveFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewPossessiveFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
142
analysis/lang/en/possessive_filter_en_test.go
Normal file
142
analysis/lang/en/possessive_filter_en_test.go
Normal file
|
@ -0,0 +1,142 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestEnglishPossessiveFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty’s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY’S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("'s"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := stemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
|
||||
|
||||
input := analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty’s"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY’S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("marty's"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("MARTY'S"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("m"),
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
stemmerFilter.Filter(input)
|
||||
}
|
||||
|
||||
}
|
52
analysis/lang/en/stemmer_en_snowball.go
Normal file
52
analysis/lang/en/stemmer_en_snowball.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/english"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the registry name of the snowball-based
// English stemmer token filter.
const SnowballStemmerName = "stemmer_en_snowball"

// EnglishStemmerFilter stems tokens using the snowball English
// stemming algorithm. It holds no state.
type EnglishStemmerFilter struct {
}

// NewEnglishStemmerFilter returns a ready-to-use English snowball
// stemmer filter.
func NewEnglishStemmerFilter() *EnglishStemmerFilter {
	return &EnglishStemmerFilter{}
}

// Filter replaces each token's term with its snowball stem. Tokens
// are modified in place and the same stream is returned.
func (s *EnglishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		env := snowballstem.NewEnv(string(token.Term))
		english.Stem(env)
		token.Term = []byte(env.Current())
	}
	return input
}

// EnglishStemmerFilterConstructor builds the filter for the registry;
// it takes no configuration.
func EnglishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewEnglishStemmerFilter(), nil
}

// init registers the filter under SnowballStemmerName; duplicate
// registration is a programming error, hence panic.
func init() {
	err := registry.RegisterTokenFilter(SnowballStemmerName, EnglishStemmerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
79
analysis/lang/en/stemmer_en_test.go
Normal file
79
analysis/lang/en/stemmer_en_test.go
Normal file
|
@ -0,0 +1,79 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestSnowballEnglishStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("enjoy"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("enjoy"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("enjoyed"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("enjoy"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("enjoyable"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("enjoy"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/en/stop_filter_en.go
Normal file
36
analysis/lang/en/stop_filter_en.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package en
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopTokenFilterConstructor builds a stop-token filter backed by the
// English stop word token map registered under StopName.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	tokenMap, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop.NewStopTokensFilter(tokenMap), nil
}

// init registers the English stop token filter; duplicate
// registration is a programming error, hence panic.
func init() {
	err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
347
analysis/lang/en/stop_words_en.go
Normal file
347
analysis/lang/en/stop_words_en.go
Normal file
|
@ -0,0 +1,347 @@
|
|||
package en
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_en"
|
||||
|
||||
// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter.
|
||||
//
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| An English stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| Many of the forms below are quite rare (e.g. "yourselves") but included for
|
||||
| completeness.
|
||||
|
||||
| PRONOUNS FORMS
|
||||
| 1st person sing
|
||||
|
||||
i | subject, always in upper case of course
|
||||
|
||||
me | object
|
||||
my | possessive adjective
|
||||
| the possessive pronoun 'mine' is best suppressed, because of the
|
||||
| sense of coal-mine etc.
|
||||
myself | reflexive
|
||||
| 1st person plural
|
||||
we | subject
|
||||
|
||||
| us | object
|
||||
| care is required here because US = United States. It is usually
|
||||
| safe to remove it if it is in lower case.
|
||||
our | possessive adjective
|
||||
ours | possessive pronoun
|
||||
ourselves | reflexive
|
||||
| second person (archaic 'thou' forms not included)
|
||||
you | subject and object
|
||||
your | possessive adjective
|
||||
yours | possessive pronoun
|
||||
yourself | reflexive (singular)
|
||||
yourselves | reflexive (plural)
|
||||
| third person singular
|
||||
he | subject
|
||||
him | object
|
||||
his | possessive adjective and pronoun
|
||||
himself | reflexive
|
||||
|
||||
she | subject
|
||||
her | object and possessive adjective
|
||||
hers | possessive pronoun
|
||||
herself | reflexive
|
||||
|
||||
it | subject and object
|
||||
its | possessive adjective
|
||||
itself | reflexive
|
||||
| third person plural
|
||||
they | subject
|
||||
them | object
|
||||
their | possessive adjective
|
||||
theirs | possessive pronoun
|
||||
themselves | reflexive
|
||||
| other forms (demonstratives, interrogatives)
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
|
||||
| VERB FORMS (using F.R. Palmer's nomenclature)
|
||||
| BE
|
||||
am | 1st person, present
|
||||
is | -s form (3rd person, present)
|
||||
are | present
|
||||
was | 1st person, past
|
||||
were | past
|
||||
be | infinitive
|
||||
been | past participle
|
||||
being | -ing form
|
||||
| HAVE
|
||||
have | simple
|
||||
has | -s form
|
||||
had | past
|
||||
having | -ing form
|
||||
| DO
|
||||
do | simple
|
||||
does | -s form
|
||||
did | past
|
||||
doing | -ing form
|
||||
|
||||
| The forms below are, I believe, best omitted, because of the significant
|
||||
| homonym forms:
|
||||
|
||||
| He made a WILL
|
||||
| old tin CAN
|
||||
| merry month of MAY
|
||||
| a smell of MUST
|
||||
| fight the good fight with all thy MIGHT
|
||||
|
||||
| would, could, should, ought might however be included
|
||||
|
||||
| | AUXILIARIES
|
||||
| | WILL
|
||||
|will
|
||||
|
||||
would
|
||||
|
||||
| | SHALL
|
||||
|shall
|
||||
|
||||
should
|
||||
|
||||
| | CAN
|
||||
|can
|
||||
|
||||
could
|
||||
|
||||
| | MAY
|
||||
|may
|
||||
|might
|
||||
| | MUST
|
||||
|must
|
||||
| | OUGHT
|
||||
|
||||
ought
|
||||
|
||||
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
|
||||
| pronoun + verb
|
||||
|
||||
i'm
|
||||
you're
|
||||
he's
|
||||
she's
|
||||
it's
|
||||
we're
|
||||
they're
|
||||
i've
|
||||
you've
|
||||
we've
|
||||
they've
|
||||
i'd
|
||||
you'd
|
||||
he'd
|
||||
she'd
|
||||
we'd
|
||||
they'd
|
||||
i'll
|
||||
you'll
|
||||
he'll
|
||||
she'll
|
||||
we'll
|
||||
they'll
|
||||
|
||||
| verb + negation
|
||||
|
||||
isn't
|
||||
aren't
|
||||
wasn't
|
||||
weren't
|
||||
hasn't
|
||||
haven't
|
||||
hadn't
|
||||
doesn't
|
||||
don't
|
||||
didn't
|
||||
|
||||
| auxiliary + negation
|
||||
|
||||
won't
|
||||
wouldn't
|
||||
shan't
|
||||
shouldn't
|
||||
can't
|
||||
cannot
|
||||
couldn't
|
||||
mustn't
|
||||
|
||||
| miscellaneous forms
|
||||
|
||||
let's
|
||||
that's
|
||||
who's
|
||||
what's
|
||||
here's
|
||||
there's
|
||||
when's
|
||||
where's
|
||||
why's
|
||||
how's
|
||||
|
||||
| rarer forms
|
||||
|
||||
| daren't needn't
|
||||
|
||||
| doubtful forms
|
||||
|
||||
| oughtn't mightn't
|
||||
|
||||
| ARTICLES
|
||||
a
|
||||
an
|
||||
the
|
||||
|
||||
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
|
||||
| high, that classification is pointless.)
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
|
||||
| Just for the record, the following words are among the commonest in English
|
||||
|
||||
| one
|
||||
| every
|
||||
| least
|
||||
| less
|
||||
| many
|
||||
| now
|
||||
| ever
|
||||
| never
|
||||
| say
|
||||
| says
|
||||
| said
|
||||
| also
|
||||
| get
|
||||
| go
|
||||
| goes
|
||||
| just
|
||||
| made
|
||||
| make
|
||||
| put
|
||||
| see
|
||||
| seen
|
||||
| whether
|
||||
| like
|
||||
| well
|
||||
| back
|
||||
| even
|
||||
| still
|
||||
| way
|
||||
| take
|
||||
| since
|
||||
| another
|
||||
| however
|
||||
| two
|
||||
| three
|
||||
| four
|
||||
| five
|
||||
| first
|
||||
| second
|
||||
| new
|
||||
| old
|
||||
| high
|
||||
| long
|
||||
`)
|
||||
|
||||
// TokenMapConstructor builds the English stop word token map by
// parsing the EnglishStopWords snowball-format list (one word per
// line, '|' starts a comment).
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	rv := analysis.NewTokenMap()
	err := rv.LoadBytes(EnglishStopWords)
	return rv, err
}

// init registers the token map under StopName; duplicate registration
// is a programming error, hence panic.
func init() {
	err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
	if err != nil {
		panic(err)
	}
}
|
66
analysis/lang/es/analyzer_es.go
Normal file
66
analysis/lang/es/analyzer_es.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "es"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{},
|
||||
cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normalizeEsFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopEsFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
lightStemmerEsFilter, err := cache.TokenFilterNamed(LightStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopEsFilter,
|
||||
normalizeEsFilter,
|
||||
lightStemmerEsFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
122
analysis/lang/es/analyzer_es_test.go
Normal file
122
analysis/lang/es/analyzer_es_test.go
Normal file
|
@ -0,0 +1,122 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestSpanishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("chicana"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chican"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("chicano"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chican"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
// added by marty for better coverage
|
||||
{
|
||||
input: []byte("yeses"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("yes"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("jaeces"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("jaez"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 6,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("arcos"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("arc"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 5,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("caos"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("caos"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 4,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("parecer"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("parecer"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 7,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
78
analysis/lang/es/light_stemmer_es.go
Normal file
78
analysis/lang/es/light_stemmer_es.go
Normal file
|
@ -0,0 +1,78 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const LightStemmerName = "stemmer_es_light"
|
||||
|
||||
type SpanishLightStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewSpanishLightStemmerFilter() *SpanishLightStemmerFilter {
|
||||
return &SpanishLightStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *SpanishLightStemmerFilter) Filter(
|
||||
input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
runes := bytes.Runes(token.Term)
|
||||
runes = stem(runes)
|
||||
token.Term = analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func stem(input []rune) []rune {
|
||||
l := len(input)
|
||||
if l < 5 {
|
||||
return input
|
||||
}
|
||||
|
||||
switch input[l-1] {
|
||||
case 'o', 'a', 'e':
|
||||
return input[:l-1]
|
||||
case 's':
|
||||
if input[l-2] == 'e' && input[l-3] == 's' && input[l-4] == 'e' {
|
||||
return input[:l-2]
|
||||
}
|
||||
if input[l-2] == 'e' && input[l-3] == 'c' {
|
||||
input[l-3] = 'z'
|
||||
return input[:l-2]
|
||||
}
|
||||
if input[l-2] == 'o' || input[l-2] == 'a' || input[l-2] == 'e' {
|
||||
return input[:l-2]
|
||||
}
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
func SpanishLightStemmerFilterConstructor(config map[string]interface{},
|
||||
cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSpanishLightStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(LightStemmerName, SpanishLightStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
70
analysis/lang/es/spanish_normalize.go
Normal file
70
analysis/lang/es/spanish_normalize.go
Normal file
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_es"
|
||||
|
||||
type SpanishNormalizeFilter struct {
|
||||
}
|
||||
|
||||
func NewSpanishNormalizeFilter() *SpanishNormalizeFilter {
|
||||
return &SpanishNormalizeFilter{}
|
||||
}
|
||||
|
||||
func (s *SpanishNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case 'à', 'á', 'â', 'ä':
|
||||
runes[i] = 'a'
|
||||
case 'ò', 'ó', 'ô', 'ö':
|
||||
runes[i] = 'o'
|
||||
case 'è', 'é', 'ê', 'ë':
|
||||
runes[i] = 'e'
|
||||
case 'ù', 'ú', 'û', 'ü':
|
||||
runes[i] = 'u'
|
||||
case 'ì', 'í', 'î', 'ï':
|
||||
runes[i] = 'i'
|
||||
}
|
||||
}
|
||||
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSpanishNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
112
analysis/lang/es/spanish_normalize_test.go
Normal file
112
analysis/lang/es/spanish_normalize_test.go
Normal file
|
@ -0,0 +1,112 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestSpanishNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Guía"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Guia"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Belcebú"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Belcebu"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Limón"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("Limon"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("agüero"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("aguero"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("laúd"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("laud"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
spanishNormalizeFilter := NewSpanishNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := spanishNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
52
analysis/lang/es/stemmer_es_snowball.go
Normal file
52
analysis/lang/es/stemmer_es_snowball.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/spanish"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the registry name of the snowball-based
// Spanish stemmer token filter.
const SnowballStemmerName = "stemmer_es_snowball"

// SpanishStemmerFilter stems tokens using the snowball Spanish
// stemming algorithm. It holds no state.
type SpanishStemmerFilter struct {
}

// NewSpanishStemmerFilter returns a ready-to-use Spanish snowball
// stemmer filter.
func NewSpanishStemmerFilter() *SpanishStemmerFilter {
	return &SpanishStemmerFilter{}
}

// Filter replaces each token's term with its snowball stem. Tokens
// are modified in place and the same stream is returned.
func (s *SpanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		env := snowballstem.NewEnv(string(token.Term))
		spanish.Stem(env)
		token.Term = []byte(env.Current())
	}
	return input
}

// SpanishStemmerFilterConstructor builds the filter for the registry;
// it takes no configuration.
func SpanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewSpanishStemmerFilter(), nil
}

// init registers the filter under SnowballStemmerName; duplicate
// registration is a programming error, hence panic.
func init() {
	err := registry.RegisterTokenFilter(SnowballStemmerName, SpanishStemmerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
79
analysis/lang/es/stemmer_es_snowball_test.go
Normal file
79
analysis/lang/es/stemmer_es_snowball_test.go
Normal file
|
@ -0,0 +1,79 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package es
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestSnowballSpanishStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("agresivos"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("agres"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("agresivamente"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("agres"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("agresividad"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("agres"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/es/stop_filter_es.go
Normal file
36
analysis/lang/es/stop_filter_es.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2017 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
package es
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{},
|
||||
cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
383
analysis/lang/es/stop_words_es.go
Normal file
383
analysis/lang/es/stop_words_es.go
Normal file
|
@ -0,0 +1,383 @@
|
|||
package es
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopName is the registry name of the Spanish stop word token map and
// of the stop token filter built from it.
const StopName = "stop_es"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords
|
||||
| deriving from a large sample of text.
|
||||
|
||||
| Extra words have been added at the end.
|
||||
|
||||
de | from, of
|
||||
la | the, her
|
||||
que | who, that
|
||||
el | the
|
||||
en | in
|
||||
y | and
|
||||
a | to
|
||||
los | the, them
|
||||
del | de + el
|
||||
se | himself, from him etc
|
||||
las | the, them
|
||||
por | for, by, etc
|
||||
un | a
|
||||
para | for
|
||||
con | with
|
||||
no | no
|
||||
una | a
|
||||
su | his, her
|
||||
al | a + el
|
||||
| es from SER
|
||||
lo | him
|
||||
como | how
|
||||
más | more
|
||||
pero | pero
|
||||
sus | su plural
|
||||
le | to him, her
|
||||
ya | already
|
||||
o | or
|
||||
| fue from SER
|
||||
este | this
|
||||
| ha from HABER
|
||||
sí | himself etc
|
||||
porque | because
|
||||
esta | this
|
||||
| son from SER
|
||||
entre | between
|
||||
| está from ESTAR
|
||||
cuando | when
|
||||
muy | very
|
||||
sin | without
|
||||
sobre | on
|
||||
| ser from SER
|
||||
| tiene from TENER
|
||||
también | also
|
||||
me | me
|
||||
hasta | until
|
||||
hay | there is/are
|
||||
donde | where
|
||||
| han from HABER
|
||||
quien | whom, that
|
||||
| están from ESTAR
|
||||
| estado from ESTAR
|
||||
desde | from
|
||||
todo | all
|
||||
nos | us
|
||||
durante | during
|
||||
| estados from ESTAR
|
||||
todos | all
|
||||
uno | a
|
||||
les | to them
|
||||
ni | nor
|
||||
contra | against
|
||||
otros | other
|
||||
| fueron from SER
|
||||
ese | that
|
||||
eso | that
|
||||
| había from HABER
|
||||
ante | before
|
||||
ellos | they
|
||||
e | and (variant of y)
|
||||
esto | this
|
||||
mí | me
|
||||
antes | before
|
||||
algunos | some
|
||||
qué | what?
|
||||
unos | a
|
||||
yo | I
|
||||
otro | other
|
||||
otras | other
|
||||
otra | other
|
||||
él | he
|
||||
tanto | so much, many
|
||||
esa | that
|
||||
estos | these
|
||||
mucho | much, many
|
||||
quienes | who
|
||||
nada | nothing
|
||||
muchos | many
|
||||
cual | who
|
||||
| sea from SER
|
||||
poco | few
|
||||
ella | she
|
||||
estar | to be
|
||||
| haber from HABER
|
||||
estas | these
|
||||
| estaba from ESTAR
|
||||
| estamos from ESTAR
|
||||
algunas | some
|
||||
algo | something
|
||||
nosotros | we
|
||||
|
||||
| other forms
|
||||
|
||||
mi | me
|
||||
mis | mi plural
|
||||
tú | thou
|
||||
te | thee
|
||||
ti | thee
|
||||
tu | thy
|
||||
tus | tu plural
|
||||
ellas | they
|
||||
nosotras | we
|
||||
vosotros | you
|
||||
vosotras | you
|
||||
os | you
|
||||
mío | mine
|
||||
mía |
|
||||
míos |
|
||||
mías |
|
||||
tuyo | thine
|
||||
tuya |
|
||||
tuyos |
|
||||
tuyas |
|
||||
suyo | his, hers, theirs
|
||||
suya |
|
||||
suyos |
|
||||
suyas |
|
||||
nuestro | ours
|
||||
nuestra |
|
||||
nuestros |
|
||||
nuestras |
|
||||
vuestro | yours
|
||||
vuestra |
|
||||
vuestros |
|
||||
vuestras |
|
||||
esos | those
|
||||
esas | those
|
||||
|
||||
| forms of estar, to be (not including the infinitive):
|
||||
estoy
|
||||
estás
|
||||
está
|
||||
estamos
|
||||
estáis
|
||||
están
|
||||
esté
|
||||
estés
|
||||
estemos
|
||||
estéis
|
||||
estén
|
||||
estaré
|
||||
estarás
|
||||
estará
|
||||
estaremos
|
||||
estaréis
|
||||
estarán
|
||||
estaría
|
||||
estarías
|
||||
estaríamos
|
||||
estaríais
|
||||
estarían
|
||||
estaba
|
||||
estabas
|
||||
estábamos
|
||||
estabais
|
||||
estaban
|
||||
estuve
|
||||
estuviste
|
||||
estuvo
|
||||
estuvimos
|
||||
estuvisteis
|
||||
estuvieron
|
||||
estuviera
|
||||
estuvieras
|
||||
estuviéramos
|
||||
estuvierais
|
||||
estuvieran
|
||||
estuviese
|
||||
estuvieses
|
||||
estuviésemos
|
||||
estuvieseis
|
||||
estuviesen
|
||||
estando
|
||||
estado
|
||||
estada
|
||||
estados
|
||||
estadas
|
||||
estad
|
||||
|
||||
| forms of haber, to have (not including the infinitive):
|
||||
he
|
||||
has
|
||||
ha
|
||||
hemos
|
||||
habéis
|
||||
han
|
||||
haya
|
||||
hayas
|
||||
hayamos
|
||||
hayáis
|
||||
hayan
|
||||
habré
|
||||
habrás
|
||||
habrá
|
||||
habremos
|
||||
habréis
|
||||
habrán
|
||||
habría
|
||||
habrías
|
||||
habríamos
|
||||
habríais
|
||||
habrían
|
||||
había
|
||||
habías
|
||||
habíamos
|
||||
habíais
|
||||
habían
|
||||
hube
|
||||
hubiste
|
||||
hubo
|
||||
hubimos
|
||||
hubisteis
|
||||
hubieron
|
||||
hubiera
|
||||
hubieras
|
||||
hubiéramos
|
||||
hubierais
|
||||
hubieran
|
||||
hubiese
|
||||
hubieses
|
||||
hubiésemos
|
||||
hubieseis
|
||||
hubiesen
|
||||
habiendo
|
||||
habido
|
||||
habida
|
||||
habidos
|
||||
habidas
|
||||
|
||||
| forms of ser, to be (not including the infinitive):
|
||||
soy
|
||||
eres
|
||||
es
|
||||
somos
|
||||
sois
|
||||
son
|
||||
sea
|
||||
seas
|
||||
seamos
|
||||
seáis
|
||||
sean
|
||||
seré
|
||||
serás
|
||||
será
|
||||
seremos
|
||||
seréis
|
||||
serán
|
||||
sería
|
||||
serías
|
||||
seríamos
|
||||
seríais
|
||||
serían
|
||||
era
|
||||
eras
|
||||
éramos
|
||||
erais
|
||||
eran
|
||||
fui
|
||||
fuiste
|
||||
fue
|
||||
fuimos
|
||||
fuisteis
|
||||
fueron
|
||||
fuera
|
||||
fueras
|
||||
fuéramos
|
||||
fuerais
|
||||
fueran
|
||||
fuese
|
||||
fueses
|
||||
fuésemos
|
||||
fueseis
|
||||
fuesen
|
||||
siendo
|
||||
sido
|
||||
| sed also means 'thirst'
|
||||
|
||||
| forms of tener, to have (not including the infinitive):
|
||||
tengo
|
||||
tienes
|
||||
tiene
|
||||
tenemos
|
||||
tenéis
|
||||
tienen
|
||||
tenga
|
||||
tengas
|
||||
tengamos
|
||||
tengáis
|
||||
tengan
|
||||
tendré
|
||||
tendrás
|
||||
tendrá
|
||||
tendremos
|
||||
tendréis
|
||||
tendrán
|
||||
tendría
|
||||
tendrías
|
||||
tendríamos
|
||||
tendríais
|
||||
tendrían
|
||||
tenía
|
||||
tenías
|
||||
teníamos
|
||||
teníais
|
||||
tenían
|
||||
tuve
|
||||
tuviste
|
||||
tuvo
|
||||
tuvimos
|
||||
tuvisteis
|
||||
tuvieron
|
||||
tuviera
|
||||
tuvieras
|
||||
tuviéramos
|
||||
tuvierais
|
||||
tuvieran
|
||||
tuviese
|
||||
tuvieses
|
||||
tuviésemos
|
||||
tuvieseis
|
||||
tuviesen
|
||||
teniendo
|
||||
tenido
|
||||
tenida
|
||||
tenidos
|
||||
tenidas
|
||||
tened
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(SpanishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/eu/stop_filter_eu.go
Normal file
36
analysis/lang/eu/stop_filter_eu.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package eu
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
126
analysis/lang/eu/stop_words_eu.go
Normal file
126
analysis/lang/eu/stop_words_eu.go
Normal file
|
@ -0,0 +1,126 @@
|
|||
package eu
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopName is the registry name of the Basque stop word token map and
// of the stop token filter built from it.
const StopName = "stop_eu"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var BasqueStopWords = []byte(`# example set of basque stopwords
|
||||
al
|
||||
anitz
|
||||
arabera
|
||||
asko
|
||||
baina
|
||||
bat
|
||||
batean
|
||||
batek
|
||||
bati
|
||||
batzuei
|
||||
batzuek
|
||||
batzuetan
|
||||
batzuk
|
||||
bera
|
||||
beraiek
|
||||
berau
|
||||
berauek
|
||||
bere
|
||||
berori
|
||||
beroriek
|
||||
beste
|
||||
bezala
|
||||
da
|
||||
dago
|
||||
dira
|
||||
ditu
|
||||
du
|
||||
dute
|
||||
edo
|
||||
egin
|
||||
ere
|
||||
eta
|
||||
eurak
|
||||
ez
|
||||
gainera
|
||||
gu
|
||||
gutxi
|
||||
guzti
|
||||
haiei
|
||||
haiek
|
||||
haietan
|
||||
hainbeste
|
||||
hala
|
||||
han
|
||||
handik
|
||||
hango
|
||||
hara
|
||||
hari
|
||||
hark
|
||||
hartan
|
||||
hau
|
||||
hauei
|
||||
hauek
|
||||
hauetan
|
||||
hemen
|
||||
hemendik
|
||||
hemengo
|
||||
hi
|
||||
hona
|
||||
honek
|
||||
honela
|
||||
honetan
|
||||
honi
|
||||
hor
|
||||
hori
|
||||
horiei
|
||||
horiek
|
||||
horietan
|
||||
horko
|
||||
horra
|
||||
horrek
|
||||
horrela
|
||||
horretan
|
||||
horri
|
||||
hortik
|
||||
hura
|
||||
izan
|
||||
ni
|
||||
noiz
|
||||
nola
|
||||
non
|
||||
nondik
|
||||
nongo
|
||||
nor
|
||||
nora
|
||||
ze
|
||||
zein
|
||||
zen
|
||||
zenbait
|
||||
zenbat
|
||||
zer
|
||||
zergatik
|
||||
ziren
|
||||
zituen
|
||||
zu
|
||||
zuek
|
||||
zuen
|
||||
zuten
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(BasqueStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
74
analysis/lang/fa/analyzer_fa.go
Normal file
74
analysis/lang/fa/analyzer_fa.go
Normal file
|
@ -0,0 +1,74 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/char/zerowidthnonjoiner"
|
||||
"github.com/blevesearch/bleve/v2/analysis/lang/ar"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
// AnalyzerName is the registry name of the Persian analyzer.
const AnalyzerName = "fa"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
zFilter, err := cache.CharFilterNamed(zerowidthnonjoiner.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopFaFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
CharFilters: []analysis.CharFilter{
|
||||
zFilter,
|
||||
},
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
normArFilter,
|
||||
normFaFilter,
|
||||
stopFaFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
684
analysis/lang/fa/analyzer_fa_test.go
Normal file
684
analysis/lang/fa/analyzer_fa_test.go
Normal file
|
@ -0,0 +1,684 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestPersianAnalyzerVerbs(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// active present indicative
|
||||
{
|
||||
input: []byte("میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite indicative
|
||||
{
|
||||
input: []byte("خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite indicative
|
||||
{
|
||||
input: []byte("میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active future indicative
|
||||
{
|
||||
input: []byte("خواهد خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present progressive indicative
|
||||
{
|
||||
input: []byte("دارد میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت میخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active perfect indicative
|
||||
{
|
||||
input: []byte("خوردهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective perfect indicative
|
||||
{
|
||||
input: []byte("میخوردهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("میخورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("میخورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("میخورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present indicative
|
||||
{
|
||||
input: []byte("خورده میشود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite indicative
|
||||
{
|
||||
input: []byte("خورده شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite indicative
|
||||
{
|
||||
input: []byte("خورده میشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive perfect indicative
|
||||
{
|
||||
input: []byte("خورده شدهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective perfect indicative
|
||||
{
|
||||
input: []byte("خورده میشدهاست"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده شده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده میشده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive future indicative
|
||||
{
|
||||
input: []byte("خورده خواهد شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present progressive indicative
|
||||
{
|
||||
input: []byte("دارد خورده میشود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت خورده میشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present subjunctive
|
||||
{
|
||||
input: []byte("خورده شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده شده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده میشده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده میشده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present subjunctive
|
||||
{
|
||||
input: []byte("بخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بخورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPersianAnalyzerVerbsDefective(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// active present indicative
|
||||
{
|
||||
input: []byte("مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite indicative
|
||||
{
|
||||
input: []byte("خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite indicative
|
||||
{
|
||||
input: []byte("مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active future indicative
|
||||
{
|
||||
input: []byte("خواهد خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present progressive indicative
|
||||
{
|
||||
input: []byte("دارد مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت مي خورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active perfect indicative
|
||||
{
|
||||
input: []byte("خورده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective perfect indicative
|
||||
{
|
||||
input: []byte("مي خورده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("مي خورده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("مي خورده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("مي خورده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present indicative
|
||||
{
|
||||
input: []byte("خورده مي شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite indicative
|
||||
{
|
||||
input: []byte("خورده شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite indicative
|
||||
{
|
||||
input: []byte("خورده مي شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive perfect indicative
|
||||
{
|
||||
input: []byte("خورده شده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective perfect indicative
|
||||
{
|
||||
input: []byte("خورده مي شده است"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده شده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect indicative
|
||||
{
|
||||
input: []byte("خورده مي شده بود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive future indicative
|
||||
{
|
||||
input: []byte("خورده خواهد شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present progressive indicative
|
||||
{
|
||||
input: []byte("دارد خورده مي شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite progressive indicative
|
||||
{
|
||||
input: []byte("داشت خورده مي شد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive present subjunctive
|
||||
{
|
||||
input: []byte("خورده شود"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده شده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective preterite subjunctive
|
||||
{
|
||||
input: []byte("خورده مي شده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// passive imperfective pluperfect subjunctive
|
||||
{
|
||||
input: []byte("خورده مي شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// active present subjunctive
|
||||
{
|
||||
input: []byte("بخورد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("بخورد"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPersianAnalyzerOthers(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// nouns
|
||||
{
|
||||
input: []byte("برگ ها"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("برگ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("برگها"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("برگ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// non persian
|
||||
{
|
||||
input: []byte("English test."),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("english"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("test"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// others
|
||||
{
|
||||
input: []byte("خورده مي شده بوده باشد"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("خورده"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("برگها"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("برگ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
80
analysis/lang/fa/persian_normalize.go
Normal file
80
analysis/lang/fa/persian_normalize.go
Normal file
|
@ -0,0 +1,80 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// NormalizeName is the registry name of the Persian normalization token filter.
const NormalizeName = "normalize_fa"

// Unicode code points involved in Persian normalization. Persian-specific
// letter variants are folded to their canonical Arabic counterparts.
const (
	Yeh        = '\u064A' // ARABIC LETTER YEH — canonical form
	FarsiYeh   = '\u06CC' // ARABIC LETTER FARSI YEH — folded to Yeh
	YehBarree  = '\u06D2' // ARABIC LETTER YEH BARREE — folded to Yeh
	Keheh      = '\u06A9' // ARABIC LETTER KEHEH — folded to Kaf
	Kaf        = '\u0643' // ARABIC LETTER KAF — canonical form
	HamzaAbove = '\u0654' // ARABIC HAMZA ABOVE — removed during normalization
	HehYeh     = '\u06C0' // ARABIC LETTER HEH WITH YEH ABOVE — folded to Heh
	HehGoal    = '\u06C1' // ARABIC LETTER HEH GOAL — folded to Heh
	Heh        = '\u0647' // ARABIC LETTER HEH — canonical form
)
|
||||
|
||||
// PersianNormalizeFilter folds Persian-specific character variants to their
// canonical Arabic counterparts so equivalent spellings index to the same term.
type PersianNormalizeFilter struct {
}

// NewPersianNormalizeFilter returns a new Persian normalization token filter.
func NewPersianNormalizeFilter() *PersianNormalizeFilter {
	return &PersianNormalizeFilter{}
}
|
||||
|
||||
func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
case FarsiYeh, YehBarree:
|
||||
runes[i] = Yeh
|
||||
case Keheh:
|
||||
runes[i] = Kaf
|
||||
case HehYeh, HehGoal:
|
||||
runes[i] = Heh
|
||||
case HamzaAbove: // necessary for HEH + HAMZA
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
// NormalizerFilterConstructor builds the Persian normalization filter for the
// registry; it takes no configuration.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewPersianNormalizeFilter(), nil
}

func init() {
	// Register under "normalize_fa" so analyzers can resolve the filter by name.
	err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
130
analysis/lang/fa/persian_normalize_test.go
Normal file
130
analysis/lang/fa/persian_normalize_test.go
Normal file
|
@ -0,0 +1,130 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
// TestPersianNormalizeFilter checks each character-folding rule of the
// Persian normalizer in isolation: Farsi Yeh and Yeh Barree fold to Yeh,
// Keheh folds to Kaf, Heh-with-Yeh-above and Heh Goal fold to Heh, Hamza
// Above is removed, and the empty term passes through unchanged.
func TestPersianNormalizeFilter(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		// FarsiYeh
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("های"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("هاي"),
				},
			},
		},
		// YehBarree
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("هاے"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("هاي"),
				},
			},
		},
		// Keheh
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("کشاندن"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("كشاندن"),
				},
			},
		},
		// HehYeh
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("كتابۀ"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("كتابه"),
				},
			},
		},
		// HehHamzaAbove
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("كتابهٔ"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("كتابه"),
				},
			},
		},
		// HehGoal
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("زادہ"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("زاده"),
				},
			},
		},
		// empty
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte(""),
				},
			},
		},
	}

	persianNormalizeFilter := NewPersianNormalizeFilter()
	for _, test := range tests {
		actual := persianNormalizeFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %#v, got %#v", test.output, actual)
			t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
		}
	}
}
|
36
analysis/lang/fa/stop_filter_fa.go
Normal file
36
analysis/lang/fa/stop_filter_fa.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fa
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopTokenFilterConstructor builds a stop-token filter backed by the Persian
// stop word token map registered under StopName.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	tokenMap, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop.NewStopTokensFilter(tokenMap), nil
}

func init() {
	// Register under "stop_fa" so the "fa" analyzer can resolve it by name.
	err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
340
analysis/lang/fa/stop_words_fa.go
Normal file
340
analysis/lang/fa/stop_words_fa.go
Normal file
|
@ -0,0 +1,340 @@
|
|||
package fa
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_fa"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# Note: by default this file is used after normalization, so when adding entries
|
||||
# to this file, use the arabic 'ي' instead of 'ی'
|
||||
انان
|
||||
نداشته
|
||||
سراسر
|
||||
خياه
|
||||
ايشان
|
||||
وي
|
||||
تاكنون
|
||||
بيشتري
|
||||
دوم
|
||||
پس
|
||||
ناشي
|
||||
وگو
|
||||
يا
|
||||
داشتند
|
||||
سپس
|
||||
هنگام
|
||||
هرگز
|
||||
پنج
|
||||
نشان
|
||||
امسال
|
||||
ديگر
|
||||
گروهي
|
||||
شدند
|
||||
چطور
|
||||
ده
|
||||
و
|
||||
دو
|
||||
نخستين
|
||||
ولي
|
||||
چرا
|
||||
چه
|
||||
وسط
|
||||
ه
|
||||
كدام
|
||||
قابل
|
||||
يك
|
||||
رفت
|
||||
هفت
|
||||
همچنين
|
||||
در
|
||||
هزار
|
||||
بله
|
||||
بلي
|
||||
شايد
|
||||
اما
|
||||
شناسي
|
||||
گرفته
|
||||
دهد
|
||||
داشته
|
||||
دانست
|
||||
داشتن
|
||||
خواهيم
|
||||
ميليارد
|
||||
وقتيكه
|
||||
امد
|
||||
خواهد
|
||||
جز
|
||||
اورده
|
||||
شده
|
||||
بلكه
|
||||
خدمات
|
||||
شدن
|
||||
برخي
|
||||
نبود
|
||||
بسياري
|
||||
جلوگيري
|
||||
حق
|
||||
كردند
|
||||
نوعي
|
||||
بعري
|
||||
نكرده
|
||||
نظير
|
||||
نبايد
|
||||
بوده
|
||||
بودن
|
||||
داد
|
||||
اورد
|
||||
هست
|
||||
جايي
|
||||
شود
|
||||
دنبال
|
||||
داده
|
||||
بايد
|
||||
سابق
|
||||
هيچ
|
||||
همان
|
||||
انجا
|
||||
كمتر
|
||||
كجاست
|
||||
گردد
|
||||
كسي
|
||||
تر
|
||||
مردم
|
||||
تان
|
||||
دادن
|
||||
بودند
|
||||
سري
|
||||
جدا
|
||||
ندارند
|
||||
مگر
|
||||
يكديگر
|
||||
دارد
|
||||
دهند
|
||||
بنابراين
|
||||
هنگامي
|
||||
سمت
|
||||
جا
|
||||
انچه
|
||||
خود
|
||||
دادند
|
||||
زياد
|
||||
دارند
|
||||
اثر
|
||||
بدون
|
||||
بهترين
|
||||
بيشتر
|
||||
البته
|
||||
به
|
||||
براساس
|
||||
بيرون
|
||||
كرد
|
||||
بعضي
|
||||
گرفت
|
||||
توي
|
||||
اي
|
||||
ميليون
|
||||
او
|
||||
جريان
|
||||
تول
|
||||
بر
|
||||
مانند
|
||||
برابر
|
||||
باشيم
|
||||
مدتي
|
||||
گويند
|
||||
اكنون
|
||||
تا
|
||||
تنها
|
||||
جديد
|
||||
چند
|
||||
بي
|
||||
نشده
|
||||
كردن
|
||||
كردم
|
||||
گويد
|
||||
كرده
|
||||
كنيم
|
||||
نمي
|
||||
نزد
|
||||
روي
|
||||
قصد
|
||||
فقط
|
||||
بالاي
|
||||
ديگران
|
||||
اين
|
||||
ديروز
|
||||
توسط
|
||||
سوم
|
||||
ايم
|
||||
دانند
|
||||
سوي
|
||||
استفاده
|
||||
شما
|
||||
كنار
|
||||
داريم
|
||||
ساخته
|
||||
طور
|
||||
امده
|
||||
رفته
|
||||
نخست
|
||||
بيست
|
||||
نزديك
|
||||
طي
|
||||
كنيد
|
||||
از
|
||||
انها
|
||||
تمامي
|
||||
داشت
|
||||
يكي
|
||||
طريق
|
||||
اش
|
||||
چيست
|
||||
روب
|
||||
نمايد
|
||||
گفت
|
||||
چندين
|
||||
چيزي
|
||||
تواند
|
||||
ام
|
||||
ايا
|
||||
با
|
||||
ان
|
||||
ايد
|
||||
ترين
|
||||
اينكه
|
||||
ديگري
|
||||
راه
|
||||
هايي
|
||||
بروز
|
||||
همچنان
|
||||
پاعين
|
||||
كس
|
||||
حدود
|
||||
مختلف
|
||||
مقابل
|
||||
چيز
|
||||
گيرد
|
||||
ندارد
|
||||
ضد
|
||||
همچون
|
||||
سازي
|
||||
شان
|
||||
مورد
|
||||
باره
|
||||
مرسي
|
||||
خويش
|
||||
برخوردار
|
||||
چون
|
||||
خارج
|
||||
شش
|
||||
هنوز
|
||||
تحت
|
||||
ضمن
|
||||
هستيم
|
||||
گفته
|
||||
فكر
|
||||
بسيار
|
||||
پيش
|
||||
براي
|
||||
روزهاي
|
||||
انكه
|
||||
نخواهد
|
||||
بالا
|
||||
كل
|
||||
وقتي
|
||||
كي
|
||||
چنين
|
||||
كه
|
||||
گيري
|
||||
نيست
|
||||
است
|
||||
كجا
|
||||
كند
|
||||
نيز
|
||||
يابد
|
||||
بندي
|
||||
حتي
|
||||
توانند
|
||||
عقب
|
||||
خواست
|
||||
كنند
|
||||
بين
|
||||
تمام
|
||||
همه
|
||||
ما
|
||||
باشند
|
||||
مثل
|
||||
شد
|
||||
اري
|
||||
باشد
|
||||
اره
|
||||
طبق
|
||||
بعد
|
||||
اگر
|
||||
صورت
|
||||
غير
|
||||
جاي
|
||||
بيش
|
||||
ريزي
|
||||
اند
|
||||
زيرا
|
||||
چگونه
|
||||
بار
|
||||
لطفا
|
||||
مي
|
||||
درباره
|
||||
من
|
||||
ديده
|
||||
همين
|
||||
گذاري
|
||||
برداري
|
||||
علت
|
||||
گذاشته
|
||||
هم
|
||||
فوق
|
||||
نه
|
||||
ها
|
||||
شوند
|
||||
اباد
|
||||
همواره
|
||||
هر
|
||||
اول
|
||||
خواهند
|
||||
چهار
|
||||
نام
|
||||
امروز
|
||||
مان
|
||||
هاي
|
||||
قبل
|
||||
كنم
|
||||
سعي
|
||||
تازه
|
||||
را
|
||||
هستند
|
||||
زير
|
||||
جلوي
|
||||
عنوان
|
||||
بود
|
||||
`)
|
||||
|
||||
// TokenMapConstructor loads the Persian stop word list into a token map for
// the registry.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	rv := analysis.NewTokenMap()
	err := rv.LoadBytes(PersianStopWords)
	return rv, err
}

func init() {
	// Register the stop word token map under "stop_fa".
	err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
	if err != nil {
		panic(err)
	}
}
|
60
analysis/lang/fi/analyzer_fi.go
Normal file
60
analysis/lang/fi/analyzer_fi.go
Normal file
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "fi"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopFiFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerFiFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopFiFilter,
|
||||
stemmerFiFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
	// Register the Finnish analyzer under "fi".
	err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
	if err != nil {
		panic(err)
	}
}
|
70
analysis/lang/fi/analyzer_fi_test.go
Normal file
70
analysis/lang/fi/analyzer_fi_test.go
Normal file
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestFinishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("edeltäjiinsä"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("edeltäj"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("edeltäjistään"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("edeltäj"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("olla"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
52
analysis/lang/fi/stemmer_fi.go
Normal file
52
analysis/lang/fi/stemmer_fi.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/finnish"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the registry name of the Finnish snowball stemmer.
const SnowballStemmerName = "stemmer_fi_snowball"

// FinnishStemmerFilter stems tokens using the snowball Finnish algorithm.
type FinnishStemmerFilter struct {
}

// NewFinnishStemmerFilter returns a new Finnish snowball stemming filter.
func NewFinnishStemmerFilter() *FinnishStemmerFilter {
	return &FinnishStemmerFilter{}
}
|
||||
|
||||
func (s *FinnishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
env := snowballstem.NewEnv(string(token.Term))
|
||||
finnish.Stem(env)
|
||||
token.Term = []byte(env.Current())
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
// FinnishStemmerFilterConstructor builds the Finnish snowball stemmer for the
// registry; it takes no configuration.
func FinnishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewFinnishStemmerFilter(), nil
}

func init() {
	// Register under "stemmer_fi_snowball" so analyzers can resolve it by name.
	err := registry.RegisterTokenFilter(SnowballStemmerName, FinnishStemmerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
36
analysis/lang/fi/stop_filter_fi.go
Normal file
36
analysis/lang/fi/stop_filter_fi.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopTokenFilterConstructor builds a stop-token filter backed by the Finnish
// stop word token map registered under StopName.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	tokenMap, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop.NewStopTokensFilter(tokenMap), nil
}

func init() {
	// Register under "stop_fi" so the "fi" analyzer can resolve it by name.
	err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
124
analysis/lang/fi/stop_words_fi.go
Normal file
124
analysis/lang/fi/stop_words_fi.go
Normal file
|
@ -0,0 +1,124 @@
|
|||
package fi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_fi"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| forms of BE
|
||||
|
||||
olla
|
||||
olen
|
||||
olet
|
||||
on
|
||||
olemme
|
||||
olette
|
||||
ovat
|
||||
ole | negative form
|
||||
|
||||
oli
|
||||
olisi
|
||||
olisit
|
||||
olisin
|
||||
olisimme
|
||||
olisitte
|
||||
olisivat
|
||||
olit
|
||||
olin
|
||||
olimme
|
||||
olitte
|
||||
olivat
|
||||
ollut
|
||||
olleet
|
||||
|
||||
en | negation
|
||||
et
|
||||
ei
|
||||
emme
|
||||
ette
|
||||
eivät
|
||||
|
||||
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
|
||||
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
|
||||
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
|
||||
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
|
||||
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
|
||||
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
|
||||
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
|
||||
|
||||
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
|
||||
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
|
||||
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
|
||||
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
|
||||
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
|
||||
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
|
||||
|
||||
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
|
||||
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
|
||||
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
|
||||
mitkä | (pl)
|
||||
|
||||
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
|
||||
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
|
||||
|
||||
| conjunctions
|
||||
|
||||
että | that
|
||||
ja | and
|
||||
jos | if
|
||||
koska | because
|
||||
kuin | than
|
||||
mutta | but
|
||||
niin | so
|
||||
sekä | and
|
||||
sillä | for
|
||||
tai | or
|
||||
vaan | but
|
||||
vai | or
|
||||
vaikka | although
|
||||
|
||||
|
||||
| prepositions
|
||||
|
||||
kanssa | with
|
||||
mukaan | according to
|
||||
noin | about
|
||||
poikki | across
|
||||
yli | over, across
|
||||
|
||||
| other
|
||||
|
||||
kun | when
|
||||
niin | so
|
||||
nyt | now
|
||||
itse | self
|
||||
|
||||
`)
|
||||
|
||||
// TokenMapConstructor loads the Finnish stop word list into a token map for
// the registry.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	rv := analysis.NewTokenMap()
	err := rv.LoadBytes(FinnishStopWords)
	return rv, err
}

func init() {
	// Register the stop word token map under "stop_fi".
	err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
	if err != nil {
		panic(err)
	}
}
|
65
analysis/lang/fr/analyzer_fr.go
Normal file
65
analysis/lang/fr/analyzer_fr.go
Normal file
|
@ -0,0 +1,65 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "fr"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopFrFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerFrFilter, err := cache.TokenFilterNamed(LightStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
elisionFilter,
|
||||
stopFrFilter,
|
||||
stemmerFrFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
	// Register the French analyzer under "fr".
	err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
	if err != nil {
		panic(err)
	}
}
|
209
analysis/lang/fr/analyzer_fr_test.go
Normal file
209
analysis/lang/fr/analyzer_fr_test.go
Normal file
|
@ -0,0 +1,209 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// TestFrenchAnalyzer exercises the "fr" analyzer end to end: tokenization and
// punctuation stripping, lowercasing, stop word removal, and light stemming of
// nouns, adjectives, and verbs (double letters are reduced by the light
// stemmer, e.g. "entreguillemet" -> "entreguilemet").
func TestFrenchAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		{
			input:  []byte(""),
			output: analysis.TokenStream{},
		},
		{
			input: []byte("chien chat cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien CHAT CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("  chien  ,? + = -  CHAT /: > CHEVAL"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		{
			input: []byte("chien++"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
			},
		},
		{
			input: []byte("mot \"entreguillemet\""),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("mot"),
				},
				&analysis.Token{
					Term: []byte("entreguilemet"),
				},
			},
		},
		{
			input: []byte("Jean-François"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("jean"),
				},
				&analysis.Token{
					Term: []byte("francoi"),
				},
			},
		},
		// stop words
		{
			input: []byte("le la chien les aux chat du des à cheval"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("chien"),
				},
				&analysis.Token{
					Term: []byte("chat"),
				},
				&analysis.Token{
					Term: []byte("cheval"),
				},
			},
		},
		// nouns and adjectives
		{
			input: []byte("lances chismes habitable chiste éléments captifs"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("lanc"),
				},
				&analysis.Token{
					Term: []byte("chism"),
				},
				&analysis.Token{
					Term: []byte("habitabl"),
				},
				&analysis.Token{
					Term: []byte("chist"),
				},
				&analysis.Token{
					Term: []byte("element"),
				},
				&analysis.Token{
					Term: []byte("captif"),
				},
			},
		},
		// verbs
		{
			input: []byte("finissions souffrirent rugissante"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("finision"),
				},
				&analysis.Token{
					Term: []byte("soufrirent"),
				},
				&analysis.Token{
					Term: []byte("rugisant"),
				},
			},
		},
		{
			input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("c3po"),
				},
				&analysis.Token{
					Term: []byte("aujourd'hui"),
				},
				&analysis.Token{
					Term: []byte("oeuf"),
				},
				&analysis.Token{
					Term: []byte("ïaöuaä"),
				},
				&analysis.Token{
					Term: []byte("anticonstitutionel"),
				},
				&analysis.Token{
					Term: []byte("java"),
				},
			},
		},
		{
			input: []byte("propriétaire"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("proprietair"),
				},
			},
		},
	}

	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if len(actual) != len(test.output) {
			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
		}
		for i, tok := range actual {
			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
			}
		}
	}
}
|
40
analysis/lang/fr/articles_fr.go
Normal file
40
analysis/lang/fr/articles_fr.go
Normal file
|
@ -0,0 +1,40 @@
|
|||
package fr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// ArticlesName is the name under which the French article token map is
// registered with the bleve registry.
const ArticlesName = "articles_fr"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis

// FrenchArticles lists, one per line, the French contractions that appear
// before an apostrophe (e.g. the "l" of "l'avion"); the elision filter
// uses this map to strip such prefixes from tokens.
var FrenchArticles = []byte(`
l
m
t
qu
n
s
j
d
c
jusqu
quoiqu
lorsqu
puisqu
`)

// ArticlesTokenMapConstructor builds a token map from FrenchArticles.
// The config and cache arguments are unused; errors come only from
// parsing the byte list.
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	rv := analysis.NewTokenMap()
	err := rv.LoadBytes(FrenchArticles)
	return rv, err
}

// init registers the article token map globally; failure to register is
// a programmer error (e.g. duplicate name), hence the panic.
func init() {
	err := registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
	if err != nil {
		panic(err)
	}
}
|
40
analysis/lang/fr/elision_fr.go
Normal file
40
analysis/lang/fr/elision_fr.go
Normal file
|
@ -0,0 +1,40 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/elision"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const ElisionName = "elision_fr"
|
||||
|
||||
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error building elision filter: %v", err)
|
||||
}
|
||||
return elision.NewElisionFilter(articlesTokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
55
analysis/lang/fr/elision_fr_test.go
Normal file
55
analysis/lang/fr/elision_fr_test.go
Normal file
|
@ -0,0 +1,55 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestFrenchElision(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("l'avion"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("avion"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := elisionFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
309
analysis/lang/fr/light_stemmer_fr.go
Normal file
309
analysis/lang/fr/light_stemmer_fr.go
Normal file
|
@ -0,0 +1,309 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// LightStemmerName is the registry name of the light French stemmer.
const LightStemmerName = "stemmer_fr_light"

// FrenchLightStemmerFilter applies a rule-based "light" French stemmer
// (see stem below) to every token in a stream. The struct is stateless.
type FrenchLightStemmerFilter struct {
}

// NewFrenchLightStemmerFilter returns a new light French stemmer filter.
func NewFrenchLightStemmerFilter() *FrenchLightStemmerFilter {
	return &FrenchLightStemmerFilter{}
}

// Filter stems each token in place: token.Term is decoded to runes,
// rewritten by stem, and re-encoded. The same stream is returned.
func (s *FrenchLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		runes := bytes.Runes(token.Term)
		runes = stem(runes)
		token.Term = analysis.BuildTermFromRunes(runes)
	}
	return input
}
|
||||
|
||||
// stem applies the light French stemming rules to a word (as runes) and
// returns the stem, normalized by norm. The word is modified in place.
//
// The rules are an ordered cascade of suffix tests; order matters, and
// most suffix branches return immediately after rewriting. Rune writes
// into the slice (e.g. turning "-aux" into "-al") rely on the exact
// lengths checked just before them.
// NOTE(review): this appears to be a port of Lucene's FrenchLightStemmer
// (the package cites lucene-4.7.2 resources elsewhere) — confirm the
// thresholds against the upstream algorithm before changing any of them.
func stem(input []rune) []rune {

	inputLen := len(input)

	// plural "-aux" -> "-al" (e.g. chevaux -> cheval), unless preceded by 'e'
	if inputLen > 5 && input[inputLen-1] == 'x' {
		if input[inputLen-3] == 'a' && input[inputLen-2] == 'u' && input[inputLen-4] != 'e' {
			input[inputLen-2] = 'l'
		}
		input = input[0 : inputLen-1]
		inputLen = len(input)
	}

	// drop remaining plural markers 'x' and 's'
	if inputLen > 3 && input[inputLen-1] == 'x' {
		input = input[0 : inputLen-1]
		inputLen = len(input)
	}

	if inputLen > 3 && input[inputLen-1] == 's' {
		input = input[0 : inputLen-1]
		inputLen = len(input)
	}

	// verbal/derivational suffixes; each branch rewrites the tail and
	// returns the normalized result
	if inputLen > 9 && analysis.RunesEndsWith(input, "issement") {
		input = input[0 : inputLen-6]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "issant") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 6 && analysis.RunesEndsWith(input, "ement") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		// adverbs built on "-ive" ("-ivement") reduce to "-if"
		if inputLen > 3 && analysis.RunesEndsWith(input, "ive") {
			input = input[0 : inputLen-1]
			inputLen = len(input)
			input[inputLen-1] = 'f'
		}
		return norm(input)
	}

	if inputLen > 11 && analysis.RunesEndsWith(input, "ficatrice") {
		input = input[0 : inputLen-5]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 10 && analysis.RunesEndsWith(input, "ficateur") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 9 && analysis.RunesEndsWith(input, "catrice") {
		input = input[0 : inputLen-3]
		inputLen = len(input)
		input[inputLen-4] = 'q'
		input[inputLen-3] = 'u'
		input[inputLen-2] = 'e'
		//s[len-1] = 'r' <-- unnecessary, already 'r'.
		return norm(input)
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "cateur") {
		input = input[0 : inputLen-2]
		inputLen = len(input)
		input[inputLen-4] = 'q'
		input[inputLen-3] = 'u'
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "atrice") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 7 && analysis.RunesEndsWith(input, "ateur") {
		input = input[0 : inputLen-3]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		input[inputLen-1] = 'r'
		return norm(input)
	}

	// "-trice" -> "-teur"; note: deliberately falls through to the
	// checks below instead of returning
	if inputLen > 6 && analysis.RunesEndsWith(input, "trice") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-3] = 'e'
		input[inputLen-2] = 'u'
		input[inputLen-1] = 'r'
	}

	if inputLen > 5 && analysis.RunesEndsWith(input, "ième") {
		return norm(input[0 : inputLen-4])
	}

	if inputLen > 7 && analysis.RunesEndsWith(input, "teuse") {
		input = input[0 : inputLen-2]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 6 && analysis.RunesEndsWith(input, "teur") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-1] = 'r'
		return norm(input)
	}

	if inputLen > 5 && analysis.RunesEndsWith(input, "euse") {
		return norm(input[0 : inputLen-2])
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "ère") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-2] = 'e'
		return norm(input)
	}

	if inputLen > 7 && analysis.RunesEndsWith(input, "ive") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-1] = 'f'
		return norm(input)
	}

	// irregular feminines "folle"/"molle" -> "fou"/"mou"
	if inputLen > 4 &&
		(analysis.RunesEndsWith(input, "folle") ||
			analysis.RunesEndsWith(input, "molle")) {
		input = input[0 : inputLen-2]
		inputLen = len(input)
		input[inputLen-1] = 'u'
		return norm(input)
	}

	if inputLen > 9 && analysis.RunesEndsWith(input, "nnelle") {
		return norm(input[0 : inputLen-5])
	}

	if inputLen > 9 && analysis.RunesEndsWith(input, "nnel") {
		return norm(input[0 : inputLen-3])
	}

	// "-ète" -> "-et" (falls through, no return)
	if inputLen > 4 && analysis.RunesEndsWith(input, "ète") {
		input = input[0 : inputLen-1]
		inputLen = len(input)
		input[inputLen-2] = 'e'
	}

	// "-ique" dropped (falls through, no return)
	if inputLen > 8 && analysis.RunesEndsWith(input, "ique") {
		input = input[0 : inputLen-4]
		inputLen = len(input)
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "esse") {
		return norm(input[0 : inputLen-3])
	}

	if inputLen > 7 && analysis.RunesEndsWith(input, "inage") {
		return norm(input[0 : inputLen-3])
	}

	if inputLen > 9 && analysis.RunesEndsWith(input, "isation") {
		input = input[0 : inputLen-7]
		inputLen = len(input)
		if inputLen > 5 && analysis.RunesEndsWith(input, "ual") {
			input[inputLen-2] = 'e'
		}
		return norm(input)
	}

	if inputLen > 9 && analysis.RunesEndsWith(input, "isateur") {
		return norm(input[0 : inputLen-7])
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "ation") {
		return norm(input[0 : inputLen-5])
	}

	if inputLen > 8 && analysis.RunesEndsWith(input, "ition") {
		return norm(input[0 : inputLen-5])
	}

	// no suffix rule matched: still normalize
	return norm(input)

}
|
||||
|
||||
// norm finishes stemming: for words longer than 4 runes it folds common
// accented vowels and 'ç' to their base letters and collapses runs of
// repeated letters, then strips a trailing "ie", then a final 'r', up to
// two final 'e's, and one letter of a doubled final pair. Shorter words
// pass through untouched. The input slice is modified in place.
func norm(input []rune) []rune {

	if len(input) > 4 {
		for i := 0; i < len(input); i++ {
			// accent folding for the rune at position i
			switch input[i] {
			case 'à', 'á', 'â':
				input[i] = 'a'
			case 'ô':
				input[i] = 'o'
			case 'è', 'é', 'ê':
				input[i] = 'e'
			case 'ù', 'û':
				input[i] = 'u'
			case 'î':
				input[i] = 'i'
			case 'ç':
				input[i] = 'c'
			}

			// NOTE(review): this duplicate-letter collapse is nested inside
			// the folding loop (the inner i shadows the outer i) and so
			// re-scans the whole word on every outer iteration — O(n²).
			// It looks intentional in this port, but confirm against the
			// upstream (Lucene FrenchLightStemmer) algorithm before touching.
			ch := input[0]
			for i := 1; i < len(input); i++ {
				if input[i] == ch && unicode.IsLetter(ch) {
					input = analysis.DeleteRune(input, i)
					i -= 1
				} else {
					ch = input[i]
				}
			}
		}
	}

	// drop a trailing "ie" (e.g. "-erie" endings after earlier trimming)
	if len(input) > 4 && analysis.RunesEndsWith(input, "ie") {
		input = input[0 : len(input)-2]
	}

	if len(input) > 4 {
		if input[len(input)-1] == 'r' {
			input = input[0 : len(input)-1]
		}
		// two consecutive checks: strips up to two trailing 'e's ("ée")
		if input[len(input)-1] == 'e' {
			input = input[0 : len(input)-1]
		}
		if input[len(input)-1] == 'e' {
			input = input[0 : len(input)-1]
		}
		// reduce a doubled final letter to a single one
		if input[len(input)-1] == input[len(input)-2] && unicode.IsLetter(input[len(input)-1]) {
			input = input[0 : len(input)-1]
		}
	}

	return input
}
|
||||
|
||||
func FrenchLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewFrenchLightStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(LightStemmerName, FrenchLightStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
1015
analysis/lang/fr/light_stemmer_fr_test.go
Normal file
1015
analysis/lang/fr/light_stemmer_fr_test.go
Normal file
File diff suppressed because it is too large
Load diff
82
analysis/lang/fr/minimal_stemmer_fr.go
Normal file
82
analysis/lang/fr/minimal_stemmer_fr.go
Normal file
|
@ -0,0 +1,82 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// MinimalStemmerName is the registry name of the minimal French stemmer.
const MinimalStemmerName = "stemmer_fr_min"

// FrenchMinimalStemmerFilter applies the minimal French stemmer (see
// minstem) to every token in a stream. The struct is stateless.
type FrenchMinimalStemmerFilter struct {
}

// NewFrenchMinimalStemmerFilter returns a new minimal French stemmer filter.
func NewFrenchMinimalStemmerFilter() *FrenchMinimalStemmerFilter {
	return &FrenchMinimalStemmerFilter{}
}

// Filter stems each token in place: token.Term is decoded to runes,
// rewritten by minstem, and re-encoded. The same stream is returned.
func (s *FrenchMinimalStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		runes := bytes.Runes(token.Term)
		runes = minstem(runes)
		token.Term = analysis.BuildTermFromRunes(runes)
	}
	return input
}
|
||||
|
||||
// minstem applies the minimal French stemming algorithm to a word given
// as runes, modifying the slice in place and returning the stem.
//
// Words shorter than 6 runes are returned untouched. A trailing 'x' is
// handled first: "-aux" plurals are rewritten to "-al" (chevaux ->
// cheval) and the 'x' dropped. Otherwise at most one each of the final
// runes 's', 'r', 'e' and 'é' is stripped, in that order, and finally a
// doubled last letter is reduced to a single one (baronn -> baron).
func minstem(word []rune) []rune {
	if len(word) < 6 {
		return word
	}

	last := func() rune { return word[len(word)-1] }

	if last() == 'x' {
		// plural in -aux: restore the singular -al before dropping the x
		if word[len(word)-3] == 'a' && word[len(word)-2] == 'u' {
			word[len(word)-2] = 'l'
		}
		return word[:len(word)-1]
	}

	// strip at most one of each suffix rune, in this fixed order
	for _, suffix := range []rune{'s', 'r', 'e', 'é'} {
		if last() == suffix {
			word = word[:len(word)-1]
		}
	}

	// collapse a doubled final letter
	if last() == word[len(word)-2] {
		word = word[:len(word)-1]
	}
	return word
}
|
||||
|
||||
func FrenchMinimalStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewFrenchMinimalStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(MinimalStemmerName, FrenchMinimalStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
139
analysis/lang/fr/minimal_stemmer_fr_test.go
Normal file
139
analysis/lang/fr/minimal_stemmer_fr_test.go
Normal file
|
@ -0,0 +1,139 @@
|
|||
// Copyright (c) 2015 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestFrenchMinimalStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chevaux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("cheval"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hiboux"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hibou"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chantés"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chanter"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chante"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("chant"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baronnes"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("barons"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("baron"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(MinimalStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
52
analysis/lang/fr/stemmer_fr_snowball.go
Normal file
52
analysis/lang/fr/stemmer_fr_snowball.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/french"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the registry name of the Snowball-based French stemmer.
const SnowballStemmerName = "stemmer_fr_snowball"

// FrenchStemmerFilter stems tokens with the Snowball French algorithm
// provided by github.com/blevesearch/snowballstem/french. It is stateless.
type FrenchStemmerFilter struct {
}

// NewFrenchStemmerFilter returns a new Snowball French stemmer filter.
func NewFrenchStemmerFilter() *FrenchStemmerFilter {
	return &FrenchStemmerFilter{}
}

// Filter stems each token in place: the term is loaded into a snowball
// environment, stemmed, and written back. The same stream is returned.
func (s *FrenchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, token := range input {
		env := snowballstem.NewEnv(string(token.Term))
		french.Stem(env)
		token.Term = []byte(env.Current())
	}
	return input
}

// FrenchStemmerFilterConstructor builds the filter for the registry;
// config and cache are unused.
func FrenchStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewFrenchStemmerFilter(), nil
}

// init registers the filter; a registration failure is a programmer
// error, hence the panic.
func init() {
	err := registry.RegisterTokenFilter(SnowballStemmerName, FrenchStemmerFilterConstructor)
	if err != nil {
		panic(err)
	}
}
|
79
analysis/lang/fr/stemmer_fr_snowball_test.go
Normal file
79
analysis/lang/fr/stemmer_fr_snowball_test.go
Normal file
|
@ -0,0 +1,79 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestSnowballFrenchStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("antagoniste"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("antagon"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("barbouillait"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("barbouill"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("calculateur"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("calcul"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/fr/stop_filter_fr.go
Normal file
36
analysis/lang/fr/stop_filter_fr.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package fr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
213
analysis/lang/fr/stop_words_fr.go
Normal file
213
analysis/lang/fr/stop_words_fr.go
Normal file
|
@ -0,0 +1,213 @@
|
|||
package fr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_fr"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var FrenchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A French stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
au | a + le
|
||||
aux | a + les
|
||||
avec | with
|
||||
ce | this
|
||||
ces | these
|
||||
dans | with
|
||||
de | of
|
||||
des | de + les
|
||||
du | de + le
|
||||
elle | she
|
||||
en | 'of them' etc
|
||||
et | and
|
||||
eux | them
|
||||
il | he
|
||||
je | I
|
||||
la | the
|
||||
le | the
|
||||
leur | their
|
||||
lui | him
|
||||
ma | my (fem)
|
||||
mais | but
|
||||
me | me
|
||||
même | same; as in moi-même (myself) etc
|
||||
mes | me (pl)
|
||||
moi | me
|
||||
mon | my (masc)
|
||||
ne | not
|
||||
nos | our (pl)
|
||||
notre | our
|
||||
nous | we
|
||||
on | one
|
||||
ou | where
|
||||
par | by
|
||||
pas | not
|
||||
pour | for
|
||||
qu | que before vowel
|
||||
que | that
|
||||
qui | who
|
||||
sa | his, her (fem)
|
||||
se | oneself
|
||||
ses | his (pl)
|
||||
son | his, her (masc)
|
||||
sur | on
|
||||
ta | thy (fem)
|
||||
te | thee
|
||||
tes | thy (pl)
|
||||
toi | thee
|
||||
ton | thy (masc)
|
||||
tu | thou
|
||||
un | a
|
||||
une | a
|
||||
vos | your (pl)
|
||||
votre | your
|
||||
vous | you
|
||||
|
||||
| single letter forms
|
||||
|
||||
c | c'
|
||||
d | d'
|
||||
j | j'
|
||||
l | l'
|
||||
à | to, at
|
||||
m | m'
|
||||
n | n'
|
||||
s | s'
|
||||
t | t'
|
||||
y | there
|
||||
|
||||
| forms of être (not including the infinitive):
|
||||
été
|
||||
étée
|
||||
étées
|
||||
étés
|
||||
étant
|
||||
suis
|
||||
es
|
||||
est
|
||||
sommes
|
||||
êtes
|
||||
sont
|
||||
serai
|
||||
seras
|
||||
sera
|
||||
serons
|
||||
serez
|
||||
seront
|
||||
serais
|
||||
serait
|
||||
serions
|
||||
seriez
|
||||
seraient
|
||||
étais
|
||||
était
|
||||
étions
|
||||
étiez
|
||||
étaient
|
||||
fus
|
||||
fut
|
||||
fûmes
|
||||
fûtes
|
||||
furent
|
||||
sois
|
||||
soit
|
||||
soyons
|
||||
soyez
|
||||
soient
|
||||
fusse
|
||||
fusses
|
||||
fût
|
||||
fussions
|
||||
fussiez
|
||||
fussent
|
||||
|
||||
| forms of avoir (not including the infinitive):
|
||||
ayant
|
||||
eu
|
||||
eue
|
||||
eues
|
||||
eus
|
||||
ai
|
||||
as
|
||||
avons
|
||||
avez
|
||||
ont
|
||||
aurai
|
||||
auras
|
||||
aura
|
||||
aurons
|
||||
aurez
|
||||
auront
|
||||
aurais
|
||||
aurait
|
||||
aurions
|
||||
auriez
|
||||
auraient
|
||||
avais
|
||||
avait
|
||||
avions
|
||||
aviez
|
||||
avaient
|
||||
eut
|
||||
eûmes
|
||||
eûtes
|
||||
eurent
|
||||
aie
|
||||
aies
|
||||
ait
|
||||
ayons
|
||||
ayez
|
||||
aient
|
||||
eusse
|
||||
eusses
|
||||
eût
|
||||
eussions
|
||||
eussiez
|
||||
eussent
|
||||
|
||||
| Later additions (from Jean-Christophe Deschamps)
|
||||
ceci | this
|
||||
cela | that
|
||||
celà | that
|
||||
cet | this
|
||||
cette | this
|
||||
ici | here
|
||||
ils | they
|
||||
les | the (pl)
|
||||
leurs | their (pl)
|
||||
quel | which
|
||||
quels | which
|
||||
quelle | which
|
||||
quelles | which
|
||||
sans | without
|
||||
soi | oneself
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(FrenchStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
30
analysis/lang/ga/articles_ga.go
Normal file
30
analysis/lang/ga/articles_ga.go
Normal file
|
@ -0,0 +1,30 @@
|
|||
package ga
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const ArticlesName = "articles_ga"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
|
||||
|
||||
var IrishArticles = []byte(`
|
||||
d
|
||||
m
|
||||
b
|
||||
`)
|
||||
|
||||
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(IrishArticles)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
40
analysis/lang/ga/elision_ga.go
Normal file
40
analysis/lang/ga/elision_ga.go
Normal file
|
@ -0,0 +1,40 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ga

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/elision"
	"github.com/blevesearch/bleve/v2/registry"
)

// ElisionName is the registry key for the Irish elision token filter.
const ElisionName = "elision_ga"

// ElisionFilterConstructor builds an elision filter that strips the
// contracted Irish articles listed in the ArticlesName token map.
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	articles, tmErr := cache.TokenMapNamed(ArticlesName)
	if tmErr != nil {
		return nil, fmt.Errorf("error building elision filter: %v", tmErr)
	}
	return elision.NewElisionFilter(articles), nil
}

// init registers the Irish elision filter under ElisionName.
func init() {
	if err := registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor); err != nil {
		panic(err)
	}
}
|
55
analysis/lang/ga/elision_ga_test.go
Normal file
55
analysis/lang/ga/elision_ga_test.go
Normal file
|
@ -0,0 +1,55 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ga

import (
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
)

// TestIrishElision verifies that the Irish (ga) elision filter strips a
// contracted article prefix, e.g. "b'fhearr" -> "fhearr".
// Renamed from TestFrenchElision: the old name was a copy/paste slip from
// the French analyzer package; this file tests the Irish filter.
func TestIrishElision(t *testing.T) {
	tests := []struct {
		input  analysis.TokenStream
		output analysis.TokenStream
	}{
		{
			input: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("b'fhearr"),
				},
			},
			output: analysis.TokenStream{
				&analysis.Token{
					Term: []byte("fhearr"),
				},
			},
		},
	}

	cache := registry.NewCache()
	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := elisionFilter.Filter(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
		}
	}
}
|
36
analysis/lang/ga/stop_filter_ga.go
Normal file
36
analysis/lang/ga/stop_filter_ga.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package ga
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
137
analysis/lang/ga/stop_words_ga.go
Normal file
137
analysis/lang/ga/stop_words_ga.go
Normal file
|
@ -0,0 +1,137 @@
|
|||
package ga
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_ga"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var IrishStopWords = []byte(`
|
||||
a
|
||||
ach
|
||||
ag
|
||||
agus
|
||||
an
|
||||
aon
|
||||
ar
|
||||
arna
|
||||
as
|
||||
b'
|
||||
ba
|
||||
beirt
|
||||
bhúr
|
||||
caoga
|
||||
ceathair
|
||||
ceathrar
|
||||
chomh
|
||||
chtó
|
||||
chuig
|
||||
chun
|
||||
cois
|
||||
céad
|
||||
cúig
|
||||
cúigear
|
||||
d'
|
||||
daichead
|
||||
dar
|
||||
de
|
||||
deich
|
||||
deichniúr
|
||||
den
|
||||
dhá
|
||||
do
|
||||
don
|
||||
dtí
|
||||
dá
|
||||
dár
|
||||
dó
|
||||
faoi
|
||||
faoin
|
||||
faoina
|
||||
faoinár
|
||||
fara
|
||||
fiche
|
||||
gach
|
||||
gan
|
||||
go
|
||||
gur
|
||||
haon
|
||||
hocht
|
||||
i
|
||||
iad
|
||||
idir
|
||||
in
|
||||
ina
|
||||
ins
|
||||
inár
|
||||
is
|
||||
le
|
||||
leis
|
||||
lena
|
||||
lenár
|
||||
m'
|
||||
mar
|
||||
mo
|
||||
mé
|
||||
na
|
||||
nach
|
||||
naoi
|
||||
naonúr
|
||||
ná
|
||||
ní
|
||||
níor
|
||||
nó
|
||||
nócha
|
||||
ocht
|
||||
ochtar
|
||||
os
|
||||
roimh
|
||||
sa
|
||||
seacht
|
||||
seachtar
|
||||
seachtó
|
||||
seasca
|
||||
seisear
|
||||
siad
|
||||
sibh
|
||||
sinn
|
||||
sna
|
||||
sé
|
||||
sí
|
||||
tar
|
||||
thar
|
||||
thú
|
||||
triúr
|
||||
trí
|
||||
trína
|
||||
trínár
|
||||
tríocha
|
||||
tú
|
||||
um
|
||||
ár
|
||||
é
|
||||
éis
|
||||
í
|
||||
ó
|
||||
ón
|
||||
óna
|
||||
ónár
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(IrishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/gl/stop_filter_gl.go
Normal file
36
analysis/lang/gl/stop_filter_gl.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package gl
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
188
analysis/lang/gl/stop_words_gl.go
Normal file
188
analysis/lang/gl/stop_words_gl.go
Normal file
|
@ -0,0 +1,188 @@
|
|||
package gl
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_gl"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
|
||||
// ` was changed to ' to allow for literal string
|
||||
|
||||
var GalicianStopWords = []byte(`# galican stopwords
|
||||
a
|
||||
aínda
|
||||
alí
|
||||
aquel
|
||||
aquela
|
||||
aquelas
|
||||
aqueles
|
||||
aquilo
|
||||
aquí
|
||||
ao
|
||||
aos
|
||||
as
|
||||
así
|
||||
á
|
||||
ben
|
||||
cando
|
||||
che
|
||||
co
|
||||
coa
|
||||
comigo
|
||||
con
|
||||
connosco
|
||||
contigo
|
||||
convosco
|
||||
coas
|
||||
cos
|
||||
cun
|
||||
cuns
|
||||
cunha
|
||||
cunhas
|
||||
da
|
||||
dalgunha
|
||||
dalgunhas
|
||||
dalgún
|
||||
dalgúns
|
||||
das
|
||||
de
|
||||
del
|
||||
dela
|
||||
delas
|
||||
deles
|
||||
desde
|
||||
deste
|
||||
do
|
||||
dos
|
||||
dun
|
||||
duns
|
||||
dunha
|
||||
dunhas
|
||||
e
|
||||
el
|
||||
ela
|
||||
elas
|
||||
eles
|
||||
en
|
||||
era
|
||||
eran
|
||||
esa
|
||||
esas
|
||||
ese
|
||||
eses
|
||||
esta
|
||||
estar
|
||||
estaba
|
||||
está
|
||||
están
|
||||
este
|
||||
estes
|
||||
estiven
|
||||
estou
|
||||
eu
|
||||
é
|
||||
facer
|
||||
foi
|
||||
foron
|
||||
fun
|
||||
había
|
||||
hai
|
||||
iso
|
||||
isto
|
||||
la
|
||||
las
|
||||
lle
|
||||
lles
|
||||
lo
|
||||
los
|
||||
mais
|
||||
me
|
||||
meu
|
||||
meus
|
||||
min
|
||||
miña
|
||||
miñas
|
||||
moi
|
||||
na
|
||||
nas
|
||||
neste
|
||||
nin
|
||||
no
|
||||
non
|
||||
nos
|
||||
nosa
|
||||
nosas
|
||||
noso
|
||||
nosos
|
||||
nós
|
||||
nun
|
||||
nunha
|
||||
nuns
|
||||
nunhas
|
||||
o
|
||||
os
|
||||
ou
|
||||
ó
|
||||
ós
|
||||
para
|
||||
pero
|
||||
pode
|
||||
pois
|
||||
pola
|
||||
polas
|
||||
polo
|
||||
polos
|
||||
por
|
||||
que
|
||||
se
|
||||
senón
|
||||
ser
|
||||
seu
|
||||
seus
|
||||
sexa
|
||||
sido
|
||||
sobre
|
||||
súa
|
||||
súas
|
||||
tamén
|
||||
tan
|
||||
te
|
||||
ten
|
||||
teñen
|
||||
teño
|
||||
ter
|
||||
teu
|
||||
teus
|
||||
ti
|
||||
tido
|
||||
tiña
|
||||
tiven
|
||||
túa
|
||||
túas
|
||||
un
|
||||
unha
|
||||
unhas
|
||||
uns
|
||||
vos
|
||||
vosa
|
||||
vosas
|
||||
voso
|
||||
vosos
|
||||
vós
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(GalicianStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
71
analysis/lang/hi/analyzer_hi.go
Normal file
71
analysis/lang/hi/analyzer_hi.go
Normal file
|
@ -0,0 +1,71 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/lang/in"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "hi"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
indicNormalizeFilter, err := cache.TokenFilterNamed(in.NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
hindiNormalizeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopHiFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerHiFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
indicNormalizeFilter,
|
||||
hindiNormalizeFilter,
|
||||
stopHiFilter,
|
||||
stemmerHiFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue