
Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions

@@ -0,0 +1,68 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ar
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "ar"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
normalizeFilter := unicodenorm.MustNewUnicodeNormalizeFilter(unicodenorm.NFKC)
stopArFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
normalizeArFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stemmerArFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normalizeFilter,
stopArFilter,
normalizeArFilter,
stemmerArFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}
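
A minimal usage sketch for the analyzer above (an illustration, not part of this commit): it resolves the registered "ar" analyzer through the registry cache, exactly as the test file below does. The blank import path is an assumption about where this package sits in the bleve v2 tree.

package main

import (
	"fmt"

	_ "github.com/blevesearch/bleve/v2/analysis/lang/ar" // assumed path; registers "ar" via init()
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("ar")
	if err != nil {
		panic(err)
	}
	// The definite article is stripped and the term stemmed: الكتاب -> كتاب
	for _, token := range analyzer.Analyze([]byte("الكتاب")) {
		fmt.Println(string(token.Term))
	}
}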

@@ -0,0 +1,184 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestArabicAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte("كبير"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كبير"),
Position: 1,
Start: 0,
End: 8,
},
},
},
// feminine marker
{
input: []byte("كبيرة"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كبير"),
Position: 1,
Start: 0,
End: 10,
},
},
},
{
input: []byte("مشروب"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("مشروب"),
Position: 1,
Start: 0,
End: 10,
},
},
},
// plural -at
{
input: []byte("مشروبات"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("مشروب"),
Position: 1,
Start: 0,
End: 14,
},
},
},
// plural -in
{
input: []byte("أمريكيين"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("امريك"),
Position: 1,
Start: 0,
End: 16,
},
},
},
// singular with bare alif
{
input: []byte("امريكي"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("امريك"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{
input: []byte("كتاب"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتاب"),
Position: 1,
Start: 0,
End: 8,
},
},
},
// definite article
{
input: []byte("الكتاب"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتاب"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{
input: []byte("ما ملكت أيمانكم"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ملكت"),
Position: 2,
Start: 5,
End: 13,
},
&analysis.Token{
Term: []byte("ايمانكم"),
Position: 3,
Start: 14,
End: 28,
},
},
},
// stopwords
{
input: []byte("الذين ملكت أيمانكم"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ملكت"),
Position: 2,
Start: 11,
End: 19,
},
&analysis.Token{
Term: []byte("ايمانكم"),
Position: 3,
Start: 20,
End: 34,
},
},
},
// presentation form normalization
{
input: []byte("ﺍﻟﺴﻼﻢ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
Position: 1,
Start: 0,
End: 15,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,88 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ar
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const NormalizeName = "normalize_ar"
const (
Alef = '\u0627'
AlefMadda = '\u0622'
AlefHamzaAbove = '\u0623'
AlefHamzaBelow = '\u0625'
Yeh = '\u064A'
DotlessYeh = '\u0649'
TehMarbuta = '\u0629'
Heh = '\u0647'
Tatweel = '\u0640'
Fathatan = '\u064B'
Dammatan = '\u064C'
Kasratan = '\u064D'
Fatha = '\u064E'
Damma = '\u064F'
Kasra = '\u0650'
Shadda = '\u0651'
Sukun = '\u0652'
)
type ArabicNormalizeFilter struct {
}
func NewArabicNormalizeFilter() *ArabicNormalizeFilter {
return &ArabicNormalizeFilter{}
}
func (s *ArabicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case AlefMadda, AlefHamzaAbove, AlefHamzaBelow:
runes[i] = Alef
case DotlessYeh:
runes[i] = Yeh
case TehMarbuta:
runes[i] = Heh
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
runes = analysis.DeleteRune(runes, i)
i--
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewArabicNormalizeFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
if err != nil {
panic(err)
}
}
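
A small worked sketch of the normalizer above (an illustration, assuming it sits in this package with "fmt" imported): hamza forms fold to bare alef, matching the test file below.

func ExampleArabicNormalizeFilter() {
	filter := NewArabicNormalizeFilter()
	ts := filter.Filter(analysis.TokenStream{
		&analysis.Token{Term: []byte("أحمد")}, // alef with hamza above
	})
	fmt.Println(string(ts[0].Term))
	// Output: احمد
}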

@@ -0,0 +1,234 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestArabicNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// AlifMadda
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("آجن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اجن"),
},
},
},
// AlifHamzaAbove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("أحمد"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("احمد"),
},
},
},
// AlifHamzaBelow
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("إعاذ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اعاذ"),
},
},
},
// AlifMaksura
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بنى"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بني"),
},
},
},
// TehMarbuta
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("فاطمة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("فاطمه"),
},
},
},
// Tatweel
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("روبرـــــت"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("روبرت"),
},
},
},
// Fatha
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("مَبنا"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("مبنا"),
},
},
},
// Kasra
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("علِي"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("علي"),
},
},
},
// Damma
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بُوات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بوات"),
},
},
},
// Fathatan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولداً"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدا"),
},
},
},
// Kasratan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدٍ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولد"),
},
},
},
// Dammatan
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولدٌ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ولد"),
},
},
},
// Sukun
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("نلْسون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("نلسون"),
},
},
},
// Shaddah
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("هتميّ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هتمي"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
arabicNormalizeFilter := NewArabicNormalizeFilter()
for _, test := range tests {
actual := arabicNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,121 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ar
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StemmerName = "stemmer_ar"
// These were obtained from org.apache.lucene.analysis.ar.ArabicStemmer
var prefixes = [][]rune{
[]rune("ال"),
[]rune("وال"),
[]rune("بال"),
[]rune("كال"),
[]rune("فال"),
[]rune("لل"),
[]rune("و"),
}
var suffixes = [][]rune{
[]rune("ها"),
[]rune("ان"),
[]rune("ات"),
[]rune("ون"),
[]rune("ين"),
[]rune("يه"),
[]rune("ية"),
[]rune("ه"),
[]rune("ة"),
[]rune("ي"),
}
type ArabicStemmerFilter struct{}
func NewArabicStemmerFilter() *ArabicStemmerFilter {
return &ArabicStemmerFilter{}
}
func (s *ArabicStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := stem(token.Term)
token.Term = term
}
return input
}
func canStemPrefix(input, prefix []rune) bool {
// Wa- prefix requires at least 3 characters.
if len(prefix) == 1 && len(input) < 4 {
return false
}
// Other prefixes require only 2.
if len(input)-len(prefix) < 2 {
return false
}
for i := range prefix {
if prefix[i] != input[i] {
return false
}
}
return true
}
func canStemSuffix(input, suffix []rune) bool {
// All suffixes require at least 2 characters after stemming.
if len(input)-len(suffix) < 2 {
return false
}
stemEnd := len(input) - len(suffix)
for i := range suffix {
if suffix[i] != input[stemEnd+i] {
return false
}
}
return true
}
func stem(input []byte) []byte {
runes := bytes.Runes(input)
// Strip a single prefix.
for _, p := range prefixes {
if canStemPrefix(runes, p) {
runes = runes[len(p):]
break
}
}
// Strip off multiple suffixes, in their order in the suffixes array.
for _, s := range suffixes {
if canStemSuffix(runes, s) {
runes = runes[:len(runes)-len(s)]
}
}
return analysis.BuildTermFromRunes(runes)
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewArabicStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
if err != nil {
panic(err)
}
}
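
A worked sketch of the rules above (an illustration, assuming this package plus "fmt"): at most one prefix is stripped, then every matching suffix in array order, while the length guards leave short terms such as "الو" untouched.

func ExampleArabicStemmerFilter() {
	filter := NewArabicStemmerFilter()
	ts := filter.Filter(analysis.TokenStream{
		&analysis.Token{Term: []byte("والحسن")}, // wal- prefix + stem
		&analysis.Token{Term: []byte("الو")},    // too short to stem safely
	})
	fmt.Println(string(ts[0].Term), string(ts[1].Term))
	// Output: حسن الو
}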

@@ -0,0 +1,397 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ar
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestArabicStemmerFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// AlPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// WalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("والحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// BalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("بالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// KalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("كالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// FalPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("فالحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// LlPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("للاخر"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("اخر"),
},
},
},
// WaPrefix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("وحسن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("حسن"),
},
},
},
// AhSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("زوجها"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("زوج"),
},
},
},
// AnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدان"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// AtSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// WnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YnSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدين"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YhSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهديه"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YpSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدية"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// HSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهده"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// PSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// YSuffix
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدي"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ComboPrefSuf
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("وساهدون"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// ComboSuf
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهدهات"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ساهد"),
},
},
},
// Shouldn't Stem
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الو"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("الو"),
},
},
},
// NonArabic
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("English"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("English"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("السلام"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلامة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("السلامة"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سلام"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("الوصل"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("وصل"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("والصل"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("صل"),
},
},
},
// Empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
arabicStemmerFilter := NewArabicStemmerFilter()
for _, test := range tests {
actual := arabicStemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ar
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,152 @@
package ar
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_ar"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
// ` was changed to ' to allow for literal string
var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
# This means that when modifying this list, you might need to add some
# redundant entries, for example containing forms with both أ and ا
من
ومن
منها
منه
في
وفي
فيها
فيه
و
ف
ثم
او
أو
ب
بها
به
ا
أ
اى
اي
أي
أى
لا
ولا
الا
ألا
إلا
لكن
ما
وما
كما
فما
عن
مع
اذا
إذا
ان
أن
إن
انها
أنها
إنها
انه
أنه
إنه
بان
بأن
فان
فأن
وان
وأن
وإن
التى
التي
الذى
الذي
الذين
الى
الي
إلى
إلي
على
عليها
عليه
اما
أما
إما
ايضا
أيضا
كل
وكل
لم
ولم
لن
ولن
هى
هي
هو
وهى
وهي
وهو
فهى
فهي
فهو
انت
أنت
لك
لها
له
هذه
هذا
تلك
ذلك
هناك
كانت
كان
يكون
تكون
وكانت
وكان
غير
بعض
قد
نحو
بين
بينما
منذ
ضمن
حيث
الان
الآن
خلال
بعد
قبل
حتى
عند
عندما
لدى
جميع
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(ArabicStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bg
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,220 @@
package bg
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_bg"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
а
аз
ако
ала
бе
без
беше
би
бил
била
били
било
близо
бъдат
бъде
бяха
в
вас
ваш
ваша
вероятно
вече
взема
ви
вие
винаги
все
всеки
всички
всичко
всяка
във
въпреки
върху
г
ги
главно
го
д
да
дали
до
докато
докога
дори
досега
доста
е
едва
един
ето
за
зад
заедно
заради
засега
затова
защо
защото
и
из
или
им
има
имат
иска
й
каза
как
каква
какво
както
какъв
като
кога
когато
което
които
кой
който
колко
която
къде
където
към
ли
м
ме
между
мен
ми
мнозина
мога
могат
може
моля
момента
му
н
на
над
назад
най
направи
напред
например
нас
не
него
нея
ни
ние
никой
нито
но
някои
някой
няма
обаче
около
освен
особено
от
отгоре
отново
още
пак
по
повече
повечето
под
поне
поради
после
почти
прави
пред
преди
през
при
пък
първо
с
са
само
се
сега
си
скоро
след
сме
според
сред
срещу
сте
съм
със
също
т
тази
така
такива
такъв
там
твой
те
тези
ти
тн
то
това
тогава
този
той
толкова
точно
трябва
тук
тъй
тя
тях
у
харесва
ч
че
често
чрез
ще
щом
я
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(BulgarianStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,33 @@
package ca
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const ArticlesName = "articles_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var CatalanArticles = []byte(`
d
l
m
n
s
t
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanArticles)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ca
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/elision"
"github.com/blevesearch/bleve/v2/registry"
)
const ElisionName = "elision_ca"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision.NewElisionFilter(articlesTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,61 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ca
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestCatalanElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'Institut"),
},
&analysis.Token{
Term: []byte("d'Estudis"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Institut"),
},
&analysis.Token{
Term: []byte("Estudis"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ca
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,247 @@
package ca
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_ca"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var CatalanStopWords = []byte(`# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
a
abans
ací
ah
així
això
al
als
aleshores
algun
alguna
algunes
alguns
alhora
allà
allí
allò
altra
altre
altres
amb
ambdós
ambdues
apa
aquell
aquella
aquelles
aquells
aquest
aquesta
aquestes
aquests
aquí
baix
cada
cadascú
cadascuna
cadascunes
cadascuns
com
contra
d'un
d'una
d'unes
d'uns
dalt
de
del
dels
des
després
dins
dintre
donat
doncs
durant
e
eh
el
els
em
en
encara
ens
entre
érem
eren
éreu
es
és
esta
està
estàvem
estaven
estàveu
esteu
et
etc
ets
fins
fora
gairebé
ha
han
has
havia
he
hem
heu
hi
ho
i
igual
iguals
ja
l'hi
la
les
li
li'n
llavors
m'he
ma
mal
malgrat
mateix
mateixa
mateixes
mateixos
me
mentre
més
meu
meus
meva
meves
molt
molta
moltes
molts
mon
mons
n'he
n'hi
ne
ni
no
nogensmenys
només
nosaltres
nostra
nostre
nostres
o
oh
oi
on
pas
pel
pels
per
però
perquè
poc
poca
pocs
poques
potser
propi
qual
quals
quan
quant
que
què
quelcom
qui
quin
quina
quines
quins
s'ha
s'han
sa
semblant
semblants
ses
seu
seus
seva
seves
si
sobre
sobretot
sóc
solament
sols
son
són
sons
sota
sou
t'ha
t'han
t'he
ta
tal
també
tampoc
tan
tant
tanta
tantes
teu
teus
teva
teves
ton
tons
tot
tota
totes
tots
un
una
unes
uns
us
va
vaig
vam
van
vas
veu
vosaltres
vostra
vostre
vostres
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CatalanStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,60 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cjk
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "cjk"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
widthFilter, err := cache.TokenFilterNamed(WidthName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
bigramFilter, err := cache.TokenFilterNamed(BigramName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
widthFilter,
toLowerFilter,
bigramFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,642 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestCJKAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte("こんにちは世界"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("は世"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
},
},
{
input: []byte("一二三四五六七八九十"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一二"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("二三"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("三四"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("四五"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("五六"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("六七"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
&analysis.Token{
Term: []byte("七八"),
Type: analysis.Double,
Position: 7,
Start: 18,
End: 24,
},
&analysis.Token{
Term: []byte("八九"),
Type: analysis.Double,
Position: 8,
Start: 21,
End: 27,
},
&analysis.Token{
Term: []byte("九十"),
Type: analysis.Double,
Position: 9,
Start: 24,
End: 30,
},
},
},
{
input: []byte("一 二三四 五六七八九 十"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("二三"),
Type: analysis.Double,
Position: 2,
Start: 4,
End: 10,
},
&analysis.Token{
Term: []byte("三四"),
Type: analysis.Double,
Position: 3,
Start: 7,
End: 13,
},
&analysis.Token{
Term: []byte("五六"),
Type: analysis.Double,
Position: 4,
Start: 14,
End: 20,
},
&analysis.Token{
Term: []byte("六七"),
Type: analysis.Double,
Position: 5,
Start: 17,
End: 23,
},
&analysis.Token{
Term: []byte("七八"),
Type: analysis.Double,
Position: 6,
Start: 20,
End: 26,
},
&analysis.Token{
Term: []byte("八九"),
Type: analysis.Double,
Position: 7,
Start: 23,
End: 29,
},
&analysis.Token{
Term: []byte("十"),
Type: analysis.Single,
Position: 8,
Start: 30,
End: 33,
},
},
},
{
input: []byte("abc defgh ijklmn opqrstu vwxy z"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abc"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("defgh"),
Type: analysis.AlphaNumeric,
Position: 2,
Start: 4,
End: 9,
},
&analysis.Token{
Term: []byte("ijklmn"),
Type: analysis.AlphaNumeric,
Position: 3,
Start: 10,
End: 16,
},
&analysis.Token{
Term: []byte("opqrstu"),
Type: analysis.AlphaNumeric,
Position: 4,
Start: 17,
End: 24,
},
&analysis.Token{
Term: []byte("vwxy"),
Type: analysis.AlphaNumeric,
Position: 5,
Start: 25,
End: 29,
},
&analysis.Token{
Term: []byte("z"),
Type: analysis.AlphaNumeric,
Position: 6,
Start: 30,
End: 31,
},
},
},
{
input: []byte("あい"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("あい "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("test"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 4,
},
},
},
{
input: []byte("test "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 4,
},
},
},
{
input: []byte("あいtest"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 2,
Start: 6,
End: 10,
},
},
},
{
input: []byte("testあい "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.AlphaNumeric,
Position: 1,
Start: 0,
End: 4,
},
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 2,
Start: 4,
End: 10,
},
},
},
{
input: []byte("あいうえおabcかきくけこ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("いう"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("うえ"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("えお"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("abc"),
Type: analysis.AlphaNumeric,
Position: 5,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("かき"),
Type: analysis.Double,
Position: 6,
Start: 18,
End: 24,
},
&analysis.Token{
Term: []byte("きく"),
Type: analysis.Double,
Position: 7,
Start: 21,
End: 27,
},
&analysis.Token{
Term: []byte("くけ"),
Type: analysis.Double,
Position: 8,
Start: 24,
End: 30,
},
&analysis.Token{
Term: []byte("けこ"),
Type: analysis.Double,
Position: 9,
Start: 27,
End: 33,
},
},
},
{
input: []byte("あいうえおabんcかきくけ こ"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("あい"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("いう"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("うえ"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("えお"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("ab"),
Type: analysis.AlphaNumeric,
Position: 5,
Start: 15,
End: 17,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Single,
Position: 6,
Start: 17,
End: 20,
},
&analysis.Token{
Term: []byte("c"),
Type: analysis.AlphaNumeric,
Position: 7,
Start: 20,
End: 21,
},
&analysis.Token{
Term: []byte("かき"),
Type: analysis.Double,
Position: 8,
Start: 21,
End: 27,
},
&analysis.Token{
Term: []byte("きく"),
Type: analysis.Double,
Position: 9,
Start: 24,
End: 30,
},
&analysis.Token{
Term: []byte("くけ"),
Type: analysis.Double,
Position: 10,
Start: 27,
End: 33,
},
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Single,
Position: 11,
Start: 34,
End: 37,
},
},
},
{
input: []byte("一 روبرت موير"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("روبرت"),
Type: analysis.AlphaNumeric,
Position: 2,
Start: 4,
End: 14,
},
&analysis.Token{
Term: []byte("موير"),
Type: analysis.AlphaNumeric,
Position: 3,
Start: 15,
End: 23,
},
},
},
{
input: []byte("一 رُوبرت موير"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("رُوبرت"),
Type: analysis.AlphaNumeric,
Position: 2,
Start: 4,
End: 16,
},
&analysis.Token{
Term: []byte("موير"),
Type: analysis.AlphaNumeric,
Position: 3,
Start: 17,
End: 25,
},
},
},
{
input: []byte("𩬅艱鍟䇹愯瀛"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("𩬅艱"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 7,
},
&analysis.Token{
Term: []byte("艱鍟"),
Type: analysis.Double,
Position: 2,
Start: 4,
End: 10,
},
&analysis.Token{
Term: []byte("鍟䇹"),
Type: analysis.Double,
Position: 3,
Start: 7,
End: 13,
},
&analysis.Token{
Term: []byte("䇹愯"),
Type: analysis.Double,
Position: 4,
Start: 10,
End: 16,
},
&analysis.Token{
Term: []byte("愯瀛"),
Type: analysis.Double,
Position: 5,
Start: 13,
End: 19,
},
},
},
{
input: []byte("一"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
},
},
{
input: []byte("一丁丂"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("一丁"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("丁丂"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
},
},
}
cache := registry.NewCache()
for _, test := range tests {
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}
func BenchmarkCJKAnalyzer(b *testing.B) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
b.Fatal(err)
}
for i := 0; i < b.N; i++ {
analyzer.Analyze(bleveWikiArticleJapanese)
}
}
var bleveWikiArticleJapanese = []byte(`加圧容器に貯蔵されている液体物質はその時の気液平衡状態にあるが火災により容器が加熱されていると容器内の液体はその物質の大気圧のもとでの沸点より十分に高い温度まで加熱され圧力も高くなるこの状態で容器が破裂すると容器内部の圧力は瞬間的に大気圧にまで低下する
この時に容器内の平衡状態が破られ液体は突沸し気体になることで爆発現象を起こす液化石油ガスなどではさらに拡散して空気と混ざったガスが自由空間蒸気雲爆発を起こす液化石油ガスなどの常温常圧で気体になる物を高い圧力で液化して収納している容器あるいはそのような液体を輸送するためのパイプラインや配管などが火災などによって破壊されたときに起きる
ブリーブという現象が明らかになったのはフランスリヨンの郊外にあるフェザンという町のフェザン製油所ウニオンゼネラルペトロールで大規模な爆発火災事故が発生したときだと言われている
中身の液体が高温高圧の水である場合には水蒸気爆発と呼ばれる`)

@@ -0,0 +1,210 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cjk
import (
"bytes"
"container/ring"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const BigramName = "cjk_bigram"
type CJKBigramFilter struct {
outputUnigram bool
}
func NewCJKBigramFilter(outputUnigram bool) *CJKBigramFilter {
return &CJKBigramFilter{
outputUnigram: outputUnigram,
}
}
func (s *CJKBigramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
r := ring.New(2)
itemsInRing := 0
pos := 1
outputPos := 1
rv := make(analysis.TokenStream, 0, len(input))
for _, tokout := range input {
if tokout.Type == analysis.Ideographic {
runes := bytes.Runes(tokout.Term)
sofar := 0
for _, run := range runes {
rlen := utf8.RuneLen(run)
token := &analysis.Token{
Term: tokout.Term[sofar : sofar+rlen],
Start: tokout.Start + sofar,
End: tokout.Start + sofar + rlen,
Position: pos,
Type: tokout.Type,
KeyWord: tokout.KeyWord,
}
pos++
sofar += rlen
if itemsInRing > 0 {
// if items already buffered
// check to see if this is aligned
curr := r.Value.(*analysis.Token)
if token.Start-curr.End != 0 {
// not aligned flush
flushToken := s.flush(r, &itemsInRing, outputPos)
if flushToken != nil {
outputPos++
rv = append(rv, flushToken)
}
}
}
// now we can add this token to the buffer
r = r.Next()
r.Value = token
if itemsInRing < 2 {
itemsInRing++
}
builtUnigram := false
if itemsInRing > 1 && s.outputUnigram {
unigram := s.buildUnigram(r, &itemsInRing, outputPos)
if unigram != nil {
builtUnigram = true
rv = append(rv, unigram)
}
}
bigramToken := s.outputBigram(r, &itemsInRing, outputPos)
if bigramToken != nil {
rv = append(rv, bigramToken)
outputPos++
}
// prev token should be removed if unigram was built
if builtUnigram {
itemsInRing--
}
}
} else {
// flush anything already buffered
flushToken := s.flush(r, &itemsInRing, outputPos)
if flushToken != nil {
rv = append(rv, flushToken)
outputPos++
}
// output this token as is
tokout.Position = outputPos
rv = append(rv, tokout)
outputPos++
}
}
// deal with possible trailing unigram
if itemsInRing == 1 || s.outputUnigram {
if itemsInRing == 2 {
r = r.Next()
}
unigram := s.buildUnigram(r, &itemsInRing, outputPos)
if unigram != nil {
rv = append(rv, unigram)
}
}
return rv
}
func (s *CJKBigramFilter) flush(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
var rv *analysis.Token
if *itemsInRing == 1 {
rv = s.buildUnigram(r, itemsInRing, pos)
}
r.Value = nil
*itemsInRing = 0
return rv
}
func (s *CJKBigramFilter) outputBigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
if *itemsInRing == 2 {
thisShingleRing := r.Move(-1)
shingledBytes := make([]byte, 0)
// do first token
prev := thisShingleRing.Value.(*analysis.Token)
shingledBytes = append(shingledBytes, prev.Term...)
// do second token
thisShingleRing = thisShingleRing.Next()
curr := thisShingleRing.Value.(*analysis.Token)
shingledBytes = append(shingledBytes, curr.Term...)
token := analysis.Token{
Type: analysis.Double,
Term: shingledBytes,
Position: pos,
Start: prev.Start,
End: curr.End,
}
return &token
}
return nil
}
func (s *CJKBigramFilter) buildUnigram(r *ring.Ring, itemsInRing *int, pos int) *analysis.Token {
switch *itemsInRing {
case 2:
thisShingleRing := r.Move(-1)
// do first token
prev := thisShingleRing.Value.(*analysis.Token)
token := analysis.Token{
Type: analysis.Single,
Term: prev.Term,
Position: pos,
Start: prev.Start,
End: prev.End,
}
return &token
case 1:
// do first token
prev := r.Value.(*analysis.Token)
token := analysis.Token{
Type: analysis.Single,
Term: prev.Term,
Position: pos,
Start: prev.Start,
End: prev.End,
}
return &token
}
return nil
}
func CJKBigramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
outputUnigram := false
outVal, ok := config["output_unigram"].(bool)
if ok {
outputUnigram = outVal
}
return NewCJKBigramFilter(outputUnigram), nil
}
func init() {
err := registry.RegisterTokenFilter(BigramName, CJKBigramFilterConstructor)
if err != nil {
panic(err)
}
}
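
A configuration sketch (an illustration, not part of this commit; the helper name is hypothetical): the constructor above honors an optional "output_unigram" flag, and it never touches the registry cache, so nil is acceptable here.

func newUnigramEmittingBigramFilter() analysis.TokenFilter {
	// With output_unigram set, Single tokens are emitted alongside each Double bigram.
	filter, err := CJKBigramFilterConstructor(
		map[string]interface{}{"output_unigram": true}, nil)
	if err != nil {
		panic(err)
	}
	return filter
}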

@@ -0,0 +1,848 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cjk
import (
"container/ring"
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
// Helper function to create a token
func makeToken(term string, start, end, pos int) *analysis.Token {
return &analysis.Token{
Term: []byte(term),
Start: start,
End: end,
Position: pos, // Note: buildUnigram uses the 'pos' argument, not the token's original pos
Type: analysis.Ideographic,
}
}
func TestCJKBigramFilter_buildUnigram(t *testing.T) {
filter := NewCJKBigramFilter(false)
tests := []struct {
name string
ringSetup func() (*ring.Ring, int) // Function to set up the ring and itemsInRing
inputPos int // Position to pass to buildUnigram
expectToken *analysis.Token
}{
{
name: "itemsInRing == 2",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("一", 0, 3, 1) // Original pos 1
token2 := makeToken("二", 3, 6, 2) // Original pos 2
r.Value = token1
r = r.Next()
r.Value = token2
// r currently points to token2, r.Move(-1) points to token1
return r, 2
},
inputPos: 10, // Expected output position
expectToken: &analysis.Token{
Type: analysis.Single,
Term: []byte("一"),
Position: 10, // Should use inputPos
Start: 0,
End: 3,
},
},
{
name: "itemsInRing == 1 (ring points to the single item)",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("三", 6, 9, 3)
r.Value = token1
// r points to token1
return r, 1
},
inputPos: 11,
expectToken: &analysis.Token{
Type: analysis.Single,
Term: []byte("三"),
Position: 11, // Should use inputPos
Start: 6,
End: 9,
},
},
{
name: "itemsInRing == 1 (ring points to nil, next is the single item)",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("四", 9, 12, 4)
r = r.Next() // r points to nil initially
r.Value = token1
// r points to token1
return r, 1
},
inputPos: 12,
expectToken: &analysis.Token{
Type: analysis.Single,
Term: []byte("四"),
Position: 12, // Should use inputPos
Start: 9,
End: 12,
},
},
{
name: "itemsInRing == 0",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
// Ring is empty
return r, 0
},
inputPos: 13,
expectToken: nil, // Expect nil when itemsInRing is not 1 or 2
},
{
name: "itemsInRing > 2 (should behave like 0)",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("五", 12, 15, 5)
token2 := makeToken("六", 15, 18, 6)
r.Value = token1
r = r.Next()
r.Value = token2
// Simulate incorrect itemsInRing count
return r, 3
},
inputPos: 14,
expectToken: nil, // Expect nil when itemsInRing is not 1 or 2
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ringPtr, itemsInRing := tt.ringSetup()
itemsInRingCopy := itemsInRing // Pass a pointer to a copy
gotToken := filter.buildUnigram(ringPtr, &itemsInRingCopy, tt.inputPos)
if !reflect.DeepEqual(gotToken, tt.expectToken) {
t.Errorf("buildUnigram() got = %v, want %v", gotToken, tt.expectToken)
}
// Check if itemsInRing was modified (it shouldn't be by buildUnigram)
if itemsInRingCopy != itemsInRing {
t.Errorf("buildUnigram() modified itemsInRing, got = %d, want %d", itemsInRingCopy, itemsInRing)
}
})
}
}
func TestCJKBigramFilter_outputBigram(t *testing.T) {
// Create a filter instance (outputUnigram value doesn't matter for outputBigram)
filter := NewCJKBigramFilter(false)
tests := []struct {
name string
ringSetup func() (*ring.Ring, int) // Function to set up the ring and itemsInRing
inputPos int // Position to pass to outputBigram
expectToken *analysis.Token
}{
{
name: "itemsInRing == 2",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("一", 0, 3, 1) // Original pos 1
token2 := makeToken("二", 3, 6, 2) // Original pos 2
r.Value = token1
r = r.Next()
r.Value = token2
// r currently points to token2, r.Move(-1) points to token1
return r, 2
},
inputPos: 10, // Expected output position
expectToken: &analysis.Token{
Type: analysis.Double,
Term: []byte("一二"), // Combined term
Position: 10, // Should use inputPos
Start: 0, // Start of first token
End: 6, // End of second token
},
},
{
name: "itemsInRing == 2 with different terms",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("你好", 0, 6, 1)
token2 := makeToken("世界", 6, 12, 2)
r.Value = token1
r = r.Next()
r.Value = token2
return r, 2
},
inputPos: 5,
expectToken: &analysis.Token{
Type: analysis.Double,
Term: []byte("你好世界"),
Position: 5,
Start: 0,
End: 12,
},
},
{
name: "itemsInRing == 1",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("三", 6, 9, 3)
r.Value = token1
return r, 1
},
inputPos: 11,
expectToken: nil, // Expect nil when itemsInRing is not 2
},
{
name: "itemsInRing == 0",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
// Ring is empty
return r, 0
},
inputPos: 13,
expectToken: nil, // Expect nil when itemsInRing is not 2
},
{
name: "itemsInRing > 2 (should behave like 0)",
ringSetup: func() (*ring.Ring, int) {
r := ring.New(2)
token1 := makeToken("五", 12, 15, 5)
token2 := makeToken("六", 15, 18, 6)
r.Value = token1
r = r.Next()
r.Value = token2
// Simulate incorrect itemsInRing count
return r, 3
},
inputPos: 14,
expectToken: nil, // Expect nil when itemsInRing is not 2
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ringPtr, itemsInRing := tt.ringSetup()
itemsInRingCopy := itemsInRing // Pass a pointer to a copy
gotToken := filter.outputBigram(ringPtr, &itemsInRingCopy, tt.inputPos)
if !reflect.DeepEqual(gotToken, tt.expectToken) {
t.Errorf("outputBigram() got = %v, want %v", gotToken, tt.expectToken)
}
// Check if itemsInRing was modified (it shouldn't be by outputBigram)
if itemsInRingCopy != itemsInRing {
t.Errorf("outputBigram() modified itemsInRing, got = %d, want %d", itemsInRingCopy, itemsInRing)
}
})
}
}
func TestCJKBigramFilter(t *testing.T) {
tests := []struct {
outputUnigram bool
input analysis.TokenStream
output analysis.TokenStream
}{
// first test that non-adjacent terms are not combined
{
outputUnigram: false,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 5,
End: 8,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Single,
Position: 2,
Start: 5,
End: 8,
},
},
},
{
outputUnigram: false,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Ideographic,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Ideographic,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Ideographic,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Ideographic,
Position: 6,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Ideographic,
Position: 7,
Start: 18,
End: 21,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("は世"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
},
},
{
outputUnigram: true,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Ideographic,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Ideographic,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Ideographic,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Ideographic,
Position: 6,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Ideographic,
Position: 7,
Start: 18,
End: 21,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Single,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Single,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Single,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Single,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("は世"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Single,
Position: 6,
Start: 15,
End: 18,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 15,
End: 21,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Single,
Position: 7,
Start: 18,
End: 21,
},
},
},
{
// Assumes that `、` has been removed by the unicode tokenizer from `こんにちは、世界`
outputUnigram: true,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Ideographic,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Ideographic,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Ideographic,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Ideographic,
Position: 7,
Start: 18,
End: 21,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Ideographic,
Position: 8,
Start: 21,
End: 24,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Single,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Single,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Single,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Single,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Single,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Single,
Position: 6,
Start: 18,
End: 21,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 18,
End: 24,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Single,
Position: 7,
Start: 21,
End: 24,
},
},
},
{
outputUnigram: false,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こ"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 3,
},
&analysis.Token{
Term: []byte("ん"),
Type: analysis.Ideographic,
Position: 2,
Start: 3,
End: 6,
},
&analysis.Token{
Term: []byte("に"),
Type: analysis.Ideographic,
Position: 3,
Start: 6,
End: 9,
},
&analysis.Token{
Term: []byte("ち"),
Type: analysis.Ideographic,
Position: 4,
Start: 9,
End: 12,
},
&analysis.Token{
Term: []byte("は"),
Type: analysis.Ideographic,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("cat"),
Type: analysis.AlphaNumeric,
Position: 6,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世"),
Type: analysis.Ideographic,
Position: 7,
Start: 18,
End: 21,
},
&analysis.Token{
Term: []byte("界"),
Type: analysis.Ideographic,
Position: 8,
Start: 21,
End: 24,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こん"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("んに"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("にち"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ちは"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("cat"),
Type: analysis.AlphaNumeric,
Position: 5,
Start: 12,
End: 15,
},
&analysis.Token{
Term: []byte("世界"),
Type: analysis.Double,
Position: 6,
Start: 18,
End: 24,
},
},
},
{
outputUnigram: false,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("パイプライン"),
Type: analysis.Ideographic,
Position: 1,
Start: 0,
End: 18,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("パイ"),
Type: analysis.Double,
Position: 1,
Start: 0,
End: 6,
},
&analysis.Token{
Term: []byte("イプ"),
Type: analysis.Double,
Position: 2,
Start: 3,
End: 9,
},
&analysis.Token{
Term: []byte("プラ"),
Type: analysis.Double,
Position: 3,
Start: 6,
End: 12,
},
&analysis.Token{
Term: []byte("ライ"),
Type: analysis.Double,
Position: 4,
Start: 9,
End: 15,
},
&analysis.Token{
Term: []byte("イン"),
Type: analysis.Double,
Position: 5,
Start: 12,
End: 18,
},
},
},
}
for _, test := range tests {
cjkBigramFilter := NewCJKBigramFilter(test.outputUnigram)
actual := cjkBigramFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}
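// A minimal usage sketch (illustrative only; the chain below is an
// assumption modeled on the analyzer files in this changeset, not the
// registered CJK analyzer):
//
//	cache := registry.NewCache()
//	tokenizer, _ := cache.TokenizerNamed(unicode.Name)
//	width, _ := cache.TokenFilterNamed(WidthName)
//	analyzer := analysis.DefaultAnalyzer{
//		Tokenizer:    tokenizer,
//		TokenFilters: []analysis.TokenFilter{width, NewCJKBigramFilter(false)},
//	}
//	tokens := analyzer.Analyze([]byte("こんにちは世界"))
//	// tokens: こん, んに, にち, ちは, は世, 世界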

View file

@ -0,0 +1,104 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cjk
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const WidthName = "cjk_width"
type CJKWidthFilter struct{}
func NewCJKWidthFilter() *CJKWidthFilter {
return &CJKWidthFilter{}
}
func (s *CJKWidthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
for i := 0; i < runeCount; i++ {
ch := runes[i]
if ch >= 0xFF01 && ch <= 0xFF5E {
// fullwidth ASCII variants
runes[i] -= 0xFEE0
} else if ch >= 0xFF65 && ch <= 0xFF9F {
// halfwidth Katakana variants
if (ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(runes, i, ch) {
runes = analysis.DeleteRune(runes, i)
i--
runeCount = len(runes)
} else {
runes[i] = kanaNorm[ch-0xFF65]
}
}
}
token.Term = analysis.BuildTermFromRunes(runes)
}
return input
}
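// kanaNorm maps each halfwidth code point in the range U+FF65..U+FF9F
// (indexed by ch-0xFF65) to its fullwidth equivalent, e.g. the halfwidth
// middle dot U+FF65 to U+30FB and halfwidth ｶ (U+FF76) to カ (U+30AB). The
// final two entries map the halfwidth voiced and semi-voiced sound marks to
// the combining marks U+3099 and U+309A, used when they cannot merge into
// the preceding kana.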
var kanaNorm = []rune{
0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A,
}
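// kanaCombineVoiced and kanaCombineHalfVoiced are indexed by prev-0x30A6,
// where prev is the fullwidth kana preceding a halfwidth voiced (ﾞ, U+FF9E)
// or semi-voiced (ﾟ, U+FF9F) sound mark. Each entry is the code point offset
// that turns the base kana into its combined form, e.g. カ (U+30AB) + ﾞ
// becomes ガ (U+30AC) via offset 1, ハ (U+30CF) + ﾟ becomes パ (U+30D1) via
// offset 2, and ウ (U+30A6) + ﾞ becomes ヴ (U+30F4) via offset 78; a zero
// offset means the pair does not combine and the mark is normalized on its
// own.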
var kanaCombineVoiced = []rune{
78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
}
var kanaCombineHalfVoiced = []rune{
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}
func combine(text []rune, pos int, r rune) bool {
prev := text[pos-1]
if prev >= 0x30A6 && prev <= 0x30FD {
if r == 0xFF9F {
text[pos-1] += kanaCombineHalfVoiced[prev-0x30A6]
} else {
text[pos-1] += kanaCombineVoiced[prev-0x30A6]
}
return text[pos-1] != prev
}
return false
}
func CJKWidthFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewCJKWidthFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(WidthName, CJKWidthFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,93 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cjk
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestCJKWidthFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Ｔｅｓｔ"),
},
&analysis.Token{
Term: []byte("１２３４"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Test"),
},
&analysis.Token{
Term: []byte("1234"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("カタカナ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("カタカナ"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ヴィッツ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ヴィッツ"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("パナソニック"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("パナソニック"),
},
},
},
}
for _, test := range tests {
cjkWidthFilter := NewCJKWidthFilter()
actual := cjkWidthFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}

View file

@ -0,0 +1,64 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ckb
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/registry"
)
const AnalyzerName = "ckb"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
normCkbFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopCkbFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerCkbFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
normCkbFilter,
toLowerFilter,
stopCkbFilter,
stemmerCkbFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,77 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ckb
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSoraniAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stop word removal
{
input: []byte("ئەم پیاوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 2,
Start: 7,
End: 17,
},
},
},
{
input: []byte("پیاوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 10,
},
},
},
{
input: []byte("پیاو"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 8,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View file

@ -0,0 +1,121 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ckb
import (
"bytes"
"unicode"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const NormalizeName = "normalize_ckb"
const (
Yeh = '\u064A'
DotlessYeh = '\u0649'
FarsiYeh = '\u06CC'
Kaf = '\u0643'
Keheh = '\u06A9'
Heh = '\u0647'
Ae = '\u06D5'
Zwnj = '\u200C'
HehDoachashmee = '\u06BE'
TehMarbuta = '\u0629'
Reh = '\u0631'
Rreh = '\u0695'
RrehAbove = '\u0692'
Tatweel = '\u0640'
Fathatan = '\u064B'
Dammatan = '\u064C'
Kasratan = '\u064D'
Fatha = '\u064E'
Damma = '\u064F'
Kasra = '\u0650'
Shadda = '\u0651'
Sukun = '\u0652'
)
type SoraniNormalizeFilter struct {
}
func NewSoraniNormalizeFilter() *SoraniNormalizeFilter {
return &SoraniNormalizeFilter{}
}
func (s *SoraniNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
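// normalize applies the Sorani-specific folding used by the "ckb" analyzer
// above: Yeh and Dotless Yeh fold to Farsi Yeh, Kaf to Keheh, Heh
// Doachashmee to Heh, Teh Marbuta to Ae, a word-final or Zwnj-adjacent Heh
// to Ae, a word-initial Reh to Rreh, and Rreh-above to Rreh; the tatweel,
// the harakat (short-vowel and gemination marks), and any other Unicode
// format (Cf) characters are removed.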
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case Yeh, DotlessYeh:
runes[i] = FarsiYeh
case Kaf:
runes[i] = Keheh
case Zwnj:
if i > 0 && runes[i-1] == Heh {
runes[i-1] = Ae
}
runes = analysis.DeleteRune(runes, i)
i--
case Heh:
if i == len(runes)-1 {
runes[i] = Ae
}
case TehMarbuta:
runes[i] = Ae
case HehDoachashmee:
runes[i] = Heh
case Reh:
if i == 0 {
runes[i] = Rreh
}
case RrehAbove:
runes[i] = Rreh
case Tatweel, Kasratan, Dammatan, Fathatan, Fatha, Damma, Kasra, Shadda, Sukun:
runes = analysis.DeleteRune(runes, i)
i--
default:
if unicode.In(runes[i], unicode.Cf) {
runes = analysis.DeleteRune(runes, i)
i--
}
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniNormalizeFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,323 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ckb
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestSoraniNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// test Y
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064A"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0649"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06CC"),
},
},
},
// test K
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0643"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06A9"),
},
},
},
// test H
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u200C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u200C\u06A9"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5\u06A9"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06BE"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0629"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u06D5"),
},
},
},
// test final H
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u0647\u0647"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0647\u0647\u06D5"),
},
},
},
// test RR
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0692"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0695"),
},
},
},
// test initial RR
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0631\u0631\u0631"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0695\u0631\u0631"),
},
},
},
// test remove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0640"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064B"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064D"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064E"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u064F"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0650"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0651"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0652"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u200C"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
soraniNormalizeFilter := NewSoraniNormalizeFilter()
for _, test := range tests {
actual := soraniNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,151 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ckb
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StemmerName = "stemmer_ckb"
type SoraniStemmerFilter struct {
}
func NewSoraniStemmerFilter() *SoraniStemmerFilter {
return &SoraniStemmerFilter{}
}
func (s *SoraniStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// if not protected keyword, stem it
if !token.KeyWord {
stemmed := stem(token.Term)
token.Term = stemmed
}
}
return input
}
func stem(input []byte) []byte {
inputLen := utf8.RuneCount(input)
// postposition
if inputLen > 5 && bytes.HasSuffix(input, []byte("دا")) {
input = truncateRunes(input, 2)
inputLen = utf8.RuneCount(input)
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("نا")) {
input = truncateRunes(input, 1)
inputLen = utf8.RuneCount(input)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەوە")) {
input = truncateRunes(input, 3)
inputLen = utf8.RuneCount(input)
}
// possessive pronoun
if inputLen > 6 &&
(bytes.HasSuffix(input, []byte("مان")) ||
bytes.HasSuffix(input, []byte("یان")) ||
bytes.HasSuffix(input, []byte("تان"))) {
input = truncateRunes(input, 3)
inputLen = utf8.RuneCount(input)
}
// indefinite singular ezafe
if inputLen > 6 && bytes.HasSuffix(input, []byte("ێکی")) {
return truncateRunes(input, 3)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یەکی")) {
return truncateRunes(input, 4)
}
if inputLen > 5 && bytes.HasSuffix(input, []byte("ێک")) {
// indefinite singular
return truncateRunes(input, 2)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یەک")) {
// indefinite singular
return truncateRunes(input, 3)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("ەکە")) {
// definite singular
return truncateRunes(input, 3)
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("کە")) {
// definite singular
return truncateRunes(input, 2)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("ەکان")) {
// definite plural
return truncateRunes(input, 4)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("کان")) {
// definite plural
return truncateRunes(input, 3)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانی")) {
// indefinite plural ezafe
return truncateRunes(input, 4)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انی")) {
// indefinite plural ezafe
return truncateRunes(input, 3)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("یان")) {
// indefinite plural
return truncateRunes(input, 3)
} else if inputLen > 5 && bytes.HasSuffix(input, []byte("ان")) {
// indefinite plural
return truncateRunes(input, 2)
} else if inputLen > 7 && bytes.HasSuffix(input, []byte("یانە")) {
// demonstrative plural
return truncateRunes(input, 4)
} else if inputLen > 6 && bytes.HasSuffix(input, []byte("انە")) {
// demonstrative plural
return truncateRunes(input, 3)
} else if inputLen > 5 && (bytes.HasSuffix(input, []byte("ایە")) || bytes.HasSuffix(input, []byte("ەیە"))) {
// demonstrative singular
return truncateRunes(input, 2)
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ە")) {
// demonstrative singular
return truncateRunes(input, 1)
} else if inputLen > 4 && bytes.HasSuffix(input, []byte("ی")) {
// absolute singular ezafe
return truncateRunes(input, 1)
}
return input
}
func truncateRunes(input []byte, num int) []byte {
runes := bytes.Runes(input)
runes = runes[:len(runes)-num]
out := buildTermFromRunes(runes)
return out
}
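// buildTermFromRunes re-encodes the remaining runes as UTF-8; it is a local
// equivalent of the analysis.BuildTermFromRunes helper used by the other
// filters in this changeset.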
func buildTermFromRunes(runes []rune) []byte {
rv := make([]byte, 0, len(runes)*4)
for _, r := range runes {
runeBytes := make([]byte, utf8.RuneLen(r))
utf8.EncodeRune(runeBytes, r)
rv = append(rv, runeBytes...)
}
return rv
}
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSoraniStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
if err != nil {
panic(err)
}
}
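// A worked trace of the suffix stripping above: for پیاوەکان ("the men",
// 8 runes) the postposition and possessive checks fail, then the
// definite-plural rule (rune count > 7 with suffix ەکان) fires and
// truncateRunes drops the last 4 runes, leaving پیاو. Note that each branch
// measures length in runes but matches suffixes with bytes.HasSuffix, so the
// byte-level suffix and the rune-level truncation count must stay in sync.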

View file

@ -0,0 +1,299 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ckb
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
)
func TestSoraniStemmerFilter(t *testing.T) {
// in order to match the lucene tests
// we will test with an analyzer, not just the stemmer
analyzer := analysis.DefaultAnalyzer{
Tokenizer: single.NewSingleTokenTokenizer(),
TokenFilters: []analysis.TokenFilter{
NewSoraniNormalizeFilter(),
NewSoraniStemmerFilter(),
},
}
tests := []struct {
input []byte
output analysis.TokenStream
}{
{ // -ek
input: []byte("پیاوێک"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -yek
input: []byte("دەرگایەک"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -aka
input: []byte("پیاوەكە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -ka
input: []byte("دەرگاكە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -a
input: []byte("کتاویە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("کتاوی"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -ya
input: []byte("دەرگایە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -An
input: []byte("پیاوان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -yAn
input: []byte("دەرگایان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -akAn
input: []byte("پیاوەکان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -kAn
input: []byte("دەرگاکان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -Ana
input: []byte("پیاوانە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پیاو"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -yAna
input: []byte("دەرگایانە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دەرگا"),
Position: 1,
Start: 0,
End: 18,
},
},
},
{ // Ezafe singular
input: []byte("هۆتیلی"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هۆتیل"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // Ezafe indefinite
input: []byte("هۆتیلێکی"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هۆتیل"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // Ezafe plural
input: []byte("هۆتیلانی"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هۆتیل"),
Position: 1,
Start: 0,
End: 16,
},
},
},
{ // -awa
input: []byte("دوورەوە"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("دوور"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -dA
input: []byte("نیوەشەودا"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("نیوەشەو"),
Position: 1,
Start: 0,
End: 18,
},
},
},
{ // -A
input: []byte("سۆرانا"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("سۆران"),
Position: 1,
Start: 0,
End: 12,
},
},
},
{ // -mAn
input: []byte("پارەمان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پارە"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -tAn
input: []byte("پارەتان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پارە"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // -yAn
input: []byte("پارەیان"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("پارە"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{ // empty
input: []byte(""),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
Position: 1,
Start: 0,
End: 0,
},
},
},
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("for input %s(% x)", test.input, test.input)
t.Errorf("\texpected:")
for _, token := range test.output {
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
}
t.Errorf("\tactual:")
for _, token := range actual {
t.Errorf("\t\t%v %s(% x)", token, token.Term, token.Term)
}
}
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ckb
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,163 @@
package ckb
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_ckb"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var SoraniStopWords = []byte(`# set of kurdish stopwords
# note these have been normalized with our scheme (e represented with U+06D5, etc)
# constructed from:
# * Fig 5 of "Building A Test Collection For Sorani Kurdish" (Esmaili et al)
# * "Sorani Kurdish: A Reference Grammar with selected readings" (Thackston)
# * Corpus-based analysis of 77M word Sorani collection: wikipedia, news, blogs, etc
# and
و
# which
کە
# of
ی
# made/did
کرد
# that/which
ئەوەی
# on/head
سەر
# two
دوو
# also
هەروەها
# from/that
لەو
# makes/does
دەکات
# some
چەند
# every
هەر
# demonstratives
# that
ئەو
# this
ئەم
# personal pronouns
# I
من
# we
ئێمە
# you
تۆ
# you
ئێوە
# he/she/it
ئەو
# they
ئەوان
# prepositions
# to/with/by
بە
پێ
# without
بەبێ
# along with/while/during
بەدەم
# in the opinion of
بەلای
# according to
بەپێی
# before
بەرلە
# in the direction of
بەرەوی
# in front of/toward
بەرەوە
# before/in the face of
بەردەم
# without
بێ
# except for
بێجگە
# for
بۆ
# on/in
دە
تێ
# with
دەگەڵ
# after
دوای
# except for/aside from
جگە
# in/from
لە
لێ
# in front of/before/because of
لەبەر
# between/among
لەبەینی
# concerning/about
لەبابەت
# concerning
لەبارەی
# instead of
لەباتی
# beside
لەبن
# instead of
لەبرێتی
# behind
لەدەم
# with/together with
لەگەڵ
# by
لەلایەن
# within
لەناو
# between/among
لەنێو
# for the sake of
لەپێناوی
# with respect to
لەرەوی
# by means of/for
لەرێ
# for the sake of
لەرێگا
# on/on top of/according to
لەسەر
# under
لەژێر
# between/among
ناو
# between/among
نێوان
# after
پاش
# before
پێش
# like
وەک
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SoraniStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package cs
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,199 @@
package cs
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_cs"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var CzechStopWords = []byte(`a
s
k
o
i
u
v
z
dnes
cz
tímto
budeš
budem
byli
jseš
můj
svým
ta
tomto
tohle
tuto
tyto
jej
zda
proč
máte
tato
kam
tohoto
kdo
kteří
mi
nám
tom
tomuto
mít
nic
proto
kterou
byla
toho
protože
asi
ho
naši
napište
re
což
tím
takže
svých
její
svými
jste
aj
tu
tedy
teto
bylo
kde
ke
pravé
ji
nad
nejsou
či
pod
téma
mezi
přes
ty
pak
vám
ani
když
však
neg
jsem
tento
článku
články
aby
jsme
před
pta
jejich
byl
ještě
bez
také
pouze
první
vaše
která
nás
nový
tipy
pokud
může
strana
jeho
své
jiné
zprávy
nové
není
vás
jen
podle
zde
být
více
bude
již
než
který
by
které
co
nebo
ten
tak
při
od
po
jsou
jak
další
ale
si
se
ve
to
jako
za
zpět
ze
do
pro
je
na
atd
atp
jakmile
přičemž
on
ona
ono
oni
ony
my
vy
ji
mne
jemu
tomu
těm
těmu
němu
němuž
jehož
jíž
jelikož
jež
jakož
načež
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(CzechStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}
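// This changeset registers only the Czech stop-word token map and filter; a
// minimal sketch of wiring them into an analyzer (the chain below is an
// assumption modeled on the other language packages here, not a registered
// "cs" analyzer):
//
//	cache := registry.NewCache()
//	tokenizer, _ := cache.TokenizerNamed(unicode.Name)
//	toLower, _ := cache.TokenFilterNamed(lowercase.Name)
//	stopCs, _ := cache.TokenFilterNamed(StopName) // "stop_cs"
//	analyzer := analysis.DefaultAnalyzer{
//		Tokenizer:    tokenizer,
//		TokenFilters: []analysis.TokenFilter{toLower, stopCs},
//	}
//	tokens := analyzer.Analyze([]byte("dnes je nový den")) // leaves only "den"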

View file

@ -0,0 +1,59 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package da
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/registry"
)
const AnalyzerName = "da"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopDaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerDaFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDaFilter,
stemmerDaFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,71 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package da
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestDanishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("undersøg"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("undersøg"),
Position: 1,
Start: 0,
End: 9,
},
},
},
{
input: []byte("undersøgelse"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("undersøg"),
Position: 1,
Start: 0,
End: 13,
},
},
},
// stop word
{
input: []byte("på"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View file

@ -0,0 +1,52 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package da
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/danish"
)
const SnowballStemmerName = "stemmer_da_snowball"
type DanishStemmerFilter struct {
}
func NewDanishStemmerFilter() *DanishStemmerFilter {
return &DanishStemmerFilter{}
}
func (s *DanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
danish.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func DanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewDanishStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, DanishStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package da
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,137 @@
package da
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_da"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Danish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
og | and
i | in
jeg | I
det | that (dem. pronoun)/it (pers. pronoun)
at | that (in front of a sentence)/to (with infinitive)
en | a/an
den | it (pers. pronoun)/that (dem. pronoun)
til | to/at/for/until/against/by/of/into, more
er | present tense of "to be"
som | who, as
på | on/upon/in/on/at/to/after/of/with/for, on
de | they
med | with/by/in, along
han | he
af | of/by/from/off/for/in/with/on, off
for | at/for/to/from/by/of/ago, in front/before, because
ikke | not
der | who/which, there/those
var | past tense of "to be"
mig | me/myself
sig | oneself/himself/herself/itself/themselves
men | but
et | a/an/one, one (number), someone/somebody/one
har | present tense of "to have"
om | round/about/for/in/a, about/around/down, if
vi | we
min | my
havde | past tense of "to have"
ham | him
hun | she
nu | now
over | over/above/across/by/beyond/past/on/about, over/past
da | then, when/as/since
fra | from/off/since, off, since
du | you
ud | out
sin | his/her/its/one's
dem | them
os | us/ourselves
op | up
man | you/one
hans | his
hvor | where
eller | or
hvad | what
skal | must/shall etc.
selv | myself/youself/herself/ourselves etc., even
her | here
alle | all/everyone/everybody etc.
vil | will (verb)
blev | past tense of "to stay/to remain/to get/to become"
kunne | could
ind | in
når | when
være | present tense of "to be"
dog | however/yet/after all
noget | something
ville | would
jo | you know/you see (adv), yes
deres | their/theirs
efter | after/behind/according to/for/by/from, later/afterwards
ned | down
skulle | should
denne | this
end | than
dette | this
mit | my/mine
også | also
under | under/beneath/below/during, below/underneath
have | have
dig | you
anden | other
hende | her
mine | my
alt | everything
meget | much/very, plenty of
sit | his, her, its, one's
sine | his, her, its, one's
vor | our
mod | against
disse | these
hvis | if
din | your/yours
nogle | some
hos | by/at
blive | be/become
mange | many
ad | by/through
bliver | present tense of "to be/to become"
hendes | her/hers
været | be
thi | for (conj)
jer | you
sådan | such, like this/like that
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(DanishStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,64 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/registry"
)
const AnalyzerName = "de"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopDeFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
lightStemmerDeFilter, err := cache.TokenFilterNamed(LightStemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopDeFilter,
normalizeDeFilter,
lightStemmerDeFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,155 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestGermanAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte("Tisch"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 5,
},
},
},
{
input: []byte("Tische"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("Tischen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("tisch"),
Position: 1,
Start: 0,
End: 7,
},
},
},
// german specials
{
input: []byte("Schaltflächen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("schaltflach"),
Position: 1,
Start: 0,
End: 14,
},
},
},
{
input: []byte("Schaltflaechen"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("schaltflach"),
Position: 1,
Start: 0,
End: 14,
},
},
},
// tests added by marty to increase coverage
{
input: []byte("Blechern"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("blech"),
Position: 1,
Start: 0,
End: 8,
},
},
},
{
input: []byte("Klecks"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("kleck"),
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("Mindestens"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("mindest"),
Position: 1,
Start: 0,
End: 10,
},
},
},
{
input: []byte("Kugelfest"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("kugelf"),
Position: 1,
Start: 0,
End: 9,
},
},
},
{
input: []byte("Baldigst"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baldig"),
Position: 1,
Start: 0,
End: 8,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View file

@ -0,0 +1,98 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const NormalizeName = "normalize_de"
const (
N = 0 /* ordinary state */
V = 1 /* stops 'u' from entering umlaut state */
U = 2 /* umlaut state, allows e-deletion */
)
type GermanNormalizeFilter struct {
}
func NewGermanNormalizeFilter() *GermanNormalizeFilter {
return &GermanNormalizeFilter{}
}
func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
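// normalize folds umlauts to their base vowels, expands ß to ss, and deletes
// an 'e' that follows a/o/u so that the "ae"/"oe"/"ue" spellings match the
// folded "ä"/"ö"/"ü" forms. The state machine guards that deletion: U means
// the previous letter could start an umlaut digraph, V means a following
// 'u' or 'e' is an ordinary letter. Tracing "flaechen": 'a' enters state U,
// the 'e' is deleted, giving "flachen". Tracing "dauer": the 'u' is reached
// in state U and moves to V, so the following 'e' survives.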
func normalize(input []byte) []byte {
state := N
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case 'a', 'o':
state = U
case 'u':
if state == N {
state = U
} else {
state = V
}
case 'e':
if state == U {
runes = analysis.DeleteRune(runes, i)
i--
}
state = V
case 'i', 'q', 'y':
state = V
case 'ä':
runes[i] = 'a'
state = V
case 'ö':
runes[i] = 'o'
state = V
case 'ü':
runes[i] = 'u'
state = V
case 'ß':
runes[i] = 's'
i++
runes = analysis.InsertRune(runes, i, 's')
state = N
default:
state = N
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewGermanNormalizeFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,103 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestGermanNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// Tests that a/o/u + e is equivalent to the umlaut form
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflächen"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflachen"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflaechen"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Schaltflachen"),
},
},
},
// Tests the specific heuristic that ue is not folded after a vowel or q.
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("dauer"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("dauer"),
},
},
},
// Tests german specific folding of sharp-s
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("weißbier"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("weissbier"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
germanNormalizeFilter := NewGermanNormalizeFilter()
for _, test := range tests {
actual := germanNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,119 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const LightStemmerName = "stemmer_de_light"
type GermanLightStemmerFilter struct {
}
func NewGermanLightStemmerFilter() *GermanLightStemmerFilter {
return &GermanLightStemmerFilter{}
}
func (s *GermanLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runes := bytes.Runes(token.Term)
runes = stem(runes)
token.Term = analysis.BuildTermFromRunes(runes)
}
return input
}
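// stem first folds accented vowels (including umlauts, matching the
// normalizer in this package) to their base letters, then runs two suffix-
// stripping passes: step1 removes declension endings (-ern, -em/-en/-er/-es,
// -e, and a final -s after a typical stem-final consonant), step2 removes
// -est, -er/-en, and -st. Tracing "kugelfest": step1 leaves it untouched and
// step2 strips "est", yielding "kugelf", the term the analyzer test above
// expects.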
func stem(input []rune) []rune {
for i, r := range input {
switch r {
case 'ä', 'à', 'á', 'â':
input[i] = 'a'
case 'ö', 'ò', 'ó', 'ô':
input[i] = 'o'
case 'ï', 'ì', 'í', 'î':
input[i] = 'i'
case 'ü', 'ù', 'ú', 'û':
input[i] = 'u'
}
}
input = step1(input)
return step2(input)
}
func stEnding(ch rune) bool {
switch ch {
case 'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't':
return true
}
return false
}
func step1(s []rune) []rune {
l := len(s)
if l > 5 && s[l-3] == 'e' && s[l-2] == 'r' && s[l-1] == 'n' {
return s[:l-3]
}
if l > 4 && s[l-2] == 'e' {
switch s[l-1] {
case 'm', 'n', 'r', 's':
return s[:l-2]
}
}
if l > 3 && s[l-1] == 'e' {
return s[:l-1]
}
if l > 3 && s[l-1] == 's' && stEnding(s[l-2]) {
return s[:l-1]
}
return s
}
func step2(s []rune) []rune {
l := len(s)
if l > 5 && s[l-3] == 'e' && s[l-2] == 's' && s[l-1] == 't' {
return s[:l-3]
}
if l > 4 && s[l-2] == 'e' && (s[l-1] == 'r' || s[l-1] == 'n') {
return s[:l-2]
}
if l > 4 && s[l-2] == 's' && s[l-1] == 't' && stEnding(s[l-3]) {
return s[:l-2]
}
return s
}
func GermanLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewGermanLightStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(LightStemmerName, GermanLightStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,52 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/german"
)
const SnowballStemmerName = "stemmer_de_snowball"
type GermanStemmerFilter struct {
}
func NewGermanStemmerFilter() *GermanStemmerFilter {
return &GermanStemmerFilter{}
}
func (s *GermanStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
german.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func GermanStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewGermanStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, GermanStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,91 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSnowballGermanStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abzuschrecken"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abzuschreck"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abzuwarten"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abzuwart"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("zwirnfabrik"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("zwirnfabr"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("zyniker"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("zynik"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,321 @@
package de
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_de"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A German stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The number of forms in this list is reduced significantly by passing it
| through the German stemmer.
aber | but
alle | all
allem
allen
aller
alles
als | than, as
also | so
am | an + dem
an | at
ander | other
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch | also
auf | on
aus | out of
bei | by
bin | am
bis | until
bist | art
da | there
damit | with it
dann | then
der | the
den
des
dem
die
das
daß | that
derselbe | the same
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu | to that
dein | thy
deine
deinem
deinen
deiner
deines
denn | because
derer | of those
dessen | of him
dich | thee
dir | to thee
du | thou
dies | this
diese
diesem
diesen
dieser
dieses
doch | (several meanings)
dort | (over) there
durch | through
ein | a
eine
einem
einen
einer
eines
einig | some
einige
einigem
einigen
einiger
einiges
einmal | once
er | he
ihn | him
ihm | to him
es | it
etwas | something
euer | your
eure
eurem
euren
eurer
eures
für | for
gegen | towards
gewesen | p.p. of sein
hab | have
habe | have
haben | have
hat | has
hatte | had
hatten | had
hier | here
hin | there
hinter | behind
ich | I
mich | me
mir | to me
ihr | you, to her
ihre
ihrem
ihren
ihrer
ihres
euch | to you
im | in + dem
in | in
indem | while
ins | in + das
ist | is
jede | each, every
jedem
jeden
jeder
jedes
jene | that
jenem
jenen
jener
jenes
jetzt | now
kann | can
kein | no
keine
keinem
keinen
keiner
keines
können | can
könnte | could
machen | do
man | one
manche | some, many a
manchem
manchen
mancher
manches
mein | my
meine
meinem
meinen
meiner
meines
mit | with
muss | must
musste | had to
nach | to(wards)
nicht | not
nichts | nothing
noch | still, yet
nun | now
nur | only
ob | whether
oder | or
ohne | without
sehr | very
sein | his
seine
seinem
seinen
seiner
seines
selbst | self
sich | herself
sie | they, she
ihnen | to them
sind | are
so | so
solche | such
solchem
solchen
solcher
solches
soll | shall
sollte | should
sondern | but
sonst | else
über | over
um | about, around
und | and
uns | us
unse
unsem
unsen
unser
unses
unter | under
viel | much
vom | von + dem
von | from
vor | before
während | while
war | was
waren | were
warst | wast
was | what
weg | away, off
weil | because
weiter | further
welche | which
welchem
welchen
welcher
welches
wenn | when
werde | will
werden | will
wie | how
wieder | again
will | want
wir | we
wird | will
wirst | willst
wo | where
wollen | want
wollte | wanted
würde | would
würden | would
zu | to
zum | zu + dem
zur | zu + der
zwar | indeed
zwischen | between
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GermanStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}
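
The name "stop_de" is registered twice on purpose: once as the token map above and once as the token filter built from it earlier in this diff. A minimal sketch of the wiring (the sample tokens are illustrative):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/lang/de"
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	stopFilter, err := cache.TokenFilterNamed(de.StopName) // "stop_de"
	if err != nil {
		panic(err)
	}
	out := stopFilter.Filter(analysis.TokenStream{
		&analysis.Token{Term: []byte("der")},  // in GermanStopWords, dropped
		&analysis.Token{Term: []byte("hund")}, // kept
	})
	for _, tok := range out {
		fmt.Println(string(tok.Term)) // hund
	}
}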

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package el
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,105 @@
package el
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_el"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var GreekStopWords = []byte(`# Lucene Greek Stopwords list
# Note: by default this file is used after GreekLowerCaseFilter,
# so when modifying this file use 'σ' instead of 'ς'
ο
η
το
οι
τα
του
τησ
των
τον
την
και
κι
κ
ειμαι
εισαι
ειναι
ειμαστε
ειστε
στο
στον
στη
στην
μα
αλλα
απο
για
προσ
με
σε
ωσ
παρα
αντι
κατα
μετα
θα
να
δε
δεν
μη
μην
επι
ενω
εαν
αν
τοτε
που
πωσ
ποιοσ
ποια
ποιο
ποιοι
ποιεσ
ποιων
ποιουσ
αυτοσ
αυτη
αυτο
αυτοι
αυτων
αυτουσ
αυτεσ
αυτα
εκεινοσ
εκεινη
εκεινο
εκεινοι
εκεινεσ
εκεινα
εκεινων
εκεινουσ
οπωσ
ομωσ
ισωσ
οσο
οτι
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GreekStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}
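
The 'σ'-instead-of-'ς' note in the header is why entries such as αυτοσ look misspelled: in the upstream Lucene pipeline a Greek-aware lowercase filter folds final sigma to medial sigma before stop filtering, so the map must store the folded form. A rough sketch of that matching assumption (the folding step here is hand-rolled for illustration; it is not the filter bleve actually uses):

package main

import (
	"fmt"
	"strings"

	"github.com/blevesearch/bleve/v2/analysis/lang/el"
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	tokenMap, err := cache.TokenMapNamed(el.StopName) // "stop_el"
	if err != nil {
		panic(err)
	}
	// "αυτός" only hits the map once final 'ς' is folded to 'σ'
	folded := strings.ReplaceAll(strings.ToLower("αυτός"), "ς", "σ")
	fmt.Println(folded, tokenMap[folded]) // αυτοσ true
}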

View file

@ -0,0 +1,73 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package en implements an analyzer with reasonable defaults for processing
// English text.
//
// It strips possessive suffixes ('s), transforms tokens to lower case,
// removes stopwords from a built-in list, and applies porter stemming.
//
// The built-in stopwords list is defined in EnglishStopWords.
package en
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/porter"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "en"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
possEnFilter, err := cache.TokenFilterNamed(PossessiveName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerEnFilter, err := cache.TokenFilterNamed(porter.Name)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
possEnFilter,
toLowerFilter,
stopEnFilter,
stemmerEnFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}
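
A hedged sketch of how the "en" analyzer is typically selected when building an index (the field and document names are invented for illustration):

package main

import (
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis/lang/en"
)

func main() {
	// use the "en" analyzer (possessive -> lowercase -> stop -> porter) by default
	mapping := bleve.NewIndexMapping()
	mapping.DefaultAnalyzer = en.AnalyzerName
	idx, err := bleve.NewMemOnly(mapping)
	if err != nil {
		panic(err)
	}
	defer idx.Close()
	// "Steven's books" is indexed as the terms "steven" and "book"
	if err := idx.Index("doc1", map[string]interface{}{"title": "Steven's books"}); err != nil {
		panic(err)
	}
}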

View file

@ -0,0 +1,105 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestEnglishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("books"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("book"),
Position: 1,
Start: 0,
End: 5,
},
},
},
{
input: []byte("book"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("book"),
Position: 1,
Start: 0,
End: 4,
},
},
},
// stop word removal
{
input: []byte("the"),
output: analysis.TokenStream{},
},
// possessive removal
{
input: []byte("steven's"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("steven"),
Position: 1,
Start: 0,
End: 8,
},
},
},
{
input: []byte("steven\u2019s"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("steven"),
Position: 1,
Start: 0,
End: 10,
},
},
},
{
input: []byte("steven\uFF07s"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("steven"),
Position: 1,
Start: 0,
End: 10,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View file

@ -0,0 +1,177 @@
/*
This code was ported from the Open Search Project
https://github.com/opensearch-project/OpenSearch/blob/main/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java
The algorithm itself was created by Mark Harwood
https://github.com/markharwood
*/
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package en
import (
"strings"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const PluralStemmerName = "stemmer_en_plural"
type EnglishPluralStemmerFilter struct {
}
func NewEnglishPluralStemmerFilter() *EnglishPluralStemmerFilter {
return &EnglishPluralStemmerFilter{}
}
func (s *EnglishPluralStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = []byte(stem(string(token.Term)))
}
return input
}
func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewEnglishPluralStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
// ----------------------------------------------------------------------------
// Words ending in oes that retain the e when stemmed
var oesExceptions = []string{"shoes", "canoes", "oboes"}
// Words ending in ches that retain the e when stemmed
var chesExceptions = []string{
"cliches",
"avalanches",
"mustaches",
"moustaches",
"quiches",
"headaches",
"heartaches",
"porsches",
"tranches",
"caches",
}
func stem(word string) string {
runes := []rune(strings.ToLower(word))
if len(runes) < 3 || runes[len(runes)-1] != 's' {
return string(runes)
}
switch runes[len(runes)-2] {
case 'u':
fallthrough
case 's':
return string(runes)
case 'e':
// Modified ies->y logic from original s-stemmer - only work on strings > 4
// so spies -> spy still but pies->pie.
// The original code also special-cased aies and eies for no good reason as far as I can tell.
// ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
if len(runes) > 4 && runes[len(runes)-3] == 'i' {
runes[len(runes)-3] = 'y'
return string(runes[0 : len(runes)-2])
}
// Suffix rules to remove any dangling "e"
if len(runes) > 3 {
// xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
if len(runes) > 4 && runes[len(runes)-3] == 'x' {
return string(runes[0 : len(runes)-2])
}
// oes
if len(runes) > 3 && runes[len(runes)-3] == 'o' {
if isException(runes, oesExceptions) {
// Only remove the S
return string(runes[0 : len(runes)-1])
}
// Remove the es
return string(runes[0 : len(runes)-2])
}
if len(runes) > 4 {
// shes/sses
if runes[len(runes)-4] == 's' && (runes[len(runes)-3] == 'h' || runes[len(runes)-3] == 's') {
return string(runes[0 : len(runes)-2])
}
// ches
if len(runes) > 4 {
if runes[len(runes)-4] == 'c' && runes[len(runes)-3] == 'h' {
if isException(runes, chesExceptions) {
// Only remove the S
return string(runes[0 : len(runes)-1])
}
// Remove the es
return string(runes[0 : len(runes)-2])
}
}
}
}
fallthrough
default:
return string(runes[0 : len(runes)-1])
}
}
func isException(word []rune, exceptions []string) bool {
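	// compare the exception and the word rune-by-rune from the end;
	// it counts as a match if no rune differs before either side is exhausted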
for _, exception := range exceptions {
exceptionRunes := []rune(exception)
exceptionPos := len(exceptionRunes) - 1
wordPos := len(word) - 1
matched := true
for exceptionPos >= 0 && wordPos >= 0 {
if exceptionRunes[exceptionPos] != word[wordPos] {
matched = false
break
}
exceptionPos--
wordPos--
}
if matched {
return true
}
}
return false
}

View file

@ -0,0 +1,46 @@
package en
import "testing"
func TestEnglishPluralStemmer(t *testing.T) {
data := []struct {
In, Out string
}{
{"dresses", "dress"},
{"dress", "dress"},
{"axes", "axe"},
{"ad", "ad"},
{"ads", "ad"},
{"gas", "ga"},
{"sass", "sass"},
{"berries", "berry"},
{"dresses", "dress"},
{"spies", "spy"},
{"shoes", "shoe"},
{"headaches", "headache"},
{"computer", "computer"},
{"dressing", "dressing"},
{"clothes", "clothe"},
{"DRESSES", "dress"},
{"frog", "frog"},
{"dress", "dress"},
{"runs", "run"},
{"pies", "pie"},
{"foxes", "fox"},
{"axes", "axe"},
{"foes", "fo"},
{"dishes", "dish"},
{"snitches", "snitch"},
{"cliches", "cliche"},
{"forests", "forest"},
{"yes", "ye"},
}
for _, datum := range data {
stemmed := stem(datum.In)
if stemmed != datum.Out {
t.Errorf("expected %v but got %v", datum.Out, stemmed)
}
}
}

View file

@ -0,0 +1,70 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// PossessiveName is the name PossessiveFilter is registered as
// in the bleve registry.
const PossessiveName = "possessive_en"
const rightSingleQuotationMark = '’' // U+2019
const apostrophe = '\''
const fullWidthApostrophe = ''' // U+FF07
const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
// PossessiveFilter implements a TokenFilter which
// strips the English possessive suffix ('s) from tokens.
// It handles a variety of apostrophe types, is case-insensitive,
// and doesn't distinguish between possessive and contraction
// (i.e. "She's So Rad" becomes "She So Rad").
type PossessiveFilter struct {
}
func NewPossessiveFilter() *PossessiveFilter {
return &PossessiveFilter{}
}
func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term)
if lastRune == 's' || lastRune == 'S' {
nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize])
if nextLastRune == rightSingleQuotationMark ||
nextLastRune == apostrophe ||
nextLastRune == fullWidthApostrophe {
token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize]
}
}
}
return input
}
func PossessiveFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPossessiveFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,142 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestEnglishPossessiveFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("marty's"),
},
&analysis.Token{
Term: []byte("MARTY'S"),
},
&analysis.Token{
Term: []byte("martys"),
},
&analysis.Token{
Term: []byte("MARTYS"),
},
&analysis.Token{
Term: []byte("martys"),
},
&analysis.Token{
Term: []byte("MARTYS"),
},
&analysis.Token{
Term: []byte("m"),
},
&analysis.Token{
Term: []byte("s"),
},
&analysis.Token{
Term: []byte("'s"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("marty"),
},
&analysis.Token{
Term: []byte("MARTY"),
},
&analysis.Token{
Term: []byte("marty"),
},
&analysis.Token{
Term: []byte("MARTY"),
},
&analysis.Token{
Term: []byte("marty"),
},
&analysis.Token{
Term: []byte("MARTY"),
},
&analysis.Token{
Term: []byte("m"),
},
&analysis.Token{
Term: []byte("s"),
},
&analysis.Token{
Term: []byte(""),
},
},
},
}
cache := registry.NewCache()
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := stemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}
func BenchmarkEnglishPossessiveFilter(b *testing.B) {
input := analysis.TokenStream{
&analysis.Token{
Term: []byte("marty's"),
},
&analysis.Token{
Term: []byte("MARTY'S"),
},
&analysis.Token{
Term: []byte("martys"),
},
&analysis.Token{
Term: []byte("MARTYS"),
},
&analysis.Token{
Term: []byte("martys"),
},
&analysis.Token{
Term: []byte("MARTYS"),
},
&analysis.Token{
Term: []byte("m"),
},
}
cache := registry.NewCache()
stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
stemmerFilter.Filter(input)
}
}

View file

@ -0,0 +1,52 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/english"
)
const SnowballStemmerName = "stemmer_en_snowball"
type EnglishStemmerFilter struct {
}
func NewEnglishStemmerFilter() *EnglishStemmerFilter {
return &EnglishStemmerFilter{}
}
func (s *EnglishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
english.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func EnglishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewEnglishStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, EnglishStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,79 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSnowballEnglishStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("enjoy"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("enjoy"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("enjoyed"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("enjoy"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("enjoyable"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("enjoy"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package en
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,347 @@
package en
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_en"
// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter.
//
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| An English stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| Many of the forms below are quite rare (e.g. "yourselves") but included for
| completeness.
| PRONOUNS FORMS
| 1st person sing
i | subject, always in upper case of course
me | object
my | possessive adjective
| the possessive pronoun 'mine' is best suppressed, because of the
| sense of coal-mine etc.
myself | reflexive
| 1st person plural
we | subject
| us | object
| care is required here because US = United States. It is usually
| safe to remove it if it is in lower case.
our | possessive adjective
ours | possessive pronoun
ourselves | reflexive
| second person (archaic 'thou' forms not included)
you | subject and object
your | possessive adjective
yours | possessive pronoun
yourself | reflexive (singular)
yourselves | reflexive (plural)
| third person singular
he | subject
him | object
his | possessive adjective and pronoun
himself | reflexive
she | subject
her | object and possessive adjective
hers | possessive pronoun
herself | reflexive
it | subject and object
its | possessive adjective
itself | reflexive
| third person plural
they | subject
them | object
their | possessive adjective
theirs | possessive pronoun
themselves | reflexive
| other forms (demonstratives, interrogatives)
what
which
who
whom
this
that
these
those
| VERB FORMS (using F.R. Palmer's nomenclature)
| BE
am | 1st person, present
is | -s form (3rd person, present)
are | present
was | 1st person, past
were | past
be | infinitive
been | past participle
being | -ing form
| HAVE
have | simple
has | -s form
had | past
having | -ing form
| DO
do | simple
does | -s form
did | past
doing | -ing form
| The forms below are, I believe, best omitted, because of the significant
| homonym forms:
| He made a WILL
| old tin CAN
| merry month of MAY
| a smell of MUST
| fight the good fight with all thy MIGHT
| would, could, should, ought might however be included
| | AUXILIARIES
| | WILL
|will
would
| | SHALL
|shall
should
| | CAN
|can
could
| | MAY
|may
|might
| | MUST
|must
| | OUGHT
ought
| COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
| pronoun + verb
i'm
you're
he's
she's
it's
we're
they're
i've
you've
we've
they've
i'd
you'd
he'd
she'd
we'd
they'd
i'll
you'll
he'll
she'll
we'll
they'll
| verb + negation
isn't
aren't
wasn't
weren't
hasn't
haven't
hadn't
doesn't
don't
didn't
| auxiliary + negation
won't
wouldn't
shan't
shouldn't
can't
cannot
couldn't
mustn't
| miscellaneous forms
let's
that's
who's
what's
here's
there's
when's
where's
why's
how's
| rarer forms
| daren't needn't
| doubtful forms
| oughtn't mightn't
| ARTICLES
a
an
the
| THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
| high, that classification is pointless.)
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
| Just for the record, the following words are among the commonest in English
| one
| every
| least
| less
| many
| now
| ever
| never
| say
| says
| said
| also
| get
| go
| goes
| just
| made
| make
| put
| see
| seen
| whether
| like
| well
| back
| even
| still
| way
| take
| since
| another
| however
| two
| three
| four
| five
| first
| second
| new
| old
| high
| long
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(EnglishStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,66 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "es"
func AnalyzerConstructor(config map[string]interface{},
cache *registry.Cache) (analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
normalizeEsFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stopEsFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
lightStemmerEsFilter, err := cache.TokenFilterNamed(LightStemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEsFilter,
normalizeEsFilter,
lightStemmerEsFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}
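
One ordering detail worth noting above: stop_es runs before normalize_es, because the stop list keeps its accents (más, él, qué); if accent folding ran first, those entries would never match. A minimal sketch (the sample input is illustrative; "chican" follows the light stemmer defined later in this diff):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis/lang/es"
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(es.AnalyzerName) // "es"
	if err != nil {
		panic(err)
	}
	// "más" is dropped by stop_es while still accented;
	// "chicanos" is then normalized and light-stemmed
	for _, tok := range analyzer.Analyze([]byte("más chicanos")) {
		fmt.Println(string(tok.Term)) // chican
	}
}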

View file

@ -0,0 +1,122 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSpanishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("chicana"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chican"),
Position: 1,
Start: 0,
End: 7,
},
},
},
{
input: []byte("chicano"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chican"),
Position: 1,
Start: 0,
End: 7,
},
},
},
// added by marty for better coverage
{
input: []byte("yeses"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("yes"),
Position: 1,
Start: 0,
End: 5,
},
},
},
{
input: []byte("jaeces"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("jaez"),
Position: 1,
Start: 0,
End: 6,
},
},
},
{
input: []byte("arcos"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("arc"),
Position: 1,
Start: 0,
End: 5,
},
},
},
{
input: []byte("caos"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("caos"),
Position: 1,
Start: 0,
End: 4,
},
},
},
{
input: []byte("parecer"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("parecer"),
Position: 1,
Start: 0,
End: 7,
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %v, got %v", test.output, actual)
}
}
}

View file

@ -0,0 +1,78 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const LightStemmerName = "stemmer_es_light"
type SpanishLightStemmerFilter struct {
}
func NewSpanishLightStemmerFilter() *SpanishLightStemmerFilter {
return &SpanishLightStemmerFilter{}
}
func (s *SpanishLightStemmerFilter) Filter(
input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runes := bytes.Runes(token.Term)
runes = stem(runes)
token.Term = analysis.BuildTermFromRunes(runes)
}
return input
}
func stem(input []rune) []rune {
l := len(input)
if l < 5 {
return input
}
switch input[l-1] {
case 'o', 'a', 'e':
return input[:l-1]
case 's':
if input[l-2] == 'e' && input[l-3] == 's' && input[l-4] == 'e' {
return input[:l-2]
}
if input[l-2] == 'e' && input[l-3] == 'c' {
input[l-3] = 'z'
return input[:l-2]
}
if input[l-2] == 'o' || input[l-2] == 'a' || input[l-2] == 'e' {
return input[:l-2]
}
}
return input
}
func SpanishLightStemmerFilterConstructor(config map[string]interface{},
cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSpanishLightStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(LightStemmerName, SpanishLightStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
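
The only non-obvious branch above is the -ces handling: Spanish spells z as c before e, so plurals like peces and jaeces stem back to pez and jaez. A direct check of that branch (the sample words are illustrative):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/lang/es"
)

func main() {
	filter := es.NewSpanishLightStemmerFilter()
	out := filter.Filter(analysis.TokenStream{
		&analysis.Token{Term: []byte("peces")},  // -ces -> -z
		&analysis.Token{Term: []byte("jaeces")}, // -ces -> -z
	})
	for _, tok := range out {
		fmt.Println(string(tok.Term)) // pez, jaez
	}
}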

View file

@ -0,0 +1,70 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const NormalizeName = "normalize_es"
type SpanishNormalizeFilter struct {
}
func NewSpanishNormalizeFilter() *SpanishNormalizeFilter {
return &SpanishNormalizeFilter{}
}
func (s *SpanishNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case 'à', 'á', 'â', 'ä':
runes[i] = 'a'
case 'ò', 'ó', 'ô', 'ö':
runes[i] = 'o'
case 'è', 'é', 'ê', 'ë':
runes[i] = 'e'
case 'ù', 'ú', 'û', 'ü':
runes[i] = 'u'
case 'ì', 'í', 'î', 'ï':
runes[i] = 'i'
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSpanishNormalizeFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,112 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestSpanishNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Guía"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Guia"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Belcebú"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Belcebu"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Limón"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Limon"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("agüero"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("aguero"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("laúd"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("laud"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
spanishNormalizeFilter := NewSpanishNormalizeFilter()
for _, test := range tests {
actual := spanishNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,52 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/spanish"
)
const SnowballStemmerName = "stemmer_es_snowball"
type SpanishStemmerFilter struct {
}
func NewSpanishStemmerFilter() *SpanishStemmerFilter {
return &SpanishStemmerFilter{}
}
func (s *SpanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
spanish.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func SpanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSpanishStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, SpanishStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,79 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSnowballSpanishStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("agresivos"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("agres"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("agresivamente"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("agres"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("agresividad"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("agres"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package es
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{},
cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,383 @@
package es
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_es"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Spanish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| The following is a ranked list (commonest to rarest) of stopwords
| deriving from a large sample of text.
| Extra words have been added at the end.
de | from, of
la | the, her
que | who, that
el | the
en | in
y | and
a | to
los | the, them
del | de + el
se | himself, from him etc
las | the, them
por | for, by, etc
un | a
para | for
con | with
no | no
una | a
su | his, her
al | a + el
| es from SER
lo | him
como | how
más | more
pero | pero
sus | su plural
le | to him, her
ya | already
o | or
| fue from SER
este | this
| ha from HABER
| himself etc
porque | because
esta | this
| son from SER
entre | between
| está from ESTAR
cuando | when
muy | very
sin | without
sobre | on
| ser from SER
| tiene from TENER
también | also
me | me
hasta | until
hay | there is/are
donde | where
| han from HABER
quien | whom, that
| están from ESTAR
| estado from ESTAR
desde | from
todo | all
nos | us
durante | during
| estados from ESTAR
todos | all
uno | a
les | to them
ni | nor
contra | against
otros | other
| fueron from SER
ese | that
eso | that
| había from HABER
ante | before
ellos | they
e | and (variant of y)
esto | this
| me
antes | before
algunos | some
qué | what?
unos | a
yo | I
otro | other
otras | other
otra | other
él | he
tanto | so much, many
esa | that
estos | these
mucho | much, many
quienes | who
nada | nothing
muchos | many
cual | who
| sea from SER
poco | few
ella | she
estar | to be
| haber from HABER
estas | these
| estaba from ESTAR
| estamos from ESTAR
algunas | some
algo | something
nosotros | we
| other forms
mi | me
mis | mi plural
| thou
te | thee
ti | thee
tu | thy
tus | tu plural
ellas | they
nosotras | we
vosotros | you
vosotras | you
os | you
mío | mine
mía |
míos |
mías |
tuyo | thine
tuya |
tuyos |
tuyas |
suyo | his, hers, theirs
suya |
suyos |
suyas |
nuestro | ours
nuestra |
nuestros |
nuestras |
vuestro | yours
vuestra |
vuestros |
vuestras |
esos | those
esas | those
| forms of estar, to be (not including the infinitive):
estoy
estás
está
estamos
estáis
están
esté
estés
estemos
estéis
estén
estaré
estarás
estará
estaremos
estaréis
estarán
estaría
estarías
estaríamos
estaríais
estarían
estaba
estabas
estábamos
estabais
estaban
estuve
estuviste
estuvo
estuvimos
estuvisteis
estuvieron
estuviera
estuvieras
estuviéramos
estuvierais
estuvieran
estuviese
estuvieses
estuviésemos
estuvieseis
estuviesen
estando
estado
estada
estados
estadas
estad
| forms of haber, to have (not including the infinitive):
he
has
ha
hemos
habéis
han
haya
hayas
hayamos
hayáis
hayan
habré
habrás
habrá
habremos
habréis
habrán
habría
habrías
habríamos
habríais
habrían
había
habías
habíamos
habíais
habían
hube
hubiste
hubo
hubimos
hubisteis
hubieron
hubiera
hubieras
hubiéramos
hubierais
hubieran
hubiese
hubieses
hubiésemos
hubieseis
hubiesen
habiendo
habido
habida
habidos
habidas
| forms of ser, to be (not including the infinitive):
soy
eres
es
somos
sois
son
sea
seas
seamos
seáis
sean
seré
serás
será
seremos
seréis
serán
sería
serías
seríamos
seríais
serían
era
eras
éramos
erais
eran
fui
fuiste
fue
fuimos
fuisteis
fueron
fuera
fueras
fuéramos
fuerais
fueran
fuese
fueses
fuésemos
fueseis
fuesen
siendo
sido
| sed also means 'thirst'
| forms of tener, to have (not including the infinitive):
tengo
tienes
tiene
tenemos
tenéis
tienen
tenga
tengas
tengamos
tengáis
tengan
tendré
tendrás
tendrá
tendremos
tendréis
tendrán
tendría
tendrías
tendríamos
tendríais
tendrían
tenía
tenías
teníamos
teníais
tenían
tuve
tuviste
tuvo
tuvimos
tuvisteis
tuvieron
tuviera
tuvieras
tuviéramos
tuvierais
tuvieran
tuviese
tuvieses
tuviésemos
tuvieseis
tuviesen
teniendo
tenido
tenida
tenidos
tenidas
tened
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SpanishStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package eu
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,126 @@
package eu
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_eu"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var BasqueStopWords = []byte(`# example set of basque stopwords
al
anitz
arabera
asko
baina
bat
batean
batek
bati
batzuei
batzuek
batzuetan
batzuk
bera
beraiek
berau
berauek
bere
berori
beroriek
beste
bezala
da
dago
dira
ditu
du
dute
edo
egin
ere
eta
eurak
ez
gainera
gu
gutxi
guzti
haiei
haiek
haietan
hainbeste
hala
han
handik
hango
hara
hari
hark
hartan
hau
hauei
hauek
hauetan
hemen
hemendik
hemengo
hi
hona
honek
honela
honetan
honi
hor
hori
horiei
horiek
horietan
horko
horra
horrek
horrela
horretan
horri
hortik
hura
izan
ni
noiz
nola
non
nondik
nongo
nor
nora
ze
zein
zen
zenbait
zenbat
zer
zergatik
ziren
zituen
zu
zuek
zuen
zuten
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(BasqueStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,74 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fa
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/char/zerowidthnonjoiner"
"github.com/blevesearch/bleve/v2/analysis/lang/ar"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "fa"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
zFilter, err := cache.CharFilterNamed(zerowidthnonjoiner.Name)
if err != nil {
return nil, err
}
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
if err != nil {
return nil, err
}
normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopFaFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
CharFilters: []analysis.CharFilter{
zFilter,
},
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
normArFilter,
normFaFilter,
stopFaFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,684 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fa
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestPersianAnalyzerVerbs(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// active present indicative
{
input: []byte("می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite indicative
{
input: []byte("خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active imperfective preterite indicative
{
input: []byte("می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active future indicative
{
input: []byte("خواهد خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active present progressive indicative
{
input: []byte("دارد می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite progressive indicative
{
input: []byte("داشت می‌خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active perfect indicative
{
input: []byte("خورده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective perfect indicative
{
input: []byte("می‌خورده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect indicative
{
input: []byte("خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect indicative
{
input: []byte("می‌خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active preterite subjunctive
{
input: []byte("خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective preterite subjunctive
{
input: []byte("می‌خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect subjunctive
{
input: []byte("خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect subjunctive
{
input: []byte("می‌خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present indicative
{
input: []byte("خورده می‌شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite indicative
{
input: []byte("خورده شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite indicative
{
input: []byte("خورده می‌شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive perfect indicative
{
input: []byte("خورده شده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective perfect indicative
{
input: []byte("خورده می‌شده‌است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect indicative
{
input: []byte("خورده شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect indicative
{
input: []byte("خورده می‌شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive future indicative
{
input: []byte("خورده خواهد شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present progressive indicative
{
input: []byte("دارد خورده می‌شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite progressive indicative
{
input: []byte("داشت خورده می‌شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present subjunctive
{
input: []byte("خورده شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite subjunctive
{
input: []byte("خورده شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite subjunctive
{
input: []byte("خورده می‌شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect subjunctive
{
input: []byte("خورده شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect subjunctive
{
input: []byte("خورده می‌شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active present subjunctive
{
input: []byte("بخورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بخورد"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}
func TestPersianAnalyzerVerbsDefective(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// active present indicative
{
input: []byte("مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite indicative
{
input: []byte("خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active imperfective preterite indicative
{
input: []byte("مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active future indicative
{
input: []byte("خواهد خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active present progressive indicative
{
input: []byte("دارد مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active preterite progressive indicative
{
input: []byte("داشت مي خورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورد"),
},
},
},
// active perfect indicative
{
input: []byte("خورده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective perfect indicative
{
input: []byte("مي خورده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect indicative
{
input: []byte("خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect indicative
{
input: []byte("مي خورده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active preterite subjunctive
{
input: []byte("خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective preterite subjunctive
{
input: []byte("مي خورده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active pluperfect subjunctive
{
input: []byte("خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active imperfective pluperfect subjunctive
{
input: []byte("مي خورده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present indicative
{
input: []byte("خورده مي شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite indicative
{
input: []byte("خورده شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite indicative
{
input: []byte("خورده مي شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive perfect indicative
{
input: []byte("خورده شده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective perfect indicative
{
input: []byte("خورده مي شده است"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect indicative
{
input: []byte("خورده شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect indicative
{
input: []byte("خورده مي شده بود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive future indicative
{
input: []byte("خورده خواهد شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present progressive indicative
{
input: []byte("دارد خورده مي شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite progressive indicative
{
input: []byte("داشت خورده مي شد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive present subjunctive
{
input: []byte("خورده شود"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive preterite subjunctive
{
input: []byte("خورده شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective preterite subjunctive
{
input: []byte("خورده مي شده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive pluperfect subjunctive
{
input: []byte("خورده شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// passive imperfective pluperfect subjunctive
{
input: []byte("خورده مي شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
// active present subjunctive
{
input: []byte("بخورد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("بخورد"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}
func TestPersianAnalyzerOthers(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// nouns
{
input: []byte("برگ ها"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("برگ"),
},
},
},
{
input: []byte("برگ‌ها"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("برگ"),
},
},
},
// non persian
{
input: []byte("English test."),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("english"),
},
&analysis.Token{
Term: []byte("test"),
},
},
},
// others
{
input: []byte("خورده مي شده بوده باشد"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("خورده"),
},
},
},
{
input: []byte("برگ‌ها"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("برگ"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View file

@ -0,0 +1,80 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fa
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const NormalizeName = "normalize_fa"
const (
Yeh = '\u064A'
FarsiYeh = '\u06CC'
YehBarree = '\u06D2'
Keheh = '\u06A9'
Kaf = '\u0643'
HamzaAbove = '\u0654'
HehYeh = '\u06C0'
HehGoal = '\u06C1'
Heh = '\u0647'
)
type PersianNormalizeFilter struct {
}
func NewPersianNormalizeFilter() *PersianNormalizeFilter {
return &PersianNormalizeFilter{}
}
func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := normalize(token.Term)
token.Term = term
}
return input
}
func normalize(input []byte) []byte {
runes := bytes.Runes(input)
for i := 0; i < len(runes); i++ {
switch runes[i] {
case FarsiYeh, YehBarree:
runes[i] = Yeh
case Keheh:
runes[i] = Kaf
case HehYeh, HehGoal:
runes[i] = Heh
case HamzaAbove: // delete the combining hamza; necessary for HEH + HAMZA
runes = analysis.DeleteRune(runes, i)
i--
}
}
return analysis.BuildTermFromRunes(runes)
}
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPersianNormalizeFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
if err != nil {
panic(err)
}
}
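A minimal in-package sketch (not part of the upstream diff) of the HamzaAbove branch: unlike the substitution cases, the combining hamza is deleted outright, shortening the term.
// hypothetical in-package usage sketch
f := NewPersianNormalizeFilter()
out := f.Filter(analysis.TokenStream{
	&analysis.Token{Term: []byte("كتابهٔ")}, // HEH + HAMZA ABOVE
})
// out[0].Term is now "كتابه"; the HehHamzaAbove test below covers this case.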

View file

@ -0,0 +1,130 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fa
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestPersianNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// FarsiYeh
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("های"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هاي"),
},
},
},
// YehBarree
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("هاے"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("هاي"),
},
},
},
// Keheh
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("کشاندن"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كشاندن"),
},
},
},
// HehYeh
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابۀ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابه"),
},
},
},
// HehHamzaAbove
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابهٔ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("كتابه"),
},
},
},
// HehGoal
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("زادہ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("زاده"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
persianNormalizeFilter := NewPersianNormalizeFilter()
for _, test := range tests {
actual := persianNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fa
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}
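A minimal in-package sketch (not part of the upstream diff) of the registered filter in action; "من" appears in the Persian stop list later in this diff, so the token stream comes back empty.
// hypothetical in-package usage sketch
cache := registry.NewCache()
stopFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
	panic(err)
}
out := stopFilter.Filter(analysis.TokenStream{
	&analysis.Token{Term: []byte("من")},
})
// len(out) == 0: the stop word was removed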

View file

@ -0,0 +1,340 @@
package fa
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_fa"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Note: by default this file is used after normalization, so when adding entries
# to this file, use the arabic 'ي' instead of 'ی'
انان
نداشته
سراسر
خياه
ايشان
وي
تاكنون
بيشتري
دوم
پس
ناشي
وگو
يا
داشتند
سپس
هنگام
هرگز
پنج
نشان
امسال
ديگر
گروهي
شدند
چطور
ده
و
دو
نخستين
ولي
چرا
چه
وسط
ه
كدام
قابل
يك
رفت
هفت
همچنين
در
هزار
بله
بلي
شايد
اما
شناسي
گرفته
دهد
داشته
دانست
داشتن
خواهيم
ميليارد
وقتيكه
امد
خواهد
جز
اورده
شده
بلكه
خدمات
شدن
برخي
نبود
بسياري
جلوگيري
حق
كردند
نوعي
بعري
نكرده
نظير
نبايد
بوده
بودن
داد
اورد
هست
جايي
شود
دنبال
داده
بايد
سابق
هيچ
همان
انجا
كمتر
كجاست
گردد
كسي
تر
مردم
تان
دادن
بودند
سري
جدا
ندارند
مگر
يكديگر
دارد
دهند
بنابراين
هنگامي
سمت
جا
انچه
خود
دادند
زياد
دارند
اثر
بدون
بهترين
بيشتر
البته
به
براساس
بيرون
كرد
بعضي
گرفت
توي
اي
ميليون
او
جريان
تول
بر
مانند
برابر
باشيم
مدتي
گويند
اكنون
تا
تنها
جديد
چند
بي
نشده
كردن
كردم
گويد
كرده
كنيم
نمي
نزد
روي
قصد
فقط
بالاي
ديگران
اين
ديروز
توسط
سوم
ايم
دانند
سوي
استفاده
شما
كنار
داريم
ساخته
طور
امده
رفته
نخست
بيست
نزديك
طي
كنيد
از
انها
تمامي
داشت
يكي
طريق
اش
چيست
روب
نمايد
گفت
چندين
چيزي
تواند
ام
ايا
با
ان
ايد
ترين
اينكه
ديگري
راه
هايي
بروز
همچنان
پاعين
كس
حدود
مختلف
مقابل
چيز
گيرد
ندارد
ضد
همچون
سازي
شان
مورد
باره
مرسي
خويش
برخوردار
چون
خارج
شش
هنوز
تحت
ضمن
هستيم
گفته
فكر
بسيار
پيش
براي
روزهاي
انكه
نخواهد
بالا
كل
وقتي
كي
چنين
كه
گيري
نيست
است
كجا
كند
نيز
يابد
بندي
حتي
توانند
عقب
خواست
كنند
بين
تمام
همه
ما
باشند
مثل
شد
اري
باشد
اره
طبق
بعد
اگر
صورت
غير
جاي
بيش
ريزي
اند
زيرا
چگونه
بار
لطفا
مي
درباره
من
ديده
همين
گذاري
برداري
علت
گذاشته
هم
فوق
نه
ها
شوند
اباد
همواره
هر
اول
خواهند
چهار
نام
امروز
مان
هاي
قبل
كنم
سعي
تازه
را
هستند
زير
جلوي
عنوان
بود
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(PersianStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,60 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "fi"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopFiFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFiFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopFiFilter,
stemmerFiFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,70 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestFinnishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("edeltäjiinsä"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("edeltäj"),
},
},
},
{
input: []byte("edeltäjistään"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("edeltäj"),
},
},
},
// stop word
{
input: []byte("olla"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View file

@ -0,0 +1,52 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/finnish"
)
const SnowballStemmerName = "stemmer_fi_snowball"
type FinnishStemmerFilter struct {
}
func NewFinnishStemmerFilter() *FinnishStemmerFilter {
return &FinnishStemmerFilter{}
}
func (s *FinnishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
finnish.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func FinnishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFinnishStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, FinnishStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
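A minimal sketch (not part of the upstream diff) of the snowball environment on its own, without the token-filter plumbing; the expected result matches the Finnish analyzer test earlier in this diff.
// hypothetical usage sketch
env := snowballstem.NewEnv("edeltäjiinsä")
finnish.Stem(env)
// env.Current() == "edeltäj"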

View file

@ -0,0 +1,36 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,124 @@
package fi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_fi"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| forms of BE
olla
olen
olet
on
olemme
olette
ovat
ole | negative form
oli
olisi
olisit
olisin
olisimme
olisitte
olisivat
olit
olin
olimme
olitte
olivat
ollut
olleet
en | negation
et
ei
emme
ette
eivät
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
mitkä | (pl)
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
| conjunctions
että | that
ja | and
jos | if
koska | because
kuin | than
mutta | but
niin | so
sekä | and
sillä | for
tai | or
vaan | but
vai | or
vaikka | although
| prepositions
kanssa | with
mukaan | according to
noin | about
poikki | across
yli | over, across
| other
kun | when
niin | so
nyt | now
itse | self
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FinnishStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,65 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "fr"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopFrFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFrFilter, err := cache.TokenFilterNamed(LightStemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
elisionFilter,
stopFrFilter,
stemmerFrFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}
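One ordering detail worth a sketch (not part of the upstream diff): the elision filter runs after lowercasing, so an uppercase article is still stripped before stop-word removal and stemming.
// hypothetical in-package usage sketch
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
	panic(err)
}
tokens := analyzer.Analyze([]byte("L'avion"))
// tokens[0].Term == "avion": lowercased, elided, not a stop word, and
// passed through the light stemmer unchanged (no suffix rule applies)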

View file

@ -0,0 +1,209 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestFrenchAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte(""),
output: analysis.TokenStream{},
},
{
input: []byte("chien chat cheval"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: []byte("chien CHAT CHEVAL"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: []byte(" chien ,? + = - CHAT /: > CHEVAL"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: []byte("chien++"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
},
},
{
input: []byte("mot \"entreguillemet\""),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("mot"),
},
&analysis.Token{
Term: []byte("entreguilemet"),
},
},
},
{
input: []byte("Jean-François"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("jean"),
},
&analysis.Token{
Term: []byte("francoi"),
},
},
},
// stop words
{
input: []byte("le la chien les aux chat du des à cheval"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
// nouns and adjectives
{
input: []byte("lances chismes habitable chiste éléments captifs"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("lanc"),
},
&analysis.Token{
Term: []byte("chism"),
},
&analysis.Token{
Term: []byte("habitabl"),
},
&analysis.Token{
Term: []byte("chist"),
},
&analysis.Token{
Term: []byte("element"),
},
&analysis.Token{
Term: []byte("captif"),
},
},
},
// verbs
{
input: []byte("finissions souffrirent rugissante"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("finision"),
},
&analysis.Token{
Term: []byte("soufrirent"),
},
&analysis.Token{
Term: []byte("rugisant"),
},
},
},
{
input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("c3po"),
},
&analysis.Token{
Term: []byte("aujourd'hui"),
},
&analysis.Token{
Term: []byte("oeuf"),
},
&analysis.Token{
Term: []byte("ïaöuaä"),
},
&analysis.Token{
Term: []byte("anticonstitutionel"),
},
&analysis.Token{
Term: []byte("java"),
},
},
},
{
input: []byte("propriétaire"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("proprietair"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View file

@ -0,0 +1,40 @@
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const ArticlesName = "articles_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var FrenchArticles = []byte(`
l
m
t
qu
n
s
j
d
c
jusqu
quoiqu
lorsqu
puisqu
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchArticles)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/elision"
"github.com/blevesearch/bleve/v2/registry"
)
const ElisionName = "elision_fr"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision.NewElisionFilter(articlesTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestFrenchElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'avion"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("avion"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,309 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"bytes"
"unicode"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const LightStemmerName = "stemmer_fr_light"
type FrenchLightStemmerFilter struct {
}
func NewFrenchLightStemmerFilter() *FrenchLightStemmerFilter {
return &FrenchLightStemmerFilter{}
}
func (s *FrenchLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runes := bytes.Runes(token.Term)
runes = stem(runes)
token.Term = analysis.BuildTermFromRunes(runes)
}
return input
}
func stem(input []rune) []rune {
inputLen := len(input)
if inputLen > 5 && input[inputLen-1] == 'x' {
if input[inputLen-3] == 'a' && input[inputLen-2] == 'u' && input[inputLen-4] != 'e' {
input[inputLen-2] = 'l'
}
input = input[0 : inputLen-1]
inputLen = len(input)
}
if inputLen > 3 && input[inputLen-1] == 'x' {
input = input[0 : inputLen-1]
inputLen = len(input)
}
if inputLen > 3 && input[inputLen-1] == 's' {
input = input[0 : inputLen-1]
inputLen = len(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "issement") {
input = input[0 : inputLen-6]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "issant") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 6 && analysis.RunesEndsWith(input, "ement") {
input = input[0 : inputLen-4]
inputLen = len(input)
if inputLen > 3 && analysis.RunesEndsWith(input, "ive") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'f'
}
return norm(input)
}
if inputLen > 11 && analysis.RunesEndsWith(input, "ficatrice") {
input = input[0 : inputLen-5]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 10 && analysis.RunesEndsWith(input, "ficateur") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "catrice") {
input = input[0 : inputLen-3]
inputLen = len(input)
input[inputLen-4] = 'q'
input[inputLen-3] = 'u'
input[inputLen-2] = 'e'
// s[len-1] = 'r' <-- unnecessary, already 'r'.
return norm(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "cateur") {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-4] = 'q'
input[inputLen-3] = 'u'
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "atrice") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 7 && analysis.RunesEndsWith(input, "ateur") {
input = input[0 : inputLen-3]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 6 && analysis.RunesEndsWith(input, "trice") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-3] = 'e'
input[inputLen-2] = 'u'
input[inputLen-1] = 'r'
}
if inputLen > 5 && analysis.RunesEndsWith(input, "ième") {
return norm(input[0 : inputLen-4])
}
if inputLen > 7 && analysis.RunesEndsWith(input, "teuse") {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 6 && analysis.RunesEndsWith(input, "teur") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 5 && analysis.RunesEndsWith(input, "euse") {
return norm(input[0 : inputLen-2])
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ère") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-2] = 'e'
return norm(input)
}
if inputLen > 7 && analysis.RunesEndsWith(input, "ive") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'f'
return norm(input)
}
if inputLen > 4 &&
(analysis.RunesEndsWith(input, "folle") ||
analysis.RunesEndsWith(input, "molle")) {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-1] = 'u'
return norm(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "nnelle") {
return norm(input[0 : inputLen-5])
}
if inputLen > 9 && analysis.RunesEndsWith(input, "nnel") {
return norm(input[0 : inputLen-3])
}
if inputLen > 4 && analysis.RunesEndsWith(input, "ète") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-2] = 'e'
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ique") {
input = input[0 : inputLen-4]
inputLen = len(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "esse") {
return norm(input[0 : inputLen-3])
}
if inputLen > 7 && analysis.RunesEndsWith(input, "inage") {
return norm(input[0 : inputLen-3])
}
if inputLen > 9 && analysis.RunesEndsWith(input, "isation") {
input = input[0 : inputLen-7]
inputLen = len(input)
if inputLen > 5 && analysis.RunesEndsWith(input, "ual") {
input[inputLen-2] = 'e'
}
return norm(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "isateur") {
return norm(input[0 : inputLen-7])
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ation") {
return norm(input[0 : inputLen-5])
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ition") {
return norm(input[0 : inputLen-5])
}
return norm(input)
}
func norm(input []rune) []rune {
if len(input) > 4 {
for i := 0; i < len(input); i++ {
switch input[i] {
case 'à', 'á', 'â':
input[i] = 'a'
case 'ô':
input[i] = 'o'
case 'è', 'é', 'ê':
input[i] = 'e'
case 'ù', 'û':
input[i] = 'u'
case 'î':
input[i] = 'i'
case 'ç':
input[i] = 'c'
}
ch := input[0]
for i := 1; i < len(input); i++ {
if input[i] == ch && unicode.IsLetter(ch) {
input = analysis.DeleteRune(input, i)
i -= 1
} else {
ch = input[i]
}
}
}
}
if len(input) > 4 && analysis.RunesEndsWith(input, "ie") {
input = input[0 : len(input)-2]
}
if len(input) > 4 {
if input[len(input)-1] == 'r' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'e' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'e' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == input[len(input)-2] && unicode.IsLetter(input[len(input)-1]) {
input = input[0 : len(input)-1]
}
}
return input
}
func FrenchLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchLightStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(LightStemmerName, FrenchLightStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
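A minimal in-package sketch (not part of the upstream diff), tracing one path through stem and norm: the plural 's' is stripped first, then norm folds the accented vowels, matching the French analyzer test earlier in this diff.
// hypothetical in-package usage sketch
f := NewFrenchLightStemmerFilter()
out := f.Filter(analysis.TokenStream{
	&analysis.Token{Term: []byte("éléments")},
})
// out[0].Term == "element"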

File diff suppressed because it is too large

View file

@ -0,0 +1,82 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const MinimalStemmerName = "stemmer_fr_min"
type FrenchMinimalStemmerFilter struct {
}
func NewFrenchMinimalStemmerFilter() *FrenchMinimalStemmerFilter {
return &FrenchMinimalStemmerFilter{}
}
func (s *FrenchMinimalStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runes := bytes.Runes(token.Term)
runes = minstem(runes)
token.Term = analysis.BuildTermFromRunes(runes)
}
return input
}
func minstem(input []rune) []rune {
if len(input) < 6 {
return input
}
if input[len(input)-1] == 'x' {
if input[len(input)-3] == 'a' && input[len(input)-2] == 'u' {
input[len(input)-2] = 'l'
}
return input[0 : len(input)-1]
}
if input[len(input)-1] == 's' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'r' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'e' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'é' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == input[len(input)-2] {
input = input[0 : len(input)-1]
}
return input
}
func FrenchMinimalStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchMinimalStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(MinimalStemmerName, FrenchMinimalStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,139 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestFrenchMinimalStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chevaux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("hiboux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("hibou"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chantés"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chanter"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chante"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("baronnes"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("barons"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(MinimalStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,52 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/french"
)
const SnowballStemmerName = "stemmer_fr_snowball"
type FrenchStemmerFilter struct {
}
func NewFrenchStemmerFilter() *FrenchStemmerFilter {
return &FrenchStemmerFilter{}
}
func (s *FrenchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
french.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func FrenchStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, FrenchStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,79 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSnowballFrenchStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("antagoniste"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("antagon"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("barbouillait"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("barbouill"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("calculateur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("calcul"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,213 @@
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var FrenchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A French stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
au | a + le
aux | a + les
avec | with
ce | this
ces | these
dans | in
de | of
des | de + les
du | de + le
elle | she
en | 'of them' etc
et | and
eux | them
il | he
je | I
la | the
le | the
leur | their
lui | him
ma | my (fem)
mais | but
me | me
même | same; as in moi-même (myself) etc
mes | me (pl)
moi | me
mon | my (masc)
ne | not
nos | our (pl)
notre | our
nous | we
on | one
ou | where
par | by
pas | not
pour | for
qu | que before vowel
que | that
qui | who
sa | his, her (fem)
se | oneself
ses | his (pl)
son | his, her (masc)
sur | on
ta | thy (fem)
te | thee
tes | thy (pl)
toi | thee
ton | thy (masc)
tu | thou
un | a
une | a
vos | your (pl)
votre | your
vous | you
| single letter forms
c | c'
d | d'
j | j'
l | l'
à | to, at
m | m'
n | n'
s | s'
t | t'
y | there
| forms of être (not including the infinitive):
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
| forms of avoir (not including the infinitive):
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
| Later additions (from Jean-Christophe Deschamps)
ceci | this
cela | that
celà | that
cet | this
cette | this
ici | here
ils | they
les | the (pl)
leurs | their (pl)
quel | which
quels | which
quelle | which
quelles | which
sans | without
soi | oneself
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,30 @@
package ga
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const ArticlesName = "articles_ga"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var IrishArticles = []byte(`
d
m
b
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IrishArticles)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ga
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/elision"
"github.com/blevesearch/bleve/v2/registry"
)
const ElisionName = "elision_ga"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision.NewElisionFilter(articlesTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ga
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestIrishElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("b'fhearr"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("fhearr"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ga
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,137 @@
package ga
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_ga"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' so the list can be embedded in a Go raw string literal
var IrishStopWords = []byte(`
a
ach
ag
agus
an
aon
ar
arna
as
b'
ba
beirt
bhúr
caoga
ceathair
ceathrar
chomh
chtó
chuig
chun
cois
céad
cúig
cúigear
d'
daichead
dar
de
deich
deichniúr
den
dhá
do
don
dtí
dár
faoi
faoin
faoina
faoinár
fara
fiche
gach
gan
go
gur
haon
hocht
i
iad
idir
in
ina
ins
inár
is
le
leis
lena
lenár
m'
mar
mo
na
nach
naoi
naonúr
níor
nócha
ocht
ochtar
os
roimh
sa
seacht
seachtar
seachtó
seasca
seisear
siad
sibh
sinn
sna
tar
thar
thú
triúr
trí
trína
trínár
tríocha
um
ár
é
éis
í
ó
ón
óna
ónár
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(IrishStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gl
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}
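
Not part of this diff: a sketch of why these filters register under stable names. Once "stop_gl" is registered, an index mapping can reference it by name when assembling a custom analyzer. The component names "custom", "unicode", and "to_lower" are assumed to be bleve's stock registrations; the analyzer name "gl_web" and the index path are illustrative.
package main

import (
	"github.com/blevesearch/bleve/v2"
	// blank import ensures this package's init() registrations run
	_ "github.com/blevesearch/bleve/v2/analysis/lang/gl"
)

func main() {
	indexMapping := bleve.NewIndexMapping()
	// assemble a custom analyzer whose chain ends with the Galician stop filter
	err := indexMapping.AddCustomAnalyzer("gl_web", map[string]interface{}{
		"type":          "custom",
		"tokenizer":     "unicode",
		"token_filters": []string{"to_lower", "stop_gl"},
	})
	if err != nil {
		panic(err)
	}
	index, err := bleve.New("example.bleve", indexMapping)
	if err != nil {
		panic(err)
	}
	defer index.Close()
}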

View file

@ -0,0 +1,188 @@
package gl
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_gl"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' so the list can be embedded in a Go raw string literal
var GalicianStopWords = []byte(`# galician stopwords
a
aínda
alí
aquel
aquela
aquelas
aqueles
aquilo
aquí
ao
aos
as
así
á
ben
cando
che
co
coa
comigo
con
connosco
contigo
convosco
coas
cos
cun
cuns
cunha
cunhas
da
dalgunha
dalgunhas
dalgún
dalgúns
das
de
del
dela
delas
deles
desde
deste
do
dos
dun
duns
dunha
dunhas
e
el
ela
elas
eles
en
era
eran
esa
esas
ese
eses
esta
estar
estaba
está
están
este
estes
estiven
estou
eu
é
facer
foi
foron
fun
había
hai
iso
isto
la
las
lle
lles
lo
los
mais
me
meu
meus
min
miña
miñas
moi
na
nas
neste
nin
no
non
nos
nosa
nosas
noso
nosos
nós
nun
nunha
nuns
nunhas
o
os
ou
ó
ós
para
pero
pode
pois
pola
polas
polo
polos
por
que
se
senón
ser
seu
seus
sexa
sido
sobre
súa
súas
tamén
tan
te
ten
teñen
teño
ter
teu
teus
ti
tido
tiña
tiven
túa
túas
un
unha
unhas
uns
vos
vosa
vosas
voso
vosos
vós
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(GalicianStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}
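
Not part of this diff: a sketch of the file format these lists share, assuming bleve's TokenMap treats '#' and '|' as comment markers, which would explain why the header line and the inline glosses above never become tokens.
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
)

func main() {
	tm := analysis.NewTokenMap()
	// anything after '#' or '|' is dropped, blank lines are skipped,
	// and whatever remains on each line is added to the map
	err := tm.LoadBytes([]byte(`# header comment
a
ao | a + o

`))
	if err != nil {
		panic(err)
	}
	for token := range tm {
		fmt.Println(token) // prints "a" and "ao", in map order
	}
}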

View file

@ -0,0 +1,71 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/lang/in"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "hi"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
indicNormalizeFilter, err := cache.TokenFilterNamed(in.NormalizeName)
if err != nil {
return nil, err
}
hindiNormalizeFilter, err := cache.TokenFilterNamed(NormalizeName)
if err != nil {
return nil, err
}
stopHiFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerHiFilter, err := cache.TokenFilterNamed(StemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
indicNormalizeFilter,
hindiNormalizeFilter,
stopHiFilter,
stemmerHiFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}
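
Not part of this diff: a sketch of running the registered analyzer end to end via the registry. It assumes the Analyze method provided by analysis.DefaultAnalyzer; the sample Hindi words are illustrative only.
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis/lang/hi"
	"github.com/blevesearch/bleve/v2/registry"
)

func main() {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(hi.AnalyzerName)
	if err != nil {
		panic(err)
	}
	// runs the full chain: tokenize, lowercase, indic and hindi
	// normalization, stop word removal, stemming
	for _, token := range analyzer.Analyze([]byte("बड़े लड़के")) {
		fmt.Printf("%s [%d-%d]\n", token.Term, token.Start, token.End)
	}
}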

Some files were not shown because too many files have changed in this diff