
Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions

@@ -0,0 +1,65 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "fr"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopFrFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerFrFilter, err := cache.TokenFilterNamed(LightStemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
elisionFilter,
stopFrFilter,
stemmerFrFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}
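
For orientation, here is a minimal sketch (not part of the diff) of how an application might select the "fr" analyzer registered above, assuming the package lives at its usual import path under analysis/lang/fr and using the standard bleve v2 index-mapping API; "example.bleve" and the sample document are placeholders.

package main

import (
    "log"

    "github.com/blevesearch/bleve/v2"
    // Blank-import the French language package so its init functions register
    // the "fr" analyzer, token maps and token filters with bleve's registry.
    _ "github.com/blevesearch/bleve/v2/analysis/lang/fr"
)

func main() {
    // Use the French analyzer for every text field by default.
    indexMapping := bleve.NewIndexMapping()
    indexMapping.DefaultAnalyzer = "fr"

    // "example.bleve" is just a placeholder path for the on-disk index.
    index, err := bleve.New("example.bleve", indexMapping)
    if err != nil {
        log.Fatal(err)
    }
    defer index.Close()

    // Terms are tokenized, lowercased, elided, stop-filtered and stemmed,
    // so "chevaux" and "cheval" end up indexed under the same term.
    if err := index.Index("doc1", map[string]interface{}{"texte": "Les chevaux galopent"}); err != nil {
        log.Fatal(err)
    }
}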

@@ -0,0 +1,209 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestFrenchAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
input: []byte(""),
output: analysis.TokenStream{},
},
{
input: []byte("chien chat cheval"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: []byte("chien CHAT CHEVAL"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: []byte(" chien ,? + = - CHAT /: > CHEVAL"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: []byte("chien++"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
},
},
{
input: []byte("mot \"entreguillemet\""),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("mot"),
},
&analysis.Token{
Term: []byte("entreguilemet"),
},
},
},
{
input: []byte("Jean-François"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("jean"),
},
&analysis.Token{
Term: []byte("francoi"),
},
},
},
// stop words
{
input: []byte("le la chien les aux chat du des à cheval"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chien"),
},
&analysis.Token{
Term: []byte("chat"),
},
&analysis.Token{
Term: []byte("cheval"),
},
},
},
// nouns and adjectives
{
input: []byte("lances chismes habitable chiste éléments captifs"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("lanc"),
},
&analysis.Token{
Term: []byte("chism"),
},
&analysis.Token{
Term: []byte("habitabl"),
},
&analysis.Token{
Term: []byte("chist"),
},
&analysis.Token{
Term: []byte("element"),
},
&analysis.Token{
Term: []byte("captif"),
},
},
},
// verbs
{
input: []byte("finissions souffrirent rugissante"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("finision"),
},
&analysis.Token{
Term: []byte("soufrirent"),
},
&analysis.Token{
Term: []byte("rugisant"),
},
},
},
{
input: []byte("C3PO aujourd'hui oeuf ïâöûàä anticonstitutionnellement Java++ "),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("c3po"),
},
&analysis.Token{
Term: []byte("aujourd'hui"),
},
&analysis.Token{
Term: []byte("oeuf"),
},
&analysis.Token{
Term: []byte("ïaöuaä"),
},
&analysis.Token{
Term: []byte("anticonstitutionel"),
},
&analysis.Token{
Term: []byte("java"),
},
},
},
{
input: []byte("propriétaire"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("proprietair"),
},
},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

@@ -0,0 +1,40 @@
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const ArticlesName = "articles_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
var FrenchArticles = []byte(`
l
m
t
qu
n
s
j
d
c
jusqu
quoiqu
lorsqu
puisqu
`)
func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchArticles)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
if err != nil {
panic(err)
}
}
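
As a rough illustration (not part of the diff) of what the constructor above produces: the loaded token map behaves like a set keyed by the elided forms listed in FrenchArticles, assuming bleve v2's analysis.TokenMap is the usual map[string]bool type.

package fr

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis"
)

// ExampleArticlesTokenMap is a hypothetical sketch: each non-empty line of
// FrenchArticles becomes one key in the map, so the elision filter can test
// membership of the prefix it finds before an apostrophe.
func ExampleArticlesTokenMap() {
    articles := analysis.NewTokenMap()
    if err := articles.LoadBytes(FrenchArticles); err != nil {
        panic(err)
    }
    fmt.Println(articles["qu"], articles["jusqu"], articles["le"])
    // Output: true true false
}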

@@ -0,0 +1,40 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/elision"
"github.com/blevesearch/bleve/v2/registry"
)
const ElisionName = "elision_fr"
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMap, err := cache.TokenMapNamed(ArticlesName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return elision.NewElisionFilter(articlesTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
if err != nil {
panic(err)
}
}
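
A short usage sketch (not part of the diff): pulled from a registry cache exactly as the test below does, the filter strips a leading article that is joined to the word by an apostrophe. "d'avion" is an illustrative input; the expected behaviour is inferred from the articles list above and the test that follows.

package fr

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

// ExampleElisionFilter sketches the elision_fr filter in isolation.
func ExampleElisionFilter() {
    cache := registry.NewCache()
    elisionFilter, err := cache.TokenFilterNamed(ElisionName)
    if err != nil {
        panic(err)
    }
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("d'avion")},
    }
    for _, tok := range elisionFilter.Filter(stream) {
        fmt.Println(string(tok.Term))
    }
    // Output: avion
}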

@@ -0,0 +1,55 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestFrenchElision(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("l'avion"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("avion"),
},
},
},
}
cache := registry.NewCache()
elisionFilter, err := cache.TokenFilterNamed(ElisionName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,309 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"bytes"
"unicode"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const LightStemmerName = "stemmer_fr_light"
type FrenchLightStemmerFilter struct {
}
func NewFrenchLightStemmerFilter() *FrenchLightStemmerFilter {
return &FrenchLightStemmerFilter{}
}
func (s *FrenchLightStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runes := bytes.Runes(token.Term)
runes = stem(runes)
token.Term = analysis.BuildTermFromRunes(runes)
}
return input
}
func stem(input []rune) []rune {
inputLen := len(input)
if inputLen > 5 && input[inputLen-1] == 'x' {
if input[inputLen-3] == 'a' && input[inputLen-2] == 'u' && input[inputLen-4] != 'e' {
input[inputLen-2] = 'l'
}
input = input[0 : inputLen-1]
inputLen = len(input)
}
if inputLen > 3 && input[inputLen-1] == 'x' {
input = input[0 : inputLen-1]
inputLen = len(input)
}
if inputLen > 3 && input[inputLen-1] == 's' {
input = input[0 : inputLen-1]
inputLen = len(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "issement") {
input = input[0 : inputLen-6]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "issant") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 6 && analysis.RunesEndsWith(input, "ement") {
input = input[0 : inputLen-4]
inputLen = len(input)
if inputLen > 3 && analysis.RunesEndsWith(input, "ive") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'f'
}
return norm(input)
}
if inputLen > 11 && analysis.RunesEndsWith(input, "ficatrice") {
input = input[0 : inputLen-5]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 10 && analysis.RunesEndsWith(input, "ficateur") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "catrice") {
input = input[0 : inputLen-3]
inputLen = len(input)
input[inputLen-4] = 'q'
input[inputLen-3] = 'u'
input[inputLen-2] = 'e'
//s[len-1] = 'r' <-- unnecessary, already 'r'.
return norm(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "cateur") {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-4] = 'q'
input[inputLen-3] = 'u'
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "atrice") {
input = input[0 : inputLen-4]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 7 && analysis.RunesEndsWith(input, "ateur") {
input = input[0 : inputLen-3]
inputLen = len(input)
input[inputLen-2] = 'e'
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 6 && analysis.RunesEndsWith(input, "trice") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-3] = 'e'
input[inputLen-2] = 'u'
input[inputLen-1] = 'r'
}
if inputLen > 5 && analysis.RunesEndsWith(input, "ième") {
return norm(input[0 : inputLen-4])
}
if inputLen > 7 && analysis.RunesEndsWith(input, "teuse") {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 6 && analysis.RunesEndsWith(input, "teur") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'r'
return norm(input)
}
if inputLen > 5 && analysis.RunesEndsWith(input, "euse") {
return norm(input[0 : inputLen-2])
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ère") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-2] = 'e'
return norm(input)
}
if inputLen > 7 && analysis.RunesEndsWith(input, "ive") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-1] = 'f'
return norm(input)
}
if inputLen > 4 &&
(analysis.RunesEndsWith(input, "folle") ||
analysis.RunesEndsWith(input, "molle")) {
input = input[0 : inputLen-2]
inputLen = len(input)
input[inputLen-1] = 'u'
return norm(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "nnelle") {
return norm(input[0 : inputLen-5])
}
if inputLen > 9 && analysis.RunesEndsWith(input, "nnel") {
return norm(input[0 : inputLen-3])
}
if inputLen > 4 && analysis.RunesEndsWith(input, "ète") {
input = input[0 : inputLen-1]
inputLen = len(input)
input[inputLen-2] = 'e'
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ique") {
input = input[0 : inputLen-4]
inputLen = len(input)
}
if inputLen > 8 && analysis.RunesEndsWith(input, "esse") {
return norm(input[0 : inputLen-3])
}
if inputLen > 7 && analysis.RunesEndsWith(input, "inage") {
return norm(input[0 : inputLen-3])
}
if inputLen > 9 && analysis.RunesEndsWith(input, "isation") {
input = input[0 : inputLen-7]
inputLen = len(input)
if inputLen > 5 && analysis.RunesEndsWith(input, "ual") {
input[inputLen-2] = 'e'
}
return norm(input)
}
if inputLen > 9 && analysis.RunesEndsWith(input, "isateur") {
return norm(input[0 : inputLen-7])
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ation") {
return norm(input[0 : inputLen-5])
}
if inputLen > 8 && analysis.RunesEndsWith(input, "ition") {
return norm(input[0 : inputLen-5])
}
return norm(input)
}
func norm(input []rune) []rune {
if len(input) > 4 {
for i := 0; i < len(input); i++ {
switch input[i] {
case 'à', 'á', 'â':
input[i] = 'a'
case 'ô':
input[i] = 'o'
case 'è', 'é', 'ê':
input[i] = 'e'
case 'ù', 'û':
input[i] = 'u'
case 'î':
input[i] = 'i'
case 'ç':
input[i] = 'c'
}
ch := input[0]
for i := 1; i < len(input); i++ {
if input[i] == ch && unicode.IsLetter(ch) {
input = analysis.DeleteRune(input, i)
i -= 1
} else {
ch = input[i]
}
}
}
}
if len(input) > 4 && analysis.RunesEndsWith(input, "ie") {
input = input[0 : len(input)-2]
}
if len(input) > 4 {
if input[len(input)-1] == 'r' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'e' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'e' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == input[len(input)-2] && unicode.IsLetter(input[len(input)-1]) {
input = input[0 : len(input)-1]
}
}
return input
}
func FrenchLightStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchLightStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(LightStemmerName, FrenchLightStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
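
To make the suffix rewriting above concrete, here is a small sketch (not part of the diff) that calls the package's unexported stem helper directly; the inputs are already lowercase, and the expected outputs match the analyzer test earlier in this commit.

package fr

import "fmt"

// ExampleFrenchLightStemmerFilter shows a few of the light stemmer's
// rewrites: plural stripping, accent folding, double-letter collapsing
// and trailing -e removal.
func ExampleFrenchLightStemmerFilter() {
    for _, term := range []string{"éléments", "habitable", "finissions", "propriétaire"} {
        fmt.Println(string(stem([]rune(term))))
    }
    // Output:
    // element
    // habitabl
    // finision
    // proprietair
}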

File diff suppressed because it is too large.

@@ -0,0 +1,82 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const MinimalStemmerName = "stemmer_fr_min"
type FrenchMinimalStemmerFilter struct {
}
func NewFrenchMinimalStemmerFilter() *FrenchMinimalStemmerFilter {
return &FrenchMinimalStemmerFilter{}
}
func (s *FrenchMinimalStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
runes := bytes.Runes(token.Term)
runes = minstem(runes)
token.Term = analysis.BuildTermFromRunes(runes)
}
return input
}
func minstem(input []rune) []rune {
if len(input) < 6 {
return input
}
if input[len(input)-1] == 'x' {
if input[len(input)-3] == 'a' && input[len(input)-2] == 'u' {
input[len(input)-2] = 'l'
}
return input[0 : len(input)-1]
}
if input[len(input)-1] == 's' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'r' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'e' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == 'é' {
input = input[0 : len(input)-1]
}
if input[len(input)-1] == input[len(input)-2] {
input = input[0 : len(input)-1]
}
return input
}
func FrenchMinimalStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchMinimalStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(MinimalStemmerName, FrenchMinimalStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
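
The default "fr" analyzer registered in the first file of this commit uses the light stemmer, so the minimal stemmer above is registered but not wired into that analyzer. A hypothetical sketch of a variant that swaps it in, mirroring AnalyzerConstructor, might look like this (the function name and the choice to keep the same filter order are assumptions, not part of the diff):

package fr

import (
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
    "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
    "github.com/blevesearch/bleve/v2/registry"
)

// newMinimalFrenchAnalyzer is a hypothetical variant of AnalyzerConstructor
// that uses the minimal stemmer instead of the light stemmer.
func newMinimalFrenchAnalyzer(cache *registry.Cache) (analysis.Analyzer, error) {
    tokenizer, err := cache.TokenizerNamed(unicode.Name)
    if err != nil {
        return nil, err
    }
    toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
    if err != nil {
        return nil, err
    }
    elisionFilter, err := cache.TokenFilterNamed(ElisionName)
    if err != nil {
        return nil, err
    }
    stopFrFilter, err := cache.TokenFilterNamed(StopName)
    if err != nil {
        return nil, err
    }
    minimalStemmer, err := cache.TokenFilterNamed(MinimalStemmerName)
    if err != nil {
        return nil, err
    }
    rv := analysis.DefaultAnalyzer{
        Tokenizer: tokenizer,
        TokenFilters: []analysis.TokenFilter{
            toLowerFilter,
            elisionFilter,
            stopFrFilter,
            minimalStemmer,
        },
    }
    return &rv, nil
}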

@@ -0,0 +1,139 @@
// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestFrenchMinimalStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chevaux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("cheval"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("hiboux"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("hibou"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chantés"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chanter"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("chante"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("chant"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("baronnes"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("barons"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("baron"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(MinimalStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,52 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/french"
)
const SnowballStemmerName = "stemmer_fr_snowball"
type FrenchStemmerFilter struct {
}
func NewFrenchStemmerFilter() *FrenchStemmerFilter {
return &FrenchStemmerFilter{}
}
func (s *FrenchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
french.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func FrenchStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewFrenchStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, FrenchStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
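
For comparison with the light and minimal stemmers, here is a sketch (not part of the diff) driving the snowballstem package directly, exactly as the Filter method above does; the expected outputs are the ones exercised by the test that follows.

package fr

import (
    "fmt"

    "github.com/blevesearch/snowballstem"
    "github.com/blevesearch/snowballstem/french"
)

// ExampleFrenchStemmerFilter shows the underlying snowball calls used by the
// stemmer_fr_snowball filter.
func ExampleFrenchStemmerFilter() {
    for _, term := range []string{"antagoniste", "barbouillait", "calculateur"} {
        env := snowballstem.NewEnv(term)
        french.Stem(env)
        fmt.Println(env.Current())
    }
    // Output:
    // antagon
    // barbouill
    // calcul
}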

@@ -0,0 +1,79 @@
// Copyright (c) 2020 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSnowballFrenchStemmer(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("antagoniste"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("antagon"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("barbouillait"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("barbouill"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("calculateur"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("calcul"),
},
},
},
}
cache := registry.NewCache()
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}
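
A brief sketch (not part of the diff) of the stop filter in isolation: tokens whose terms appear in the stop_fr token map (the word list in the next file) are dropped from the stream. The inputs below are illustrative; the behaviour matches the stop-word case in the analyzer test.

package fr

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

// ExampleStopTokenFilter drops French stop words such as "le" and "aux".
func ExampleStopTokenFilter() {
    cache := registry.NewCache()
    stopFilter, err := cache.TokenFilterNamed(StopName)
    if err != nil {
        panic(err)
    }
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("le")},
        &analysis.Token{Term: []byte("chien")},
        &analysis.Token{Term: []byte("aux")},
    }
    for _, tok := range stopFilter.Filter(stream) {
        fmt.Println(string(tok.Term))
    }
    // Output: chien
}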

@@ -0,0 +1,213 @@
package fr
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_fr"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var FrenchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A French stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
au | a + le
aux | a + les
avec | with
ce | this
ces | these
dans | with
de | of
des | de + les
du | de + le
elle | she
en | 'of them' etc
et | and
eux | them
il | he
je | I
la | the
le | the
leur | their
lui | him
ma | my (fem)
mais | but
me | me
même | same; as in moi-même (myself) etc
mes | me (pl)
moi | me
mon | my (masc)
ne | not
nos | our (pl)
notre | our
nous | we
on | one
ou | where
par | by
pas | not
pour | for
qu | que before vowel
que | that
qui | who
sa | his, her (fem)
se | oneself
ses | his (pl)
son | his, her (masc)
sur | on
ta | thy (fem)
te | thee
tes | thy (pl)
toi | thee
ton | thy (masc)
tu | thou
un | a
une | a
vos | your (pl)
votre | your
vous | you
| single letter forms
c | c'
d | d'
j | j'
l | l'
à | to, at
m | m'
n | n'
s | s'
t | t'
y | there
| forms of être (not including the infinitive):
été
étée
étées
étés
étant
suis
es
est
sommes
êtes
sont
serai
seras
sera
serons
serez
seront
serais
serait
serions
seriez
seraient
étais
était
étions
étiez
étaient
fus
fut
fûmes
fûtes
furent
sois
soit
soyons
soyez
soient
fusse
fusses
fût
fussions
fussiez
fussent
| forms of avoir (not including the infinitive):
ayant
eu
eue
eues
eus
ai
as
avons
avez
ont
aurai
auras
aura
aurons
aurez
auront
aurais
aurait
aurions
auriez
auraient
avais
avait
avions
aviez
avaient
eut
eûmes
eûtes
eurent
aie
aies
ait
ayons
ayez
aient
eusse
eusses
eût
eussions
eussiez
eussent
| Later additions (from Jean-Christophe Deschamps)
ceci | this
cela | that
celà | that
cet | this
cette | this
ici | here
ils | they
les | the (pl)
leurs | their (pl)
quel | which
quels | which
quelle | which
quelles | which
sans | without
soi | oneself
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(FrenchStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}