1
0
Fork 0

Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions

View file

@ -0,0 +1,60 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package sv
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "sv"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
if err != nil {
return nil, err
}
stopSvFilter, err := cache.TokenFilterNamed(StopName)
if err != nil {
return nil, err
}
stemmerSvFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
if err != nil {
return nil, err
}
rv := analysis.DefaultAnalyzer{
Tokenizer: unicodeTokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopSvFilter,
stemmerSvFilter,
},
}
return &rv, nil
}
func init() {
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,70 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package sv
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
func TestSwedishAnalyzer(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
// stemming
{
input: []byte("jaktkarlarne"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("jaktkarl"),
},
},
},
{
input: []byte("jaktkarlens"),
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("jaktkarl"),
},
},
},
// stop word
{
input: []byte("och"),
output: analysis.TokenStream{},
},
}
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := analyzer.Analyze(test.input)
if len(actual) != len(test.output) {
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
}
for i, tok := range actual {
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
}
}
}
}

View file

@ -0,0 +1,52 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package sv
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowballstem"
"github.com/blevesearch/snowballstem/swedish"
)
const SnowballStemmerName = "stemmer_sv_snowball"
type SwedishStemmerFilter struct {
}
func NewSwedishStemmerFilter() *SwedishStemmerFilter {
return &SwedishStemmerFilter{}
}
func (s *SwedishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
env := snowballstem.NewEnv(string(token.Term))
swedish.Stem(env)
token.Term = []byte(env.Current())
}
return input
}
func SwedishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewSwedishStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(SnowballStemmerName, SwedishStemmerFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,36 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package sv
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
tokenMap, err := cache.TokenMapNamed(StopName)
if err != nil {
return nil, err
}
return stop.NewStopTokensFilter(tokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
if err != nil {
panic(err)
}
}

View file

@ -0,0 +1,160 @@
package sv
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const StopName = "stop_sv"
// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// ` was changed to ' to allow for literal string
var SwedishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
| This file is distributed under the BSD License.
| See http://snowball.tartarus.org/license.php
| Also see http://www.opensource.org/licenses/bsd-license.html
| - Encoding was converted to UTF-8.
| - This notice was added.
|
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
| A Swedish stop word list. Comments begin with vertical bar. Each stop
| word is at the start of a line.
| This is a ranked list (commonest to rarest) of stopwords derived from
| a large text sample.
| Swedish stop words occasionally exhibit homonym clashes. For example
| = so, but also seed. These are indicated clearly below.
och | and
det | it, this/that
att | to (with infinitive)
i | in, at
en | a
jag | I
hon | she
som | who, that
han | he
| on
den | it, this/that
med | with
var | where, each
sig | him(self) etc
för | for
| so (also: seed)
till | to
är | is
men | but
ett | a
om | if; around, about
hade | had
de | they, these/those
av | of
icke | not, no
mig | me
du | you
henne | her
| then, when
sin | his
nu | now
har | have
inte | inte någon = no one
hans | his
honom | him
skulle | 'sake'
hennes | her
där | there
min | my
man | one (pronoun)
ej | nor
vid | at, by, on (also: vast)
kunde | could
något | some etc
från | from, off
ut | out
när | when
efter | after, behind
upp | up
vi | we
dem | them
vara | be
vad | what
över | over
än | than
dig | you
kan | can
sina | his
här | here
ha | have
mot | towards
alla | all
under | under (also: wonder)
någon | some etc
eller | or (else)
allt | all
mycket | much
sedan | since
ju | why
denna | this/that
själv | myself, yourself etc
detta | this/that
åt | to
utan | without
varit | was
hur | how
ingen | no
mitt | my
ni | you
bli | to be, become
blev | from bli
oss | us
din | thy
dessa | these/those
några | some etc
deras | their
blir | from bli
mina | my
samma | (the) same
vilken | who, that
er | you, your
sådan | such a
vår | our
blivit | from bli
dess | its
inom | within
mellan | between
sådant | such a
varför | why
varje | each
vilka | who, that
ditt | thy
vem | who
vilket | who, that
sitta | his
sådana | such a
vart | each
dina | thy
vars | whose
vårt | our
våra | our
ert | your
era | your
vilkas | whose
`)
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
rv := analysis.NewTokenMap()
err := rv.LoadBytes(SwedishStopWords)
return rv, err
}
func init() {
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
if err != nil {
panic(err)
}
}