Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
60
analysis/lang/sv/analyzer_sv.go
Normal file
60
analysis/lang/sv/analyzer_sv.go
Normal file
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package sv
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
// AnalyzerName is the name under which the Swedish analyzer is registered.
const AnalyzerName = "sv"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopSvFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerSvFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopSvFilter,
|
||||
stemmerSvFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
70
analysis/lang/sv/analyzer_sv_test.go
Normal file
70
analysis/lang/sv/analyzer_sv_test.go
Normal file
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package sv
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestSwedishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("jaktkarlarne"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("jaktkarl"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("jaktkarlens"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("jaktkarl"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("och"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
52
analysis/lang/sv/stemmer_sv.go
Normal file
52
analysis/lang/sv/stemmer_sv.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package sv
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/swedish"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the name under which the Swedish Snowball stemmer
// token filter is registered.
const SnowballStemmerName = "stemmer_sv_snowball"
|
||||
|
||||
// SwedishStemmerFilter is a token filter that stems Swedish terms. It has no
// fields and therefore carries no state.
type SwedishStemmerFilter struct{}
|
||||
|
||||
func NewSwedishStemmerFilter() *SwedishStemmerFilter {
|
||||
return &SwedishStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *SwedishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
env := snowballstem.NewEnv(string(token.Term))
|
||||
swedish.Stem(env)
|
||||
token.Term = []byte(env.Current())
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func SwedishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSwedishStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(SnowballStemmerName, SwedishStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/sv/stop_filter_sv.go
Normal file
36
analysis/lang/sv/stop_filter_sv.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package sv
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
160
analysis/lang/sv/stop_words_sv.go
Normal file
160
analysis/lang/sv/stop_words_sv.go
Normal file
|
@ -0,0 +1,160 @@
|
|||
package sv
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopName is the name under which the Swedish stop word token map is
// registered.
const StopName = "stop_sv"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// Backquotes (`) were changed to single quotes (') so that the list can be
// embedded in a Go raw string literal.
|
||||
|
||||
var SwedishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
| Swedish stop words occasionally exhibit homonym clashes. For example
|
||||
| så = so, but also seed. These are indicated clearly below.
|
||||
|
||||
och | and
|
||||
det | it, this/that
|
||||
att | to (with infinitive)
|
||||
i | in, at
|
||||
en | a
|
||||
jag | I
|
||||
hon | she
|
||||
som | who, that
|
||||
han | he
|
||||
på | on
|
||||
den | it, this/that
|
||||
med | with
|
||||
var | where, each
|
||||
sig | him(self) etc
|
||||
för | for
|
||||
så | so (also: seed)
|
||||
till | to
|
||||
är | is
|
||||
men | but
|
||||
ett | a
|
||||
om | if; around, about
|
||||
hade | had
|
||||
de | they, these/those
|
||||
av | of
|
||||
icke | not, no
|
||||
mig | me
|
||||
du | you
|
||||
henne | her
|
||||
då | then, when
|
||||
sin | his
|
||||
nu | now
|
||||
har | have
|
||||
inte | inte någon = no one
|
||||
hans | his
|
||||
honom | him
|
||||
skulle | 'sake'
|
||||
hennes | her
|
||||
där | there
|
||||
min | my
|
||||
man | one (pronoun)
|
||||
ej | nor
|
||||
vid | at, by, on (also: vast)
|
||||
kunde | could
|
||||
något | some etc
|
||||
från | from, off
|
||||
ut | out
|
||||
när | when
|
||||
efter | after, behind
|
||||
upp | up
|
||||
vi | we
|
||||
dem | them
|
||||
vara | be
|
||||
vad | what
|
||||
över | over
|
||||
än | than
|
||||
dig | you
|
||||
kan | can
|
||||
sina | his
|
||||
här | here
|
||||
ha | have
|
||||
mot | towards
|
||||
alla | all
|
||||
under | under (also: wonder)
|
||||
någon | some etc
|
||||
eller | or (else)
|
||||
allt | all
|
||||
mycket | much
|
||||
sedan | since
|
||||
ju | why
|
||||
denna | this/that
|
||||
själv | myself, yourself etc
|
||||
detta | this/that
|
||||
åt | to
|
||||
utan | without
|
||||
varit | was
|
||||
hur | how
|
||||
ingen | no
|
||||
mitt | my
|
||||
ni | you
|
||||
bli | to be, become
|
||||
blev | from bli
|
||||
oss | us
|
||||
din | thy
|
||||
dessa | these/those
|
||||
några | some etc
|
||||
deras | their
|
||||
blir | from bli
|
||||
mina | my
|
||||
samma | (the) same
|
||||
vilken | who, that
|
||||
er | you, your
|
||||
sådan | such a
|
||||
vår | our
|
||||
blivit | from bli
|
||||
dess | its
|
||||
inom | within
|
||||
mellan | between
|
||||
sådant | such a
|
||||
varför | why
|
||||
varje | each
|
||||
vilka | who, that
|
||||
ditt | thy
|
||||
vem | who
|
||||
vilket | who, that
|
||||
sitta | his
|
||||
sådana | such a
|
||||
vart | each
|
||||
dina | thy
|
||||
vars | whose
|
||||
vårt | our
|
||||
våra | our
|
||||
ert | your
|
||||
era | your
|
||||
vilkas | whose
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(SwedishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue