Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
60
analysis/lang/no/analyzer_no.go
Normal file
60
analysis/lang/no/analyzer_no.go
Normal file
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package no
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
// AnalyzerName is the name under which the Norwegian analyzer is registered.
const AnalyzerName = "no"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopNoFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerNoFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopNoFilter,
|
||||
stemmerNoFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
70
analysis/lang/no/analyzer_no_test.go
Normal file
70
analysis/lang/no/analyzer_no_test.go
Normal file
|
@ -0,0 +1,70 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package no
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestNorwegianAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("havnedistriktene"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("havnedistrikt"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("havnedistrikter"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("havnedistrikt"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("det"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
52
analysis/lang/no/stemmer_no.go
Normal file
52
analysis/lang/no/stemmer_no.go
Normal file
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package no
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/snowballstem"
|
||||
"github.com/blevesearch/snowballstem/norwegian"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the name under which the Norwegian Snowball stemmer
// token filter is registered.
const SnowballStemmerName = "stemmer_no_snowball"
|
||||
|
||||
// NorwegianStemmerFilter is a token filter that stems token terms with the
// Snowball Norwegian stemmer. It carries no state.
type NorwegianStemmerFilter struct{}
|
||||
|
||||
func NewNorwegianStemmerFilter() *NorwegianStemmerFilter {
|
||||
return &NorwegianStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *NorwegianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
env := snowballstem.NewEnv(string(token.Term))
|
||||
norwegian.Stem(env)
|
||||
token.Term = []byte(env.Current())
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func NorwegianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewNorwegianStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(SnowballStemmerName, NorwegianStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/no/stop_filter_no.go
Normal file
36
analysis/lang/no/stop_filter_no.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package no
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
221
analysis/lang/no/stop_words_no.go
Normal file
221
analysis/lang/no/stop_words_no.go
Normal file
|
@ -0,0 +1,221 @@
|
|||
package no
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopName is the name under which the Norwegian stop-word token map is
// registered.
const StopName = "stop_no"
|
||||
|
||||
// this content was obtained from:
|
||||
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
|
||||
// Backticks (`) were changed to single quotes (') so the text can be embedded in a Go raw string literal.
|
||||
|
||||
var NorwegianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This stop word list is for the dominant bokmål dialect. Words unique
|
||||
| to nynorsk are marked *.
|
||||
|
||||
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005
|
||||
|
||||
og | and
|
||||
i | in
|
||||
jeg | I
|
||||
det | it/this/that
|
||||
at | to (w. inf.)
|
||||
en | a/an
|
||||
et | a/an
|
||||
den | it/this/that
|
||||
til | to
|
||||
er | is/am/are
|
||||
som | who/that
|
||||
på | on
|
||||
de | they / you(formal)
|
||||
med | with
|
||||
han | he
|
||||
av | of
|
||||
ikke | not
|
||||
ikkje | not *
|
||||
der | there
|
||||
så | so
|
||||
var | was/were
|
||||
meg | me
|
||||
seg | you
|
||||
men | but
|
||||
ett | one
|
||||
har | have
|
||||
om | about
|
||||
vi | we
|
||||
min | my
|
||||
mitt | my
|
||||
ha | have
|
||||
hadde | had
|
||||
hun | she
|
||||
nå | now
|
||||
over | over
|
||||
da | when/as
|
||||
ved | by/know
|
||||
fra | from
|
||||
du | you
|
||||
ut | out
|
||||
sin | your
|
||||
dem | them
|
||||
oss | us
|
||||
opp | up
|
||||
man | you/one
|
||||
kan | can
|
||||
hans | his
|
||||
hvor | where
|
||||
eller | or
|
||||
hva | what
|
||||
skal | shall/must
|
||||
selv | self (reflective)
|
||||
sjøl | self (reflective)
|
||||
her | here
|
||||
alle | all
|
||||
vil | will
|
||||
bli | become
|
||||
ble | became
|
||||
blei | became *
|
||||
blitt | have become
|
||||
kunne | could
|
||||
inn | in
|
||||
når | when
|
||||
være | be
|
||||
kom | come
|
||||
noen | some
|
||||
noe | some
|
||||
ville | would
|
||||
dere | you
|
||||
som | who/which/that
|
||||
deres | their/theirs
|
||||
kun | only/just
|
||||
ja | yes
|
||||
etter | after
|
||||
ned | down
|
||||
skulle | should
|
||||
denne | this
|
||||
for | for/because
|
||||
deg | you
|
||||
si | hers/his
|
||||
sine | hers/his
|
||||
sitt | hers/his
|
||||
mot | against
|
||||
å | to
|
||||
meget | much
|
||||
hvorfor | why
|
||||
dette | this
|
||||
disse | these/those
|
||||
uten | without
|
||||
hvordan | how
|
||||
ingen | none
|
||||
din | your
|
||||
ditt | your
|
||||
blir | become
|
||||
samme | same
|
||||
hvilken | which
|
||||
hvilke | which (plural)
|
||||
sånn | such a
|
||||
inni | inside/within
|
||||
mellom | between
|
||||
vår | our
|
||||
hver | each
|
||||
hvem | who
|
||||
vors | us/ours
|
||||
hvis | whose
|
||||
både | both
|
||||
bare | only/just
|
||||
enn | than
|
||||
fordi | as/because
|
||||
før | before
|
||||
mange | many
|
||||
også | also
|
||||
slik | just
|
||||
vært | been
|
||||
være | to be
|
||||
båe | both *
|
||||
begge | both
|
||||
siden | since
|
||||
dykk | your *
|
||||
dykkar | yours *
|
||||
dei | they *
|
||||
deira | them *
|
||||
deires | theirs *
|
||||
deim | them *
|
||||
di | your (fem.) *
|
||||
då | as/when *
|
||||
eg | I *
|
||||
ein | a/an *
|
||||
eit | a/an *
|
||||
eitt | a/an *
|
||||
elles | or *
|
||||
honom | he *
|
||||
hjå | at *
|
||||
ho | she *
|
||||
hoe | she *
|
||||
henne | her
|
||||
hennar | her/hers
|
||||
hennes | hers
|
||||
hoss | how *
|
||||
hossen | how *
|
||||
ikkje | not *
|
||||
ingi | noone *
|
||||
inkje | noone *
|
||||
korleis | how *
|
||||
korso | how *
|
||||
kva | what/which *
|
||||
kvar | where *
|
||||
kvarhelst | where *
|
||||
kven | who/whom *
|
||||
kvi | why *
|
||||
kvifor | why *
|
||||
me | we *
|
||||
medan | while *
|
||||
mi | my *
|
||||
mine | my *
|
||||
mykje | much *
|
||||
no | now *
|
||||
nokon | some (masc./neut.) *
|
||||
noka | some (fem.) *
|
||||
nokor | some *
|
||||
noko | some *
|
||||
nokre | some *
|
||||
si | his/hers *
|
||||
sia | since *
|
||||
sidan | since *
|
||||
so | so *
|
||||
somt | some *
|
||||
somme | some *
|
||||
um | about*
|
||||
upp | up *
|
||||
vere | be *
|
||||
vore | was *
|
||||
verte | become *
|
||||
vort | become *
|
||||
varte | became *
|
||||
vart | became *
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(NorwegianStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue