Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
67
analysis/lang/hr/analyzer_hr.go
Normal file
67
analysis/lang/hr/analyzer_hr.go
Normal file
|
@ -0,0 +1,67 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
// Originated from: http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/
|
||||
|
||||
const AnalyzerName = "hr"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
suffixFilter, err := cache.TokenFilterNamed(SuffixTransformationFilterName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: unicodeTokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopFilter,
|
||||
suffixFilter,
|
||||
stemmerFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
97
analysis/lang/hr/analyzer_hr_test.go
Normal file
97
analysis/lang/hr/analyzer_hr_test.go
Normal file
|
@ -0,0 +1,97 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hr
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestCroatianAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("Hrvatska"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hrvatsk"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Hrvatski"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("hrvatsk"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// uppercase letters
|
||||
{
|
||||
input: []byte("KOMARAC"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("komarc"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// vowelR
|
||||
{
|
||||
input: []byte("crvi"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("crv"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("biti"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
// suffix transformation
|
||||
{
|
||||
input: []byte("zaključcima"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("zaključk"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
156
analysis/lang/hr/stemmer_hr.go
Normal file
156
analysis/lang/hr/stemmer_hr.go
Normal file
|
@ -0,0 +1,156 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hr
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_hr"
|
||||
|
||||
// These regular expression rules originate from:
|
||||
// http://nlp.ffzg.hr/resources/tools/stemmer-for-croatian/
|
||||
|
||||
var stemmingRules = []*regexp.Regexp{
|
||||
regexp.MustCompile(`^(.+(s|š)k)(ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u)$`),
|
||||
regexp.MustCompile(`^(.+(s|š)tv)(ima|om|o|a|u)$`),
|
||||
regexp.MustCompile(`^(.+(t|m|p|r|g)anij)(ama|ima|om|a|u|e|i|)$`),
|
||||
regexp.MustCompile(`^(.+an)(inom|ina|inu|ine|ima|in|om|u|i|a|e|)$`),
|
||||
regexp.MustCompile(`^(.+in)(ima|ama|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+on)(ovima|ova|ove|ovi|ima|om|a|e|i|u|)$`),
|
||||
regexp.MustCompile(`^(.+n)(ijima|ijega|ijemu|ijeg|ijem|ijim|ijih|ijoj|iji|ije|ija|iju|ima|ome|omu|oga|oj|om|ih|im|og|o|e|a|u|i|)$`),
|
||||
regexp.MustCompile(`^(.+(a|e|u)ć)(oga|ome|omu|ega|emu|ima|oj|ih|om|eg|em|og|uh|im|e|a)$`),
|
||||
regexp.MustCompile(`^(.+ugov)(ima|i|e|a)$`),
|
||||
regexp.MustCompile(`^(.+ug)(ama|om|a|e|i|u|o)$`),
|
||||
regexp.MustCompile(`^(.+log)(ama|om|a|u|e|)$`),
|
||||
regexp.MustCompile(`^(.+[^eo]g)(ovima|ama|ovi|ove|ova|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+(rrar|ott|ss|ll)i)(jem|ja|ju|o|)$`),
|
||||
regexp.MustCompile(`^(.+uj)(ući|emo|ete|mo|em|eš|e|u|)$`),
|
||||
regexp.MustCompile(`^(.+(c|č|ć|đ|l|r)aj)(evima|evi|eva|eve|ama|ima|em|a|e|i|u|)$`),
|
||||
regexp.MustCompile(`^(.+(b|c|d|l|n|m|ž|g|f|p|r|s|t|z)ij)(ima|ama|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+[^z]nal)(ima|ama|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+ijal)(ima|ama|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+ozil)(ima|om|a|e|u|i|)$`),
|
||||
regexp.MustCompile(`^(.+olov)(ima|i|a|e)$`),
|
||||
regexp.MustCompile(`^(.+ol)(ima|om|a|u|e|i|)$`),
|
||||
regexp.MustCompile(`^(.+lem)(ama|ima|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+ram)(ama|om|a|e|i|u|o)$`),
|
||||
regexp.MustCompile(`^(.+(a|d|e|o)r)(ama|ima|om|u|a|e|i|)$`),
|
||||
regexp.MustCompile(`^(.+(e|i)s)(ima|om|e|a|u)$`),
|
||||
regexp.MustCompile(`^(.+(t|n|j|k|j|t|b|g|v)aš)(ama|ima|om|em|a|u|i|e|)$`),
|
||||
regexp.MustCompile(`^(.+(e|i)š)(ima|ama|om|em|i|e|a|u|)$`),
|
||||
regexp.MustCompile(`^(.+ikat)(ima|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+lat)(ima|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+et)(ama|ima|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+(e|i|k|o)st)(ima|ama|om|a|e|i|u|o|)$`),
|
||||
regexp.MustCompile(`^(.+išt)(ima|em|a|e|u)$`),
|
||||
regexp.MustCompile(`^(.+ova)(smo|ste|hu|ti|še|li|la|le|lo|t|h|o)$`),
|
||||
regexp.MustCompile(`^(.+(a|e|i)v)(ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|ama|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|)$`),
|
||||
regexp.MustCompile(`^(.+[^dkml]ov)(ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|)$`),
|
||||
regexp.MustCompile(`^(.+(m|l)ov)(ima|om|a|u|e|i|)$`),
|
||||
regexp.MustCompile(`^(.+el)(ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|)$`),
|
||||
regexp.MustCompile(`^(.+(a|e|š)nj)(ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|ega|emu|eg|em|im|ih|oj|om|og|a|e|i|o|u)$`),
|
||||
regexp.MustCompile(`^(.+čin)(ama|ome|omu|oga|ima|og|om|im|ih|oj|a|u|i|o|e|)$`),
|
||||
regexp.MustCompile(`^(.+roši)(vši|smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o)$`),
|
||||
regexp.MustCompile(`^(.+oš)(ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|)$`),
|
||||
regexp.MustCompile(`^(.+(e|o)vit)(ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u|)$`),
|
||||
regexp.MustCompile(`^(.+ast)(ijima|ijega|ijemu|ijem|ijim|ijih|ijoj|ijeg|iji|ije|ija|oga|ome|omu|ima|og|om|im|ih|oj|i|e|o|a|u|)$`),
|
||||
regexp.MustCompile(`^(.+k)(ijemu|ijima|ijega|ijeg|ijem|ijim|ijih|ijoj|oga|ome|omu|ima|iji|ije|ija|iju|im|ih|oj|om|og|i|a|u|e|o|)$`),
|
||||
regexp.MustCompile(`^(.+(e|a|i|u)va)(jući|smo|ste|jmo|jte|ju|la|le|li|lo|mo|na|ne|ni|no|te|ti|še|hu|h|j|m|n|o|t|v|š|)$`),
|
||||
regexp.MustCompile(`^(.+ir)(ujemo|ujete|ujući|ajući|ivat|ujem|uješ|ujmo|ujte|avši|asmo|aste|ati|amo|ate|aju|aše|ahu|ala|alo|ali|ale|uje|uju|uj|al|an|am|aš|at|ah|ao)$`),
|
||||
regexp.MustCompile(`^(.+ač)(ismo|iste|iti|imo|ite|iše|eći|ila|ilo|ili|ile|ena|eno|eni|ene|io|im|iš|it|ih|en|i|e)$`),
|
||||
regexp.MustCompile(`^(.+ača)(vši|smo|ste|smo|ste|hu|ti|mo|te|še|la|lo|li|le|ju|na|no|ni|ne|o|m|š|t|h|n)$`),
|
||||
regexp.MustCompile(`^(.+n)(uvši|usmo|uste|ući|imo|ite|emo|ete|ula|ulo|ule|uli|uto|uti|uta|em|eš|uo|ut|e|u|i)$`),
|
||||
regexp.MustCompile(`^(.+ni)(vši|smo|ste|ti|mo|te|mo|te|la|lo|le|li|m|š|o)$`),
|
||||
regexp.MustCompile(`^(.+((a|r|i|p|e|u)st|[^o]g|ik|uc|oj|aj|lj|ak|ck|čk|šk|uk|nj|im|ar|at|et|št|it|ot|ut|zn|zv)a)(jući|vši|smo|ste|jmo|jte|jem|mo|te|je|ju|ti|še|hu|la|li|le|lo|na|no|ni|ne|t|h|o|j|n|m|š)$`),
|
||||
regexp.MustCompile(`^(.+ur)(ajući|asmo|aste|ajmo|ajte|amo|ate|aju|ati|aše|ahu|ala|ali|ale|alo|ana|ano|ani|ane|al|at|ah|ao|aj|an|am|aš)$`),
|
||||
regexp.MustCompile(`^(.+(a|i|o)staj)(asmo|aste|ahu|ati|emo|ete|aše|ali|ući|ala|alo|ale|mo|ao|em|eš|at|ah|te|e|u|)$`),
|
||||
regexp.MustCompile(`^(.+(b|c|č|ć|d|e|f|g|j|k|n|r|t|u|v)a)(lama|lima|lom|lu|li|la|le|lo|l)$`),
|
||||
regexp.MustCompile(`^(.+(t|č|j|ž|š)aj)(evima|evi|eva|eve|ama|ima|em|a|e|i|u|)$`),
|
||||
regexp.MustCompile(`^(.+([^o]m|ič|nč|uč|b|c|ć|d|đ|h|j|k|l|n|p|r|s|š|v|z|ž)a)(jući|vši|smo|ste|jmo|jte|mo|te|ju|ti|še|hu|la|li|le|lo|na|no|ni|ne|t|h|o|j|n|m|š)$`),
|
||||
regexp.MustCompile(`^(.+(a|i|o)sta)(dosmo|doste|doše|nemo|demo|nete|dete|nimo|nite|nila|vši|nem|dem|neš|deš|doh|de|ti|ne|nu|du|la|li|lo|le|t|o)$`),
|
||||
regexp.MustCompile(`^(.+ta)(smo|ste|jmo|jte|vši|ti|mo|te|ju|še|la|lo|le|li|na|no|ni|ne|n|j|o|m|š|t|h)$`),
|
||||
regexp.MustCompile(`^(.+inj)(asmo|aste|ati|emo|ete|ali|ala|alo|ale|aše|ahu|em|eš|at|ah|ao)$`),
|
||||
regexp.MustCompile(`^(.+as)(temo|tete|timo|tite|tući|tem|teš|tao|te|li|ti|la|lo|le)$`),
|
||||
regexp.MustCompile(`^(.+(elj|ulj|tit|ac|ič|od|oj|et|av|ov)i)(vši|eći|smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o)$`),
|
||||
regexp.MustCompile(`^(.+(tit|jeb|ar|ed|uš|ič)i)(jemo|jete|jem|ješ|smo|ste|jmo|jte|vši|mo|še|te|ti|ju|je|la|lo|li|le|t|m|š|h|j|o)$`),
|
||||
regexp.MustCompile(`^(.+(b|č|d|l|m|p|r|s|š|ž)i)(jemo|jete|jem|ješ|smo|ste|jmo|jte|vši|mo|lu|še|te|ti|ju|je|la|lo|li|le|t|m|š|h|j|o)$`),
|
||||
regexp.MustCompile(`^(.+luč)(ujete|ujući|ujemo|ujem|uješ|ismo|iste|ujmo|ujte|uje|uju|iše|iti|imo|ite|ila|ilo|ili|ile|ena|eno|eni|ene|uj|io|en|im|iš|it|ih|e|i)$`),
|
||||
regexp.MustCompile(`^(.+jeti)(smo|ste|še|mo|te|ti|li|la|lo|le|m|š|t|h|o)$`),
|
||||
regexp.MustCompile(`^(.+e)(lama|lima|lom|lu|li|la|le|lo|l)$`),
|
||||
regexp.MustCompile(`^(.+i)(lama|lima|lom|lu|li|la|le|lo|l)$`),
|
||||
regexp.MustCompile(`^(.+at)(ijega|ijemu|ijima|ijeg|ijem|ijih|ijim|ima|oga|ome|omu|iji|ije|ija|iju|oj|og|om|im|ih|a|u|i|e|o|)$`),
|
||||
regexp.MustCompile(`^(.+et)(avši|ući|emo|imo|em|eš|e|u|i)$`),
|
||||
regexp.MustCompile(`^(.+)(ajući|alima|alom|avši|asmo|aste|ajmo|ajte|ivši|amo|ate|aju|ati|aše|ahu|ali|ala|ale|alo|ana|ano|ani|ane|am|aš|at|ah|ao|aj|an)$`),
|
||||
regexp.MustCompile(`^(.+)(anje|enje|anja|enja|enom|enoj|enog|enim|enih|anom|anoj|anog|anim|anih|eno|ovi|ova|oga|ima|ove|enu|anu|ena|ama)$`),
|
||||
regexp.MustCompile(`^(.+)(nijega|nijemu|nijima|nijeg|nijem|nijim|nijih|nima|niji|nije|nija|niju|noj|nom|nog|nim|nih|an|na|nu|ni|ne|no)$`),
|
||||
regexp.MustCompile(`^(.+)(om|og|im|ih|em|oj|an|u|o|i|e|a)$`),
|
||||
}
|
||||
|
||||
// highlightVowelRRegex matches an "r" with no vowel directly on either side:
// it is preceded by the start of the term or a non-vowel character and
// followed by the end of the term or a non-vowel character.
var highlightVowelRRegex = regexp.MustCompile(`(^|[^aeiou])r($|[^aeiou])`)

// highlightVowelR returns term with the "r" of each non-overlapping match of
// highlightVowelRRegex rewritten as an upper-case "R". The neighbouring
// characters captured by the pattern are kept as they are.
func highlightVowelR(term string) string {
	return highlightVowelRRegex.ReplaceAllString(term, `${1}R${2}`)
}

// hasVowel reports whether term contains one of the vowels a, e, i, o, u, an
// upper-case "R", or an "r" with no vowel directly before or after it (which
// highlightVowelR marks as "R").
func hasVowel(term string) bool {
	// Fast path: highlighting never removes a vowel or an existing "R", so a
	// term that already contains one needs no regex work or extra allocation.
	if strings.ContainsAny(term, "aeiouR") {
		return true
	}

	// No plain vowel is present; the term qualifies only if highlighting
	// produces an "R".
	return strings.ContainsRune(highlightVowelR(term), 'R')
}
|
||||
|
||||
func stem(term string) string {
|
||||
for _, rule := range stemmingRules {
|
||||
results := rule.FindStringSubmatch(term)
|
||||
if len(results) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
root := results[1]
|
||||
if hasVowel(root) && root != "" {
|
||||
return root
|
||||
}
|
||||
}
|
||||
|
||||
return term
|
||||
}
|
||||
|
||||
// CroatianStemmerFilter is a token filter that holds no state of its own.
type CroatianStemmerFilter struct{}

// NewCroatianStemmerFilter returns a pointer to a new CroatianStemmerFilter.
func NewCroatianStemmerFilter() *CroatianStemmerFilter {
	return new(CroatianStemmerFilter)
}
|
||||
|
||||
func (s *CroatianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
token.Term = []byte(stem(string(token.Term)))
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
func CroatianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewCroatianStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StemmerName, CroatianStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
36
analysis/lang/hr/stop_filter_hr.go
Normal file
36
analysis/lang/hr/stop_filter_hr.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
111
analysis/lang/hr/stop_words_hr.go
Normal file
111
analysis/lang/hr/stop_words_hr.go
Normal file
|
@ -0,0 +1,111 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hr
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StopName = "stop_hr"
|
||||
|
||||
var CroatianStopWords = []byte(`biti
|
||||
jesam
|
||||
budem
|
||||
sam
|
||||
jesi
|
||||
budeš
|
||||
si
|
||||
jesmo
|
||||
budemo
|
||||
smo
|
||||
jeste
|
||||
budete
|
||||
ste
|
||||
jesu
|
||||
budu
|
||||
su
|
||||
bih
|
||||
bijah
|
||||
bjeh
|
||||
bijaše
|
||||
bi
|
||||
bje
|
||||
bješe
|
||||
bijasmo
|
||||
bismo
|
||||
bjesmo
|
||||
bijaste
|
||||
biste
|
||||
bjeste
|
||||
bijahu
|
||||
biste
|
||||
bjeste
|
||||
bijahu
|
||||
bi
|
||||
biše
|
||||
bjehu
|
||||
bješe
|
||||
bio
|
||||
bili
|
||||
budimo
|
||||
budite
|
||||
bila
|
||||
bilo
|
||||
bile
|
||||
ću
|
||||
ćeš
|
||||
će
|
||||
ćemo
|
||||
ćete
|
||||
želim
|
||||
želiš
|
||||
želi
|
||||
želimo
|
||||
želite
|
||||
žele
|
||||
moram
|
||||
moraš
|
||||
mora
|
||||
moramo
|
||||
morate
|
||||
moraju
|
||||
trebam
|
||||
trebaš
|
||||
treba
|
||||
trebamo
|
||||
trebate
|
||||
trebaju
|
||||
mogu
|
||||
možeš
|
||||
može
|
||||
možemo
|
||||
možete
|
||||
za
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(CroatianStopWords)
|
||||
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
189
analysis/lang/hr/suffix_transformation_hr.go
Normal file
189
analysis/lang/hr/suffix_transformation_hr.go
Normal file
|
@ -0,0 +1,189 @@
|
|||
// Copyright (c) 2020 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hr
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const SuffixTransformationFilterName = "hr_suffix_transformation_filter"
|
||||
|
||||
var SuffixTransformations = map[string]string{
|
||||
"lozi": "loga",
|
||||
"lozima": "loga",
|
||||
"pjesi": "pjeh",
|
||||
"pjesima": "pjeh",
|
||||
"vojci": "vojka",
|
||||
"bojci": "bojka",
|
||||
"jaci": "jak",
|
||||
"jacima": "jak",
|
||||
"čajan": "čajni",
|
||||
"ijeran": "ijerni",
|
||||
"laran": "larni",
|
||||
"ijesan": "ijesni",
|
||||
"anjac": "anjca",
|
||||
"ajac": "ajca",
|
||||
"ajaca": "ajca",
|
||||
"ljaca": "ljca",
|
||||
"ljac": "ljca",
|
||||
"ejac": "ejca",
|
||||
"ejaca": "ejca",
|
||||
"ojac": "ojca",
|
||||
"ojaca": "ojca",
|
||||
"ajaka": "ajka",
|
||||
"ojaka": "ojka",
|
||||
"šaca": "šca",
|
||||
"šac": "šca",
|
||||
"inzima": "ing",
|
||||
"inzi": "ing",
|
||||
"tvenici": "tvenik",
|
||||
"tetici": "tetika",
|
||||
"teticima": "tetika",
|
||||
"nstava": "nstva",
|
||||
"nicima": "nik",
|
||||
"ticima": "tik",
|
||||
"zicima": "zik",
|
||||
"snici": "snik",
|
||||
"kuse": "kusi",
|
||||
"kusan": "kusni",
|
||||
"kustava": "kustva",
|
||||
"dušan": "dušni",
|
||||
"antan": "antni",
|
||||
"bilan": "bilni",
|
||||
"tilan": "tilni",
|
||||
"avilan": "avilni",
|
||||
"silan": "silni",
|
||||
"gilan": "gilni",
|
||||
"rilan": "rilni",
|
||||
"nilan": "nilni",
|
||||
"alan": "alni",
|
||||
"ozan": "ozni",
|
||||
"rave": "ravi",
|
||||
"stavan": "stavni",
|
||||
"pravan": "pravni",
|
||||
"tivan": "tivni",
|
||||
"sivan": "sivni",
|
||||
"atan": "atni",
|
||||
"cenata": "centa",
|
||||
"denata": "denta",
|
||||
"genata": "genta",
|
||||
"lenata": "lenta",
|
||||
"menata": "menta",
|
||||
"jenata": "jenta",
|
||||
"venata": "venta",
|
||||
"tetan": "tetni",
|
||||
"pletan": "pletni",
|
||||
"šave": "šavi",
|
||||
"manata": "manta",
|
||||
"tanata": "tanta",
|
||||
"lanata": "lanta",
|
||||
"sanata": "santa",
|
||||
"ačak": "ačka",
|
||||
"ačaka": "ačka",
|
||||
"ušak": "uška",
|
||||
"atak": "atka",
|
||||
"ataka": "atka",
|
||||
"atci": "atka",
|
||||
"atcima": "atka",
|
||||
"etak": "etka",
|
||||
"etaka": "etka",
|
||||
"itak": "itka",
|
||||
"itaka": "itka",
|
||||
"itci": "itka",
|
||||
"otak": "otka",
|
||||
"otaka": "otka",
|
||||
"utak": "utka",
|
||||
"utaka": "utka",
|
||||
"utci": "utka",
|
||||
"utcima": "utka",
|
||||
"eskan": "eskna",
|
||||
"tičan": "tični",
|
||||
"ojsci": "ojska",
|
||||
"esama": "esma",
|
||||
"metara": "metra",
|
||||
"centar": "centra",
|
||||
"centara": "centra",
|
||||
"istara": "istra",
|
||||
"istar": "istra",
|
||||
"ošću": "osti",
|
||||
"daba": "dba",
|
||||
"čcima": "čka",
|
||||
"čci": "čka",
|
||||
"mac": "mca",
|
||||
"maca": "mca",
|
||||
"voljan": "voljni",
|
||||
"anaka": "anki",
|
||||
"vac": "vca",
|
||||
"vaca": "vca",
|
||||
"saca": "sca",
|
||||
"sac": "sca",
|
||||
"naca": "nca",
|
||||
"nac": "nca",
|
||||
"raca": "rca",
|
||||
"rac": "rca",
|
||||
"aoca": "alca",
|
||||
"alaca": "alca",
|
||||
"alac": "alca",
|
||||
"elaca": "elca",
|
||||
"elac": "elca",
|
||||
"olaca": "olca",
|
||||
"olac": "olca",
|
||||
"olce": "olca",
|
||||
"njac": "njca",
|
||||
"njaca": "njca",
|
||||
"ekata": "ekta",
|
||||
"ekat": "ekta",
|
||||
"izam": "izma",
|
||||
"izama": "izma",
|
||||
"jebe": "jebi",
|
||||
"ašan": "ašni",
|
||||
}
|
||||
|
||||
// SuffixTransformationFilter is a token filter that holds no state of its
// own.
type SuffixTransformationFilter struct{}

// NewSuffixTransformationFilter returns a pointer to a new
// SuffixTransformationFilter.
func NewSuffixTransformationFilter() *SuffixTransformationFilter {
	return new(SuffixTransformationFilter)
}
|
||||
|
||||
func (s *SuffixTransformationFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := string(token.Term)
|
||||
|
||||
for suffix, newSuffix := range SuffixTransformations {
|
||||
if strings.HasSuffix(term, suffix) {
|
||||
term = term[:len(term)-len(suffix)] + newSuffix
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
token.Term = []byte(term)
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
func SuffixTransformationFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewSuffixTransformationFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(SuffixTransformationFilterName, SuffixTransformationFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue