1
0
Fork 0

Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions

View file

@@ -0,0 +1,71 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/bleve/v2/analysis/lang/in"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)
const AnalyzerName = "hi"
// AnalyzerConstructor builds the Hindi analyzer: unicode tokenization
// followed by lowercasing, indic normalization, Hindi normalization,
// Hindi stop-word removal, and Hindi stemming, in that order.
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
	tokenizer, err := cache.TokenizerNamed(unicode.Name)
	if err != nil {
		return nil, err
	}
	// Resolve the token filters in pipeline order; any lookup failure
	// aborts construction, as before.
	filterNames := []string{
		lowercase.Name,
		in.NormalizeName,
		NormalizeName,
		StopName,
		StemmerName,
	}
	filters := make([]analysis.TokenFilter, 0, len(filterNames))
	for _, filterName := range filterNames {
		tokenFilter, err := cache.TokenFilterNamed(filterName)
		if err != nil {
			return nil, err
		}
		filters = append(filters, tokenFilter)
	}
	return &analysis.DefaultAnalyzer{
		Tokenizer:    tokenizer,
		TokenFilters: filters,
	}, nil
}
// init registers the Hindi analyzer constructor under AnalyzerName.
func init() {
	if err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor); err != nil {
		panic(err)
	}
}

View file

@@ -0,0 +1,66 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// TestHindiAnalyzer exercises the full "hi" analyzer pipeline end to
// end: tokenization, lowercasing, indic/hindi normalization, stop-word
// removal, and stemming.
func TestHindiAnalyzer(t *testing.T) {
	tests := []struct {
		input  []byte
		output analysis.TokenStream
	}{
		// two ways to write 'hindi' itself
		// Both spellings stem to the same term; the End offsets differ
		// because the two spellings have different UTF-8 byte lengths.
		{
			input: []byte("हिन्दी"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      18,
				},
			},
		},
		{
			input: []byte("हिंदी"),
			output: analysis.TokenStream{
				&analysis.Token{
					Term:     []byte("हिंद"),
					Position: 1,
					Start:    0,
					End:      15,
				},
			},
		},
	}
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
	if err != nil {
		t.Fatal(err)
	}
	for _, test := range tests {
		actual := analyzer.Analyze(test.input)
		if !reflect.DeepEqual(actual, test.output) {
			t.Errorf("expected %v, got %v", test.output, actual)
		}
	}
}

View file

@@ -0,0 +1,141 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// NormalizeName is the registry name of the Hindi normalization filter.
const NormalizeName = "normalize_hi"

// HindiNormalizeFilter applies Hindi-specific orthographic
// normalization (nukta folding, vowel shortening, removal of format
// characters) to each token's term. It is stateless.
type HindiNormalizeFilter struct {
}

// NewHindiNormalizeFilter returns a new HindiNormalizeFilter.
func NewHindiNormalizeFilter() *HindiNormalizeFilter {
	return &HindiNormalizeFilter{}
}
// Filter normalizes every token's term in place and returns the same
// stream.
func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, tok := range input {
		tok.Term = normalize(tok.Term)
	}
	return input
}
// normalize rewrites the runes of a single term following Hindi
// orthographic normalization rules:
//   - dead consonant n (na + virama) becomes anusvara (bindu)
//   - candrabindu becomes bindu
//   - nukta marks are deleted and composed nukta consonants are folded
//     to their base consonants
//   - ZWJ/ZWNJ format characters and viramas are deleted
//   - chandra/short vowel signs are replaced by their plain forms
//   - long vowels (independent and dependent) are shortened
// The rewritten runes are re-encoded via analysis.BuildTermFromRunes.
func normalize(input []byte) []byte {
	runes := bytes.Runes(input)
	for i := 0; i < len(runes); i++ {
		switch runes[i] {
		// dead n -> bindu
		case '\u0928':
			if i+1 < len(runes) && runes[i+1] == '\u094D' {
				runes[i] = '\u0902'
				runes = analysis.DeleteRune(runes, i+1)
			}
		// candrabindu -> bindu
		case '\u0901':
			runes[i] = '\u0902'
		// nukta deletions
		case '\u093C':
			runes = analysis.DeleteRune(runes, i)
			i-- // re-inspect the rune shifted into this slot
		case '\u0929':
			runes[i] = '\u0928'
		case '\u0931':
			runes[i] = '\u0930'
		case '\u0934':
			runes[i] = '\u0933'
		case '\u0958':
			runes[i] = '\u0915'
		case '\u0959':
			runes[i] = '\u0916'
		case '\u095A':
			runes[i] = '\u0917'
		case '\u095B':
			runes[i] = '\u091C'
		case '\u095C':
			runes[i] = '\u0921'
		case '\u095D':
			runes[i] = '\u0922'
		case '\u095E':
			runes[i] = '\u092B'
		case '\u095F':
			runes[i] = '\u092F'
		// zwj/zwnj -> delete
		case '\u200D', '\u200C':
			runes = analysis.DeleteRune(runes, i)
			i-- // re-inspect the rune shifted into this slot
		// virama -> delete
		case '\u094D':
			runes = analysis.DeleteRune(runes, i)
			i-- // re-inspect the rune shifted into this slot
		// chandra/short -> replace
		case '\u0945', '\u0946':
			runes[i] = '\u0947'
		case '\u0949', '\u094A':
			runes[i] = '\u094B'
		case '\u090D', '\u090E':
			runes[i] = '\u090F'
		case '\u0911', '\u0912':
			runes[i] = '\u0913'
		case '\u0972':
			runes[i] = '\u0905'
		// long -> short ind. vowels
		case '\u0906':
			runes[i] = '\u0905'
		case '\u0908':
			runes[i] = '\u0907'
		case '\u090A':
			runes[i] = '\u0909'
		case '\u0960':
			runes[i] = '\u090B'
		case '\u0961':
			runes[i] = '\u090C'
		case '\u0910':
			runes[i] = '\u090F'
		case '\u0914':
			runes[i] = '\u0913'
		// long -> short dep. vowels
		case '\u0940':
			runes[i] = '\u093F'
		case '\u0942':
			runes[i] = '\u0941'
		case '\u0944':
			runes[i] = '\u0943'
		case '\u0963':
			runes[i] = '\u0962'
		case '\u0948':
			runes[i] = '\u0947'
		case '\u094C':
			runes[i] = '\u094B'
		}
	}
	return analysis.BuildTermFromRunes(runes)
}
// NormalizerFilterConstructor satisfies the registry's token-filter
// constructor signature; the filter needs no configuration.
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewHindiNormalizeFilter(), nil
}

// init registers the Hindi normalization token filter.
func init() {
	if err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor); err != nil {
		panic(err)
	}
}

View file

@@ -0,0 +1,251 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestHindiNormalizeFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// basics
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँगरेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँगरेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँग्रेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अँग्रेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंग्रेज़ी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंग्रेजी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अंगरेजि"),
},
},
},
// test decompositions
// removing nukta dot
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("क़िताब"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("फ़र्ज़"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("फरज"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("क़र्ज़"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("करज"),
},
},
},
// some other composed nukta forms
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ऱऴख़ग़ड़ढ़य़"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("रळखगडढय"),
},
},
},
// removal of format (ZWJ/ZWNJ)
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("शार्‍मा"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("शारमा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("शार्‌मा"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("शारमा"),
},
},
},
// removal of chandra
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ॅॆॉॊऍऎऑऒ\u0972"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ेेोोएएओओअ"),
},
},
},
// vowel shortening
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आईऊॠॡऐऔीूॄॣैौ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("अइउऋऌएओिुृॢेो"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
hindiNormalizeFilter := NewHindiNormalizeFilter()
for _, test := range tests {
actual := hindiNormalizeFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View file

@@ -0,0 +1,152 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// StemmerName is the registry name of the Hindi stemmer filter.
const StemmerName = "stemmer_hi"

// HindiStemmerFilter is a lightweight Hindi stemmer that strips common
// inflectional suffixes from token terms. It is stateless.
type HindiStemmerFilter struct {
}

// NewHindiStemmerFilter returns a new HindiStemmerFilter.
func NewHindiStemmerFilter() *HindiStemmerFilter {
	return &HindiStemmerFilter{}
}
// Filter stems each token's term in place and returns the same stream.
// Tokens marked KeyWord are protected and left untouched.
func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
	for _, tok := range input {
		if tok.KeyWord {
			// protected keyword: do not stem
			continue
		}
		tok.Term = stem(tok.Term)
	}
	return input
}
// hindiSuffixGroups lists stripping rules longest-suffix-first. A group
// applies only when the term has strictly more than minRunes runes, and
// the first group with a matching suffix wins, removing strip runes.
// The thresholds guarantee at least two runes always remain.
var hindiSuffixGroups = []struct {
	minRunes int
	strip    int
	suffixes []string
}{
	{6, 5, []string{
		"ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां",
	}},
	{5, 4, []string{
		"ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे",
		"ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ",
		"ियों", "ियां",
	}},
	{4, 3, []string{
		"ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने",
		"ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं",
		"ुआं",
	}},
	{3, 2, []string{
		"कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं",
		"ती", "ता", "ाँ", "ां", "ों", "ें",
	}},
	{2, 1, []string{
		"ो", "े", "ू", "ु", "ी", "ि", "ा",
	}},
}

// stem strips the longest matching Hindi inflectional suffix from
// input, if the term is long enough, and returns the (possibly
// shortened) term.
func stem(input []byte) []byte {
	inputLen := utf8.RuneCount(input)
	for _, group := range hindiSuffixGroups {
		if inputLen <= group.minRunes {
			continue
		}
		for _, suffix := range group.suffixes {
			if bytes.HasSuffix(input, []byte(suffix)) {
				return analysis.TruncateRunes(input, group.strip)
			}
		}
	}
	return input
}
// StemmerFilterConstructor satisfies the registry's token-filter
// constructor signature; the stemmer needs no configuration.
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	return NewHindiStemmerFilter(), nil
}

// init registers the Hindi stemmer token filter.
func init() {
	if err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor); err != nil {
		panic(err)
	}
}

View file

@@ -0,0 +1,308 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestHindiStemmerFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
// masc noun inflections
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडका"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडके"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडकों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुरु"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुर"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुरुओं"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("गुर"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्त"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्त"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्तों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("दोस्त"),
},
},
},
// feminine noun inflections
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडकी"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडकियों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("लडक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताबें"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताबों"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("किताब"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीका"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीकाएं"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीक"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीकाओं"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("आध्यापीक"),
},
},
},
// some verb forms
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खाना"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खाता"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खाती"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("खा"),
},
},
},
// exceptions
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिनाइयां"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिन"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिन"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("कठिन"),
},
},
},
// empty
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte(""),
},
},
},
}
hindiStemmerFilter := NewHindiStemmerFilter()
for _, test := range tests {
actual := hindiStemmerFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %#v, got %#v", test.output, actual)
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
}
}
}

View file

@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package hi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/stop"
"github.com/blevesearch/bleve/v2/registry"
)
// StopTokenFilterConstructor builds a stop-token filter backed by the
// Hindi stop-word token map registered under StopName.
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
	tm, err := cache.TokenMapNamed(StopName)
	if err != nil {
		return nil, err
	}
	return stop.NewStopTokensFilter(tm), nil
}

// init registers the Hindi stop-word token filter.
func init() {
	if err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor); err != nil {
		panic(err)
	}
}

View file

@@ -0,0 +1,262 @@
package hi
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// StopName is the registry name of the Hindi stop-word token map and
// its stop-token filter.
const StopName = "stop_hi"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
//
// NOTE(review): many entries below look truncated or garbled (e.g. bare
// matra characters such as "ि" appearing alone, and repeated identical
// fragments) — presumably a transcription/extraction artifact. Compare
// against the upstream Lucene hindi stopwords resource and restore the
// original bytes before relying on this list.
var HindiStopWords = []byte(`# Also see http://www.opensource.org/licenses/bsd-license.html
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# This file was created by Jacques Savoy and is distributed under the BSD license.
# Note: by default this file also contains forms normalized by HindiNormalizer
# for spelling variation (see section below), such that it can be used whether or
# not you enable that feature. When adding additional entries to this list,
# please add the normalized form as well.
अंदर
अत
अपन
अपन
अपने
अभ
आदि
आप
इत्यि
इन
इनक
इन्ह
इन्हें
इन्ह
इस
इसक
इसक
इसके
इसमें
इस
इसे
उन
उनक
उनक
उनके
उनक
उन्ह
उन्हें
उन्ह
उस
उसके
उस
उसे
एक
एवं
एस
ऐसे
और
कई
कर
करत
करते
करन
करने
करें
कहते
कह
ि
ितन
िन्हें
िन्ह
ि
ि
ि
ि
िसे
कुछ
कुल
के
नस
गय
घर
जब
जह
ितन
ि
िन्हें
िन्ह
ि
िसे
धर
जैस
जैसे
तक
तब
तरह
ि
िन्हें
िन्ह
ि
िसे
थे
दब
ि
दुसर
दूसरे
द्व
नह
ियत
चे
ने
पर
पर
पहले
पूर
पे
ि
बन
बह
बहुत
िलकुल
तर
मगर
मे
में
यदि
यह
यह
यह
ि
ये
रखें
रह
रहे
ऱ्व
ि
िये
लेकि
वर्ग
वह
वह
वह
वह
ले
वुह
वे
वग़ैरह
संग
सकत
सकते
सबसे
सभ
बुत
से
हुआ
हुई
हुए
है
हैं
ते
ने
# additional normalized forms of the above
अपनि
जेसे
ि
सभि
िंह
इंह
दव
इसि
िंहें
ि
उंह
ओर
िंहें
वहि
अभि
बनि
ि
उंहि
उंहें
हें
वगेरह
एसे
रव
िचे
ि
उसि
पुर
ितर
हे
बहि
वह
यह
िंह
िंहें
िि
कइ
यहि
इंहि
िधर
इंहें
अदि
इतयि
हुइ
नस
इसकि
दुसरे
जह
अप
िंह
उनकि
ि
वरग
हुअ
जेस
नहि
`)
// TokenMapConstructor loads the Hindi stop-word list into a fresh token
// map; any parse error from LoadBytes is passed through to the caller.
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
	tm := analysis.NewTokenMap()
	err := tm.LoadBytes(HindiStopWords)
	return tm, err
}

// init registers the Hindi stop-word token map under StopName.
func init() {
	if err := registry.RegisterTokenMap(StopName, TokenMapConstructor); err != nil {
		panic(err)
	}
}