Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
71
analysis/lang/hi/analyzer_hi.go
Normal file
71
analysis/lang/hi/analyzer_hi.go
Normal file
|
@ -0,0 +1,71 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/lang/in"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "hi"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
indicNormalizeFilter, err := cache.TokenFilterNamed(in.NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
hindiNormalizeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopHiFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerHiFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
indicNormalizeFilter,
|
||||
hindiNormalizeFilter,
|
||||
stopHiFilter,
|
||||
stemmerHiFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
66
analysis/lang/hi/analyzer_hi_test.go
Normal file
66
analysis/lang/hi/analyzer_hi_test.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestHindiAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// two ways to write 'hindi' itself
|
||||
{
|
||||
input: []byte("हिन्दी"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("हिंद"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("हिंदी"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("हिंद"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 15,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
141
analysis/lang/hi/hindi_normalize.go
Normal file
141
analysis/lang/hi/hindi_normalize.go
Normal file
|
@ -0,0 +1,141 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_hi"
|
||||
|
||||
// HindiNormalizeFilter normalizes Hindi orthographic variants in each
// token's term (nukta removal, vowel shortening, virama/ZWJ deletion,
// and related rewrites performed by normalize).
type HindiNormalizeFilter struct {
}

// NewHindiNormalizeFilter returns a new, stateless HindiNormalizeFilter.
func NewHindiNormalizeFilter() *HindiNormalizeFilter {
	return &HindiNormalizeFilter{}
}
|
||||
|
||||
func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
// dead n -> bindu
|
||||
case '\u0928':
|
||||
if i+1 < len(runes) && runes[i+1] == '\u094D' {
|
||||
runes[i] = '\u0902'
|
||||
runes = analysis.DeleteRune(runes, i+1)
|
||||
}
|
||||
// candrabindu -> bindu
|
||||
case '\u0901':
|
||||
runes[i] = '\u0902'
|
||||
// nukta deletions
|
||||
case '\u093C':
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
case '\u0929':
|
||||
runes[i] = '\u0928'
|
||||
case '\u0931':
|
||||
runes[i] = '\u0930'
|
||||
case '\u0934':
|
||||
runes[i] = '\u0933'
|
||||
case '\u0958':
|
||||
runes[i] = '\u0915'
|
||||
case '\u0959':
|
||||
runes[i] = '\u0916'
|
||||
case '\u095A':
|
||||
runes[i] = '\u0917'
|
||||
case '\u095B':
|
||||
runes[i] = '\u091C'
|
||||
case '\u095C':
|
||||
runes[i] = '\u0921'
|
||||
case '\u095D':
|
||||
runes[i] = '\u0922'
|
||||
case '\u095E':
|
||||
runes[i] = '\u092B'
|
||||
case '\u095F':
|
||||
runes[i] = '\u092F'
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D', '\u200C':
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
// virama -> delete
|
||||
case '\u094D':
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
// chandra/short -> replace
|
||||
case '\u0945', '\u0946':
|
||||
runes[i] = '\u0947'
|
||||
case '\u0949', '\u094A':
|
||||
runes[i] = '\u094B'
|
||||
case '\u090D', '\u090E':
|
||||
runes[i] = '\u090F'
|
||||
case '\u0911', '\u0912':
|
||||
runes[i] = '\u0913'
|
||||
case '\u0972':
|
||||
runes[i] = '\u0905'
|
||||
// long -> short ind. vowels
|
||||
case '\u0906':
|
||||
runes[i] = '\u0905'
|
||||
case '\u0908':
|
||||
runes[i] = '\u0907'
|
||||
case '\u090A':
|
||||
runes[i] = '\u0909'
|
||||
case '\u0960':
|
||||
runes[i] = '\u090B'
|
||||
case '\u0961':
|
||||
runes[i] = '\u090C'
|
||||
case '\u0910':
|
||||
runes[i] = '\u090F'
|
||||
case '\u0914':
|
||||
runes[i] = '\u0913'
|
||||
// long -> short dep. vowels
|
||||
case '\u0940':
|
||||
runes[i] = '\u093F'
|
||||
case '\u0942':
|
||||
runes[i] = '\u0941'
|
||||
case '\u0944':
|
||||
runes[i] = '\u0943'
|
||||
case '\u0963':
|
||||
runes[i] = '\u0962'
|
||||
case '\u0948':
|
||||
runes[i] = '\u0947'
|
||||
case '\u094C':
|
||||
runes[i] = '\u094B'
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewHindiNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
251
analysis/lang/hi/hindi_normalize_test.go
Normal file
251
analysis/lang/hi/hindi_normalize_test.go
Normal file
|
@ -0,0 +1,251 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestHindiNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// basics
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँगरेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँगरेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँग्रेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँग्रेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंग्रेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंग्रेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test decompositions
|
||||
// removing nukta dot
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("क़िताब"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("फ़र्ज़"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("फरज"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("क़र्ज़"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("करज"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// some other composed nukta forms
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ऱऴख़ग़ड़ढ़य़"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("रळखगडढय"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// removal of format (ZWJ/ZWNJ)
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शार्मा"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शारमा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शार्मा"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शारमा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// removal of chandra
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ॅॆॉॊऍऎऑऒ\u0972"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ेेोोएएओओअ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// vowel shortening
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आईऊॠॡऐऔीूॄॣैौ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अइउऋऌएओिुृॢेो"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
hindiNormalizeFilter := NewHindiNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := hindiNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
152
analysis/lang/hi/hindi_stemmer_filter.go
Normal file
152
analysis/lang/hi/hindi_stemmer_filter.go
Normal file
|
@ -0,0 +1,152 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_hi"
|
||||
|
||||
type HindiStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewHindiStemmerFilter() *HindiStemmerFilter {
|
||||
return &HindiStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if not protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
stemmed := stem(token.Term)
|
||||
token.Term = stemmed
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func stem(input []byte) []byte {
|
||||
inputLen := utf8.RuneCount(input)
|
||||
|
||||
// 5
|
||||
if inputLen > 6 &&
|
||||
(bytes.HasSuffix(input, []byte("ाएंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाएंगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ाऊंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाऊंगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइयाँ")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइयों")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइयां"))) {
|
||||
return analysis.TruncateRunes(input, 5)
|
||||
}
|
||||
|
||||
// 4
|
||||
if inputLen > 5 &&
|
||||
(bytes.HasSuffix(input, []byte("ाएगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाएगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओगे")) ||
|
||||
bytes.HasSuffix(input, []byte("एंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ेंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("एंगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ेंगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ूंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ूंगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ातीं")) ||
|
||||
bytes.HasSuffix(input, []byte("नाओं")) ||
|
||||
bytes.HasSuffix(input, []byte("नाएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ताओं")) ||
|
||||
bytes.HasSuffix(input, []byte("ताएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ियाँ")) ||
|
||||
bytes.HasSuffix(input, []byte("ियों")) ||
|
||||
bytes.HasSuffix(input, []byte("ियां"))) {
|
||||
return analysis.TruncateRunes(input, 4)
|
||||
}
|
||||
|
||||
// 3
|
||||
if inputLen > 4 &&
|
||||
(bytes.HasSuffix(input, []byte("ाकर")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइए")) ||
|
||||
bytes.HasSuffix(input, []byte("ाईं")) ||
|
||||
bytes.HasSuffix(input, []byte("ाया")) ||
|
||||
bytes.HasSuffix(input, []byte("ेगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ेगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ोगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ोगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ाने")) ||
|
||||
bytes.HasSuffix(input, []byte("ाना")) ||
|
||||
bytes.HasSuffix(input, []byte("ाते")) ||
|
||||
bytes.HasSuffix(input, []byte("ाती")) ||
|
||||
bytes.HasSuffix(input, []byte("ाता")) ||
|
||||
bytes.HasSuffix(input, []byte("तीं")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओं")) ||
|
||||
bytes.HasSuffix(input, []byte("ाएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ुओं")) ||
|
||||
bytes.HasSuffix(input, []byte("ुएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ुआं"))) {
|
||||
return analysis.TruncateRunes(input, 3)
|
||||
}
|
||||
|
||||
// 2
|
||||
if inputLen > 3 &&
|
||||
(bytes.HasSuffix(input, []byte("कर")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओ")) ||
|
||||
bytes.HasSuffix(input, []byte("िए")) ||
|
||||
bytes.HasSuffix(input, []byte("ाई")) ||
|
||||
bytes.HasSuffix(input, []byte("ाए")) ||
|
||||
bytes.HasSuffix(input, []byte("ने")) ||
|
||||
bytes.HasSuffix(input, []byte("नी")) ||
|
||||
bytes.HasSuffix(input, []byte("ना")) ||
|
||||
bytes.HasSuffix(input, []byte("ते")) ||
|
||||
bytes.HasSuffix(input, []byte("ीं")) ||
|
||||
bytes.HasSuffix(input, []byte("ती")) ||
|
||||
bytes.HasSuffix(input, []byte("ता")) ||
|
||||
bytes.HasSuffix(input, []byte("ाँ")) ||
|
||||
bytes.HasSuffix(input, []byte("ां")) ||
|
||||
bytes.HasSuffix(input, []byte("ों")) ||
|
||||
bytes.HasSuffix(input, []byte("ें"))) {
|
||||
return analysis.TruncateRunes(input, 2)
|
||||
}
|
||||
|
||||
// 1
|
||||
if inputLen > 2 &&
|
||||
(bytes.HasSuffix(input, []byte("ो")) ||
|
||||
bytes.HasSuffix(input, []byte("े")) ||
|
||||
bytes.HasSuffix(input, []byte("ू")) ||
|
||||
bytes.HasSuffix(input, []byte("ु")) ||
|
||||
bytes.HasSuffix(input, []byte("ी")) ||
|
||||
bytes.HasSuffix(input, []byte("ि")) ||
|
||||
bytes.HasSuffix(input, []byte("ा"))) {
|
||||
return analysis.TruncateRunes(input, 1)
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewHindiStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
308
analysis/lang/hi/hindi_stemmer_filter_test.go
Normal file
308
analysis/lang/hi/hindi_stemmer_filter_test.go
Normal file
|
@ -0,0 +1,308 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestHindiStemmerFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// masc noun inflections
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडका"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडके"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडकों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुरु"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुर"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुरुओं"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुर"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्त"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्त"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्तों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्त"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// feminine noun inflections
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडकी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडकियों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताबें"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताबों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीका"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीकाएं"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीकाओं"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// some verb forms
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खाना"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खाता"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खाती"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// exceptions
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिनाइयां"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिन"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिन"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिन"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
hindiStemmerFilter := NewHindiStemmerFilter()
|
||||
for _, test := range tests {
|
||||
actual := hindiStemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/hi/stop_filter_hi.go
Normal file
36
analysis/lang/hi/stop_filter_hi.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
262
analysis/lang/hi/stop_words_hi.go
Normal file
262
analysis/lang/hi/stop_words_hi.go
Normal file
|
@ -0,0 +1,262 @@
|
|||
package hi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopName is the registry name shared by the Hindi stop word token map
// and the stop token filter built from it.
const StopName = "stop_hi"

// HindiStopWords is the raw stop word list, one word per line, in the
// line format accepted by analysis.TokenMap.LoadBytes ('#' starts a
// comment line).
//
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var HindiStopWords = []byte(`# Also see http://www.opensource.org/licenses/bsd-license.html
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# This file was created by Jacques Savoy and is distributed under the BSD license.
# Note: by default this file also contains forms normalized by HindiNormalizer
# for spelling variation (see section below), such that it can be used whether or
# not you enable that feature. When adding additional entries to this list,
# please add the normalized form as well.
अंदर
अत
अपना
अपनी
अपने
अभी
आदि
आप
इत्यादि
इन
इनका
इन्हीं
इन्हें
इन्हों
इस
इसका
इसकी
इसके
इसमें
इसी
इसे
उन
उनका
उनकी
उनके
उनको
उन्हीं
उन्हें
उन्हों
उस
उसके
उसी
उसे
एक
एवं
एस
ऐसे
और
कई
कर
करता
करते
करना
करने
करें
कहते
कहा
का
काफ़ी
कि
कितना
किन्हें
किन्हों
किया
किर
किस
किसी
किसे
की
कुछ
कुल
के
को
कोई
कौन
कौनसा
गया
घर
जब
जहाँ
जा
जितना
जिन
जिन्हें
जिन्हों
जिस
जिसे
जीधर
जैसा
जैसे
जो
तक
तब
तरह
तिन
तिन्हें
तिन्हों
तिस
तिसे
तो
था
थी
थे
दबारा
दिया
दुसरा
दूसरे
दो
द्वारा
न
नहीं
ना
निहायत
नीचे
ने
पर
पर
पहले
पूरा
पे
फिर
बनी
बही
बहुत
बाद
बाला
बिलकुल
भी
भीतर
मगर
मानो
मे
में
यदि
यह
यहाँ
यही
या
यिह
ये
रखें
रहा
रहे
ऱ्वासा
लिए
लिये
लेकिन
व
वर्ग
वह
वह
वहाँ
वहीं
वाले
वुह
वे
वग़ैरह
संग
सकता
सकते
सबसे
सभी
साथ
साबुत
साभ
सारा
से
सो
ही
हुआ
हुई
हुए
है
हैं
हो
होता
होती
होते
होना
होने
# additional normalized forms of the above
अपनि
जेसे
होति
सभि
तिंहों
इंहों
दवारा
इसि
किंहें
थि
उंहों
ओर
जिंहें
वहिं
अभि
बनि
हि
उंहिं
उंहें
हें
वगेरह
एसे
रवासा
कोन
निचे
काफि
उसि
पुरा
भितर
हे
बहि
वहां
कोइ
यहां
जिंहों
तिंहें
किसि
कइ
यहि
इंहिं
जिधर
इंहें
अदि
इतयादि
हुइ
कोनसा
इसकि
दुसरे
जहां
अप
किंहों
उनकि
भि
वरग
हुअ
जेसा
नहिं
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(HindiStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue