Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
71
analysis/lang/hi/analyzer_hi.go
Normal file
71
analysis/lang/hi/analyzer_hi.go
Normal file
|
@ -0,0 +1,71 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/lang/in"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
const AnalyzerName = "hi"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
indicNormalizeFilter, err := cache.TokenFilterNamed(in.NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
hindiNormalizeFilter, err := cache.TokenFilterNamed(NormalizeName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopHiFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerHiFilter, err := cache.TokenFilterNamed(StemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
indicNormalizeFilter,
|
||||
hindiNormalizeFilter,
|
||||
stopHiFilter,
|
||||
stemmerHiFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
66
analysis/lang/hi/analyzer_hi_test.go
Normal file
66
analysis/lang/hi/analyzer_hi_test.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestHindiAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// two ways to write 'hindi' itself
|
||||
{
|
||||
input: []byte("हिन्दी"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("हिंद"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 18,
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("हिंदी"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("हिंद"),
|
||||
Position: 1,
|
||||
Start: 0,
|
||||
End: 15,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %v, got %v", test.output, actual)
|
||||
}
|
||||
}
|
||||
}
|
141
analysis/lang/hi/hindi_normalize.go
Normal file
141
analysis/lang/hi/hindi_normalize.go
Normal file
|
@ -0,0 +1,141 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const NormalizeName = "normalize_hi"
|
||||
|
||||
// HindiNormalizeFilter normalizes Hindi orthographic variants in each
// token's term (nukta removal, vowel shortening, virama/ZWJ deletion,
// and related rewrites performed by normalize).
type HindiNormalizeFilter struct {
}

// NewHindiNormalizeFilter returns a new, stateless HindiNormalizeFilter.
func NewHindiNormalizeFilter() *HindiNormalizeFilter {
	return &HindiNormalizeFilter{}
}
|
||||
|
||||
func (s *HindiNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
term := normalize(token.Term)
|
||||
token.Term = term
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func normalize(input []byte) []byte {
|
||||
runes := bytes.Runes(input)
|
||||
for i := 0; i < len(runes); i++ {
|
||||
switch runes[i] {
|
||||
// dead n -> bindu
|
||||
case '\u0928':
|
||||
if i+1 < len(runes) && runes[i+1] == '\u094D' {
|
||||
runes[i] = '\u0902'
|
||||
runes = analysis.DeleteRune(runes, i+1)
|
||||
}
|
||||
// candrabindu -> bindu
|
||||
case '\u0901':
|
||||
runes[i] = '\u0902'
|
||||
// nukta deletions
|
||||
case '\u093C':
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
case '\u0929':
|
||||
runes[i] = '\u0928'
|
||||
case '\u0931':
|
||||
runes[i] = '\u0930'
|
||||
case '\u0934':
|
||||
runes[i] = '\u0933'
|
||||
case '\u0958':
|
||||
runes[i] = '\u0915'
|
||||
case '\u0959':
|
||||
runes[i] = '\u0916'
|
||||
case '\u095A':
|
||||
runes[i] = '\u0917'
|
||||
case '\u095B':
|
||||
runes[i] = '\u091C'
|
||||
case '\u095C':
|
||||
runes[i] = '\u0921'
|
||||
case '\u095D':
|
||||
runes[i] = '\u0922'
|
||||
case '\u095E':
|
||||
runes[i] = '\u092B'
|
||||
case '\u095F':
|
||||
runes[i] = '\u092F'
|
||||
// zwj/zwnj -> delete
|
||||
case '\u200D', '\u200C':
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
// virama -> delete
|
||||
case '\u094D':
|
||||
runes = analysis.DeleteRune(runes, i)
|
||||
i--
|
||||
// chandra/short -> replace
|
||||
case '\u0945', '\u0946':
|
||||
runes[i] = '\u0947'
|
||||
case '\u0949', '\u094A':
|
||||
runes[i] = '\u094B'
|
||||
case '\u090D', '\u090E':
|
||||
runes[i] = '\u090F'
|
||||
case '\u0911', '\u0912':
|
||||
runes[i] = '\u0913'
|
||||
case '\u0972':
|
||||
runes[i] = '\u0905'
|
||||
// long -> short ind. vowels
|
||||
case '\u0906':
|
||||
runes[i] = '\u0905'
|
||||
case '\u0908':
|
||||
runes[i] = '\u0907'
|
||||
case '\u090A':
|
||||
runes[i] = '\u0909'
|
||||
case '\u0960':
|
||||
runes[i] = '\u090B'
|
||||
case '\u0961':
|
||||
runes[i] = '\u090C'
|
||||
case '\u0910':
|
||||
runes[i] = '\u090F'
|
||||
case '\u0914':
|
||||
runes[i] = '\u0913'
|
||||
// long -> short dep. vowels
|
||||
case '\u0940':
|
||||
runes[i] = '\u093F'
|
||||
case '\u0942':
|
||||
runes[i] = '\u0941'
|
||||
case '\u0944':
|
||||
runes[i] = '\u0943'
|
||||
case '\u0963':
|
||||
runes[i] = '\u0962'
|
||||
case '\u0948':
|
||||
runes[i] = '\u0947'
|
||||
case '\u094C':
|
||||
runes[i] = '\u094B'
|
||||
}
|
||||
}
|
||||
return analysis.BuildTermFromRunes(runes)
|
||||
}
|
||||
|
||||
func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewHindiNormalizeFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
251
analysis/lang/hi/hindi_normalize_test.go
Normal file
251
analysis/lang/hi/hindi_normalize_test.go
Normal file
|
@ -0,0 +1,251 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestHindiNormalizeFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// basics
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँगरेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँगरेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँग्रेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अँग्रेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंग्रेज़ी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंग्रेजी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अंगरेजि"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// test decompositions
|
||||
// removing nukta dot
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("क़िताब"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("फ़र्ज़"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("फरज"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("क़र्ज़"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("करज"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// some other composed nukta forms
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ऱऴख़ग़ड़ढ़य़"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("रळखगडढय"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// removal of format (ZWJ/ZWNJ)
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शार्मा"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शारमा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शार्मा"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("शारमा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// removal of chandra
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ॅॆॉॊऍऎऑऒ\u0972"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("ेेोोएएओओअ"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// vowel shortening
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आईऊॠॡऐऔीूॄॣैौ"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("अइउऋऌएओिुृॢेो"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
hindiNormalizeFilter := NewHindiNormalizeFilter()
|
||||
for _, test := range tests {
|
||||
actual := hindiNormalizeFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
152
analysis/lang/hi/hindi_stemmer_filter.go
Normal file
152
analysis/lang/hi/hindi_stemmer_filter.go
Normal file
|
@ -0,0 +1,152 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
const StemmerName = "stemmer_hi"
|
||||
|
||||
type HindiStemmerFilter struct {
|
||||
}
|
||||
|
||||
func NewHindiStemmerFilter() *HindiStemmerFilter {
|
||||
return &HindiStemmerFilter{}
|
||||
}
|
||||
|
||||
func (s *HindiStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
// if not protected keyword, stem it
|
||||
if !token.KeyWord {
|
||||
stemmed := stem(token.Term)
|
||||
token.Term = stemmed
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func stem(input []byte) []byte {
|
||||
inputLen := utf8.RuneCount(input)
|
||||
|
||||
// 5
|
||||
if inputLen > 6 &&
|
||||
(bytes.HasSuffix(input, []byte("ाएंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाएंगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ाऊंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाऊंगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइयाँ")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइयों")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइयां"))) {
|
||||
return analysis.TruncateRunes(input, 5)
|
||||
}
|
||||
|
||||
// 4
|
||||
if inputLen > 5 &&
|
||||
(bytes.HasSuffix(input, []byte("ाएगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाएगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओगे")) ||
|
||||
bytes.HasSuffix(input, []byte("एंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ेंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("एंगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ेंगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ूंगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ूंगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ातीं")) ||
|
||||
bytes.HasSuffix(input, []byte("नाओं")) ||
|
||||
bytes.HasSuffix(input, []byte("नाएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ताओं")) ||
|
||||
bytes.HasSuffix(input, []byte("ताएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ियाँ")) ||
|
||||
bytes.HasSuffix(input, []byte("ियों")) ||
|
||||
bytes.HasSuffix(input, []byte("ियां"))) {
|
||||
return analysis.TruncateRunes(input, 4)
|
||||
}
|
||||
|
||||
// 3
|
||||
if inputLen > 4 &&
|
||||
(bytes.HasSuffix(input, []byte("ाकर")) ||
|
||||
bytes.HasSuffix(input, []byte("ाइए")) ||
|
||||
bytes.HasSuffix(input, []byte("ाईं")) ||
|
||||
bytes.HasSuffix(input, []byte("ाया")) ||
|
||||
bytes.HasSuffix(input, []byte("ेगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ेगा")) ||
|
||||
bytes.HasSuffix(input, []byte("ोगी")) ||
|
||||
bytes.HasSuffix(input, []byte("ोगे")) ||
|
||||
bytes.HasSuffix(input, []byte("ाने")) ||
|
||||
bytes.HasSuffix(input, []byte("ाना")) ||
|
||||
bytes.HasSuffix(input, []byte("ाते")) ||
|
||||
bytes.HasSuffix(input, []byte("ाती")) ||
|
||||
bytes.HasSuffix(input, []byte("ाता")) ||
|
||||
bytes.HasSuffix(input, []byte("तीं")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओं")) ||
|
||||
bytes.HasSuffix(input, []byte("ाएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ुओं")) ||
|
||||
bytes.HasSuffix(input, []byte("ुएं")) ||
|
||||
bytes.HasSuffix(input, []byte("ुआं"))) {
|
||||
return analysis.TruncateRunes(input, 3)
|
||||
}
|
||||
|
||||
// 2
|
||||
if inputLen > 3 &&
|
||||
(bytes.HasSuffix(input, []byte("कर")) ||
|
||||
bytes.HasSuffix(input, []byte("ाओ")) ||
|
||||
bytes.HasSuffix(input, []byte("िए")) ||
|
||||
bytes.HasSuffix(input, []byte("ाई")) ||
|
||||
bytes.HasSuffix(input, []byte("ाए")) ||
|
||||
bytes.HasSuffix(input, []byte("ने")) ||
|
||||
bytes.HasSuffix(input, []byte("नी")) ||
|
||||
bytes.HasSuffix(input, []byte("ना")) ||
|
||||
bytes.HasSuffix(input, []byte("ते")) ||
|
||||
bytes.HasSuffix(input, []byte("ीं")) ||
|
||||
bytes.HasSuffix(input, []byte("ती")) ||
|
||||
bytes.HasSuffix(input, []byte("ता")) ||
|
||||
bytes.HasSuffix(input, []byte("ाँ")) ||
|
||||
bytes.HasSuffix(input, []byte("ां")) ||
|
||||
bytes.HasSuffix(input, []byte("ों")) ||
|
||||
bytes.HasSuffix(input, []byte("ें"))) {
|
||||
return analysis.TruncateRunes(input, 2)
|
||||
}
|
||||
|
||||
// 1
|
||||
if inputLen > 2 &&
|
||||
(bytes.HasSuffix(input, []byte("ो")) ||
|
||||
bytes.HasSuffix(input, []byte("े")) ||
|
||||
bytes.HasSuffix(input, []byte("ू")) ||
|
||||
bytes.HasSuffix(input, []byte("ु")) ||
|
||||
bytes.HasSuffix(input, []byte("ी")) ||
|
||||
bytes.HasSuffix(input, []byte("ि")) ||
|
||||
bytes.HasSuffix(input, []byte("ा"))) {
|
||||
return analysis.TruncateRunes(input, 1)
|
||||
}
|
||||
|
||||
return input
|
||||
}
|
||||
|
||||
func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewHindiStemmerFilter(), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
308
analysis/lang/hi/hindi_stemmer_filter_test.go
Normal file
308
analysis/lang/hi/hindi_stemmer_filter_test.go
Normal file
|
@ -0,0 +1,308 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
)
|
||||
|
||||
func TestHindiStemmerFilter(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// masc noun inflections
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडका"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडके"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडकों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुरु"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुर"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुरुओं"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("गुर"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्त"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्त"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्तों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("दोस्त"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// feminine noun inflections
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडकी"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडकियों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("लडक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताबें"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताबों"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("किताब"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीका"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीकाएं"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीकाओं"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("आध्यापीक"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// some verb forms
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खाना"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खाता"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खाती"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("खा"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// exceptions
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिनाइयां"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिन"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिन"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("कठिन"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// empty
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte(""),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
hindiStemmerFilter := NewHindiStemmerFilter()
|
||||
for _, test := range tests {
|
||||
actual := hindiStemmerFilter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %#v, got %#v", test.output, actual)
|
||||
t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
36
analysis/lang/hi/stop_filter_hi.go
Normal file
36
analysis/lang/hi/stop_filter_hi.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2014 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package hi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
262
analysis/lang/hi/stop_words_hi.go
Normal file
262
analysis/lang/hi/stop_words_hi.go
Normal file
|
@ -0,0 +1,262 @@
|
|||
package hi
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopName is the registry name shared by the Hindi stop word token map
// and the stop token filter built from it.
const StopName = "stop_hi"

// HindiStopWords is the raw stop word list, one word per line, in the
// line format accepted by analysis.TokenMap.LoadBytes ('#' starts a
// comment line).
//
// This content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string
var HindiStopWords = []byte(`# Also see http://www.opensource.org/licenses/bsd-license.html
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# This file was created by Jacques Savoy and is distributed under the BSD license.
# Note: by default this file also contains forms normalized by HindiNormalizer
# for spelling variation (see section below), such that it can be used whether or
# not you enable that feature. When adding additional entries to this list,
# please add the normalized form as well.
अंदर
अत
अपना
अपनी
अपने
अभी
आदि
आप
इत्यादि
इन
इनका
इन्हीं
इन्हें
इन्हों
इस
इसका
इसकी
इसके
इसमें
इसी
इसे
उन
उनका
उनकी
उनके
उनको
उन्हीं
उन्हें
उन्हों
उस
उसके
उसी
उसे
एक
एवं
एस
ऐसे
और
कई
कर
करता
करते
करना
करने
करें
कहते
कहा
का
काफ़ी
कि
कितना
किन्हें
किन्हों
किया
किर
किस
किसी
किसे
की
कुछ
कुल
के
को
कोई
कौन
कौनसा
गया
घर
जब
जहाँ
जा
जितना
जिन
जिन्हें
जिन्हों
जिस
जिसे
जीधर
जैसा
जैसे
जो
तक
तब
तरह
तिन
तिन्हें
तिन्हों
तिस
तिसे
तो
था
थी
थे
दबारा
दिया
दुसरा
दूसरे
दो
द्वारा
न
नहीं
ना
निहायत
नीचे
ने
पर
पर
पहले
पूरा
पे
फिर
बनी
बही
बहुत
बाद
बाला
बिलकुल
भी
भीतर
मगर
मानो
मे
में
यदि
यह
यहाँ
यही
या
यिह
ये
रखें
रहा
रहे
ऱ्वासा
लिए
लिये
लेकिन
व
वर्ग
वह
वह
वहाँ
वहीं
वाले
वुह
वे
वग़ैरह
संग
सकता
सकते
सबसे
सभी
साथ
साबुत
साभ
सारा
से
सो
ही
हुआ
हुई
हुए
है
हैं
हो
होता
होती
होते
होना
होने
# additional normalized forms of the above
अपनि
जेसे
होति
सभि
तिंहों
इंहों
दवारा
इसि
किंहें
थि
उंहों
ओर
जिंहें
वहिं
अभि
बनि
हि
उंहिं
उंहें
हें
वगेरह
एसे
रवासा
कोन
निचे
काफि
उसि
पुरा
भितर
हे
बहि
वहां
कोइ
यहां
जिंहों
तिंहें
किसि
कइ
यहि
इंहिं
जिधर
इंहें
अदि
इतयादि
हुइ
कोनसा
इसकि
दुसरे
जहां
अप
किंहों
उनकि
भि
वरग
हुअ
जेसा
नहिं
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(HindiStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue