Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
parent c71cb8b61d
commit 982828099e
783 changed files with 150650 additions and 0 deletions
analysis/lang/fa/analyzer_fa.go (new file, 74 lines)
@@ -0,0 +1,74 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"

    "github.com/blevesearch/bleve/v2/analysis/char/zerowidthnonjoiner"
    "github.com/blevesearch/bleve/v2/analysis/lang/ar"
    "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
    "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)

const AnalyzerName = "fa"

func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
    zFilter, err := cache.CharFilterNamed(zerowidthnonjoiner.Name)
    if err != nil {
        return nil, err
    }
    unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
    if err != nil {
        return nil, err
    }
    normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
    if err != nil {
        return nil, err
    }
    normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
    if err != nil {
        return nil, err
    }
    toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
    if err != nil {
        return nil, err
    }
    stopFaFilter, err := cache.TokenFilterNamed(StopName)
    if err != nil {
        return nil, err
    }
    rv := analysis.DefaultAnalyzer{
        CharFilters: []analysis.CharFilter{
            zFilter,
        },
        Tokenizer: unicodeTokenizer,
        TokenFilters: []analysis.TokenFilter{
            toLowerFilter,
            normArFilter,
            normFaFilter,
            stopFaFilter,
        },
    }
    return &rv, nil
}

func init() {
    err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
    if err != nil {
        panic(err)
    }
}
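As a usage note: below is a minimal sketch of exercising the registered "fa" analyzer end to end with bleve. Only the blank import and the "fa" analyzer name come from the file above; the document ID, field name, and query text are illustrative assumptions, not part of this commit.

package main

import (
    "fmt"
    "log"

    "github.com/blevesearch/bleve/v2"

    // Imported for its side effect: the init() above registers the "fa" analyzer.
    _ "github.com/blevesearch/bleve/v2/analysis/lang/fa"
)

func main() {
    mapping := bleve.NewIndexMapping()
    mapping.DefaultAnalyzer = "fa" // AnalyzerName from this package

    // An in-memory index is enough for a quick check.
    index, err := bleve.NewMemOnly(mapping)
    if err != nil {
        log.Fatal(err)
    }

    // Hypothetical document; any struct or map works.
    if err := index.Index("doc1", map[string]string{"body": "برگها"}); err != nil {
        log.Fatal(err)
    }

    // The query text runs through the same analyzer chain.
    res, err := index.Search(bleve.NewSearchRequest(bleve.NewMatchQuery("برگ ها")))
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(res.Total)
}

Because the match query analyzes its text with the same default analyzer, the detached plural "برگ ها" should match the attached form "برگها" (both normalize to "برگ", per the tests below).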
analysis/lang/fa/analyzer_fa_test.go (new file, 684 lines)
@@ -0,0 +1,684 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "reflect"
    "testing"

    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

func TestPersianAnalyzerVerbs(t *testing.T) {
    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        // active present indicative
        {
            input: []byte("میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite indicative
        {
            input: []byte("خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active imperfective preterite indicative
        {
            input: []byte("میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active future indicative
        {
            input: []byte("خواهد خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active present progressive indicative
        {
            input: []byte("دارد میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite progressive indicative
        {
            input: []byte("داشت میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active perfect indicative
        {
            input: []byte("خوردهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective perfect indicative
        {
            input: []byte("میخوردهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect indicative
        {
            input: []byte("خورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect indicative
        {
            input: []byte("میخورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active preterite subjunctive
        {
            input: []byte("خورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective preterite subjunctive
        {
            input: []byte("میخورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect subjunctive
        {
            input: []byte("خورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect subjunctive
        {
            input: []byte("میخورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present indicative
        {
            input: []byte("خورده میشود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite indicative
        {
            input: []byte("خورده شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite indicative
        {
            input: []byte("خورده میشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive perfect indicative
        {
            input: []byte("خورده شدهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective perfect indicative
        {
            input: []byte("خورده میشدهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect indicative
        {
            input: []byte("خورده شده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect indicative
        {
            input: []byte("خورده میشده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive future indicative
        {
            input: []byte("خورده خواهد شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present progressive indicative
        {
            input: []byte("دارد خورده میشود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite progressive indicative
        {
            input: []byte("داشت خورده میشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present subjunctive
        {
            input: []byte("خورده شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite subjunctive
        {
            input: []byte("خورده شده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite subjunctive
        {
            input: []byte("خورده میشده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect subjunctive
        {
            input: []byte("خورده شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect subjunctive
        {
            input: []byte("خورده میشده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active present subjunctive
        {
            input: []byte("بخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("بخورد"),
                },
            },
        },
    }

    cache := registry.NewCache()
    analyzer, err := cache.AnalyzerNamed(AnalyzerName)
    if err != nil {
        t.Fatal(err)
    }
    for _, test := range tests {
        actual := analyzer.Analyze(test.input)
        if len(actual) != len(test.output) {
            t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
        }
        for i, tok := range actual {
            if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
                t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
            }
        }
    }
}

func TestPersianAnalyzerVerbsDefective(t *testing.T) {
    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        // active present indicative
        {
            input: []byte("مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite indicative
        {
            input: []byte("خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active imperfective preterite indicative
        {
            input: []byte("مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active future indicative
        {
            input: []byte("خواهد خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active present progressive indicative
        {
            input: []byte("دارد مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite progressive indicative
        {
            input: []byte("داشت مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active perfect indicative
        {
            input: []byte("خورده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective perfect indicative
        {
            input: []byte("مي خورده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect indicative
        {
            input: []byte("خورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect indicative
        {
            input: []byte("مي خورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active preterite subjunctive
        {
            input: []byte("خورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective preterite subjunctive
        {
            input: []byte("مي خورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect subjunctive
        {
            input: []byte("خورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect subjunctive
        {
            input: []byte("مي خورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present indicative
        {
            input: []byte("خورده مي شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite indicative
        {
            input: []byte("خورده شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite indicative
        {
            input: []byte("خورده مي شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive perfect indicative
        {
            input: []byte("خورده شده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective perfect indicative
        {
            input: []byte("خورده مي شده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect indicative
        {
            input: []byte("خورده شده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect indicative
        {
            input: []byte("خورده مي شده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive future indicative
        {
            input: []byte("خورده خواهد شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present progressive indicative
        {
            input: []byte("دارد خورده مي شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite progressive indicative
        {
            input: []byte("داشت خورده مي شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present subjunctive
        {
            input: []byte("خورده شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite subjunctive
        {
            input: []byte("خورده شده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite subjunctive
        {
            input: []byte("خورده مي شده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect subjunctive
        {
            input: []byte("خورده شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect subjunctive
        {
            input: []byte("خورده مي شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active present subjunctive
        {
            input: []byte("بخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("بخورد"),
                },
            },
        },
    }

    cache := registry.NewCache()
    analyzer, err := cache.AnalyzerNamed(AnalyzerName)
    if err != nil {
        t.Fatal(err)
    }
    for _, test := range tests {
        actual := analyzer.Analyze(test.input)
        if len(actual) != len(test.output) {
            t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
        }
        for i, tok := range actual {
            if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
                t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
            }
        }
    }
}

func TestPersianAnalyzerOthers(t *testing.T) {
    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        // nouns
        {
            input: []byte("برگ ها"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("برگ"),
                },
            },
        },
        {
            input: []byte("برگها"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("برگ"),
                },
            },
        },
        // non persian
        {
            input: []byte("English test."),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("english"),
                },
                &analysis.Token{
                    Term: []byte("test"),
                },
            },
        },
        // others
        {
            input: []byte("خورده مي شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        {
            input: []byte("برگها"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("برگ"),
                },
            },
        },
    }

    cache := registry.NewCache()
    analyzer, err := cache.AnalyzerNamed(AnalyzerName)
    if err != nil {
        t.Fatal(err)
    }
    for _, test := range tests {
        actual := analyzer.Analyze(test.input)
        if len(actual) != len(test.output) {
            t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
        }
        for i, tok := range actual {
            if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
                t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
            }
        }
    }
}
analysis/lang/fa/persian_normalize.go (new file, 80 lines)
@@ -0,0 +1,80 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "bytes"

    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

const NormalizeName = "normalize_fa"

const (
    Yeh        = '\u064A'
    FarsiYeh   = '\u06CC'
    YehBarree  = '\u06D2'
    Keheh      = '\u06A9'
    Kaf        = '\u0643'
    HamzaAbove = '\u0654'
    HehYeh     = '\u06C0'
    HehGoal    = '\u06C1'
    Heh        = '\u0647'
)

type PersianNormalizeFilter struct {
}

func NewPersianNormalizeFilter() *PersianNormalizeFilter {
    return &PersianNormalizeFilter{}
}

func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
    for _, token := range input {
        term := normalize(token.Term)
        token.Term = term
    }
    return input
}

func normalize(input []byte) []byte {
    runes := bytes.Runes(input)
    for i := 0; i < len(runes); i++ {
        switch runes[i] {
        case FarsiYeh, YehBarree:
            runes[i] = Yeh
        case Keheh:
            runes[i] = Kaf
        case HehYeh, HehGoal:
            runes[i] = Heh
        case HamzaAbove: // necessary for HEH + HAMZA
            runes = analysis.DeleteRune(runes, i)
            i--
        }
    }
    return analysis.BuildTermFromRunes(runes)
}

func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    return NewPersianNormalizeFilter(), nil
}

func init() {
    err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
    if err != nil {
        panic(err)
    }
}
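A small standalone sketch of applying the normalize filter above directly to a token stream; the token text is an assumption chosen to show the Farsi Yeh folding (the test file that follows covers the remaining mappings):

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis"
    fa "github.com/blevesearch/bleve/v2/analysis/lang/fa"
)

func main() {
    filter := fa.NewPersianNormalizeFilter()
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("های")}, // ends in Farsi Yeh U+06CC
    }
    out := filter.Filter(stream)
    fmt.Printf("%s\n", out[0].Term) // "هاي": Farsi Yeh folded to Arabic Yeh U+064A
}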
analysis/lang/fa/persian_normalize_test.go (new file, 130 lines)
@@ -0,0 +1,130 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "reflect"
    "testing"

    "github.com/blevesearch/bleve/v2/analysis"
)

func TestPersianNormalizeFilter(t *testing.T) {
    tests := []struct {
        input  analysis.TokenStream
        output analysis.TokenStream
    }{
        // FarsiYeh
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("های"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("هاي"),
                },
            },
        },
        // YehBarree
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("هاے"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("هاي"),
                },
            },
        },
        // Keheh
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("کشاندن"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كشاندن"),
                },
            },
        },
        // HehYeh
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابۀ"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابه"),
                },
            },
        },
        // HehHamzaAbove
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابهٔ"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابه"),
                },
            },
        },
        // HehGoal
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("زادہ"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("زاده"),
                },
            },
        },
        // empty
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte(""),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte(""),
                },
            },
        },
    }

    persianNormalizeFilter := NewPersianNormalizeFilter()
    for _, test := range tests {
        actual := persianNormalizeFilter.Filter(test.input)
        if !reflect.DeepEqual(actual, test.output) {
            t.Errorf("expected %#v, got %#v", test.output, actual)
            t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
        }
    }
}
analysis/lang/fa/stop_filter_fa.go (new file, 36 lines)
@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/analysis/token/stop"
    "github.com/blevesearch/bleve/v2/registry"
)

func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    tokenMap, err := cache.TokenMapNamed(StopName)
    if err != nil {
        return nil, err
    }
    return stop.NewStopTokensFilter(tokenMap), nil
}

func init() {
    err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
    if err != nil {
        panic(err)
    }
}
analysis/lang/fa/stop_words_fa.go (new file, 340 lines)
@@ -0,0 +1,340 @@
package fa

import (
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

const StopName = "stop_fa"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string

var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Note: by default this file is used after normalization, so when adding entries
# to this file, use the arabic 'ي' instead of 'ی'
انان
نداشته
سراسر
خياه
ايشان
وي
تاكنون
بيشتري
دوم
پس
ناشي
وگو
يا
داشتند
سپس
هنگام
هرگز
پنج
نشان
امسال
ديگر
گروهي
شدند
چطور
ده
و
دو
نخستين
ولي
چرا
چه
وسط
ه
كدام
قابل
يك
رفت
هفت
همچنين
در
هزار
بله
بلي
شايد
اما
شناسي
گرفته
دهد
داشته
دانست
داشتن
خواهيم
ميليارد
وقتيكه
امد
خواهد
جز
اورده
شده
بلكه
خدمات
شدن
برخي
نبود
بسياري
جلوگيري
حق
كردند
نوعي
بعري
نكرده
نظير
نبايد
بوده
بودن
داد
اورد
هست
جايي
شود
دنبال
داده
بايد
سابق
هيچ
همان
انجا
كمتر
كجاست
گردد
كسي
تر
مردم
تان
دادن
بودند
سري
جدا
ندارند
مگر
يكديگر
دارد
دهند
بنابراين
هنگامي
سمت
جا
انچه
خود
دادند
زياد
دارند
اثر
بدون
بهترين
بيشتر
البته
به
براساس
بيرون
كرد
بعضي
گرفت
توي
اي
ميليون
او
جريان
تول
بر
مانند
برابر
باشيم
مدتي
گويند
اكنون
تا
تنها
جديد
چند
بي
نشده
كردن
كردم
گويد
كرده
كنيم
نمي
نزد
روي
قصد
فقط
بالاي
ديگران
اين
ديروز
توسط
سوم
ايم
دانند
سوي
استفاده
شما
كنار
داريم
ساخته
طور
امده
رفته
نخست
بيست
نزديك
طي
كنيد
از
انها
تمامي
داشت
يكي
طريق
اش
چيست
روب
نمايد
گفت
چندين
چيزي
تواند
ام
ايا
با
ان
ايد
ترين
اينكه
ديگري
راه
هايي
بروز
همچنان
پاعين
كس
حدود
مختلف
مقابل
چيز
گيرد
ندارد
ضد
همچون
سازي
شان
مورد
باره
مرسي
خويش
برخوردار
چون
خارج
شش
هنوز
تحت
ضمن
هستيم
گفته
فكر
بسيار
پيش
براي
روزهاي
انكه
نخواهد
بالا
كل
وقتي
كي
چنين
كه
گيري
نيست
است
كجا
كند
نيز
يابد
بندي
حتي
توانند
عقب
خواست
كنند
بين
تمام
همه
ما
باشند
مثل
شد
اري
باشد
اره
طبق
بعد
اگر
صورت
غير
جاي
بيش
ريزي
اند
زيرا
چگونه
بار
لطفا
مي
درباره
من
ديده
همين
گذاري
برداري
علت
گذاشته
هم
فوق
نه
ها
شوند
اباد
همواره
هر
اول
خواهند
چهار
نام
امروز
مان
هاي
قبل
كنم
سعي
تازه
را
هستند
زير
جلوي
عنوان
بود
`)

func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
    rv := analysis.NewTokenMap()
    err := rv.LoadBytes(PersianStopWords)
    return rv, err
}

func init() {
    err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
    if err != nil {
        panic(err)
    }
}
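Finally, a short sketch of resolving the stop filter by name from a registry cache (the same lookup the analyzer constructor performs) and applying it directly; the sample tokens are assumptions chosen from the list above:

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis"
    fa "github.com/blevesearch/bleve/v2/analysis/lang/fa"
    "github.com/blevesearch/bleve/v2/registry"
)

func main() {
    cache := registry.NewCache()
    stopFilter, err := cache.TokenFilterNamed(fa.StopName)
    if err != nil {
        panic(err)
    }
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("از")},  // stop word, should be dropped
        &analysis.Token{Term: []byte("برگ")}, // regular term, should survive
    }
    for _, tok := range stopFilter.Filter(stream) {
        fmt.Printf("%s\n", tok.Term)
    }
}

Note that the stop list is applied after normalization in the analyzer chain, which is why the file's header comment asks for the Arabic 'ي' rather than the Farsi 'ی' in new entries.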