Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
parent c71cb8b61d
commit 982828099e
783 changed files with 150650 additions and 0 deletions
analysis/lang/fa/analyzer_fa.go (new file, 74 lines)
@@ -0,0 +1,74 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"

    "github.com/blevesearch/bleve/v2/analysis/char/zerowidthnonjoiner"
    "github.com/blevesearch/bleve/v2/analysis/lang/ar"
    "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
    "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
)

const AnalyzerName = "fa"

func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
    zFilter, err := cache.CharFilterNamed(zerowidthnonjoiner.Name)
    if err != nil {
        return nil, err
    }
    unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
    if err != nil {
        return nil, err
    }
    normArFilter, err := cache.TokenFilterNamed(ar.NormalizeName)
    if err != nil {
        return nil, err
    }
    normFaFilter, err := cache.TokenFilterNamed(NormalizeName)
    if err != nil {
        return nil, err
    }
    toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
    if err != nil {
        return nil, err
    }
    stopFaFilter, err := cache.TokenFilterNamed(StopName)
    if err != nil {
        return nil, err
    }
    rv := analysis.DefaultAnalyzer{
        CharFilters: []analysis.CharFilter{
            zFilter,
        },
        Tokenizer: unicodeTokenizer,
        TokenFilters: []analysis.TokenFilter{
            toLowerFilter,
            normArFilter,
            normFaFilter,
            stopFaFilter,
        },
    }
    return &rv, nil
}

func init() {
    err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
    if err != nil {
        panic(err)
    }
}
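As a usage note: below is a minimal sketch of exercising the registered "fa" analyzer end to end with bleve. Only the blank import and the "fa" analyzer name come from the file above; the document ID, field name, and query text are illustrative assumptions, not part of this commit.

package main

import (
    "fmt"
    "log"

    "github.com/blevesearch/bleve/v2"

    // Imported for its side effect: the init() above registers the "fa" analyzer.
    _ "github.com/blevesearch/bleve/v2/analysis/lang/fa"
)

func main() {
    mapping := bleve.NewIndexMapping()
    mapping.DefaultAnalyzer = "fa" // AnalyzerName from this package

    // An in-memory index is enough for a quick check.
    index, err := bleve.NewMemOnly(mapping)
    if err != nil {
        log.Fatal(err)
    }

    // Hypothetical document; any struct or map works.
    if err := index.Index("doc1", map[string]string{"body": "برگها"}); err != nil {
        log.Fatal(err)
    }

    // The query text runs through the same analyzer chain.
    res, err := index.Search(bleve.NewSearchRequest(bleve.NewMatchQuery("برگ ها")))
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(res.Total)
}

Because the match query analyzes its text with the same default analyzer, the detached plural "برگ ها" should match the attached form "برگها" (both normalize to "برگ", per the tests below).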
analysis/lang/fa/analyzer_fa_test.go (new file, 684 lines)
@@ -0,0 +1,684 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "reflect"
    "testing"

    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

func TestPersianAnalyzerVerbs(t *testing.T) {
    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        // active present indicative
        {
            input: []byte("میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite indicative
        {
            input: []byte("خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active imperfective preterite indicative
        {
            input: []byte("میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active future indicative
        {
            input: []byte("خواهد خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active present progressive indicative
        {
            input: []byte("دارد میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite progressive indicative
        {
            input: []byte("داشت میخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active perfect indicative
        {
            input: []byte("خوردهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective perfect indicative
        {
            input: []byte("میخوردهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect indicative
        {
            input: []byte("خورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect indicative
        {
            input: []byte("میخورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active preterite subjunctive
        {
            input: []byte("خورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective preterite subjunctive
        {
            input: []byte("میخورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect subjunctive
        {
            input: []byte("خورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect subjunctive
        {
            input: []byte("میخورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present indicative
        {
            input: []byte("خورده میشود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite indicative
        {
            input: []byte("خورده شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite indicative
        {
            input: []byte("خورده میشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive perfect indicative
        {
            input: []byte("خورده شدهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective perfect indicative
        {
            input: []byte("خورده میشدهاست"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect indicative
        {
            input: []byte("خورده شده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect indicative
        {
            input: []byte("خورده میشده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive future indicative
        {
            input: []byte("خورده خواهد شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present progressive indicative
        {
            input: []byte("دارد خورده میشود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite progressive indicative
        {
            input: []byte("داشت خورده میشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present subjunctive
        {
            input: []byte("خورده شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite subjunctive
        {
            input: []byte("خورده شده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite subjunctive
        {
            input: []byte("خورده میشده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect subjunctive
        {
            input: []byte("خورده شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect subjunctive
        {
            input: []byte("خورده میشده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active present subjunctive
        {
            input: []byte("بخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("بخورد"),
                },
            },
        },
    }

    cache := registry.NewCache()
    analyzer, err := cache.AnalyzerNamed(AnalyzerName)
    if err != nil {
        t.Fatal(err)
    }
    for _, test := range tests {
        actual := analyzer.Analyze(test.input)
        if len(actual) != len(test.output) {
            t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
        }
        for i, tok := range actual {
            if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
                t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
            }
        }
    }
}

func TestPersianAnalyzerVerbsDefective(t *testing.T) {
    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        // active present indicative
        {
            input: []byte("مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite indicative
        {
            input: []byte("خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active imperfective preterite indicative
        {
            input: []byte("مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active future indicative
        {
            input: []byte("خواهد خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active present progressive indicative
        {
            input: []byte("دارد مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active preterite progressive indicative
        {
            input: []byte("داشت مي خورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورد"),
                },
            },
        },
        // active perfect indicative
        {
            input: []byte("خورده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective perfect indicative
        {
            input: []byte("مي خورده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect indicative
        {
            input: []byte("خورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect indicative
        {
            input: []byte("مي خورده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active preterite subjunctive
        {
            input: []byte("خورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective preterite subjunctive
        {
            input: []byte("مي خورده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active pluperfect subjunctive
        {
            input: []byte("خورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active imperfective pluperfect subjunctive
        {
            input: []byte("مي خورده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present indicative
        {
            input: []byte("خورده مي شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite indicative
        {
            input: []byte("خورده شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite indicative
        {
            input: []byte("خورده مي شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive perfect indicative
        {
            input: []byte("خورده شده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective perfect indicative
        {
            input: []byte("خورده مي شده است"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect indicative
        {
            input: []byte("خورده شده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect indicative
        {
            input: []byte("خورده مي شده بود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive future indicative
        {
            input: []byte("خورده خواهد شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present progressive indicative
        {
            input: []byte("دارد خورده مي شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite progressive indicative
        {
            input: []byte("داشت خورده مي شد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive present subjunctive
        {
            input: []byte("خورده شود"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive preterite subjunctive
        {
            input: []byte("خورده شده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective preterite subjunctive
        {
            input: []byte("خورده مي شده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive pluperfect subjunctive
        {
            input: []byte("خورده شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // passive imperfective pluperfect subjunctive
        {
            input: []byte("خورده مي شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        // active present subjunctive
        {
            input: []byte("بخورد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("بخورد"),
                },
            },
        },
    }

    cache := registry.NewCache()
    analyzer, err := cache.AnalyzerNamed(AnalyzerName)
    if err != nil {
        t.Fatal(err)
    }
    for _, test := range tests {
        actual := analyzer.Analyze(test.input)
        if len(actual) != len(test.output) {
            t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
        }
        for i, tok := range actual {
            if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
                t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
            }
        }
    }
}

func TestPersianAnalyzerOthers(t *testing.T) {
    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        // nouns
        {
            input: []byte("برگ ها"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("برگ"),
                },
            },
        },
        {
            input: []byte("برگها"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("برگ"),
                },
            },
        },
        // non persian
        {
            input: []byte("English test."),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("english"),
                },
                &analysis.Token{
                    Term: []byte("test"),
                },
            },
        },
        // others
        {
            input: []byte("خورده مي شده بوده باشد"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("خورده"),
                },
            },
        },
        {
            input: []byte("برگها"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("برگ"),
                },
            },
        },
    }

    cache := registry.NewCache()
    analyzer, err := cache.AnalyzerNamed(AnalyzerName)
    if err != nil {
        t.Fatal(err)
    }
    for _, test := range tests {
        actual := analyzer.Analyze(test.input)
        if len(actual) != len(test.output) {
            t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
        }
        for i, tok := range actual {
            if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
                t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
            }
        }
    }
}
analysis/lang/fa/persian_normalize.go (new file, 80 lines)
@@ -0,0 +1,80 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "bytes"

    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

const NormalizeName = "normalize_fa"

const (
    Yeh        = '\u064A'
    FarsiYeh   = '\u06CC'
    YehBarree  = '\u06D2'
    Keheh      = '\u06A9'
    Kaf        = '\u0643'
    HamzaAbove = '\u0654'
    HehYeh     = '\u06C0'
    HehGoal    = '\u06C1'
    Heh        = '\u0647'
)

type PersianNormalizeFilter struct {
}

func NewPersianNormalizeFilter() *PersianNormalizeFilter {
    return &PersianNormalizeFilter{}
}

func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
    for _, token := range input {
        term := normalize(token.Term)
        token.Term = term
    }
    return input
}

func normalize(input []byte) []byte {
    runes := bytes.Runes(input)
    for i := 0; i < len(runes); i++ {
        switch runes[i] {
        case FarsiYeh, YehBarree:
            runes[i] = Yeh
        case Keheh:
            runes[i] = Kaf
        case HehYeh, HehGoal:
            runes[i] = Heh
        case HamzaAbove: // necessary for HEH + HAMZA
            runes = analysis.DeleteRune(runes, i)
            i--
        }
    }
    return analysis.BuildTermFromRunes(runes)
}

func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    return NewPersianNormalizeFilter(), nil
}

func init() {
    err := registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
    if err != nil {
        panic(err)
    }
}
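A small standalone sketch of applying the normalize filter above directly to a token stream; the token text is an assumption chosen to show the Farsi Yeh folding (the test file that follows covers the remaining mappings):

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis"
    fa "github.com/blevesearch/bleve/v2/analysis/lang/fa"
)

func main() {
    filter := fa.NewPersianNormalizeFilter()
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("های")}, // ends in Farsi Yeh U+06CC
    }
    out := filter.Filter(stream)
    fmt.Printf("%s\n", out[0].Term) // "هاي": Farsi Yeh folded to Arabic Yeh U+064A
}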
analysis/lang/fa/persian_normalize_test.go (new file, 130 lines)
@@ -0,0 +1,130 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "reflect"
    "testing"

    "github.com/blevesearch/bleve/v2/analysis"
)

func TestPersianNormalizeFilter(t *testing.T) {
    tests := []struct {
        input  analysis.TokenStream
        output analysis.TokenStream
    }{
        // FarsiYeh
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("های"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("هاي"),
                },
            },
        },
        // YehBarree
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("هاے"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("هاي"),
                },
            },
        },
        // Keheh
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("کشاندن"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كشاندن"),
                },
            },
        },
        // HehYeh
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابۀ"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابه"),
                },
            },
        },
        // HehHamzaAbove
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابهٔ"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("كتابه"),
                },
            },
        },
        // HehGoal
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("زادہ"),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte("زاده"),
                },
            },
        },
        // empty
        {
            input: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte(""),
                },
            },
            output: analysis.TokenStream{
                &analysis.Token{
                    Term: []byte(""),
                },
            },
        },
    }

    persianNormalizeFilter := NewPersianNormalizeFilter()
    for _, test := range tests {
        actual := persianNormalizeFilter.Filter(test.input)
        if !reflect.DeepEqual(actual, test.output) {
            t.Errorf("expected %#v, got %#v", test.output, actual)
            t.Errorf("expected % x, got % x", test.output[0].Term, actual[0].Term)
        }
    }
}
analysis/lang/fa/stop_filter_fa.go (new file, 36 lines)
@@ -0,0 +1,36 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fa

import (
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/analysis/token/stop"
    "github.com/blevesearch/bleve/v2/registry"
)

func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
    tokenMap, err := cache.TokenMapNamed(StopName)
    if err != nil {
        return nil, err
    }
    return stop.NewStopTokensFilter(tokenMap), nil
}

func init() {
    err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
    if err != nil {
        panic(err)
    }
}
analysis/lang/fa/stop_words_fa.go (new file, 340 lines)
@@ -0,0 +1,340 @@
package fa

import (
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
)

const StopName = "stop_fa"

// this content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
// ` was changed to ' to allow for literal string

var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license.
# See http://members.unine.ch/jacques.savoy/clef/index.html.
# Also see http://www.opensource.org/licenses/bsd-license.html
# Note: by default this file is used after normalization, so when adding entries
# to this file, use the arabic 'ي' instead of 'ی'
انان
نداشته
سراسر
خياه
ايشان
وي
تاكنون
بيشتري
دوم
پس
ناشي
وگو
يا
داشتند
سپس
هنگام
هرگز
پنج
نشان
امسال
ديگر
گروهي
شدند
چطور
ده
و
دو
نخستين
ولي
چرا
چه
وسط
ه
كدام
قابل
يك
رفت
هفت
همچنين
در
هزار
بله
بلي
شايد
اما
شناسي
گرفته
دهد
داشته
دانست
داشتن
خواهيم
ميليارد
وقتيكه
امد
خواهد
جز
اورده
شده
بلكه
خدمات
شدن
برخي
نبود
بسياري
جلوگيري
حق
كردند
نوعي
بعري
نكرده
نظير
نبايد
بوده
بودن
داد
اورد
هست
جايي
شود
دنبال
داده
بايد
سابق
هيچ
همان
انجا
كمتر
كجاست
گردد
كسي
تر
مردم
تان
دادن
بودند
سري
جدا
ندارند
مگر
يكديگر
دارد
دهند
بنابراين
هنگامي
سمت
جا
انچه
خود
دادند
زياد
دارند
اثر
بدون
بهترين
بيشتر
البته
به
براساس
بيرون
كرد
بعضي
گرفت
توي
اي
ميليون
او
جريان
تول
بر
مانند
برابر
باشيم
مدتي
گويند
اكنون
تا
تنها
جديد
چند
بي
نشده
كردن
كردم
گويد
كرده
كنيم
نمي
نزد
روي
قصد
فقط
بالاي
ديگران
اين
ديروز
توسط
سوم
ايم
دانند
سوي
استفاده
شما
كنار
داريم
ساخته
طور
امده
رفته
نخست
بيست
نزديك
طي
كنيد
از
انها
تمامي
داشت
يكي
طريق
اش
چيست
روب
نمايد
گفت
چندين
چيزي
تواند
ام
ايا
با
ان
ايد
ترين
اينكه
ديگري
راه
هايي
بروز
همچنان
پاعين
كس
حدود
مختلف
مقابل
چيز
گيرد
ندارد
ضد
همچون
سازي
شان
مورد
باره
مرسي
خويش
برخوردار
چون
خارج
شش
هنوز
تحت
ضمن
هستيم
گفته
فكر
بسيار
پيش
براي
روزهاي
انكه
نخواهد
بالا
كل
وقتي
كي
چنين
كه
گيري
نيست
است
كجا
كند
نيز
يابد
بندي
حتي
توانند
عقب
خواست
كنند
بين
تمام
همه
ما
باشند
مثل
شد
اري
باشد
اره
طبق
بعد
اگر
صورت
غير
جاي
بيش
ريزي
اند
زيرا
چگونه
بار
لطفا
مي
درباره
من
ديده
همين
گذاري
برداري
علت
گذاشته
هم
فوق
نه
ها
شوند
اباد
همواره
هر
اول
خواهند
چهار
نام
امروز
مان
هاي
قبل
كنم
سعي
تازه
را
هستند
زير
جلوي
عنوان
بود
`)

func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
    rv := analysis.NewTokenMap()
    err := rv.LoadBytes(PersianStopWords)
    return rv, err
}

func init() {
    err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
    if err != nil {
        panic(err)
    }
}
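Finally, a short sketch of resolving the stop filter by name from a registry cache (the same lookup the analyzer constructor performs) and applying it directly; the sample tokens are assumptions chosen from the list above:

package main

import (
    "fmt"

    "github.com/blevesearch/bleve/v2/analysis"
    fa "github.com/blevesearch/bleve/v2/analysis/lang/fa"
    "github.com/blevesearch/bleve/v2/registry"
)

func main() {
    cache := registry.NewCache()
    stopFilter, err := cache.TokenFilterNamed(fa.StopName)
    if err != nil {
        panic(err)
    }
    stream := analysis.TokenStream{
        &analysis.Token{Term: []byte("از")},  // stop word, should be dropped
        &analysis.Token{Term: []byte("برگ")}, // regular term, should survive
    }
    for _, tok := range stopFilter.Filter(stream) {
        fmt.Printf("%s\n", tok.Term)
    }
}

Note that the stop list is applied after normalization in the analyzer chain, which is why the file's header comment asks for the Arabic 'ي' rather than the Farsi 'ی' in new entries.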