Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-19 00:20:02 +02:00 · 2025-05-19 00:20:02 +02:00 · 982828099e
commit 982828099e
parent c71cb8b61d
783 changed files with 150650 additions and 0 deletions
--- a/analysis/lang/en/analyzer_en.go
+++ b/analysis/lang/en/analyzer_en.go
@ -0,0 +1,73 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package en implements an analyzer with reasonable defaults for processing
+// English text.
+//
+// It strips possessive suffixes ('s), transforms tokens to lower case,
+// removes stopwords from a built-in list, and applies porter stemming.
+//
+// The built-in stopwords list is defined in EnglishStopWords.
+package en
+
+import (
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+
+	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
+	"github.com/blevesearch/bleve/v2/analysis/token/porter"
+	"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
+)
+
+// AnalyzerName is the name under which this package registers its analyzer
+// with the bleve registry.
+const AnalyzerName = "en"
+
+// AnalyzerConstructor assembles the English analyzer from components looked
+// up in cache: the unicode tokenizer plus, in this order, the possessive,
+// lowercase, stop-word and porter token filters. The config argument is not
+// consulted. The first lookup that fails aborts construction and its error is
+// returned unchanged.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(unicode.Name)
+	if err != nil {
+		return nil, err
+	}
+
+	// The slice order below is the order handed to DefaultAnalyzer.TokenFilters.
+	filterNames := []string{PossessiveName, lowercase.Name, StopName, porter.Name}
+	filters := make([]analysis.TokenFilter, 0, len(filterNames))
+	for _, name := range filterNames {
+		filter, err := cache.TokenFilterNamed(name)
+		if err != nil {
+			return nil, err
+		}
+		filters = append(filters, filter)
+	}
+
+	return &analysis.DefaultAnalyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: filters,
+	}, nil
+}
+
+// init registers AnalyzerConstructor under AnalyzerName and panics if the
+// registry reports an error.
+func init() {
+	if err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor); err != nil {
+		panic(err)
+	}
+}
--- a/analysis/lang/en/analyzer_en_test.go
+++ b/analysis/lang/en/analyzer_en_test.go
@ -0,0 +1,105 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package en
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// TestEnglishAnalyzer feeds single-word inputs through the registered "en"
+// analyzer and compares the produced token streams with the expected ones.
+func TestEnglishAnalyzer(t *testing.T) {
+	// single builds the expected stream for an input that yields exactly one
+	// token at position 1, starting at byte 0 and ending at byte offset end.
+	single := func(term string, end int) analysis.TokenStream {
+		return analysis.TokenStream{
+			&analysis.Token{Term: []byte(term), Position: 1, Start: 0, End: end},
+		}
+	}
+
+	tests := []struct {
+		input  []byte
+		output analysis.TokenStream
+	}{
+		// stemming
+		{input: []byte("books"), output: single("book", 5)},
+		{input: []byte("book"), output: single("book", 4)},
+		// stop word removal
+		{input: []byte("the"), output: analysis.TokenStream{}},
+		// possessive removal
+		{input: []byte("steven's"), output: single("steven", 8)},
+		{input: []byte("steven\u2019s"), output: single("steven", 10)},
+		{input: []byte("steven\uFF07s"), output: single("steven", 10)},
+	}
+
+	analyzer, err := registry.NewCache().AnalyzerNamed(AnalyzerName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for i := range tests {
+		want := tests[i].output
+		got := analyzer.Analyze(tests[i].input)
+		if reflect.DeepEqual(got, want) {
+			continue
+		}
+		t.Errorf("expected %v, got %v", want, got)
+	}
+}
--- a/analysis/lang/en/plural_stemmer.go
+++ b/analysis/lang/en/plural_stemmer.go
@ -0,0 +1,177 @@
+/*
+	This code was ported from the Open Search Project
+	https://github.com/opensearch-project/OpenSearch/blob/main/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java
+	The algorithm itself was created by Mark Harwood
+	https://github.com/markharwood
+*/
+
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package en
+
+import (
+	"strings"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// PluralStemmerName is the name under which the English plural stemmer token
+// filter is registered with the bleve registry.
+const PluralStemmerName = "stemmer_en_plural"
+
+type EnglishPluralStemmerFilter struct {
+}
+
+// NewEnglishPluralStemmerFilter returns a pointer to a new, empty
+// EnglishPluralStemmerFilter.
+func NewEnglishPluralStemmerFilter() *EnglishPluralStemmerFilter {
+	return new(EnglishPluralStemmerFilter)
+}
+
+// Filter overwrites the Term of every token in input with the result of stem
+// (which also lowercases it) and returns the same stream; tokens are modified
+// in place and none are added or removed.
+func (s *EnglishPluralStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for i := range input {
+		singular := stem(string(input[i].Term))
+		input[i].Term = []byte(singular)
+	}
+	return input
+}
+
+// EnglishPluralStemmerFilterConstructor is the registry constructor for the
+// plural stemmer. Neither config nor cache is consulted and the returned
+// error is always nil.
+func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	filter := NewEnglishPluralStemmerFilter()
+	return filter, nil
+}
+
+// init registers EnglishPluralStemmerFilterConstructor under
+// PluralStemmerName and panics if the registry reports an error.
+func init() {
+	if err := registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor); err != nil {
+		panic(err)
+	}
+}
+
+// ----------------------------------------------------------------------------
+
+// Plural forms that keep their "e" when stemmed, so only the final "s" is
+// removed. stem checks words against these lists through isException.
+var (
+	// Words ending in oes that retain the e when stemmed
+	oesExceptions = []string{
+		"shoes",
+		"canoes",
+		"oboes",
+	}
+
+	// Words ending in ches that retain the e when stemmed
+	chesExceptions = []string{
+		"cliches",
+		"avalanches",
+		"mustaches",
+		"moustaches",
+		"quiches",
+		"headaches",
+		"heartaches",
+		"porsches",
+		"tranches",
+		"caches",
+	}
+)
+
+// stem lowercases word and strips a trailing English plural suffix.
+//
+// Words shorter than three runes, words not ending in "s", and words ending
+// in "us" or "ss" are returned lowercased but otherwise unchanged. For words
+// ending in "es" the following rules are tried in order:
+//   - "ies" (more than four runes) becomes "y": spies -> spy, but pies -> pie
+//   - "xes" (more than four runes) loses "es": boxes -> box, but axes -> axe
+//   - "oes" (more than three runes) loses "es", unless isException matches
+//     the word against oesExceptions, in which case only the "s" is removed:
+//     shoes -> shoe
+//   - "shes" and "sses" (more than four runes) lose "es": dishes -> dish
+//   - "ches" (more than four runes) loses "es", unless isException matches
+//     the word against chesExceptions: snitches -> snitch, cliches -> cliche
+//
+// Every other word simply loses its final "s".
+func stem(word string) string {
+	runes := []rune(strings.ToLower(word))
+	n := len(runes)
+	if n < 3 || runes[n-1] != 's' {
+		return string(runes)
+	}
+
+	// Both views share the backing array of runes.
+	withoutS := runes[:n-1]
+	withoutES := runes[:n-2]
+
+	penultimate := runes[n-2]
+	if penultimate == 'u' || penultimate == 's' {
+		return string(runes)
+	}
+	if penultimate != 'e' {
+		return string(withoutS)
+	}
+
+	third := runes[n-3]
+	longerThan4 := n > 4
+	switch {
+	case longerThan4 && third == 'i':
+		// ies -> y; the write is visible through withoutES.
+		runes[n-3] = 'y'
+		return string(withoutES)
+	case longerThan4 && third == 'x':
+		return string(withoutES)
+	case n > 3 && third == 'o':
+		if isException(runes, oesExceptions) {
+			return string(withoutS)
+		}
+		return string(withoutES)
+	case longerThan4 && runes[n-4] == 's' && (third == 'h' || third == 's'):
+		return string(withoutES)
+	case longerThan4 && runes[n-4] == 'c' && third == 'h':
+		if isException(runes, chesExceptions) {
+			return string(withoutS)
+		}
+		return string(withoutES)
+	}
+	return string(withoutS)
+}
+
+// isException reports whether word matches any entry of exceptions.
+//
+// Both rune sequences are compared backwards from their last rune and the
+// comparison stops as soon as either one runs out. An entry therefore matches
+// when it is a suffix of word ("snowshoes" matches "shoes") and also when
+// word is itself a suffix of the entry ("hoes" matches "shoes").
+func isException(word []rune, exceptions []string) bool {
+nextException:
+	for _, exception := range exceptions {
+		candidate := []rune(exception)
+		overlap := len(candidate)
+		if len(word) < overlap {
+			overlap = len(word)
+		}
+		for back := 1; back <= overlap; back++ {
+			if candidate[len(candidate)-back] != word[len(word)-back] {
+				continue nextException
+			}
+		}
+		return true
+	}
+	return false
+}
--- a/analysis/lang/en/plural_stemmer_test.go
+++ b/analysis/lang/en/plural_stemmer_test.go
@ -0,0 +1,46 @@
+package en
+
+import "testing"
+
+// TestEnglishPluralStemmer checks stem against a table of input words and
+// their expected stems. Failures name the input word, because several rows
+// share the same expected output.
+func TestEnglishPluralStemmer(t *testing.T) {
+	data := []struct {
+		In, Out string
+	}{
+		{"dresses", "dress"},
+		{"dress", "dress"},
+		{"axes", "axe"},
+		{"ad", "ad"},
+		{"ads", "ad"},
+		{"gas", "ga"},
+		{"sass", "sass"},
+		{"berries", "berry"},
+		{"dresses", "dress"},
+		{"spies", "spy"},
+		{"shoes", "shoe"},
+		{"headaches", "headache"},
+		{"computer", "computer"},
+		{"dressing", "dressing"},
+		{"clothes", "clothe"},
+		{"DRESSES", "dress"},
+		{"frog", "frog"},
+		{"dress", "dress"},
+		{"runs", "run"},
+		{"pies", "pie"},
+		{"foxes", "fox"},
+		{"axes", "axe"},
+		{"foes", "fo"},
+		{"dishes", "dish"},
+		{"snitches", "snitch"},
+		{"cliches", "cliche"},
+		{"forests", "forest"},
+		{"yes", "ye"},
+	}
+
+	for _, datum := range data {
+		stemmed := stem(datum.In)
+
+		if stemmed != datum.Out {
+			t.Errorf("stem(%q): expected %q but got %q", datum.In, datum.Out, stemmed)
+		}
+	}
+}
--- a/analysis/lang/en/possessive_filter_en.go
+++ b/analysis/lang/en/possessive_filter_en.go
@ -0,0 +1,70 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package en
+
+import (
+	"unicode/utf8"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// PossessiveName is the name PossessiveFilter is registered as
+// in the bleve registry.
+const PossessiveName = "possessive_en"
+
+// Apostrophe variants recognised in front of a trailing possessive "s".
+const (
+	rightSingleQuotationMark = '’'  // U+2019
+	apostrophe               = '\'' // U+0027
+	fullWidthApostrophe      = '＇'  // U+FF07
+)
+
+// apostropheChars is the arithmetic sum of the three rune constants above
+// (0x2019 + 0x27 + 0xFF07 = 0x11F47); it is not a set or string of them.
+// NOTE(review): nothing in this file reads it — confirm whether it can go.
+const apostropheChars = rightSingleQuotationMark + apostrophe + fullWidthApostrophe
+
+// PossessiveFilter implements a TokenFilter which
+// strips the English possessive suffix ('s) from tokens.
+// It handle a variety of apostrophe types, is case-insensitive
+// and doesn't distinguish between possessive and contraction.
+// (ie "She's So Rad" becomes "She So Rad")
+type PossessiveFilter struct {
+}
+
+// NewPossessiveFilter returns a pointer to a new, empty PossessiveFilter.
+func NewPossessiveFilter() *PossessiveFilter {
+	return new(PossessiveFilter)
+}
+
+// Filter removes a trailing apostrophe + "s" (or "S") from every token in
+// input and returns the same stream. Only the last two runes of a term are
+// inspected; a matching term is shortened by reslicing, so its bytes are
+// neither copied nor modified. Terms without the suffix are left untouched.
+func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for _, token := range input {
+		term := token.Term
+		last, lastSize := utf8.DecodeLastRune(term)
+		if last != 's' && last != 'S' {
+			continue
+		}
+
+		// Look at the rune immediately before the trailing s/S.
+		stemEnd := len(term) - lastSize
+		switch mark, markSize := utf8.DecodeLastRune(term[:stemEnd]); mark {
+		case rightSingleQuotationMark, apostrophe, fullWidthApostrophe:
+			token.Term = term[:stemEnd-markSize]
+		}
+	}
+	return input
+}
+
+// PossessiveFilterConstructor is the registry constructor for
+// PossessiveFilter. Neither config nor cache is consulted and the returned
+// error is always nil.
+func PossessiveFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	filter := NewPossessiveFilter()
+	return filter, nil
+}
+
+// init registers PossessiveFilterConstructor under PossessiveName and panics
+// if the registry reports an error.
+func init() {
+	if err := registry.RegisterTokenFilter(PossessiveName, PossessiveFilterConstructor); err != nil {
+		panic(err)
+	}
+}
--- a/analysis/lang/en/possessive_filter_en_test.go
+++ b/analysis/lang/en/possessive_filter_en_test.go
@ -0,0 +1,142 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package en
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// TestEnglishPossessiveFilter runs the registered possessive filter over
+// terms using each supported apostrophe in both cases, plus terms too short
+// to carry a suffix, and compares the result with the expected stream.
+func TestEnglishPossessiveFilter(t *testing.T) {
+	// stream wraps every term in a Token that has only Term set.
+	stream := func(terms ...string) analysis.TokenStream {
+		tokens := make(analysis.TokenStream, 0, len(terms))
+		for _, term := range terms {
+			tokens = append(tokens, &analysis.Token{Term: []byte(term)})
+		}
+		return tokens
+	}
+
+	tests := []struct {
+		input  analysis.TokenStream
+		output analysis.TokenStream
+	}{
+		{
+			input: stream(
+				"marty's",
+				"MARTY'S",
+				"marty’s",
+				"MARTY’S",
+				"marty＇s",
+				"MARTY＇S",
+				"m",
+				"s",
+				"'s",
+			),
+			output: stream(
+				"marty",
+				"MARTY",
+				"marty",
+				"MARTY",
+				"marty",
+				"MARTY",
+				"m",
+				"s",
+				"",
+			),
+		},
+	}
+
+	stemmerFilter, err := registry.NewCache().TokenFilterNamed(PossessiveName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for i := range tests {
+		want := tests[i].output
+		got := stemmerFilter.Filter(tests[i].input)
+		if reflect.DeepEqual(got, want) {
+			continue
+		}
+		t.Errorf("expected %s, got %s", want, got)
+	}
+}
+
+// BenchmarkEnglishPossessiveFilter measures the registered possessive filter
+// on a fixed stream of terms using every supported apostrophe variant.
+//
+// The filter shortens Token.Term in place, so without a reset only the first
+// iteration would see terms that still carry the suffix. Each iteration
+// therefore points every token back at its original bytes before filtering;
+// the reset is a slice-header assignment and does not allocate.
+func BenchmarkEnglishPossessiveFilter(b *testing.B) {
+	terms := [][]byte{
+		[]byte("marty's"),
+		[]byte("MARTY'S"),
+		[]byte("marty’s"),
+		[]byte("MARTY’S"),
+		[]byte("marty＇s"),
+		[]byte("MARTY＇S"),
+		[]byte("m"),
+	}
+
+	input := make(analysis.TokenStream, len(terms))
+	for i := range input {
+		input[i] = &analysis.Token{}
+	}
+
+	cache := registry.NewCache()
+	stemmerFilter, err := cache.TokenFilterNamed(PossessiveName)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		// The filter only reslices Term, so terms[j] still holds the full
+		// original word.
+		for j, term := range terms {
+			input[j].Term = term
+		}
+		stemmerFilter.Filter(input)
+	}
+}
--- a/analysis/lang/en/stemmer_en_snowball.go
+++ b/analysis/lang/en/stemmer_en_snowball.go
@ -0,0 +1,52 @@
+//  Copyright (c) 2020 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package en
+
+import (
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+
+	"github.com/blevesearch/snowballstem"
+	"github.com/blevesearch/snowballstem/english"
+)
+
+// SnowballStemmerName is the name under which the English snowball stemmer
+// token filter is registered with the bleve registry.
+const SnowballStemmerName = "stemmer_en_snowball"
+
+type EnglishStemmerFilter struct {
+}
+
+// NewEnglishStemmerFilter returns a pointer to a new, empty
+// EnglishStemmerFilter.
+func NewEnglishStemmerFilter() *EnglishStemmerFilter {
+	return new(EnglishStemmerFilter)
+}
+
+// Filter replaces the Term of every token in input with the output of the
+// snowball English stemmer and returns the same stream. A fresh stemmer
+// environment is created for each token.
+func (s *EnglishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for i := range input {
+		env := snowballstem.NewEnv(string(input[i].Term))
+		english.Stem(env)
+		input[i].Term = []byte(env.Current())
+	}
+	return input
+}
+
+// EnglishStemmerFilterConstructor is the registry constructor for
+// EnglishStemmerFilter. Neither config nor cache is consulted and the
+// returned error is always nil.
+func EnglishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	filter := NewEnglishStemmerFilter()
+	return filter, nil
+}
+
+// init registers EnglishStemmerFilterConstructor under SnowballStemmerName
+// and panics if the registry reports an error.
+func init() {
+	if err := registry.RegisterTokenFilter(SnowballStemmerName, EnglishStemmerFilterConstructor); err != nil {
+		panic(err)
+	}
+}
--- a/analysis/lang/en/stemmer_en_test.go
+++ b/analysis/lang/en/stemmer_en_test.go
@ -0,0 +1,79 @@
+//  Copyright (c) 2020 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package en
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// TestSnowballEnglishStemmer checks that the registered snowball filter
+// reduces "enjoy", "enjoyed" and "enjoyable" to the stem "enjoy".
+func TestSnowballEnglishStemmer(t *testing.T) {
+	// one builds a stream holding a single Token with only Term set.
+	one := func(term string) analysis.TokenStream {
+		return analysis.TokenStream{
+			&analysis.Token{Term: []byte(term)},
+		}
+	}
+
+	tests := []struct {
+		input  analysis.TokenStream
+		output analysis.TokenStream
+	}{
+		{input: one("enjoy"), output: one("enjoy")},
+		{input: one("enjoyed"), output: one("enjoy")},
+		{input: one("enjoyable"), output: one("enjoy")},
+	}
+
+	filter, err := registry.NewCache().TokenFilterNamed(SnowballStemmerName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for i := range tests {
+		want := tests[i].output
+		got := filter.Filter(tests[i].input)
+		if reflect.DeepEqual(got, want) {
+			continue
+		}
+		t.Errorf("expected %s, got %s", want[0].Term, got[0].Term)
+	}
+}
--- a/analysis/lang/en/stop_filter_en.go
+++ b/analysis/lang/en/stop_filter_en.go
@ -0,0 +1,36 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package en
+
+import (
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/analysis/token/stop"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// StopTokenFilterConstructor builds the English stop-word token filter from
+// the token map registered under StopName, looked up through cache. The
+// config argument is not consulted. A failed lookup returns a nil filter and
+// the lookup error.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr == nil {
+		return stop.NewStopTokensFilter(stopWords), nil
+	}
+	return nil, lookupErr
+}
+
+// init registers StopTokenFilterConstructor under StopName and panics if the
+// registry reports an error.
+func init() {
+	if err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor); err != nil {
+		panic(err)
+	}
+}
--- a/analysis/lang/en/stop_words_en.go
+++ b/analysis/lang/en/stop_words_en.go
@ -0,0 +1,347 @@
+package en
+
+import (
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+// StopName is the registry name under which this package registers the
+// English stop-word token map.
+const StopName = "stop_en"
+
+// EnglishStopWords is the built-in list of stopwords used by the "stop_en" TokenFilter.
+//
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
+// ` was changed to ' to allow for literal string
+var EnglishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/english/stop.txt
+ | This file is distributed under the BSD License.
+ | See http://snowball.tartarus.org/license.php
+ | Also see http://www.opensource.org/licenses/bsd-license.html
+ |  - Encoding was converted to UTF-8.
+ |  - This notice was added.
+ |
+ | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
+ 
+ | An English stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | Many of the forms below are quite rare (e.g. "yourselves") but included for
+ |  completeness.
+
+           | PRONOUNS FORMS
+             | 1st person sing
+
+i              | subject, always in upper case of course
+
+me             | object
+my             | possessive adjective
+               | the possessive pronoun 'mine' is best suppressed, because of the
+               | sense of coal-mine etc.
+myself         | reflexive
+             | 1st person plural
+we             | subject
+
+| us           | object
+               | care is required here because US = United States. It is usually
+               | safe to remove it if it is in lower case.
+our            | possessive adjective
+ours           | possessive pronoun
+ourselves      | reflexive
+             | second person (archaic 'thou' forms not included)
+you            | subject and object
+your           | possessive adjective
+yours          | possessive pronoun
+yourself       | reflexive (singular)
+yourselves     | reflexive (plural)
+             | third person singular
+he             | subject
+him            | object
+his            | possessive adjective and pronoun
+himself        | reflexive
+
+she            | subject
+her            | object and possessive adjective
+hers           | possessive pronoun
+herself        | reflexive
+
+it             | subject and object
+its            | possessive adjective
+itself         | reflexive
+             | third person plural
+they           | subject
+them           | object
+their          | possessive adjective
+theirs         | possessive pronoun
+themselves     | reflexive
+             | other forms (demonstratives, interrogatives)
+what
+which
+who
+whom
+this
+that
+these
+those
+
+           | VERB FORMS (using F.R. Palmer's nomenclature)
+             | BE
+am             | 1st person, present
+is             | -s form (3rd person, present)
+are            | present
+was            | 1st person, past
+were           | past
+be             | infinitive
+been           | past participle
+being          | -ing form
+             | HAVE
+have           | simple
+has            | -s form
+had            | past
+having         | -ing form
+             | DO
+do             | simple
+does           | -s form
+did            | past
+doing          | -ing form
+
+ | The forms below are, I believe, best omitted, because of the significant
+ | homonym forms:
+
+ |  He made a WILL
+ |  old tin CAN
+ |  merry month of MAY
+ |  a smell of MUST
+ |  fight the good fight with all thy MIGHT
+
+ | would, could, should, ought might however be included
+
+ |          | AUXILIARIES
+ |            | WILL
+ |will
+
+would
+
+ |            | SHALL
+ |shall
+
+should
+
+ |            | CAN
+ |can
+
+could
+
+ |            | MAY
+ |may
+ |might
+ |            | MUST
+ |must
+ |            | OUGHT
+
+ought
+
+           | COMPOUND FORMS, increasingly encountered nowadays in 'formal' writing
+              | pronoun + verb
+
+i'm
+you're
+he's
+she's
+it's
+we're
+they're
+i've
+you've
+we've
+they've
+i'd
+you'd
+he'd
+she'd
+we'd
+they'd
+i'll
+you'll
+he'll
+she'll
+we'll
+they'll
+
+              | verb + negation
+
+isn't
+aren't
+wasn't
+weren't
+hasn't
+haven't
+hadn't
+doesn't
+don't
+didn't
+
+              | auxiliary + negation
+
+won't
+wouldn't
+shan't
+shouldn't
+can't
+cannot
+couldn't
+mustn't
+
+             | miscellaneous forms
+
+let's
+that's
+who's
+what's
+here's
+there's
+when's
+where's
+why's
+how's
+
+              | rarer forms
+
+ | daren't needn't
+
+              | doubtful forms
+
+ | oughtn't mightn't
+
+           | ARTICLES
+a
+an
+the
+
+           | THE REST (Overlap among prepositions, conjunctions, adverbs etc is so
+           | high, that classification is pointless.)
+and
+but
+if
+or
+because
+as
+until
+while
+
+of
+at
+by
+for
+with
+about
+against
+between
+into
+through
+during
+before
+after
+above
+below
+to
+from
+up
+down
+in
+out
+on
+off
+over
+under
+
+again
+further
+then
+once
+
+here
+there
+when
+where
+why
+how
+
+all
+any
+both
+each
+few
+more
+most
+other
+some
+such
+
+no
+nor
+not
+only
+own
+same
+so
+than
+too
+very
+
+ | Just for the record, the following words are among the commonest in English
+
+    | one
+    | every
+    | least
+    | less
+    | many
+    | now
+    | ever
+    | never
+    | say
+    | says
+    | said
+    | also
+    | get
+    | go
+    | goes
+    | just
+    | made
+    | make
+    | put
+    | see
+    | seen
+    | whether
+    | like
+    | well
+    | back
+    | even
+    | still
+    | way
+    | take
+    | since
+    | another
+    | however
+    | two
+    | three
+    | four
+    | five
+    | first
+    | second
+    | new
+    | old
+    | high
+    | long
+`)
+
+// TokenMapConstructor creates a new token map and loads EnglishStopWords
+// into it. Neither config nor cache is consulted. The map is returned
+// together with whatever error LoadBytes reports, including on failure.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(EnglishStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor under StopName and panics if the
+// registry reports an error.
+func init() {
+	if err := registry.RegisterTokenMap(StopName, TokenMapConstructor); err != nil {
+		panic(err)
+	}
+}