
Adding upstream version 2.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-05-19 00:20:02 +02:00
parent c71cb8b61d
commit 982828099e
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
783 changed files with 150650 additions and 0 deletions


@@ -0,0 +1,57 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package apostrophe
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "apostrophe"
const RightSingleQuotationMark = "’"
const Apostrophe = "'"
const Apostrophes = Apostrophe + RightSingleQuotationMark
type ApostropheFilter struct{}
func NewApostropheFilter() *ApostropheFilter {
return &ApostropheFilter{}
}
func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
firstApostrophe := bytes.IndexAny(token.Term, Apostrophes)
if firstApostrophe >= 0 {
// found an apostrophe
token.Term = token.Term[0:firstApostrophe]
}
}
return input
}
func ApostropheFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewApostropheFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ApostropheFilterConstructor)
if err != nil {
panic(err)
}
}
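
For orientation, a minimal usage sketch, assuming a standard Go module with bleve v2 available; the import path is inferred from the repository layout:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/apostrophe"
)

func main() {
	// The filter truncates each term at its first apostrophe (ASCII ' or U+2019).
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("Türkiye'de")},
	}
	out := apostrophe.NewApostropheFilter().Filter(stream)
	fmt.Println(string(out[0].Term)) // prints: Türkiye
}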


@@ -0,0 +1,99 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package apostrophe
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestApostropheFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Türkiye'de"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Türkiye"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("2003'te"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("2003"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Van"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Van"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Gölü'nü"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Gölü"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("gördüm"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("gördüm"),
},
},
},
}
for _, test := range tests {
apostropheFilter := NewApostropheFilter()
actual := apostropheFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}


@@ -0,0 +1,81 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"bytes"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "camelCase"
// CamelCaseFilter splits a given token into a set of tokens where each resulting token
// falls into one of the following classes:
// 1. Upper case followed by lower case letters.
// Terminated by a number, an upper case letter, or a non alpha-numeric symbol.
// 2. Upper case followed by upper case letters.
// Terminated by a number, an upper case followed by a lower case letter, or a non alpha-numeric symbol.
// 3. Lower case followed by lower case letters.
// Terminated by a number, an upper case letter, or a non alpha-numeric symbol.
// 4. Number followed by numbers.
// Terminated by a letter or a non alpha-numeric symbol.
// 5. Non alpha-numeric symbol followed by non alpha-numeric symbols.
// Terminated by a number or a letter.
//
// It does a one-time sequential pass over an input token, from left to right.
// The scan is greedy and generates the longest substring that fits into one of the classes.
//
// See the test file for examples of classes and their parsings.
type CamelCaseFilter struct{}
func NewCamelCaseFilter() *CamelCaseFilter {
return &CamelCaseFilter{}
}
func (f *CamelCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
nextPosition := 1
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
p := NewParser(runeCount, nextPosition, token.Start)
for i := 0; i < runeCount; i++ {
if i+1 >= runeCount {
p.Push(runes[i], nil)
} else {
p.Push(runes[i], &runes[i+1])
}
}
rv = append(rv, p.FlushTokens()...)
nextPosition = p.NextPosition()
}
return rv
}
func CamelCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewCamelCaseFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, CamelCaseFilterConstructor)
if err != nil {
panic(err)
}
}
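
A small hypothetical driver showing the greedy, class-based splitting described above (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
)

func main() {
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("GOLangIsGREAT123")},
	}
	// Upper-upper runs, capitalized words, and digit runs each become a token.
	for _, tok := range camelcase.NewCamelCaseFilter().Filter(stream) {
		fmt.Printf("%s ", tok.Term) // GO Lang Is GREAT 123
	}
}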


@@ -0,0 +1,95 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestCamelCaseFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: tokenStream(""),
output: tokenStream(""),
},
{
input: tokenStream("a"),
output: tokenStream("a"),
},
{
input: tokenStream("...aMACMac123macILoveGolang"),
output: tokenStream("...", "a", "MAC", "Mac", "123", "mac", "I", "Love", "Golang"),
},
{
input: tokenStream("Lang"),
output: tokenStream("Lang"),
},
{
input: tokenStream("GLang"),
output: tokenStream("G", "Lang"),
},
{
input: tokenStream("GOLang"),
output: tokenStream("GO", "Lang"),
},
{
input: tokenStream("GOOLang"),
output: tokenStream("GOO", "Lang"),
},
{
input: tokenStream("1234"),
output: tokenStream("1234"),
},
{
input: tokenStream("starbucks"),
output: tokenStream("starbucks"),
},
{
input: tokenStream("Starbucks TVSamsungIsGREAT000"),
output: tokenStream("Starbucks", " ", "TV", "Samsung", "Is", "GREAT", "000"),
},
}
for _, test := range tests {
ccFilter := NewCamelCaseFilter()
actual := ccFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s \n\n got %s", test.output, actual)
}
}
}
func tokenStream(termStrs ...string) analysis.TokenStream {
tokenStream := make([]*analysis.Token, len(termStrs))
index := 0
for i, termStr := range termStrs {
tokenStream[i] = &analysis.Token{
Term: []byte(termStr),
Position: i + 1,
Start: index,
End: index + len(termStr),
}
index += len(termStr)
}
return analysis.TokenStream(tokenStream)
}


@@ -0,0 +1,109 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"github.com/blevesearch/bleve/v2/analysis"
)
func (p *Parser) buildTokenFromTerm(buffer []rune) *analysis.Token {
term := analysis.BuildTermFromRunes(buffer)
token := &analysis.Token{
Term: term,
Position: p.position,
Start: p.index,
End: p.index + len(term),
}
p.position++
p.index += len(term)
return token
}
// Parser accepts a symbol and passes it to the current state (representing a class).
// The state can accept it (and accumulate it). Otherwise, the parser creates a new state that
// starts with the pushed symbol.
//
// Parser accumulates a new resulting token every time it switches state.
// Use FlushTokens() to get the results after the last symbol was pushed.
type Parser struct {
bufferLen int
buffer []rune
current State
tokens []*analysis.Token
position int
index int
}
func NewParser(length, position, index int) *Parser {
return &Parser{
bufferLen: length,
buffer: make([]rune, 0, length),
tokens: make([]*analysis.Token, 0, length),
position: position,
index: index,
}
}
func (p *Parser) Push(sym rune, peek *rune) {
if p.current == nil {
// the start of parsing
p.current = p.NewState(sym)
p.buffer = append(p.buffer, sym)
} else if p.current.Member(sym, peek) {
// same state, just accumulate
p.buffer = append(p.buffer, sym)
} else {
// the old state is no more, thus convert the buffer
p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
// let the new state begin
p.current = p.NewState(sym)
p.buffer = make([]rune, 0, p.bufferLen)
p.buffer = append(p.buffer, sym)
}
}
// Note: states must have distinct starting symbols.
func (p *Parser) NewState(sym rune) State {
var found State
found = &LowerCaseState{}
if found.StartSym(sym) {
return found
}
found = &UpperCaseState{}
if found.StartSym(sym) {
return found
}
found = &NumberCaseState{}
if found.StartSym(sym) {
return found
}
return &NonAlphaNumericCaseState{}
}
func (p *Parser) FlushTokens() []*analysis.Token {
p.tokens = append(p.tokens, p.buildTokenFromTerm(p.buffer))
return p.tokens
}
func (p *Parser) NextPosition() int {
return p.position
}
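
To make the push/flush protocol concrete, a hypothetical driver that mirrors what CamelCaseFilter does internally:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
)

func main() {
	runes := []rune("GoLang")
	// position 1 and byte index 0, as for the first token of a stream
	p := camelcase.NewParser(len(runes), 1, 0)
	for i := 0; i < len(runes); i++ {
		if i+1 < len(runes) {
			p.Push(runes[i], &runes[i+1])
		} else {
			p.Push(runes[i], nil) // the last symbol has no peek
		}
	}
	for _, tok := range p.FlushTokens() {
		fmt.Println(string(tok.Term)) // Go, then Lang
	}
}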


@@ -0,0 +1,87 @@
// Copyright (c) 2016 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package camelcase
import (
"unicode"
)
// States codify the classes that the parser recognizes.
type State interface {
// is _sym_ the start character
StartSym(sym rune) bool
// is _sym_ a member of a class.
// peek, the next sym on the tape, can also be used to determine a class.
Member(sym rune, peek *rune) bool
}
type LowerCaseState struct{}
func (s *LowerCaseState) Member(sym rune, peek *rune) bool {
return unicode.IsLower(sym)
}
func (s *LowerCaseState) StartSym(sym rune) bool {
return s.Member(sym, nil)
}
type UpperCaseState struct {
startedCollecting bool // denotes that the start character has been read
collectingUpper bool // denotes if this is a class of all upper case letters
}
func (s *UpperCaseState) Member(sym rune, peek *rune) bool {
if !(unicode.IsLower(sym) || unicode.IsUpper(sym)) {
return false
}
if peek != nil && unicode.IsUpper(sym) && unicode.IsLower(*peek) {
return false
}
if !s.startedCollecting {
// now we have to determine if upper-case letters are collected.
s.startedCollecting = true
s.collectingUpper = unicode.IsUpper(sym)
return true
}
return s.collectingUpper == unicode.IsUpper(sym)
}
func (s *UpperCaseState) StartSym(sym rune) bool {
return unicode.IsUpper(sym)
}
type NumberCaseState struct{}
func (s *NumberCaseState) Member(sym rune, peek *rune) bool {
return unicode.IsNumber(sym)
}
func (s *NumberCaseState) StartSym(sym rune) bool {
return s.Member(sym, nil)
}
type NonAlphaNumericCaseState struct{}
func (s *NonAlphaNumericCaseState) Member(sym rune, peek *rune) bool {
return !unicode.IsLower(sym) && !unicode.IsUpper(sym) && !unicode.IsNumber(sym)
}
func (s *NonAlphaNumericCaseState) StartSym(sym rune) bool {
return s.Member(sym, nil)
}


@@ -0,0 +1,144 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package compound
import (
"bytes"
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "dict_compound"
const defaultMinWordSize = 5
const defaultMinSubWordSize = 2
const defaultMaxSubWordSize = 15
const defaultOnlyLongestMatch = false
type DictionaryCompoundFilter struct {
dict analysis.TokenMap
minWordSize int
minSubWordSize int
maxSubWordSize int
onlyLongestMatch bool
}
func NewDictionaryCompoundFilter(dict analysis.TokenMap, minWordSize, minSubWordSize, maxSubWordSize int, onlyLongestMatch bool) *DictionaryCompoundFilter {
return &DictionaryCompoundFilter{
dict: dict,
minWordSize: minWordSize,
minSubWordSize: minSubWordSize,
maxSubWordSize: maxSubWordSize,
onlyLongestMatch: onlyLongestMatch,
}
}
func (f *DictionaryCompoundFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
rv = append(rv, token)
tokenLen := utf8.RuneCount(token.Term)
if tokenLen >= f.minWordSize {
newtokens := f.decompose(token)
for _, newtoken := range newtokens {
rv = append(rv, newtoken)
}
}
}
return rv
}
func (f *DictionaryCompoundFilter) decompose(token *analysis.Token) []*analysis.Token {
runes := bytes.Runes(token.Term)
rv := make([]*analysis.Token, 0)
rlen := len(runes)
for i := 0; i <= (rlen - f.minSubWordSize); i++ {
var longestMatchToken *analysis.Token
for j := f.minSubWordSize; j <= f.maxSubWordSize; j++ {
if i+j > rlen {
break
}
_, inDict := f.dict[string(runes[i:i+j])]
if inDict {
newtoken := analysis.Token{
Term: []byte(string(runes[i : i+j])),
Position: token.Position,
Start: token.Start + i,
End: token.Start + i + j,
Type: token.Type,
KeyWord: token.KeyWord,
}
if f.onlyLongestMatch {
if longestMatchToken == nil || utf8.RuneCount(longestMatchToken.Term) < j {
longestMatchToken = &newtoken
}
} else {
rv = append(rv, &newtoken)
}
}
}
if f.onlyLongestMatch && longestMatchToken != nil {
rv = append(rv, longestMatchToken)
}
}
return rv
}
func DictionaryCompoundFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
minWordSize := defaultMinWordSize
minSubWordSize := defaultMinSubWordSize
maxSubWordSize := defaultMaxSubWordSize
onlyLongestMatch := defaultOnlyLongestMatch
minVal, ok := config["min_word_size"].(float64)
if ok {
minWordSize = int(minVal)
}
minSubVal, ok := config["min_subword_size"].(float64)
if ok {
minSubWordSize = int(minSubVal)
}
maxSubVal, ok := config["max_subword_size"].(float64)
if ok {
maxSubWordSize = int(maxSubVal)
}
onlyVal, ok := config["only_longest_match"].(bool)
if ok {
onlyLongestMatch = onlyVal
}
dictTokenMapName, ok := config["dict_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify dict_token_map")
}
dictTokenMap, err := cache.TokenMapNamed(dictTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building dict compound words filter: %v", err)
}
return NewDictionaryCompoundFilter(dictTokenMap, minWordSize, minSubWordSize, maxSubWordSize, onlyLongestMatch), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, DictionaryCompoundFilterConstructor)
if err != nil {
panic(err)
}
}
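
A minimal sketch constructing the filter directly rather than through the registry; the dictionary contents are illustrative, and the import path is inferred from the repository layout:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/compound"
)

func main() {
	dict := analysis.NewTokenMap()
	dict.AddToken("soft")
	dict.AddToken("ball")
	// defaults: min word 5 runes, subwords between 2 and 15 runes, all matches kept
	f := compound.NewDictionaryCompoundFilter(dict, 5, 2, 15, false)
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("softball"), Start: 0, End: 8, Position: 1},
	}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // softball, soft, ball
	}
}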


@@ -0,0 +1,187 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package compound
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
"github.com/blevesearch/bleve/v2/registry"
)
func TestDictionaryCompoundFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("i"),
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("like"),
Start: 2,
End: 6,
Position: 2,
},
&analysis.Token{
Term: []byte("to"),
Start: 7,
End: 9,
Position: 3,
},
&analysis.Token{
Term: []byte("play"),
Start: 10,
End: 14,
Position: 4,
},
&analysis.Token{
Term: []byte("softball"),
Start: 15,
End: 23,
Position: 5,
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("i"),
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("like"),
Start: 2,
End: 6,
Position: 2,
},
&analysis.Token{
Term: []byte("to"),
Start: 7,
End: 9,
Position: 3,
},
&analysis.Token{
Term: []byte("play"),
Start: 10,
End: 14,
Position: 4,
},
&analysis.Token{
Term: []byte("softball"),
Start: 15,
End: 23,
Position: 5,
},
&analysis.Token{
Term: []byte("soft"),
Start: 15,
End: 19,
Position: 5,
},
&analysis.Token{
Term: []byte("ball"),
Start: 19,
End: 23,
Position: 5,
},
}
cache := registry.NewCache()
dictListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"factor", "soft", "ball", "team"},
}
_, err := cache.DefineTokenMap("dict_test", dictListConfig)
if err != nil {
t.Fatal(err)
}
dictConfig := map[string]interface{}{
"type": "dict_compound",
"dict_token_map": "dict_test",
}
dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
if err != nil {
t.Fatal(err)
}
outputTokenStream := dictFilter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func TestDictionaryCompoundFilterLongestMatch(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("softestball"),
Start: 0,
End: 11,
Position: 1,
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("softestball"),
Start: 0,
End: 11,
Position: 1,
},
&analysis.Token{
Term: []byte("softest"),
Start: 0,
End: 7,
Position: 1,
},
&analysis.Token{
Term: []byte("ball"),
Start: 7,
End: 11,
Position: 1,
},
}
cache := registry.NewCache()
dictListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"soft", "softest", "ball"},
}
_, err := cache.DefineTokenMap("dict_test", dictListConfig)
if err != nil {
t.Fatal(err)
}
dictConfig := map[string]interface{}{
"type": "dict_compound",
"dict_token_map": "dict_test",
"only_longest_match": true,
}
dictFilter, err := cache.DefineTokenFilter("dict_test", dictConfig)
if err != nil {
t.Fatal(err)
}
outputTokenStream := dictFilter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}


@@ -0,0 +1,118 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package edgengram
import (
"bytes"
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "edge_ngram"
type Side bool
const BACK Side = true
const FRONT Side = false
type EdgeNgramFilter struct {
back Side
minLength int
maxLength int
}
func NewEdgeNgramFilter(side Side, minLength, maxLength int) *EdgeNgramFilter {
return &EdgeNgramFilter{
back: side,
minLength: minLength,
maxLength: maxLength,
}
}
func (s *EdgeNgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
if s.back {
i := runeCount
// index of the starting rune for this token
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
// build an ngram of this size starting at i
if i-ngramSize >= 0 {
ngramTerm := analysis.BuildTermFromRunes(runes[i-ngramSize : i])
token := analysis.Token{
Position: token.Position,
Start: token.Start,
End: token.End,
Type: token.Type,
Term: ngramTerm,
}
rv = append(rv, &token)
}
}
} else {
i := 0
// index of the starting rune for this token
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
// build an ngram of this size starting at i
if i+ngramSize <= runeCount {
ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
token := analysis.Token{
Position: token.Position,
Start: token.Start,
End: token.End,
Type: token.Type,
Term: ngramTerm,
}
rv = append(rv, &token)
}
}
}
}
return rv
}
func EdgeNgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
side := FRONT
back, ok := config["back"].(bool)
if ok && back {
side = BACK
}
minVal, ok := config["min"].(float64)
if !ok {
return nil, fmt.Errorf("must specify min")
}
min := int(minVal)
maxVal, ok := config["max"].(float64)
if !ok {
return nil, fmt.Errorf("must specify max")
}
max := int(maxVal)
return NewEdgeNgramFilter(side, min, max), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, EdgeNgramFilterConstructor)
if err != nil {
panic(err)
}
}
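
A short hypothetical example of front edge n-grams (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/edgengram"
)

func main() {
	// n-grams of length 1 to 3, anchored at the front of each term
	f := edgengram.NewEdgeNgramFilter(edgengram.FRONT, 1, 3)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("abcde")}}
	for _, tok := range f.Filter(stream) {
		fmt.Printf("%s ", tok.Term) // a ab abc
	}
}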


@@ -0,0 +1,189 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package edgengram
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestEdgeNgramFilter(t *testing.T) {
tests := []struct {
side Side
min int
max int
input analysis.TokenStream
output analysis.TokenStream
}{
{
side: FRONT,
min: 1,
max: 1,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
},
},
{
side: BACK,
min: 1,
max: 1,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("e"),
},
},
},
{
side: FRONT,
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("abc"),
},
},
},
{
side: BACK,
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("e"),
},
&analysis.Token{
Term: []byte("de"),
},
&analysis.Token{
Term: []byte("cde"),
},
},
},
{
side: FRONT,
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
&analysis.Token{
Term: []byte("vwxyz"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("abc"),
},
&analysis.Token{
Term: []byte("v"),
},
&analysis.Token{
Term: []byte("vw"),
},
&analysis.Token{
Term: []byte("vwx"),
},
},
},
{
side: BACK,
min: 3,
max: 5,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Beryl"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ryl"),
},
&analysis.Token{
Term: []byte("eryl"),
},
&analysis.Token{
Term: []byte("Beryl"),
},
},
},
{
side: FRONT,
min: 3,
max: 5,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Beryl"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Ber"),
},
&analysis.Token{
Term: []byte("Bery"),
},
&analysis.Token{
Term: []byte("Beryl"),
},
},
},
}
for _, test := range tests {
edgeNgramFilter := NewEdgeNgramFilter(test.side, test.min, test.max)
actual := edgeNgramFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}


@@ -0,0 +1,77 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package elision
import (
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "elision"
const RightSingleQuotationMark = '’'
const Apostrophe = '\''
type ElisionFilter struct {
articles analysis.TokenMap
}
func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter {
return &ElisionFilter{
articles: articles,
}
}
func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
term := token.Term
for i := 0; i < len(term); {
r, size := utf8.DecodeRune(term[i:])
if r == Apostrophe || r == RightSingleQuotationMark {
// see if the prefix matches one of the articles
prefix := term[0:i]
_, articleMatch := s.articles[string(prefix)]
if articleMatch {
token.Term = term[i+size:]
break
}
}
i += size
}
}
return input
}
func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
articlesTokenMapName, ok := config["articles_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify articles_token_map")
}
articlesTokenMap, err := cache.TokenMapNamed(articlesTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building elision filter: %v", err)
}
return NewElisionFilter(articlesTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ElisionFilterConstructor)
if err != nil {
panic(err)
}
}
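
A minimal sketch, assuming a one-entry article map and the repository's import layout:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/elision"
)

func main() {
	articles := analysis.NewTokenMap()
	articles.AddToken("l") // French l', as in l'avion
	f := elision.NewElisionFilter(articles)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("l'avion")}}
	fmt.Println(string(f.Filter(stream)[0].Term)) // prints: avion
}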


@@ -0,0 +1,85 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package elision
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
"github.com/blevesearch/bleve/v2/registry"
)
func TestElisionFilter(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ar" + string(Apostrophe) + "word"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("word"),
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ar" + string(RightSingleQuotationMark) + "word"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("word"),
},
},
},
}
cache := registry.NewCache()
articleListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"ar"},
}
_, err := cache.DefineTokenMap("articles_test", articleListConfig)
if err != nil {
t.Fatal(err)
}
elisionConfig := map[string]interface{}{
"type": "elision",
"articles_token_map": "articles_test",
}
elisionFilter, err := cache.DefineTokenFilter("elision_test", elisionConfig)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := elisionFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}


@@ -0,0 +1,95 @@
package hierarchy
import (
"bytes"
"fmt"
"math"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "hierarchy"
type HierarchyFilter struct {
maxLevels int
delimiter []byte
splitInput bool
}
func NewHierarchyFilter(delimiter []byte, maxLevels int, splitInput bool) *HierarchyFilter {
return &HierarchyFilter{
maxLevels: maxLevels,
delimiter: delimiter,
splitInput: splitInput,
}
}
func (s *HierarchyFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, s.maxLevels)
var soFar [][]byte
for _, token := range input {
if s.splitInput {
parts := bytes.Split(token.Term, s.delimiter)
for _, part := range parts {
soFar, rv = s.buildToken(rv, soFar, part)
if len(soFar) >= s.maxLevels {
return rv
}
}
} else {
soFar, rv = s.buildToken(rv, soFar, token.Term)
if len(soFar) >= s.maxLevels {
return rv
}
}
}
return rv
}
func (s *HierarchyFilter) buildToken(tokenStream analysis.TokenStream, soFar [][]byte, part []byte) (
[][]byte, analysis.TokenStream) {
soFar = append(soFar, part)
term := bytes.Join(soFar, s.delimiter)
tokenStream = append(tokenStream, &analysis.Token{
Type: analysis.Shingle,
Term: term,
Start: 0,
End: len(term),
Position: 1,
})
return soFar, tokenStream
}
func HierarchyFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
max := math.MaxInt64
maxVal, ok := config["max"].(float64)
if ok {
max = int(maxVal)
}
splitInput := true
splitInputVal, ok := config["split_input"].(bool)
if ok {
splitInput = splitInputVal
}
delimiter, ok := config["delimiter"].(string)
if !ok {
return nil, fmt.Errorf("must specify delimiter")
}
return NewHierarchyFilter([]byte(delimiter), max, splitInput), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, HierarchyFilterConstructor)
if err != nil {
panic(err)
}
}
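
A short hypothetical example of the split-input mode (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/hierarchy"
)

func main() {
	// delimiter "/", at most 10 levels, split each incoming term on the delimiter
	f := hierarchy.NewHierarchyFilter([]byte("/"), 10, true)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("a/b/c")}}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // a, then a/b, then a/b/c
	}
}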


@@ -0,0 +1,229 @@
package hierarchy
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestHierarchyFilter(t *testing.T) {
tests := []struct {
name string
delimiter string
max int
splitInput bool
input analysis.TokenStream
output analysis.TokenStream
}{
{
name: "single token a/b/c, delimiter /",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: true,
},
{
name: "multiple tokens already split a b c, delimiter /",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: true,
},
{
name: "single token a/b/c, delimiter /, limit 2",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
},
delimiter: "/",
max: 2,
splitInput: true,
},
{
name: "multiple tokens already split a b c, delimiter /, limit 2",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
},
delimiter: "/",
max: 2,
splitInput: true,
},
{
name: "single token a/b/c, delimiter /, no split",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: false,
},
{
name: "multiple tokens already split a b c, delimiter /, no split",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
Type: analysis.Shingle,
Start: 0,
End: 1,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b"),
Type: analysis.Shingle,
Start: 0,
End: 3,
Position: 1,
},
&analysis.Token{
Term: []byte("a/b/c"),
Type: analysis.Shingle,
Start: 0,
End: 5,
Position: 1,
},
},
delimiter: "/",
max: 10,
splitInput: false,
},
}
for _, test := range tests {
test := test
t.Run(test.name, func(t *testing.T) {
filter := NewHierarchyFilter([]byte(test.delimiter), test.max, test.splitInput)
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
})
}
}


@@ -0,0 +1,63 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package keyword
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "keyword_marker"
type KeyWordMarkerFilter struct {
keyWords analysis.TokenMap
}
func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter {
return &KeyWordMarkerFilter{
keyWords: keyWords,
}
}
func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
_, isKeyWord := f.keyWords[string(token.Term)]
if isKeyWord {
token.KeyWord = true
}
}
return input
}
func KeyWordMarkerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
keywordsTokenMapName, ok := config["keywords_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify keywords_token_map")
}
keywordsTokenMap, err := cache.TokenMapNamed(keywordsTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building keyword marker filter: %v", err)
}
return NewKeyWordMarkerFilter(keywordsTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, KeyWordMarkerFilterConstructor)
if err != nil {
panic(err)
}
}
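
A minimal sketch marking one keyword (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/keyword"
)

func main() {
	keywords := analysis.NewTokenMap()
	keywords.AddToken("walking")
	f := keyword.NewKeyWordMarkerFilter(keywords)
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("walking")}}
	// downstream filters (e.g. stemmers) skip tokens with KeyWord set
	fmt.Println(f.Filter(stream)[0].KeyWord) // true
}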


@@ -0,0 +1,73 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package keyword
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestKeyWordMarkerFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
KeyWord: true,
},
}
keyWordsMap := analysis.NewTokenMap()
keyWordsMap.AddToken("walk")
keyWordsMap.AddToken("park")
filter := NewKeyWordMarkerFilter(keyWordsMap)
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}


@@ -0,0 +1,80 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package length
import (
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "length"
type LengthFilter struct {
min int
max int
}
func NewLengthFilter(min, max int) *LengthFilter {
return &LengthFilter{
min: min,
max: max,
}
}
func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
wordLen := utf8.RuneCount(token.Term)
if f.min > 0 && f.min > wordLen {
continue
}
if f.max > 0 && f.max < wordLen {
continue
}
rv = append(rv, token)
}
return rv
}
func LengthFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
min := 0
max := 0
minVal, ok := config["min"].(float64)
if ok {
min = int(minVal)
}
maxVal, ok := config["max"].(float64)
if ok {
max = int(maxVal)
}
if min == max && max == 0 {
return nil, fmt.Errorf("either min or max must be non-zero")
}
return NewLengthFilter(min, max), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, LengthFilterConstructor)
if err != nil {
panic(err)
}
}
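
A short hypothetical example keeping only mid-length terms (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/length"
)

func main() {
	f := length.NewLengthFilter(3, 5) // keep terms of 3 to 5 runes
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("a")},
		&analysis.Token{Term: []byte("two")},
		&analysis.Token{Term: []byte("eleven")},
	}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // only: two
	}
}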


@@ -0,0 +1,99 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package length
import (
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestLengthFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("1"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
}
lengthFilter := NewLengthFilter(3, 4)
outputTokenStream := lengthFilter.Filter(inputTokenStream)
if len(outputTokenStream) != 1 {
t.Fatalf("expected 1 output token")
}
if string(outputTokenStream[0].Term) != "two" {
t.Errorf("expected term `two`, got `%s`", outputTokenStream[0].Term)
}
}
func TestLengthFilterNoMax(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("1"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
}
lengthFilter := NewLengthFilter(3, -1)
outputTokenStream := lengthFilter.Filter(inputTokenStream)
if len(outputTokenStream) != 2 {
t.Fatalf("expected 2 output tokens")
}
if string(outputTokenStream[0].Term) != "two" {
t.Errorf("expected term `two`, got `%s`", outputTokenStream[0].Term)
}
if string(outputTokenStream[1].Term) != "three" {
t.Errorf("expected term `three`, got `%s`", outputTokenStream[1].Term)
}
}
func TestLengthFilterNoMin(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("1"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
}
lengthFilter := NewLengthFilter(-1, 4)
outputTokenStream := lengthFilter.Filter(inputTokenStream)
if len(outputTokenStream) != 2 {
t.Fatalf("expected 2 output tokens")
}
if string(outputTokenStream[0].Term) != "1" {
t.Errorf("expected term `1`, got `%s`", outputTokenStream[0].Term)
}
if string(outputTokenStream[1].Term) != "two" {
t.Errorf("expected term `two`, got `%s`", outputTokenStream[1].Term)
}
}


@@ -0,0 +1,108 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package lowercase implements a TokenFilter which converts
// tokens to lower case according to unicode rules.
package lowercase
import (
"bytes"
"unicode"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the name used to register LowerCaseFilter in the bleve registry
const Name = "to_lower"
type LowerCaseFilter struct {
}
func NewLowerCaseFilter() *LowerCaseFilter {
return &LowerCaseFilter{}
}
func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = toLowerDeferredCopy(token.Term)
}
return input
}
func LowerCaseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewLowerCaseFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, LowerCaseFilterConstructor)
if err != nil {
panic(err)
}
}
// toLowerDeferredCopy works exactly like
// bytes.ToLower(), except that it reuses (overwrites)
// the original byte slice when possible.
// NOTE: because it is possible that the lower-case form
// of a rune has a different utf-8 encoded length, in
// those cases a new byte slice is allocated.
func toLowerDeferredCopy(s []byte) []byte {
j := 0
for i := 0; i < len(s); {
wid := 1
r := rune(s[i])
if r >= utf8.RuneSelf {
r, wid = utf8.DecodeRune(s[i:])
}
l := unicode.ToLower(r)
// If the rune is already lowercased, just move to the
// next rune.
if l == r {
i += wid
j += wid
continue
}
// Handle the Unicode edge case where a Greek Σ at the
// end of a word must lowercase to the final form ς
// rather than σ.
if l == 'σ' && i+2 == len(s) {
l = 'ς'
}
lwid := utf8.RuneLen(l)
if lwid > wid {
// utf-8 encoded replacement is wider
// for now, punt and defer
// to bytes.ToLower() for the remainder
// only known to happen with chars
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
rest := bytes.ToLower(s[i:])
rv := make([]byte, j+len(rest))
copy(rv[:j], s[:j])
copy(rv[j:], rest)
return rv
} else {
utf8.EncodeRune(s[j:], l)
}
i += wid
j += lwid
}
return s[:j]
}
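
A small sketch exercising both special cases discussed above, the final Greek sigma and the runes whose lower-case forms are wider in UTF-8 (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
)

func main() {
	f := lowercase.NewLowerCaseFilter()
	stream := analysis.TokenStream{
		&analysis.Token{Term: []byte("ὈΔΥΣΣ")}, // word-final Σ becomes ς
		&analysis.Token{Term: []byte("ȺȾ")},    // lower forms ⱥⱦ force a fresh allocation
	}
	for _, tok := range f.Filter(stream) {
		fmt.Println(string(tok.Term)) // ὀδυσς, then ⱥⱦ
	}
}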


@@ -0,0 +1,166 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package lowercase
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestLowerCaseFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("ONE"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("ThReE"),
},
&analysis.Token{
Term: []byte("steven's"),
},
// these characters are chosen in particular
// because the utf-8 encoding of the lower-case
// version has a different length
// Rune İ(304) width 2 - Lower i(105) width 1
// Rune Ⱥ(570) width 2 - Lower ⱥ(11365) width 3
// Rune Ⱦ(574) width 2 - Lower ⱦ(11366) width 3
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
&analysis.Token{
Term: []byte("ὈΔΥΣΣ"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("one"),
},
&analysis.Token{
Term: []byte("two"),
},
&analysis.Token{
Term: []byte("three"),
},
&analysis.Token{
Term: []byte("steven's"),
},
&analysis.Token{
Term: []byte("iⱥⱦcat"),
},
&analysis.Token{
Term: []byte("ⱥⱦcat"),
},
&analysis.Token{
Term: []byte("ὀδυσς"),
},
}
filter := NewLowerCaseFilter()
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
t.Errorf("expected %s got %s", expectedTokenStream[0].Term, outputTokenStream[0].Term)
}
}
func BenchmarkLowerCaseFilter(b *testing.B) {
input := analysis.TokenStream{
&analysis.Token{
Term: []byte("A"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("expanding"),
},
&analysis.Token{
Term: []byte("vapor"),
},
&analysis.Token{
Term: []byte("explosion"),
},
&analysis.Token{
Term: []byte("caused"),
},
&analysis.Token{
Term: []byte("by"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("rupture"),
},
&analysis.Token{
Term: []byte("of"),
},
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("vessel"),
},
&analysis.Token{
Term: []byte("containing"),
},
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("pressurized"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("above"),
},
&analysis.Token{
Term: []byte("its"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("point"),
},
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("ȺȾCAT"),
},
}
filter := NewLowerCaseFilter()
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(input)
}
}


@@ -0,0 +1,116 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ngram
import (
"bytes"
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "ngram"
type NgramFilter struct {
minLength int
maxLength int
}
func NewNgramFilter(minLength, maxLength int) *NgramFilter {
return &NgramFilter{
minLength: minLength,
maxLength: maxLength,
}
}
func (s *NgramFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
for _, token := range input {
runeCount := utf8.RuneCount(token.Term)
runes := bytes.Runes(token.Term)
for i := 0; i < runeCount; i++ {
// index of the starting rune for this token
for ngramSize := s.minLength; ngramSize <= s.maxLength; ngramSize++ {
// build an ngram of this size starting at i
if i+ngramSize <= runeCount {
ngramTerm := analysis.BuildTermFromRunes(runes[i : i+ngramSize])
token := analysis.Token{
Position: token.Position,
Start: token.Start,
End: token.End,
Type: token.Type,
Term: ngramTerm,
}
rv = append(rv, &token)
}
}
}
}
return rv
}
func NgramFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
minVal, ok := config["min"]
if !ok {
return nil, fmt.Errorf("must specify min")
}
min, err := convertToInt(minVal)
if err != nil {
return nil, err
}
maxVal, ok := config["max"]
if !ok {
return nil, fmt.Errorf("must specify max")
}
max, err := convertToInt(maxVal)
if err != nil {
return nil, err
}
return NewNgramFilter(min, max), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, NgramFilterConstructor)
if err != nil {
panic(err)
}
}
// Expects either an int or a float64 value
func convertToInt(val interface{}) (int, error) {
var intVal int
var floatVal float64
var ok bool
intVal, ok = val.(int)
if ok {
return intVal, nil
}
floatVal, ok = val.(float64)
if ok {
return int(floatVal), nil
}
return 0, fmt.Errorf("failed to convert to int value")
}
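
A short hypothetical example; note that, per convertToInt above, the constructor accepts min/max as either int or float64 (import path inferred from the repository layout):

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/analysis/token/ngram"
)

func main() {
	// mixed numeric types, as might arrive from JSON-decoded config
	f, err := ngram.NgramFilterConstructor(map[string]interface{}{
		"min": 2,
		"max": float64(2),
	}, nil)
	if err != nil {
		panic(err)
	}
	stream := analysis.TokenStream{&analysis.Token{Term: []byte("abcd")}}
	for _, tok := range f.Filter(stream) {
		fmt.Printf("%s ", tok.Term) // ab bc cd
	}
}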


@@ -0,0 +1,192 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package ngram
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestNgramFilter(t *testing.T) {
tests := []struct {
min int
max int
input analysis.TokenStream
output analysis.TokenStream
}{
{
min: 1,
max: 1,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("c"),
},
&analysis.Token{
Term: []byte("d"),
},
&analysis.Token{
Term: []byte("e"),
},
},
},
{
min: 2,
max: 2,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("bc"),
},
&analysis.Token{
Term: []byte("cd"),
},
&analysis.Token{
Term: []byte("de"),
},
},
},
{
min: 1,
max: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("ab"),
},
&analysis.Token{
Term: []byte("abc"),
},
&analysis.Token{
Term: []byte("b"),
},
&analysis.Token{
Term: []byte("bc"),
},
&analysis.Token{
Term: []byte("bcd"),
},
&analysis.Token{
Term: []byte("c"),
},
&analysis.Token{
Term: []byte("cd"),
},
&analysis.Token{
Term: []byte("cde"),
},
&analysis.Token{
Term: []byte("d"),
},
&analysis.Token{
Term: []byte("de"),
},
&analysis.Token{
Term: []byte("e"),
},
},
},
}
for _, test := range tests {
ngramFilter := NewNgramFilter(test.min, test.max)
actual := ngramFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}
func TestConversionInt(t *testing.T) {
config := map[string]interface{}{
"type": Name,
"min": 3,
"max": 8,
}
f, err := NgramFilterConstructor(config, nil)
if err != nil {
t.Errorf("Failed to construct the ngram filter: %v", err)
}
ngram := f.(*NgramFilter)
if ngram.minLength != 3 || ngram.maxLength != 8 {
t.Errorf("Failed to construct the bounds. Got %v and %v.", ngram.minLength, ngram.maxLength)
}
}
func TestConversionFloat(t *testing.T) {
config := map[string]interface{}{
"type": Name,
"min": float64(3),
"max": float64(8),
}
f, err := NgramFilterConstructor(config, nil)
if err != nil {
t.Errorf("Failed to construct the ngram filter: %v", err)
}
ngram := f.(*NgramFilter)
if ngram.minLength != 3 || ngram.maxLength != 8 {
t.Errorf("Failed to construct the bounds. Got %v and %v.", ngram.minLength, ngram.maxLength)
}
}
func TestBadConversion(t *testing.T) {
config := map[string]interface{}{
"type": Name,
"min": "3",
}
_, err := NgramFilterConstructor(config, nil)
if err == nil {
t.Errorf("Expected conversion error.")
}
if err.Error() != "failed to convert to int value" {
t.Errorf("Wrong error recevied. Got %v.", err)
}
}


@@ -0,0 +1,56 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package porter
import (
"bytes"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/go-porterstemmer"
)
const Name = "stemmer_porter"
type PorterStemmer struct {
}
func NewPorterStemmer() *PorterStemmer {
return &PorterStemmer{}
}
func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// if it is not a protected keyword, stem it
if !token.KeyWord {
termRunes := bytes.Runes(token.Term)
stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes)
token.Term = analysis.BuildTermFromRunes(stemmedRunes)
}
}
return input
}
func PorterStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewPorterStemmer(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, PorterStemmerConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,115 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package porter
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestPorterStemmer(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
// a term which does stem, but does not change length
&analysis.Token{
Term: []byte("marty"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
&analysis.Token{
Term: []byte("marti"),
},
}
filter := NewPorterStemmer()
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func BenchmarkPorterStemmer(b *testing.B) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
}
filter := NewPorterStemmer()
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(inputTokenStream)
}
}

@@ -0,0 +1,78 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package reverse
import (
"unicode"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the name used to register ReverseFilter in the bleve registry
const Name = "reverse"
type ReverseFilter struct {
}
func NewReverseFilter() *ReverseFilter {
return &ReverseFilter{}
}
func (f *ReverseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = reverse(token.Term)
}
return input
}
func ReverseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewReverseFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ReverseFilterConstructor)
if err != nil {
panic(err)
}
}
// reverse returns a new byte slice holding the runes of s in reverse
// order; combining marks remain attached to the runes they modify.
func reverse(s []byte) []byte {
cursorIn := 0
inputRunes := []rune(string(s))
cursorOut := len(s)
output := make([]byte, len(s))
for i := 0; i < len(inputRunes); {
wid := utf8.RuneLen(inputRunes[i])
i++
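// absorb any combining marks (Mn, Me, Mc) that follow the base
// rune so they stay attached to it in the reversed output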
for i < len(inputRunes) {
r := inputRunes[i]
if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Me, r) || unicode.Is(unicode.Mc, r) {
wid += utf8.RuneLen(r)
i++
} else {
break
}
}
copy(output[cursorOut-wid:cursorOut], s[cursorIn:cursorIn+wid])
cursorIn += wid
cursorOut -= wid
}
return output
}

@@ -0,0 +1,184 @@
// Copyright (c) 2019 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package reverse
import (
"bytes"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestReverseFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{},
&analysis.Token{
Term: []byte("one"),
},
&analysis.Token{
Term: []byte("TWo"),
},
&analysis.Token{
Term: []byte("thRee"),
},
&analysis.Token{
Term: []byte("four's"),
},
&analysis.Token{
Term: []byte("what's this in reverse"),
},
&analysis.Token{
Term: []byte("œ∑´®†"),
},
&analysis.Token{
Term: []byte("İȺȾCAT÷≥≤µ123"),
},
&analysis.Token{
Term: []byte("!@#$%^&*()"),
},
&analysis.Token{
Term: []byte("cafés"),
},
&analysis.Token{
Term: []byte("¿Dónde estás?"),
},
&analysis.Token{
Term: []byte("Me gustaría una cerveza."),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{},
&analysis.Token{
Term: []byte("eno"),
},
&analysis.Token{
Term: []byte("oWT"),
},
&analysis.Token{
Term: []byte("eeRht"),
},
&analysis.Token{
Term: []byte("s'ruof"),
},
&analysis.Token{
Term: []byte("esrever ni siht s'tahw"),
},
&analysis.Token{
Term: []byte("†®´∑œ"),
},
&analysis.Token{
Term: []byte("321µ≤≥÷TACȾȺİ"),
},
&analysis.Token{
Term: []byte(")(*&^%$#@!"),
},
&analysis.Token{
Term: []byte("séfac"),
},
&analysis.Token{
Term: []byte("?sátse ednóD¿"),
},
&analysis.Token{
Term: []byte(".azevrec anu aíratsug eM"),
},
}
filter := NewReverseFilter()
outputTokenStream := filter.Filter(inputTokenStream)
for i := 0; i < len(expectedTokenStream); i++ {
if !bytes.Equal(outputTokenStream[i].Term, expectedTokenStream[i].Term) {
t.Errorf("[%d] expected %s got %s",
i+1, expectedTokenStream[i].Term, outputTokenStream[i].Term)
}
}
}
func BenchmarkReverseFilter(b *testing.B) {
input := analysis.TokenStream{
&analysis.Token{
Term: []byte("A"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("expanding"),
},
&analysis.Token{
Term: []byte("vapor"),
},
&analysis.Token{
Term: []byte("explosion"),
},
&analysis.Token{
Term: []byte("caused"),
},
&analysis.Token{
Term: []byte("by"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("rupture"),
},
&analysis.Token{
Term: []byte("of"),
},
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("vessel"),
},
&analysis.Token{
Term: []byte("containing"),
},
&analysis.Token{
Term: []byte("pressurized"),
},
&analysis.Token{
Term: []byte("liquid"),
},
&analysis.Token{
Term: []byte("above"),
},
&analysis.Token{
Term: []byte("its"),
},
&analysis.Token{
Term: []byte("boiling"),
},
&analysis.Token{
Term: []byte("point"),
},
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("Me gustaría una cerveza."),
},
}
filter := NewReverseFilter()
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(input)
}
}

@@ -0,0 +1,172 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package shingle
import (
"container/ring"
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "shingle"
type ShingleFilter struct {
min int
max int
outputOriginal bool
tokenSeparator string
fill string
}
func NewShingleFilter(min, max int, outputOriginal bool, sep, fill string) *ShingleFilter {
return &ShingleFilter{
min: min,
max: max,
outputOriginal: outputOriginal,
tokenSeparator: sep,
fill: fill,
}
}
func (s *ShingleFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
rv := make(analysis.TokenStream, 0, len(input))
ring := ring.New(s.max)
itemsInRing := 0
currentPosition := 0
for _, token := range input {
if s.outputOriginal {
rv = append(rv, token)
}
// if there are gaps, insert filler tokens
offset := token.Position - currentPosition
for offset > 1 {
fillerToken := analysis.Token{
Position: 0,
Start: -1,
End: -1,
Type: analysis.AlphaNumeric,
Term: []byte(s.fill),
}
ring.Value = &fillerToken
if itemsInRing < s.max {
itemsInRing++
}
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
ring = ring.Next()
offset--
}
currentPosition = token.Position
ring.Value = token
if itemsInRing < s.max {
itemsInRing++
}
rv = append(rv, s.shingleCurrentRingState(ring, itemsInRing)...)
ring = ring.Next()
}
return rv
}
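// shingleCurrentRingState emits one shingle for each size from min to
// max that the ring currently holds enough tokens for, each shingle
// ending at the most recently added token.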
func (s *ShingleFilter) shingleCurrentRingState(ring *ring.Ring, itemsInRing int) analysis.TokenStream {
rv := make(analysis.TokenStream, 0)
for shingleN := s.min; shingleN <= s.max; shingleN++ {
// if there are enough items in the ring
// to produce a shingle of this size
if itemsInRing >= shingleN {
thisShingleRing := ring.Move(-(shingleN - 1))
shingledBytes := make([]byte, 0)
pos := 0
start := -1
end := 0
for i := 0; i < shingleN; i++ {
if i != 0 {
shingledBytes = append(shingledBytes, []byte(s.tokenSeparator)...)
}
curr := thisShingleRing.Value.(*analysis.Token)
if pos == 0 && curr.Position != 0 {
pos = curr.Position
}
if start == -1 && curr.Start != -1 {
start = curr.Start
}
if curr.End != -1 {
end = curr.End
}
shingledBytes = append(shingledBytes, curr.Term...)
thisShingleRing = thisShingleRing.Next()
}
token := analysis.Token{
Type: analysis.Shingle,
Term: shingledBytes,
}
if pos != 0 {
token.Position = pos
}
if start != -1 {
token.Start = start
}
if end != -1 {
token.End = end
}
rv = append(rv, &token)
}
}
return rv
}
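// A minimal sketch of a config accepted by ShingleFilterConstructor
// (key names taken from the checks below; "min" and "max" are required,
// the others fall back to the defaults shown):
//
// map[string]interface{}{
// "type": Name,
// "min": 2.0,
// "max": 3.0,
// "output_original": false,
// "separator": " ",
// "filler": "_",
// }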
func ShingleFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
minVal, ok := config["min"].(float64)
if !ok {
return nil, fmt.Errorf("must specify min")
}
min := int(minVal)
maxVal, ok := config["max"].(float64)
if !ok {
return nil, fmt.Errorf("must specify max")
}
max := int(maxVal)
outputOriginal := false
outVal, ok := config["output_original"].(bool)
if ok {
outputOriginal = outVal
}
sep := " "
sepVal, ok := config["separator"].(string)
if ok {
sep = sepVal
}
fill := "_"
fillVal, ok := config["filler"].(string)
if ok {
fill = fillVal
}
return NewShingleFilter(min, max, outputOriginal, sep, fill), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, ShingleFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,416 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package shingle
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestShingleFilter(t *testing.T) {
tests := []struct {
min int
max int
outputOriginal bool
separator string
filler string
input analysis.TokenStream
output analysis.TokenStream
}{
{
min: 2,
max: 2,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
},
},
{
min: 3,
max: 3,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown fox"),
Type: analysis.Shingle,
},
},
},
{
min: 2,
max: 3,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("the quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown fox"),
Type: analysis.Shingle,
},
},
},
{
min: 3,
max: 3,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ugly"),
Position: 1,
},
&analysis.Token{
Term: []byte("quick"),
Position: 3,
},
&analysis.Token{
Term: []byte("brown"),
Position: 4,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ugly _ quick"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("_ quick brown"),
Type: analysis.Shingle,
Position: 3,
},
},
},
{
min: 1,
max: 5,
outputOriginal: false,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Position: 1,
},
&analysis.Token{
Term: []byte("text"),
Position: 2,
},
// token 3 removed by stop filter
&analysis.Token{
Term: []byte("see"),
Position: 4,
},
&analysis.Token{
Term: []byte("shingles"),
Position: 5,
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("test"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("text"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("_"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("text _"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text _"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("see"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("_ see"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("text _ see"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text _ see"),
Type: analysis.Shingle,
Position: 1,
},
&analysis.Token{
Term: []byte("shingles"),
Type: analysis.Shingle,
Position: 5,
},
&analysis.Token{
Term: []byte("see shingles"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("_ see shingles"),
Type: analysis.Shingle,
Position: 4,
},
&analysis.Token{
Term: []byte("text _ see shingles"),
Type: analysis.Shingle,
Position: 2,
},
&analysis.Token{
Term: []byte("test text _ see shingles"),
Type: analysis.Shingle,
Position: 1,
},
},
},
{
min: 2,
max: 2,
outputOriginal: true,
separator: " ",
filler: "_",
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("fox"),
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
},
},
}
for _, test := range tests {
shingleFilter := NewShingleFilter(test.min, test.max, test.outputOriginal, test.separator, test.filler)
actual := shingleFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}
// TestShingleFilterBug431 tests that the shingle filter is in fact stateless
// by using the same filter instance twice and ensuring we do not get
// contaminated output
func TestShingleFilterBug431(t *testing.T) {
tests := []struct {
input analysis.TokenStream
output analysis.TokenStream
}{
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("quick"),
},
&analysis.Token{
Term: []byte("brown"),
},
&analysis.Token{
Term: []byte("fox"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("the quick"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("quick brown"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("brown fox"),
Type: analysis.Shingle,
},
},
},
{
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("sad"),
},
&analysis.Token{
Term: []byte("dirty"),
},
&analysis.Token{
Term: []byte("sock"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("a sad"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("sad dirty"),
Type: analysis.Shingle,
},
&analysis.Token{
Term: []byte("dirty sock"),
Type: analysis.Shingle,
},
},
},
}
shingleFilter := NewShingleFilter(2, 2, false, " ", "_")
for _, test := range tests {
actual := shingleFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output, actual)
}
}
}

@@ -0,0 +1,62 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"github.com/blevesearch/snowball"
)
const Name = "stemmer_snowball"
type SnowballStemmer struct {
language string
}
func NewSnowballStemmer(language string) *SnowballStemmer {
return &SnowballStemmer{
language: language,
}
}
func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
// if it is not a protected keyword, stem it
if !token.KeyWord {
stemmed, _ := snowball.Stem(string(token.Term), s.language, true)
token.Term = []byte(stemmed)
}
}
return input
}
func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
language, ok := config["language"].(string)
if !ok {
return nil, fmt.Errorf("must specify language")
}
return NewSnowballStemmer(language), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, SnowballStemmerConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,115 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package snowball
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestSnowballStemmer(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
// a term which does stem, but does not change length
&analysis.Token{
Term: []byte("marty"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("talk"),
},
&analysis.Token{
Term: []byte("busi"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
&analysis.Token{
Term: []byte("marti"),
},
}
filter := NewSnowballStemmer("english")
outputTokenStream := filter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func BenchmarkSnowballStemmer(b *testing.B) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walking"),
},
&analysis.Token{
Term: []byte("talked"),
},
&analysis.Token{
Term: []byte("business"),
},
&analysis.Token{
Term: []byte("protected"),
KeyWord: true,
},
&analysis.Token{
Term: []byte("cat"),
},
&analysis.Token{
Term: []byte("done"),
},
}
filter := NewSnowballStemmer("english")
b.ResetTimer()
for i := 0; i < b.N; i++ {
filter.Filter(inputTokenStream)
}
}

@@ -0,0 +1,73 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package stop implements a TokenFilter removing tokens found in
// a TokenMap.
//
// Its constructor takes the following arguments:
//
// "stop_token_map" (string): the name of the token map identifying tokens to
// remove.
package stop
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "stop_tokens"
type StopTokensFilter struct {
stopTokens analysis.TokenMap
}
func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter {
return &StopTokensFilter{
stopTokens: stopTokens,
}
}
func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
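// filter in place: copy each surviving token forward over the
// input stream and truncate, avoiding a second allocation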
j := 0
for _, token := range input {
_, isStopToken := f.stopTokens[string(token.Term)]
if !isStopToken {
input[j] = token
j++
}
}
return input[:j]
}
func StopTokensFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
stopTokenMapName, ok := config["stop_token_map"].(string)
if !ok {
return nil, fmt.Errorf("must specify stop_token_map")
}
stopTokenMap, err := cache.TokenMapNamed(stopTokenMapName)
if err != nil {
return nil, fmt.Errorf("error building stop words filter: %v", err)
}
return NewStopTokensFilter(stopTokenMap), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, StopTokensFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,124 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stop
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenmap"
"github.com/blevesearch/bleve/v2/registry"
)
func TestStopWordsFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
},
}
expectedTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("park"),
},
}
cache := registry.NewCache()
stopListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"a", "in", "the"},
}
_, err := cache.DefineTokenMap("stop_test", stopListConfig)
if err != nil {
t.Fatal(err)
}
stopConfig := map[string]interface{}{
"type": "stop_tokens",
"stop_token_map": "stop_test",
}
stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
if err != nil {
t.Fatal(err)
}
outputTokenStream := stopFilter.Filter(inputTokenStream)
if !reflect.DeepEqual(outputTokenStream, expectedTokenStream) {
t.Errorf("expected %#v got %#v", expectedTokenStream, outputTokenStream)
}
}
func BenchmarkStopWordsFilter(b *testing.B) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{
Term: []byte("a"),
},
&analysis.Token{
Term: []byte("walk"),
},
&analysis.Token{
Term: []byte("in"),
},
&analysis.Token{
Term: []byte("the"),
},
&analysis.Token{
Term: []byte("park"),
},
}
cache := registry.NewCache()
stopListConfig := map[string]interface{}{
"type": tokenmap.Name,
"tokens": []interface{}{"a", "in", "the"},
}
_, err := cache.DefineTokenMap("stop_test", stopListConfig)
if err != nil {
b.Fatal(err)
}
stopConfig := map[string]interface{}{
"type": "stop_tokens",
"stop_token_map": "stop_test",
}
stopFilter, err := cache.DefineTokenFilter("stop_test", stopConfig)
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
stopFilter.Filter(inputTokenStream)
}
}

@@ -0,0 +1,62 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package truncate
import (
"fmt"
"unicode/utf8"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "truncate_token"
type TruncateTokenFilter struct {
length int
}
func NewTruncateTokenFilter(length int) *TruncateTokenFilter {
return &TruncateTokenFilter{
length: length,
}
}
func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
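// measure length in runes, not bytes, so multi-byte UTF-8
// characters are never cut in half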
wordLen := utf8.RuneCount(token.Term)
if wordLen > s.length {
token.Term = analysis.TruncateRunes(token.Term, wordLen-s.length)
}
}
return input
}
func TruncateTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
lenVal, ok := config["length"].(float64)
if !ok {
return nil, fmt.Errorf("must specify length")
}
length := int(lenVal)
return NewTruncateTokenFilter(length), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, TruncateTokenFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,79 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package truncate
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestTruncateTokenFilter(t *testing.T) {
tests := []struct {
length int
input analysis.TokenStream
output analysis.TokenStream
}{
{
length: 5,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcdefgh"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("abcde"),
},
},
},
{
length: 3,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("こんにちは世界"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("こんに"),
},
},
},
{
length: 10,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("แยกคำภาษาไ"),
},
},
},
}
for _, test := range tests {
truncateTokenFilter := NewTruncateTokenFilter(test.length)
actual := truncateTokenFilter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,82 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unicodenorm
import (
"fmt"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
"golang.org/x/text/unicode/norm"
)
const Name = "normalize_unicode"
const NFC = "nfc"
const NFD = "nfd"
const NFKC = "nfkc"
const NFKD = "nfkd"
var forms = map[string]norm.Form{
NFC: norm.NFC,
NFD: norm.NFD,
NFKC: norm.NFKC,
NFKD: norm.NFKD,
}
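// UnicodeNormalizeFilter rewrites each token's term into the configured
// Unicode normalization form (NFC, NFD, NFKC or NFKD).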
type UnicodeNormalizeFilter struct {
form norm.Form
}
func NewUnicodeNormalizeFilter(formName string) (*UnicodeNormalizeFilter, error) {
form, ok := forms[formName]
if !ok {
return nil, fmt.Errorf("no form named %s", formName)
}
return &UnicodeNormalizeFilter{
form: form,
}, nil
}
func MustNewUnicodeNormalizeFilter(formName string) *UnicodeNormalizeFilter {
filter, err := NewUnicodeNormalizeFilter(formName)
if err != nil {
panic(err)
}
return filter
}
func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = s.form.Bytes(token.Term)
}
return input
}
func UnicodeNormalizeFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
formVal, ok := config["form"].(string)
if !ok {
return nil, fmt.Errorf("must specify form")
}
form := formVal
return NewUnicodeNormalizeFilter(form)
}
func init() {
err := registry.RegisterTokenFilter(Name, UnicodeNormalizeFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,162 @@
// Copyright (c) 2014 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unicodenorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
// The following tests come from the Lucene test cases for the CJK
// width filter, which is the basis for using unicode normalization
// as a substitute for it.
func TestUnicodeNormalization(t *testing.T) {
tests := []struct {
formName string
input analysis.TokenStream
output analysis.TokenStream
}{
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("Ｔｅｓｔ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("Test"),
},
},
},
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("１２３４"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("1234"),
},
},
},
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("カタカナ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("カタカナ"),
},
},
},
{
formName: NFKC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("ヴィッツ"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("ヴィッツ"),
},
},
},
{
formName: NFKC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("パナソニック"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("パナソニック"),
},
},
},
{
formName: NFD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u212B"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0041\u030A"),
},
},
},
{
formName: NFC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u212B"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u00C5"),
},
},
},
{
formName: NFKD,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\uFB01"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0066\u0069"),
},
},
},
{
formName: NFKC,
input: analysis.TokenStream{
&analysis.Token{
Term: []byte("\uFB01"),
},
},
output: analysis.TokenStream{
&analysis.Token{
Term: []byte("\u0066\u0069"),
},
},
},
}
for _, test := range tests {
filter := MustNewUnicodeNormalizeFilter(test.formName)
actual := filter.Filter(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
t.Errorf("expected %#v, got %#v", test.output[0].Term, actual[0].Term)
}
}
}

@@ -0,0 +1,56 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unique
import (
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const Name = "unique"
// UniqueTermFilter retains only the tokens which mark the first occurrence of
// a term. Tokens whose term appears in a preceding token are dropped.
type UniqueTermFilter struct{}
func NewUniqueTermFilter() *UniqueTermFilter {
return &UniqueTermFilter{}
}
func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
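// len(input)/4 is only a sizing hint for the set of seen terms;
// the map grows as needed when there are more distinct terms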
encounteredTerms := make(map[string]struct{}, len(input)/4)
j := 0
for _, token := range input {
term := string(token.Term)
if _, ok := encounteredTerms[term]; ok {
continue
}
encounteredTerms[term] = struct{}{}
input[j] = token
j++
}
return input[:j]
}
func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewUniqueTermFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor)
if err != nil {
panic(err)
}
}

@@ -0,0 +1,84 @@
// Copyright (c) 2018 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package unique
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/v2/analysis"
)
func TestUniqueTermFilter(t *testing.T) {
var tests = []struct {
input analysis.TokenStream
// expected indices of input which should be included in the output. We
// use indices instead of another TokenStream, since position/start/end
// should be preserved.
expectedIndices []int
}{
{
input: tokenStream(),
expectedIndices: []int{},
},
{
input: tokenStream("a"),
expectedIndices: []int{0},
},
{
input: tokenStream("each", "term", "in", "this", "sentence", "is", "unique"),
expectedIndices: []int{0, 1, 2, 3, 4, 5, 6},
},
{
input: tokenStream("Lui", "è", "alto", "e", "lei", "è", "bassa"),
expectedIndices: []int{0, 1, 2, 3, 4, 6},
},
{
input: tokenStream("a", "a", "A", "a", "a", "A"),
expectedIndices: []int{0, 2},
},
}
uniqueTermFilter := NewUniqueTermFilter()
for _, test := range tests {
expected := subStream(test.input, test.expectedIndices)
actual := uniqueTermFilter.Filter(test.input)
if !reflect.DeepEqual(actual, expected) {
t.Errorf("expected %s \n\n got %s", expected, actual)
}
}
}
func tokenStream(termStrs ...string) analysis.TokenStream {
tokenStream := make([]*analysis.Token, len(termStrs))
index := 0
for i, termStr := range termStrs {
tokenStream[i] = &analysis.Token{
Term: []byte(termStr),
Position: i + 1,
Start: index,
End: index + len(termStr),
}
index += len(termStr)
}
return analysis.TokenStream(tokenStream)
}
func subStream(stream analysis.TokenStream, indices []int) analysis.TokenStream {
result := make(analysis.TokenStream, len(indices))
for i, index := range indices {
result[i] = stream[index]
}
return result
}