1
0
Fork 0
golang-github-blevesearch-b.../analysis/lang/en/plural_stemmer.go
Daniel Baumann 982828099e
Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-19 00:20:02 +02:00

177 lines
4.6 KiB
Go

/*
This code was ported from the Open Search Project
https://github.com/opensearch-project/OpenSearch/blob/main/modules/analysis-common/src/main/java/org/opensearch/analysis/common/EnglishPluralStemFilter.java
The algorithm itself was created by Mark Harwood
https://github.com/markharwood
*/
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package en
import (
"strings"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)
const PluralStemmerName = "stemmer_en_plural"
type EnglishPluralStemmerFilter struct {
}
func NewEnglishPluralStemmerFilter() *EnglishPluralStemmerFilter {
return &EnglishPluralStemmerFilter{}
}
func (s *EnglishPluralStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
for _, token := range input {
token.Term = []byte(stem(string(token.Term)))
}
return input
}
func EnglishPluralStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
return NewEnglishPluralStemmerFilter(), nil
}
func init() {
err := registry.RegisterTokenFilter(PluralStemmerName, EnglishPluralStemmerFilterConstructor)
if err != nil {
panic(err)
}
}
// ----------------------------------------------------------------------------
// Words ending in oes that retain the e when stemmed
var oesExceptions = []string{"shoes", "canoes", "oboes"}
// Words ending in ches that retain the e when stemmed
var chesExceptions = []string{
"cliches",
"avalanches",
"mustaches",
"moustaches",
"quiches",
"headaches",
"heartaches",
"porsches",
"tranches",
"caches",
}
func stem(word string) string {
runes := []rune(strings.ToLower(word))
if len(runes) < 3 || runes[len(runes)-1] != 's' {
return string(runes)
}
switch runes[len(runes)-2] {
case 'u':
fallthrough
case 's':
return string(runes)
case 'e':
// Modified ies->y logic from original s-stemmer - only work on strings > 4
// so spies -> spy still but pies->pie.
// The original code also special-cased aies and eies for no good reason as far as I can tell.
// ( no words of consequence - eg http://www.thefreedictionary.com/words-that-end-in-aies )
if len(runes) > 4 && runes[len(runes)-3] == 'i' {
runes[len(runes)-3] = 'y'
return string(runes[0 : len(runes)-2])
}
// Suffix rules to remove any dangling "e"
if len(runes) > 3 {
// xes (but >1 prefix so we can stem "boxes->box" but keep "axes->axe")
if len(runes) > 4 && runes[len(runes)-3] == 'x' {
return string(runes[0 : len(runes)-2])
}
// oes
if len(runes) > 3 && runes[len(runes)-3] == 'o' {
if isException(runes, oesExceptions) {
// Only remove the S
return string(runes[0 : len(runes)-1])
}
// Remove the es
return string(runes[0 : len(runes)-2])
}
if len(runes) > 4 {
// shes/sses
if runes[len(runes)-4] == 's' && (runes[len(runes)-3] == 'h' || runes[len(runes)-3] == 's') {
return string(runes[0 : len(runes)-2])
}
// ches
if len(runes) > 4 {
if runes[len(runes)-4] == 'c' && runes[len(runes)-3] == 'h' {
if isException(runes, chesExceptions) {
// Only remove the S
return string(runes[0 : len(runes)-1])
}
// Remove the es
return string(runes[0 : len(runes)-2])
}
}
}
}
fallthrough
default:
return string(runes[0 : len(runes)-1])
}
}
func isException(word []rune, exceptions []string) bool {
for _, exception := range exceptions {
exceptionRunes := []rune(exception)
exceptionPos := len(exceptionRunes) - 1
wordPos := len(word) - 1
matched := true
for exceptionPos >= 0 && wordPos >= 0 {
if exceptionRunes[exceptionPos] != word[wordPos] {
matched = false
break
}
exceptionPos--
wordPos--
}
if matched {
return true
}
}
return false
}