Adding upstream version 2.5.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
c71cb8b61d
commit
982828099e
783 changed files with 150650 additions and 0 deletions
60
analysis/lang/pl/analyzer_pl.go
Normal file
60
analysis/lang/pl/analyzer_pl.go
Normal file
|
@ -0,0 +1,60 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package pl
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
)
|
||||
|
||||
// AnalyzerName is the name under which this package's init function
// registers AnalyzerConstructor with the bleve registry.
const AnalyzerName = "pl"
|
||||
|
||||
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
|
||||
tokenizer, err := cache.TokenizerNamed(unicode.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stopPlFilter, err := cache.TokenFilterNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stemmerPlFilter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv := analysis.DefaultAnalyzer{
|
||||
Tokenizer: tokenizer,
|
||||
TokenFilters: []analysis.TokenFilter{
|
||||
toLowerFilter,
|
||||
stopPlFilter,
|
||||
stemmerPlFilter,
|
||||
},
|
||||
}
|
||||
return &rv, nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
149
analysis/lang/pl/analyzer_pl_test.go
Normal file
149
analysis/lang/pl/analyzer_pl_test.go
Normal file
|
@ -0,0 +1,149 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package pl
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestPolishAnalyzer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input []byte
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
// stemming
|
||||
{
|
||||
input: []byte("śmiało"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("śmieć"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("przypadku"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("przypadek"),
|
||||
},
|
||||
},
|
||||
},
|
||||
// stop word
|
||||
{
|
||||
input: []byte("według"),
|
||||
output: analysis.TokenStream{},
|
||||
},
|
||||
// digits safe
|
||||
{
|
||||
input: []byte("text 1000"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("text"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("1000"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("badawczego było opracowanie kompendium które przystępny sposób prezentowało niespecjalistom zakresu kryptografii kwantowej wykorzystanie technik kwantowych do bezpiecznego przesyłu przetwarzania informacji"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("badawczy"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("opracować"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("kompendium"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("przystyć"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("prezentować"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("niespecjalista"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("zakres"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("kryptografia"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("kwantowy"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("wykorzyseć"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("technika"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("kwantowy"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("bezpieczny"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("przesył"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("przetwarzać"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("informacja"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: []byte("Ale ta wiedza była utrzymywana w tajemnicy"),
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("wiedza"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("utrzymywać"),
|
||||
},
|
||||
&analysis.Token{
|
||||
Term: []byte("tajemnik"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
analyzer, err := cache.AnalyzerNamed(AnalyzerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := analyzer.Analyze(test.input)
|
||||
if len(actual) != len(test.output) {
|
||||
t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
|
||||
}
|
||||
for i, tok := range actual {
|
||||
if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
|
||||
t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
58
analysis/lang/pl/stemmer_pl.go
Normal file
58
analysis/lang/pl/stemmer_pl.go
Normal file
|
@ -0,0 +1,58 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package pl
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/lang/pl/stempel"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// SnowballStemmerName is the registry name of the Polish stemming token
// filter; init registers PolishStemmerFilterConstructor under it.
// NOTE(review): despite "Snowball" in the identifier, the filter built by
// that constructor is driven by the stempel trie — the identifier looks
// historical; confirm before relying on the name.
const SnowballStemmerName = "stemmer_pl"
|
||||
|
||||
type PolishStemmerFilter struct {
|
||||
trie stempel.Trie
|
||||
}
|
||||
|
||||
func NewPolishStemmerFilter() (*PolishStemmerFilter, error) {
|
||||
trie, err := stempel.LoadTrie()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &PolishStemmerFilter{
|
||||
trie: trie,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (s *PolishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
|
||||
for _, token := range input {
|
||||
buff := []rune(string(token.Term))
|
||||
diff := s.trie.GetLastOnPath(buff)
|
||||
buff = stempel.Diff(buff, diff)
|
||||
token.Term = []byte(string(buff))
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func PolishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
return NewPolishStemmerFilter()
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(SnowballStemmerName, PolishStemmerFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
67
analysis/lang/pl/stemmer_pl_test.go
Normal file
67
analysis/lang/pl/stemmer_pl_test.go
Normal file
|
@ -0,0 +1,67 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package pl
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func TestPolishStemmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
input analysis.TokenStream
|
||||
output analysis.TokenStream
|
||||
}{
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("utrzymywana"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("utrzymywać"),
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
input: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tajemnicy"),
|
||||
},
|
||||
},
|
||||
output: analysis.TokenStream{
|
||||
&analysis.Token{
|
||||
Term: []byte("tajemnik"),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
cache := registry.NewCache()
|
||||
filter, err := cache.TokenFilterNamed(SnowballStemmerName)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, test := range tests {
|
||||
actual := filter.Filter(test.input)
|
||||
if !reflect.DeepEqual(actual, test.output) {
|
||||
t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
|
||||
}
|
||||
}
|
||||
}
|
202
analysis/lang/pl/stempel/LICENSE
Normal file
202
analysis/lang/pl/stempel/LICENSE
Normal file
|
@ -0,0 +1,202 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
53
analysis/lang/pl/stempel/cell.go
Normal file
53
analysis/lang/pl/stempel/cell.go
Normal file
|
@ -0,0 +1,53 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/stempel/javadata"
|
||||
)
|
||||
|
||||
// cell is one decoded trie cell. Only the two values kept by newCell are
// stored: ref and cmd.
type cell struct {
	ref int32
	cmd int32
}

// String renders the cell as "ref(<ref>) cmd(<cmd>)".
func (c *cell) String() string {
	const layout = "ref(%d) cmd(%d)"
	return fmt.Sprintf(layout, c.ref, c.cmd)
}
|
||||
|
||||
func newCell(r *javadata.Reader) (*cell, error) {
|
||||
cmd, err := r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading cell cmd: %v", err)
|
||||
}
|
||||
_, err = r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading cell cnt: %v", err)
|
||||
}
|
||||
ref, err := r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading cell ref: %v", err)
|
||||
}
|
||||
_, err = r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading cell skip: %v", err)
|
||||
}
|
||||
return &cell{
|
||||
cmd: cmd,
|
||||
ref: ref,
|
||||
}, nil
|
||||
}
|
64
analysis/lang/pl/stempel/diff.go
Normal file
64
analysis/lang/pl/stempel/diff.go
Normal file
|
@ -0,0 +1,64 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
// Diff transforms the dest rune slice following the rules described
// in the diff command rune slice and returns the result.
//
// diff is read as consecutive (command, parameter) rune pairs; a trailing
// unpaired rune is ignored. A cursor starts on the last rune of dest and
// moves one position toward the front after every command. The parameter
// doubles as a count, where 'a' means 1, 'b' means 2 and so on.
//
//	'-'  skip:    move the cursor back by count runes in total
//	'R'  replace: overwrite the rune under the cursor with the parameter
//	'D'  delete:  remove count runes ending at the cursor
//	'I'  insert:  insert the parameter directly after the cursor
//
// Any other command only moves the cursor. dest may be modified in place.
// If dest or diff is empty, dest is returned as is. As soon as a command
// would touch an index outside dest, processing stops and dest is returned
// in the state reached so far.
func Diff(dest, diff []rune) []rune {
	if len(diff) == 0 {
		return dest
	}

	pos := len(dest) - 1
	if pos < 0 {
		return dest
	}

	for i := 0; i < len(diff)/2; i++ {
		cmd := diff[2*i]
		param := diff[2*i+1]
		parNum := int(param - 'a' + 1)
		switch cmd {
		case '-':
			pos = pos - parNum + 1
		case 'R':
			if pos < 0 || pos >= len(dest) {
				// out of bounds, just return
				return dest
			}
			dest[pos] = param
		case 'D':
			// tail is the index of the first rune kept after the deleted run.
			tail := pos + 1
			pos -= parNum - 1
			if pos < 0 || pos >= len(dest) {
				// out of bounds, just return
				return dest
			}
			// A preceding skip with an out-of-range parameter can leave the
			// old cursor outside dest even though the new one is inside;
			// slicing with such a tail would panic, so bail out here too.
			if tail < 0 || tail > len(dest) {
				return dest
			}
			dest = append(dest[:pos], dest[tail:]...)
		case 'I':
			pos++
			if pos < 0 || pos > len(dest) {
				// out of bounds, just return
				return dest
			}

			// Grow by one, shift the tail right, then drop the rune in.
			dest = append(dest, 0)
			copy(dest[pos+1:], dest[pos:])
			dest[pos] = param
		}
		pos--
	}
	return dest
}
|
144
analysis/lang/pl/stempel/diff_test.go
Normal file
144
analysis/lang/pl/stempel/diff_test.go
Normal file
|
@ -0,0 +1,144 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDiff(t *testing.T) {
|
||||
tests := []struct {
|
||||
in []rune
|
||||
cmd []rune
|
||||
out []rune
|
||||
}{
|
||||
// test delete, this command deletes N chars backwards from the current pos
|
||||
// the current pos starts at the end of the string
|
||||
// if you try to delete a negative number of chars or more chars than there
|
||||
// are, you will get the buffer at that time
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete 1
|
||||
cmd: []rune{'D', 'a'},
|
||||
out: []rune{'h', 'e', 'l', 'l'},
|
||||
},
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete 2
|
||||
cmd: []rune{'D', 'a' + 1},
|
||||
out: []rune{'h', 'e', 'l'},
|
||||
},
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete 3
|
||||
cmd: []rune{'D', 'a' + 2},
|
||||
out: []rune{'h', 'e'},
|
||||
},
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete 4
|
||||
cmd: []rune{'D', 'a' + 3},
|
||||
out: []rune{'h'},
|
||||
},
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete 5
|
||||
cmd: []rune{'D', 'a' + 4},
|
||||
out: []rune{},
|
||||
},
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete 6 (invalid, return buffer at that point)
|
||||
cmd: []rune{'D', 'a' + 5},
|
||||
out: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
},
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete -1
|
||||
cmd: []rune{'D', 'a' - 1},
|
||||
out: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
},
|
||||
// delete one char twice
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// delete 1, delete 1
|
||||
cmd: []rune{'D', 'a', 'D', 'a'},
|
||||
out: []rune{'h', 'e', 'l'},
|
||||
},
|
||||
// test insert
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// insert 'p'
|
||||
cmd: []rune{'I', 'p'},
|
||||
out: []rune{'h', 'e', 'l', 'l', 'o', 'p'},
|
||||
},
|
||||
// insert twice
|
||||
{
|
||||
in: []rune{'h'},
|
||||
// insert 'l', insert 'e'
|
||||
// NOTE how the cursor moves backwards, so we have to insert in reverse
|
||||
cmd: []rune{'I', 'l', 'I', 'e'},
|
||||
out: []rune{'h', 'e', 'l'},
|
||||
},
|
||||
// test replace
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// replace with 'y'
|
||||
cmd: []rune{'R', 'y'},
|
||||
out: []rune{'h', 'e', 'l', 'l', 'y'},
|
||||
},
|
||||
// test replace again
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// replace with 'y', then replace with 'x'
|
||||
// NOTE how the cursor moves backwards as we replace
|
||||
cmd: []rune{'R', 'y', 'R', 'x'},
|
||||
out: []rune{'h', 'e', 'l', 'x', 'y'},
|
||||
},
|
||||
// test skip, then replace
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// skip 1, then replace with 'y'
|
||||
cmd: []rune{'-', 'a', 'R', 'y'},
|
||||
out: []rune{'h', 'e', 'l', 'y', 'o'},
|
||||
},
|
||||
// test skip 2, then replace
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// skip 1, then replace with 'y'
|
||||
cmd: []rune{'-', 'a' + 1, 'R', 'y'},
|
||||
out: []rune{'h', 'e', 'y', 'l', 'o'},
|
||||
},
|
||||
// test skip 2, then replace
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
// skip 5 (too far), then replace with 'y'
|
||||
// get original
|
||||
cmd: []rune{'-', 'a' + 4, 'R', 'y'},
|
||||
out: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(fmt.Sprintf("%s-'%s'", string(test.in), string(test.cmd)), func(t *testing.T) {
|
||||
got := Diff(test.in, test.cmd)
|
||||
if !reflect.DeepEqual(test.out, got) {
|
||||
t.Errorf("expected %v, got %v", test.out, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
71
analysis/lang/pl/stempel/file.go
Normal file
71
analysis/lang/pl/stempel/file.go
Normal file
|
@ -0,0 +1,71 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
_ "embed"
|
||||
"github.com/blevesearch/stempel/javadata"
|
||||
"io"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// stempelFile holds the raw bytes of pl/stemmer_20000.tbl, embedded at build
// time by the go:embed directive; LoadTrie builds its trie from these bytes.
//
//go:embed pl/stemmer_20000.tbl
|
||||
var stempelFile []byte
|
||||
|
||||
// Trie is the external interface to work with the stempel trie.
type Trie interface {
	// GetLastOnPath takes a key as runes and returns a rune slice; the
	// Polish stemmer filter passes that result to Diff as the patch command.
	GetLastOnPath(key []rune) []rune
}
|
||||
|
||||
// Open attempts to open a file at the specified path, and use it to
|
||||
// build a Trie
|
||||
func Open(path string) (Trie, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return buildTrieFromReader(f)
|
||||
}
|
||||
|
||||
// LoadTrie load trie from embed file
|
||||
func LoadTrie() (Trie, error) {
|
||||
return buildTrieFromReader(bytes.NewReader(stempelFile))
|
||||
}
|
||||
|
||||
// buildTrieFromReader build trie from io.Reader
|
||||
func buildTrieFromReader(f io.Reader) (Trie, error) {
|
||||
r := javadata.NewReader(f)
|
||||
method, err := r.ReadUTF()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var rv Trie
|
||||
if strings.Contains(method, "M") {
|
||||
rv, err = newMultiTrie(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
} else {
|
||||
rv, err = newTrie(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
return rv, nil
|
||||
}
|
91
analysis/lang/pl/stempel/file_test.go
Normal file
91
analysis/lang/pl/stempel/file_test.go
Normal file
|
@ -0,0 +1,91 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"compress/gzip"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
)
|
||||
|
||||
func TestEmpty(t *testing.T) {
|
||||
trie, err := Open("pl/stemmer_20000.tbl")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
buff := []rune("")
|
||||
diff := trie.GetLastOnPath(buff)
|
||||
if len(diff) > 0 {
|
||||
t.Fatalf("expected empty diff, got %v", diff)
|
||||
}
|
||||
buff = Diff(buff, diff)
|
||||
if len(buff) > 0 {
|
||||
t.Fatalf("expected empty buff, got %v", buff)
|
||||
}
|
||||
}
|
||||
|
||||
// TestStem only tests that we can successfully stem everything in the
|
||||
// dictionary without crashing. It does not attempt to assert correct output.
|
||||
func TestStem(t *testing.T) {
|
||||
trie, err := Open("pl/stemmer_20000.tbl")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
wordFileGz, err := os.Open("pl/pl_PL.dic.gz")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() {
|
||||
cerr := wordFileGz.Close()
|
||||
if cerr != nil {
|
||||
t.Fatal(cerr)
|
||||
}
|
||||
}()
|
||||
|
||||
wordFile, err := gzip.NewReader(wordFileGz)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer func() {
|
||||
cerr := wordFile.Close()
|
||||
if cerr != nil {
|
||||
t.Fatal(cerr)
|
||||
}
|
||||
}()
|
||||
|
||||
cr := charmap.ISO8859_2.NewDecoder().Reader(wordFile)
|
||||
|
||||
scanner := bufio.NewScanner(cr)
|
||||
for scanner.Scan() {
|
||||
before := scanner.Text()
|
||||
hasSlash := strings.Index(before, "/")
|
||||
if hasSlash > 0 {
|
||||
before = before[0:hasSlash]
|
||||
}
|
||||
buff := []rune(before)
|
||||
diff := trie.GetLastOnPath(buff)
|
||||
_ = Diff(buff, diff)
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
35
analysis/lang/pl/stempel/fuzz.go
Normal file
35
analysis/lang/pl/stempel/fuzz.go
Normal file
|
@ -0,0 +1,35 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build gofuzz
|
||||
// +build gofuzz
|
||||
|
||||
package stempel
|
||||
|
||||
var fuzzTrie Trie
|
||||
|
||||
func init() {
|
||||
var err error
|
||||
fuzzTrie, err = Open("pl/stemmer_20000.tbl")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
func Fuzz(data []byte) int {
|
||||
inRunes := []rune(string(data))
|
||||
diff := fuzzTrie.GetLastOnPath(inRunes)
|
||||
_ = Diff(inRunes, diff)
|
||||
return 1
|
||||
}
|
3
analysis/lang/pl/stempel/javadata/README.md
Normal file
3
analysis/lang/pl/stempel/javadata/README.md
Normal file
|
@ -0,0 +1,3 @@
|
|||
# javadata
|
||||
|
||||
Go library to read data written with java.io.DataOutput
|
34
analysis/lang/pl/stempel/javadata/fuzz.go
Normal file
34
analysis/lang/pl/stempel/javadata/fuzz.go
Normal file
|
@ -0,0 +1,34 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//go:build gofuzz
|
||||
// +build gofuzz
|
||||
|
||||
package javadata
|
||||
|
||||
import "bytes"
|
||||
|
||||
func Fuzz(data []byte) int {
|
||||
br := bytes.NewReader(data)
|
||||
jdr := NewReader(br)
|
||||
|
||||
var err error
|
||||
for err == nil {
|
||||
_, err = jdr.ReadUTF()
|
||||
}
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return 1
|
||||
}
|
135
analysis/lang/pl/stempel/javadata/input.go
Normal file
135
analysis/lang/pl/stempel/javadata/input.go
Normal file
|
@ -0,0 +1,135 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package javadata
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
)
|
||||
|
||||
// ErrMalformedInput is returned by ReadUTF when the bytes of a string do not
// form a valid sequence in the encoding ReadUTF expects.
var ErrMalformedInput = fmt.Errorf("malformed input")

// Reader decodes values written with java.io.DataOutput from an underlying
// stream. Multi-byte integers and chars are read big-endian.
type Reader struct {
	r *bufio.Reader
}

// NewReader returns a Reader that decodes from r through an internal buffer.
func NewReader(r io.Reader) *Reader {
	return &Reader{r: bufio.NewReader(r)}
}

// ReadBool reads one byte and reports whether it is non-zero.
func (r *Reader) ReadBool() (bool, error) {
	b, err := r.r.ReadByte()
	if err != nil {
		return false, err
	}
	return b != 0, nil
}

// ReadInt32 reads a big-endian signed 32-bit integer from the stream.
func (r *Reader) ReadInt32() (rv int32, err error) {
	err = binary.Read(r.r, binary.BigEndian, &rv)
	return
}

// ReadUint16 reads a big-endian unsigned 16-bit integer from the stream.
func (r *Reader) ReadUint16() (rv uint16, err error) {
	err = binary.Read(r.r, binary.BigEndian, &rv)
	return
}

// ReadCharAsRune reads a two-byte Java char and returns it as a rune.
func (r *Reader) ReadCharAsRune() (rv rune, err error) {
	var char uint16
	err = binary.Read(r.r, binary.BigEndian, &char)
	rv = rune(char)
	return
}

// ReadUTF reads a length-prefixed string from the stream. The encoding is the
// alternate UTF form described here:
// https://docs.oracle.com/javase/7/docs/api/java/io/DataInput.html
//
// A 16-bit byte count comes first, followed by that many bytes in which every
// character takes one, two or three bytes. Errors from the underlying reader
// are returned unchanged; a byte sequence that is truncated, starts with a
// continuation byte (10xx xxxx) or with 1111 xxxx, or has a bad continuation
// byte yields ErrMalformedInput.
func (r *Reader) ReadUTF() (string, error) {
	utfLen, err := r.ReadUint16()
	if err != nil {
		return "", err
	}
	buf := make([]byte, utfLen)
	if _, err = io.ReadFull(r.r, buf); err != nil {
		return "", err
	}

	// Every character occupies at least one byte, so utfLen bounds the number
	// of runes. The cursor is an int: a 16-bit cursor would wrap around when
	// a multi-byte lead is the last byte of a 65535-byte string.
	runes := make([]rune, 0, len(buf))
	for i := 0; i < len(buf); {
		c := buf[i]
		switch c >> 4 {
		case 0, 1, 2, 3, 4, 5, 6, 7:
			/* 0xxxxxxx */
			runes = append(runes, rune(c))
			i++
		case 12, 13:
			/* 110x xxxx 10xx xxxx */
			if i+2 > len(buf) {
				return "", ErrMalformedInput
			}
			char2 := rune(buf[i+1])
			if (char2 & 0xC0) != 0x80 {
				return "", ErrMalformedInput
			}
			runes = append(runes, (rune(c)&0x1F)<<6|char2&0x3F)
			i += 2
		case 14:
			/* 1110 xxxx 10xx xxxx 10xx xxxx */
			if i+3 > len(buf) {
				return "", ErrMalformedInput
			}
			char2 := rune(buf[i+1])
			char3 := rune(buf[i+2])
			if ((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80) {
				return "", ErrMalformedInput
			}
			runes = append(runes, (rune(c)&0x0F)<<12|(char2&0x3F)<<6|(char3&0x3F))
			i += 3
		default:
			/* 10xx xxxx, 1111 xxxx */
			return "", ErrMalformedInput
		}
	}
	return string(runes), nil
}
|
249
analysis/lang/pl/stempel/javadata/input_test.go
Normal file
249
analysis/lang/pl/stempel/javadata/input_test.go
Normal file
|
@ -0,0 +1,249 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package javadata
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestReadBool(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
in []byte
|
||||
out bool
|
||||
err error
|
||||
}{
|
||||
{
|
||||
in: []byte{0},
|
||||
out: false,
|
||||
},
|
||||
{
|
||||
in: []byte{1},
|
||||
out: true,
|
||||
},
|
||||
{
|
||||
in: []byte{27},
|
||||
out: true,
|
||||
},
|
||||
{
|
||||
in: []byte{},
|
||||
err: io.EOF,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(string(test.in), func(t *testing.T) {
|
||||
sr := bytes.NewReader(test.in)
|
||||
dr := NewReader(sr)
|
||||
actual, err := dr.ReadBool()
|
||||
if err != test.err {
|
||||
t.Error(err)
|
||||
}
|
||||
if actual != test.out {
|
||||
t.Errorf("expected %t, got %t", test.out, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadUint16(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
in []byte
|
||||
out uint16
|
||||
err error
|
||||
}{
|
||||
{
|
||||
in: []byte{0, 0},
|
||||
out: 0,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 1},
|
||||
out: 1,
|
||||
},
|
||||
{
|
||||
in: []byte{1, 0},
|
||||
out: 256,
|
||||
},
|
||||
{
|
||||
in: []byte{},
|
||||
err: io.EOF,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(string(test.in), func(t *testing.T) {
|
||||
sr := bytes.NewReader(test.in)
|
||||
dr := NewReader(sr)
|
||||
actual, err := dr.ReadUint16()
|
||||
if err != test.err {
|
||||
t.Error(err)
|
||||
}
|
||||
if actual != test.out {
|
||||
t.Errorf("expected %d, got %d", test.out, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadInt32(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
in []byte
|
||||
out int32
|
||||
err error
|
||||
}{
|
||||
{
|
||||
in: []byte{0, 0, 0, 0},
|
||||
out: 0,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 0, 0, 1},
|
||||
out: 1,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 0, 1, 0},
|
||||
out: 256,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 1, 0, 0},
|
||||
out: 65536,
|
||||
},
|
||||
{
|
||||
in: []byte{},
|
||||
err: io.EOF,
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(string(test.in), func(t *testing.T) {
|
||||
sr := bytes.NewReader(test.in)
|
||||
dr := NewReader(sr)
|
||||
actual, err := dr.ReadInt32()
|
||||
if err != test.err {
|
||||
t.Error(err)
|
||||
}
|
||||
if actual != test.out {
|
||||
t.Errorf("expected %d, got %d", test.out, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadUTF(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
in []byte
|
||||
out string
|
||||
err error
|
||||
}{
|
||||
{
|
||||
in: []byte{0, 3, 'c', 'a', 't'},
|
||||
out: "cat",
|
||||
},
|
||||
{
|
||||
in: []byte{0, 2, 0xc2, 0xa3},
|
||||
out: "£",
|
||||
},
|
||||
{
|
||||
in: []byte{0, 3, 0xe3, 0x85, 0x85},
|
||||
out: "ㅅ",
|
||||
},
|
||||
{
|
||||
in: []byte{0, 6, 0xe3, 0x85, 0x85, 'c', 'a', 't'},
|
||||
out: "ㅅcat",
|
||||
},
|
||||
{
|
||||
in: []byte{},
|
||||
err: io.EOF,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 3},
|
||||
err: io.EOF,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 1, 0xc2},
|
||||
err: ErrMalformedInput,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 2, 0xc2, 0xc3},
|
||||
err: ErrMalformedInput,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 2, 0xe3, 0x85},
|
||||
err: ErrMalformedInput,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 3, 0xe3, 0xc5, 0x85},
|
||||
err: ErrMalformedInput,
|
||||
},
|
||||
{
|
||||
in: []byte{0, 1, 0xff},
|
||||
err: ErrMalformedInput,
|
||||
},
|
||||
{
|
||||
in: []byte{0x0, 0x05, 0x44, 0x61, 0x52, 0xc4, 0x87},
|
||||
out: "DaRć",
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(string(test.in), func(t *testing.T) {
|
||||
sr := bytes.NewReader(test.in)
|
||||
dr := NewReader(sr)
|
||||
actual, err := dr.ReadUTF()
|
||||
if err != test.err {
|
||||
t.Error(err)
|
||||
}
|
||||
if actual != test.out {
|
||||
t.Errorf("expected %s, got %s", test.out, actual)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// func TestFile(t *testing.T) {
|
||||
// f, err := os.Open("stemmer_20000.tbl")
|
||||
// if err != nil {
|
||||
// t.Fatal(err)
|
||||
// }
|
||||
// r := NewReader(f)
|
||||
// reversed, err := r.ReadBool()
|
||||
// if err != nil {
|
||||
// t.Fatal(err)
|
||||
// }
|
||||
// log.Printf("reversed: %t", reversed)
|
||||
// root, err := r.ReadInt32()
|
||||
// if err != nil {
|
||||
// t.Fatal(err)
|
||||
// }
|
||||
// log.Printf("root: %d", root)
|
||||
// n, err := r.ReadInt32()
|
||||
// if err != nil {
|
||||
// t.Fatal(err)
|
||||
// }
|
||||
// log.Printf("n is %d", n)
|
||||
// // for n > 0 {
|
||||
// // utf, err := r.ReadUTF()
|
||||
// // if err != nil {
|
||||
// // t.Error(err)
|
||||
// // }
|
||||
// // log.Printf("read: %s", utf)
|
||||
// // n--
|
||||
// // }
|
||||
// }
|
140
analysis/lang/pl/stempel/multi_trie.go
Normal file
140
analysis/lang/pl/stempel/multi_trie.go
Normal file
|
@ -0,0 +1,140 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/stempel/javadata"
|
||||
)
|
||||
|
||||
// multiTrie represents a trie of tries. When using the multiTrie, each trie
|
||||
// is consulted consecutively to find commands to perform on the input. Thus
|
||||
// a multiTrie with seven tries might have up to seven groups of commands to
|
||||
// perform on the input.
|
||||
type multiTrie struct {
|
||||
tries []*trie
|
||||
by int32
|
||||
forward bool
|
||||
}
|
||||
|
||||
func newMultiTrie(r *javadata.Reader) (rv *multiTrie, err error) {
|
||||
rv = &multiTrie{}
|
||||
rv.forward, err = r.ReadBool()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv.by, err = r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
nTries, err := r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for nTries > 0 {
|
||||
trie, err := newTrie(r)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
rv.tries = append(rv.tries, trie)
|
||||
nTries--
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
const eom = rune('*')
|
||||
|
||||
func (t *multiTrie) GetLastOnPath(key []rune) []rune {
|
||||
var rv []rune
|
||||
lastKey := key
|
||||
p := make([][]rune, len(t.tries))
|
||||
lastR := ' '
|
||||
for i := 0; i < len(t.tries); i++ {
|
||||
r := t.tries[i].GetLastOnPath(lastKey)
|
||||
if len(r) == 0 || len(r) == 1 && r[0] == eom {
|
||||
return rv
|
||||
}
|
||||
if cannotFollow(lastR, r[0]) {
|
||||
return rv
|
||||
}
|
||||
lastR = r[len(r)-2]
|
||||
p[i] = r
|
||||
if p[i][0] == '-' {
|
||||
if i > 0 {
|
||||
var err error
|
||||
key, err = t.skip(key, lengthPP(p[i-1]))
|
||||
if err != nil {
|
||||
return rv
|
||||
}
|
||||
}
|
||||
var err error
|
||||
key, err = t.skip(key, lengthPP(p[i]))
|
||||
if err != nil {
|
||||
return rv
|
||||
}
|
||||
}
|
||||
rv = append(rv, r...)
|
||||
if len(key) != 0 {
|
||||
lastKey = key
|
||||
}
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
// cannotFollow reports whether rune goes is forbidden directly after rune
// after. Only two combinations are rejected: '-' after '-' and 'D' after 'D'.
func cannotFollow(after, goes rune) bool {
	if after != '-' && after != 'D' {
		return false
	}
	return goes == after
}
|
||||
|
||||
var errIndexOutOfBounds = fmt.Errorf("index out of bounds")
|
||||
|
||||
func (t *multiTrie) skip(in []rune, count int) ([]rune, error) {
|
||||
if count > len(in) {
|
||||
return nil, errIndexOutOfBounds
|
||||
}
|
||||
if t.forward {
|
||||
return in[count:], nil
|
||||
}
|
||||
return in[0 : len(in)-count], nil
|
||||
}
|
||||
|
||||
// lengthPP computes the length that the command cmd accounts for; the caller
// uses it as the number of runes to trim from the key.
//
// cmd is read as a sequence of (opcode, argument) pairs:
//   - '-' and 'D' add argument-'a'+1,
//   - 'R' adds one,
//   - 'I' and any other opcode add nothing.
//
// The argument is stepped over whatever the opcode is, so it can never be
// mistaken for an opcode itself (for example an inserted '-' or 'R'). A '-'
// or 'D' that is missing its argument ends the scan.
func lengthPP(cmd []rune) int {
	rv := 0
	for i := 0; i < len(cmd); i += 2 {
		switch cmd[i] {
		case '-', 'D':
			if i+1 >= len(cmd) {
				return rv
			}
			rv += int(cmd[i+1] - rune('a') + 1)
		case 'R':
			rv++
		}
	}
	return rv
}
|
||||
|
||||
func (t *multiTrie) String() string {
|
||||
rv := ""
|
||||
for i, trie := range t.tries {
|
||||
rv += fmt.Sprintf("trie %d\n\n %v\n--------\n", i, trie)
|
||||
}
|
||||
return rv
|
||||
}
|
BIN
analysis/lang/pl/stempel/pl/pl_PL.dic.gz
Normal file
BIN
analysis/lang/pl/stempel/pl/pl_PL.dic.gz
Normal file
Binary file not shown.
BIN
analysis/lang/pl/stempel/pl/stemmer_20000.tbl
Normal file
BIN
analysis/lang/pl/stempel/pl/stemmer_20000.tbl
Normal file
Binary file not shown.
80
analysis/lang/pl/stempel/row.go
Normal file
80
analysis/lang/pl/stempel/row.go
Normal file
|
@ -0,0 +1,80 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/stempel/javadata"
|
||||
)
|
||||
|
||||
type row struct {
|
||||
cells map[rune]*cell
|
||||
}
|
||||
|
||||
func (r *row) String() string {
|
||||
rv := ""
|
||||
for k, v := range r.cells {
|
||||
rv += fmt.Sprintf("[%s:%v]\n", string(k), v)
|
||||
}
|
||||
return rv
|
||||
}
|
||||
|
||||
func newRow(r *javadata.Reader) (*row, error) {
|
||||
rv := &row{
|
||||
cells: make(map[rune]*cell),
|
||||
}
|
||||
|
||||
nCells, err := r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading num cells: %v", err)
|
||||
}
|
||||
|
||||
for nCells > 0 {
|
||||
|
||||
c, err := r.ReadCharAsRune()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading cell char: %v", err)
|
||||
}
|
||||
cell, err := newCell(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading cell: %v", err)
|
||||
}
|
||||
|
||||
rv.cells[c] = cell
|
||||
nCells--
|
||||
}
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (r *row) getCmd(way rune) int32 {
|
||||
c := r.at(way)
|
||||
if c != nil {
|
||||
return c.cmd
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func (r *row) getRef(way rune) int32 {
|
||||
c := r.at(way)
|
||||
if c != nil {
|
||||
return c.ref
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
func (r *row) at(c rune) *cell {
|
||||
return r.cells[c]
|
||||
}
|
48
analysis/lang/pl/stempel/strenum.go
Normal file
48
analysis/lang/pl/stempel/strenum.go
Normal file
|
@ -0,0 +1,48 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"io"
|
||||
)
|
||||
|
||||
// strEnum walks a rune slice one rune at a time, either front to back or
// back to front.
type strEnum struct {
	r        []rune // runes being enumerated
	from, by int    // index of the next rune, and the step applied after it
}

// newStrEnum returns an enumerator over s that starts at the first rune and
// moves forward when up is true, and starts at the last rune and moves
// backward otherwise.
func newStrEnum(s []rune, up bool) *strEnum {
	if up {
		return &strEnum{r: s, from: 0, by: 1}
	}
	return &strEnum{r: s, from: len(s) - 1, by: -1}
}

// next returns the current rune and advances the enumerator. Once the
// position has left the slice it returns io.EOF.
func (s *strEnum) next() (rune, error) {
	pos := s.from
	if pos >= 0 && pos < len(s.r) {
		s.from = pos + s.by
		return s.r[pos], nil
	}
	return 0, io.EOF
}
|
61
analysis/lang/pl/stempel/strenum_test.go
Normal file
61
analysis/lang/pl/stempel/strenum_test.go
Normal file
|
@ -0,0 +1,61 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"reflect"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestStrenumNext(t *testing.T) {
|
||||
|
||||
tests := []struct {
|
||||
in []rune
|
||||
up bool
|
||||
expect []rune
|
||||
}{
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
up: true,
|
||||
expect: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
},
|
||||
{
|
||||
in: []rune{'h', 'e', 'l', 'l', 'o'},
|
||||
up: false,
|
||||
expect: []rune{'o', 'l', 'l', 'e', 'h'},
|
||||
},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
t.Run(fmt.Sprintf("%s-up-%t", string(test.in), test.up), func(t *testing.T) {
|
||||
strenum := newStrEnum(test.in, test.up)
|
||||
var got []rune
|
||||
next, err := strenum.next()
|
||||
for err == nil {
|
||||
got = append(got, next)
|
||||
next, err = strenum.next()
|
||||
}
|
||||
if err != io.EOF {
|
||||
t.Errorf("next got err: %v", err)
|
||||
}
|
||||
if !reflect.DeepEqual(got, test.expect) {
|
||||
t.Errorf("expected %v, got %v", test.expect, got)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
}
|
132
analysis/lang/pl/stempel/trie.go
Normal file
132
analysis/lang/pl/stempel/trie.go
Normal file
|
@ -0,0 +1,132 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package stempel
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/blevesearch/stempel/javadata"
|
||||
)
|
||||
|
||||
// trie represents the internal trie structure
|
||||
type trie struct {
|
||||
rows []*row
|
||||
cmds []string
|
||||
root int32
|
||||
forward bool
|
||||
}
|
||||
|
||||
func newTrie(r *javadata.Reader) (rv *trie, err error) {
|
||||
rv = &trie{}
|
||||
rv.forward, err = r.ReadBool()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading trie forward: %v", err)
|
||||
}
|
||||
rv.root, err = r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading trie root: %v", err)
|
||||
}
|
||||
|
||||
// commands
|
||||
nCommands, err := r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading trie num commands: %v", err)
|
||||
}
|
||||
for nCommands > 0 {
|
||||
utfCommand, nerr := r.ReadUTF()
|
||||
if nerr != nil {
|
||||
return nil, fmt.Errorf("error reading trie command utf: %v", nerr)
|
||||
}
|
||||
rv.cmds = append(rv.cmds, utfCommand)
|
||||
nCommands--
|
||||
}
|
||||
|
||||
// rows
|
||||
nRows, err := r.ReadInt32()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading trie num rows: %v", err)
|
||||
}
|
||||
for nRows > 0 {
|
||||
row, err := newRow(r)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error reading trie row: %v", err)
|
||||
}
|
||||
rv.rows = append(rv.rows, row)
|
||||
nRows--
|
||||
}
|
||||
|
||||
return rv, nil
|
||||
}
|
||||
|
||||
func (t *trie) getRow(i int) *row {
|
||||
if i < 0 || i >= len(t.rows) {
|
||||
return nil
|
||||
}
|
||||
return t.rows[i]
|
||||
}
|
||||
|
||||
func (t *trie) GetLastOnPath(key []rune) []rune {
|
||||
now := t.getRow(int(t.root))
|
||||
var last []rune
|
||||
var w int32
|
||||
e := newStrEnum(key, t.forward)
|
||||
|
||||
// walk over each rune
|
||||
// if rune has row in the table, note the cmd (as last)
|
||||
// if rune has row in table, see if it transitions to another row
|
||||
// if it does, move to that row and next char on next loop itr
|
||||
// if it does not, return the last cmd
|
||||
// if you get to end of string and there is command in row use it
|
||||
// or return last
|
||||
for i := 0; i < len(key)-1; i++ {
|
||||
r, err := e.next()
|
||||
if err != nil {
|
||||
return last
|
||||
}
|
||||
w = now.getCmd(r)
|
||||
if w >= 0 {
|
||||
last = []rune(t.cmds[w])
|
||||
}
|
||||
w = now.getRef(r)
|
||||
if w >= 0 {
|
||||
now = t.getRow(int(w))
|
||||
} else {
|
||||
return last
|
||||
}
|
||||
}
|
||||
r, err := e.next()
|
||||
if err != nil {
|
||||
return last
|
||||
}
|
||||
w = now.getCmd(r)
|
||||
if err != nil {
|
||||
return last
|
||||
}
|
||||
if w >= 0 {
|
||||
return []rune(t.cmds[w])
|
||||
}
|
||||
return last
|
||||
}
|
||||
|
||||
func (t *trie) String() string {
|
||||
rv := ""
|
||||
for _, cmd := range t.cmds {
|
||||
rv += fmt.Sprintf("cmd: %s\n", string(cmd))
|
||||
}
|
||||
for _, row := range t.rows {
|
||||
rv += fmt.Sprintf("row: %v\n", row)
|
||||
}
|
||||
return rv
|
||||
}
|
36
analysis/lang/pl/stop_filter_pl.go
Normal file
36
analysis/lang/pl/stop_filter_pl.go
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright (c) 2018 Couchbase, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package pl
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/stop"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
|
||||
tokenMap, err := cache.TokenMapNamed(StopName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return stop.NewStopTokensFilter(tokenMap), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
368
analysis/lang/pl/stop_words_pl.go
Normal file
368
analysis/lang/pl/stop_words_pl.go
Normal file
|
@ -0,0 +1,368 @@
|
|||
package pl
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// StopName is the registry name used both to look up the Polish stop-word
// token map and to register the stop token filter built from it.
const StopName = "stop_pl"
|
||||
|
||||
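// PolishStopWords holds the Polish stop word list in snowball format: each
// stop word is at the start of a line and comments begin with a vertical bar.
//
// NOTE(review): this comment previously stated that the content was obtained from:
// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
// but the header embedded in the list itself attributes it to
// https://github.com/stopwords-iso/stopwords-pl (MIT License) — confirm the
// provenance.
// ` was changed to ' to allow for literal string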
|
||||
|
||||
var PolishStopWords = []byte(` | From https://github.com/stopwords-iso/stopwords-pl/tree/master
|
||||
| The MIT License (MIT)
|
||||
| See https://github.com/stopwords-iso/stopwords-pl/blob/master/LICENSE
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
| - english text is auto-translate
|
||||
|
|
||||
| NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
|
||||
| a polish stop word list. comments begin with vertical bar. each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
a | and
|
||||
aby | to
|
||||
ach | ah
|
||||
acz | although
|
||||
aczkolwiek | although
|
||||
aj | ay
|
||||
albo | or
|
||||
ale | but
|
||||
ależ | but
|
||||
ani | or
|
||||
aż | until
|
||||
bardziej | more
|
||||
bardzo | very
|
||||
bez | without
|
||||
bo | because
|
||||
bowiem | because
|
||||
by | by
|
||||
byli | were
|
||||
bym | i would
|
||||
bynajmniej | not at all
|
||||
być | to be
|
||||
był | was
|
||||
była | was
|
||||
było | was
|
||||
były | were
|
||||
będzie | will be
|
||||
będą | they will
|
||||
cali | inches
|
||||
cała | whole
|
||||
cały | whole
|
||||
chce | i want
|
||||
choć | though
|
||||
ci | you
|
||||
ciebie | you
|
||||
cię | you
|
||||
co | what
|
||||
cokolwiek | whatever
|
||||
coraz | getting
|
||||
coś | something
|
||||
czasami | sometimes
|
||||
czasem | sometimes
|
||||
czemu | why
|
||||
czy | whether
|
||||
czyli | that is
|
||||
często | often
|
||||
daleko | far
|
||||
dla | for
|
||||
dlaczego | why
|
||||
dlatego | which is why
|
||||
do | down
|
||||
dobrze | all right
|
||||
dokąd | where
|
||||
dość | enough
|
||||
dr | dr
|
||||
dużo | a lot
|
||||
dwa | two
|
||||
dwaj | two
|
||||
dwie | two
|
||||
dwoje | two
|
||||
dzisiaj | today
|
||||
dziś | today
|
||||
gdy | when
|
||||
gdyby | if
|
||||
gdyż | because
|
||||
gdzie | where
|
||||
gdziekolwiek | wherever
|
||||
gdzieś | somewhere
|
||||
go | him
|
||||
godz | time
|
||||
hab | hab
|
||||
i | and
|
||||
ich | their
|
||||
ii | ii
|
||||
iii | iii
|
||||
ile | how much
|
||||
im | them
|
||||
inna | different
|
||||
inne | other
|
||||
inny | other
|
||||
innych | other
|
||||
inż | eng
|
||||
iv | iv
|
||||
ix | ix
|
||||
iż | that
|
||||
ja | i
|
||||
jak | how
|
||||
jakaś | some
|
||||
jakby | as if
|
||||
jaki | what
|
||||
jakichś | some
|
||||
jakie | what
|
||||
jakiś | some
|
||||
jakiż | what
|
||||
jakkolwiek | however
|
||||
jako | as
|
||||
jakoś | somehow
|
||||
je | them
|
||||
jeden | one
|
||||
jedna | one
|
||||
jednak | but
|
||||
jednakże | however
|
||||
jedno | one
|
||||
jednym | one
|
||||
jedynie | only
|
||||
jego | his
|
||||
jej | her
|
||||
jemu | him
|
||||
jest | is
|
||||
jestem | i am
|
||||
jeszcze | still
|
||||
jeśli | if
|
||||
jeżeli | if
|
||||
już | already
|
||||
ją | i
|
||||
każdy | everyone
|
||||
kiedy | when
|
||||
kierunku | direction
|
||||
kilka | several
|
||||
kilku | several
|
||||
kimś | someone
|
||||
kto | who
|
||||
ktokolwiek | anyone
|
||||
ktoś | someone
|
||||
która | which
|
||||
które | which
|
||||
którego | whose
|
||||
której | which
|
||||
który | which
|
||||
których | which
|
||||
którym | which
|
||||
którzy | who
|
||||
ku | to
|
||||
lat | years
|
||||
lecz | but
|
||||
lub | or
|
||||
ma | has
|
||||
mają | may
|
||||
mam | i have
|
||||
mamy | we have
|
||||
mało | little
|
||||
mgr | msc
|
||||
mi | to me
|
||||
miał | had
|
||||
mimo | despite
|
||||
między | between
|
||||
mnie | me
|
||||
mną | me
|
||||
mogą | they can
|
||||
moi | my
|
||||
moim | my
|
||||
moja | my
|
||||
moje | my
|
||||
może | maybe
|
||||
możliwe | that's possible
|
||||
można | you can
|
||||
mu | him
|
||||
musi | has to
|
||||
my | we
|
||||
mój | my
|
||||
na | on
|
||||
nad | above
|
||||
nam | u.s
|
||||
nami | us
|
||||
nas | us
|
||||
nasi | our
|
||||
nasz | our
|
||||
nasza | our
|
||||
nasze | our
|
||||
naszego | our
|
||||
naszych | ours
|
||||
natomiast | whereas
|
||||
natychmiast | immediately
|
||||
nawet | even
|
||||
nic | nothing
|
||||
nich | them
|
||||
nie | no
|
||||
niech | let
|
||||
niego | him
|
||||
niej | her
|
||||
niemu | not him
|
||||
nigdy | never
|
||||
nim | him
|
||||
nimi | them
|
||||
nią | her
|
||||
niż | than
|
||||
no | yeah
|
||||
nowe | new
|
||||
np | e.g.
|
||||
nr | no
|
||||
o | about
|
||||
o.o. | o.o.
|
||||
obok | near
|
||||
od | from
|
||||
ok | approx
|
||||
około | about
|
||||
on | he
|
||||
ona | she
|
||||
one | they
|
||||
oni | they
|
||||
ono | it
|
||||
oraz | and
|
||||
oto | here
|
||||
owszem | yes
|
||||
pan | mr
|
||||
pana | mr
|
||||
pani | you
|
||||
pl | pl
|
||||
po | after
|
||||
pod | under
|
||||
podczas | while
|
||||
pomimo | despite
|
||||
ponad | above
|
||||
ponieważ | because
|
||||
powinien | should
|
||||
powinna | she should
|
||||
powinni | they should
|
||||
powinno | should
|
||||
poza | apart from
|
||||
prawie | almost
|
||||
prof | prof
|
||||
przecież | yet
|
||||
przed | before
|
||||
przede | above
|
||||
przedtem | before
|
||||
przez | by
|
||||
przy | by
|
||||
raz | once
|
||||
razie | case
|
||||
roku | year
|
||||
również | also
|
||||
sam | alone
|
||||
sama | alone
|
||||
się | myself
|
||||
skąd | from where
|
||||
sobie | myself
|
||||
sobą | myself
|
||||
sposób | way
|
||||
swoje | own
|
||||
są | are
|
||||
ta | this
|
||||
tak | yes
|
||||
taka | such
|
||||
taki | such
|
||||
takich | such
|
||||
takie | such
|
||||
także | too
|
||||
tam | over there
|
||||
te | these
|
||||
tego | this
|
||||
tej | this one
|
||||
tel | phone
|
||||
temu | ago
|
||||
ten | this
|
||||
teraz | now
|
||||
też | too
|
||||
to | this
|
||||
tobie | you
|
||||
tobą | you
|
||||
toteż | this as well
|
||||
totobą | you
|
||||
trzeba | it's necessary to
|
||||
tu | here
|
||||
tutaj | here
|
||||
twoi | yours
|
||||
twoim | yours
|
||||
twoja | your
|
||||
twoje | your
|
||||
twym | your
|
||||
twój | your
|
||||
ty | you
|
||||
tych | these
|
||||
tylko | just
|
||||
tym | this
|
||||
tys | thousand
|
||||
tzw | so-called
|
||||
tę | these
|
||||
u | at
|
||||
ul | st
|
||||
vi | vi
|
||||
vii | vii
|
||||
viii | viii
|
||||
vol | vol
|
||||
w | in
|
||||
wam | you
|
||||
wami | you
|
||||
was | mustache
|
||||
wasi | yours
|
||||
wasz | yours
|
||||
wasza | yours
|
||||
wasze | yours
|
||||
we | in
|
||||
według | according to
|
||||
wie | knows
|
||||
wiele | many
|
||||
wielu | many
|
||||
więc | so
|
||||
więcej | more
|
||||
wszyscy | all
|
||||
wszystkich | everyone
|
||||
wszystkie | all
|
||||
wszystkim | everyone
|
||||
wszystko | all
|
||||
wtedy | then
|
||||
www | www
|
||||
wy | you
|
||||
właśnie | exactly
|
||||
wśród | among
|
||||
xi | x.x
|
||||
xii | xii
|
||||
xiii | xii
|
||||
xiv | xiv
|
||||
xv | xv
|
||||
z | with
|
||||
za | behind
|
||||
zapewne | probably
|
||||
zawsze | always
|
||||
zaś | and
|
||||
ze | that
|
||||
zeznowu | testify
|
||||
znowu | again
|
||||
znów | again
|
||||
został | left
|
||||
zł | zloty
|
||||
żaden | no
|
||||
żadna | none
|
||||
żadne | none
|
||||
żadnych | none
|
||||
że | that
|
||||
żeby | to
|
||||
|
||||
`)
|
||||
|
||||
func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
|
||||
rv := analysis.NewTokenMap()
|
||||
err := rv.LoadBytes(PolishStopWords)
|
||||
return rv, err
|
||||
}
|
||||
|
||||
func init() {
|
||||
err := registry.RegisterTokenMap(StopName, TokenMapConstructor)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue