Adding upstream version 0.28.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-22 10:57:38 +02:00 · 2025-05-22 10:57:38 +02:00 · e28c88ef14
commit e28c88ef14
parent 88f1d47ab6
933 changed files with 194711 additions and 0 deletions
--- a/tools/tokenizer/tokenizer.go
+++ b/tools/tokenizer/tokenizer.go
@ -0,0 +1,221 @@
+// Package tokenizer implements a rudimentary token parser for a buffered
+// io.Reader that respects quote and parenthesis boundaries.
+//
+// Example
+//
+//	tk := tokenizer.NewFromString("a, b, (c, d)")
+//	result, _ := tk.ScanAll() // ["a", "b", "(c, d)"]
+package tokenizer
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// eof represents a marker rune for the end of the reader.
+//
+// Note that read() also returns this value on any read error, so a literal
+// NUL character in the input is indistinguishable from the end of the input.
+const eof = rune(0)
+
+// DefaultSeparators is a list with the default token separator characters.
+var DefaultSeparators = []rune{','}
+
+// whitespaceChars lists the runes that are trimmed from both ends of each
+// scanned token, unless a rune is itself registered as a separator
+// (see rebuildTrimCutset). 0x85 is U+0085 (NEL) and 0xA0 is U+00A0 (NBSP).
+var whitespaceChars = []rune{'\t', '\n', '\v', '\f', '\r', ' ', 0x85, 0xA0}
+
+// NewFromString returns a Tokenizer (with DefaultSeparators) that reads
+// its tokens from the provided string.
+func NewFromString(str string) *Tokenizer {
+	src := strings.NewReader(str)
+
+	return New(src)
+}
+
+// NewFromBytes returns a Tokenizer (with DefaultSeparators) that reads
+// its tokens from the provided bytes slice.
+func NewFromBytes(b []byte) *Tokenizer {
+	src := bytes.NewReader(b)
+
+	return New(src)
+}
+
+// New wraps the provided reader in a buffered reader and returns a
+// Tokenizer for it, configured with DefaultSeparators.
+func New(r io.Reader) *Tokenizer {
+	tk := new(Tokenizer)
+	tk.r = bufio.NewReader(r)
+
+	// also initializes the trim cutset
+	tk.Separators(DefaultSeparators...)
+
+	return tk
+}
+
+// Tokenizer defines a struct that parses a reader into tokens while
+// respecting quotes and parenthesis boundaries.
+type Tokenizer struct {
+	r *bufio.Reader // buffered source the runes are read from
+
+	trimCutset        string // characters trimmed from both ends of each token
+	separators        []rune // runes that terminate a token
+	keepSeparator     bool   // keep the terminating separator as part of the token
+	keepEmptyTokens   bool   // return empty tokens from Scan instead of skipping them
+	ignoreParenthesis bool   // treat '(' and ')' as regular characters
+}
+
+// Separators replaces the separator runes of the current Tokenizer and
+// rebuilds the trim cutset so that whitespace separators are not trimmed
+// from the scanned tokens.
+//
+// The provided runes are copied, so later changes to the caller's slice
+// (e.g. DefaultSeparators) cannot desynchronize the stored separators from
+// the already computed trim cutset.
+func (t *Tokenizer) Separators(separators ...rune) {
+	// appending to a nil slice keeps "no separators" as nil while detaching
+	// the stored slice from the caller's backing array
+	t.separators = append([]rune(nil), separators...)
+
+	t.rebuildTrimCutset()
+}
+
+// KeepSeparator defines whether to keep the separator rune as part
+// of the token (default to false).
+//
+// When enabled, the separator that terminated a token is appended to the
+// end of that token instead of being dropped.
+func (t *Tokenizer) KeepSeparator(state bool) {
+	t.keepSeparator = state
+}
+
+// KeepEmptyTokens defines whether to keep empty tokens on Scan() (default to false).
+//
+// A token is considered empty when it is the empty string after trimming.
+func (t *Tokenizer) KeepEmptyTokens(state bool) {
+	t.keepEmptyTokens = state
+}
+
+// IgnoreParenthesis defines whether to ignore the parenthesis boundaries
+// and to treat the '(' and ')' as regular characters.
+//
+// When enabled, separators located inside parentheses split the tokens
+// and unbalanced parentheses are not reported as an error.
+func (t *Tokenizer) IgnoreParenthesis(state bool) {
+	t.ignoreParenthesis = state
+}
+
+// Scan reads and returns the next available token from the Tokenizer's buffer (trimmed!).
+//
+// Empty tokens are skipped if t.keepEmptyTokens is not set (which is the default).
+// The skipping is done iteratively, so a long run of empty tokens
+// (e.g. many consecutive separators) doesn't grow the call stack.
+//
+// Returns [io.EOF] error when there are no more tokens to scan.
+func (t *Tokenizer) Scan() (string, error) {
+	for {
+		// peek at the next rune to detect the end of the input
+		if t.read() == eof {
+			return "", io.EOF
+		}
+		if err := t.unread(); err != nil {
+			return "", err
+		}
+
+		token, err := t.readToken()
+		if err != nil {
+			return "", err
+		}
+
+		if token != "" || t.keepEmptyTokens {
+			return token, nil
+		}
+	}
+}
+
+// ScanAll drains the Tokenizer's buffer by calling Scan until it reports
+// [io.EOF] and returns all collected tokens (a non-nil, possibly empty slice).
+//
+// Any other Scan error aborts the scanning and is returned with nil tokens.
+func (t *Tokenizer) ScanAll() ([]string, error) {
+	result := make([]string, 0)
+
+	for {
+		tok, err := t.Scan()
+
+		switch {
+		case err == nil:
+			result = append(result, tok)
+		case err == io.EOF:
+			return result, nil
+		default:
+			return nil, err
+		}
+	}
+}
+
+// readToken reads a single token from the buffer and returns it trimmed
+// with the tokenizer's trimCutset.
+//
+// Reading stops at the end of the input or at the first separator rune
+// located outside of any quoted expression and (unless ignoreParenthesis
+// is set) outside of any parenthesis group.
+//
+// A backslash escapes the rune that follows it, so an escaped quote or
+// parenthesis doesn't open or close a boundary. An escaped backslash (`\\`)
+// is a literal backslash and does NOT escape the rune after it.
+//
+// Returns an error if the input ends inside an unclosed parenthesis group
+// or quoted expression.
+func (t *Tokenizer) readToken() (string, error) {
+	var buf strings.Builder
+	var parenthesis int
+	var quoteCh rune
+	var escaped bool // true when the current rune follows an unescaped backslash
+
+	for {
+		ch := t.read()
+
+		if ch == eof {
+			break
+		}
+
+		switch {
+		case escaped:
+			escaped = false // escaped runes never act as boundaries
+		case t.isEscapeRune(ch):
+			escaped = true
+		case !t.ignoreParenthesis && ch == '(' && quoteCh == eof:
+			parenthesis++ // opening parenthesis
+		case !t.ignoreParenthesis && ch == ')' && parenthesis > 0 && quoteCh == eof:
+			parenthesis-- // closing parenthesis
+		case t.isQuoteRune(ch):
+			switch quoteCh {
+			case ch:
+				quoteCh = eof // closing quote
+			case eof:
+				quoteCh = ch // opening quote
+			}
+		}
+
+		// separators are matched regardless of a preceding backslash
+		if t.isSeperatorRune(ch) && parenthesis == 0 && quoteCh == eof {
+			if t.keepSeparator {
+				buf.WriteRune(ch)
+			}
+			break
+		}
+
+		buf.WriteRune(ch)
+	}
+
+	if parenthesis > 0 || quoteCh != eof {
+		return "", fmt.Errorf("unbalanced parenthesis or quoted expression: %q", buf.String())
+	}
+
+	return strings.Trim(buf.String(), t.trimCutset), nil
+}
+
+// read returns the next rune from the buffered reader.
+// On any read failure (including `io.EOF`) the `rune(0)` marker is returned.
+func (t *Tokenizer) read() rune {
+	if ch, _, err := t.r.ReadRune(); err == nil {
+		return ch
+	}
+
+	return eof
+}
+
+// unread places the previously read rune back on the reader.
+//
+// It returns the error reported by the underlying UnreadRune call.
+func (t *Tokenizer) unread() error {
+	return t.r.UnreadRune()
+}
+
+// rebuildTrimCutset recomputes t.trimCutset as the whitespace characters
+// that are not registered as separator runes.
+func (t *Tokenizer) rebuildTrimCutset() {
+	var sb strings.Builder
+
+	for _, ws := range whitespaceChars {
+		// whitespace separators must survive the token trimming
+		if !t.isSeperatorRune(ws) {
+			sb.WriteRune(ws)
+		}
+	}
+
+	t.trimCutset = sb.String()
+}
+
+// isSeperatorRune reports whether ch is one of the tokenizer's separator runes.
+func (t *Tokenizer) isSeperatorRune(ch rune) bool {
+	for i := range t.separators {
+		if t.separators[i] == ch {
+			return true
+		}
+	}
+
+	return false
+}
+
+// isQuoteRune reports whether ch is a single quote, double quote or backtick.
+func (t *Tokenizer) isQuoteRune(ch rune) bool {
+	switch ch {
+	case '\'', '"', '`':
+		return true
+	default:
+		return false
+	}
+}
+
+// isEscapeRune checks if a rune is an escape character.
+func (t *Tokenizer) isEscapeRune(ch rune) bool {
+	return ch == '\\'
+}
--- a/tools/tokenizer/tokenizer_test.go
+++ b/tools/tokenizer/tokenizer_test.go
@ -0,0 +1,303 @@
+package tokenizer
+
+import (
+	"io"
+	"strings"
+	"testing"
+)
+
+// TestFactories checks that every factory wraps the provided content and
+// initializes the Tokenizer with the default options and DefaultSeparators.
+func TestFactories(t *testing.T) {
+	expectedContent := "test"
+
+	scenarios := []struct {
+		name string
+		tk   *Tokenizer
+	}{
+		{
+			"New()",
+			New(strings.NewReader(expectedContent)),
+		},
+		{
+			"NewFromString()",
+			NewFromString(expectedContent),
+		},
+		{
+			"NewFromBytes()",
+			NewFromBytes([]byte(expectedContent)),
+		},
+	}
+
+	for _, s := range scenarios {
+		t.Run(s.name, func(t *testing.T) {
+			// the content has no NUL byte, so this reads everything until EOF
+			// (the resulting io.EOF error is expected and ignored)
+			content, _ := s.tk.r.ReadString(0)
+
+			if content != expectedContent {
+				t.Fatalf("Expected reader with content %q, got %q", expectedContent, content)
+			}
+
+			if s.tk.keepSeparator != false {
+				t.Fatal("Expected keepSeparator false, got true")
+			}
+
+			if s.tk.ignoreParenthesis != false {
+				t.Fatal("Expected ignoreParenthesis false, got true")
+			}
+
+			if len(s.tk.separators) != len(DefaultSeparators) {
+				t.Fatalf("Expected \n%v, \ngot \n%v", DefaultSeparators, s.tk.separators)
+			}
+
+			// every tokenizer separator must be one of the defaults
+			for _, r := range s.tk.separators {
+				exists := false
+				for _, def := range DefaultSeparators {
+					if r == def {
+						exists = true
+						break
+					}
+				}
+				if !exists {
+					t.Fatalf("Unexpected separator %s", string(r))
+				}
+			}
+		})
+	}
+}
+
+// TestScan checks that consecutive Scan calls return the tokens in order
+// and that an empty token with io.EOF is returned once the input is exhausted.
+func TestScan(t *testing.T) {
+	tk := NewFromString("abc, 123.456, (abc)")
+
+	expectedTokens := []string{"abc", "123.456", "(abc)"}
+
+	for _, token := range expectedTokens {
+		result, err := tk.Scan()
+		if err != nil {
+			t.Fatalf("Expected token %q, got error %v", token, err)
+		}
+
+		if result != token {
+			t.Fatalf("Expected token %q, got %q", token, result)
+		}
+	}
+
+	// no more tokens should be available
+	token, err := tk.Scan()
+	if err != io.EOF {
+		t.Fatalf("Expected EOF error, got %v", err)
+	}
+	if token != "" {
+		t.Fatalf("Expected empty token, got %q", token)
+	}
+}
+
+// TestScanAll runs ScanAll against various tokenizer configurations and
+// checks both the error state and the exact ordered list of returned tokens.
+func TestScanAll(t *testing.T) {
+	scenarios := []struct {
+		name              string
+		content           string
+		separators        []rune
+		keepSeparator     bool
+		keepEmptyTokens   bool
+		ignoreParenthesis bool
+		expectError       bool
+		expectTokens      []string
+	}{
+		{
+			name:              "empty string",
+			content:           "",
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			keepEmptyTokens:   false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens:      nil,
+		},
+		{
+			name:              "unbalanced parenthesis",
+			content:           `(a,b() c`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			keepEmptyTokens:   false,
+			ignoreParenthesis: false,
+			expectError:       true,
+			expectTokens:      []string{},
+		},
+		{
+			name:              "unmatching quotes",
+			content:           `'asd"`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			keepEmptyTokens:   false,
+			ignoreParenthesis: false,
+			expectError:       true,
+			expectTokens:      []string{},
+		},
+		{
+			name:              "no separators",
+			content:           `a, b, c, d, e 123, "abc"`,
+			separators:        nil,
+			keepSeparator:     false,
+			keepEmptyTokens:   false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens:      []string{`a, b, c, d, e 123, "abc"`},
+		},
+		{
+			name: "default separators",
+			content: `a, b , c  , d e  , "a,b,  c  " , ,, ,	  (123, 456)
+			`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			keepEmptyTokens:   false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
+				"a",
+				"b",
+				"c",
+				"d e",
+				`"a,b,  c  "`,
+				`(123, 456)`,
+			},
+		},
+		{
+			name:              "keep separators",
+			content:           `a, b, c, d  e, "a,b,  c  ",	(123, 456)`,
+			separators:        []rune{',', ' '}, // the space should be removed from the cutset
+			keepSeparator:     true,
+			keepEmptyTokens:   true,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
+				"a,",
+				" ",
+				"b,",
+				" ",
+				"c,",
+				" ",
+				"d ",
+				" ",
+				"e,",
+				" ",
+				`"a,b,  c  ",`,
+				`(123, 456)`,
+			},
+		},
+		{
+			name:              "custom separators",
+			content:           `a | b c  d &(e + f) &  "g & h" & & &`,
+			separators:        []rune{'|', '&'},
+			keepSeparator:     false,
+			keepEmptyTokens:   false,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
+				"a",
+				"b c  d",
+				"(e + f)",
+				`"g & h"`,
+			},
+		},
+		{
+			name:              "ignoring parenthesis",
+			content:           `a, b, (c,d)`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			keepEmptyTokens:   false,
+			ignoreParenthesis: true,
+			expectError:       false,
+			expectTokens: []string{
+				"a",
+				"b",
+				"(c",
+				"d)",
+			},
+		},
+		{
+			name:              "keep empty tokens",
+			content:           `a, b, (c, d), ,, , e, , f`,
+			separators:        DefaultSeparators,
+			keepSeparator:     false,
+			keepEmptyTokens:   true,
+			ignoreParenthesis: false,
+			expectError:       false,
+			expectTokens: []string{
+				"a",
+				"b",
+				"(c, d)",
+				"",
+				"",
+				"",
+				"e",
+				"",
+				"f",
+			},
+		},
+	}
+
+	for _, s := range scenarios {
+		t.Run(s.name, func(t *testing.T) {
+			tk := NewFromString(s.content)
+
+			tk.Separators(s.separators...)
+			tk.KeepSeparator(s.keepSeparator)
+			tk.KeepEmptyTokens(s.keepEmptyTokens)
+			tk.IgnoreParenthesis(s.ignoreParenthesis)
+
+			tokens, err := tk.ScanAll()
+
+			hasErr := err != nil
+			if hasErr != s.expectError {
+				t.Fatalf("Expected hasErr %v, got %v (%v)", s.expectError, hasErr, err)
+			}
+
+			if len(tokens) != len(s.expectTokens) {
+				t.Fatalf("Expected \n%v (%d), \ngot \n%v (%d)", s.expectTokens, len(s.expectTokens), tokens, len(tokens))
+			}
+
+			// compare position by position so that wrongly ordered or
+			// wrongly duplicated tokens are also detected
+			for i, tok := range tokens {
+				if tok != s.expectTokens[i] {
+					t.Fatalf("Expected token %q at index %d, got %q", s.expectTokens[i], i, tok)
+				}
+			}
+		})
+	}
+}
+
+func TestTrimCutset(t *testing.T) {
+	scenarios := []struct {
+		name           string
+		separators     []rune
+		expectedCutset string
+	}{
+		{
+			"default factory separators",
+			nil,
+			"\t\n\v\f\r \u0085\u00a0",
+		},
+		{
+			"custom separators",
+			[]rune{'\t', ' ', '\r', ','},
+			"\n\v\f\u0085\u00a0",
+		},
+	}
+
+	for _, s := range scenarios {
+		t.Run(s.name, func(t *testing.T) {
+			tk := NewFromString("")
+
+			if len(s.separators) > 0 {
+				tk.Separators(s.separators...)
+			}
+
+			if tk.trimCutset != s.expectedCutset {
+				t.Fatalf("Expected cutset %q, got %q", s.expectedCutset, tk.trimCutset)
+			}
+		})
+	}
+}