1
0
Fork 0
golang-github-ganigeorgiev-.../scanner.go
Daniel Baumann 16e40566d2
Adding upstream version 0.5.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-05-22 20:57:13 +02:00

679 lines
16 KiB
Go

package fexpr
import (
"bytes"
"fmt"
"strings"
"unicode/utf8"
)
// eof represents a marker rune for the end of the reader.
const eof = rune(0)
// JoinOp represents a join type operator.
type JoinOp string
// supported join type operators
const (
JoinAnd JoinOp = "&&"
JoinOr JoinOp = "||"
)
// SignOp represents an expression sign operator.
type SignOp string
// supported expression sign operators
const (
SignEq SignOp = "="
SignNeq SignOp = "!="
SignLike SignOp = "~"
SignNlike SignOp = "!~"
SignLt SignOp = "<"
SignLte SignOp = "<="
SignGt SignOp = ">"
SignGte SignOp = ">="
// array/any operators
SignAnyEq SignOp = "?="
SignAnyNeq SignOp = "?!="
SignAnyLike SignOp = "?~"
SignAnyNlike SignOp = "?!~"
SignAnyLt SignOp = "?<"
SignAnyLte SignOp = "?<="
SignAnyGt SignOp = "?>"
SignAnyGte SignOp = "?>="
)
// TokenType represents a Token type.
type TokenType string
// token type constants
const (
TokenUnexpected TokenType = "unexpected"
TokenEOF TokenType = "eof"
TokenWS TokenType = "whitespace"
TokenJoin TokenType = "join"
TokenSign TokenType = "sign"
TokenIdentifier TokenType = "identifier" // variable, column name, placeholder, etc.
TokenFunction TokenType = "function" // function
TokenNumber TokenType = "number"
TokenText TokenType = "text" // ' or " quoted string
TokenGroup TokenType = "group" // groupped/nested tokens
TokenComment TokenType = "comment"
)
// Token represents a single scanned literal (one or more combined runes).
type Token struct {
Meta interface{}
Type TokenType
Literal string
}
// NewScanner creates and returns a new scanner instance loaded with the specified data.
func NewScanner(data []byte) *Scanner {
return &Scanner{
data: data,
maxFuncDepth: 3,
}
}
// Scanner represents a filter and lexical scanner.
type Scanner struct {
data []byte
pos int
maxFuncDepth int
}
// Scan reads and returns the next available token value from the scanner's buffer.
func (s *Scanner) Scan() (Token, error) {
ch := s.read()
if ch == eof {
return Token{Type: TokenEOF, Literal: ""}, nil
}
if isWhitespaceRune(ch) {
s.unread()
return s.scanWhitespace()
}
if isGroupStartRune(ch) {
s.unread()
return s.scanGroup()
}
if isIdentifierStartRune(ch) {
s.unread()
return s.scanIdentifier(s.maxFuncDepth)
}
if isNumberStartRune(ch) {
s.unread()
return s.scanNumber()
}
if isTextStartRune(ch) {
s.unread()
return s.scanText(false)
}
if isSignStartRune(ch) {
s.unread()
return s.scanSign()
}
if isJoinStartRune(ch) {
s.unread()
return s.scanJoin()
}
if isCommentStartRune(ch) {
s.unread()
return s.scanComment()
}
return Token{Type: TokenUnexpected, Literal: string(ch)}, fmt.Errorf("unexpected character %q", ch)
}
// scanWhitespace consumes all contiguous whitespace runes.
func (s *Scanner) scanWhitespace() (Token, error) {
var buf bytes.Buffer
// Reads every subsequent whitespace character into the buffer.
// Non-whitespace runes and EOF will cause the loop to exit.
for {
ch := s.read()
if ch == eof {
break
}
if !isWhitespaceRune(ch) {
s.unread()
break
}
// write the whitespace rune
buf.WriteRune(ch)
}
return Token{Type: TokenWS, Literal: buf.String()}, nil
}
// scanNumber consumes all contiguous digit runes
// (complex numbers and scientific notations are not supported).
func (s *Scanner) scanNumber() (Token, error) {
var buf bytes.Buffer
var hadDot bool
// Read every subsequent digit rune into the buffer.
// Non-digit runes and EOF will cause the loop to exit.
for {
ch := s.read()
if ch == eof {
break
}
// not a digit rune
if !isDigitRune(ch) &&
// minus sign but not at the beginning
(ch != '-' || buf.Len() != 0) &&
// dot but there was already another dot
(ch != '.' || hadDot) {
s.unread()
break
}
// write the rune
buf.WriteRune(ch)
if ch == '.' {
hadDot = true
}
}
total := buf.Len()
literal := buf.String()
var err error
// only "-" or starts with "." or ends with "."
if (total == 1 && literal[0] == '-') || literal[0] == '.' || literal[total-1] == '.' {
err = fmt.Errorf("invalid number %q", literal)
}
return Token{Type: TokenNumber, Literal: buf.String()}, err
}
// scanText consumes all contiguous quoted text runes.
func (s *Scanner) scanText(preserveQuotes bool) (Token, error) {
var buf bytes.Buffer
// read the first rune to determine the quotes type
firstCh := s.read()
buf.WriteRune(firstCh)
var prevCh rune
var hasMatchingQuotes bool
// Read every subsequent text rune into the buffer.
// EOF and matching unescaped ending quote will cause the loop to exit.
for {
ch := s.read()
if ch == eof {
break
}
// write the text rune
buf.WriteRune(ch)
// unescaped matching quote, aka. the end
if ch == firstCh && prevCh != '\\' {
hasMatchingQuotes = true
break
}
prevCh = ch
}
literal := buf.String()
var err error
if !hasMatchingQuotes {
err = fmt.Errorf("invalid quoted text %q", literal)
} else if !preserveQuotes {
// unquote
literal = literal[1 : len(literal)-1]
// remove escaped quotes prefix (aka. \)
firstChStr := string(firstCh)
literal = strings.ReplaceAll(literal, `\`+firstChStr, firstChStr)
}
return Token{Type: TokenText, Literal: literal}, err
}
// scanComment consumes all contiguous single line comment runes until
// a new character (\n) or EOF is reached.
func (s *Scanner) scanComment() (Token, error) {
var buf bytes.Buffer
// Read the first 2 characters without writting them to the buffer.
if !isCommentStartRune(s.read()) || !isCommentStartRune(s.read()) {
return Token{Type: TokenComment}, ErrInvalidComment
}
// Read every subsequent comment text rune into the buffer.
// \n and EOF will cause the loop to exit.
for i := 0; ; i++ {
ch := s.read()
if ch == eof || ch == '\n' {
break
}
buf.WriteRune(ch)
}
return Token{Type: TokenComment, Literal: strings.TrimSpace(buf.String())}, nil
}
// scanIdentifier consumes all contiguous ident runes.
func (s *Scanner) scanIdentifier(funcDepth int) (Token, error) {
var buf bytes.Buffer
// read the first rune in case it is a special start identifier character
buf.WriteRune(s.read())
// Read every subsequent identifier rune into the buffer.
// Non-ident runes and EOF will cause the loop to exit.
for {
ch := s.read()
if ch == eof {
break
}
// func
if ch == '(' {
funcName := buf.String()
if funcDepth <= 0 {
return Token{Type: TokenFunction, Literal: funcName}, fmt.Errorf("max nested function arguments reached (max: %d)", s.maxFuncDepth)
}
if !isValidIdentifier(funcName) {
return Token{Type: TokenFunction, Literal: funcName}, fmt.Errorf("invalid function name %q", funcName)
}
s.unread()
return s.scanFunctionArgs(funcName, funcDepth)
}
// not an identifier character
if !isLetterRune(ch) && !isDigitRune(ch) && !isIdentifierCombineRune(ch) && ch != '_' {
s.unread()
break
}
// write the identifier rune
buf.WriteRune(ch)
}
literal := buf.String()
var err error
if !isValidIdentifier(literal) {
err = fmt.Errorf("invalid identifier %q", literal)
}
return Token{Type: TokenIdentifier, Literal: literal}, err
}
// scanSign consumes all contiguous sign operator runes.
func (s *Scanner) scanSign() (Token, error) {
var buf bytes.Buffer
// Read every subsequent sign rune into the buffer.
// Non-sign runes and EOF will cause the loop to exit.
for {
ch := s.read()
if ch == eof {
break
}
if !isSignStartRune(ch) {
s.unread()
break
}
// write the sign rune
buf.WriteRune(ch)
}
literal := buf.String()
var err error
if !isSignOperator(literal) {
err = fmt.Errorf("invalid sign operator %q", literal)
}
return Token{Type: TokenSign, Literal: literal}, err
}
// scanJoin consumes all contiguous join operator runes.
func (s *Scanner) scanJoin() (Token, error) {
var buf bytes.Buffer
// Read every subsequent join operator rune into the buffer.
// Non-join runes and EOF will cause the loop to exit.
for {
ch := s.read()
if ch == eof {
break
}
if !isJoinStartRune(ch) {
s.unread()
break
}
// write the join operator rune
buf.WriteRune(ch)
}
literal := buf.String()
var err error
if !isJoinOperator(literal) {
err = fmt.Errorf("invalid join operator %q", literal)
}
return Token{Type: TokenJoin, Literal: literal}, err
}
// scanGroup consumes all runes within a group/parenthesis.
func (s *Scanner) scanGroup() (Token, error) {
var buf bytes.Buffer
// read the first group bracket without writing it to the buffer
firstChar := s.read()
openGroups := 1
// Read every subsequent text rune into the buffer.
// EOF and matching unescaped ending quote will cause the loop to exit.
for {
ch := s.read()
if ch == eof {
break
}
if isGroupStartRune(ch) {
// nested group
openGroups++
buf.WriteRune(ch)
} else if isTextStartRune(ch) {
s.unread()
t, err := s.scanText(true) // with quotes to preserve the exact text start/end runes
if err != nil {
// write the errored literal as it is
buf.WriteString(t.Literal)
return Token{Type: TokenGroup, Literal: buf.String()}, err
}
buf.WriteString(t.Literal)
} else if ch == ')' {
openGroups--
if openGroups <= 0 {
// main group end
break
} else {
buf.WriteRune(ch)
}
} else {
buf.WriteRune(ch)
}
}
literal := buf.String()
var err error
if !isGroupStartRune(firstChar) || openGroups > 0 {
err = fmt.Errorf("invalid formatted group - missing %d closing bracket(s)", openGroups)
}
return Token{Type: TokenGroup, Literal: literal}, err
}
// scanFunctionArgs consumes all contiguous function call runes to
// extract its arguments and returns a function token with the found
// Token arguments loaded in Token.Meta.
func (s *Scanner) scanFunctionArgs(funcName string, funcDepth int) (Token, error) {
var args []Token
var expectComma, isComma, isClosed bool
ch := s.read()
if ch != '(' {
return Token{Type: TokenFunction, Literal: funcName}, fmt.Errorf("invalid or incomplete function call %q", funcName)
}
// Read every subsequent rune until ')' or EOF has been reached.
for {
ch := s.read()
if ch == eof {
break
}
if ch == ')' {
isClosed = true
break
}
// skip whitespaces
if isWhitespaceRune(ch) {
_, err := s.scanWhitespace()
if err != nil {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("failed to scan whitespaces in function %q: %w", funcName, err)
}
continue
}
// skip comments
if isCommentStartRune(ch) {
s.unread()
_, err := s.scanComment()
if err != nil {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("failed to scan comment in function %q: %w", funcName, err)
}
continue
}
isComma = ch == ','
if expectComma && !isComma {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("expected comma after the last argument in function %q", funcName)
}
if !expectComma && isComma {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("unexpected comma in function %q", funcName)
}
expectComma = false // reset
if isComma {
continue
}
if isIdentifierStartRune(ch) {
s.unread()
t, err := s.scanIdentifier(funcDepth - 1)
if err != nil {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid identifier argument %q in function %q: %w", t.Literal, funcName, err)
}
args = append(args, t)
expectComma = true
} else if isNumberStartRune(ch) {
s.unread()
t, err := s.scanNumber()
if err != nil {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid number argument %q in function %q: %w", t.Literal, funcName, err)
}
args = append(args, t)
expectComma = true
} else if isTextStartRune(ch) {
s.unread()
t, err := s.scanText(false)
if err != nil {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid text argument %q in function %q: %w", t.Literal, funcName, err)
}
args = append(args, t)
expectComma = true
} else {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("unsupported argument character %q in function %q", ch, funcName)
}
}
if !isClosed {
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid or incomplete function %q (expected ')')", funcName)
}
return Token{Type: TokenFunction, Literal: funcName, Meta: args}, nil
}
// unread unreads the last character and revert the position 1 step back.
func (s *Scanner) unread() {
if s.pos > 0 {
s.pos = s.pos - 1
}
}
// read reads the next rune and moves the position forward.
func (s *Scanner) read() rune {
if s.pos >= len(s.data) {
return eof
}
ch, n := utf8.DecodeRune(s.data[s.pos:])
s.pos += n
return ch
}
// Lexical helpers:
// -------------------------------------------------------------------
// isWhitespaceRune checks if a rune is a space, tab, or newline.
func isWhitespaceRune(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' }
// isLetterRune checks if a rune is a letter.
func isLetterRune(ch rune) bool {
return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
}
// isDigitRune checks if a rune is a digit.
func isDigitRune(ch rune) bool {
return (ch >= '0' && ch <= '9')
}
// isTextStartRune checks if a rune is a valid quoted text first character
// (aka. single or double quote).
func isTextStartRune(ch rune) bool {
return ch == '\'' || ch == '"'
}
// isNumberStartRune checks if a rune is a valid number start character (aka. digit).
func isNumberStartRune(ch rune) bool {
return ch == '-' || isDigitRune(ch)
}
// isSignStartRune checks if a rune is a valid sign operator start character.
func isSignStartRune(ch rune) bool {
return ch == '=' ||
ch == '?' ||
ch == '!' ||
ch == '>' ||
ch == '<' ||
ch == '~'
}
// isJoinStartRune checks if a rune is a valid join type start character.
func isJoinStartRune(ch rune) bool {
return ch == '&' || ch == '|'
}
// isGroupStartRune checks if a rune is a valid group/parenthesis start character.
func isGroupStartRune(ch rune) bool {
return ch == '('
}
// isCommentStartRune checks if a rune is a valid comment start character.
func isCommentStartRune(ch rune) bool {
return ch == '/'
}
// isIdentifierStartRune checks if a rune is valid identifier's first character.
func isIdentifierStartRune(ch rune) bool {
return isLetterRune(ch) || isIdentifierSpecialStartRune(ch)
}
// isIdentifierSpecialStartRune checks if a rune is valid identifier's first special character.
func isIdentifierSpecialStartRune(ch rune) bool {
return ch == '@' || ch == '_' || ch == '#'
}
// isIdentifierCombineRune checks if a rune is valid identifier's combine character.
func isIdentifierCombineRune(ch rune) bool {
return ch == '.' || ch == ':'
}
// isSignOperator checks if a literal is a valid sign operator.
func isSignOperator(literal string) bool {
switch SignOp(literal) {
case
SignEq,
SignNeq,
SignLt,
SignLte,
SignGt,
SignGte,
SignLike,
SignNlike,
SignAnyEq,
SignAnyNeq,
SignAnyLike,
SignAnyNlike,
SignAnyLt,
SignAnyLte,
SignAnyGt,
SignAnyGte:
return true
}
return false
}
// isJoinOperator checks if a literal is a valid join type operator.
func isJoinOperator(literal string) bool {
switch JoinOp(literal) {
case
JoinAnd,
JoinOr:
return true
}
return false
}
// isValidIdentifier validates the literal against common identifier requirements.
func isValidIdentifier(literal string) bool {
length := len(literal)
return (
// doesn't end with combine rune
!isIdentifierCombineRune(rune(literal[length-1])) &&
// is not just a special start rune
(length != 1 || !isIdentifierSpecialStartRune(rune(literal[0]))))
}