// golang-github-ganigeorgiev-.../scanner.go

package fexpr

import (
	"bytes"
	"fmt"
	"strings"
	"unicode/utf8"
)

// eof is the sentinel rune used to signal that there is nothing left to read.
const eof rune = 0

// JoinOp is the string literal of a join type operator.
type JoinOp string

// Join type operators recognized by the scanner.
const (
	JoinAnd = JoinOp("&&")
	JoinOr  = JoinOp("||")
)

// SignOp is the string literal of an expression sign operator.
type SignOp string

// Expression sign operators recognized by the scanner.
const (
	SignEq    = SignOp("=")
	SignNeq   = SignOp("!=")
	SignLike  = SignOp("~")
	SignNlike = SignOp("!~")
	SignLt    = SignOp("<")
	SignLte   = SignOp("<=")
	SignGt    = SignOp(">")
	SignGte   = SignOp(">=")

	// array/any variants (the same literals as above prefixed with "?")
	SignAnyEq    = SignOp("?=")
	SignAnyNeq   = SignOp("?!=")
	SignAnyLike  = SignOp("?~")
	SignAnyNlike = SignOp("?!~")
	SignAnyLt    = SignOp("?<")
	SignAnyLte   = SignOp("?<=")
	SignAnyGt    = SignOp("?>")
	SignAnyGte   = SignOp("?>=")
)

// TokenType is the string name of a Token kind.
type TokenType string

// Token kinds produced by the scanner.
const (
	TokenUnexpected = TokenType("unexpected")
	TokenEOF        = TokenType("eof")
	TokenWS         = TokenType("whitespace")
	TokenJoin       = TokenType("join")
	TokenSign       = TokenType("sign")
	TokenIdentifier = TokenType("identifier") // variable, column name, placeholder, etc.
	TokenFunction   = TokenType("function")   // function
	TokenNumber     = TokenType("number")
	TokenText       = TokenType("text")  // ' or " quoted string
	TokenGroup      = TokenType("group") // grouped/nested tokens
	TokenComment    = TokenType("comment")
)

// Token represents a single scanned literal (one or more combined runes).
//
// Type tells what kind of token was scanned and Literal holds its text.
// Meta carries optional extra data: for function tokens the scanner stores
// the scanned argument tokens (a []Token) in it.
type Token struct {
	Meta    interface{} // optional extra data (the []Token arguments of a function token)
	Type    TokenType   // kind of the scanned token
	Literal string      // text of the scanned token
}

// NewScanner returns a new scanner that reads from the start of data.
//
// The nested function depth limit (maxFuncDepth) of the returned
// scanner is set to 3.
func NewScanner(data []byte) *Scanner {
	s := new(Scanner)
	s.data = data
	s.maxFuncDepth = 3

	return s
}

// Scanner represents a filter and lexical scanner.
//
// It reads its input from an in-memory byte slice, one rune at a time.
type Scanner struct {
	data         []byte // raw input being scanned
	pos          int    // byte offset in data of the next rune to read
	maxFuncDepth int    // function depth budget handed to scanIdentifier by Scan
}

// Scan reads and returns the next available token from the scanner's data.
//
// It peeks at the next rune to find out which kind of token starts there
// and delegates the actual scanning to the matching scan* method.
// A TokenEOF token is returned when there is nothing left to read, and a
// TokenUnexpected token together with an error is returned when the rune
// doesn't start any of the known tokens.
func (s *Scanner) Scan() (Token, error) {
	first := s.read()

	// Apart from the eof and default cases, the rune is handed back so
	// that the dedicated scan method can consume the token from its start.
	switch {
	case first == eof:
		return Token{Type: TokenEOF, Literal: ""}, nil

	case isWhitespaceRune(first):
		s.unread()
		return s.scanWhitespace()

	case isGroupStartRune(first):
		s.unread()
		return s.scanGroup()

	case isIdentifierStartRune(first):
		s.unread()
		return s.scanIdentifier(s.maxFuncDepth)

	case isNumberStartRune(first):
		s.unread()
		return s.scanNumber()

	case isTextStartRune(first):
		s.unread()
		return s.scanText(false)

	case isSignStartRune(first):
		s.unread()
		return s.scanSign()

	case isJoinStartRune(first):
		s.unread()
		return s.scanJoin()

	case isCommentStartRune(first):
		s.unread()
		return s.scanComment()

	default:
		return Token{Type: TokenUnexpected, Literal: string(first)}, fmt.Errorf("unexpected character %q", first)
	}
}

// scanWhitespace consumes the run of contiguous whitespace runes found at
// the current position and returns it as a TokenWS token.
//
// The returned error is always nil.
func (s *Scanner) scanWhitespace() (Token, error) {
	var spaces bytes.Buffer

	// Keep collecting until EOF or the first non-whitespace rune.
	for ch := s.read(); ch != eof; ch = s.read() {
		if !isWhitespaceRune(ch) {
			// not part of this token, hand it back for the next scan
			s.unread()
			break
		}

		spaces.WriteRune(ch)
	}

	return Token{Type: TokenWS, Literal: spaces.String()}, nil
}

// scanNumber consumes the number literal found at the current position and
// returns it as a TokenNumber token
// (complex numbers and scientific notations are not supported).
//
// The literal is built from digits, an optional leading minus sign and at
// most one dot. An error is returned together with the token when the
// literal is just "-" or when it starts or ends with a dot.
func (s *Scanner) scanNumber() (Token, error) {
	var digits bytes.Buffer

	seenDot := false

collect:
	for {
		ch := s.read()

		switch {
		case ch == eof:
			break collect
		case isDigitRune(ch):
			// digits are always part of the number
		case ch == '-' && digits.Len() == 0:
			// the minus sign is accepted only as the very first rune
		case ch == '.' && !seenDot:
			// only a single dot is accepted
			seenDot = true
		default:
			// not part of the number, hand it back for the next scan
			s.unread()
			break collect
		}

		digits.WriteRune(ch)
	}

	literal := digits.String()
	last := len(literal) - 1

	if literal == "-" || literal[0] == '.' || literal[last] == '.' {
		return Token{Type: TokenNumber, Literal: literal}, fmt.Errorf("invalid number %q", literal)
	}

	return Token{Type: TokenNumber, Literal: literal}, nil
}

// scanText consumes the quoted text found at the current position and
// returns it as a TokenText token.
//
// The first rune read determines the quote character. The text ends at the
// next occurrence of that rune which is not directly preceded by a
// backslash. When no such rune is found before EOF, the token holds
// everything that was read and an error is returned.
//
// With preserveQuotes the literal is returned exactly as it was read.
// Otherwise the surrounding quotes are stripped and every backslash
// escaped quote is replaced with the plain quote.
func (s *Scanner) scanText(preserveQuotes bool) (Token, error) {
	quote := s.read()

	var raw bytes.Buffer
	raw.WriteRune(quote)

	closed := false
	escaped := false // whether the previously read rune was a backslash

	for ch := s.read(); ch != eof; ch = s.read() {
		raw.WriteRune(ch)

		if ch == quote && !escaped {
			closed = true
			break
		}

		escaped = ch == '\\'
	}

	text := raw.String()

	if !closed {
		return Token{Type: TokenText, Literal: text}, fmt.Errorf("invalid quoted text %q", text)
	}

	if preserveQuotes {
		return Token{Type: TokenText, Literal: text}, nil
	}

	// drop the surrounding quotes and the escape prefix of the inner ones
	q := string(quote)
	unquoted := strings.ReplaceAll(text[1:len(text)-1], `\`+q, q)

	return Token{Type: TokenText, Literal: unquoted}, nil
}

// scanComment consumes a single line comment found at the current position
// and returns it as a TokenComment token.
//
// The comment must begin with 2 consecutive comment start runes, otherwise
// ErrInvalidComment is returned. Everything after them up to (and including)
// the next newline character (\n), or up to EOF, is consumed. The token
// literal is the comment text without the leading comment start runes and
// with the surrounding whitespace trimmed.
func (s *Scanner) scanComment() (Token, error) {
	// Consume the 2 leading comment start runes without storing them.
	for n := 0; n < 2; n++ {
		if !isCommentStartRune(s.read()) {
			return Token{Type: TokenComment}, ErrInvalidComment
		}
	}

	var text bytes.Buffer

	// Collect everything until the end of the line or the end of the data.
	for ch := s.read(); ch != eof && ch != '\n'; ch = s.read() {
		text.WriteRune(ch)
	}

	return Token{Type: TokenComment, Literal: strings.TrimSpace(text.String())}, nil
}

// scanIdentifier consumes the identifier found at the current position and
// returns it as a TokenIdentifier token.
//
// When the identifier is directly followed by an opening bracket it is
// treated as a function name and the scanning continues in scanFunctionArgs,
// which produces a TokenFunction token. funcDepth is the remaining function
// depth budget: a function call is rejected with an error once it is
// zero or negative.
//
// An error is returned together with the token when the scanned literal
// (or function name) doesn't pass isValidIdentifier.
func (s *Scanner) scanIdentifier(funcDepth int) (Token, error) {
	var name bytes.Buffer

	// The first rune is stored as it is since it could be a special
	// identifier start character.
	name.WriteRune(s.read())

loop:
	for {
		ch := s.read()

		switch {
		case ch == eof:
			break loop

		case ch == '(':
			// function call
			fn := name.String()

			if funcDepth <= 0 {
				return Token{Type: TokenFunction, Literal: fn}, fmt.Errorf("max nested function arguments reached (max: %d)", s.maxFuncDepth)
			}

			if !isValidIdentifier(fn) {
				return Token{Type: TokenFunction, Literal: fn}, fmt.Errorf("invalid function name %q", fn)
			}

			// hand the bracket back because scanFunctionArgs reads it first
			s.unread()

			return s.scanFunctionArgs(fn, funcDepth)

		case isLetterRune(ch), isDigitRune(ch), isIdentifierCombineRune(ch), ch == '_':
			name.WriteRune(ch)

		default:
			// not an identifier character, hand it back for the next scan
			s.unread()
			break loop
		}
	}

	literal := name.String()

	if !isValidIdentifier(literal) {
		return Token{Type: TokenIdentifier, Literal: literal}, fmt.Errorf("invalid identifier %q", literal)
	}

	return Token{Type: TokenIdentifier, Literal: literal}, nil
}

// scanSign consumes the run of contiguous sign operator runes found at the
// current position and returns it as a TokenSign token.
//
// An error is returned together with the token when the collected literal
// is not one of the supported sign operators.
func (s *Scanner) scanSign() (Token, error) {
	var op bytes.Buffer

	// Keep collecting until EOF or the first non-sign rune.
	for ch := s.read(); ch != eof; ch = s.read() {
		if !isSignStartRune(ch) {
			// not part of the operator, hand it back for the next scan
			s.unread()
			break
		}

		op.WriteRune(ch)
	}

	literal := op.String()

	if isSignOperator(literal) {
		return Token{Type: TokenSign, Literal: literal}, nil
	}

	return Token{Type: TokenSign, Literal: literal}, fmt.Errorf("invalid sign operator %q", literal)
}

// scanJoin consumes the run of contiguous join operator runes found at the
// current position and returns it as a TokenJoin token.
//
// An error is returned together with the token when the collected literal
// is not one of the supported join operators.
func (s *Scanner) scanJoin() (Token, error) {
	var op bytes.Buffer

	// Keep collecting until EOF or the first non-join rune.
	for ch := s.read(); ch != eof; ch = s.read() {
		if !isJoinStartRune(ch) {
			// not part of the operator, hand it back for the next scan
			s.unread()
			break
		}

		op.WriteRune(ch)
	}

	literal := op.String()

	if isJoinOperator(literal) {
		return Token{Type: TokenJoin, Literal: literal}, nil
	}

	return Token{Type: TokenJoin, Literal: literal}, fmt.Errorf("invalid join operator %q", literal)
}

// scanGroup consumes a parenthesized group found at the current position
// and returns its content as a TokenGroup token.
//
// The outermost brackets are not part of the literal, while the brackets of
// the nested groups are. Quoted texts inside the group are scanned with
// scanText and copied together with their quotes, so brackets inside
// them don't affect the nesting level.
//
// An error is returned together with the token when a quoted text inside
// the group is invalid, when the first rune is not a group start rune
// or when EOF is reached before the group is closed.
func (s *Scanner) scanGroup() (Token, error) {
	var inner bytes.Buffer

	// the opening bracket is consumed without being stored
	opening := s.read()
	depth := 1

loop:
	for {
		ch := s.read()

		switch {
		case ch == eof:
			break loop

		case isGroupStartRune(ch):
			// nested group
			depth++
			inner.WriteRune(ch)

		case isTextStartRune(ch):
			s.unread()

			// keep the quotes to preserve the exact text start/end runes
			text, err := s.scanText(true)

			// the literal is stored as it is even when it is invalid
			inner.WriteString(text.Literal)

			if err != nil {
				return Token{Type: TokenGroup, Literal: inner.String()}, err
			}

		case ch == ')':
			depth--

			if depth <= 0 {
				// end of the main group
				break loop
			}

			inner.WriteRune(ch)

		default:
			inner.WriteRune(ch)
		}
	}

	if !isGroupStartRune(opening) || depth > 0 {
		return Token{Type: TokenGroup, Literal: inner.String()}, fmt.Errorf("invalid formatted group - missing %d closing bracket(s)", depth)
	}

	return Token{Type: TokenGroup, Literal: inner.String()}, nil
}

// scanFunctionArgs consumes a function call argument list, starting from its
// opening bracket, and returns a TokenFunction token named funcName with the
// scanned argument tokens loaded in Token.Meta (as []Token).
//
// The arguments can be identifiers (including nested function calls, which
// are scanned with funcDepth-1), numbers and quoted texts. Arguments must be
// separated by a comma; whitespaces and comments between them are skipped.
//
// On error the returned token still carries the arguments collected so far,
// except when the opening bracket itself is missing.
func (s *Scanner) scanFunctionArgs(funcName string, funcDepth int) (Token, error) {
	if s.read() != '(' {
		return Token{Type: TokenFunction, Literal: funcName}, fmt.Errorf("invalid or incomplete function call %q", funcName)
	}

	var args []Token

	// result builds the function token from the arguments collected so far.
	result := func(err error) (Token, error) {
		return Token{Type: TokenFunction, Literal: funcName, Meta: args}, err
	}

	// needComma reports whether the last thing scanned was an argument.
	needComma := false

	for {
		ch := s.read()

		switch {
		case ch == eof:
			// the data ended before the closing bracket
			return result(fmt.Errorf("invalid or incomplete function %q (expected ')')", funcName))

		case ch == ')':
			return result(nil)

		case isWhitespaceRune(ch):
			// the first whitespace is already consumed, skip the rest of them
			if _, err := s.scanWhitespace(); err != nil {
				return result(fmt.Errorf("failed to scan whitespaces in function %q: %w", funcName, err))
			}
			continue

		case isCommentStartRune(ch):
			s.unread()
			if _, err := s.scanComment(); err != nil {
				return result(fmt.Errorf("failed to scan comment in function %q: %w", funcName, err))
			}
			continue
		}

		if ch == ',' {
			if !needComma {
				return result(fmt.Errorf("unexpected comma in function %q", funcName))
			}
			needComma = false
			continue
		}

		if needComma {
			return result(fmt.Errorf("expected comma after the last argument in function %q", funcName))
		}

		var arg Token
		var err error

		switch {
		case isIdentifierStartRune(ch):
			s.unread()
			if arg, err = s.scanIdentifier(funcDepth - 1); err != nil {
				return result(fmt.Errorf("invalid identifier argument %q in function %q: %w", arg.Literal, funcName, err))
			}

		case isNumberStartRune(ch):
			s.unread()
			if arg, err = s.scanNumber(); err != nil {
				return result(fmt.Errorf("invalid number argument %q in function %q: %w", arg.Literal, funcName, err))
			}

		case isTextStartRune(ch):
			s.unread()
			if arg, err = s.scanText(false); err != nil {
				return result(fmt.Errorf("invalid text argument %q in function %q: %w", arg.Literal, funcName, err))
			}

		default:
			return result(fmt.Errorf("unsupported argument character %q in function %q", ch, funcName))
		}

		args = append(args, arg)
		needComma = true
	}
}

// unread reverts the last read call by moving the position back to the
// start of the rune that ends right before the current position.
//
// read advances the position by the encoded width of the decoded rune
// (1 to 4 bytes), so the position has to go back by that same width;
// stepping back a single byte would leave it in the middle of a multi-byte
// rune and the following read would return an invalid rune.
//
// It does nothing when the position is already at the start of the data.
func (s *Scanner) unread() {
	if s.pos <= 0 {
		return
	}

	// For a non-empty slice the reported width is always at least 1
	// (invalid encodings are reported as 1 byte wide, the same way
	// utf8.DecodeRune reports them when reading forward).
	_, width := utf8.DecodeLastRune(s.data[:s.pos])

	s.pos -= width
}

// read decodes the rune at the current position, moves the position past
// it and returns it.
//
// eof is returned, without moving the position, when there is no more data.
// Note that a NUL byte in the data decodes to the same rune value as eof.
func (s *Scanner) read() rune {
	if remaining := len(s.data) - s.pos; remaining <= 0 {
		return eof
	}

	r, size := utf8.DecodeRune(s.data[s.pos:])
	s.pos += size

	return r
}

// Lexical helpers:
// -------------------------------------------------------------------

// isWhitespaceRune reports whether ch is a space, a tab or a newline (\n).
func isWhitespaceRune(ch rune) bool {
	switch ch {
	case ' ', '\t', '\n':
		return true
	default:
		return false
	}
}

// isLetterRune reports whether ch is an ASCII letter (a-z or A-Z).
func isLetterRune(ch rune) bool {
	switch {
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	default:
		return false
	}
}

// isDigitRune reports whether ch is an ASCII digit (0-9).
func isDigitRune(ch rune) bool {
	if ch < '0' {
		return false
	}

	return ch <= '9'
}

// isTextStartRune reports whether ch can start a quoted text
// (aka. whether it is a single or a double quote).
func isTextStartRune(ch rune) bool {
	switch ch {
	case '"', '\'':
		return true
	default:
		return false
	}
}

// isNumberStartRune reports whether ch can start a number
// (aka. whether it is a minus sign or a digit).
func isNumberStartRune(ch rune) bool {
	if ch == '-' {
		return true
	}

	return isDigitRune(ch)
}

// isSignStartRune reports whether ch is one of the runes
// that the sign operators are made of.
func isSignStartRune(ch rune) bool {
	switch ch {
	case '=', '?', '!', '>', '<', '~':
		return true
	default:
		return false
	}
}

// isJoinStartRune reports whether ch is one of the runes
// that the join type operators are made of.
func isJoinStartRune(ch rune) bool {
	switch ch {
	case '&', '|':
		return true
	default:
		return false
	}
}

// isGroupStartRune reports whether ch opens a group
// (aka. whether it is an opening parenthesis).
func isGroupStartRune(ch rune) bool {
	const opening = '('

	return ch == opening
}

// isCommentStartRune reports whether ch is the rune
// that the comment start marker is made of (aka. a slash).
func isCommentStartRune(ch rune) bool {
	const slash = '/'

	return ch == slash
}

// isIdentifierStartRune checks if a rune is valid identifier's first character.
func isIdentifierStartRune(ch rune) bool {
	return isLetterRune(ch) || isIdentifierSpecialStartRune(ch)
}

// isIdentifierSpecialStartRune reports whether ch is one of the special
// (non-letter) characters that an identifier can start with.
func isIdentifierSpecialStartRune(ch rune) bool {
	switch ch {
	case '@', '_', '#':
		return true
	default:
		return false
	}
}

// isIdentifierCombineRune reports whether ch is one of the characters
// that combine identifier parts (aka. a dot or a colon).
func isIdentifierCombineRune(ch rune) bool {
	switch ch {
	case '.', ':':
		return true
	default:
		return false
	}
}

// isSignOperator reports whether literal is exactly
// one of the supported sign operators.
func isSignOperator(literal string) bool {
	supported := [...]SignOp{
		SignEq,
		SignNeq,
		SignLt,
		SignLte,
		SignGt,
		SignGte,
		SignLike,
		SignNlike,
		SignAnyEq,
		SignAnyNeq,
		SignAnyLike,
		SignAnyNlike,
		SignAnyLt,
		SignAnyLte,
		SignAnyGt,
		SignAnyGte,
	}

	candidate := SignOp(literal)

	for _, op := range supported {
		if candidate == op {
			return true
		}
	}

	return false
}

// isJoinOperator reports whether literal is exactly
// one of the supported join type operators.
func isJoinOperator(literal string) bool {
	op := JoinOp(literal)

	return op == JoinAnd || op == JoinOr
}

// isValidIdentifier validates the literal against common identifier requirements.
//
// A literal is considered valid when all of the following apply:
//   - it is not empty
//   - it doesn't end with a combine rune
//   - it is not just a single special start rune
//
// Only the first and the last byte of the literal are inspected.
func isValidIdentifier(literal string) bool {
	// Guard the byte indexing below - an empty literal is never a valid
	// identifier and must not cause an index out of range panic.
	if literal == "" {
		return false
	}

	last := rune(literal[len(literal)-1])
	if isIdentifierCombineRune(last) {
		return false
	}

	if len(literal) == 1 && isIdentifierSpecialStartRune(rune(literal[0])) {
		return false
	}

	return true
}