From 16e40566d2f510b56dea8a44c2eab3ed88f8ba3e Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Thu, 22 May 2025 20:57:13 +0200
Subject: [PATCH] Adding upstream version 0.5.0.

Signed-off-by: Daniel Baumann
---
 LICENSE.md       |  29 ++
 README.md        | 118 ++++++++
 examples_test.go |  36 +++
 go.mod           |   3 +
 parser.go        | 130 +++++++++
 parser_test.go   | 142 ++++++++++
 scanner.go       | 679 +++++++++++++++++++++++++++++++++++++++++++++++
 scanner_test.go  | 166 ++++++++++++
 8 files changed, 1303 insertions(+)
 create mode 100644 LICENSE.md
 create mode 100644 README.md
 create mode 100644 examples_test.go
 create mode 100644 go.mod
 create mode 100644 parser.go
 create mode 100644 parser_test.go
 create mode 100644 scanner.go
 create mode 100644 scanner_test.go

diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..d180815
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022-present, Gani Georgiev
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a71c469
--- /dev/null
+++ b/README.md
@@ -0,0 +1,118 @@
+fexpr
+[![Go Report Card](https://goreportcard.com/badge/github.com/ganigeorgiev/fexpr)](https://goreportcard.com/report/github.com/ganigeorgiev/fexpr)
+[![GoDoc](https://godoc.org/github.com/ganigeorgiev/fexpr?status.svg)](https://pkg.go.dev/github.com/ganigeorgiev/fexpr)
+================================================================================
+
+**fexpr** is a filter query language parser that generates an easy-to-work-with AST structure, so that you can safely create SQL, Elasticsearch, etc. queries from user input.
+
+Or in other words, it transforms the string `"id > 1"` into the struct `[{&& {{identifier id} > {number 1}}}]`.
+
+Supports parentheses and various conditional expression operators (see [Grammar](https://github.com/ganigeorgiev/fexpr#grammar)).
+
+
+## Example usage
+
+```
+go get github.com/ganigeorgiev/fexpr
+```
+
+```go
+package main
+
+import "github.com/ganigeorgiev/fexpr"
+
+func main() {
+	// [{&& {{identifier id} = {number 123}}} {&& {{identifier status} = {text active}}}]
+	result, err := fexpr.Parse("id=123 && status='active'")
+}
+```
+
+> Note that each parsed expression statement contains a join/union operator (`&&` or `||`) so that the result can be consumed in small chunks without having to rely on the group/nesting context.
+
+> See the [package documentation](https://pkg.go.dev/github.com/ganigeorgiev/fexpr) for more details and examples.
+
+
+## Grammar
+
+**fexpr** grammar resembles the SQL `WHERE` expression syntax. It recognizes several token types (identifiers, numbers, quoted text, expression operators, whitespaces, etc.).
+
+> You can find all supported tokens in [`scanner.go`](https://github.com/ganigeorgiev/fexpr/blob/master/scanner.go).
+
+#### Operators
+
+- **`=`** Equal operator (eg. `a=b`)
+- **`!=`** NOT Equal operator (eg. `a!=b`)
+- **`>`** Greater than operator (eg. `a>b`)
+- **`>=`** Greater than or equal operator (eg. `a>=b`)
+- **`<`** Less than operator (eg. `a<b`)
+- **`<=`** Less than or equal operator (eg. `a<=b`)
+- **`~`** Like/Contains operator (eg. `a~b`)
+- **`!~`** NOT Like/Contains operator (eg. `a!~b`)
+- **`?=`** Array/Any Equal operator (eg. `a?=b`)
+- **`?!=`** Array/Any NOT Equal operator (eg. `a?!=b`)
+- **`?>`** Array/Any Greater than operator (eg. `a?>b`)
+- **`?>=`** Array/Any Greater than or equal operator (eg. `a?>=b`)
+- **`?<`** Array/Any Less than operator (eg. `a?<b`)
+- **`?<=`** Array/Any Less than or equal operator (eg. `a?<=b`)
+- **`?~`** Array/Any Like/Contains operator (eg. `a?~b`)
+- **`?!~`** Array/Any NOT Like/Contains operator (eg. `a?!~b`)
+
+
+## Using only the scanner
+
+The tokenizer (aka. the scanner) can also be used on its own, without the parser's state machine:
+
+```go
+s := fexpr.NewScanner([]byte("id > 123"))
+
+// scan single token at a time until EOF or error is reached
+for {
+	t, err := s.Scan()
+	if t.Type == fexpr.TokenEOF || err != nil {
+		break
+	}
+
+	fmt.Println(t)
+}
+
+// Output:
+// { identifier id}
+// { whitespace }
+// { sign >}
+// { whitespace }
+// { number 123}
+```
diff --git a/examples_test.go b/examples_test.go
new file mode 100644
index 0000000..82fb2bf
--- /dev/null
+++ b/examples_test.go
@@ -0,0 +1,36 @@
+package fexpr_test
+
+import (
+	"fmt"
+
+	"github.com/ganigeorgiev/fexpr"
+)
+
+func ExampleScanner_Scan() {
+	s := fexpr.NewScanner([]byte("id > 123"))
+
+	for {
+		t, err := s.Scan()
+		if t.Type == fexpr.TokenEOF || err != nil {
+			break
+		}
+
+		fmt.Println(t)
+	}
+
+	// Output:
+	// { identifier id}
+	// { whitespace }
+	// { sign >}
+	// { whitespace }
+	// { number 123}
+}
+
+func ExampleParse() {
+	result, _ := fexpr.Parse("id > 123")
+
+	fmt.Println(result)
+
+	// Output:
+	// [{{{ identifier id} > { number 123}} &&}]
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..5b6fb4d
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/ganigeorgiev/fexpr
+
+go 1.16
diff --git a/parser.go b/parser.go
new file mode 100644
index 0000000..aba8393
--- /dev/null
+++ b/parser.go
@@ -0,0 +1,130 @@
+package fexpr
+
+import (
+	"errors"
+	"fmt"
+)
+
+var ErrEmpty = errors.New("empty filter expression")
+var ErrIncomplete = errors.New("invalid or incomplete filter expression")
+var ErrInvalidComment = errors.New("invalid comment")
+
+// Expr represents an individual tokenized expression consisting
+// of a left operand, an operator and a right operand.
+type Expr struct {
+	Left  Token
+	Op    SignOp
+	Right Token
+}
+
+// IsZero checks if the current Expr has zero-valued props.
+func (e Expr) IsZero() bool {
+	return e.Op == "" && e.Left.Literal == "" && e.Left.Type == "" && e.Right.Literal == "" && e.Right.Type == ""
+}
+
+// ExprGroup represents a wrapped expression and its join type.
+//
+// The group's Item could be either an `Expr` instance or `[]ExprGroup` slice (for nested expressions).
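+//
+// For illustration, parsing `(a=1 && 2!=3) || "b"=a` results in two groups:
+// the first with Item of type []ExprGroup (the parenthesized part) and Join
+// `&&`, and the second with Item of type Expr and Join `||` (see the matching
+// scenario in parser_test.go).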
+type ExprGroup struct {
+	Item interface{}
+	Join JoinOp
+}
+
+// parser's state machine steps
+const (
+	stepBeforeSign = iota
+	stepSign
+	stepAfterSign
+	stepJoin
+)
+
+// Parse parses the provided text and returns its processed AST
+// in the form of `ExprGroup` slice(s).
+//
+// Comments and whitespaces are ignored.
+func Parse(text string) ([]ExprGroup, error) {
+	result := []ExprGroup{}
+	scanner := NewScanner([]byte(text))
+	step := stepBeforeSign
+	join := JoinAnd
+
+	var expr Expr
+
+	for {
+		t, err := scanner.Scan()
+		if err != nil {
+			return nil, err
+		}
+
+		if t.Type == TokenEOF {
+			break
+		}
+
+		if t.Type == TokenWS || t.Type == TokenComment {
+			continue
+		}
+
+		if t.Type == TokenGroup {
+			groupResult, err := Parse(t.Literal)
+			if err != nil {
+				return nil, err
+			}
+
+			// append only if non-empty group
+			if len(groupResult) > 0 {
+				result = append(result, ExprGroup{Join: join, Item: groupResult})
+			}
+
+			step = stepJoin
+			continue
+		}
+
+		switch step {
+		case stepBeforeSign:
+			if t.Type != TokenIdentifier && t.Type != TokenText && t.Type != TokenNumber && t.Type != TokenFunction {
+				return nil, fmt.Errorf("expected left operand (identifier, function, text or number), got %q (%s)", t.Literal, t.Type)
+			}
+
+			expr = Expr{Left: t}
+
+			step = stepSign
+		case stepSign:
+			if t.Type != TokenSign {
+				return nil, fmt.Errorf("expected a sign operator, got %q (%s)", t.Literal, t.Type)
+			}
+
+			expr.Op = SignOp(t.Literal)
+			step = stepAfterSign
+		case stepAfterSign:
+			if t.Type != TokenIdentifier && t.Type != TokenText && t.Type != TokenNumber && t.Type != TokenFunction {
+				return nil, fmt.Errorf("expected right operand (identifier, function, text or number), got %q (%s)", t.Literal, t.Type)
+			}
+
+			expr.Right = t
+			result = append(result, ExprGroup{Join: join, Item: expr})
+
+			step = stepJoin
+		case stepJoin:
+			if t.Type != TokenJoin {
+				return nil, fmt.Errorf("expected && or ||, got %q (%s)", t.Literal, t.Type)
+			}
+
+			join = JoinAnd
+			if t.Literal == "||" {
+				join = JoinOr
+			}
+
+			step = stepBeforeSign
+		}
+	}
+
+	if step != stepJoin {
+		if len(result) == 0 && expr.IsZero() {
+			return nil, ErrEmpty
+		}
+
+		return nil, ErrIncomplete
+	}
+
+	return result, nil
+}
diff --git a/parser_test.go b/parser_test.go
new file mode 100644
index 0000000..9209ba7
--- /dev/null
+++ b/parser_test.go
@@ -0,0 +1,142 @@
+package fexpr
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestExprIsZero(t *testing.T) {
+	scenarios := []struct {
+		expr   Expr
+		result bool
+	}{
+		{Expr{}, true},
+		{Expr{Op: SignAnyEq}, false},
+		{Expr{Left: Token{Literal: "123"}}, false},
+		{Expr{Left: Token{Type: TokenWS}}, false},
+		{Expr{Right: Token{Literal: "123"}}, false},
+		{Expr{Right: Token{Type: TokenWS}}, false},
+	}
+
+	for i, s := range scenarios {
+		t.Run(fmt.Sprintf("s%d", i), func(t *testing.T) {
+			if v := s.expr.IsZero(); v != s.result {
+				t.Fatalf("Expected %v, got %v for \n%v", s.result, v, s.expr)
+			}
+		})
+	}
+}
+
+func TestParse(t *testing.T) {
+	scenarios := []struct {
+		input         string
+		expectedError bool
+		expectedPrint string
+	}{
+		{`> 1`, true, "[]"},
+		{`a >`, true, "[]"},
+		{`a > >`, true, "[]"},
+		{`a > %`, true, "[]"},
+		{`a ! 
1`, true, "[]"}, + {`a - 1`, true, "[]"}, + {`a + 1`, true, "[]"}, + {`1 - 1`, true, "[]"}, + {`1 + 1`, true, "[]"}, + {`> a 1`, true, "[]"}, + {`a || 1`, true, "[]"}, + {`a && 1`, true, "[]"}, + {`test > 1 &&`, true, `[]`}, + {`|| test = 1`, true, `[]`}, + {`test = 1 && ||`, true, "[]"}, + {`test = 1 && a`, true, "[]"}, + {`test = 1 && a`, true, "[]"}, + {`test = 1 && "a"`, true, "[]"}, + {`test = 1 a`, true, "[]"}, + {`test = 1 a`, true, "[]"}, + {`test = 1 "a"`, true, "[]"}, + {`test = 1@test`, true, "[]"}, + {`test = .@test`, true, "[]"}, + // mismatched text quotes + {`test = "demo'`, true, "[]"}, + {`test = 'demo"`, true, "[]"}, + {`test = 'demo'"`, true, "[]"}, + {`test = 'demo''`, true, "[]"}, + {`test = "demo"'`, true, "[]"}, + {`test = "demo""`, true, "[]"}, + {`test = ""demo""`, true, "[]"}, + {`test = ''demo''`, true, "[]"}, + {"test = `demo`", true, "[]"}, + // comments + {"test = / demo", true, "[]"}, + {"test = // demo", true, "[]"}, + {"// demo", true, "[]"}, + {"test = 123 // demo", false, "[{{{ identifier test} = { number 123}} &&}]"}, + {"test = // demo\n123", false, "[{{{ identifier test} = { number 123}} &&}]"}, + {` + a = 123 && + // demo + b = 456 + `, false, "[{{{ identifier a} = { number 123}} &&} {{{ identifier b} = { number 456}} &&}]"}, + // functions + {`test() = 12`, false, `[{{{[] function test} = { number 12}} &&}]`}, + {`(a.b.c(1) = d.e.f(2)) || 1=2`, false, `[{[{{{[{ number 1}] function a.b.c} = {[{ number 2}] function d.e.f}} &&}] &&} {{{ number 1} = { number 2}} ||}]`}, + // valid simple expression and sign operators check + {`1=12`, false, `[{{{ number 1} = { number 12}} &&}]`}, + {` 1 = 12 `, false, `[{{{ number 1} = { number 12}} &&}]`}, + {`"demo" != test`, false, `[{{{ text demo} != { identifier test}} &&}]`}, + {`a~1`, false, `[{{{ identifier a} ~ { number 1}} &&}]`}, + {`a !~ 1`, false, `[{{{ identifier a} !~ { number 1}} &&}]`}, + {`test>12`, false, `[{{{ identifier test} > { number 12}} &&}]`}, + {`test > 12`, false, `[{{{ identifier test} > { number 12}} &&}]`}, + {`test >="test"`, false, `[{{{ identifier test} >= { text test}} &&}]`}, + {`test<@demo.test2`, false, `[{{{ identifier test} < { identifier @demo.test2}} &&}]`}, + {`1<="test"`, false, `[{{{ number 1} <= { text test}} &&}]`}, + {`1<="te'st"`, false, `[{{{ number 1} <= { text te'st}} &&}]`}, + {`demo='te\'st'`, false, `[{{{ identifier demo} = { text te'st}} &&}]`}, + {`demo="te\'st"`, false, `[{{{ identifier demo} = { text te\'st}} &&}]`}, + {`demo="te\"st"`, false, `[{{{ identifier demo} = { text te"st}} &&}]`}, + // invalid parenthesis + {`(a=1`, true, `[]`}, + {`a=1)`, true, `[]`}, + {`((a=1)`, true, `[]`}, + {`{a=1}`, true, `[]`}, + {`[a=1]`, true, `[]`}, + {`((a=1 || a=2) && c=1))`, true, `[]`}, + // valid parenthesis + {`()`, true, `[]`}, + {`(a=1)`, false, `[{[{{{ identifier a} = { number 1}} &&}] &&}]`}, + {`(a="test(")`, false, `[{[{{{ identifier a} = { text test(}} &&}] &&}]`}, + {`(a="test)")`, false, `[{[{{{ identifier a} = { text test)}} &&}] &&}]`}, + {`((a=1))`, false, `[{[{[{{{ identifier a} = { number 1}} &&}] &&}] &&}]`}, + {`a=1 || 2!=3`, false, `[{{{ identifier a} = { number 1}} &&} {{{ number 2} != { number 3}} ||}]`}, + {`a=1 && 2!=3`, false, `[{{{ identifier a} = { number 1}} &&} {{{ number 2} != { number 3}} &&}]`}, + {`a=1 && 2!=3 || "b"=a`, false, `[{{{ identifier a} = { number 1}} &&} {{{ number 2} != { number 3}} &&} {{{ text b} = { identifier a}} ||}]`}, + {`(a=1 && 2!=3) || "b"=a`, false, `[{[{{{ identifier a} = { number 1}} &&} {{{ number 2} != { number 
3}} &&}] &&} {{{ text b} = { identifier a}} ||}]`}, + {`((a=1 || a=2) && (c=1))`, false, `[{[{[{{{ identifier a} = { number 1}} &&} {{{ identifier a} = { number 2}} ||}] &&} {[{{{ identifier c} = { number 1}} &&}] &&}] &&}]`}, + // https://github.com/pocketbase/pocketbase/issues/5017 + {`(a='"')`, false, `[{[{{{ identifier a} = { text "}} &&}] &&}]`}, + {`(a='\'')`, false, `[{[{{{ identifier a} = { text '}} &&}] &&}]`}, + {`(a="'")`, false, `[{[{{{ identifier a} = { text '}} &&}] &&}]`}, + {`(a="\"")`, false, `[{[{{{ identifier a} = { text "}} &&}] &&}]`}, + } + + for i, scenario := range scenarios { + t.Run(fmt.Sprintf("s%d:%s", i, scenario.input), func(t *testing.T) { + v, err := Parse(scenario.input) + + if scenario.expectedError && err == nil { + t.Fatalf("Expected error, got nil (%q)", scenario.input) + } + + if !scenario.expectedError && err != nil { + t.Fatalf("Did not expect error, got %q (%q).", err, scenario.input) + } + + vPrint := fmt.Sprintf("%v", v) + + if vPrint != scenario.expectedPrint { + t.Fatalf("Expected %s, got %s", scenario.expectedPrint, vPrint) + } + }) + } +} diff --git a/scanner.go b/scanner.go new file mode 100644 index 0000000..51fd2be --- /dev/null +++ b/scanner.go @@ -0,0 +1,679 @@ +package fexpr + +import ( + "bytes" + "fmt" + "strings" + "unicode/utf8" +) + +// eof represents a marker rune for the end of the reader. +const eof = rune(0) + +// JoinOp represents a join type operator. +type JoinOp string + +// supported join type operators +const ( + JoinAnd JoinOp = "&&" + JoinOr JoinOp = "||" +) + +// SignOp represents an expression sign operator. +type SignOp string + +// supported expression sign operators +const ( + SignEq SignOp = "=" + SignNeq SignOp = "!=" + SignLike SignOp = "~" + SignNlike SignOp = "!~" + SignLt SignOp = "<" + SignLte SignOp = "<=" + SignGt SignOp = ">" + SignGte SignOp = ">=" + + // array/any operators + SignAnyEq SignOp = "?=" + SignAnyNeq SignOp = "?!=" + SignAnyLike SignOp = "?~" + SignAnyNlike SignOp = "?!~" + SignAnyLt SignOp = "?<" + SignAnyLte SignOp = "?<=" + SignAnyGt SignOp = "?>" + SignAnyGte SignOp = "?>=" +) + +// TokenType represents a Token type. +type TokenType string + +// token type constants +const ( + TokenUnexpected TokenType = "unexpected" + TokenEOF TokenType = "eof" + TokenWS TokenType = "whitespace" + TokenJoin TokenType = "join" + TokenSign TokenType = "sign" + TokenIdentifier TokenType = "identifier" // variable, column name, placeholder, etc. + TokenFunction TokenType = "function" // function + TokenNumber TokenType = "number" + TokenText TokenType = "text" // ' or " quoted string + TokenGroup TokenType = "group" // groupped/nested tokens + TokenComment TokenType = "comment" +) + +// Token represents a single scanned literal (one or more combined runes). +type Token struct { + Meta interface{} + Type TokenType + Literal string +} + +// NewScanner creates and returns a new scanner instance loaded with the specified data. +func NewScanner(data []byte) *Scanner { + return &Scanner{ + data: data, + maxFuncDepth: 3, + } +} + +// Scanner represents a filter and lexical scanner. +type Scanner struct { + data []byte + pos int + maxFuncDepth int +} + +// Scan reads and returns the next available token value from the scanner's buffer. 
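+//
+// A typical (illustrative) usage loop, mirroring ExampleScanner_Scan in
+// examples_test.go, keeps calling Scan until TokenEOF or an error is returned:
+//
+//	s := NewScanner([]byte("id > 123"))
+//	for {
+//		t, err := s.Scan()
+//		if t.Type == TokenEOF || err != nil {
+//			break
+//		}
+//		// t.Type is one of: identifier, whitespace, sign, number, ...
+//	}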
+func (s *Scanner) Scan() (Token, error) { + ch := s.read() + + if ch == eof { + return Token{Type: TokenEOF, Literal: ""}, nil + } + + if isWhitespaceRune(ch) { + s.unread() + return s.scanWhitespace() + } + + if isGroupStartRune(ch) { + s.unread() + return s.scanGroup() + } + + if isIdentifierStartRune(ch) { + s.unread() + return s.scanIdentifier(s.maxFuncDepth) + } + + if isNumberStartRune(ch) { + s.unread() + return s.scanNumber() + } + + if isTextStartRune(ch) { + s.unread() + return s.scanText(false) + } + + if isSignStartRune(ch) { + s.unread() + return s.scanSign() + } + + if isJoinStartRune(ch) { + s.unread() + return s.scanJoin() + } + + if isCommentStartRune(ch) { + s.unread() + return s.scanComment() + } + + return Token{Type: TokenUnexpected, Literal: string(ch)}, fmt.Errorf("unexpected character %q", ch) +} + +// scanWhitespace consumes all contiguous whitespace runes. +func (s *Scanner) scanWhitespace() (Token, error) { + var buf bytes.Buffer + + // Reads every subsequent whitespace character into the buffer. + // Non-whitespace runes and EOF will cause the loop to exit. + for { + ch := s.read() + + if ch == eof { + break + } + + if !isWhitespaceRune(ch) { + s.unread() + break + } + + // write the whitespace rune + buf.WriteRune(ch) + } + + return Token{Type: TokenWS, Literal: buf.String()}, nil +} + +// scanNumber consumes all contiguous digit runes +// (complex numbers and scientific notations are not supported). +func (s *Scanner) scanNumber() (Token, error) { + var buf bytes.Buffer + + var hadDot bool + + // Read every subsequent digit rune into the buffer. + // Non-digit runes and EOF will cause the loop to exit. + for { + ch := s.read() + + if ch == eof { + break + } + + // not a digit rune + if !isDigitRune(ch) && + // minus sign but not at the beginning + (ch != '-' || buf.Len() != 0) && + // dot but there was already another dot + (ch != '.' || hadDot) { + s.unread() + break + } + + // write the rune + buf.WriteRune(ch) + + if ch == '.' { + hadDot = true + } + } + + total := buf.Len() + literal := buf.String() + + var err error + // only "-" or starts with "." or ends with "." + if (total == 1 && literal[0] == '-') || literal[0] == '.' || literal[total-1] == '.' { + err = fmt.Errorf("invalid number %q", literal) + } + + return Token{Type: TokenNumber, Literal: buf.String()}, err +} + +// scanText consumes all contiguous quoted text runes. +func (s *Scanner) scanText(preserveQuotes bool) (Token, error) { + var buf bytes.Buffer + + // read the first rune to determine the quotes type + firstCh := s.read() + buf.WriteRune(firstCh) + var prevCh rune + var hasMatchingQuotes bool + + // Read every subsequent text rune into the buffer. + // EOF and matching unescaped ending quote will cause the loop to exit. + for { + ch := s.read() + + if ch == eof { + break + } + + // write the text rune + buf.WriteRune(ch) + + // unescaped matching quote, aka. the end + if ch == firstCh && prevCh != '\\' { + hasMatchingQuotes = true + break + } + + prevCh = ch + } + + literal := buf.String() + + var err error + if !hasMatchingQuotes { + err = fmt.Errorf("invalid quoted text %q", literal) + } else if !preserveQuotes { + // unquote + literal = literal[1 : len(literal)-1] + // remove escaped quotes prefix (aka. \) + firstChStr := string(firstCh) + literal = strings.ReplaceAll(literal, `\`+firstChStr, firstChStr) + } + + return Token{Type: TokenText, Literal: literal}, err +} + +// scanComment consumes all contiguous single line comment runes until +// a new character (\n) or EOF is reached. 
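+//
+// For example, scanning `// some comment` produces a TokenComment token with
+// literal "some comment" (the two leading slashes are consumed and the
+// surrounding whitespace is trimmed).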
+func (s *Scanner) scanComment() (Token, error) { + var buf bytes.Buffer + + // Read the first 2 characters without writting them to the buffer. + if !isCommentStartRune(s.read()) || !isCommentStartRune(s.read()) { + return Token{Type: TokenComment}, ErrInvalidComment + } + + // Read every subsequent comment text rune into the buffer. + // \n and EOF will cause the loop to exit. + for i := 0; ; i++ { + ch := s.read() + + if ch == eof || ch == '\n' { + break + } + + buf.WriteRune(ch) + } + + return Token{Type: TokenComment, Literal: strings.TrimSpace(buf.String())}, nil +} + +// scanIdentifier consumes all contiguous ident runes. +func (s *Scanner) scanIdentifier(funcDepth int) (Token, error) { + var buf bytes.Buffer + + // read the first rune in case it is a special start identifier character + buf.WriteRune(s.read()) + + // Read every subsequent identifier rune into the buffer. + // Non-ident runes and EOF will cause the loop to exit. + for { + ch := s.read() + + if ch == eof { + break + } + + // func + if ch == '(' { + funcName := buf.String() + if funcDepth <= 0 { + return Token{Type: TokenFunction, Literal: funcName}, fmt.Errorf("max nested function arguments reached (max: %d)", s.maxFuncDepth) + } + if !isValidIdentifier(funcName) { + return Token{Type: TokenFunction, Literal: funcName}, fmt.Errorf("invalid function name %q", funcName) + } + s.unread() + return s.scanFunctionArgs(funcName, funcDepth) + } + + // not an identifier character + if !isLetterRune(ch) && !isDigitRune(ch) && !isIdentifierCombineRune(ch) && ch != '_' { + s.unread() + break + } + + // write the identifier rune + buf.WriteRune(ch) + } + + literal := buf.String() + + var err error + if !isValidIdentifier(literal) { + err = fmt.Errorf("invalid identifier %q", literal) + } + + return Token{Type: TokenIdentifier, Literal: literal}, err +} + +// scanSign consumes all contiguous sign operator runes. +func (s *Scanner) scanSign() (Token, error) { + var buf bytes.Buffer + + // Read every subsequent sign rune into the buffer. + // Non-sign runes and EOF will cause the loop to exit. + for { + ch := s.read() + + if ch == eof { + break + } + + if !isSignStartRune(ch) { + s.unread() + break + } + + // write the sign rune + buf.WriteRune(ch) + } + + literal := buf.String() + + var err error + if !isSignOperator(literal) { + err = fmt.Errorf("invalid sign operator %q", literal) + } + + return Token{Type: TokenSign, Literal: literal}, err +} + +// scanJoin consumes all contiguous join operator runes. +func (s *Scanner) scanJoin() (Token, error) { + var buf bytes.Buffer + + // Read every subsequent join operator rune into the buffer. + // Non-join runes and EOF will cause the loop to exit. + for { + ch := s.read() + + if ch == eof { + break + } + + if !isJoinStartRune(ch) { + s.unread() + break + } + + // write the join operator rune + buf.WriteRune(ch) + } + + literal := buf.String() + + var err error + if !isJoinOperator(literal) { + err = fmt.Errorf("invalid join operator %q", literal) + } + + return Token{Type: TokenJoin, Literal: literal}, err +} + +// scanGroup consumes all runes within a group/parenthesis. +func (s *Scanner) scanGroup() (Token, error) { + var buf bytes.Buffer + + // read the first group bracket without writing it to the buffer + firstChar := s.read() + openGroups := 1 + + // Read every subsequent text rune into the buffer. + // EOF and matching unescaped ending quote will cause the loop to exit. 
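+	// Note: nested "(" runes increase openGroups, quoted text is delegated to
+	// scanText with preserved quotes (so brackets inside string literals don't
+	// affect the balance), and ")" ends the group only once openGroups drops to 0.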
+ for { + ch := s.read() + + if ch == eof { + break + } + + if isGroupStartRune(ch) { + // nested group + openGroups++ + buf.WriteRune(ch) + } else if isTextStartRune(ch) { + s.unread() + t, err := s.scanText(true) // with quotes to preserve the exact text start/end runes + if err != nil { + // write the errored literal as it is + buf.WriteString(t.Literal) + return Token{Type: TokenGroup, Literal: buf.String()}, err + } + + buf.WriteString(t.Literal) + } else if ch == ')' { + openGroups-- + + if openGroups <= 0 { + // main group end + break + } else { + buf.WriteRune(ch) + } + } else { + buf.WriteRune(ch) + } + } + + literal := buf.String() + + var err error + if !isGroupStartRune(firstChar) || openGroups > 0 { + err = fmt.Errorf("invalid formatted group - missing %d closing bracket(s)", openGroups) + } + + return Token{Type: TokenGroup, Literal: literal}, err +} + +// scanFunctionArgs consumes all contiguous function call runes to +// extract its arguments and returns a function token with the found +// Token arguments loaded in Token.Meta. +func (s *Scanner) scanFunctionArgs(funcName string, funcDepth int) (Token, error) { + var args []Token + + var expectComma, isComma, isClosed bool + + ch := s.read() + if ch != '(' { + return Token{Type: TokenFunction, Literal: funcName}, fmt.Errorf("invalid or incomplete function call %q", funcName) + } + + // Read every subsequent rune until ')' or EOF has been reached. + for { + ch := s.read() + + if ch == eof { + break + } + + if ch == ')' { + isClosed = true + break + } + + // skip whitespaces + if isWhitespaceRune(ch) { + _, err := s.scanWhitespace() + if err != nil { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("failed to scan whitespaces in function %q: %w", funcName, err) + } + continue + } + + // skip comments + if isCommentStartRune(ch) { + s.unread() + _, err := s.scanComment() + if err != nil { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("failed to scan comment in function %q: %w", funcName, err) + } + continue + } + + isComma = ch == ',' + + if expectComma && !isComma { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("expected comma after the last argument in function %q", funcName) + } + + if !expectComma && isComma { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("unexpected comma in function %q", funcName) + } + + expectComma = false // reset + + if isComma { + continue + } + + if isIdentifierStartRune(ch) { + s.unread() + t, err := s.scanIdentifier(funcDepth - 1) + if err != nil { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid identifier argument %q in function %q: %w", t.Literal, funcName, err) + } + args = append(args, t) + expectComma = true + } else if isNumberStartRune(ch) { + s.unread() + t, err := s.scanNumber() + if err != nil { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid number argument %q in function %q: %w", t.Literal, funcName, err) + } + args = append(args, t) + expectComma = true + } else if isTextStartRune(ch) { + s.unread() + t, err := s.scanText(false) + if err != nil { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid text argument %q in function %q: %w", t.Literal, funcName, err) + } + args = append(args, t) + expectComma = true + } else { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("unsupported argument character %q in function 
%q", ch, funcName) + } + } + + if !isClosed { + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, fmt.Errorf("invalid or incomplete function %q (expected ')')", funcName) + } + + return Token{Type: TokenFunction, Literal: funcName, Meta: args}, nil +} + +// unread unreads the last character and revert the position 1 step back. +func (s *Scanner) unread() { + if s.pos > 0 { + s.pos = s.pos - 1 + } +} + +// read reads the next rune and moves the position forward. +func (s *Scanner) read() rune { + if s.pos >= len(s.data) { + return eof + } + + ch, n := utf8.DecodeRune(s.data[s.pos:]) + s.pos += n + + return ch +} + +// Lexical helpers: +// ------------------------------------------------------------------- + +// isWhitespaceRune checks if a rune is a space, tab, or newline. +func isWhitespaceRune(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' } + +// isLetterRune checks if a rune is a letter. +func isLetterRune(ch rune) bool { + return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') +} + +// isDigitRune checks if a rune is a digit. +func isDigitRune(ch rune) bool { + return (ch >= '0' && ch <= '9') +} + +// isTextStartRune checks if a rune is a valid quoted text first character +// (aka. single or double quote). +func isTextStartRune(ch rune) bool { + return ch == '\'' || ch == '"' +} + +// isNumberStartRune checks if a rune is a valid number start character (aka. digit). +func isNumberStartRune(ch rune) bool { + return ch == '-' || isDigitRune(ch) +} + +// isSignStartRune checks if a rune is a valid sign operator start character. +func isSignStartRune(ch rune) bool { + return ch == '=' || + ch == '?' || + ch == '!' || + ch == '>' || + ch == '<' || + ch == '~' +} + +// isJoinStartRune checks if a rune is a valid join type start character. +func isJoinStartRune(ch rune) bool { + return ch == '&' || ch == '|' +} + +// isGroupStartRune checks if a rune is a valid group/parenthesis start character. +func isGroupStartRune(ch rune) bool { + return ch == '(' +} + +// isCommentStartRune checks if a rune is a valid comment start character. +func isCommentStartRune(ch rune) bool { + return ch == '/' +} + +// isIdentifierStartRune checks if a rune is valid identifier's first character. +func isIdentifierStartRune(ch rune) bool { + return isLetterRune(ch) || isIdentifierSpecialStartRune(ch) +} + +// isIdentifierSpecialStartRune checks if a rune is valid identifier's first special character. +func isIdentifierSpecialStartRune(ch rune) bool { + return ch == '@' || ch == '_' || ch == '#' +} + +// isIdentifierCombineRune checks if a rune is valid identifier's combine character. +func isIdentifierCombineRune(ch rune) bool { + return ch == '.' || ch == ':' +} + +// isSignOperator checks if a literal is a valid sign operator. +func isSignOperator(literal string) bool { + switch SignOp(literal) { + case + SignEq, + SignNeq, + SignLt, + SignLte, + SignGt, + SignGte, + SignLike, + SignNlike, + SignAnyEq, + SignAnyNeq, + SignAnyLike, + SignAnyNlike, + SignAnyLt, + SignAnyLte, + SignAnyGt, + SignAnyGte: + return true + } + + return false +} + +// isJoinOperator checks if a literal is a valid join type operator. +func isJoinOperator(literal string) bool { + switch JoinOp(literal) { + case + JoinAnd, + JoinOr: + return true + } + + return false +} + +// isValidIdentifier validates the literal against common identifier requirements. 
+func isValidIdentifier(literal string) bool { + length := len(literal) + + return ( + // doesn't end with combine rune + !isIdentifierCombineRune(rune(literal[length-1])) && + // is not just a special start rune + (length != 1 || !isIdentifierSpecialStartRune(rune(literal[0])))) +} diff --git a/scanner_test.go b/scanner_test.go new file mode 100644 index 0000000..48fe99e --- /dev/null +++ b/scanner_test.go @@ -0,0 +1,166 @@ +package fexpr + +import ( + "fmt" + "testing" +) + +func TestNewScanner(t *testing.T) { + s := NewScanner([]byte("test")) + + data := string(s.data) + + if data != "test" { + t.Errorf("Expected the scanner reader data to be %q, got %q", "test", data) + } +} + +func TestScannerScan(t *testing.T) { + type output struct { + error bool + print string + } + testScenarios := []struct { + text string + expects []output + }{ + // whitespace + {" ", []output{{false, "{ whitespace }"}}}, + {"test 123", []output{{false, "{ identifier test}"}, {false, "{ whitespace }"}, {false, "{ number 123}"}}}, + // identifier + {`test`, []output{{false, `{ identifier test}`}}}, + {`@`, []output{{true, `{ identifier @}`}}}, + {`test:`, []output{{true, `{ identifier test:}`}}}, + {`test.`, []output{{true, `{ identifier test.}`}}}, + {`@test.123:c`, []output{{false, `{ identifier @test.123:c}`}}}, + {`_test_a.123`, []output{{false, `{ identifier _test_a.123}`}}}, + {`#test.123:456`, []output{{false, `{ identifier #test.123:456}`}}}, + {`.test.123`, []output{{true, `{ unexpected .}`}, {false, `{ identifier test.123}`}}}, + {`:test.123`, []output{{true, `{ unexpected :}`}, {false, `{ identifier test.123}`}}}, + {`test#@`, []output{{false, `{ identifier test}`}, {true, `{ identifier #}`}, {true, `{ identifier @}`}}}, + {`test'`, []output{{false, `{ identifier test}`}, {true, `{ text '}`}}}, + {`test"d`, []output{{false, `{ identifier test}`}, {true, `{ text "d}`}}}, + // number + {`123`, []output{{false, `{ number 123}`}}}, + {`-123`, []output{{false, `{ number -123}`}}}, + {`-123.456`, []output{{false, `{ number -123.456}`}}}, + {`123.456`, []output{{false, `{ number 123.456}`}}}, + {`12.34.56`, []output{{false, `{ number 12.34}`}, {true, `{ unexpected .}`}, {false, `{ number 56}`}}}, + {`.123`, []output{{true, `{ unexpected .}`}, {false, `{ number 123}`}}}, + {`- 123`, []output{{true, `{ number -}`}, {false, `{ whitespace }`}, {false, `{ number 123}`}}}, + {`12-3`, []output{{false, `{ number 12}`}, {false, `{ number -3}`}}}, + {`123.abc`, []output{{true, `{ number 123.}`}, {false, `{ identifier abc}`}}}, + // text + {`""`, []output{{false, `{ text }`}}}, + {`''`, []output{{false, `{ text }`}}}, + {`'test'`, []output{{false, `{ text test}`}}}, + {`'te\'st'`, []output{{false, `{ text te'st}`}}}, + {`"te\"st"`, []output{{false, `{ text te"st}`}}}, + {`"tes@#,;!@#%^'\"t"`, []output{{false, `{ text tes@#,;!@#%^'"t}`}}}, + {`'tes@#,;!@#%^\'"t'`, []output{{false, `{ text tes@#,;!@#%^'"t}`}}}, + {`"test`, []output{{true, `{ text "test}`}}}, + {`'test`, []output{{true, `{ text 'test}`}}}, + {`'АБЦ`, []output{{true, `{ text 'АБЦ}`}}}, + // join types + {`&&||`, []output{{true, `{ join &&||}`}}}, + {`&& ||`, []output{{false, `{ join &&}`}, {false, `{ whitespace }`}, {false, `{ join ||}`}}}, + {`'||test&&'&&123`, []output{{false, `{ text ||test&&}`}, {false, `{ join &&}`}, {false, `{ number 123}`}}}, + // expression signs + {`=!=`, []output{{true, `{ sign =!=}`}}}, + {`= != ~ !~ > >= < <= ?= ?!= ?~ ?!~ ?> ?>= ?< ?<=`, []output{ + {false, `{ sign =}`}, + {false, `{ whitespace }`}, + {false, `{ sign !=}`}, + 
{false, `{ whitespace }`}, + {false, `{ sign ~}`}, + {false, `{ whitespace }`}, + {false, `{ sign !~}`}, + {false, `{ whitespace }`}, + {false, `{ sign >}`}, + {false, `{ whitespace }`}, + {false, `{ sign >=}`}, + {false, `{ whitespace }`}, + {false, `{ sign <}`}, + {false, `{ whitespace }`}, + {false, `{ sign <=}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?=}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?!=}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?~}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?!~}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?>}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?>=}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?<}`}, + {false, `{ whitespace }`}, + {false, `{ sign ?<=}`}, + }}, + // comments + {`/ test`, []output{{true, `{ comment }`}, {false, `{ identifier test}`}}}, + {`/ / test`, []output{{true, `{ comment }`}, {true, `{ comment }`}, {false, `{ identifier test}`}}}, + {`//`, []output{{false, `{ comment }`}}}, + {`//test`, []output{{false, `{ comment test}`}}}, + {`// test`, []output{{false, `{ comment test}`}}}, + {`// test1 //test2 `, []output{{false, `{ comment test1 //test2}`}}}, + {`///test`, []output{{false, `{ comment /test}`}}}, + // funcs + {`test()`, []output{{false, `{[] function test}`}}}, + {`test(a, b`, []output{{true, `{[{ identifier a} { identifier b}] function test}`}}}, + {`@test:abc()`, []output{{false, `{[] function @test:abc}`}}}, + {`test( a )`, []output{{false, `{[{ identifier a}] function test}`}}}, // with whitespaces + {`test(a, b)`, []output{{false, `{[{ identifier a} { identifier b}] function test}`}}}, + {`test(a, b, )`, []output{{false, `{[{ identifier a} { identifier b}] function test}`}}}, // single trailing comma + {`test(a,,)`, []output{{true, `{[{ identifier a}] function test}`}, {true, `{ unexpected )}`}}}, // unexpected trailing commas + {`test(a,,,b)`, []output{{true, `{[{ identifier a}] function test}`}, {true, `{ unexpected ,}`}, {false, `{ identifier b}`}, {true, `{ unexpected )}`}}}, // unexpected mid-args commas + {`test( @test.a.b:test , 123, "ab)c", 'd,ce', false)`, []output{{false, `{[{ identifier @test.a.b:test} { number 123} { text ab)c} { text d,ce} { identifier false}] function test}`}}}, + {"test(a //test)", []output{{true, `{[{ identifier a}] function test}`}}}, // invalid simple comment + {"test(a //test\n)", []output{{false, `{[{ identifier a}] function test}`}}}, // valid simple comment + {"test(a, //test\n, b)", []output{{true, `{[{ identifier a}] function test}`}, {false, `{ whitespace }`}, {false, `{ identifier b}`}, {true, `{ unexpected )}`}}}, + {"test(a, //test\n b)", []output{{false, `{[{ identifier a} { identifier b}] function test}`}}}, + {"test(a, test(test(b), c), d)", []output{{false, `{[{ identifier a} {[{[{ identifier b}] function test} { identifier c}] function test} { identifier d}] function test}`}}}, + // max funcs depth + {"a(b(c(1)))", []output{{false, `{[{[{[{ number 1}] function c}] function b}] function a}`}}}, + {"a(b(c(d(1))))", []output{{true, `{[] function a}`}, {false, `{ number 1}`}, {true, `{ unexpected )}`}, {true, `{ unexpected )}`}, {true, `{ unexpected )}`}, {true, `{ unexpected )}`}}}, + // groups/parenthesis + {`a)`, []output{{false, `{ identifier a}`}, {true, `{ unexpected )}`}}}, + {`(a b c`, []output{{true, `{ group a b c}`}}}, + {`(a b c)`, []output{{false, `{ group a b c}`}}}, + {`((a b c))`, []output{{false, `{ group (a b c)}`}}}, + {`((a )b c))`, []output{{false, `{ group (a )b c}`}, {true, `{ unexpected )}`}}}, + 
{`("ab)("c)`, []output{{false, `{ group "ab)("c}`}}}, + {`("ab)(c)`, []output{{true, `{ group "ab)(c)}`}}}, + {`( func(1, 2, 3, func(4)) a b c )`, []output{{false, `{ group func(1, 2, 3, func(4)) a b c }`}}}, + } + + for _, scenario := range testScenarios { + t.Run(scenario.text, func(t *testing.T) { + s := NewScanner([]byte(scenario.text)) + + // scan the text tokens + for j, expect := range scenario.expects { + token, err := s.Scan() + + hasErr := err != nil + if expect.error != hasErr { + t.Errorf("[%d] Expected hasErr %v, got %v: %v (%v)", j, expect.error, hasErr, err, token) + } + + tokenPrint := fmt.Sprintf("%v", token) + if tokenPrint != expect.print { + t.Errorf("[%d] Expected token %s, got %s", j, expect.print, tokenPrint) + } + } + + // the last remaining token should be the eof + lastToken, err := s.Scan() + if err != nil || lastToken.Type != TokenEOF { + t.Fatalf("Expected EOF token, got %v (%v)", lastToken, err) + } + }) + } +}