1
0
Fork 0
sqlglot/sqlglot/parser.py
Daniel Baumann c61927f460
Adding upstream version 25.29.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-13 21:56:11 +01:00

7573 lines
278 KiB
Python

from __future__ import annotations
import logging
import typing as t
from collections import defaultdict
from sqlglot import exp
from sqlglot.errors import ErrorLevel, ParseError, concat_messages, merge_errors
from sqlglot.helper import apply_index_offset, ensure_list, seq_get
from sqlglot.time import format_time
from sqlglot.tokens import Token, Tokenizer, TokenType
from sqlglot.trie import TrieResult, in_trie, new_trie
if t.TYPE_CHECKING:
from sqlglot._typing import E, Lit
from sqlglot.dialects.dialect import Dialect, DialectType
# Unconstrained generic type variable.
T = t.TypeVar("T")
# Logger registered under the name "sqlglot".
logger = logging.getLogger("sqlglot")
# Type alias: maps a string key to a sequence whose items are each either a
# plain string or a sequence of strings.
OPTIONS_TYPE = t.Dict[str, t.Sequence[t.Union[t.Sequence[str], str]]]
def build_var_map(args: t.List) -> exp.StarMap | exp.VarMap:
    """
    Build a map expression from a flat argument list.

    A single star argument yields an `exp.StarMap`. Otherwise the arguments are read as
    alternating entries (even positions are keys, the following odd positions are values)
    and packed into an `exp.VarMap` holding two parallel arrays.

    Args:
        args: The function arguments.

    Returns:
        The resulting `exp.StarMap` or `exp.VarMap` node.

    Raises:
        IndexError: If the last key has no value following it.
    """
    if len(args) == 1 and args[0].is_star:
        return exp.StarMap(this=args[0])

    pair_starts = range(0, len(args), 2)
    keys = [args[start] for start in pair_starts]
    # Index explicitly instead of zipping slices so that a dangling key still raises
    values = [args[start + 1] for start in pair_starts]

    return exp.VarMap(keys=exp.array(*keys, copy=False), values=exp.array(*values, copy=False))
def build_like(args: t.List) -> exp.Escape | exp.Like:
    """
    Build a LIKE node from function-style arguments.

    The second argument becomes the node's `this` and the first its `expression`. When
    more than two arguments are given, the third one wraps the node in an `exp.Escape`.

    Args:
        args: The function arguments.

    Returns:
        An `exp.Like` node, or an `exp.Escape` node wrapping it.
    """
    first, second = seq_get(args, 0), seq_get(args, 1)
    like = exp.Like(this=second, expression=first)

    if len(args) <= 2:
        return like

    return exp.Escape(this=like, expression=seq_get(args, 2))
def binary_range_parser(
    expr_type: t.Type[exp.Expression], reverse_args: bool = False
) -> t.Callable[[Parser, t.Optional[exp.Expression]], t.Optional[exp.Expression]]:
    """
    Create a parser callback that builds a binary node of the given type.

    Args:
        expr_type: The expression class handed to `Parser.expression`.
        reverse_args: If set, the freshly parsed operand becomes `this` and the operand
            passed to the callback becomes `expression`, instead of the other way around.

    Returns:
        A callable taking the parser and an operand. It parses the other operand with
        `_parse_bitwise`, builds the node and passes it through `_parse_escape`.
    """

    def _parse_binary_range(
        self: Parser, this: t.Optional[exp.Expression]
    ) -> t.Optional[exp.Expression]:
        parsed = self._parse_bitwise()

        if reverse_args:
            left, right = parsed, this
        else:
            left, right = this, parsed

        node = self.expression(expr_type, this=left, expression=right)
        return self._parse_escape(node)

    return _parse_binary_range
def build_logarithm(args: t.List, dialect: Dialect) -> exp.Func:
    """
    Build a logarithm node, honoring the dialect's argument conventions.

    Args:
        args: The function arguments (one or two are read).
        dialect: Supplies `LOG_BASE_FIRST` and `parser_class.LOG_DEFAULTS_TO_LN`.

    Returns:
        With a truthy second argument, an `exp.Log` whose operands are swapped when
        `dialect.LOG_BASE_FIRST` is falsy. Otherwise an `exp.Ln` if the dialect's parser
        sets `LOG_DEFAULTS_TO_LN`, else a single-operand `exp.Log`.
    """
    first = seq_get(args, 0)
    second = seq_get(args, 1)

    if not second:
        log_class = exp.Ln if dialect.parser_class.LOG_DEFAULTS_TO_LN else exp.Log
        return log_class(this=first)

    # Default argument order is base, expression
    if dialect.LOG_BASE_FIRST:
        return exp.Log(this=first, expression=second)

    return exp.Log(this=second, expression=first)
def build_hex(args: t.List, dialect: Dialect) -> exp.Hex | exp.LowerHex:
    """
    Build a hex node from the first argument.

    Args:
        args: The function arguments; only the first one is used.
        dialect: Its `HEX_LOWERCASE` flag selects `exp.LowerHex` over `exp.Hex`.

    Returns:
        An `exp.LowerHex` if `dialect.HEX_LOWERCASE` is truthy, otherwise an `exp.Hex`.
    """
    operand = seq_get(args, 0)
    hex_class = exp.LowerHex if dialect.HEX_LOWERCASE else exp.Hex
    return hex_class(this=operand)
def build_lower(args: t.List) -> exp.Lower | exp.Hex:
    """
    Build a LOWER node from the first argument.

    Args:
        args: The function arguments; only the first one is used.

    Returns:
        An `exp.LowerHex` over the inner operand if the argument is an `exp.Hex`,
        otherwise an `exp.Lower` over the argument.
    """
    operand = seq_get(args, 0)

    # LOWER(HEX(..)) can be simplified to LowerHex to simplify its transpilation
    if isinstance(operand, exp.Hex):
        return exp.LowerHex(this=operand.this)

    return exp.Lower(this=operand)
def build_upper(args: t.List) -> exp.Upper | exp.Hex:
    """
    Build an UPPER node from the first argument.

    Args:
        args: The function arguments; only the first one is used.

    Returns:
        An `exp.Hex` over the inner operand if the argument is an `exp.Hex`,
        otherwise an `exp.Upper` over the argument.
    """
    operand = seq_get(args, 0)

    # UPPER(HEX(..)) can be simplified to Hex to simplify its transpilation
    if isinstance(operand, exp.Hex):
        return exp.Hex(this=operand.this)

    return exp.Upper(this=operand)
def build_extract_json_with_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    """
    Create a builder for JSON extraction nodes of the given type.

    Args:
        expr_type: The expression class the returned builder instantiates.

    Returns:
        A callable taking the function arguments and a dialect. It uses the first argument
        as `this` and the second, converted through `dialect.to_json_path`, as `expression`.
        Arguments past the second are stored under "expressions", but only when
        `expr_type` is `exp.JSONExtract`.
    """

    def _builder(args: t.List, dialect: Dialect) -> E:
        target = seq_get(args, 0)
        json_path = dialect.to_json_path(seq_get(args, 1))
        expression = expr_type(this=target, expression=json_path)

        has_extra_args = len(args) > 2
        if has_extra_args and expr_type is exp.JSONExtract:
            expression.set("expressions", args[2:])

        return expression

    return _builder
def build_mod(args: t.List) -> exp.Mod:
    """
    Build a modulo node from the first two arguments.

    Args:
        args: The function arguments; the first two are used as the operands.

    Returns:
        An `exp.Mod` node whose binary operands have been parenthesized.
    """

    def _wrap_binary(operand: t.Any) -> t.Any:
        # Wrap the operands if they are binary nodes, e.g. MOD(a + 1, 7) -> (a + 1) % 7
        if isinstance(operand, exp.Binary):
            return exp.Paren(this=operand)
        return operand

    left = _wrap_binary(seq_get(args, 0))
    right = _wrap_binary(seq_get(args, 1))
    return exp.Mod(this=left, expression=right)
def build_pad(args: t.List, is_left: bool = True):
    """
    Build an `exp.Pad` node from up to three positional arguments.

    Args:
        args: The function arguments, mapped in order to `this`, `expression` and
            `fill_pattern`.
        is_left: Forwarded unchanged as the node's `is_left` argument.

    Returns:
        The resulting `exp.Pad` node.
    """
    this, expression, fill_pattern = (seq_get(args, index) for index in range(3))
    return exp.Pad(this=this, expression=expression, fill_pattern=fill_pattern, is_left=is_left)
def build_array_constructor(
    exp_class: t.Type[E], args: t.List, bracket_kind: TokenType, dialect: Dialect
) -> exp.Expression:
    """
    Build an array-like node of the given class from the constructor arguments.

    Args:
        exp_class: The expression class to instantiate.
        args: The constructor arguments, stored as the node's `expressions`.
        bracket_kind: The token type of the bracket that introduced the arguments.
        dialect: Its `HAS_DISTINCT_ARRAY_CONSTRUCTORS` flag controls whether the bracket
            kind is recorded.

    Returns:
        The new node. For `exp.Array` in dialects with distinct array constructors,
        "bracket_notation" is set to whether `bracket_kind` is `TokenType.L_BRACKET`.
    """
    array_exp = exp_class(expressions=args)

    records_bracket_kind = exp_class == exp.Array and dialect.HAS_DISTINCT_ARRAY_CONSTRUCTORS
    if records_bracket_kind:
        uses_brackets = bracket_kind == TokenType.L_BRACKET
        array_exp.set("bracket_notation", uses_brackets)

    return array_exp
def build_convert_timezone(
    args: t.List, default_source_tz: t.Optional[str] = None
) -> t.Union[exp.ConvertTimezone, exp.Anonymous]:
    """
    Build a CONVERT_TIMEZONE node.

    Args:
        args: The function arguments.
        default_source_tz: Source time zone used for the two-argument form. A falsy
            value leaves `source_tz` as None.

    Returns:
        For exactly two arguments, an `exp.ConvertTimezone` with the first argument as
        `target_tz` and the second as `timestamp`. Any other argument count is delegated
        to `exp.ConvertTimezone.from_arg_list`.
    """
    if len(args) != 2:
        return exp.ConvertTimezone.from_arg_list(args)

    source_tz = None
    if default_source_tz:
        source_tz = exp.Literal.string(default_source_tz)

    target_tz = seq_get(args, 0)
    timestamp = seq_get(args, 1)
    return exp.ConvertTimezone(source_tz=source_tz, target_tz=target_tz, timestamp=timestamp)
def build_trim(args: t.List, is_left: bool = True):
    """
    Build an `exp.Trim` node for a one-sided trim.

    Args:
        args: The function arguments; the first two become `this` and `expression`.
        is_left: Selects the "LEADING" position when truthy, "TRAILING" otherwise.

    Returns:
        The resulting `exp.Trim` node.
    """
    position = "LEADING" if is_left else "TRAILING"
    this = seq_get(args, 0)
    expression = seq_get(args, 1)
    return exp.Trim(this=this, expression=expression, position=position)
def build_coalesce(args: t.List, is_nvl: t.Optional[bool] = None) -> exp.Coalesce:
    """
    Build an `exp.Coalesce` node.

    Args:
        args: The function arguments; the first becomes `this`, the rest `expressions`.
        is_nvl: Forwarded unchanged as the node's `is_nvl` argument.

    Returns:
        The resulting `exp.Coalesce` node.
    """
    first = seq_get(args, 0)
    remaining = args[1:]
    return exp.Coalesce(this=first, expressions=remaining, is_nvl=is_nvl)
class _Parser(type):
    """
    Metaclass that attaches keyword tries to every class it creates.

    `SHOW_TRIE` and `SET_TRIE` are built from the keys of the new class's `SHOW_PARSERS`
    and `SET_PARSERS`, with each key split on single spaces.
    """

    def __new__(cls, clsname, bases, attrs):
        klass = super().__new__(cls, clsname, bases, attrs)

        trie_sources = (("SHOW_TRIE", "SHOW_PARSERS"), ("SET_TRIE", "SET_PARSERS"))
        for trie_name, parsers_name in trie_sources:
            parser_keys = getattr(klass, parsers_name)
            setattr(klass, trie_name, new_trie(key.split(" ") for key in parser_keys))

        return klass
class Parser(metaclass=_Parser):
"""
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Args:
error_level: The desired error level.
Default: ErrorLevel.IMMEDIATE
error_message_context: The amount of context to capture from a query string when displaying
the error message (in number of characters).
Default: 100
max_errors: Maximum number of error messages to include in a raised ParseError.
This is only relevant if error_level is ErrorLevel.RAISE.
Default: 3
"""
FUNCTIONS: t.Dict[str, t.Callable] = {
**{name: func.from_arg_list for name, func in exp.FUNCTION_BY_NAME.items()},
**dict.fromkeys(("COALESCE", "IFNULL", "NVL"), build_coalesce),
"ARRAY": lambda args, dialect: exp.Array(expressions=args),
"ARRAYAGG": lambda args, dialect: exp.ArrayAgg(
this=seq_get(args, 0), nulls_excluded=dialect.ARRAY_AGG_INCLUDES_NULLS is None or None
),
"ARRAY_AGG": lambda args, dialect: exp.ArrayAgg(
this=seq_get(args, 0), nulls_excluded=dialect.ARRAY_AGG_INCLUDES_NULLS is None or None
),
"CHAR": lambda args: exp.Chr(expressions=args),
"CHR": lambda args: exp.Chr(expressions=args),
"COUNT": lambda args: exp.Count(this=seq_get(args, 0), expressions=args[1:], big_int=True),
"CONCAT": lambda args, dialect: exp.Concat(
expressions=args,
safe=not dialect.STRICT_STRING_CONCAT,
coalesce=dialect.CONCAT_COALESCE,
),
"CONCAT_WS": lambda args, dialect: exp.ConcatWs(
expressions=args,
safe=not dialect.STRICT_STRING_CONCAT,
coalesce=dialect.CONCAT_COALESCE,
),
"CONVERT_TIMEZONE": build_convert_timezone,
"DATE_TO_DATE_STR": lambda args: exp.Cast(
this=seq_get(args, 0),
to=exp.DataType(this=exp.DataType.Type.TEXT),
),
"GENERATE_DATE_ARRAY": lambda args: exp.GenerateDateArray(
start=seq_get(args, 0),
end=seq_get(args, 1),
step=seq_get(args, 2) or exp.Interval(this=exp.Literal.number(1), unit=exp.var("DAY")),
),
"GLOB": lambda args: exp.Glob(this=seq_get(args, 1), expression=seq_get(args, 0)),
"HEX": build_hex,
"INSTR": lambda args: exp.StrPosition(this=seq_get(args, 0), substr=seq_get(args, 1)),
"JSON_EXTRACT": build_extract_json_with_path(exp.JSONExtract),
"JSON_EXTRACT_SCALAR": build_extract_json_with_path(exp.JSONExtractScalar),
"JSON_EXTRACT_PATH_TEXT": build_extract_json_with_path(exp.JSONExtractScalar),
"LIKE": build_like,
"LOG": build_logarithm,
"LOG2": lambda args: exp.Log(this=exp.Literal.number(2), expression=seq_get(args, 0)),
"LOG10": lambda args: exp.Log(this=exp.Literal.number(10), expression=seq_get(args, 0)),
"LOWER": build_lower,
"LPAD": lambda args: build_pad(args),
"LEFTPAD": lambda args: build_pad(args),
"LTRIM": lambda args: build_trim(args),
"MOD": build_mod,
"RIGHTPAD": lambda args: build_pad(args, is_left=False),
"RPAD": lambda args: build_pad(args, is_left=False),
"RTRIM": lambda args: build_trim(args, is_left=False),
"SCOPE_RESOLUTION": lambda args: exp.ScopeResolution(expression=seq_get(args, 0))
if len(args) != 2
else exp.ScopeResolution(this=seq_get(args, 0), expression=seq_get(args, 1)),
"TIME_TO_TIME_STR": lambda args: exp.Cast(
this=seq_get(args, 0),
to=exp.DataType(this=exp.DataType.Type.TEXT),
),
"TO_HEX": build_hex,
"TS_OR_DS_TO_DATE_STR": lambda args: exp.Substring(
this=exp.Cast(
this=seq_get(args, 0),
to=exp.DataType(this=exp.DataType.Type.TEXT),
),
start=exp.Literal.number(1),
length=exp.Literal.number(10),
),
"UNNEST": lambda args: exp.Unnest(expressions=ensure_list(seq_get(args, 0))),
"UPPER": build_upper,
"VAR_MAP": build_var_map,
}
NO_PAREN_FUNCTIONS = {
TokenType.CURRENT_DATE: exp.CurrentDate,
TokenType.CURRENT_DATETIME: exp.CurrentDate,
TokenType.CURRENT_TIME: exp.CurrentTime,
TokenType.CURRENT_TIMESTAMP: exp.CurrentTimestamp,
TokenType.CURRENT_USER: exp.CurrentUser,
}
STRUCT_TYPE_TOKENS = {
TokenType.NESTED,
TokenType.OBJECT,
TokenType.STRUCT,
TokenType.UNION,
}
NESTED_TYPE_TOKENS = {
TokenType.ARRAY,
TokenType.LIST,
TokenType.LOWCARDINALITY,
TokenType.MAP,
TokenType.NULLABLE,
TokenType.RANGE,
*STRUCT_TYPE_TOKENS,
}
ENUM_TYPE_TOKENS = {
TokenType.ENUM,
TokenType.ENUM8,
TokenType.ENUM16,
}
AGGREGATE_TYPE_TOKENS = {
TokenType.AGGREGATEFUNCTION,
TokenType.SIMPLEAGGREGATEFUNCTION,
}
TYPE_TOKENS = {
TokenType.BIT,
TokenType.BOOLEAN,
TokenType.TINYINT,
TokenType.UTINYINT,
TokenType.SMALLINT,
TokenType.USMALLINT,
TokenType.INT,
TokenType.UINT,
TokenType.BIGINT,
TokenType.UBIGINT,
TokenType.INT128,
TokenType.UINT128,
TokenType.INT256,
TokenType.UINT256,
TokenType.MEDIUMINT,
TokenType.UMEDIUMINT,
TokenType.FIXEDSTRING,
TokenType.FLOAT,
TokenType.DOUBLE,
TokenType.CHAR,
TokenType.NCHAR,
TokenType.VARCHAR,
TokenType.NVARCHAR,
TokenType.BPCHAR,
TokenType.TEXT,
TokenType.MEDIUMTEXT,
TokenType.LONGTEXT,
TokenType.MEDIUMBLOB,
TokenType.LONGBLOB,
TokenType.BINARY,
TokenType.VARBINARY,
TokenType.JSON,
TokenType.JSONB,
TokenType.INTERVAL,
TokenType.TINYBLOB,
TokenType.TINYTEXT,
TokenType.TIME,
TokenType.TIMETZ,
TokenType.TIMESTAMP,
TokenType.TIMESTAMP_S,
TokenType.TIMESTAMP_MS,
TokenType.TIMESTAMP_NS,
TokenType.TIMESTAMPTZ,
TokenType.TIMESTAMPLTZ,
TokenType.TIMESTAMPNTZ,
TokenType.DATETIME,
TokenType.DATETIME64,
TokenType.DATE,
TokenType.DATE32,
TokenType.INT4RANGE,
TokenType.INT4MULTIRANGE,
TokenType.INT8RANGE,
TokenType.INT8MULTIRANGE,
TokenType.NUMRANGE,
TokenType.NUMMULTIRANGE,
TokenType.TSRANGE,
TokenType.TSMULTIRANGE,
TokenType.TSTZRANGE,
TokenType.TSTZMULTIRANGE,
TokenType.DATERANGE,
TokenType.DATEMULTIRANGE,
TokenType.DECIMAL,
TokenType.DECIMAL32,
TokenType.DECIMAL64,
TokenType.DECIMAL128,
TokenType.DECIMAL256,
TokenType.UDECIMAL,
TokenType.BIGDECIMAL,
TokenType.UUID,
TokenType.GEOGRAPHY,
TokenType.GEOMETRY,
TokenType.POINT,
TokenType.RING,
TokenType.LINESTRING,
TokenType.MULTILINESTRING,
TokenType.POLYGON,
TokenType.MULTIPOLYGON,
TokenType.HLLSKETCH,
TokenType.HSTORE,
TokenType.PSEUDO_TYPE,
TokenType.SUPER,
TokenType.SERIAL,
TokenType.SMALLSERIAL,
TokenType.BIGSERIAL,
TokenType.XML,
TokenType.YEAR,
TokenType.UNIQUEIDENTIFIER,
TokenType.USERDEFINED,
TokenType.MONEY,
TokenType.SMALLMONEY,
TokenType.ROWVERSION,
TokenType.IMAGE,
TokenType.VARIANT,
TokenType.VECTOR,
TokenType.OBJECT,
TokenType.OBJECT_IDENTIFIER,
TokenType.INET,
TokenType.IPADDRESS,
TokenType.IPPREFIX,
TokenType.IPV4,
TokenType.IPV6,
TokenType.UNKNOWN,
TokenType.NULL,
TokenType.NAME,
TokenType.TDIGEST,
*ENUM_TYPE_TOKENS,
*NESTED_TYPE_TOKENS,
*AGGREGATE_TYPE_TOKENS,
}
SIGNED_TO_UNSIGNED_TYPE_TOKEN = {
TokenType.BIGINT: TokenType.UBIGINT,
TokenType.INT: TokenType.UINT,
TokenType.MEDIUMINT: TokenType.UMEDIUMINT,
TokenType.SMALLINT: TokenType.USMALLINT,
TokenType.TINYINT: TokenType.UTINYINT,
TokenType.DECIMAL: TokenType.UDECIMAL,
}
SUBQUERY_PREDICATES = {
TokenType.ANY: exp.Any,
TokenType.ALL: exp.All,
TokenType.EXISTS: exp.Exists,
TokenType.SOME: exp.Any,
}
RESERVED_TOKENS = {
*Tokenizer.SINGLE_TOKENS.values(),
TokenType.SELECT,
} - {TokenType.IDENTIFIER}
DB_CREATABLES = {
TokenType.DATABASE,
TokenType.DICTIONARY,
TokenType.MODEL,
TokenType.SCHEMA,
TokenType.SEQUENCE,
TokenType.STORAGE_INTEGRATION,
TokenType.TABLE,
TokenType.TAG,
TokenType.VIEW,
TokenType.WAREHOUSE,
TokenType.STREAMLIT,
}
CREATABLES = {
TokenType.COLUMN,
TokenType.CONSTRAINT,
TokenType.FOREIGN_KEY,
TokenType.FUNCTION,
TokenType.INDEX,
TokenType.PROCEDURE,
*DB_CREATABLES,
}
ALTERABLES = {
TokenType.INDEX,
TokenType.TABLE,
TokenType.VIEW,
}
# Tokens that can represent identifiers
ID_VAR_TOKENS = {
TokenType.ALL,
TokenType.VAR,
TokenType.ANTI,
TokenType.APPLY,
TokenType.ASC,
TokenType.ASOF,
TokenType.AUTO_INCREMENT,
TokenType.BEGIN,
TokenType.BPCHAR,
TokenType.CACHE,
TokenType.CASE,
TokenType.COLLATE,
TokenType.COMMAND,
TokenType.COMMENT,
TokenType.COMMIT,
TokenType.CONSTRAINT,
TokenType.COPY,
TokenType.CUBE,
TokenType.DEFAULT,
TokenType.DELETE,
TokenType.DESC,
TokenType.DESCRIBE,
TokenType.DICTIONARY,
TokenType.DIV,
TokenType.END,
TokenType.EXECUTE,
TokenType.ESCAPE,
TokenType.FALSE,
TokenType.FIRST,
TokenType.FILTER,
TokenType.FINAL,
TokenType.FORMAT,
TokenType.FULL,
TokenType.IDENTIFIER,
TokenType.IS,
TokenType.ISNULL,
TokenType.INTERVAL,
TokenType.KEEP,
TokenType.KILL,
TokenType.LEFT,
TokenType.LOAD,
TokenType.MERGE,
TokenType.NATURAL,
TokenType.NEXT,
TokenType.OFFSET,
TokenType.OPERATOR,
TokenType.ORDINALITY,
TokenType.OVERLAPS,
TokenType.OVERWRITE,
TokenType.PARTITION,
TokenType.PERCENT,
TokenType.PIVOT,
TokenType.PRAGMA,
TokenType.RANGE,
TokenType.RECURSIVE,
TokenType.REFERENCES,
TokenType.REFRESH,
TokenType.RENAME,
TokenType.REPLACE,
TokenType.RIGHT,
TokenType.ROLLUP,
TokenType.ROW,
TokenType.ROWS,
TokenType.SEMI,
TokenType.SET,
TokenType.SETTINGS,
TokenType.SHOW,
TokenType.TEMPORARY,
TokenType.TOP,
TokenType.TRUE,
TokenType.TRUNCATE,
TokenType.UNIQUE,
TokenType.UNNEST,
TokenType.UNPIVOT,
TokenType.UPDATE,
TokenType.USE,
TokenType.VOLATILE,
TokenType.WINDOW,
*CREATABLES,
*SUBQUERY_PREDICATES,
*TYPE_TOKENS,
*NO_PAREN_FUNCTIONS,
}
ID_VAR_TOKENS.remove(TokenType.UNION)
INTERVAL_VARS = ID_VAR_TOKENS - {TokenType.END}
TABLE_ALIAS_TOKENS = ID_VAR_TOKENS - {
TokenType.ANTI,
TokenType.APPLY,
TokenType.ASOF,
TokenType.FULL,
TokenType.LEFT,
TokenType.LOCK,
TokenType.NATURAL,
TokenType.OFFSET,
TokenType.RIGHT,
TokenType.SEMI,
TokenType.WINDOW,
}
ALIAS_TOKENS = ID_VAR_TOKENS
ARRAY_CONSTRUCTORS = {
"ARRAY": exp.Array,
"LIST": exp.List,
}
COMMENT_TABLE_ALIAS_TOKENS = TABLE_ALIAS_TOKENS - {TokenType.IS}
UPDATE_ALIAS_TOKENS = TABLE_ALIAS_TOKENS - {TokenType.SET}
TRIM_TYPES = {"LEADING", "TRAILING", "BOTH"}
FUNC_TOKENS = {
TokenType.COLLATE,
TokenType.COMMAND,
TokenType.CURRENT_DATE,
TokenType.CURRENT_DATETIME,
TokenType.CURRENT_TIMESTAMP,
TokenType.CURRENT_TIME,
TokenType.CURRENT_USER,
TokenType.FILTER,
TokenType.FIRST,
TokenType.FORMAT,
TokenType.GLOB,
TokenType.IDENTIFIER,
TokenType.INDEX,
TokenType.ISNULL,
TokenType.ILIKE,
TokenType.INSERT,
TokenType.LIKE,
TokenType.MERGE,
TokenType.OFFSET,
TokenType.PRIMARY_KEY,
TokenType.RANGE,
TokenType.REPLACE,
TokenType.RLIKE,
TokenType.ROW,
TokenType.UNNEST,
TokenType.VAR,
TokenType.LEFT,
TokenType.RIGHT,
TokenType.SEQUENCE,
TokenType.DATE,
TokenType.DATETIME,
TokenType.TABLE,
TokenType.TIMESTAMP,
TokenType.TIMESTAMPTZ,
TokenType.TRUNCATE,
TokenType.WINDOW,
TokenType.XOR,
*TYPE_TOKENS,
*SUBQUERY_PREDICATES,
}
CONJUNCTION: t.Dict[TokenType, t.Type[exp.Expression]] = {
TokenType.AND: exp.And,
}
ASSIGNMENT: t.Dict[TokenType, t.Type[exp.Expression]] = {
TokenType.COLON_EQ: exp.PropertyEQ,
}
DISJUNCTION: t.Dict[TokenType, t.Type[exp.Expression]] = {
TokenType.OR: exp.Or,
}
EQUALITY = {
TokenType.EQ: exp.EQ,
TokenType.NEQ: exp.NEQ,
TokenType.NULLSAFE_EQ: exp.NullSafeEQ,
}
COMPARISON = {
TokenType.GT: exp.GT,
TokenType.GTE: exp.GTE,
TokenType.LT: exp.LT,
TokenType.LTE: exp.LTE,
}
BITWISE = {
TokenType.AMP: exp.BitwiseAnd,
TokenType.CARET: exp.BitwiseXor,
TokenType.PIPE: exp.BitwiseOr,
}
TERM = {
TokenType.DASH: exp.Sub,
TokenType.PLUS: exp.Add,
TokenType.MOD: exp.Mod,
TokenType.COLLATE: exp.Collate,
}
FACTOR = {
TokenType.DIV: exp.IntDiv,
TokenType.LR_ARROW: exp.Distance,
TokenType.SLASH: exp.Div,
TokenType.STAR: exp.Mul,
}
EXPONENT: t.Dict[TokenType, t.Type[exp.Expression]] = {}
TIMES = {
TokenType.TIME,
TokenType.TIMETZ,
}
TIMESTAMPS = {
TokenType.TIMESTAMP,
TokenType.TIMESTAMPTZ,
TokenType.TIMESTAMPLTZ,
*TIMES,
}
SET_OPERATIONS = {
TokenType.UNION,
TokenType.INTERSECT,
TokenType.EXCEPT,
}
JOIN_METHODS = {
TokenType.ASOF,
TokenType.NATURAL,
TokenType.POSITIONAL,
}
JOIN_SIDES = {
TokenType.LEFT,
TokenType.RIGHT,
TokenType.FULL,
}
JOIN_KINDS = {
TokenType.ANTI,
TokenType.CROSS,
TokenType.INNER,
TokenType.OUTER,
TokenType.SEMI,
TokenType.STRAIGHT_JOIN,
}
JOIN_HINTS: t.Set[str] = set()
LAMBDAS = {
TokenType.ARROW: lambda self, expressions: self.expression(
exp.Lambda,
this=self._replace_lambda(
self._parse_assignment(),
expressions,
),
expressions=expressions,
),
TokenType.FARROW: lambda self, expressions: self.expression(
exp.Kwarg,
this=exp.var(expressions[0].name),
expression=self._parse_assignment(),
),
}
COLUMN_OPERATORS = {
TokenType.DOT: None,
TokenType.DCOLON: lambda self, this, to: self.expression(
exp.Cast if self.STRICT_CAST else exp.TryCast,
this=this,
to=to,
),
TokenType.ARROW: lambda self, this, path: self.expression(
exp.JSONExtract,
this=this,
expression=self.dialect.to_json_path(path),
only_json_types=self.JSON_ARROWS_REQUIRE_JSON_TYPE,
),
TokenType.DARROW: lambda self, this, path: self.expression(
exp.JSONExtractScalar,
this=this,
expression=self.dialect.to_json_path(path),
only_json_types=self.JSON_ARROWS_REQUIRE_JSON_TYPE,
),
TokenType.HASH_ARROW: lambda self, this, path: self.expression(
exp.JSONBExtract,
this=this,
expression=path,
),
TokenType.DHASH_ARROW: lambda self, this, path: self.expression(
exp.JSONBExtractScalar,
this=this,
expression=path,
),
TokenType.PLACEHOLDER: lambda self, this, key: self.expression(
exp.JSONBContains,
this=this,
expression=key,
),
}
EXPRESSION_PARSERS = {
exp.Cluster: lambda self: self._parse_sort(exp.Cluster, TokenType.CLUSTER_BY),
exp.Column: lambda self: self._parse_column(),
exp.Condition: lambda self: self._parse_assignment(),
exp.DataType: lambda self: self._parse_types(allow_identifiers=False, schema=True),
exp.Expression: lambda self: self._parse_expression(),
exp.From: lambda self: self._parse_from(joins=True),
exp.Group: lambda self: self._parse_group(),
exp.Having: lambda self: self._parse_having(),
exp.Identifier: lambda self: self._parse_id_var(),
exp.Join: lambda self: self._parse_join(),
exp.Lambda: lambda self: self._parse_lambda(),
exp.Lateral: lambda self: self._parse_lateral(),
exp.Limit: lambda self: self._parse_limit(),
exp.Offset: lambda self: self._parse_offset(),
exp.Order: lambda self: self._parse_order(),
exp.Ordered: lambda self: self._parse_ordered(),
exp.Properties: lambda self: self._parse_properties(),
exp.Qualify: lambda self: self._parse_qualify(),
exp.Returning: lambda self: self._parse_returning(),
exp.Select: lambda self: self._parse_select(),
exp.Sort: lambda self: self._parse_sort(exp.Sort, TokenType.SORT_BY),
exp.Table: lambda self: self._parse_table_parts(),
exp.TableAlias: lambda self: self._parse_table_alias(),
exp.When: lambda self: seq_get(self._parse_when_matched(), 0),
exp.Where: lambda self: self._parse_where(),
exp.Window: lambda self: self._parse_named_window(),
exp.With: lambda self: self._parse_with(),
"JOIN_TYPE": lambda self: self._parse_join_parts(),
}
STATEMENT_PARSERS = {
TokenType.ALTER: lambda self: self._parse_alter(),
TokenType.BEGIN: lambda self: self._parse_transaction(),
TokenType.CACHE: lambda self: self._parse_cache(),
TokenType.COMMENT: lambda self: self._parse_comment(),
TokenType.COMMIT: lambda self: self._parse_commit_or_rollback(),
TokenType.COPY: lambda self: self._parse_copy(),
TokenType.CREATE: lambda self: self._parse_create(),
TokenType.DELETE: lambda self: self._parse_delete(),
TokenType.DESC: lambda self: self._parse_describe(),
TokenType.DESCRIBE: lambda self: self._parse_describe(),
TokenType.DROP: lambda self: self._parse_drop(),
TokenType.GRANT: lambda self: self._parse_grant(),
TokenType.INSERT: lambda self: self._parse_insert(),
TokenType.KILL: lambda self: self._parse_kill(),
TokenType.LOAD: lambda self: self._parse_load(),
TokenType.MERGE: lambda self: self._parse_merge(),
TokenType.PIVOT: lambda self: self._parse_simplified_pivot(),
TokenType.PRAGMA: lambda self: self.expression(exp.Pragma, this=self._parse_expression()),
TokenType.REFRESH: lambda self: self._parse_refresh(),
TokenType.ROLLBACK: lambda self: self._parse_commit_or_rollback(),
TokenType.SET: lambda self: self._parse_set(),
TokenType.TRUNCATE: lambda self: self._parse_truncate_table(),
TokenType.UNCACHE: lambda self: self._parse_uncache(),
TokenType.UPDATE: lambda self: self._parse_update(),
TokenType.USE: lambda self: self.expression(
exp.Use,
kind=self._parse_var_from_options(self.USABLES, raise_unmatched=False),
this=self._parse_table(schema=False),
),
TokenType.SEMICOLON: lambda self: self.expression(exp.Semicolon),
}
UNARY_PARSERS = {
TokenType.PLUS: lambda self: self._parse_unary(), # Unary + is handled as a no-op
TokenType.NOT: lambda self: self.expression(exp.Not, this=self._parse_equality()),
TokenType.TILDA: lambda self: self.expression(exp.BitwiseNot, this=self._parse_unary()),
TokenType.DASH: lambda self: self.expression(exp.Neg, this=self._parse_unary()),
TokenType.PIPE_SLASH: lambda self: self.expression(exp.Sqrt, this=self._parse_unary()),
TokenType.DPIPE_SLASH: lambda self: self.expression(exp.Cbrt, this=self._parse_unary()),
}
STRING_PARSERS = {
TokenType.HEREDOC_STRING: lambda self, token: self.expression(
exp.RawString, this=token.text
),
TokenType.NATIONAL_STRING: lambda self, token: self.expression(
exp.National, this=token.text
),
TokenType.RAW_STRING: lambda self, token: self.expression(exp.RawString, this=token.text),
TokenType.STRING: lambda self, token: self.expression(
exp.Literal, this=token.text, is_string=True
),
TokenType.UNICODE_STRING: lambda self, token: self.expression(
exp.UnicodeString,
this=token.text,
escape=self._match_text_seq("UESCAPE") and self._parse_string(),
),
}
NUMERIC_PARSERS = {
TokenType.BIT_STRING: lambda self, token: self.expression(exp.BitString, this=token.text),
TokenType.BYTE_STRING: lambda self, token: self.expression(exp.ByteString, this=token.text),
TokenType.HEX_STRING: lambda self, token: self.expression(exp.HexString, this=token.text),
TokenType.NUMBER: lambda self, token: self.expression(
exp.Literal, this=token.text, is_string=False
),
}
PRIMARY_PARSERS = {
**STRING_PARSERS,
**NUMERIC_PARSERS,
TokenType.INTRODUCER: lambda self, token: self._parse_introducer(token),
TokenType.NULL: lambda self, _: self.expression(exp.Null),
TokenType.TRUE: lambda self, _: self.expression(exp.Boolean, this=True),
TokenType.FALSE: lambda self, _: self.expression(exp.Boolean, this=False),
TokenType.SESSION_PARAMETER: lambda self, _: self._parse_session_parameter(),
TokenType.STAR: lambda self, _: self._parse_star_ops(),
}
PLACEHOLDER_PARSERS = {
TokenType.PLACEHOLDER: lambda self: self.expression(exp.Placeholder),
TokenType.PARAMETER: lambda self: self._parse_parameter(),
TokenType.COLON: lambda self: (
self.expression(exp.Placeholder, this=self._prev.text)
if self._match_set(self.ID_VAR_TOKENS)
else None
),
}
RANGE_PARSERS = {
TokenType.AT_GT: binary_range_parser(exp.ArrayContainsAll),
TokenType.BETWEEN: lambda self, this: self._parse_between(this),
TokenType.GLOB: binary_range_parser(exp.Glob),
TokenType.ILIKE: binary_range_parser(exp.ILike),
TokenType.IN: lambda self, this: self._parse_in(this),
TokenType.IRLIKE: binary_range_parser(exp.RegexpILike),
TokenType.IS: lambda self, this: self._parse_is(this),
TokenType.LIKE: binary_range_parser(exp.Like),
TokenType.LT_AT: binary_range_parser(exp.ArrayContainsAll, reverse_args=True),
TokenType.OVERLAPS: binary_range_parser(exp.Overlaps),
TokenType.RLIKE: binary_range_parser(exp.RegexpLike),
TokenType.SIMILAR_TO: binary_range_parser(exp.SimilarTo),
TokenType.FOR: lambda self, this: self._parse_comprehension(this),
}
PROPERTY_PARSERS: t.Dict[str, t.Callable] = {
"ALLOWED_VALUES": lambda self: self.expression(
exp.AllowedValuesProperty, expressions=self._parse_csv(self._parse_primary)
),
"ALGORITHM": lambda self: self._parse_property_assignment(exp.AlgorithmProperty),
"AUTO": lambda self: self._parse_auto_property(),
"AUTO_INCREMENT": lambda self: self._parse_property_assignment(exp.AutoIncrementProperty),
"BACKUP": lambda self: self.expression(
exp.BackupProperty, this=self._parse_var(any_token=True)
),
"BLOCKCOMPRESSION": lambda self: self._parse_blockcompression(),
"CHARSET": lambda self, **kwargs: self._parse_character_set(**kwargs),
"CHARACTER SET": lambda self, **kwargs: self._parse_character_set(**kwargs),
"CHECKSUM": lambda self: self._parse_checksum(),
"CLUSTER BY": lambda self: self._parse_cluster(),
"CLUSTERED": lambda self: self._parse_clustered_by(),
"COLLATE": lambda self, **kwargs: self._parse_property_assignment(
exp.CollateProperty, **kwargs
),
"COMMENT": lambda self: self._parse_property_assignment(exp.SchemaCommentProperty),
"CONTAINS": lambda self: self._parse_contains_property(),
"COPY": lambda self: self._parse_copy_property(),
"DATABLOCKSIZE": lambda self, **kwargs: self._parse_datablocksize(**kwargs),
"DATA_DELETION": lambda self: self._parse_data_deletion_property(),
"DEFINER": lambda self: self._parse_definer(),
"DETERMINISTIC": lambda self: self.expression(
exp.StabilityProperty, this=exp.Literal.string("IMMUTABLE")
),
"DISTRIBUTED": lambda self: self._parse_distributed_property(),
"DUPLICATE": lambda self: self._parse_duplicate(),
"DYNAMIC": lambda self: self.expression(exp.DynamicProperty),
"DISTKEY": lambda self: self._parse_distkey(),
"DISTSTYLE": lambda self: self._parse_property_assignment(exp.DistStyleProperty),
"EMPTY": lambda self: self.expression(exp.EmptyProperty),
"ENGINE": lambda self: self._parse_property_assignment(exp.EngineProperty),
"EXECUTE": lambda self: self._parse_property_assignment(exp.ExecuteAsProperty),
"EXTERNAL": lambda self: self.expression(exp.ExternalProperty),
"FALLBACK": lambda self, **kwargs: self._parse_fallback(**kwargs),
"FORMAT": lambda self: self._parse_property_assignment(exp.FileFormatProperty),
"FREESPACE": lambda self: self._parse_freespace(),
"GLOBAL": lambda self: self.expression(exp.GlobalProperty),
"HEAP": lambda self: self.expression(exp.HeapProperty),
"ICEBERG": lambda self: self.expression(exp.IcebergProperty),
"IMMUTABLE": lambda self: self.expression(
exp.StabilityProperty, this=exp.Literal.string("IMMUTABLE")
),
"INHERITS": lambda self: self.expression(
exp.InheritsProperty, expressions=self._parse_wrapped_csv(self._parse_table)
),
"INPUT": lambda self: self.expression(exp.InputModelProperty, this=self._parse_schema()),
"JOURNAL": lambda self, **kwargs: self._parse_journal(**kwargs),
"LANGUAGE": lambda self: self._parse_property_assignment(exp.LanguageProperty),
"LAYOUT": lambda self: self._parse_dict_property(this="LAYOUT"),
"LIFETIME": lambda self: self._parse_dict_range(this="LIFETIME"),
"LIKE": lambda self: self._parse_create_like(),
"LOCATION": lambda self: self._parse_property_assignment(exp.LocationProperty),
"LOCK": lambda self: self._parse_locking(),
"LOCKING": lambda self: self._parse_locking(),
"LOG": lambda self, **kwargs: self._parse_log(**kwargs),
"MATERIALIZED": lambda self: self.expression(exp.MaterializedProperty),
"MERGEBLOCKRATIO": lambda self, **kwargs: self._parse_mergeblockratio(**kwargs),
"MODIFIES": lambda self: self._parse_modifies_property(),
"MULTISET": lambda self: self.expression(exp.SetProperty, multi=True),
"NO": lambda self: self._parse_no_property(),
"ON": lambda self: self._parse_on_property(),
"ORDER BY": lambda self: self._parse_order(skip_order_token=True),
"OUTPUT": lambda self: self.expression(exp.OutputModelProperty, this=self._parse_schema()),
"PARTITION": lambda self: self._parse_partitioned_of(),
"PARTITION BY": lambda self: self._parse_partitioned_by(),
"PARTITIONED BY": lambda self: self._parse_partitioned_by(),
"PARTITIONED_BY": lambda self: self._parse_partitioned_by(),
"PRIMARY KEY": lambda self: self._parse_primary_key(in_props=True),
"RANGE": lambda self: self._parse_dict_range(this="RANGE"),
"READS": lambda self: self._parse_reads_property(),
"REMOTE": lambda self: self._parse_remote_with_connection(),
"RETURNS": lambda self: self._parse_returns(),
"STRICT": lambda self: self.expression(exp.StrictProperty),
"STREAMING": lambda self: self.expression(exp.StreamingTableProperty),
"ROW": lambda self: self._parse_row(),
"ROW_FORMAT": lambda self: self._parse_property_assignment(exp.RowFormatProperty),
"SAMPLE": lambda self: self.expression(
exp.SampleProperty, this=self._match_text_seq("BY") and self._parse_bitwise()
),
"SECURE": lambda self: self.expression(exp.SecureProperty),
"SECURITY": lambda self: self._parse_security(),
"SET": lambda self: self.expression(exp.SetProperty, multi=False),
"SETTINGS": lambda self: self._parse_settings_property(),
"SHARING": lambda self: self._parse_property_assignment(exp.SharingProperty),
"SORTKEY": lambda self: self._parse_sortkey(),
"SOURCE": lambda self: self._parse_dict_property(this="SOURCE"),
"STABLE": lambda self: self.expression(
exp.StabilityProperty, this=exp.Literal.string("STABLE")
),
"STORED": lambda self: self._parse_stored(),
"SYSTEM_VERSIONING": lambda self: self._parse_system_versioning_property(),
"TBLPROPERTIES": lambda self: self._parse_wrapped_properties(),
"TEMP": lambda self: self.expression(exp.TemporaryProperty),
"TEMPORARY": lambda self: self.expression(exp.TemporaryProperty),
"TO": lambda self: self._parse_to_table(),
"TRANSIENT": lambda self: self.expression(exp.TransientProperty),
"TRANSFORM": lambda self: self.expression(
exp.TransformModelProperty, expressions=self._parse_wrapped_csv(self._parse_expression)
),
"TTL": lambda self: self._parse_ttl(),
"USING": lambda self: self._parse_property_assignment(exp.FileFormatProperty),
"UNLOGGED": lambda self: self.expression(exp.UnloggedProperty),
"VOLATILE": lambda self: self._parse_volatile_property(),
"WITH": lambda self: self._parse_with_property(),
}
# Maps a constraint keyword (or keyword sequence such as "PRIMARY KEY") to a callable that
# receives the parser instance and returns the node built for that constraint.
# NOTE(review): presumably each callable runs after its keyword has already been consumed,
# the same way PROPERTY_PARSERS is used in _parse_property -- confirm at the call site.
CONSTRAINT_PARSERS = {
"AUTOINCREMENT": lambda self: self._parse_auto_increment(),
"AUTO_INCREMENT": lambda self: self._parse_auto_increment(),
"CASESPECIFIC": lambda self: self.expression(exp.CaseSpecificColumnConstraint, not_=False),
"CHARACTER SET": lambda self: self.expression(
exp.CharacterSetColumnConstraint, this=self._parse_var_or_string()
),
"CHECK": lambda self: self.expression(
exp.CheckColumnConstraint,
this=self._parse_wrapped(self._parse_assignment),
enforced=self._match_text_seq("ENFORCED"),
),
"COLLATE": lambda self: self.expression(
exp.CollateColumnConstraint,
this=self._parse_identifier() or self._parse_column(),
),
"COMMENT": lambda self: self.expression(
exp.CommentColumnConstraint, this=self._parse_string()
),
"COMPRESS": lambda self: self._parse_compress(),
"CLUSTERED": lambda self: self.expression(
exp.ClusteredColumnConstraint, this=self._parse_wrapped_csv(self._parse_ordered)
),
"NONCLUSTERED": lambda self: self.expression(
exp.NonClusteredColumnConstraint, this=self._parse_wrapped_csv(self._parse_ordered)
),
"DEFAULT": lambda self: self.expression(
exp.DefaultColumnConstraint, this=self._parse_bitwise()
),
"ENCODE": lambda self: self.expression(exp.EncodeColumnConstraint, this=self._parse_var()),
"EPHEMERAL": lambda self: self.expression(
exp.EphemeralColumnConstraint, this=self._parse_bitwise()
),
"EXCLUDE": lambda self: self.expression(
exp.ExcludeColumnConstraint, this=self._parse_index_params()
),
"FOREIGN KEY": lambda self: self._parse_foreign_key(),
"FORMAT": lambda self: self.expression(
exp.DateFormatColumnConstraint, this=self._parse_var_or_string()
),
"GENERATED": lambda self: self._parse_generated_as_identity(),
"IDENTITY": lambda self: self._parse_auto_increment(),
"INLINE": lambda self: self._parse_inline(),
"LIKE": lambda self: self._parse_create_like(),
"NOT": lambda self: self._parse_not_constraint(),
# A bare NULL is represented as a NotNullColumnConstraint with allow_null=True
"NULL": lambda self: self.expression(exp.NotNullColumnConstraint, allow_null=True),
# ON UPDATE <function> yields an OnUpdateColumnConstraint; any other ON <id> yields an OnProperty
"ON": lambda self: (
self._match(TokenType.UPDATE)
and self.expression(exp.OnUpdateColumnConstraint, this=self._parse_function())
)
or self.expression(exp.OnProperty, this=self._parse_id_var()),
"PATH": lambda self: self.expression(exp.PathColumnConstraint, this=self._parse_string()),
"PERIOD": lambda self: self._parse_period_for_system_time(),
"PRIMARY KEY": lambda self: self._parse_primary_key(),
"REFERENCES": lambda self: self._parse_references(match=False),
"TITLE": lambda self: self.expression(
exp.TitleColumnConstraint, this=self._parse_var_or_string()
),
"TTL": lambda self: self.expression(exp.MergeTreeTTL, expressions=[self._parse_bitwise()]),
"UNIQUE": lambda self: self._parse_unique(),
"UPPERCASE": lambda self: self.expression(exp.UppercaseColumnConstraint),
"WITH": lambda self: self.expression(
exp.Properties, expressions=self._parse_wrapped_properties()
),
}
# Maps the keyword of an ALTER action to a callable that receives the parser instance and
# returns the node parsed for that action.
ALTER_PARSERS = {
"ADD": lambda self: self._parse_alter_table_add(),
"AS": lambda self: self._parse_select(),
"ALTER": lambda self: self._parse_alter_table_alter(),
"CLUSTER BY": lambda self: self._parse_cluster(wrapped=True),
"DELETE": lambda self: self.expression(exp.Delete, where=self._parse_where()),
"DROP": lambda self: self._parse_alter_table_drop(),
"RENAME": lambda self: self._parse_alter_table_rename(),
"SET": lambda self: self._parse_alter_table_set(),
# `this` is only parsed when WITH follows SWAP; otherwise it is the falsy result of _match
"SWAP": lambda self: self.expression(
exp.SwapTable, this=self._match(TokenType.WITH) and self._parse_table(schema=True)
),
}
# Keyword -> parser callable for a second level of ALTER dispatch.
# NOTE(review): presumably consulted by _parse_alter_table_alter (ALTER TABLE ... ALTER <key>) -- confirm.
ALTER_ALTER_PARSERS = {
"DISTKEY": lambda self: self._parse_alter_diststyle(),
"DISTSTYLE": lambda self: self._parse_alter_diststyle(),
"SORTKEY": lambda self: self._parse_alter_sortkey(),
"COMPOUND": lambda self: self._parse_alter_sortkey(compound=True),
}
# A subset of the CONSTRAINT_PARSERS keys.
# NOTE(review): the name suggests these constraints may appear in a schema without a
# preceding CONSTRAINT <name> -- confirm against the schema parsing code.
SCHEMA_UNNAMED_CONSTRAINTS = {
"CHECK",
"EXCLUDE",
"FOREIGN KEY",
"LIKE",
"PERIOD",
"PRIMARY KEY",
"UNIQUE",
}
# Keyword -> parser callable for constructs handled without a parenthesized argument list
NO_PAREN_FUNCTION_PARSERS = {
"ANY": lambda self: self.expression(exp.Any, this=self._parse_bitwise()),
"CASE": lambda self: self._parse_case(),
"CONNECT_BY_ROOT": lambda self: self.expression(
exp.ConnectByRoot, this=self._parse_column()
),
"IF": lambda self: self._parse_if(),
"NEXT": lambda self: self._parse_next_value_for(),
}
# NOTE(review): presumably token types that may not be used as a function name -- confirm at the use site
INVALID_FUNC_NAME_TOKENS = {
TokenType.IDENTIFIER,
TokenType.STRING,
}
# NOTE(review): presumably names of functions whose arguments may carry aliases -- confirm at the use site
FUNCTIONS_WITH_ALIASED_ARGS = {"STRUCT"}
# Expression classes grouped as key-value definitions (a tuple, so it can be passed to isinstance)
KEY_VALUE_DEFINITIONS = (exp.Alias, exp.EQ, exp.PropertyEQ, exp.Slice)
# Maps a function name to a callable that receives the parser instance and parses that
# function's call with a dedicated method instead of a generic argument list.
# CAST and CONVERT take their strictness from STRICT_CAST; the SAFE_/TRY_ variants always
# pass False together with safe=True.
FUNCTION_PARSERS = {
"CAST": lambda self: self._parse_cast(self.STRICT_CAST),
"CONVERT": lambda self: self._parse_convert(self.STRICT_CAST),
"DECODE": lambda self: self._parse_decode(),
"EXTRACT": lambda self: self._parse_extract(),
"GAP_FILL": lambda self: self._parse_gap_fill(),
"JSON_OBJECT": lambda self: self._parse_json_object(),
"JSON_OBJECTAGG": lambda self: self._parse_json_object(agg=True),
"JSON_TABLE": lambda self: self._parse_json_table(),
"MATCH": lambda self: self._parse_match_against(),
"NORMALIZE": lambda self: self._parse_normalize(),
"OPENJSON": lambda self: self._parse_open_json(),
"OVERLAY": lambda self: self._parse_overlay(),
"POSITION": lambda self: self._parse_position(),
"PREDICT": lambda self: self._parse_predict(),
"SAFE_CAST": lambda self: self._parse_cast(False, safe=True),
"STRING_AGG": lambda self: self._parse_string_agg(),
"SUBSTRING": lambda self: self._parse_substring(),
"TRIM": lambda self: self._parse_trim(),
"TRY_CAST": lambda self: self._parse_cast(False, safe=True),
"TRY_CONVERT": lambda self: self._parse_convert(False, safe=True),
}
# Maps the token type that starts a query modifier to a callable that receives the parser
# instance and returns a (modifier name, parsed node) pair.
# Several token types share a modifier name: LIMIT/FETCH -> "limit", FOR/LOCK -> "locks",
# TABLE_SAMPLE/USING -> "sample", CONNECT_BY/START_WITH -> "connect".
QUERY_MODIFIER_PARSERS = {
TokenType.MATCH_RECOGNIZE: lambda self: ("match", self._parse_match_recognize()),
TokenType.PREWHERE: lambda self: ("prewhere", self._parse_prewhere()),
TokenType.WHERE: lambda self: ("where", self._parse_where()),
TokenType.GROUP_BY: lambda self: ("group", self._parse_group()),
TokenType.HAVING: lambda self: ("having", self._parse_having()),
TokenType.QUALIFY: lambda self: ("qualify", self._parse_qualify()),
TokenType.WINDOW: lambda self: ("windows", self._parse_window_clause()),
TokenType.ORDER_BY: lambda self: ("order", self._parse_order()),
TokenType.LIMIT: lambda self: ("limit", self._parse_limit()),
TokenType.FETCH: lambda self: ("limit", self._parse_limit()),
TokenType.OFFSET: lambda self: ("offset", self._parse_offset()),
TokenType.FOR: lambda self: ("locks", self._parse_locks()),
TokenType.LOCK: lambda self: ("locks", self._parse_locks()),
TokenType.TABLE_SAMPLE: lambda self: ("sample", self._parse_table_sample(as_modifier=True)),
TokenType.USING: lambda self: ("sample", self._parse_table_sample(as_modifier=True)),
TokenType.CLUSTER_BY: lambda self: (
"cluster",
self._parse_sort(exp.Cluster, TokenType.CLUSTER_BY),
),
TokenType.DISTRIBUTE_BY: lambda self: (
"distribute",
self._parse_sort(exp.Distribute, TokenType.DISTRIBUTE_BY),
),
TokenType.SORT_BY: lambda self: ("sort", self._parse_sort(exp.Sort, TokenType.SORT_BY)),
TokenType.CONNECT_BY: lambda self: ("connect", self._parse_connect(skip_start_token=True)),
TokenType.START_WITH: lambda self: ("connect", self._parse_connect()),
}
# Maps the keyword following SET to a callable that receives the parser instance and
# parses the corresponding SET item.
SET_PARSERS = {
"GLOBAL": lambda self: self._parse_set_item_assignment("GLOBAL"),
"LOCAL": lambda self: self._parse_set_item_assignment("LOCAL"),
"SESSION": lambda self: self._parse_set_item_assignment("SESSION"),
"TRANSACTION": lambda self: self._parse_set_transaction(),
}
# Empty here. NOTE(review): presumably filled in by dialect-specific parser subclasses -- confirm.
SHOW_PARSERS: t.Dict[str, t.Callable] = {}
# Maps a data type to a callable taking (parser, this, <ignored third argument>) that builds
# the node for a literal of that type; JSON literals become exp.ParseJSON.
TYPE_LITERAL_PARSERS = {
exp.DataType.Type.JSON: lambda self, this, _: self.expression(exp.ParseJSON, this=this),
}
# Empty here; per the annotation, maps a data type to a DataType -> DataType callable
TYPE_CONVERTERS: t.Dict[exp.DataType.Type, t.Callable[[exp.DataType], exp.DataType]] = {}
# Token types checked by _parse_create (without advancing) right after the optional alias token
DDL_SELECT_TOKENS = {TokenType.SELECT, TokenType.WITH, TokenType.L_PAREN}
# When the token two positions before the current one has one of these types,
# _parse_volatile_property returns a VolatileProperty instead of a StabilityProperty
PRE_VOLATILE_TOKENS = {TokenType.CREATE, TokenType.REPLACE, TokenType.UNIQUE}
# NOTE(review): presumably the keywords accepted as a transaction kind (e.g. BEGIN IMMEDIATE) -- confirm
TRANSACTION_KIND = {"DEFERRED", "IMMEDIATE", "EXCLUSIVE"}
# Transaction characteristic options, keyed by their leading keyword; each value lists the
# keyword sequences that may follow that keyword.
# NOTE(review): presumably consumed by _parse_set_transaction -- confirm at the call site.
TRANSACTION_CHARACTERISTICS: OPTIONS_TYPE = {
    "ISOLATION": (
        ("LEVEL", "REPEATABLE", "READ"),
        ("LEVEL", "READ", "COMMITTED"),
        # Spelled with a double "M" so that the standard READ UNCOMMITTED level is matched
        ("LEVEL", "READ", "UNCOMMITTED"),
        ("LEVEL", "SERIALIZABLE"),
    ),
    "READ": ("WRITE", "ONLY"),
}
# The tables below all follow OPTIONS_TYPE: a leading keyword mapped to the keyword(s) or
# keyword sequences that may follow it. An empty tuple means the keyword stands alone.

# Conflict-resolution keywords; "DO" must be followed by NOTHING or UPDATE
CONFLICT_ACTIONS: OPTIONS_TYPE = dict.fromkeys(
("ABORT", "FAIL", "IGNORE", "REPLACE", "ROLLBACK", "UPDATE"), tuple()
)
CONFLICT_ACTIONS["DO"] = ("NOTHING", "UPDATE")
# Keyword options tried by _parse_sequence_properties once none of its explicitly handled
# sequence clauses match
CREATE_SEQUENCE: OPTIONS_TYPE = {
"SCALE": ("EXTEND", "NOEXTEND"),
"SHARD": ("EXTEND", "NOEXTEND"),
"NO": ("CYCLE", "CACHE", "MAXVALUE", "MINVALUE"),
**dict.fromkeys(
(
"SESSION",
"GLOBAL",
"KEEP",
"NOKEEP",
"ORDER",
"NOORDER",
"NOCACHE",
"CYCLE",
"NOCYCLE",
"NOMINVALUE",
"NOMAXVALUE",
"NOSCALE",
"NOSHARD",
),
tuple(),
),
}
ISOLATED_LOADING_OPTIONS: OPTIONS_TYPE = {"FOR": ("ALL", "INSERT", "NONE")}
# NOTE(review): presumably the object kinds accepted by a USE statement -- confirm at the use site
USABLES: OPTIONS_TYPE = dict.fromkeys(
("ROLE", "WAREHOUSE", "DATABASE", "SCHEMA", "CATALOG"), tuple()
)
# RENAME and ADD must both be followed by FIELDS
CAST_ACTIONS: OPTIONS_TYPE = dict.fromkeys(("RENAME", "ADD"), ("FIELDS",))
SCHEMA_BINDING_OPTIONS: OPTIONS_TYPE = {
"TYPE": ("EVOLUTION",),
**dict.fromkeys(("BINDING", "COMPENSATION", "EVOLUTION"), tuple()),
}
# Empty here. NOTE(review): presumably overridden by dialect-specific parser subclasses -- confirm.
PROCEDURE_OPTIONS: OPTIONS_TYPE = {}
EXECUTE_AS_OPTIONS: OPTIONS_TYPE = dict.fromkeys(("CALLER", "SELF", "OWNER"), tuple())
KEY_CONSTRAINT_OPTIONS: OPTIONS_TYPE = {
"NOT": ("ENFORCED",),
"MATCH": (
"FULL",
"PARTIAL",
"SIMPLE",
),
"INITIALLY": ("DEFERRED", "IMMEDIATE"),
**dict.fromkeys(("DEFERRABLE", "NORELY"), tuple()),
}
# Keyword and token-type sets used for membership checks while parsing.
# NOTE(review): most of these are consumed outside the visible part of the file; the
# descriptions marked "presumably" are inferred from the names and should be confirmed.

# Presumably the alternatives allowed in INSERT OR <alternative>
INSERT_ALTERNATIVES = {"ABORT", "FAIL", "IGNORE", "REPLACE", "ROLLBACK"}
# Matched by _parse_create to detect a clone clause; "COPY" sets copy=True on the Clone node
CLONE_KEYWORDS = {"CLONE", "COPY"}
HISTORICAL_DATA_PREFIX = {"AT", "BEFORE", "END"}
HISTORICAL_DATA_KIND = {"TIMESTAMP", "OFFSET", "STATEMENT", "STREAM"}
OPCLASS_FOLLOW_KEYWORDS = {"ASC", "DESC", "NULLS", "WITH"}
OPTYPE_FOLLOW_TOKENS = {TokenType.COMMA, TokenType.R_PAREN}
TABLE_INDEX_HINT_TOKENS = {TokenType.FORCE, TokenType.IGNORE, TokenType.USE}
VIEW_ATTRIBUTES = {"ENCRYPTION", "SCHEMABINDING", "VIEW_METADATA"}
# Same as ID_VAR_TOKENS, except that ROWS is excluded
WINDOW_ALIAS_TOKENS = ID_VAR_TOKENS - {TokenType.ROWS}
WINDOW_BEFORE_PAREN_TOKENS = {TokenType.OVER}
WINDOW_SIDES = {"FOLLOWING", "PRECEDING"}
JSON_KEY_VALUE_SEPARATOR_TOKENS = {TokenType.COLON, TokenType.COMMA, TokenType.IS}
# Same as ID_VAR_TOKENS, except that ROW, ROWS and PERCENT are excluded
FETCH_TOKENS = ID_VAR_TOKENS - {TokenType.ROW, TokenType.ROWS, TokenType.PERCENT}
ADD_CONSTRAINT_TOKENS = {
TokenType.CONSTRAINT,
TokenType.FOREIGN_KEY,
TokenType.INDEX,
TokenType.KEY,
TokenType.PRIMARY_KEY,
TokenType.UNIQUE,
}
DISTINCT_TOKENS = {TokenType.DISTINCT}
NULL_TOKENS = {TokenType.NULL}
# Same as ID_VAR_TOKENS, except that the SET_OPERATIONS token types are excluded
UNNEST_OFFSET_ALIAS_TOKENS = ID_VAR_TOKENS - SET_OPERATIONS
SELECT_START_TOKENS = {TokenType.L_PAREN, TokenType.WITH, TokenType.SELECT}
COPY_INTO_VARLEN_OPTIONS = {"FILE_FORMAT", "COPY_OPTIONS", "FORMAT_OPTIONS", "CREDENTIAL"}
IS_JSON_PREDICATE_KIND = {"VALUE", "SCALAR", "ARRAY", "OBJECT"}
# Maps the prefix of an ODBC datetime literal to the expression class it produces
ODBC_DATETIME_LITERALS = {
"d": exp.Date,
"t": exp.Time,
"ts": exp.Timestamp,
}
ON_CONDITION_TOKENS = {"ERROR", "NULL", "TRUE", "FALSE", "EMPTY"}
PRIVILEGE_FOLLOW_TOKENS = {TokenType.ON, TokenType.COMMA, TokenType.L_PAREN}
# The style options for the DESCRIBE statement
DESCRIBE_STYLES = {"ANALYZE", "EXTENDED", "FORMATTED", "HISTORY"}
# Empty here. NOTE(review): presumably overridden by dialect-specific parser subclasses -- confirm.
OPERATION_MODIFIERS: t.Set[str] = set()
# Parser feature flags.
# NOTE(review): presumably these defaults are overridden by dialect-specific subclasses -- confirm.

# Strictness passed to _parse_cast / _parse_convert by the CAST and CONVERT function parsers
STRICT_CAST = True
# NOTE(review): presumably controls whether generated pivot column names are prefixed -- confirm
PREFIXED_PIVOT_COLUMNS = False
# NOTE(review): presumably controls whether string pivot values are turned into identifiers -- confirm
IDENTIFY_PIVOT_STRINGS = False
# NOTE(review): presumably whether a single-argument LOG is treated as LN -- confirm
LOG_DEFAULTS_TO_LN = False
# Whether ADD is present for each column added by ALTER TABLE
ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN = True
# Whether the table sample clause expects CSV syntax
TABLESAMPLE_CSV = False
# The default method used for table sampling
DEFAULT_SAMPLING_METHOD: t.Optional[str] = None
# Whether the SET command needs a delimiter (e.g. "=") for assignments
SET_REQUIRES_ASSIGNMENT_DELIMITER = True
# Whether the TRIM function expects the characters to trim as its first argument
TRIM_PATTERN_FIRST = False
# Whether string aliases are supported `SELECT COUNT(*) 'count'`
STRING_ALIASES = False
# Whether query modifiers such as LIMIT are attached to the UNION node (vs its right operand)
MODIFIERS_ATTACHED_TO_SET_OP = True
# NOTE(review): presumably the modifier names affected by MODIFIERS_ATTACHED_TO_SET_OP -- confirm
SET_OP_MODIFIERS = {"order", "limit", "offset"}
# Whether to parse IF statements that aren't followed by a left parenthesis as commands
NO_PAREN_IF_COMMANDS = True
# Whether the -> and ->> operators expect documents of type JSON (e.g. Postgres)
JSON_ARROWS_REQUIRE_JSON_TYPE = False
# Whether the `:` operator is used to extract a value from a VARIANT column
COLON_IS_VARIANT_EXTRACT = False
# Whether or not a VALUES keyword needs to be followed by '(' to form a VALUES clause.
# If this is True and '(' is not found, the keyword will be treated as an identifier
VALUES_FOLLOWED_BY_PAREN = True
# Whether implicit unnesting is supported, e.g. SELECT 1 FROM y.z AS z, z.a (Redshift)
SUPPORTS_IMPLICIT_UNNEST = False
# Whether or not interval spans are supported, INTERVAL 1 YEAR TO MONTHS
INTERVAL_SPANS = True
# Whether a PARTITION clause can follow a table reference
SUPPORTS_PARTITION_SELECTION = False
# Instance attributes of the parser. The first four are assigned in __init__; the remaining
# ones hold the per-parse state that reset() clears.
__slots__ = (
"error_level",
"error_message_context",
"max_errors",
"dialect",
"sql",
"errors",
"_tokens",
"_index",
"_curr",
"_next",
"_prev",
"_prev_comments",
)
# Autofilled
# NOTE(review): presumably tries built from the SHOW_PARSERS / SET_PARSERS keys elsewhere -- confirm
SHOW_TRIE: t.Dict = {}
SET_TRIE: t.Dict = {}
def __init__(
    self,
    error_level: t.Optional[ErrorLevel] = None,
    error_message_context: int = 100,
    max_errors: int = 3,
    dialect: DialectType = None,
):
    """
    Configures the parser and puts it into a clean initial state.

    Args:
        error_level: How parse errors are handled. Falls back to ErrorLevel.IMMEDIATE.
        error_message_context: The number of characters of SQL captured on each side of
            the offending token when an error is reported.
        max_errors: The limit passed to concat_messages when recorded errors are raised together.
        dialect: The dialect to parse with; resolved through Dialect.get_or_raise.
    """
    # Imported here rather than at module level (the module-level import is TYPE_CHECKING only)
    from sqlglot.dialects import Dialect

    self.error_level = error_level if error_level else ErrorLevel.IMMEDIATE
    self.error_message_context = error_message_context
    self.max_errors = max_errors

    resolved_dialect = Dialect.get_or_raise(dialect)
    self.dialect = resolved_dialect

    self.reset()
def reset(self):
    """Clears the SQL text, the recorded errors and the token cursor state."""
    self.sql = ""
    self.errors = []
    self._tokens = []
    self._index = 0
    # No token has been looked at yet
    self._curr = self._next = self._prev = self._prev_comments = None
def parse(
    self, raw_tokens: t.List[Token], sql: t.Optional[str] = None
) -> t.List[t.Optional[exp.Expression]]:
    """
    Parses a token list as a sequence of SQL statements.

    Args:
        raw_tokens: The list of tokens.
        sql: The original SQL string, used to produce helpful debug messages.

    Returns:
        A list with one syntax tree per parsed SQL statement.
    """
    # Looked up on the class so that the unbound function is handed to _parse,
    # which calls it with the parser instance as its only argument.
    statement_parser = self.__class__._parse_statement
    return self._parse(parse_method=statement_parser, raw_tokens=raw_tokens, sql=sql)
def parse_into(
    self,
    expression_types: exp.IntoType,
    raw_tokens: t.List[Token],
    sql: t.Optional[str] = None,
) -> t.List[t.Optional[exp.Expression]]:
    """
    Parses a list of tokens into a given Expression type. When a collection of types is
    given, each one is tried in order and the first successful parse wins.

    Args:
        expression_types: The expression type(s) to try and parse the token list into.
        raw_tokens: The list of tokens.
        sql: The original SQL string, used to produce helpful debug messages.

    Returns:
        The result of parsing the tokens with the first type that succeeds.

    Raises:
        TypeError: If a requested type has no entry in EXPRESSION_PARSERS.
        ParseError: If parsing fails for every requested type.
    """
    failures = []

    for candidate in ensure_list(expression_types):
        candidate_parser = self.EXPRESSION_PARSERS.get(candidate)
        if not candidate_parser:
            raise TypeError(f"No parser registered for {candidate}")

        try:
            return self._parse(candidate_parser, raw_tokens, sql)
        except ParseError as error:
            # Remember which target type this failure belongs to
            error.errors[0]["into_expression"] = candidate
            failures.append(error)

    raise ParseError(
        f"Failed to parse '{sql or raw_tokens}' into {expression_types}",
        errors=merge_errors(failures),
    ) from failures[-1]
def _parse(
    self,
    parse_method: t.Callable[[Parser], t.Optional[exp.Expression]],
    raw_tokens: t.List[Token],
    sql: t.Optional[str] = None,
) -> t.List[t.Optional[exp.Expression]]:
    """
    Splits the tokens into semicolon-delimited chunks and runs `parse_method` on each one.

    Args:
        parse_method: Called with the parser instance once per chunk.
        raw_tokens: The list of tokens.
        sql: The original SQL string, used to produce helpful debug messages.

    Returns:
        The list of values returned by `parse_method`, one per chunk.
    """
    self.reset()
    self.sql = sql or ""

    last_position = len(raw_tokens) - 1
    chunks: t.List[t.List[Token]] = [[]]

    for position, token in enumerate(raw_tokens):
        if token.token_type != TokenType.SEMICOLON:
            chunks[-1].append(token)
            continue

        # A semicolon that carries comments is kept as a chunk of its own
        if token.comments:
            chunks.append([token])

        # Only open a new chunk when more tokens follow the semicolon
        if position < last_position:
            chunks.append([])

    results = []
    for chunk in chunks:
        # Start one position before the chunk so that _advance lands on its first token
        self._index = -1
        self._tokens = chunk
        self._advance()

        results.append(parse_method(self))

        # Anything left over means the chunk was not fully consumed
        if self._index < len(self._tokens):
            self.raise_error("Invalid expression / Unexpected token")

        self.check_errors()

    return results
def check_errors(self) -> None:
    """Logs or raises any found errors, depending on the chosen error level setting."""
    level = self.error_level

    if level == ErrorLevel.WARN:
        for error in self.errors:
            logger.error(str(error))
        return

    if level == ErrorLevel.RAISE and self.errors:
        raise ParseError(
            concat_messages(self.errors, self.max_errors),
            errors=merge_errors(self.errors),
        )
def raise_error(self, message: str, token: t.Optional[Token] = None) -> None:
    """
    Records a parse error, or raises it right away when the error level is IMMEDIATE.

    Args:
        message: The description of the problem.
        token: The token the error refers to. Defaults to the current token, then the
            previous one, then an empty string token.
    """
    culprit = token or self._curr or self._prev or Token.string("")
    lo = culprit.start
    hi = culprit.end + 1
    context_size = self.error_message_context

    # Slice the SQL into the text before, at and after the offending token
    start_context = self.sql[max(lo - context_size, 0) : lo]
    highlight = self.sql[lo:hi]
    end_context = self.sql[hi : hi + context_size]

    error = ParseError.new(
        f"{message}. Line {culprit.line}, Col: {culprit.col}.\n"
        f" {start_context}\033[4m{highlight}\033[0m{end_context}",
        description=message,
        line=culprit.line,
        col=culprit.col,
        start_context=start_context,
        highlight=highlight,
        end_context=end_context,
    )

    if self.error_level == ErrorLevel.IMMEDIATE:
        raise error

    self.errors.append(error)
def expression(
    self, exp_class: t.Type[E], comments: t.Optional[t.List[str]] = None, **kwargs
) -> E:
    """
    Creates a new, validated Expression.

    Args:
        exp_class: The expression class to instantiate.
        comments: An optional list of comments to attach to the expression. When omitted
            (or empty), the pending comments of the previous token are attached instead.
        kwargs: The arguments to set for the expression along with their respective values.

    Returns:
        The target expression.
    """
    node = exp_class(**kwargs)

    if comments:
        node.add_comments(comments)
    else:
        self._add_comments(node)

    return self.validate_expression(node)
def _add_comments(self, expression: t.Optional[exp.Expression]) -> None:
if expression and self._prev_comments:
expression.add_comments(self._prev_comments)
self._prev_comments = None
def validate_expression(self, expression: E, args: t.Optional[t.List] = None) -> E:
    """
    Validates an Expression, making sure that all its mandatory arguments are set.

    Validation is skipped entirely when the error level is IGNORE.

    Args:
        expression: The expression to validate.
        args: An optional list of items that was used to instantiate the expression, if it's a Func.

    Returns:
        The validated expression.
    """
    if self.error_level == ErrorLevel.IGNORE:
        return expression

    for problem in expression.error_messages(args):
        self.raise_error(problem)

    return expression
def _find_sql(self, start: Token, end: Token) -> str:
return self.sql[start.start : end.end + 1]
def _is_connected(self) -> bool:
return self._prev and self._curr and self._prev.end + 1 == self._curr.start
def _advance(self, times: int = 1) -> None:
    """
    Moves the token cursor by `times` positions (negative values move it backwards) and
    refreshes the current, next and previous token references.
    """
    position = self._index + times
    self._index = position

    self._curr = seq_get(self._tokens, position)
    self._next = seq_get(self._tokens, position + 1)

    if position <= 0:
        # Nothing precedes the first token
        self._prev = None
        self._prev_comments = None
        return

    previous = self._tokens[position - 1]
    self._prev = previous
    self._prev_comments = previous.comments
def _retreat(self, index: int) -> None:
if index != self._index:
self._advance(index - self._index)
def _warn_unsupported(self) -> None:
    """Logs a warning that the current chunk is being parsed as a Command; skipped for chunks of at most one token."""
    tokens = self._tokens
    if len(tokens) <= 1:
        return

    # We use _find_sql because self.sql may comprise multiple chunks, and we're only
    # interested in emitting a warning for the one being currently processed.
    chunk_sql = self._find_sql(tokens[0], tokens[-1])
    sql = chunk_sql[: self.error_message_context]

    logger.warning(
        f"'{sql}' contains unsupported syntax. Falling back to parsing as a 'Command'."
    )
def _parse_command(self) -> exp.Command:
    """
    Builds a Command node whose name is the upper-cased text of the previous token and whose
    body is whatever _parse_string returns, after warning about the unsupported syntax.
    """
    self._warn_unsupported()

    # Capture these before _parse_string gets a chance to move the cursor
    comments = self._prev_comments
    command_name = self._prev.text.upper()
    body = self._parse_string()

    return self.expression(exp.Command, comments=comments, this=command_name, expression=body)
def _try_parse(self, parse_method: t.Callable[[], T], retreat: bool = False) -> t.Optional[T]:
    """
    Attempts to backtrack if a parse function that contains a try/catch internally raises an error.
    This behavior can be different depending on the user-set ErrorLevel, so _try_parse aims to
    solve this by setting & resetting the parser state accordingly.

    Args:
        parse_method: The zero-argument parse function to attempt.
        retreat: Whether to move the cursor back to where it started even on success.

    Returns:
        The value returned by `parse_method`, or None if it raised a ParseError.
    """
    index = self._index
    error_level = self.error_level

    # Force errors to be raised so that a failed attempt surfaces here as a ParseError
    self.error_level = ErrorLevel.IMMEDIATE

    # Bound up front so the `finally` clause can read it even when `parse_method` raises
    # something other than ParseError; otherwise an UnboundLocalError would mask that error.
    this: t.Optional[T] = None
    try:
        this = parse_method()
    except ParseError:
        this = None
    finally:
        if not this or retreat:
            self._retreat(index)
        self.error_level = error_level

    return this
def _parse_comment(self, allow_exists: bool = True) -> exp.Expression:
    """
    Parses a COMMENT [IF EXISTS] ON [MATERIALIZED] <kind> <target> IS <string> statement.

    Args:
        allow_exists: Whether an IF EXISTS clause should be looked for.

    Returns:
        A Comment node, or the result of _parse_as_command when no creatable kind follows.
    """
    statement_start = self._prev

    if allow_exists:
        exists = self._parse_exists()
    else:
        exists = None

    self._match(TokenType.ON)
    materialized = self._match_text_seq("MATERIALIZED")

    kind_token = self._match_set(self.CREATABLES) and self._prev
    if not kind_token:
        return self._parse_as_command(statement_start)

    # The kind of object decides how its name is parsed
    kind_type = kind_token.token_type
    if kind_type in (TokenType.FUNCTION, TokenType.PROCEDURE):
        target = self._parse_user_defined_function(kind=kind_type)
    elif kind_type == TokenType.TABLE:
        target = self._parse_table(alias_tokens=self.COMMENT_TABLE_ALIAS_TOKENS)
    elif kind_type == TokenType.COLUMN:
        target = self._parse_column()
    else:
        target = self._parse_id_var()

    self._match(TokenType.IS)
    comment_text = self._parse_string()

    return self.expression(
        exp.Comment,
        this=target,
        kind=kind_token.text,
        expression=comment_text,
        exists=exists,
        materialized=materialized,
    )
def _parse_to_table(self) -> exp.ToTableProperty:
    """Parses a table reference (with schema=True) and wraps it in a ToTableProperty."""
    target = self._parse_table_parts(schema=True)
    return self.expression(exp.ToTableProperty, this=target)
# https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#mergetree-table-ttl
def _parse_ttl(self) -> exp.Expression:
    """
    Parses a comma-separated list of TTL expressions, each optionally followed by an action
    (DELETE, RECOMPRESS <expr>, TO DISK <string>, TO VOLUME <string>), then an optional
    WHERE and GROUP BY, and, when grouped, an optional SET list of aggregates.
    """

    def _parse_ttl_action() -> t.Optional[exp.Expression]:
        this = self._parse_bitwise()

        # At most one action may follow the expression; they are tried in this order
        action_args = None
        if self._match_text_seq("DELETE"):
            action_args = {"delete": True}
        elif self._match_text_seq("RECOMPRESS"):
            action_args = {"recompress": self._parse_bitwise()}
        elif self._match_text_seq("TO", "DISK"):
            action_args = {"to_disk": self._parse_string()}
        elif self._match_text_seq("TO", "VOLUME"):
            action_args = {"to_volume": self._parse_string()}

        if action_args is None:
            return this

        return self.expression(exp.MergeTreeTTLAction, this=this, **action_args)

    expressions = self._parse_csv(_parse_ttl_action)
    where = self._parse_where()
    group = self._parse_group()

    # SET is only looked for when a GROUP BY was parsed
    if group and self._match(TokenType.SET):
        aggregates = self._parse_csv(self._parse_set_item)
    else:
        aggregates = None

    return self.expression(
        exp.MergeTreeTTL,
        expressions=expressions,
        where=where,
        group=group,
        aggregates=aggregates,
    )
def _parse_statement(self) -> t.Optional[exp.Expression]:
if self._curr is None:
return None
if self._match_set(self.STATEMENT_PARSERS):
return self.STATEMENT_PARSERS[self._prev.token_type](self)
if self._match_set(self.dialect.tokenizer.COMMANDS):
return self._parse_command()
expression = self._parse_expression()
expression = self._parse_set_operations(expression) if expression else self._parse_select()
return self._parse_query_modifiers(expression)
def _parse_drop(self, exists: bool = False) -> exp.Drop | exp.Command:
    """
    Parses a DROP statement, whose DROP keyword has already been consumed.

    Args:
        exists: Whether the statement is already known to carry IF EXISTS semantics.

    Returns:
        A Drop node, or the result of _parse_as_command when no creatable kind follows.
    """
    start = self._prev
    temporary = self._match(TokenType.TEMPORARY)
    materialized = self._match_text_seq("MATERIALIZED")

    kind_token = self._match_set(self.CREATABLES) and self._prev
    if not kind_token:
        return self._parse_as_command(start)

    kind = kind_token.text.upper()

    concurrently = self._match_text_seq("CONCURRENTLY")
    if_exists = exists or self._parse_exists()

    # Decide on the kind token itself rather than on self._prev: once CONCURRENTLY or
    # IF EXISTS has been consumed, self._prev no longer points at the kind keyword, and
    # DROP SCHEMA IF EXISTS <name> would not be treated as a db reference.
    table = self._parse_table_parts(
        schema=True, is_db_reference=kind_token.token_type == TokenType.SCHEMA
    )

    cluster = self._parse_on_property() if self._match(TokenType.ON) else None

    if self._match(TokenType.L_PAREN, advance=False):
        expressions = self._parse_wrapped_csv(self._parse_types)
    else:
        expressions = None

    # The trailing keywords are matched while the arguments are evaluated, so they are
    # expected in the order CASCADE, CONSTRAINTS, PURGE.
    return self.expression(
        exp.Drop,
        comments=start.comments,
        exists=if_exists,
        this=table,
        expressions=expressions,
        kind=self.dialect.CREATABLE_KIND_MAPPING.get(kind) or kind,
        temporary=temporary,
        materialized=materialized,
        cascade=self._match_text_seq("CASCADE"),
        constraints=self._match_text_seq("CONSTRAINTS"),
        purge=self._match_text_seq("PURGE"),
        cluster=cluster,
        concurrently=concurrently,
    )
def _parse_exists(self, not_: bool = False) -> t.Optional[bool]:
return (
self._match_text_seq("IF")
and (not not_ or self._match(TokenType.NOT))
and self._match(TokenType.EXISTS)
)
def _parse_create(self) -> exp.Create | exp.Command:
    """
    Parses a CREATE (or REPLACE) statement, whose leading keyword has already been consumed.

    Properties can appear at several positions of the statement; they are all merged into
    a single Properties node through the local `extend_props` helper.

    Returns:
        A Create node, or the result of _parse_as_command when the object kind cannot be
        determined or when unexpected tokens remain after parsing.
    """
    # Note: this can't be None because we've matched a statement parser
    start = self._prev
    comments = self._prev_comments

    replace = (
        start.token_type == TokenType.REPLACE
        or self._match_pair(TokenType.OR, TokenType.REPLACE)
        or self._match_pair(TokenType.OR, TokenType.ALTER)
    )
    refresh = self._match_pair(TokenType.OR, TokenType.REFRESH)

    unique = self._match(TokenType.UNIQUE)

    # Tri-state: True for CLUSTERED COLUMNSTORE, False for [NONCLUSTERED] COLUMNSTORE, None otherwise
    if self._match_text_seq("CLUSTERED", "COLUMNSTORE"):
        clustered = True
    elif self._match_text_seq("NONCLUSTERED", "COLUMNSTORE") or self._match_text_seq(
        "COLUMNSTORE"
    ):
        clustered = False
    else:
        clustered = None

    # For TABLE FUNCTION, skip TABLE so that FUNCTION is what gets matched as the creatable kind
    if self._match_pair(TokenType.TABLE, TokenType.FUNCTION, advance=False):
        self._advance()

    properties = None
    create_token = self._match_set(self.CREATABLES) and self._prev

    if not create_token:
        # exp.Properties.Location.POST_CREATE
        properties = self._parse_properties()
        create_token = self._match_set(self.CREATABLES) and self._prev

        if not properties or not create_token:
            return self._parse_as_command(start)

    concurrently = self._match_text_seq("CONCURRENTLY")
    exists = self._parse_exists(not_=True)
    this = None
    expression: t.Optional[exp.Expression] = None
    indexes = None
    no_schema_binding = None
    begin = None
    end = None
    clone = None

    def extend_props(temp_props: t.Optional[exp.Properties]) -> None:
        # Merges newly parsed properties into the ones accumulated so far
        nonlocal properties
        if properties and temp_props:
            properties.expressions.extend(temp_props.expressions)
        elif temp_props:
            properties = temp_props

    if create_token.token_type in (TokenType.FUNCTION, TokenType.PROCEDURE):
        this = self._parse_user_defined_function(kind=create_token.token_type)

        # exp.Properties.Location.POST_SCHEMA ("schema" here is the UDF's type signature)
        extend_props(self._parse_properties())

        expression = self._match(TokenType.ALIAS) and self._parse_heredoc()
        extend_props(self._parse_properties())

        if not expression:
            if self._match(TokenType.COMMAND):
                expression = self._parse_as_command(self._prev)
            else:
                begin = self._match(TokenType.BEGIN)
                return_ = self._match_text_seq("RETURN")

                if self._match(TokenType.STRING, advance=False):
                    # Takes care of BigQuery's JavaScript UDF definitions that end in an OPTIONS property
                    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#create_function_statement
                    expression = self._parse_string()
                    extend_props(self._parse_properties())
                else:
                    expression = self._parse_user_defined_function_expression()

                end = self._match_text_seq("END")

                if return_:
                    expression = self.expression(exp.Return, this=expression)
    elif create_token.token_type == TokenType.INDEX:
        # Postgres allows anonymous indexes, eg. CREATE INDEX IF NOT EXISTS ON t(c)
        if not self._match(TokenType.ON):
            index = self._parse_id_var()
            anonymous = False
        else:
            index = None
            anonymous = True

        this = self._parse_index(index=index, anonymous=anonymous)
    elif create_token.token_type in self.DB_CREATABLES:
        table_parts = self._parse_table_parts(
            schema=True, is_db_reference=create_token.token_type == TokenType.SCHEMA
        )

        # exp.Properties.Location.POST_NAME
        self._match(TokenType.COMMA)
        extend_props(self._parse_properties(before=True))

        this = self._parse_schema(this=table_parts)

        # exp.Properties.Location.POST_SCHEMA and POST_WITH
        extend_props(self._parse_properties())

        self._match(TokenType.ALIAS)
        if not self._match_set(self.DDL_SELECT_TOKENS, advance=False):
            # exp.Properties.Location.POST_ALIAS
            extend_props(self._parse_properties())

        if create_token.token_type == TokenType.SEQUENCE:
            expression = self._parse_types()
            extend_props(self._parse_properties())
        else:
            expression = self._parse_ddl_select()

        if create_token.token_type == TokenType.TABLE:
            # exp.Properties.Location.POST_EXPRESSION
            extend_props(self._parse_properties())

            # Collect index definitions until _parse_index stops returning one
            indexes = []
            while True:
                index = self._parse_index()

                # exp.Properties.Location.POST_INDEX
                extend_props(self._parse_properties())
                if not index:
                    break
                else:
                    self._match(TokenType.COMMA)
                    indexes.append(index)
        elif create_token.token_type == TokenType.VIEW:
            if self._match_text_seq("WITH", "NO", "SCHEMA", "BINDING"):
                no_schema_binding = True

        shallow = self._match_text_seq("SHALLOW")

        if self._match_texts(self.CLONE_KEYWORDS):
            copy = self._prev.text.lower() == "copy"
            clone = self.expression(
                exp.Clone, this=self._parse_table(schema=True), shallow=shallow, copy=copy
            )

    # Leftover tokens other than ")" or "," mean the statement was not fully understood
    if self._curr and not self._match_set((TokenType.R_PAREN, TokenType.COMMA), advance=False):
        return self._parse_as_command(start)

    create_kind_text = create_token.text.upper()
    return self.expression(
        exp.Create,
        comments=comments,
        this=this,
        kind=self.dialect.CREATABLE_KIND_MAPPING.get(create_kind_text) or create_kind_text,
        replace=replace,
        refresh=refresh,
        unique=unique,
        expression=expression,
        exists=exists,
        properties=properties,
        indexes=indexes,
        no_schema_binding=no_schema_binding,
        begin=begin,
        end=end,
        clone=clone,
        concurrently=concurrently,
        clustered=clustered,
    )
def _parse_sequence_properties(self) -> t.Optional[exp.SequenceProperties]:
    """
    Parses consecutive sequence options (INCREMENT, MINVALUE, MAXVALUE, START, CACHE,
    OWNED BY and the keyword options in CREATE_SEQUENCE).

    Returns:
        A SequenceProperties node, or None if the token cursor did not move.
    """
    props = exp.SequenceProperties()
    keyword_options = []
    start_index = self._index

    while self._curr:
        # Options may optionally be separated by commas
        self._match(TokenType.COMMA)

        if self._match_text_seq("INCREMENT"):
            self._match_text_seq("BY")
            self._match_text_seq("=")
            props.set("increment", self._parse_term())
        elif self._match_text_seq("MINVALUE"):
            props.set("minvalue", self._parse_term())
        elif self._match_text_seq("MAXVALUE"):
            props.set("maxvalue", self._parse_term())
        elif self._match(TokenType.START_WITH) or self._match_text_seq("START"):
            self._match_text_seq("=")
            props.set("start", self._parse_term())
        elif self._match_text_seq("CACHE"):
            # T-SQL allows empty CACHE which is initialized dynamically
            props.set("cache", self._parse_number() or True)
        elif self._match_text_seq("OWNED", "BY"):
            # "OWNED BY NONE" is the default
            props.set("owned", None if self._match_text_seq("NONE") else self._parse_column())
        else:
            option = self._parse_var_from_options(self.CREATE_SEQUENCE, raise_unmatched=False)
            if not option:
                break
            keyword_options.append(option)

    props.set("options", keyword_options or None)

    if self._index == start_index:
        return None
    return props
def _parse_property_before(self) -> t.Optional[exp.Expression]:
    """
    Parses a property that may be preceded by modifier keywords (NO, DUAL, BEFORE, DEFAULT,
    [NOT] LOCAL, AFTER, MIN[IMUM], MAX[IMUM]); the matched modifiers are forwarded to the
    property parser as keyword arguments.

    Returns:
        The parsed property, or None if no property parser matched or it rejected the modifiers.
    """
    # only used for teradata currently
    self._match(TokenType.COMMA)

    # The dict is built in source order, so the modifiers are matched in exactly this order
    modifiers = {
        "no": self._match_text_seq("NO"),
        "dual": self._match_text_seq("DUAL"),
        "before": self._match_text_seq("BEFORE"),
        "default": self._match_text_seq("DEFAULT"),
        "local": (self._match_text_seq("LOCAL") and "LOCAL")
        or (self._match_text_seq("NOT", "LOCAL") and "NOT LOCAL"),
        "after": self._match_text_seq("AFTER"),
        "minimum": self._match_texts(("MIN", "MINIMUM")),
        "maximum": self._match_texts(("MAX", "MAXIMUM")),
    }

    if not self._match_texts(self.PROPERTY_PARSERS):
        return None

    handler = self.PROPERTY_PARSERS[self._prev.text.upper()]
    matched_modifiers = {name: value for name, value in modifiers.items() if value}

    try:
        return handler(self, **matched_modifiers)
    except TypeError:
        # The handler does not accept one of the matched modifiers
        self.raise_error(f"Cannot parse property '{self._prev.text}'")

    return None
def _parse_wrapped_properties(self) -> t.List[exp.Expression]:
return self._parse_wrapped_csv(self._parse_property)
def _parse_property(self) -> t.Optional[exp.Expression]:
    """
    Parses a single property.

    Registered property parsers are tried first (optionally prefixed by DEFAULT), then
    COMPOUND SORTKEY and SQL SECURITY, then a generic `key = value` assignment. When no
    `=` follows the key, the cursor is restored and sequence properties are tried instead.
    """
    if self._match_texts(self.PROPERTY_PARSERS):
        handler = self.PROPERTY_PARSERS[self._prev.text.upper()]
        return handler(self)

    if self._match(TokenType.DEFAULT) and self._match_texts(self.PROPERTY_PARSERS):
        handler = self.PROPERTY_PARSERS[self._prev.text.upper()]
        return handler(self, default=True)

    if self._match_text_seq("COMPOUND", "SORTKEY"):
        return self._parse_sortkey(compound=True)

    if self._match_text_seq("SQL", "SECURITY"):
        definer = self._match_text_seq("DEFINER")
        return self.expression(exp.SqlSecurityProperty, definer=definer)

    checkpoint = self._index
    key = self._parse_column()

    if not self._match(TokenType.EQ):
        self._retreat(checkpoint)
        return self._parse_sequence_properties()

    # Transform the key to exp.Dot if it's dotted identifiers wrapped in exp.Column or to exp.Var otherwise
    if isinstance(key, exp.Column):
        if len(key.parts) > 1:
            key = key.to_dot()
        else:
            key = exp.var(key.name)

    value = self._parse_bitwise() or self._parse_var(any_token=True)

    # Transform the value to exp.Var if it was parsed as exp.Column(exp.Identifier())
    if isinstance(value, exp.Column):
        value = exp.var(value.name)

    return self.expression(exp.Property, this=key, value=value)
def _parse_stored(self) -> exp.FileFormatProperty:
    """
    Parses a file format: either explicit INPUTFORMAT / OUTPUTFORMAT strings, or a single
    format given as a var, string, number or identifier.
    """
    self._match(TokenType.ALIAS)

    input_format = None
    if self._match_text_seq("INPUTFORMAT"):
        input_format = self._parse_string()

    output_format = None
    if self._match_text_seq("OUTPUTFORMAT"):
        output_format = self._parse_string()

    if input_format or output_format:
        file_format = self.expression(
            exp.InputOutputFormat, input_format=input_format, output_format=output_format
        )
    else:
        file_format = (
            self._parse_var_or_string() or self._parse_number() or self._parse_id_var()
        )

    return self.expression(exp.FileFormatProperty, this=file_format)
def _parse_unquoted_field(self) -> t.Optional[exp.Expression]:
    """Parses a field, converting an unquoted identifier into a Var; anything else is returned as is."""
    parsed = self._parse_field()

    if not isinstance(parsed, exp.Identifier) or parsed.quoted:
        return parsed

    return exp.var(parsed)
def _parse_property_assignment(self, exp_class: t.Type[E], **kwargs: t.Any) -> E:
    """
    Parses the value of a property, skipping an optional `=` and an optional alias token first.

    Args:
        exp_class: The property class to instantiate.
        kwargs: Extra arguments forwarded to the created expression.

    Returns:
        An instance of `exp_class` whose `this` is the parsed value.
    """
    self._match(TokenType.EQ)
    self._match(TokenType.ALIAS)

    value = self._parse_unquoted_field()
    return self.expression(exp_class, this=value, **kwargs)
def _parse_properties(self, before: t.Optional[bool] = None) -> t.Optional[exp.Properties]:
    """
    Parses as many consecutive properties as possible.

    Args:
        before: When truthy, properties are parsed with _parse_property_before instead of
            _parse_property.

    Returns:
        A Properties node holding everything that was parsed, or None if nothing was.
    """
    parse_one = self._parse_property_before if before else self._parse_property
    collected = []

    while True:
        prop = parse_one()
        if not prop:
            break
        # A parser may hand back one property or a list of them
        collected.extend(ensure_list(prop))

    if not collected:
        return None

    return self.expression(exp.Properties, expressions=collected)
def _parse_fallback(self, no: bool = False) -> exp.FallbackProperty:
    """Builds a FallbackProperty, recording `no` and whether the PROTECTION keyword follows."""
    protection = self._match_text_seq("PROTECTION")
    return self.expression(exp.FallbackProperty, no=no, protection=protection)
def _parse_security(self) -> t.Optional[exp.SecurityProperty]:
if self._match_texts(("DEFINER", "INVOKER")):
security_specifier = self._prev.text.upper()
return self.expression(exp.SecurityProperty, this=security_specifier)
return None
def _parse_settings_property(self) -> exp.SettingsProperty:
    """Parses a comma-separated list of assignments into a SettingsProperty."""
    assignments = self._parse_csv(self._parse_assignment)
    return self.expression(exp.SettingsProperty, expressions=assignments)
def _parse_volatile_property(self) -> exp.VolatileProperty | exp.StabilityProperty:
    """
    Interprets a VOLATILE keyword based on the token located two positions before the cursor:
    if its type is in PRE_VOLATILE_TOKENS the result is a VolatileProperty, otherwise a
    StabilityProperty with the value "VOLATILE".
    """
    position = self._index
    pre_volatile_token = self._tokens[position - 2] if position >= 2 else None

    if pre_volatile_token and pre_volatile_token.token_type in self.PRE_VOLATILE_TOKENS:
        return exp.VolatileProperty()

    stability = exp.Literal.string("VOLATILE")
    return self.expression(exp.StabilityProperty, this=stability)
def _parse_retention_period(self) -> exp.Var:
    """Parse TSQL's HISTORY_RETENTION_PERIOD: {INFINITE | <number> DAY | DAYS | MONTH ...}.

    Returns:
        A single Var whose name is the optional number followed by a space and the unit.
    """
    amount = self._parse_number()
    unit = self._parse_var(any_token=True)

    # The number is optional (e.g. INFINITE has none)
    prefix = f"{amount} " if amount else ""
    return exp.var(prefix + f"{unit}")
def _parse_system_versioning_property(
    self, with_: bool = False
) -> exp.WithSystemVersioningProperty:
    """Parse the value of a SYSTEM_VERSIONING property.

    Accepts an optional `=`, then either OFF, or an optional ON followed by an optional
    parenthesized option list. Recognized options are HISTORY_TABLE,
    DATA_CONSISTENCY_CHECK and HISTORY_RETENTION_PERIOD, each followed by `=`.

    Args:
        with_: stored under the node's "with" argument.

    Returns:
        The property node; its "on" argument is False only when OFF was matched.
    """
    self._match(TokenType.EQ)
    prop = self.expression(
        exp.WithSystemVersioningProperty,
        **{  # type: ignore
            "on": True,
            "with": with_,
        },
    )

    if self._match_text_seq("OFF"):
        prop.set("on", False)
        return prop

    self._match(TokenType.ON)
    if self._match(TokenType.L_PAREN):
        while self._curr and not self._match(TokenType.R_PAREN):
            index = self._index

            if self._match_text_seq("HISTORY_TABLE", "="):
                prop.set("this", self._parse_table_parts())
            elif self._match_text_seq("DATA_CONSISTENCY_CHECK", "="):
                prop.set("data_consistency", self._advance_any() and self._prev.text.upper())
            elif self._match_text_seq("HISTORY_RETENTION_PERIOD", "="):
                prop.set("retention_period", self._parse_retention_period())

            self._match(TokenType.COMMA)

            if self._index == index:
                # Neither a known option nor a comma was consumed. Nothing else in this
                # loop advances the cursor, so without bailing out it would spin forever.
                self.raise_error("Unexpected token in SYSTEM_VERSIONING options")
                break

    return prop
def _parse_data_deletion_property(self) -> exp.DataDeletionProperty:
    """Parse the value of a DATA_DELETION property.

    Accepts an optional `=`, an optional ON/OFF switch, and an optional parenthesized
    option list. Recognized options are FILTER_COLUMN and RETENTION_PERIOD, each
    followed by `=`.

    Returns:
        The property node; "on" is False only when OFF was matched without a preceding ON.
    """
    self._match(TokenType.EQ)

    on = self._match_text_seq("ON") or not self._match_text_seq("OFF")
    prop = self.expression(exp.DataDeletionProperty, on=on)

    if self._match(TokenType.L_PAREN):
        while self._curr and not self._match(TokenType.R_PAREN):
            index = self._index

            if self._match_text_seq("FILTER_COLUMN", "="):
                prop.set("filter_column", self._parse_column())
            elif self._match_text_seq("RETENTION_PERIOD", "="):
                prop.set("retention_period", self._parse_retention_period())

            self._match(TokenType.COMMA)

            if self._index == index:
                # Neither a known option nor a comma was consumed. Nothing else in this
                # loop advances the cursor, so without bailing out it would spin forever.
                self.raise_error("Unexpected token in DATA_DELETION options")
                break

    return prop
def _parse_distributed_property(self) -> exp.DistributedByProperty:
    """Parse `[BY HASH (<ids>) | BY RANDOM] [BUCKETS {AUTO | <n>}] [<order>]`.

    The kind defaults to "HASH" when neither BY HASH nor BY RANDOM is present.
    """
    hash_columns: t.Optional[t.List[exp.Expression]] = None
    distribution = "HASH"

    if self._match_text_seq("BY", "HASH"):
        hash_columns = self._parse_wrapped_csv(self._parse_id_var)
    elif self._match_text_seq("BY", "RANDOM"):
        distribution = "RANDOM"

    # If the BUCKETS keyword is not present, the number of buckets is AUTO
    has_explicit_count = self._match_text_seq("BUCKETS") and not self._match_text_seq("AUTO")
    bucket_count: t.Optional[exp.Expression] = (
        self._parse_number() if has_explicit_count else None
    )

    ordering = self._parse_order()

    return self.expression(
        exp.DistributedByProperty,
        expressions=hash_columns,
        kind=distribution,
        buckets=bucket_count,
        order=ordering,
    )
def _parse_duplicate(self) -> exp.DuplicateKeyProperty:
    """Parse `[KEY] (<id>, ...)` into a DuplicateKeyProperty."""
    # The KEY keyword is matched if present; its absence is not an error here
    self._match_text_seq("KEY")

    return self.expression(
        exp.DuplicateKeyProperty,
        expressions=self._parse_wrapped_csv(self._parse_id_var, optional=False),
    )
def _parse_with_property(self) -> t.Optional[exp.Expression] | t.List[exp.Expression]:
    """Dispatch on what follows a WITH keyword in a property position.

    The alternatives are tried in a fixed order; the first one that matches decides the
    result. If nothing matches and there is no next token, None is returned; otherwise
    the isolated-loading parser gets the final say.
    """
    if self._match_text_seq("(", "SYSTEM_VERSIONING"):
        versioning = self._parse_system_versioning_property(with_=True)
        self._match_r_paren()
        return versioning

    # advance=False: the parenthesis is only checked here, not consumed
    if self._match(TokenType.L_PAREN, advance=False):
        return self._parse_wrapped_properties()

    if self._match_text_seq("JOURNAL"):
        return self._parse_withjournaltable()

    if self._match_texts(self.VIEW_ATTRIBUTES):
        attribute = self._prev.text.upper()
        return self.expression(exp.ViewAttributeProperty, this=attribute)

    if self._match_text_seq("DATA"):
        return self._parse_withdata(no=False)
    if self._match_text_seq("NO", "DATA"):
        return self._parse_withdata(no=True)

    if self._match(TokenType.SERDE_PROPERTIES, advance=False):
        return self._parse_serde_properties(with_=True)

    if self._match(TokenType.SCHEMA):
        binding = self._parse_var_from_options(self.SCHEMA_BINDING_OPTIONS)
        return self.expression(exp.WithSchemaBindingProperty, this=binding)

    if self._match_texts(self.PROCEDURE_OPTIONS, advance=False):
        procedure_options = self._parse_csv(self._parse_procedure_option)
        return self.expression(exp.WithProcedureOptions, expressions=procedure_options)

    return self._parse_withisolatedloading() if self._next else None
def _parse_procedure_option(self) -> exp.Expression | None:
    """Parse one procedure option.

    `EXECUTE AS <x>` becomes an ExecuteAsProperty, where <x> is one of
    `EXECUTE_AS_OPTIONS` or, failing that, a string. Anything else is parsed as one of
    `PROCEDURE_OPTIONS`.
    """
    if not self._match_text_seq("EXECUTE", "AS"):
        return self._parse_var_from_options(self.PROCEDURE_OPTIONS)

    principal = (
        self._parse_var_from_options(self.EXECUTE_AS_OPTIONS, raise_unmatched=False)
        or self._parse_string()
    )
    return self.expression(exp.ExecuteAsProperty, this=principal)
# https://dev.mysql.com/doc/refman/8.0/en/create-view.html
def _parse_definer(self) -> t.Optional[exp.DefinerProperty]:
    """Parse a definer of the form `[=] <user> <PARAMETER token> <host>`.

    The host is either an identifier or a MOD token, whose text is used verbatim.

    Returns:
        A DefinerProperty whose value is "<user>@<host>", or None if either part is missing.
    """
    self._match(TokenType.EQ)

    user = self._parse_id_var()
    self._match(TokenType.PARAMETER)

    host = self._parse_id_var()
    if not host and self._match(TokenType.MOD):
        host = self._prev.text

    if user and host:
        return exp.DefinerProperty(this=f"{user}@{host}")

    return None
def _parse_withjournaltable(self) -> exp.WithJournalTableProperty:
    """Parse `[TABLE] [=] <table parts>` into a WithJournalTableProperty."""
    # Both tokens are optional, so the match results are ignored
    for optional_token in (TokenType.TABLE, TokenType.EQ):
        self._match(optional_token)

    journal_table = self._parse_table_parts()
    return self.expression(exp.WithJournalTableProperty, this=journal_table)
def _parse_log(self, no: bool = False) -> exp.LogProperty:
    """Build a LogProperty carrying the given `no` flag; no tokens are consumed here."""
    log_property = self.expression(exp.LogProperty, no=no)
    return log_property
def _parse_journal(self, **kwargs) -> exp.JournalProperty:
    """Build a JournalProperty from the given keyword arguments; no tokens are consumed here."""
    journal_property = self.expression(exp.JournalProperty, **kwargs)
    return journal_property
def _parse_checksum(self) -> exp.ChecksumProperty:
    """Parse `[=] [ON | OFF] [DEFAULT]` into a ChecksumProperty.

    "on" is True for ON, False for OFF, and None when neither is present.
    """
    self._match(TokenType.EQ)

    if self._match(TokenType.ON):
        state: t.Optional[bool] = True
    elif self._match_text_seq("OFF"):
        state = False
    else:
        state = None

    is_default = self._match(TokenType.DEFAULT)
    return self.expression(exp.ChecksumProperty, on=state, default=is_default)
def _parse_cluster(self, wrapped: bool = False) -> exp.Cluster:
    """Parse a comma-separated list of ordered expressions into a Cluster node.

    Args:
        wrapped: when True, the list is parsed with `_parse_wrapped_csv` instead of
            `_parse_csv`.
    """
    parse_list = self._parse_wrapped_csv if wrapped else self._parse_csv
    return self.expression(exp.Cluster, expressions=parse_list(self._parse_ordered))
def _parse_clustered_by(self) -> exp.ClusteredByProperty:
    """Parse `[BY] (<columns>) [SORTED BY (<ordered>)] [INTO] <n> [BUCKETS]`."""

    def parse_parenthesized(parse_item: t.Callable) -> t.List:
        # Parenthesized, comma-separated list of items
        self._match_l_paren()
        items = self._parse_csv(parse_item)
        self._match_r_paren()
        return items

    self._match_text_seq("BY")
    columns = parse_parenthesized(self._parse_column)

    ordering = (
        parse_parenthesized(self._parse_ordered)
        if self._match_text_seq("SORTED", "BY")
        else None
    )

    self._match(TokenType.INTO)
    bucket_count = self._parse_number()
    self._match_text_seq("BUCKETS")

    return self.expression(
        exp.ClusteredByProperty,
        expressions=columns,
        sorted_by=ordering,
        buckets=bucket_count,
    )
def _parse_copy_property(self) -> t.Optional[exp.CopyGrantsProperty]:
    """Return a CopyGrantsProperty if GRANTS follows; otherwise step back one token."""
    if self._match_text_seq("GRANTS"):
        return self.expression(exp.CopyGrantsProperty)

    # Not followed by GRANTS: move the cursor back by one token and report no match
    self._retreat(self._index - 1)
    return None
def _parse_freespace(self) -> exp.FreespaceProperty:
    """Parse `[=] <number> [PERCENT]` into a FreespaceProperty."""
    self._match(TokenType.EQ)

    amount = self._parse_number()
    is_percent = self._match(TokenType.PERCENT)
    return self.expression(exp.FreespaceProperty, this=amount, percent=is_percent)
def _parse_mergeblockratio(
    self, no: bool = False, default: bool = False
) -> exp.MergeBlockRatioProperty:
    """Parse a MERGEBLOCKRATIO property.

    With an `=`, the value is `<number> [PERCENT]` and the `no`/`default` flags are not
    recorded. Without one, the node only carries the `no` and `default` flags.
    """
    if not self._match(TokenType.EQ):
        return self.expression(exp.MergeBlockRatioProperty, no=no, default=default)

    ratio = self._parse_number()
    is_percent = self._match(TokenType.PERCENT)
    return self.expression(exp.MergeBlockRatioProperty, this=ratio, percent=is_percent)
def _parse_datablocksize(
    self,
    default: t.Optional[bool] = None,
    minimum: t.Optional[bool] = None,
    maximum: t.Optional[bool] = None,
) -> exp.DataBlocksizeProperty:
    """Parse `[=] <number> [BYTES | KBYTES | KILOBYTES]` into a DataBlocksizeProperty.

    The `default`, `minimum` and `maximum` flags are stored on the node unchanged. The
    unit keyword, if present, is stored with its original casing.
    """
    self._match(TokenType.EQ)
    block_size = self._parse_number()

    unit_text = (
        self._prev.text if self._match_texts(("BYTES", "KBYTES", "KILOBYTES")) else None
    )

    return self.expression(
        exp.DataBlocksizeProperty,
        size=block_size,
        units=unit_text,
        default=default,
        minimum=minimum,
        maximum=maximum,
    )
def _parse_blockcompression(self) -> exp.BlockCompressionProperty:
    """Parse `[=] [ALWAYS] [MANUAL] [NEVER] [DEFAULT] [AUTOTEMP <schema>]`.

    Each keyword is tried once, in that order, and its match result is stored under the
    lower-cased keyword name.
    """
    self._match(TokenType.EQ)

    # Dict order is both the order the keywords are tried in and the argument order
    flags = {
        keyword.lower(): self._match_text_seq(keyword)
        for keyword in ("ALWAYS", "MANUAL", "NEVER", "DEFAULT")
    }

    autotemp = self._parse_schema() if self._match_text_seq("AUTOTEMP") else None

    return self.expression(exp.BlockCompressionProperty, **flags, autotemp=autotemp)
def _parse_withisolatedloading(self) -> t.Optional[exp.IsolatedLoadingProperty]:
    """Parse `[NO] [CONCURRENT] ISOLATED LOADING [<target>]`.

    If ISOLATED LOADING does not follow, the cursor is restored to where it started
    and None is returned.
    """
    start = self._index

    no = self._match_text_seq("NO")
    concurrent = self._match_text_seq("CONCURRENT")

    if self._match_text_seq("ISOLATED", "LOADING"):
        target = self._parse_var_from_options(
            self.ISOLATED_LOADING_OPTIONS, raise_unmatched=False
        )
        return self.expression(
            exp.IsolatedLoadingProperty, no=no, concurrent=concurrent, target=target
        )

    # Undo the optional NO / CONCURRENT matches
    self._retreat(start)
    return None
def _parse_locking(self) -> exp.LockingProperty:
    """Parse `[<kind> [<name>]] [FOR | IN] [<lock type>] [OVERRIDE]` into a LockingProperty.

    Kinds are TABLE, VIEW, ROW and DATABASE; a name is parsed for every kind except ROW.
    Lock types are ACCESS, EXCL/EXCLUSIVE (stored as "EXCLUSIVE"), SHARE, READ, WRITE and
    CHECKSUM. Every part that is absent is stored as None.
    """

    def first_label(candidates: t.Iterable[t.Tuple[t.Callable, t.Any, str]]) -> t.Optional[str]:
        # Try each (matcher, operand) in order and stop at the first one that matches
        for matcher, operand, label in candidates:
            if matcher(operand):
                return label
        return None

    kind = first_label(
        (
            (self._match, TokenType.TABLE, "TABLE"),
            (self._match, TokenType.VIEW, "VIEW"),
            (self._match, TokenType.ROW, "ROW"),
            (self._match_text_seq, "DATABASE", "DATABASE"),
        )
    )

    this = self._parse_table_parts() if kind in ("DATABASE", "TABLE", "VIEW") else None

    for_or_in = first_label(
        (
            (self._match, TokenType.FOR, "FOR"),
            (self._match, TokenType.IN, "IN"),
        )
    )

    lock_type = first_label(
        (
            (self._match_text_seq, "ACCESS", "ACCESS"),
            (self._match_texts, ("EXCL", "EXCLUSIVE"), "EXCLUSIVE"),
            (self._match_text_seq, "SHARE", "SHARE"),
            (self._match_text_seq, "READ", "READ"),
            (self._match_text_seq, "WRITE", "WRITE"),
            (self._match_text_seq, "CHECKSUM", "CHECKSUM"),
        )
    )

    override = self._match_text_seq("OVERRIDE")

    return self.expression(
        exp.LockingProperty,
        this=this,
        kind=kind,
        for_or_in=for_or_in,
        lock_type=lock_type,
        override=override,
    )
def _parse_partition_by(self) -> t.List[exp.Expression]:
    """Parse the expressions after a PARTITION_BY token; an empty list if it is absent."""
    if not self._match(TokenType.PARTITION_BY):
        return []

    return self._parse_csv(self._parse_assignment)
def _parse_partition_bound_spec(self) -> exp.PartitionBoundSpec:
    """Parse a partition bound specification.

    Three forms are recognized:
      - `IN (<expr>, ...)`
      - `FROM (<bound>, ...) [TO] (<bound>, ...)`, where a bound may be MINVALUE or MAXVALUE
      - `WITH (MODULUS <n>, REMAINDER <n>)`

    Anything else is reported through `raise_error`.
    """

    def _parse_partition_bound_expr() -> t.Optional[exp.Expression]:
        for keyword in ("MINVALUE", "MAXVALUE"):
            if self._match_text_seq(keyword):
                return exp.var(keyword)
        return self._parse_bitwise()

    # Key order is the argument order of the resulting node
    spec: t.Dict[str, t.Any] = {
        "this": None,
        "expression": None,
        "from_expressions": None,
        "to_expressions": None,
    }

    if self._match(TokenType.IN):
        spec["this"] = self._parse_wrapped_csv(self._parse_bitwise)
    elif self._match(TokenType.FROM):
        spec["from_expressions"] = self._parse_wrapped_csv(_parse_partition_bound_expr)
        self._match_text_seq("TO")
        spec["to_expressions"] = self._parse_wrapped_csv(_parse_partition_bound_expr)
    elif self._match_text_seq("WITH", "(", "MODULUS"):
        spec["this"] = self._parse_number()
        self._match_text_seq(",", "REMAINDER")
        spec["expression"] = self._parse_number()
        self._match_r_paren()
    else:
        self.raise_error("Failed to parse partition bound spec.")

    return self.expression(exp.PartitionBoundSpec, **spec)
# https://www.postgresql.org/docs/current/sql-createtable.html
def _parse_partitioned_of(self) -> t.Optional[exp.PartitionedOfProperty]:
    """Parse `OF <table> {DEFAULT | FOR VALUES <bound spec>}`.

    Returns:
        A PartitionedOfProperty, or None (after stepping back one token) when OF does
        not follow. If neither DEFAULT nor FOR VALUES is present, the problem is reported
        through `raise_error` and the node is built with no bound expression.
    """
    if not self._match_text_seq("OF"):
        self._retreat(self._index - 1)
        return None

    this = self._parse_table(schema=True)

    # Pre-bound so the return below is safe when `raise_error` records the error and
    # returns instead of raising
    expression: t.Optional[exp.Var | exp.PartitionBoundSpec] = None

    if self._match(TokenType.DEFAULT):
        expression = exp.var("DEFAULT")
    elif self._match_text_seq("FOR", "VALUES"):
        expression = self._parse_partition_bound_spec()
    else:
        self.raise_error("Expecting either DEFAULT or FOR VALUES clause.")

    return self.expression(exp.PartitionedOfProperty, this=this, expression=expression)
def _parse_partitioned_by(self) -> exp.PartitionedByProperty:
    """Parse `[=] <schema>` or, if no schema parses, a (possibly bracketed) field."""
    self._match(TokenType.EQ)

    target = self._parse_schema()
    if not target:
        target = self._parse_bracket(self._parse_field())

    return self.expression(exp.PartitionedByProperty, this=target)
def _parse_withdata(self, no: bool = False) -> exp.WithDataProperty:
    """Parse an optional `AND [NO] STATISTICS` suffix into a WithDataProperty.

    "statistics" is True, False, or None when the suffix is absent.
    """
    suffixes = (
        (("AND", "STATISTICS"), True),
        (("AND", "NO", "STATISTICS"), False),
    )

    for words, flag in suffixes:
        if self._match_text_seq(*words):
            statistics: t.Optional[bool] = flag
            break
    else:
        statistics = None

    return self.expression(exp.WithDataProperty, no=no, statistics=statistics)
def _parse_contains_property(self) -> t.Optional[exp.SqlReadWriteProperty]:
    """Return a 'CONTAINS SQL' property if SQL follows, else None."""
    if not self._match_text_seq("SQL"):
        return None

    return self.expression(exp.SqlReadWriteProperty, this="CONTAINS SQL")
def _parse_modifies_property(self) -> t.Optional[exp.SqlReadWriteProperty]:
    """Return a 'MODIFIES SQL DATA' property if SQL DATA follows, else None."""
    if not self._match_text_seq("SQL", "DATA"):
        return None

    return self.expression(exp.SqlReadWriteProperty, this="MODIFIES SQL DATA")
def _parse_no_property(self) -> t.Optional[exp.Expression]:
    """Parse what follows NO: `PRIMARY INDEX` or `SQL`; None if neither matches."""
    result: t.Optional[exp.Expression] = None

    if self._match_text_seq("PRIMARY", "INDEX"):
        result = exp.NoPrimaryIndexProperty()
    elif self._match_text_seq("SQL"):
        result = self.expression(exp.SqlReadWriteProperty, this="NO SQL")

    return result
def _parse_on_property(self) -> t.Optional[exp.Expression]:
    """Parse what follows ON in a property position.

    `COMMIT PRESERVE ROWS` and `COMMIT DELETE ROWS` produce an OnCommitProperty (the
    latter with delete=True). Otherwise an identifier, optionally followed by a schema,
    is wrapped in an OnProperty.
    """
    if self._match_text_seq("COMMIT", "PRESERVE", "ROWS"):
        on_commit = exp.OnCommitProperty()
    elif self._match_text_seq("COMMIT", "DELETE", "ROWS"):
        on_commit = exp.OnCommitProperty(delete=True)
    else:
        name = self._parse_id_var()
        return self.expression(exp.OnProperty, this=self._parse_schema(name))

    return on_commit
def _parse_reads_property(self) -> t.Optional[exp.SqlReadWriteProperty]:
    """Return a 'READS SQL DATA' property if SQL DATA follows, else None."""
    if not self._match_text_seq("SQL", "DATA"):
        return None

    return self.expression(exp.SqlReadWriteProperty, this="READS SQL DATA")
def _parse_distkey(self) -> exp.DistKeyProperty:
    """Parse a single wrapped identifier into a DistKeyProperty."""
    key = self._parse_wrapped(self._parse_id_var)
    return self.expression(exp.DistKeyProperty, this=key)
def _parse_create_like(self) -> t.Optional[exp.LikeProperty]:
    """Parse `<table> [{INCLUDING | EXCLUDING} <option>]...` into a LikeProperty.

    Returns:
        The LikeProperty, or None if an INCLUDING/EXCLUDING keyword is not followed by
        an identifier.
    """
    source_table = self._parse_table(schema=True)

    like_options = []
    while self._match_texts(("INCLUDING", "EXCLUDING")):
        mode = self._prev.text.upper()

        option = self._parse_id_var()
        if not option:
            return None

        option_name = exp.var(option.this.upper())
        like_options.append(self.expression(exp.Property, this=mode, value=option_name))

    return self.expression(exp.LikeProperty, this=source_table, expressions=like_options)
def _parse_sortkey(self, compound: bool = False) -> exp.SortKeyProperty:
    """Parse a wrapped identifier list into a SortKeyProperty, tagged with `compound`."""
    key_columns = self._parse_wrapped_id_vars()
    return self.expression(exp.SortKeyProperty, this=key_columns, compound=compound)
def _parse_character_set(self, default: bool = False) -> exp.CharacterSetProperty:
    """Parse `[=] <var or string>` into a CharacterSetProperty, tagged with `default`."""
    self._match(TokenType.EQ)

    charset = self._parse_var_or_string()
    return self.expression(exp.CharacterSetProperty, this=charset, default=default)
def _parse_remote_with_connection(self) -> exp.RemoteWithConnectionModelProperty:
    """Parse `[WITH CONNECTION] <table parts>` into a RemoteWithConnectionModelProperty."""
    self._match_text_seq("WITH", "CONNECTION")

    connection = self._parse_table_parts()
    return self.expression(exp.RemoteWithConnectionModelProperty, this=connection)
def _parse_returns(self) -> exp.ReturnsProperty:
    """Parse the value of a RETURNS clause.

    Handles `TABLE <...>` (struct types between angle brackets), `TABLE` followed by an
    optional schema, `NULL ON NULL INPUT`, and a plain type.
    """

    def parse_table_signature() -> t.Optional[exp.Expression]:
        # TABLE followed by anything other than `<` is parsed as a schema named TABLE
        if not self._match(TokenType.LT):
            return self._parse_schema(exp.var("TABLE"))

        columns = self._parse_csv(self._parse_struct_types)
        schema = self.expression(exp.Schema, this="TABLE", expressions=columns)

        if not self._match(TokenType.GT):
            self.raise_error("Expecting >")

        return schema

    returned: t.Optional[exp.Expression] = None
    returns_null = None

    is_table = self._match(TokenType.TABLE)
    if is_table:
        returned = parse_table_signature()
    elif self._match_text_seq("NULL", "ON", "NULL", "INPUT"):
        returns_null = True
    else:
        returned = self._parse_types()

    return self.expression(
        exp.ReturnsProperty, this=returned, is_table=is_table, null=returns_null
    )
def _parse_describe(self) -> exp.Describe:
    """Parse `[<creatable kind>] [<style>] <table> [<properties>] [<partition>]`.

    The kind keeps its original casing; the style is upper-cased.
    """
    object_kind = self._match_set(self.CREATABLES) and self._prev.text
    describe_style = self._match_texts(self.DESCRIBE_STYLES) and self._prev.text.upper()

    if self._match(TokenType.DOT):
        # A dot follows immediately: drop the style and rewind over the dot and the
        # token before it, so both are parsed as part of the table below
        describe_style = None
        self._retreat(self._index - 2)

    target = self._parse_table(schema=True)

    properties = self._parse_properties()
    property_list = properties.expressions if properties else None

    partition = self._parse_partition()

    return self.expression(
        exp.Describe,
        this=target,
        style=describe_style,
        kind=object_kind,
        expressions=property_list,
        partition=partition,
    )
def _parse_multitable_inserts(self, comments: t.Optional[t.List[str]]) -> exp.MultitableInserts:
    """Parse the body of a multi-table INSERT.

    The upper-cased text of the previously consumed token becomes the node's kind. It is
    followed by any number of `[WHEN <cond> [THEN]] [ELSE] INTO <table> [<values>]`
    branches and finally the source table.

    Args:
        comments: comments to attach to the resulting node.
    """
    insert_kind = self._prev.text.upper()

    def parse_conditional_insert() -> t.Optional[exp.ConditionalInsert]:
        condition = None
        if self._match(TokenType.WHEN):
            condition = self._parse_disjunction()
            self._match(TokenType.THEN)

        is_else = self._match(TokenType.ELSE)

        # No INTO means there are no more branches
        if not self._match(TokenType.INTO):
            return None

        target = self._parse_table(schema=True)
        values = self._parse_derived_table_values()
        insert = self.expression(exp.Insert, this=target, expression=values)

        return self.expression(
            exp.ConditionalInsert,
            this=insert,
            expression=condition,
            else_=is_else,
        )

    branches = list(iter(parse_conditional_insert, None))

    return self.expression(
        exp.MultitableInserts,
        kind=insert_kind,
        comments=comments,
        expressions=branches,
        source=self._parse_table(),
    )
def _parse_insert(self) -> t.Union[exp.Insert, exp.MultitableInserts]:
    """Parse the remainder of an INSERT statement.

    Returns:
        An Insert node, or the result of `_parse_multitable_inserts` when a FIRST or ALL
        token follows the leading modifiers and no DIRECTORY target was given.
    """
    # Start from the comments attached to the previously consumed token
    comments = ensure_list(self._prev_comments)
    hint = self._parse_hint()
    overwrite = self._match(TokenType.OVERWRITE)
    ignore = self._match(TokenType.IGNORE)
    local = self._match_text_seq("LOCAL")
    alternative = None
    is_function = None
    if self._match_text_seq("DIRECTORY"):
        # The target is a directory rather than a table or function
        this: t.Optional[exp.Expression] = self.expression(
            exp.Directory,
            this=self._parse_var_or_string(),
            local=local,
            row_format=self._parse_row_format(match_row=True),
        )
    else:
        if self._match_set((TokenType.FIRST, TokenType.ALL)):
            comments += ensure_list(self._prev_comments)
            return self._parse_multitable_inserts(comments)
        if self._match(TokenType.OR):
            # The alternative keeps the original casing of the matched keyword
            alternative = self._match_texts(self.INSERT_ALTERNATIVES) and self._prev.text
        self._match(TokenType.INTO)
        comments += ensure_list(self._prev_comments)
        self._match(TokenType.TABLE)
        is_function = self._match(TokenType.FUNCTION)
        this = (
            self._parse_table(schema=True, parse_partition=True)
            if not is_function
            else self._parse_function()
        )
    # RETURNING is tried here and, if absent, again after the conflict clause below
    returning = self._parse_returning()
    # NOTE: keyword arguments are evaluated left to right, so the order below is the
    # order in which the remaining clauses are consumed from the token stream
    return self.expression(
        exp.Insert,
        comments=comments,
        hint=hint,
        is_function=is_function,
        this=this,
        stored=self._match_text_seq("STORED") and self._parse_stored(),
        by_name=self._match_text_seq("BY", "NAME"),
        exists=self._parse_exists(),
        where=self._match_pair(TokenType.REPLACE, TokenType.WHERE) and self._parse_assignment(),
        partition=self._match(TokenType.PARTITION_BY) and self._parse_partitioned_by(),
        settings=self._match_text_seq("SETTINGS") and self._parse_settings_property(),
        expression=self._parse_derived_table_values() or self._parse_ddl_select(),
        conflict=self._parse_on_conflict(),
        returning=returning or self._parse_returning(),
        overwrite=overwrite,
        alternative=alternative,
        ignore=ignore,
        source=self._match(TokenType.TABLE) and self._parse_table(),
    )
def _parse_kill(self) -> exp.Kill:
    """Parse `[CONNECTION | QUERY] <primary>` into a Kill node.

    The kind, when present, keeps the original casing of the keyword.
    """
    kind = None
    if self._match_texts(("CONNECTION", "QUERY")):
        kind = exp.var(self._prev.text)

    target = self._parse_primary()
    return self.expression(exp.Kill, this=target, kind=kind)
def _parse_on_conflict(self) -> t.Optional[exp.OnConflict]:
    """Parse an `ON CONFLICT ...` or `ON DUPLICATE KEY ...` clause.

    For ON CONFLICT, an optional `ON CONSTRAINT <name>` or parenthesized key list may
    follow. Then comes an action from `CONFLICT_ACTIONS`; if the last token consumed
    while parsing it is UPDATE, an optional SET and a list of assignments are parsed.

    Returns:
        An OnConflict node, or None if neither introducer is present.
    """
    is_conflict = self._match_text_seq("ON", "CONFLICT")
    is_duplicate = self._match_text_seq("ON", "DUPLICATE", "KEY")

    if not (is_conflict or is_duplicate):
        return None

    key_columns = None
    constraint_name = None

    if is_conflict:
        if self._match_text_seq("ON", "CONSTRAINT"):
            constraint_name = self._parse_id_var()
        elif self._match(TokenType.L_PAREN):
            key_columns = self._parse_csv(self._parse_id_var)
            self._match_r_paren()

    action = self._parse_var_from_options(self.CONFLICT_ACTIONS)

    assignments = None
    if self._prev.token_type == TokenType.UPDATE:
        self._match(TokenType.SET)
        assignments = self._parse_csv(self._parse_equality)

    return self.expression(
        exp.OnConflict,
        duplicate=is_duplicate,
        expressions=assignments,
        action=action,
        conflict_keys=key_columns,
        constraint=constraint_name,
    )
def _parse_returning(self) -> t.Optional[exp.Returning]:
    """Parse `RETURNING <expr>, ... [INTO <table part>]`; None if RETURNING is absent."""
    if self._match(TokenType.RETURNING):
        returned = self._parse_csv(self._parse_expression)
        destination = self._match(TokenType.INTO) and self._parse_table_part()
        return self.expression(exp.Returning, expressions=returned, into=destination)

    return None
def _parse_row(self) -> t.Optional[exp.RowFormatSerdeProperty | exp.RowFormatDelimitedProperty]:
    """Parse a row format if a FORMAT token follows; otherwise return None."""
    if self._match(TokenType.FORMAT):
        return self._parse_row_format()

    return None
def _parse_serde_properties(self, with_: bool = False) -> t.Optional[exp.SerdeProperties]:
    """Parse `[WITH] SERDEPROPERTIES (<properties>)`.

    Args:
        with_: True if the caller already saw WITH; otherwise WITH is matched here if present.

    Returns:
        A SerdeProperties node, or None (with the cursor restored) when the
        SERDE_PROPERTIES token does not follow.
    """
    start = self._index
    has_with = with_ or self._match_text_seq("WITH")

    if self._match(TokenType.SERDE_PROPERTIES):
        return self.expression(
            exp.SerdeProperties,
            **{  # type: ignore
                "expressions": self._parse_wrapped_properties(),
                "with": has_with,
            },
        )

    self._retreat(start)
    return None
def _parse_row_format(
    self, match_row: bool = False
) -> t.Optional[exp.RowFormatSerdeProperty | exp.RowFormatDelimitedProperty]:
    """Parse a row format: either `SERDE <string> [<serde properties>]` or `[DELIMITED] ...`.

    Args:
        match_row: when True, a ROW FORMAT token pair must come first; None is returned
            if it does not.
    """
    if match_row and not self._match_pair(TokenType.ROW, TokenType.FORMAT):
        return None

    if self._match_text_seq("SERDE"):
        serde_class = self._parse_string()
        return self.expression(
            exp.RowFormatSerdeProperty,
            this=serde_class,
            serde_properties=self._parse_serde_properties(),
        )

    self._match_text_seq("DELIMITED")

    # (keyword sequence, node argument) pairs; each is tried once, in this order
    clauses = (
        (("FIELDS", "TERMINATED", "BY"), "fields"),
        (("ESCAPED", "BY"), "escaped"),
        (("COLLECTION", "ITEMS", "TERMINATED", "BY"), "collection_items"),
        (("MAP", "KEYS", "TERMINATED", "BY"), "map_keys"),
        (("LINES", "TERMINATED", "BY"), "lines"),
        (("NULL", "DEFINED", "AS"), "null"),
    )

    delimiters = {}
    for words, arg_name in clauses:
        if self._match_text_seq(*words):
            delimiters[arg_name] = self._parse_string()

    return self.expression(exp.RowFormatDelimitedProperty, **delimiters)  # type: ignore
def _parse_load(self) -> exp.LoadData | exp.Command:
    """Parse a LOAD statement.

    `LOAD DATA [LOCAL] [INPATH] <string> [OVERWRITE] [INTO TABLE] <table> [<partition>]
    [INPUTFORMAT <string>] [SERDE <string>]` becomes a LoadData node. Any other LOAD
    statement is handed to `_parse_as_command` with the previously consumed token.
    """
    if not self._match_text_seq("DATA"):
        return self._parse_as_command(self._prev)

    is_local = self._match_text_seq("LOCAL")
    self._match_text_seq("INPATH")
    path = self._parse_string()
    is_overwrite = self._match(TokenType.OVERWRITE)
    self._match_pair(TokenType.INTO, TokenType.TABLE)

    # The remaining clauses are consumed in exactly this order
    target = self._parse_table(schema=True)
    partition = self._parse_partition()
    input_format = self._match_text_seq("INPUTFORMAT") and self._parse_string()
    serde = self._match_text_seq("SERDE") and self._parse_string()

    return self.expression(
        exp.LoadData,
        this=target,
        local=is_local,
        overwrite=is_overwrite,
        inpath=path,
        partition=partition,
        input_format=input_format,
        serde=serde,
    )
def _parse_delete(self) -> exp.Delete:
    """Parse the remainder of a DELETE statement.

    Supports a table list before FROM (MySQL's "Multiple-Table Syntax",
    https://dev.mysql.com/doc/refman/8.0/en/delete.html), and a RETURNING clause either
    before FROM or after WHERE.
    """
    comments = self._prev_comments

    # Anything between DELETE and FROM is a list of tables
    tables = None
    if not self._match(TokenType.FROM, advance=False):
        tables = self._parse_csv(self._parse_table) or None

    early_returning = self._parse_returning()

    # The remaining clauses are consumed in exactly this order
    target = self._match(TokenType.FROM) and self._parse_table(joins=True)
    using = self._match(TokenType.USING) and self._parse_table(joins=True)
    cluster = self._match(TokenType.ON) and self._parse_on_property()
    where = self._parse_where()
    returning = early_returning or self._parse_returning()
    limit = self._parse_limit()

    return self.expression(
        exp.Delete,
        comments=comments,
        tables=tables,
        this=target,
        using=using,
        cluster=cluster,
        where=where,
        returning=returning,
        limit=limit,
    )
def _parse_update(self) -> exp.Update:
    """Parse the remainder of an UPDATE statement.

    A RETURNING clause is accepted either right after the SET list or after WHERE.
    """
    comments = self._prev_comments

    target = self._parse_table(joins=True, alias_tokens=self.UPDATE_ALIAS_TOKENS)
    assignments = self._match(TokenType.SET) and self._parse_csv(self._parse_equality)
    early_returning = self._parse_returning()

    # The remaining clauses are consumed in exactly this order
    from_ = self._parse_from(joins=True)
    where = self._parse_where()
    returning = early_returning or self._parse_returning()
    order = self._parse_order()
    limit = self._parse_limit()

    return self.expression(
        exp.Update,
        comments=comments,
        **{  # type: ignore
            "this": target,
            "expressions": assignments,
            "from": from_,
            "where": where,
            "returning": returning,
            "order": order,
            "limit": limit,
        },
    )
def _parse_uncache(self) -> exp.Uncache:
    """Parse `TABLE [<exists clause>] <table>` after UNCACHE; a missing TABLE is an error."""
    has_table_keyword = self._match(TokenType.TABLE)
    if not has_table_keyword:
        self.raise_error("Expecting TABLE after UNCACHE")

    exists = self._parse_exists()
    target = self._parse_table(schema=True)
    return self.expression(exp.Uncache, exists=exists, this=target)
def _parse_cache(self) -> exp.Cache:
    """Parse `[LAZY] [TABLE] <table> [OPTIONS (<string> [=] <string>)] [AS] <select>`.

    Only a single key/value pair is read from the OPTIONS clause; it is stored as the
    two-element list [key, value]. Without OPTIONS the list is empty.
    """
    is_lazy = self._match_text_seq("LAZY")
    self._match(TokenType.TABLE)
    target = self._parse_table(schema=True)

    cache_options = []
    if self._match_text_seq("OPTIONS"):
        self._match_l_paren()
        option_key = self._parse_string()
        self._match(TokenType.EQ)
        option_value = self._parse_string()
        self._match_r_paren()
        cache_options = [option_key, option_value]

    self._match(TokenType.ALIAS)
    query = self._parse_select(nested=True)

    return self.expression(
        exp.Cache,
        this=target,
        lazy=is_lazy,
        options=cache_options,
        expression=query,
    )
def _parse_partition(self) -> t.Optional[exp.Partition]:
    """Parse `PARTITION (<assignment>, ...)`; None if the PARTITION token is absent."""
    if self._match(TokenType.PARTITION):
        partition_values = self._parse_wrapped_csv(self._parse_assignment)
        return self.expression(exp.Partition, expressions=partition_values)

    return None
def _parse_value(self) -> t.Optional[exp.Tuple]:
    """Parse one row of values into a Tuple.

    A parenthesized list yields one Tuple with all of its expressions. A bare expression
    yields a single-element Tuple. None is returned when nothing can be parsed.
    """
    if self._match(TokenType.L_PAREN):
        items = self._parse_csv(self._parse_expression)
        self._match_r_paren()
    else:
        # In some dialects we can have VALUES 1, 2 which results in 1 column & 2 rows.
        single = self._parse_expression()
        if not single:
            return None
        items = [single]

    return self.expression(exp.Tuple, expressions=items)
def _parse_projections(self) -> t.List[exp.Expression]:
    """Parse the projection list of a SELECT by delegating to `_parse_expressions`."""
    projections = self._parse_expressions()
    return projections
def _parse_select(
    self,
    nested: bool = False,
    table: bool = False,
    parse_subquery_alias: bool = True,
    parse_set_operation: bool = True,
) -> t.Optional[exp.Expression]:
    """Parse a SELECT-like query, optionally preceded by a WITH clause.

    Besides plain SELECT, this handles a leading FROM, a parenthesized query or table,
    VALUES, SUMMARIZE, DESCRIBE and STREAM.

    Args:
        nested: allow a parenthesized query at this position.
        table: allow a parenthesized table at this position; its contents are then parsed
            with `_parse_table` instead of a recursive `_parse_select`.
        parse_subquery_alias: forwarded to `_parse_subquery` as `parse_alias` for the
            parenthesized case.
        parse_set_operation: when True, the result is passed through
            `_parse_set_operations` before being returned.

    Returns:
        The parsed expression, or None if nothing at the current position matches.
    """
    cte = self._parse_with()
    if cte:
        # A WITH clause was parsed; whatever statement follows must accept a "with" arg
        this = self._parse_statement()
        if not this:
            self.raise_error("Failed to parse any statement following CTE")
            return cte
        if "with" in this.arg_types:
            this.set("with", cte)
        else:
            self.raise_error(f"{this.key} does not support CTE")
            this = cte
        return this
    # duckdb supports leading with FROM x
    from_ = self._parse_from() if self._match(TokenType.FROM, advance=False) else None
    if self._match(TokenType.SELECT):
        comments = self._prev_comments
        hint = self._parse_hint()
        # ALL / DISTINCT are only matched when the following token is not a dot
        if self._next and not self._next.token_type == TokenType.DOT:
            all_ = self._match(TokenType.ALL)
            distinct = self._match_set(self.DISTINCT_TOKENS)
        else:
            all_, distinct = None, None
        # An ALIAS token followed by STRUCT or VALUE sets the select's kind
        kind = (
            self._match(TokenType.ALIAS)
            and self._match_texts(("STRUCT", "VALUE"))
            and self._prev.text.upper()
        )
        if distinct:
            distinct = self.expression(
                exp.Distinct,
                on=self._parse_value() if self._match(TokenType.ON) else None,
            )
        if all_ and distinct:
            self.raise_error("Cannot specify both ALL and DISTINCT after SELECT")
        operation_modifiers = []
        while self._curr and self._match_texts(self.OPERATION_MODIFIERS):
            operation_modifiers.append(exp.var(self._prev.text.upper()))
        # A limit that appears before the projections (parsed with top=True)
        limit = self._parse_limit(top=True)
        projections = self._parse_projections()
        this = self.expression(
            exp.Select,
            kind=kind,
            hint=hint,
            distinct=distinct,
            expressions=projections,
            limit=limit,
            operation_modifiers=operation_modifiers or None,
        )
        this.comments = comments
        into = self._parse_into()
        if into:
            this.set("into", into)
        # Use the leading FROM if there was one, otherwise look for a trailing one
        if not from_:
            from_ = self._parse_from()
        if from_:
            this.set("from", from_)
        this = self._parse_query_modifiers(this)
    elif (table or nested) and self._match(TokenType.L_PAREN):
        if self._match(TokenType.PIVOT):
            this = self._parse_simplified_pivot()
        elif self._match(TokenType.FROM):
            this = exp.select("*").from_(
                t.cast(exp.From, self._parse_from(skip_from_token=True))
            )
        else:
            this = (
                self._parse_table()
                if table
                else self._parse_select(nested=True, parse_set_operation=False)
            )
            # Transform exp.Values into a exp.Table to pass through parse_query_modifiers
            # in case a modifier (e.g. join) is following
            if table and isinstance(this, exp.Values) and this.alias:
                alias = this.args["alias"].pop()
                this = exp.Table(this=this, alias=alias)
            this = self._parse_query_modifiers(self._parse_set_operations(this))
        self._match_r_paren()
        # We return early here so that the UNION isn't attached to the subquery by the
        # following call to _parse_set_operations, but instead becomes the parent node
        return self._parse_subquery(this, parse_alias=parse_subquery_alias)
    elif self._match(TokenType.VALUES, advance=False):
        this = self._parse_derived_table_values()
    elif from_:
        # A leading FROM with no SELECT is treated as SELECT * FROM ...
        this = exp.select("*").from_(from_.this, copy=False)
    elif self._match(TokenType.SUMMARIZE):
        table = self._match(TokenType.TABLE)
        this = self._parse_select() or self._parse_string() or self._parse_table()
        return self.expression(exp.Summarize, this=this, table=table)
    elif self._match(TokenType.DESCRIBE):
        this = self._parse_describe()
    elif self._match_text_seq("STREAM"):
        this = self._parse_function()
        if this:
            this = self.expression(exp.Stream, this=this)
        else:
            # STREAM was not followed by a function: step back over the keyword
            self._retreat(self._index - 1)
    else:
        this = None
    return self._parse_set_operations(this) if parse_set_operation else this
def _parse_with(self, skip_with_token: bool = False) -> t.Optional[exp.With]:
    """Parse a WITH clause and its list of CTEs.

    CTEs may be separated by a comma, by a repeated WITH, or by a comma followed by WITH.
    Comments attached to a separator are added to the CTE that follows it.

    Args:
        skip_with_token: when True, the WITH token is assumed to have been consumed already.

    Returns:
        A With node, or None if a WITH token was required but not found.
    """
    if not (skip_with_token or self._match(TokenType.WITH)):
        return None

    comments = self._prev_comments
    recursive = self._match(TokenType.RECURSIVE)

    ctes = []
    pending_comments = None

    while True:
        cte = self._parse_cte()
        if pending_comments:
            cte.add_comments(pending_comments)
        ctes.append(cte)

        if not (self._match(TokenType.COMMA) or self._match(TokenType.WITH)):
            break

        # Tolerate a WITH right after the separator as well
        self._match(TokenType.WITH)
        pending_comments = self._prev_comments

    return self.expression(exp.With, comments=comments, expressions=ctes, recursive=recursive)
def _parse_cte(self) -> exp.CTE:
    """Parse one CTE: `<alias> [AS] [[NOT] MATERIALIZED] (<statement>)`.

    A missing alias is reported through `raise_error`. "materialized" is True, False,
    or None when no materialization hint is given.
    """
    alias = self._parse_table_alias(self.ID_VAR_TOKENS)
    if not (alias and alias.this):
        self.raise_error("Expected CTE to have alias")

    self._match(TokenType.ALIAS)
    # Captured right after the optional alias token, before any further token is consumed
    comments = self._prev_comments

    materialized: t.Optional[bool] = None
    if self._match_text_seq("NOT", "MATERIALIZED"):
        materialized = False
    elif self._match_text_seq("MATERIALIZED"):
        materialized = True

    statement = self._parse_wrapped(self._parse_statement)

    return self.expression(
        exp.CTE,
        this=statement,
        alias=alias,
        materialized=materialized,
        comments=comments,
    )
def _parse_table_alias(
    self, alias_tokens: t.Optional[t.Collection[TokenType]] = None
) -> t.Optional[exp.TableAlias]:
    """Parse `[AS] <alias> [(<column>, ...)]` into a TableAlias.

    Args:
        alias_tokens: token types accepted as the alias; defaults to `TABLE_ALIAS_TOKENS`.

    Returns:
        A TableAlias, or None when neither an alias nor a column list is found.
    """
    saw_alias_token = self._match(TokenType.ALIAS)

    alias = self._parse_id_var(
        any_token=saw_alias_token, tokens=alias_tokens or self.TABLE_ALIAS_TOKENS
    )
    if not alias:
        alias = self._parse_string_as_identifier()

    columns = None
    before_paren = self._index
    if self._match(TokenType.L_PAREN):
        columns = self._parse_csv(self._parse_function_parameter)
        if columns:
            self._match_r_paren()
        else:
            # Nothing parsed inside the parenthesis: put the cursor back before it
            self._retreat(before_paren)

    if not (alias or columns):
        return None

    table_alias = self.expression(exp.TableAlias, this=alias, columns=columns)

    # We bubble up comments from the Identifier to the TableAlias
    if isinstance(alias, exp.Identifier):
        table_alias.add_comments(alias.pop_comments())

    return table_alias
def _parse_subquery(
    self, this: t.Optional[exp.Expression], parse_alias: bool = True
) -> t.Optional[exp.Subquery]:
    """Wrap `this` in a Subquery, parsing trailing pivots, an alias and a table sample.

    Args:
        this: the expression to wrap; None is returned if it is falsy.
        parse_alias: whether to try parsing a table alias after the pivots.
    """
    if not this:
        return None

    # The three suffixes are consumed in exactly this order
    pivots = self._parse_pivots()
    alias = self._parse_table_alias() if parse_alias else None
    sample = self._parse_table_sample()

    return self.expression(exp.Subquery, this=this, pivots=pivots, alias=alias, sample=sample)
def _implicit_unnests_to_explicit(self, this: E) -> E:
    """Rewrite joined tables that refer to an earlier relation into explicit Unnest nodes.

    A joined Table without an ON condition whose first name part (after identifier
    normalization) matches the alias or name of the FROM source, or of any earlier
    join, is replaced in place by an Unnest of the corresponding column.

    Args:
        this: a query expression that has a "from" arg.

    Returns:
        The same expression, modified in place.
    """
    from sqlglot.optimizer.normalize_identifiers import normalize_identifiers as _norm

    # Names that later joins may refer to, seeded with the FROM source
    refs = {_norm(this.args["from"].this.copy(), dialect=self.dialect).alias_or_name}

    for join in this.args.get("joins") or []:
        table = join.this

        # Normalize a copy so that the original node keeps its spelling
        normalized_table = table.copy()
        normalized_table.meta["maybe_column"] = True
        normalized_table = _norm(normalized_table, dialect=self.dialect)

        if isinstance(table, exp.Table) and not join.args.get("on"):
            if normalized_table.parts[0].name in refs:
                table_as_column = table.to_column()
                unnest = exp.Unnest(expressions=[table_as_column])

                # Table.to_column creates a parent Alias node that we want to convert to
                # a TableAlias and attach to the Unnest, so it matches the parser's output
                if isinstance(table.args.get("alias"), exp.TableAlias):
                    table_as_column.replace(table_as_column.this)
                    exp.alias_(unnest, None, table=[table.args["alias"].this], copy=False)

                table.replace(unnest)

        refs.add(normalized_table.alias_or_name)

    return this
def _parse_query_modifiers(
    self, this: t.Optional[exp.Expression]
) -> t.Optional[exp.Expression]:
    """Attach joins, laterals and any further query modifiers to `this`.

    Modifiers are only parsed when `this` is a Query or a Table. Each one is looked up
    in `QUERY_MODIFIER_PARSERS` by the current token's type and stored under the key
    its parser returns. An offset carried by a parsed limit is split out into a
    separate Offset node.

    Returns:
        `this` with modifiers attached, additionally passed through
        `_implicit_unnests_to_explicit` when the dialect supports implicit unnests and
        the expression has a "from" arg.
    """
    if isinstance(this, (exp.Query, exp.Table)):
        for join in self._parse_joins():
            this.append("joins", join)
        # Keep parsing laterals until the parser returns None
        for lateral in iter(self._parse_lateral, None):
            this.append("laterals", lateral)
        while True:
            if self._match_set(self.QUERY_MODIFIER_PARSERS, advance=False):
                parser = self.QUERY_MODIFIER_PARSERS[self._curr.token_type]
                key, expression = parser(self)
                if expression:
                    this.set(key, expression)
                    if key == "limit":
                        # Move an offset found on the limit onto the query itself
                        offset = expression.args.pop("offset", None)
                        if offset:
                            offset = exp.Offset(expression=offset)
                            this.set("offset", offset)
                            # The limit's `expressions` are moved to the offset as well
                            limit_by_expressions = expression.expressions
                            expression.set("expressions", None)
                            offset.set("expressions", limit_by_expressions)
                    continue
            # Either no modifier token follows, or its parser produced nothing
            break
    if self.SUPPORTS_IMPLICIT_UNNEST and this and this.args.get("from"):
        this = self._implicit_unnests_to_explicit(this)
    return this
def _parse_hint(self) -> t.Optional[exp.Hint]:
    """Parse a hint block, from a HINT token up to the closing `*/`.

    Each hint is a function call or, failing that, an upper-cased var. Comma-separated
    groups are collected until a group comes back empty.

    Returns:
        A Hint node, or None if there is no HINT token.
    """
    if not self._match(TokenType.HINT):
        return None

    def parse_hint_item() -> t.Optional[exp.Expression]:
        return self._parse_function() or self._parse_var(upper=True)

    hints = []
    group = self._parse_csv(parse_hint_item)
    while group:
        hints.extend(group)
        group = self._parse_csv(parse_hint_item)

    if not self._match_pair(TokenType.STAR, TokenType.SLASH):
        self.raise_error("Expected */ after HINT")

    return self.expression(exp.Hint, expressions=hints)
def _parse_into(self) -> t.Optional[exp.Into]:
    """Parse `INTO [TEMPORARY] [UNLOGGED] [TABLE] <table>`; None if INTO is absent."""
    if not self._match(TokenType.INTO):
        return None

    is_temporary = self._match(TokenType.TEMPORARY)
    is_unlogged = self._match_text_seq("UNLOGGED")
    self._match(TokenType.TABLE)

    target = self._parse_table(schema=True)
    return self.expression(exp.Into, this=target, temporary=is_temporary, unlogged=is_unlogged)
def _parse_from(
    self, joins: bool = False, skip_from_token: bool = False
) -> t.Optional[exp.From]:
    """
    Parse a FROM clause into a From node.

    Args:
        joins: Forwarded to `_parse_table`.
        skip_from_token: When True, the FROM token is not required (nor matched).

    Returns:
        The From node, or None if the FROM token was required but not found.
    """
    if not (skip_from_token or self._match(TokenType.FROM)):
        return None

    # The comments are read before the table is parsed, as in a single call expression
    comments = self._prev_comments
    table = self._parse_table(joins=joins)
    return self.expression(exp.From, comments=comments, this=table)
def _parse_match_recognize_measure(self) -> exp.MatchRecognizeMeasure:
    """Parse one MEASURES entry: an optional FINAL / RUNNING prefix, then an expression."""
    # Holds the upper-cased keyword when FINAL or RUNNING was matched
    window_frame = self._match_texts(("FINAL", "RUNNING")) and self._prev.text.upper()
    measure = self._parse_expression()

    return self.expression(exp.MatchRecognizeMeasure, window_frame=window_frame, this=measure)
def _parse_match_recognize(self) -> t.Optional[exp.MatchRecognize]:
    """
    Parse a MATCH_RECOGNIZE clause into a MatchRecognize node.

    Returns None when the current token is not MATCH_RECOGNIZE. The sub-clauses are read in
    this order: PARTITION BY, ORDER BY, MEASURES, rows-per-match, AFTER MATCH SKIP, PATTERN,
    DEFINE, followed by the closing paren and an optional table alias. The PATTERN body is
    not parsed; its raw SQL text is captured into a var.
    """
    if not self._match(TokenType.MATCH_RECOGNIZE):
        return None

    self._match_l_paren()

    partition = self._parse_partition_by()
    order = self._parse_order()

    measures = None
    if self._match_text_seq("MEASURES"):
        measures = self._parse_csv(self._parse_match_recognize_measure)

    rows = None
    if self._match_text_seq("ONE", "ROW", "PER", "MATCH"):
        rows = exp.var("ONE ROW PER MATCH")
    elif self._match_text_seq("ALL", "ROWS", "PER", "MATCH"):
        rows_text = "ALL ROWS PER MATCH"
        if self._match_text_seq("SHOW", "EMPTY", "MATCHES"):
            rows_text += " SHOW EMPTY MATCHES"
        elif self._match_text_seq("OMIT", "EMPTY", "MATCHES"):
            rows_text += " OMIT EMPTY MATCHES"
        elif self._match_text_seq("WITH", "UNMATCHED", "ROWS"):
            rows_text += " WITH UNMATCHED ROWS"
        rows = exp.var(rows_text)

    after = None
    if self._match_text_seq("AFTER", "MATCH", "SKIP"):
        skip_text = "AFTER MATCH SKIP"
        if self._match_text_seq("PAST", "LAST", "ROW"):
            skip_text += " PAST LAST ROW"
        elif self._match_text_seq("TO", "NEXT", "ROW"):
            skip_text += " TO NEXT ROW"
        elif self._match_text_seq("TO", "FIRST"):
            skip_text += f" TO FIRST {self._advance_any().text}"  # type: ignore
        elif self._match_text_seq("TO", "LAST"):
            skip_text += f" TO LAST {self._advance_any().text}"  # type: ignore
        after = exp.var(skip_text)

    pattern = None
    if self._match_text_seq("PATTERN"):
        self._match_l_paren()

        if not self._curr:
            self.raise_error("Expecting )", self._curr)

        # Walk the tokens until the paren opened above is balanced again
        depth = 1
        start = self._curr

        while self._curr and depth > 0:
            current_type = self._curr.token_type
            if current_type == TokenType.L_PAREN:
                depth += 1
            elif current_type == TokenType.R_PAREN:
                depth -= 1

            # `end` trails one token behind, so the closing paren is left out of the text
            end = self._prev
            self._advance()

        if depth > 0:
            self.raise_error("Expecting )", self._curr)

        pattern = exp.var(self._find_sql(start, end))

    define = None
    if self._match_text_seq("DEFINE"):
        define = self._parse_csv(self._parse_name_as_expression)

    self._match_r_paren()
    alias = self._parse_table_alias()

    return self.expression(
        exp.MatchRecognize,
        partition_by=partition,
        order=order,
        measures=measures,
        rows=rows,
        after=after,
        pattern=pattern,
        define=define,
        alias=alias,
    )
def _parse_lateral(self) -> t.Optional[exp.Lateral]:
    """
    Parse CROSS APPLY, OUTER APPLY or LATERAL [VIEW] [OUTER] into a Lateral node.

    Returns None when none of these introducers is found. `cross_apply` is truthy for
    CROSS APPLY, False for OUTER APPLY and keeps the failed-match value otherwise.
    """
    cross_apply = self._match_pair(TokenType.CROSS, TokenType.APPLY)
    if not cross_apply and self._match_pair(TokenType.OUTER, TokenType.APPLY):
        cross_apply = False

    view = None
    outer = None
    if cross_apply is not None:
        this = self._parse_select(table=True)
    elif self._match(TokenType.LATERAL):
        this = self._parse_select(table=True)
        view = self._match(TokenType.VIEW)
        outer = self._match(TokenType.OUTER)
    else:
        return None

    if not this:
        # No subquery was found: fall back to an unnest, a function or an identifier,
        # optionally followed by a chain of dotted parts
        this = (
            self._parse_unnest()
            or self._parse_function()
            or self._parse_id_var(any_token=False)
        )

        while self._match(TokenType.DOT):
            member = self._parse_function() or self._parse_id_var(any_token=False)
            this = exp.Dot(this=this, expression=member)

    table_alias: t.Optional[exp.TableAlias]
    if view:
        alias_name = self._parse_id_var(any_token=False)
        alias_columns = (
            self._parse_csv(self._parse_id_var) if self._match(TokenType.ALIAS) else []
        )
        table_alias = self.expression(exp.TableAlias, this=alias_name, columns=alias_columns)
    elif isinstance(this, (exp.Subquery, exp.Unnest)) and this.alias:
        # We move the alias from the lateral's child node to the lateral itself
        table_alias = this.args["alias"].pop()
    else:
        table_alias = self._parse_table_alias()

    return self.expression(
        exp.Lateral,
        this=this,
        view=view,
        outer=outer,
        alias=table_alias,
        cross_apply=cross_apply,
    )
def _parse_join_parts(
self,
) -> t.Tuple[t.Optional[Token], t.Optional[Token], t.Optional[Token]]:
return (
self._match_set(self.JOIN_METHODS) and self._prev,
self._match_set(self.JOIN_SIDES) and self._prev,
self._match_set(self.JOIN_KINDS) and self._prev,
)
def _parse_using_identifiers(self) -> t.List[exp.Expression]:
    """Parse an optionally parenthesized, comma-separated list of USING columns."""

    def _parse_column_as_identifier() -> t.Optional[exp.Expression]:
        # Column nodes are unwrapped to the node stored in their `this` arg
        column = self._parse_column()
        return column.this if isinstance(column, exp.Column) else column

    return self._parse_wrapped_csv(_parse_column_as_identifier, optional=True)
def _parse_join(
    self, skip_join_token: bool = False, parse_bracket: bool = False
) -> t.Optional[exp.Join]:
    """
    Parse a single join into a Join node.

    A leading comma yields a Join wrapping only the next table. Otherwise the optional
    method / side / kind / hint prefix and the JOIN token are matched; OUTER APPLY and
    CROSS APPLY are only peeked at (they are not consumed here).

    Args:
        skip_join_token: When True, a missing JOIN token is not treated as "no join".
        parse_bracket: Forwarded to `_parse_table`.

    Returns:
        The Join node, or None when no join starts at the current position.
    """
    if self._match(TokenType.COMMA):
        return self.expression(exp.Join, this=self._parse_table())

    start_index = self._index
    method, side, kind = self._parse_join_parts()
    hint = self._prev.text if self._match_texts(self.JOIN_HINTS) else None
    join = self._match(TokenType.JOIN) or (kind and kind.token_type == TokenType.STRAIGHT_JOIN)

    join_token_missing = not skip_join_token and not join
    if join_token_missing:
        # Undo whatever prefix tokens were consumed above
        self._retreat(start_index)
        method = side = kind = None

    outer_apply = self._match_pair(TokenType.OUTER, TokenType.APPLY, False)
    cross_apply = self._match_pair(TokenType.CROSS, TokenType.APPLY, False)

    if join_token_missing and not outer_apply and not cross_apply:
        return None

    table = self._parse_table(parse_bracket=parse_bracket)
    kwargs: t.Dict[str, t.Any] = {"this": table}
    if kind and kind.token_type == TokenType.ARRAY and self._match(TokenType.COMMA):
        kwargs["expressions"] = self._parse_csv(
            lambda: self._parse_table(parse_bracket=parse_bracket)
        )

    for arg_name, token in (("method", method), ("side", side), ("kind", kind)):
        if token:
            kwargs[arg_name] = token.text
    if hint:
        kwargs["hint"] = hint

    if self._match(TokenType.MATCH_CONDITION):
        kwargs["match_condition"] = self._parse_wrapped(self._parse_comparison)

    if self._match(TokenType.ON):
        kwargs["on"] = self._parse_assignment()
    elif self._match(TokenType.USING):
        kwargs["using"] = self._parse_using_identifiers()
    elif (
        not outer_apply
        and not cross_apply
        and not isinstance(table, exp.Unnest)
        and not (kind and kind.token_type in (TokenType.CROSS, TokenType.ARRAY))
    ):
        # Try nested joins: they are kept only if an ON / USING for this join follows them
        nested_index = self._index
        joins: t.Optional[list] = list(self._parse_joins())

        if joins and self._match(TokenType.ON):
            kwargs["on"] = self._parse_assignment()
        elif joins and self._match(TokenType.USING):
            kwargs["using"] = self._parse_using_identifiers()
        else:
            joins = None
            self._retreat(nested_index)

        table.set("joins", joins if joins else None)

    comments = [c for token in (method, side, kind) if token for c in token.comments]
    return self.expression(exp.Join, comments=comments, **kwargs)
def _parse_opclass(self) -> t.Optional[exp.Expression]:
    """
    Parse an expression that may be followed by an operator class.

    The expression is returned as-is when the next token is one of OPCLASS_FOLLOW_KEYWORDS
    or OPTYPE_FOLLOW_TOKENS; otherwise it is wrapped in an Opclass whose `expression` is
    parsed with `_parse_table_parts`.
    """
    this = self._parse_assignment()

    if self._match_texts(self.OPCLASS_FOLLOW_KEYWORDS, advance=False):
        return this
    if self._match_set(self.OPTYPE_FOLLOW_TOKENS, advance=False):
        return this

    return self.expression(exp.Opclass, this=this, expression=self._parse_table_parts())
def _parse_index_params(self) -> exp.IndexParameters:
    """
    Parse the optional parameters of an index definition.

    The parts are read in this order: USING, the wrapped column list, INCLUDE, PARTITION BY,
    WITH (storage properties), USING INDEX TABLESPACE, WHERE and ON.
    """
    using = None
    if self._match(TokenType.USING):
        using = self._parse_var(any_token=True)

    columns = None
    if self._match(TokenType.L_PAREN, advance=False):
        columns = self._parse_wrapped_csv(self._parse_with_operator)

    include = None
    if self._match_text_seq("INCLUDE"):
        include = self._parse_wrapped_id_vars()

    partition_by = self._parse_partition_by()
    with_storage = self._match(TokenType.WITH) and self._parse_wrapped_properties()

    tablespace = None
    if self._match_text_seq("USING", "INDEX", "TABLESPACE"):
        tablespace = self._parse_var(any_token=True)

    where = self._parse_where()

    on = None
    if self._match(TokenType.ON):
        on = self._parse_field()

    return self.expression(
        exp.IndexParameters,
        using=using,
        columns=columns,
        include=include,
        partition_by=partition_by,
        where=where,
        with_storage=with_storage,
        tablespace=tablespace,
        on=on,
    )
def _parse_index(
    self, index: t.Optional[exp.Expression] = None, anonymous: bool = False
) -> t.Optional[exp.Index]:
    """
    Parse an index definition into an Index node.

    Args:
        index: An already-parsed index name, if any.
        anonymous: Whether the index has no name.

    When `index` is given or `anonymous` is True, only the optional ON [TABLE] prefix and
    the table are parsed. Otherwise the UNIQUE / PRIMARY / AMP flags, the INDEX token and
    the index name are parsed, and None is returned if the INDEX token is missing.
    """
    unique = None
    primary = None
    amp = None
    table = None

    if index or anonymous:
        self._match(TokenType.ON)
        self._match(TokenType.TABLE)  # hive
        table = self._parse_table_parts(schema=True)
    else:
        unique = self._match(TokenType.UNIQUE)
        primary = self._match_text_seq("PRIMARY")
        amp = self._match_text_seq("AMP")

        if not self._match(TokenType.INDEX):
            return None

        index = self._parse_id_var()

    params = self._parse_index_params()
    return self.expression(
        exp.Index,
        this=index,
        table=table,
        unique=unique,
        primary=primary,
        amp=amp,
        params=params,
    )
def _parse_table_hints(self) -> t.Optional[t.List[exp.Expression]]:
    """
    Parse table hints, returning None when there are none.

    Two forms are handled: a single `WITH (...)` hint list, or any number of index hints
    introduced by one of TABLE_INDEX_HINT_TOKENS.
    """
    if self._match_pair(TokenType.WITH, TokenType.L_PAREN):
        # https://learn.microsoft.com/en-us/sql/t-sql/queries/hints-transact-sql-table?view=sql-server-ver16
        with_hint = self.expression(
            exp.WithTableHint,
            expressions=self._parse_csv(
                lambda: self._parse_function() or self._parse_var(any_token=True)
            ),
        )
        self._match_r_paren()
        return [with_hint]

    # https://dev.mysql.com/doc/refman/8.0/en/index-hints.html
    index_hints: t.List[exp.Expression] = []
    while self._match_set(self.TABLE_INDEX_HINT_TOKENS):
        index_hint = exp.IndexTableHint(this=self._prev.text.upper())

        self._match_set((TokenType.INDEX, TokenType.KEY))
        if self._match(TokenType.FOR):
            # The token right after FOR is taken verbatim (upper-cased) as the target
            index_hint.set("target", self._advance_any() and self._prev.text.upper())

        index_hint.set("expressions", self._parse_wrapped_id_vars())
        index_hints.append(index_hint)

    return index_hints or None
def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
    """
    Parse one dot-separated part of a table reference.

    Candidates are tried in order: a function (only when `schema` is False), an identifier,
    a string read as an identifier, and finally a placeholder.
    """
    if not schema:
        function = self._parse_function(optional_parens=False)
        if function:
            return function

    return (
        self._parse_id_var(any_token=False)
        or self._parse_string_as_identifier()
        or self._parse_placeholder()
    )
def _parse_table_parts(
    self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
) -> exp.Table:
    """
    Parse a possibly qualified table name into a Table node.

    Args:
        schema: Forwarded to `_parse_table_part`.
        is_db_reference: When True, the parsed parts are shifted so that the last one is
            stored as the database and the Table's `this` is None.
        wildcard: When True, a `*` directly connected to the last part is folded into the
            table identifier.

    Returns:
        The Table node, with any parsed changes, historical data ("when") and pivots set.
    """
    catalog = None
    db = None
    table: t.Optional[exp.Expression | str] = self._parse_table_part(schema=schema)

    while self._match(TokenType.DOT):
        if not catalog:
            # Shift the parts seen so far one level up: table -> db -> catalog
            catalog, db = db, table
            # "" used for tsql FROM a..b case
            table = self._parse_table_part(schema=schema) or ""
        else:
            # This allows nesting the table in arbitrarily many dot expressions if needed
            table = self.expression(
                exp.Dot, this=table, expression=self._parse_table_part(schema=schema)
            )

    if (
        wildcard
        and self._is_connected()
        and (isinstance(table, exp.Identifier) or not table)
        and self._match(TokenType.STAR)
    ):
        if isinstance(table, exp.Identifier):
            table.args["this"] += "*"
        else:
            table = exp.Identifier(this="*")

    # We bubble up comments from the Identifier to the Table
    comments = None
    if isinstance(table, exp.Expression):
        comments = table.pop_comments()

    if is_db_reference:
        catalog, db, table = db, table, None
        if not db:
            self.raise_error(f"Expected database name but got {self._curr}")
    elif not table:
        self.raise_error(f"Expected table name but got {self._curr}")

    table_node = self.expression(
        exp.Table,
        comments=comments,
        this=table,
        db=db,
        catalog=catalog,
    )

    changes = self._parse_changes()
    if changes:
        table_node.set("changes", changes)

    at_before = self._parse_historical_data()
    if at_before:
        table_node.set("when", at_before)

    pivots = self._parse_pivots()
    if pivots:
        table_node.set("pivots", pivots)

    return table_node
def _parse_table(
    self,
    schema: bool = False,
    joins: bool = False,
    alias_tokens: t.Optional[t.Collection[TokenType]] = None,
    parse_bracket: bool = False,
    is_db_reference: bool = False,
    parse_partition: bool = False,
) -> t.Optional[exp.Expression]:
    """
    Parse a table-like source.

    A lateral, an unnest, a derived VALUES table or a subquery is returned right away when
    one is found. Otherwise a bracket expression, a ROWS FROM list or a (qualified) table
    name is parsed and then decorated with the modifiers that follow it.

    Args:
        schema: Parse the table with a schema and return the result of `_parse_schema`.
        joins: Whether joins following the table are parsed and attached to it.
        alias_tokens: Token types allowed as the table alias; defaults to TABLE_ALIAS_TOKENS.
        parse_bracket: Whether to try parsing a bracket expression as the table.
        is_db_reference: Forwarded to `_parse_table_parts`.
        parse_partition: Whether to parse a PARTITION selection; this is also enabled by
            SUPPORTS_PARTITION_SELECTION.
    """
    for parse_source in (
        self._parse_lateral,
        self._parse_unnest,
        self._parse_derived_table_values,
    ):
        source = parse_source()
        if source:
            return source

    subquery = self._parse_select(table=True)
    if subquery:
        if not subquery.args.get("pivots"):
            subquery.set("pivots", self._parse_pivots())
        return subquery

    bracket = None
    if parse_bracket:
        bracket_expression = self._parse_bracket(None)
        if bracket_expression:
            bracket = self.expression(exp.Table, this=bracket_expression)

    rows_from = self._match_text_seq("ROWS", "FROM") and self._parse_wrapped_csv(
        self._parse_table
    )
    rows_from = self.expression(exp.Table, rows_from=rows_from) if rows_from else None

    only = self._match(TokenType.ONLY)

    this = t.cast(
        exp.Expression,
        bracket
        or rows_from
        or self._parse_bracket(
            self._parse_table_parts(schema=schema, is_db_reference=is_db_reference)
        ),
    )

    if only:
        this.set("only", only)

    # Postgres supports a wildcard (table) suffix operator, which is a no-op in this context
    self._match_text_seq("*")

    parse_partition = parse_partition or self.SUPPORTS_PARTITION_SELECTION
    if parse_partition and self._match(TokenType.PARTITION, advance=False):
        this.set("partition", self._parse_partition())

    if schema:
        return self._parse_schema(this=this)

    version = self._parse_version()
    if version:
        this.set("version", version)

    # The sample is parsed either before or after the alias, depending on the dialect
    sample_before_alias = self.dialect.ALIAS_POST_TABLESAMPLE
    if sample_before_alias:
        this.set("sample", self._parse_table_sample())

    alias = self._parse_table_alias(alias_tokens=alias_tokens or self.TABLE_ALIAS_TOKENS)
    if alias:
        this.set("alias", alias)

    if isinstance(this, exp.Table) and self._match_text_seq("AT"):
        return self.expression(
            exp.AtIndex, this=this.to_column(copy=False), expression=self._parse_id_var()
        )

    this.set("hints", self._parse_table_hints())

    if not this.args.get("pivots"):
        this.set("pivots", self._parse_pivots())

    if not self.dialect.ALIAS_POST_TABLESAMPLE:
        this.set("sample", self._parse_table_sample())

    if joins:
        for join in self._parse_joins():
            this.append("joins", join)

    if self._match_pair(TokenType.WITH, TokenType.ORDINALITY):
        this.set("ordinality", True)
        this.set("alias", self._parse_table_alias())

    return this
def _parse_version(self) -> t.Optional[exp.Version]:
    """
    Parse a TIMESTAMP / VERSION snapshot clause into a Version node.

    Returns None when neither snapshot token is present. The `kind` is the matched
    FROM / BETWEEN keyword, "CONTAINED IN", "ALL", or "AS OF" as the fallback.
    """
    if self._match(TokenType.TIMESTAMP_SNAPSHOT):
        this = "TIMESTAMP"
    elif self._match(TokenType.VERSION_SNAPSHOT):
        this = "VERSION"
    else:
        return None

    expression: t.Optional[exp.Expression]
    if self._match_set((TokenType.FROM, TokenType.BETWEEN)):
        kind = self._prev.text.upper()
        range_start = self._parse_bitwise()
        self._match_texts(("TO", "AND"))
        range_end = self._parse_bitwise()
        expression = self.expression(exp.Tuple, expressions=[range_start, range_end])
    elif self._match_text_seq("CONTAINED", "IN"):
        kind = "CONTAINED IN"
        expression = self.expression(
            exp.Tuple, expressions=self._parse_wrapped_csv(self._parse_bitwise)
        )
    elif self._match(TokenType.ALL):
        kind = "ALL"
        expression = None
    else:
        # The AS OF keywords themselves are optional here
        self._match_text_seq("AS", "OF")
        kind = "AS OF"
        expression = self._parse_type()

    return self.expression(exp.Version, this=this, expression=expression, kind=kind)
def _parse_historical_data(self) -> t.Optional[exp.HistoricalData]:
    """
    Parse a historical data clause of the form `<prefix> (<kind> => <expression>)`.

    Returns None when no HISTORICAL_DATA_PREFIX keyword is found. If the `=>` expression
    cannot be parsed, the parser retreats to where it started and None is returned.
    """
    # https://docs.snowflake.com/en/sql-reference/constructs/at-before
    start_index = self._index

    if not self._match_texts(self.HISTORICAL_DATA_PREFIX):
        return None

    this = self._prev.text.upper()
    kind = (
        self._match(TokenType.L_PAREN)
        and self._match_texts(self.HISTORICAL_DATA_KIND)
        and self._prev.text.upper()
    )
    expression = self._match(TokenType.FARROW) and self._parse_bitwise()

    if not expression:
        self._retreat(start_index)
        return None

    self._match_r_paren()
    return self.expression(exp.HistoricalData, this=this, kind=kind, expression=expression)
def _parse_changes(self) -> t.Optional[exp.Changes]:
    """
    Parse a `CHANGES (INFORMATION => <var>)` clause and up to two historical data clauses.

    Returns None when the `CHANGES ( INFORMATION =>` prefix does not match.
    """
    if not self._match_text_seq("CHANGES", "(", "INFORMATION", "=>"):
        return None

    information = self._parse_var(any_token=True)
    self._match_r_paren()

    # The first historical clause becomes `at_before`, the second one becomes `end`
    at_before = self._parse_historical_data()
    end = self._parse_historical_data()

    return self.expression(exp.Changes, information=information, at_before=at_before, end=end)
def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
    """
    Parse an UNNEST call into an Unnest node, or return None if UNNEST is absent.

    Args:
        with_alias: Whether a table alias may follow the call.

    The `offset` arg is the WITH ORDINALITY match result, the trailing alias column when
    there are more alias columns than unnested expressions, or the identifier following
    WITH OFFSET (defaulting to "offset").
    """
    if not self._match(TokenType.UNNEST):
        return None

    expressions = self._parse_wrapped_csv(self._parse_equality)
    offset = self._match_pair(TokenType.WITH, TokenType.ORDINALITY)

    alias = None
    if with_alias:
        alias = self._parse_table_alias()

    if alias:
        if self.dialect.UNNEST_COLUMN_ONLY:
            if alias.args.get("columns"):
                self.raise_error("Unexpected extra column alias in unnest.")

            # In these dialects the alias names the column rather than the table
            alias.set("columns", [alias.this])
            alias.set("this", None)

        alias_columns = alias.args.get("columns") or []
        if offset and len(expressions) < len(alias_columns):
            offset = alias_columns.pop()

    if not offset and self._match_pair(TokenType.WITH, TokenType.OFFSET):
        self._match(TokenType.ALIAS)
        offset = self._parse_id_var(
            any_token=False, tokens=self.UNNEST_OFFSET_ALIAS_TOKENS
        ) or exp.to_identifier("offset")

    return self.expression(exp.Unnest, expressions=expressions, alias=alias, offset=offset)
def _parse_derived_table_values(self) -> t.Optional[exp.Values]:
    """
    Parse a VALUES list, optionally wrapped in parentheses, into a Values node.

    Returns None when neither `(VALUES`, `VALUES` nor `FORMAT VALUES` is found. The alias
    may appear before the closing paren of the derived form or after it.
    """
    is_derived = self._match_pair(TokenType.L_PAREN, TokenType.VALUES)
    if not is_derived:
        # ClickHouse's `FORMAT Values` is equivalent to `VALUES`
        if not (self._match_text_seq("VALUES") or self._match_text_seq("FORMAT", "VALUES")):
            return None

    expressions = self._parse_csv(self._parse_value)
    alias = self._parse_table_alias()

    if is_derived:
        self._match_r_paren()

    # Try once more for an alias if none was found ahead of the closing paren
    if not alias:
        alias = self._parse_table_alias()

    return self.expression(exp.Values, expressions=expressions, alias=alias)
def _parse_table_sample(self, as_modifier: bool = False) -> t.Optional[exp.TableSample]:
    """
    Parse a table sampling clause into a TableSample node.

    Args:
        as_modifier: When True, `USING SAMPLE` is accepted as an alternative introducer.

    Returns:
        The TableSample node, or None when no sampling clause starts at the current token.
    """
    if not self._match(TokenType.TABLE_SAMPLE) and not (
        as_modifier and self._match_text_seq("USING", "SAMPLE")
    ):
        return None

    bucket_numerator = None
    bucket_denominator = None
    bucket_field = None
    percent = None
    size = None
    seed = None

    method = self._parse_var(tokens=(TokenType.ROW,), upper=True)
    matched_l_paren = self._match(TokenType.L_PAREN)

    if self.TABLESAMPLE_CSV:
        num = None
        expressions = self._parse_csv(self._parse_primary)
    else:
        expressions = None
        num = (
            self._parse_factor()
            if self._match(TokenType.NUMBER, advance=False)
            else self._parse_primary() or self._parse_placeholder()
        )

    if self._match_text_seq("BUCKET"):
        bucket_numerator = self._parse_number()
        self._match_text_seq("OUT", "OF")
        # Previously written as a redundant chained assignment of the same name
        bucket_denominator = self._parse_number()
        self._match(TokenType.ON)
        bucket_field = self._parse_field()
    elif self._match_set((TokenType.PERCENT, TokenType.MOD)):
        percent = num
    elif self._match(TokenType.ROWS) or not self.dialect.TABLESAMPLE_SIZE_IS_PERCENT:
        size = num
    else:
        # A bare number is a percentage when TABLESAMPLE_SIZE_IS_PERCENT is set
        percent = num

    if matched_l_paren:
        self._match_r_paren()

    if self._match(TokenType.L_PAREN):
        # Trailing "(method[, seed])" form: it overrides any method parsed earlier
        method = self._parse_var(upper=True)
        seed = self._match(TokenType.COMMA) and self._parse_number()
        self._match_r_paren()
    elif self._match_texts(("SEED", "REPEATABLE")):
        seed = self._parse_wrapped(self._parse_number)

    if not method and self.DEFAULT_SAMPLING_METHOD:
        method = exp.var(self.DEFAULT_SAMPLING_METHOD)

    return self.expression(
        exp.TableSample,
        expressions=expressions,
        method=method,
        bucket_numerator=bucket_numerator,
        bucket_denominator=bucket_denominator,
        bucket_field=bucket_field,
        percent=percent,
        size=size,
        seed=seed,
    )
def _parse_pivots(self) -> t.Optional[t.List[exp.Pivot]]:
return list(iter(self._parse_pivot, None)) or None
def _parse_joins(self) -> t.Iterator[exp.Join]:
return iter(self._parse_join, None)
# https://duckdb.org/docs/sql/statements/pivot
def _parse_simplified_pivot(self) -> exp.Pivot:
    """
    Parse the simplified pivot form: a table followed by optional ON, USING and GROUP BY.

    ON entries are bitwise expressions optionally followed by an IN clause; USING entries
    are functions with an optional alias.
    """

    def _parse_on() -> t.Optional[exp.Expression]:
        on_expression = self._parse_bitwise()
        if self._match(TokenType.IN):
            return self._parse_in(on_expression)
        return on_expression

    def _parse_using() -> t.Optional[exp.Expression]:
        return self._parse_alias(self._parse_function())

    this = self._parse_table()
    expressions = self._match(TokenType.ON) and self._parse_csv(_parse_on)
    using = self._match(TokenType.USING) and self._parse_csv(_parse_using)
    group = self._parse_group()

    return self.expression(
        exp.Pivot, this=this, expressions=expressions, using=using, group=group
    )
def _parse_pivot_in(self) -> exp.In | exp.PivotAny:
    """
    Parse the `<column> IN (...)` part of a pivot into an In node.

    The parenthesized list is either `ANY [ORDER BY ...]`, stored as a single PivotAny, or
    a comma-separated list of optionally aliased expressions. An error is reported when
    `IN (` does not follow the column.
    """

    def _parse_aliased_expression() -> t.Optional[exp.Expression]:
        value_expression = self._parse_select_or_expression()

        self._match(TokenType.ALIAS)
        alias = self._parse_bitwise()
        if not alias:
            return value_expression

        # A column alias without a db part is reduced to the node in its `this` arg
        if isinstance(alias, exp.Column) and not alias.db:
            alias = alias.this
        return self.expression(exp.PivotAlias, this=value_expression, alias=alias)

    value = self._parse_column()

    if not self._match_pair(TokenType.IN, TokenType.L_PAREN):
        self.raise_error("Expecting IN (")

    exprs: t.List[exp.Expression]
    if self._match(TokenType.ANY):
        exprs = ensure_list(exp.PivotAny(this=self._parse_order()))
    else:
        exprs = self._parse_csv(_parse_aliased_expression)

    self._match_r_paren()
    return self.expression(exp.In, this=value, expressions=exprs)
def _parse_pivot(self) -> t.Optional[exp.Pivot]:
    """
    Parse a PIVOT or UNPIVOT clause into a Pivot node.

    Returns None when neither keyword is present, or when the keyword is not followed by an
    opening paren (in which case the parser retreats to where it started). For PIVOT, the
    output column identifiers are derived from the aggregation names and the IN-list
    entries and stored in the "columns" arg.
    """
    start_index = self._index
    include_nulls = None

    if self._match(TokenType.PIVOT):
        unpivot = False
    elif self._match(TokenType.UNPIVOT):
        unpivot = True

        # https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-qry-select-unpivot.html#syntax
        if self._match_text_seq("INCLUDE", "NULLS"):
            include_nulls = True
        elif self._match_text_seq("EXCLUDE", "NULLS"):
            include_nulls = False
    else:
        return None

    if not self._match(TokenType.L_PAREN):
        self._retreat(start_index)
        return None

    if unpivot:
        expressions = self._parse_csv(self._parse_column)
    else:
        expressions = self._parse_csv(lambda: self._parse_alias(self._parse_function()))

    if not expressions:
        self.raise_error("Failed to parse PIVOT's aggregation list")

    if not self._match(TokenType.FOR):
        self.raise_error("Expecting FOR")

    field = self._parse_pivot_in()
    default_on_null = self._match_text_seq("DEFAULT", "ON", "NULL") and self._parse_wrapped(
        self._parse_bitwise
    )

    self._match_r_paren()

    pivot = self.expression(
        exp.Pivot,
        expressions=expressions,
        field=field,
        unpivot=unpivot,
        include_nulls=include_nulls,
        default_on_null=default_on_null,
    )

    # No alias is parsed when another PIVOT / UNPIVOT follows immediately
    if not self._match_set((TokenType.PIVOT, TokenType.UNPIVOT), advance=False):
        pivot.set("alias", self._parse_table_alias())

    if unpivot:
        return pivot

    names = self._pivot_column_names(t.cast(t.List[exp.Expression], expressions))

    columns: t.List[exp.Expression] = []
    for fld in pivot.args["field"].expressions:
        field_name = fld.sql() if self.IDENTIFY_PIVOT_STRINGS else fld.alias_or_name
        for name in names:
            # An empty aggregation name leaves the field name as the whole column name
            if not name:
                column_name = field_name
            elif self.PREFIXED_PIVOT_COLUMNS:
                column_name = f"{name}_{field_name}"
            else:
                column_name = f"{field_name}_{name}"

            columns.append(exp.to_identifier(column_name))

    pivot.set("columns", columns)
    return pivot
def _pivot_column_names(self, aggregations: t.List[exp.Expression]) -> t.List[str]:
return [agg.alias for agg in aggregations]
def _parse_prewhere(self, skip_where_token: bool = False) -> t.Optional[exp.PreWhere]:
    """
    Parse a PREWHERE clause into a PreWhere node.

    Returns None if the PREWHERE token is required (`skip_where_token` is False) but absent.
    """
    if not (skip_where_token or self._match(TokenType.PREWHERE)):
        return None

    # The comments are read before the condition is parsed
    comments = self._prev_comments
    condition = self._parse_assignment()
    return self.expression(exp.PreWhere, comments=comments, this=condition)
def _parse_where(self, skip_where_token: bool = False) -> t.Optional[exp.Where]:
    """
    Parse a WHERE clause into a Where node.

    Returns None if the WHERE token is required (`skip_where_token` is False) but absent.
    """
    if not (skip_where_token or self._match(TokenType.WHERE)):
        return None

    # The comments are read before the condition is parsed
    comments = self._prev_comments
    condition = self._parse_assignment()
    return self.expression(exp.Where, comments=comments, this=condition)
def _parse_group(self, skip_group_by_token: bool = False) -> t.Optional[exp.Group]:
    """
    Parse a GROUP BY clause into a Group node.

    Returns None if the GROUP BY token is required (`skip_group_by_token` is False) but
    absent. Plain expressions, ROLLUP, CUBE, GROUPING SETS and TOTALS (each optionally
    preceded by WITH) are collected in a loop that stops once an iteration consumes at most
    the WITH token after the expression list, or nothing at all.
    """
    if not skip_group_by_token and not self._match(TokenType.GROUP_BY):
        return None

    elements: t.Dict[str, t.Any] = defaultdict(list)

    if self._match(TokenType.ALL):
        elements["all"] = True
    elif self._match(TokenType.DISTINCT):
        elements["all"] = False

    def _parse_group_expression() -> t.Optional[exp.Expression]:
        # CUBE / ROLLUP are handled by the dedicated branches of the loop below
        if self._match_set((TokenType.CUBE, TokenType.ROLLUP), advance=False):
            return None
        return self._parse_assignment()

    while True:
        iteration_start = self._index

        elements["expressions"].extend(self._parse_csv(_parse_group_expression))

        before_with_index = self._index
        with_prefix = self._match(TokenType.WITH)

        if self._match(TokenType.ROLLUP):
            rollup = self._parse_cube_or_rollup(exp.Rollup, with_prefix=with_prefix)
            elements["rollup"].append(rollup)
        elif self._match(TokenType.CUBE):
            cube = self._parse_cube_or_rollup(exp.Cube, with_prefix=with_prefix)
            elements["cube"].append(cube)
        elif self._match(TokenType.GROUPING_SETS):
            grouping_sets = self.expression(
                exp.GroupingSets,
                expressions=self._parse_wrapped_csv(self._parse_grouping_set),
            )
            elements["grouping_sets"].append(grouping_sets)
        elif self._match_text_seq("TOTALS"):
            elements["totals"] = True  # type: ignore

        # At most one token (the WITH) was consumed since the expression list: undo and stop
        if before_with_index <= self._index <= before_with_index + 1:
            self._retreat(before_with_index)
            break

        if iteration_start == self._index:
            break

    return self.expression(exp.Group, **elements)  # type: ignore
def _parse_cube_or_rollup(self, kind: t.Type[E], with_prefix: bool = False) -> E:
return self.expression(
kind, expressions=[] if with_prefix else self._parse_wrapped_csv(self._parse_column)
)
def _parse_grouping_set(self) -> t.Optional[exp.Expression]:
    """Parse one grouping set: a parenthesized column list (as a Tuple) or a single column."""
    if not self._match(TokenType.L_PAREN):
        return self._parse_column()

    grouping_set = self._parse_csv(self._parse_column)
    self._match_r_paren()
    return self.expression(exp.Tuple, expressions=grouping_set)
def _parse_having(self, skip_having_token: bool = False) -> t.Optional[exp.Having]:
    """
    Parse a HAVING clause into a Having node.

    Returns None if the HAVING token is required (`skip_having_token` is False) but absent.
    """
    if not (skip_having_token or self._match(TokenType.HAVING)):
        return None

    condition = self._parse_assignment()
    return self.expression(exp.Having, this=condition)
def _parse_qualify(self) -> t.Optional[exp.Qualify]:
    """Parse a QUALIFY clause into a Qualify node, or return None if QUALIFY is absent."""
    if self._match(TokenType.QUALIFY):
        condition = self._parse_assignment()
        return self.expression(exp.Qualify, this=condition)

    return None
def _parse_connect(self, skip_start_token: bool = False) -> t.Optional[exp.Connect]:
    """
    Parse a hierarchical query clause (START WITH ... CONNECT BY ...) into a Connect node.

    Args:
        skip_start_token: When True, START WITH is not expected ahead of CONNECT BY; it may
            still follow the CONNECT BY condition.

    Returns:
        The Connect node, or None when `skip_start_token` is False and START WITH is absent.
    """
    if skip_start_token:
        start = None
    elif self._match(TokenType.START_WITH):
        start = self._parse_assignment()
    else:
        return None

    self._match(TokenType.CONNECT_BY)
    nocycle = self._match_text_seq("NOCYCLE")

    # PRIOR is only parsed as a no-paren function while the CONNECT BY condition is being
    # parsed. The mapping is restored in a `finally` block so the temporary entry cannot
    # leak when parsing raises, and so a nested CONNECT BY (which registers and removes its
    # own entry) cannot make the cleanup fail or drop an entry that was already there.
    no_paren_parsers = self.NO_PAREN_FUNCTION_PARSERS
    missing = object()
    previous_prior = no_paren_parsers.get("PRIOR", missing)
    no_paren_parsers["PRIOR"] = lambda self: self.expression(
        exp.Prior, this=self._parse_bitwise()
    )
    try:
        connect = self._parse_assignment()
    finally:
        if previous_prior is missing:
            no_paren_parsers.pop("PRIOR", None)
        else:
            no_paren_parsers["PRIOR"] = previous_prior

    if not start and self._match(TokenType.START_WITH):
        start = self._parse_assignment()

    return self.expression(exp.Connect, start=start, connect=connect, nocycle=nocycle)
def _parse_name_as_expression(self) -> t.Optional[exp.Expression]:
    """
    Parse `<name> [AS <expression>]`.

    When the alias token follows the name, an Alias node is returned whose `alias` is the
    name and whose `this` is the parsed expression; otherwise the bare name is returned.
    """
    name = self._parse_id_var(any_token=True)
    if not self._match(TokenType.ALIAS):
        return name

    return self.expression(exp.Alias, alias=name, this=self._parse_assignment())
def _parse_interpolate(self) -> t.Optional[t.List[exp.Expression]]:
if self._match_text_seq("INTERPOLATE"):
return self._parse_wrapped_csv(self._parse_name_as_expression)
return None
def _parse_order(
    self, this: t.Optional[exp.Expression] = None, skip_order_token: bool = False
) -> t.Optional[exp.Expression]:
    """
    Parse an ORDER BY / ORDER SIBLINGS BY clause into an Order node.

    Args:
        this: The expression being ordered; it is returned unchanged when no ORDER clause
            is found.
        skip_order_token: When True, the ORDER BY token is not required (nor matched).
    """
    if skip_order_token or self._match(TokenType.ORDER_BY):
        siblings = None
    elif self._match(TokenType.ORDER_SIBLINGS_BY):
        siblings = True
    else:
        return this

    ordered = self._parse_csv(self._parse_ordered)
    return self.expression(exp.Order, this=this, expressions=ordered, siblings=siblings)
def _parse_sort(self, exp_class: t.Type[E], token: TokenType) -> t.Optional[E]:
if not self._match(token):
return None
return self.expression(exp_class, expressions=self._parse_csv(self._parse_ordered))
def _parse_ordered(
    self, parse_method: t.Optional[t.Callable] = None
) -> t.Optional[exp.Ordered]:
    """
    Parse one ordering term: an expression plus optional ASC / DESC, NULLS FIRST / LAST and
    WITH FILL modifiers.

    Args:
        parse_method: Callable used to parse the expression; defaults to
            `_parse_assignment`.

    Returns:
        The Ordered node, or None when no expression could be parsed.
    """
    this = parse_method() if parse_method else self._parse_assignment()
    if not this:
        return None

    if this.name.upper() == "ALL" and self.dialect.SUPPORTS_ORDER_BY_ALL:
        this = exp.var("ALL")

    asc = self._match(TokenType.ASC)
    # `desc` ends up False (rather than the failed-match value) when ASC was given explicitly
    desc = self._match(TokenType.DESC) or (asc and False)

    is_nulls_first = self._match_text_seq("NULLS", "FIRST")
    is_nulls_last = self._match_text_seq("NULLS", "LAST")

    nulls_first = is_nulls_first or False

    if not (is_nulls_first or is_nulls_last):
        # No explicit NULLS ordering: derive it from the dialect's NULL_ORDERING setting
        null_ordering = self.dialect.NULL_ORDERING
        nulls_are_small = null_ordering == "nulls_are_small"
        sorts_nulls_first = (not desc and nulls_are_small) or (desc and not nulls_are_small)
        if sorts_nulls_first and null_ordering != "nulls_are_last":
            nulls_first = True

    with_fill = None
    if self._match_text_seq("WITH", "FILL"):
        with_fill = self.expression(
            exp.WithFill,
            **{  # type: ignore
                "from": self._match(TokenType.FROM) and self._parse_bitwise(),
                "to": self._match_text_seq("TO") and self._parse_bitwise(),
                "step": self._match_text_seq("STEP") and self._parse_bitwise(),
                "interpolate": self._parse_interpolate(),
            },
        )

    return self.expression(
        exp.Ordered, this=this, desc=desc, nulls_first=nulls_first, with_fill=with_fill
    )
def _parse_limit(
    self,
    this: t.Optional[exp.Expression] = None,
    top: bool = False,
    skip_limit_token: bool = False,
) -> t.Optional[exp.Expression]:
    """
    Parse a LIMIT / TOP clause into a Limit node, or a FETCH clause into a Fetch node.

    Args:
        this: The expression being limited; it is returned unchanged when no clause is
            found.
        top: Match the TOP token instead of LIMIT.
        skip_limit_token: When True, the LIMIT / TOP token is not required (nor matched).
    """
    if skip_limit_token or self._match(TokenType.TOP if top else TokenType.LIMIT):
        comments = self._prev_comments

        if not top:
            expression = self._parse_term()
        elif self._match(TokenType.L_PAREN):
            expression = self._parse_term()
            self._match_r_paren()
        else:
            expression = self._parse_number()

        offset = None
        if self._match(TokenType.COMMA):
            # `<offset>, <count>` form: the first term parsed was actually the offset
            offset = expression
            expression = self._parse_term()

        return self.expression(
            exp.Limit,
            this=this,
            expression=expression,
            offset=offset,
            comments=comments,
            expressions=self._parse_limit_by(),
        )

    if not self._match(TokenType.FETCH):
        return this

    if self._match_set((TokenType.FIRST, TokenType.NEXT)):
        direction = self._prev.text.upper()
    else:
        direction = "FIRST"

    count = self._parse_field(tokens=self.FETCH_TOKENS)
    percent = self._match(TokenType.PERCENT)

    self._match_set((TokenType.ROW, TokenType.ROWS))

    only = self._match_text_seq("ONLY")
    with_ties = self._match_text_seq("WITH", "TIES")

    if only and with_ties:
        self.raise_error("Cannot specify both ONLY and WITH TIES in FETCH clause")

    return self.expression(
        exp.Fetch,
        direction=direction,
        count=count,
        percent=percent,
        with_ties=with_ties,
    )
def _parse_offset(self, this: t.Optional[exp.Expression] = None) -> t.Optional[exp.Expression]:
    """
    Parse an OFFSET clause into an Offset node.

    `this` is returned unchanged when the OFFSET token is absent. An optional ROW / ROWS
    keyword after the count is skipped, and a trailing BY list is stored in `expressions`.
    """
    if not self._match(TokenType.OFFSET):
        return this

    count = self._parse_term()
    self._match_set((TokenType.ROW, TokenType.ROWS))
    limit_by = self._parse_limit_by()

    return self.expression(exp.Offset, this=this, expression=count, expressions=limit_by)
def _parse_limit_by(self) -> t.Optional[t.List[exp.Expression]]:
return self._match_text_seq("BY") and self._parse_csv(self._parse_bitwise)
def _parse_locks(self) -> t.List[exp.Lock]:
    """
    Parse any number of locking clauses into Lock nodes.

    `FOR UPDATE` sets `update=True`; `FOR SHARE` and `LOCK IN SHARE MODE` set
    `update=False`. `wait` is True for NOWAIT, the parsed primary for `WAIT <n>`, False for
    SKIP LOCKED and None otherwise.
    """
    locks = []

    while True:
        if self._match_text_seq("FOR", "UPDATE"):
            update = True
        elif self._match_text_seq("FOR", "SHARE") or self._match_text_seq(
            "LOCK", "IN", "SHARE", "MODE"
        ):
            update = False
        else:
            return locks

        locked_tables = None
        if self._match_text_seq("OF"):
            locked_tables = self._parse_csv(lambda: self._parse_table(schema=True))

        wait: t.Optional[bool | exp.Expression] = None
        if self._match_text_seq("NOWAIT"):
            wait = True
        elif self._match_text_seq("WAIT"):
            wait = self._parse_primary()
        elif self._match_text_seq("SKIP", "LOCKED"):
            wait = False

        lock = self.expression(exp.Lock, update=update, expressions=locked_tables, wait=wait)
        locks.append(lock)
def _parse_set_operations(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """
    Fold consecutive set operations (UNION / EXCEPT / INTERSECT) onto `this`.

    When neither DISTINCT nor ALL is given, the dialect's SET_OP_DISTINCT_BY_DEFAULT decides
    and an error is reported if it holds None for the operation. If
    MODIFIERS_ATTACHED_TO_SET_OP is set, the SET_OP_MODIFIERS found on the right-hand side
    of the outermost operation are moved onto the operation itself.
    """
    while this and self._match_set(self.SET_OPERATIONS):
        operator_token = self._prev

        operation: t.Type[exp.SetOperation]
        if operator_token.token_type == TokenType.UNION:
            operation = exp.Union
        elif operator_token.token_type == TokenType.EXCEPT:
            operation = exp.Except
        else:
            operation = exp.Intersect

        comments = operator_token.comments

        distinct: t.Optional[bool]
        if self._match(TokenType.DISTINCT):
            distinct = True
        elif self._match(TokenType.ALL):
            distinct = False
        else:
            distinct = self.dialect.SET_OP_DISTINCT_BY_DEFAULT[operation]
            if distinct is None:
                self.raise_error(f"Expected DISTINCT or ALL for {operation.__name__}")

        by_name = self._match_text_seq("BY", "NAME")
        right = self._parse_select(nested=True, parse_set_operation=False)

        this = self.expression(
            operation,
            comments=comments,
            this=this,
            distinct=distinct,
            by_name=by_name,
            expression=right,
        )

    if isinstance(this, exp.SetOperation) and self.MODIFIERS_ATTACHED_TO_SET_OP:
        right_side = this.expression

        if right_side:
            for modifier_name in self.SET_OP_MODIFIERS:
                modifier = right_side.args.get(modifier_name)
                if modifier:
                    this.set(modifier_name, modifier.pop())

    return this
def _parse_expression(self) -> t.Optional[exp.Expression]:
return self._parse_alias(self._parse_assignment())
def _parse_assignment(self) -> t.Optional[exp.Expression]:
    """
    Parse a disjunction optionally followed by assignment operators.

    The right-hand side of each assignment is parsed recursively. A single-part Column on
    the left-hand side is reduced to the node stored in its `this` arg.
    """
    this = self._parse_disjunction()

    if not this and self._next and self._next.token_type in self.ASSIGNMENT:
        # This allows us to parse <non-identifier token> := <expr>
        target_name = t.cast(str, self._advance_any(ignore_reserved=True) and self._prev.text)
        this = exp.column(target_name)

    while self._match_set(self.ASSIGNMENT):
        if isinstance(this, exp.Column) and len(this.parts) == 1:
            this = this.this

        # Operator class and comments are read before the right-hand side is parsed
        assignment_class = self.ASSIGNMENT[self._prev.token_type]
        comments = self._prev_comments
        value = self._parse_assignment()

        this = self.expression(assignment_class, this=this, comments=comments, expression=value)

    return this
def _parse_disjunction(self) -> t.Optional[exp.Expression]:
return self._parse_tokens(self._parse_conjunction, self.DISJUNCTION)
def _parse_conjunction(self) -> t.Optional[exp.Expression]:
return self._parse_tokens(self._parse_equality, self.CONJUNCTION)
def _parse_equality(self) -> t.Optional[exp.Expression]:
return self._parse_tokens(self._parse_comparison, self.EQUALITY)
def _parse_comparison(self) -> t.Optional[exp.Expression]:
return self._parse_tokens(self._parse_range, self.COMPARISON)
def _parse_range(self, this: t.Optional[exp.Expression] = None) -> t.Optional[exp.Expression]:
    """
    Parse a range-style predicate (anything registered in `RANGE_PARSERS`, plus
    ISNULL / NOTNULL and a trailing IS) applied to `this`.

    Args:
        this: an already-parsed left operand; when falsy, one is parsed via `_parse_bitwise`.

    Returns:
        The resulting predicate, or the plain operand when no predicate follows.
    """
    this = this or self._parse_bitwise()
    negate = self._match(TokenType.NOT)

    if self._match_set(self.RANGE_PARSERS):
        range_parser = self.RANGE_PARSERS[self._prev.token_type]
        parsed = range_parser(self, this)
        if not parsed:
            # The range parser produced nothing, so the operand is returned untouched
            return this
        this = parsed
    elif self._match(TokenType.ISNULL):
        this = self.expression(exp.Is, this=this, expression=exp.Null())

    # Postgres supports ISNULL and NOTNULL for conditions.
    # https://blog.andreiavram.ro/postgresql-null-composite-type/
    if self._match(TokenType.NOTNULL):
        is_null = self.expression(exp.Is, this=this, expression=exp.Null())
        this = self.expression(exp.Not, this=is_null)

    if negate:
        this = self._negate_range(this)

    return self._parse_is(this) if self._match(TokenType.IS) else this
def _negate_range(self, this: t.Optional[exp.Expression] = None) -> t.Optional[exp.Expression]:
if not this:
return this
return self.expression(exp.Not, this=this)
def _parse_is(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """
    Parse what follows an IS keyword: `[NOT] DISTINCT FROM <expr>`, `[NOT] JSON ...`,
    or `[NOT] <primary | null>`.

    Returns:
        The predicate built around `this`, or None after rewinding to the token that
        preceded this call when nothing valid follows.
    """
    # Position of the token consumed right before this call, used to undo a failed parse
    rewind_index = self._index - 1
    negated = self._match(TokenType.NOT)

    if self._match_text_seq("DISTINCT", "FROM"):
        # IS NOT DISTINCT FROM is the null-safe equality; IS DISTINCT FROM its inverse
        comparison = exp.NullSafeEQ if negated else exp.NullSafeNEQ
        return self.expression(comparison, this=this, expression=self._parse_bitwise())

    operand: t.Optional[exp.Expression]
    if self._match(TokenType.JSON):
        kind = self._match_texts(self.IS_JSON_PREDICATE_KIND) and self._prev.text.upper()

        if self._match_text_seq("WITH"):
            with_flag = True
        elif self._match_text_seq("WITHOUT"):
            with_flag = False
        else:
            with_flag = None

        unique = self._match(TokenType.UNIQUE)
        self._match_text_seq("KEYS")
        operand = self.expression(
            exp.JSON, **{"this": kind, "with": with_flag, "unique": unique}
        )
    else:
        operand = self._parse_primary() or self._parse_null()
        if not operand:
            self._retreat(rewind_index)
            return None

    is_node = self.expression(exp.Is, this=this, expression=operand)
    if negated:
        return self.expression(exp.Not, this=is_node)
    return is_node
def _parse_in(self, this: t.Optional[exp.Expression], alias: bool = False) -> exp.In:
    """
    Parse the right-hand side of an IN predicate.

    Handles, in order: an UNNEST operand, a parenthesized or bracketed list (or a single
    query), and finally a bare column as the `field` argument.

    Args:
        this: the left operand of IN.
        alias: forwarded to `_parse_select_or_expression` for each list item.
    """
    unnest = self._parse_unnest(with_alias=False)
    if unnest:
        return self.expression(exp.In, this=this, unnest=unnest)

    if not self._match_set((TokenType.L_PAREN, TokenType.L_BRACKET)):
        return self.expression(exp.In, this=this, field=self._parse_column())

    opened_with_paren = self._prev.token_type == TokenType.L_PAREN
    items = self._parse_csv(lambda: self._parse_select_or_expression(alias=alias))

    if len(items) == 1 and isinstance(items[0], exp.Query):
        in_node = self.expression(exp.In, this=this, query=items[0].subquery(copy=False))
    else:
        in_node = self.expression(exp.In, this=this, expressions=items)

    # The closing delimiter has to correspond to the opening one
    if opened_with_paren:
        self._match_r_paren(in_node)
    elif not self._match(TokenType.R_BRACKET, expression=in_node):
        self.raise_error("Expecting ]")

    return in_node
def _parse_between(self, this: t.Optional[exp.Expression]) -> exp.Between:
    """Parse the `<low> AND <high>` bounds of a BETWEEN predicate applied to `this`."""
    lower_bound = self._parse_bitwise()
    # The AND separator is consumed when present; its absence is not reported here
    self._match(TokenType.AND)
    upper_bound = self._parse_bitwise()
    return self.expression(exp.Between, this=this, low=lower_bound, high=upper_bound)
def _parse_escape(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """Wrap `this` in an `exp.Escape` if an ESCAPE clause follows; otherwise return it as is."""
    if self._match(TokenType.ESCAPE):
        return self.expression(exp.Escape, this=this, expression=self._parse_string())
    return this
def _parse_interval(self, match_interval: bool = True) -> t.Optional[exp.Add | exp.Interval]:
    """
    Parse an INTERVAL expression, or a sum of consecutive intervals.

    Args:
        match_interval: when True, the INTERVAL keyword is required and None is returned
            if it is missing. When False, the keyword is consumed if present but optional.

    Returns:
        An `exp.Interval`, an `exp.Add` whose right side is the next parsed interval, or
        None when no interval value could be parsed (the token position is restored).
    """
    index = self._index

    if not self._match(TokenType.INTERVAL) and match_interval:
        return None

    if self._match(TokenType.STRING, advance=False):
        this = self._parse_primary()
    else:
        this = self._parse_term()

    # Bail out if nothing was parsed, or if the "value" is really the unquoted,
    # unqualified word IS (i.e. `interval` was used as a name followed by IS)
    if not this or (
        isinstance(this, exp.Column)
        and not this.table
        and not this.this.quoted
        and this.name.upper() == "IS"
    ):
        self._retreat(index)
        return None

    # The unit is either a function call or any single token, unless an alias follows
    unit = self._parse_function() or (
        not self._match(TokenType.ALIAS, advance=False)
        and self._parse_var(any_token=True, upper=True)
    )

    # Most dialects support, e.g., the form INTERVAL '5' day, thus we try to parse
    # each INTERVAL expression into this canonical form so it's easy to transpile
    if this and this.is_number:
        this = exp.Literal.string(this.to_py())
    elif this and this.is_string:
        parts = exp.INTERVAL_STRING_RE.findall(this.name)
        if len(parts) == 1:
            if unit:
                # Unconsume the eagerly-parsed unit, since the real unit was part of the string
                self._retreat(self._index - 1)

            this = exp.Literal.string(parts[0][0])
            unit = self.expression(exp.Var, this=parts[0][1].upper())

    if self.INTERVAL_SPANS and self._match_text_seq("TO"):
        unit = self.expression(
            exp.IntervalSpan, this=unit, expression=self._parse_var(any_token=True, upper=True)
        )

    interval = self.expression(exp.Interval, this=this, unit=unit)

    # Remember the position so an optional `+` can be un-consumed if no interval follows
    index = self._index
    self._match(TokenType.PLUS)

    # Convert INTERVAL 'val_1' unit_1 [+] ... [+] 'val_n' unit_n into a sum of intervals
    if self._match_set((TokenType.STRING, TokenType.NUMBER), advance=False):
        return self.expression(
            exp.Add, this=interval, expression=self._parse_interval(match_interval=False)
        )

    self._retreat(index)
    return interval
def _parse_bitwise(self) -> t.Optional[exp.Expression]:
    """
    Parse a left-associative chain of terms joined by the `BITWISE` operators, `||`
    (when the dialect treats it as string concatenation), `??`, `<<` or `>>`.
    """
    result = self._parse_term()

    while True:
        if self._match_set(self.BITWISE):
            node_type = self.BITWISE[self._prev.token_type]
            result = self.expression(node_type, this=result, expression=self._parse_term())
        elif self.dialect.DPIPE_IS_STRING_CONCAT and self._match(TokenType.DPIPE):
            result = self.expression(
                exp.DPipe,
                this=result,
                expression=self._parse_term(),
                safe=not self.dialect.STRICT_STRING_CONCAT,
            )
        elif self._match(TokenType.DQMARK):
            fallback = ensure_list(self._parse_term())
            result = self.expression(exp.Coalesce, this=result, expressions=fallback)
        elif self._match_pair(TokenType.LT, TokenType.LT):
            result = self.expression(
                exp.BitwiseLeftShift, this=result, expression=self._parse_term()
            )
        elif self._match_pair(TokenType.GT, TokenType.GT):
            result = self.expression(
                exp.BitwiseRightShift, this=result, expression=self._parse_term()
            )
        else:
            # No further operator applies: the chain is complete
            return result
def _parse_term(self) -> t.Optional[exp.Expression]:
this = self._parse_factor()
while self._match_set(self.TERM):
klass = self.TERM[self._prev.token_type]
comments = self._prev_comments
expression = self._parse_factor()
this = self.expression(klass, this=this, comments=comments, expression=expression)
if isinstance(this, exp.Collate):
expr = this.expression
# Preserve collations such as pg_catalog."default" (Postgres) as columns, otherwise
# fallback to Identifier / Var
if isinstance(expr, exp.Column) and len(expr.parts) == 1:
ident = expr.this
if isinstance(ident, exp.Identifier):
this.set("expression", ident if ident.quoted else exp.var(ident.name))
return this
def _parse_factor(self) -> t.Optional[exp.Expression]:
parse_method = self._parse_exponent if self.EXPONENT else self._parse_unary
this = parse_method()
while self._match_set(self.FACTOR):
klass = self.FACTOR[self._prev.token_type]
comments = self._prev_comments
expression = parse_method()
if not expression and klass is exp.IntDiv and self._prev.text.isalpha():
self._retreat(self._index - 1)
return this
this = self.expression(klass, this=this, comments=comments, expression=expression)
if isinstance(this, exp.Div):
this.args["typed"] = self.dialect.TYPED_DIVISION
this.args["safe"] = self.dialect.SAFE_DIVISION
return this
def _parse_exponent(self) -> t.Optional[exp.Expression]:
return self._parse_tokens(self._parse_unary, self.EXPONENT)
def _parse_unary(self) -> t.Optional[exp.Expression]:
if self._match_set(self.UNARY_PARSERS):
return self.UNARY_PARSERS[self._prev.token_type](self)
return self._parse_at_time_zone(self._parse_type())
def _parse_type(
    self, parse_interval: bool = True, fallback_to_identifier: bool = False
) -> t.Optional[exp.Expression]:
    """
    Parse an interval, a typed literal / cast (`<type> <literal>`), a standalone data type,
    or fall back to a column (or identifier).

    Args:
        parse_interval: whether to first try parsing an INTERVAL expression.
        fallback_to_identifier: when nothing type-like was parsed, return the result of
            `_parse_id_var` instead of parsing a column.
    """
    interval = parse_interval and self._parse_interval()
    if interval:
        return interval

    index = self._index
    data_type = self._parse_types(check_func=True, allow_identifiers=False)

    # parse_types() returns a Cast if we parsed BQ's inline constructor <type>(<values>) e.g.
    # STRUCT<a INT, b STRING>(1, 'foo'), which is canonicalized to CAST(<values> AS <type>)
    if isinstance(data_type, exp.Cast):
        # This constructor can contain ops directly after it, for instance struct unnesting:
        # STRUCT<a INT, b STRING>(1, 'foo').* --> CAST(STRUCT(1, 'foo') AS STRUCT<a INT, b STRING>).*
        return self._parse_column_ops(data_type)

    if data_type:
        index2 = self._index
        this = self._parse_primary()

        if isinstance(this, exp.Literal):
            # <type> <literal>: use a type-specific literal parser if one is registered,
            # otherwise build a plain cast
            parser = self.TYPE_LITERAL_PARSERS.get(data_type.this)
            if parser:
                return parser(self, this, data_type)

            return self.expression(exp.Cast, this=this, to=data_type)

        # The expressions arg gets set by the parser when we have something like DECIMAL(38, 0)
        # in the input SQL. In that case, we'll produce these tokens: DECIMAL ( 38 , 0 )
        #
        # If the index difference here is greater than 1, that means the parser itself must have
        # consumed additional tokens such as the DECIMAL scale and precision in the above example.
        #
        # If it's not greater than 1, then it must be 1, because we've consumed at least the type
        # keyword, meaning that the expressions arg of the DataType must have gotten set by a
        # callable in the TYPE_CONVERTERS mapping. For example, Snowflake converts DECIMAL to
        # DECIMAL(38, 0) in order to facilitate the data type's transpilation.
        #
        # In these cases, we don't really want to return the converted type, but instead retreat
        # and try to parse a Column or Identifier in the section below.
        if data_type.expressions and index2 - index > 1:
            self._retreat(index2)
            return self._parse_column_ops(data_type)

        self._retreat(index)

    if fallback_to_identifier:
        return self._parse_id_var()

    this = self._parse_column()
    return this and self._parse_column_ops(this)
def _parse_type_size(self) -> t.Optional[exp.DataTypeParam]:
this = self._parse_type()
if not this:
return None
if isinstance(this, exp.Column) and not this.table:
this = exp.var(this.name.upper())
return self.expression(
exp.DataTypeParam, this=this, expression=self._parse_var(any_token=True)
)
def _parse_types(
    self, check_func: bool = False, schema: bool = False, allow_identifiers: bool = True
) -> t.Optional[exp.Expression]:
    """
    Parse a data type, including parameters, nested types, time zone qualifiers,
    UNSIGNED, LIST suffixes and array brackets.

    Args:
        check_func: when True, a type written with parenthesized arguments is only accepted
            if a string literal follows it (unless a time zone qualifier was matched);
            otherwise the parser retreats to where it started and returns None.
        schema: when True, bracketed values after the type are kept as part of an array
            data type instead of causing a retreat.
        allow_identifiers: whether a VAR identifier may be interpreted as a type name.

    Returns:
        The parsed type expression, an `exp.Cast` for an inline `<type>(<values>)` /
        `<type>[<values>]` constructor, or None if no type could be parsed.
    """
    index = self._index

    this: t.Optional[exp.Expression] = None
    prefix = self._match_text_seq("SYSUDTLIB", ".")

    if not self._match_set(self.TYPE_TOKENS):
        identifier = allow_identifiers and self._parse_id_var(
            any_token=False, tokens=(TokenType.VAR,)
        )
        if isinstance(identifier, exp.Identifier):
            # Re-tokenize the identifier to find out whether it names a known type
            tokens = self.dialect.tokenize(identifier.sql(dialect=self.dialect))

            if len(tokens) != 1:
                self.raise_error("Unexpected identifier", self._prev)

            if tokens[0].token_type in self.TYPE_TOKENS:
                self._prev = tokens[0]
            elif self.dialect.SUPPORTS_USER_DEFINED_TYPES:
                type_name = identifier.name

                # Collect a dotted, possibly qualified, user-defined type name
                while self._match(TokenType.DOT):
                    type_name = f"{type_name}.{self._advance_any() and self._prev.text}"

                this = exp.DataType.build(type_name, udt=True)
            else:
                self._retreat(self._index - 1)
                return None
        else:
            return None

    type_token = self._prev.token_type

    if type_token == TokenType.PSEUDO_TYPE:
        return self.expression(exp.PseudoType, this=self._prev.text.upper())

    if type_token == TokenType.OBJECT_IDENTIFIER:
        return self.expression(exp.ObjectIdentifier, this=self._prev.text.upper())

    # https://materialize.com/docs/sql/types/map/
    if type_token == TokenType.MAP and self._match(TokenType.L_BRACKET):
        key_type = self._parse_types(
            check_func=check_func, schema=schema, allow_identifiers=allow_identifiers
        )
        if not self._match(TokenType.FARROW):
            self._retreat(index)
            return None

        value_type = self._parse_types(
            check_func=check_func, schema=schema, allow_identifiers=allow_identifiers
        )
        if not self._match(TokenType.R_BRACKET):
            self._retreat(index)
            return None

        return exp.DataType(
            this=exp.DataType.Type.MAP,
            expressions=[key_type, value_type],
            nested=True,
            prefix=prefix,
        )

    nested = type_token in self.NESTED_TYPE_TOKENS
    is_struct = type_token in self.STRUCT_TYPE_TOKENS
    is_aggregate = type_token in self.AGGREGATE_TYPE_TOKENS
    expressions = None
    # Set when the type was followed by `(...)`, which could also be a function call
    maybe_func = False

    if self._match(TokenType.L_PAREN):
        if is_struct:
            expressions = self._parse_csv(lambda: self._parse_struct_types(type_required=True))
        elif nested:
            expressions = self._parse_csv(
                lambda: self._parse_types(
                    check_func=check_func, schema=schema, allow_identifiers=allow_identifiers
                )
            )
            if type_token == TokenType.NULLABLE and len(expressions) == 1:
                # NULLABLE(<type>) is represented as the inner type flagged as nullable
                this = expressions[0]
                this.set("nullable", True)
                self._match_r_paren()
                return this
        elif type_token in self.ENUM_TYPE_TOKENS:
            expressions = self._parse_csv(self._parse_equality)
        elif is_aggregate:
            func_or_ident = self._parse_function(anonymous=True) or self._parse_id_var(
                any_token=False, tokens=(TokenType.VAR,)
            )
            # NOTE(review): unlike the other failure paths, this one returns without
            # retreating to `index` -- confirm callers restore the position themselves
            if not func_or_ident or not self._match(TokenType.COMMA):
                return None
            expressions = self._parse_csv(
                lambda: self._parse_types(
                    check_func=check_func, schema=schema, allow_identifiers=allow_identifiers
                )
            )
            expressions.insert(0, func_or_ident)
        else:
            expressions = self._parse_csv(self._parse_type_size)

            # https://docs.snowflake.com/en/sql-reference/data-types-vector
            if type_token == TokenType.VECTOR and len(expressions) == 2:
                expressions[0] = exp.DataType.build(expressions[0].name, dialect=self.dialect)

        if not expressions or not self._match(TokenType.R_PAREN):
            self._retreat(index)
            return None

        maybe_func = True

    values: t.Optional[t.List[exp.Expression]] = None

    if nested and self._match(TokenType.LT):
        if is_struct:
            expressions = self._parse_csv(lambda: self._parse_struct_types(type_required=True))
        else:
            expressions = self._parse_csv(
                lambda: self._parse_types(
                    check_func=check_func, schema=schema, allow_identifiers=allow_identifiers
                )
            )

        if not self._match(TokenType.GT):
            self.raise_error("Expecting >")

        # Inline constructor values, e.g. STRUCT<...>(...) or ARRAY<...>[...]
        if self._match_set((TokenType.L_BRACKET, TokenType.L_PAREN)):
            values = self._parse_csv(self._parse_assignment)
            if not values and is_struct:
                values = None
                self._retreat(self._index - 1)
            else:
                self._match_set((TokenType.R_BRACKET, TokenType.R_PAREN))

    if type_token in self.TIMESTAMPS:
        if self._match_text_seq("WITH", "TIME", "ZONE"):
            maybe_func = False
            tz_type = (
                exp.DataType.Type.TIMETZ
                if type_token in self.TIMES
                else exp.DataType.Type.TIMESTAMPTZ
            )
            this = exp.DataType(this=tz_type, expressions=expressions)
        elif self._match_text_seq("WITH", "LOCAL", "TIME", "ZONE"):
            maybe_func = False
            this = exp.DataType(this=exp.DataType.Type.TIMESTAMPLTZ, expressions=expressions)
        elif self._match_text_seq("WITHOUT", "TIME", "ZONE"):
            maybe_func = False
    elif type_token == TokenType.INTERVAL:
        unit = self._parse_var(upper=True)
        if unit:
            if self._match_text_seq("TO"):
                unit = exp.IntervalSpan(this=unit, expression=self._parse_var(upper=True))

            this = self.expression(exp.DataType, this=self.expression(exp.Interval, unit=unit))
        else:
            this = self.expression(exp.DataType, this=exp.DataType.Type.INTERVAL)

    if maybe_func and check_func:
        # Peek for a string literal; without one, the `NAME(...)` form is not treated
        # as a type here and everything consumed so far is given back
        index2 = self._index
        peek = self._parse_string()

        if not peek:
            self._retreat(index)
            return None

        self._retreat(index2)

    if not this:
        if self._match_text_seq("UNSIGNED"):
            unsigned_type_token = self.SIGNED_TO_UNSIGNED_TYPE_TOKEN.get(type_token)
            if not unsigned_type_token:
                self.raise_error(f"Cannot convert {type_token.value} to unsigned.")

            type_token = unsigned_type_token or type_token

        this = exp.DataType(
            this=exp.DataType.Type[type_token.value],
            expressions=expressions,
            nested=nested,
            prefix=prefix,
        )

        # Empty arrays/structs are allowed
        if values is not None:
            cls = exp.Struct if is_struct else exp.Array
            this = exp.cast(cls(expressions=values), this, copy=False)

    elif expressions:
        this.set("expressions", expressions)

    # https://materialize.com/docs/sql/types/list/#type-name
    while self._match(TokenType.LIST):
        this = exp.DataType(this=exp.DataType.Type.LIST, expressions=[this], nested=True)

    index = self._index

    # Postgres supports the INT ARRAY[3] syntax as a synonym for INT[3]
    matched_array = self._match(TokenType.ARRAY)

    while self._curr:
        datatype_token = self._prev.token_type
        matched_l_bracket = self._match(TokenType.L_BRACKET)
        if not matched_l_bracket and not matched_array:
            break

        matched_array = False
        values = self._parse_csv(self._parse_assignment) or None
        if (
            values
            and not schema
            and (
                not self.dialect.SUPPORTS_FIXED_SIZE_ARRAYS or datatype_token == TokenType.ARRAY
            )
        ):
            # Retreating here means that we should not parse the following values as part of the data type, e.g. in DuckDB
            # ARRAY[1] should retreat and instead be parsed into exp.Array in contrast to INT[x][y] which denotes a fixed-size array data type
            self._retreat(index)
            break

        this = exp.DataType(
            this=exp.DataType.Type.ARRAY, expressions=[this], values=values, nested=True
        )
        self._match(TokenType.R_BRACKET)

    # Give the parser a chance to rewrite the final type via TYPE_CONVERTERS
    if self.TYPE_CONVERTERS and isinstance(this.this, exp.DataType.Type):
        converter = self.TYPE_CONVERTERS.get(this.this)
        if converter:
            this = converter(t.cast(exp.DataType, this))

    return this
def _parse_struct_types(self, type_required: bool = False) -> t.Optional[exp.Expression]:
    """
    Parse one member of a struct type: a field name with its type, or a bare type.

    Args:
        type_required: when True and the parsed item is neither a data type nor followed
            by a type token, rewind and parse the member purely as a type.
    """
    start = self._index

    # Takes care of special cases like `STRUCT<list ARRAY<...>>` where the identifier is also a
    # type token. Without this, the list will be parsed as a type and we'll eventually crash
    name_is_type_token = (
        self._curr
        and self._next
        and self._curr.token_type in self.TYPE_TOKENS
        and self._next.token_type in self.TYPE_TOKENS
    )
    if name_is_type_token:
        member = self._parse_id_var()
    else:
        member = (
            self._parse_type(parse_interval=False, fallback_to_identifier=True)
            or self._parse_id_var()
        )

    # An optional colon may separate the field name from its type
    self._match(TokenType.COLON)

    must_reparse_as_type = (
        type_required
        and not isinstance(member, exp.DataType)
        and not self._match_set(self.TYPE_TOKENS, advance=False)
    )
    if must_reparse_as_type:
        self._retreat(start)
        return self._parse_types()

    return self._parse_column_def(member)
def _parse_at_time_zone(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
if not self._match_text_seq("AT", "TIME", "ZONE"):
return this
return self.expression(exp.AtTimeZone, this=this, zone=self._parse_unary())
def _parse_column(self) -> t.Optional[exp.Expression]:
this = self._parse_column_reference()
column = self._parse_column_ops(this) if this else self._parse_bracket(this)
if self.dialect.SUPPORTS_COLUMN_JOIN_MARKS and column:
column.set("join_mark", self._match(TokenType.JOIN_MARKER))
return column
def _parse_column_reference(self) -> t.Optional[exp.Expression]:
    """
    Parse a field. A resulting bare identifier is promoted to an `exp.Column`.

    When nothing was parsed, `VALUES_FOLLOWED_BY_PAREN` is set and the upcoming VALUES
    token is not followed by `(`, that token is parsed as an identifier instead.
    """
    field = self._parse_field()

    if (
        not field
        and self._match(TokenType.VALUES, advance=False)
        and self.VALUES_FOLLOWED_BY_PAREN
    ):
        if not self._next or self._next.token_type != TokenType.L_PAREN:
            field = self._parse_id_var()

    if not isinstance(field, exp.Identifier):
        return field

    # We bubble up comments from the Identifier to the Column
    return self.expression(exp.Column, comments=field.pop_comments(), this=field)
def _parse_colon_as_variant_extract(
    self, this: t.Optional[exp.Expression]
) -> t.Optional[exp.Expression]:
    """
    Parse a chain of `:<path>` extractions applied to `this` into a single
    `exp.JSONExtract` (flagged with `variant_extract=True`), re-applying any `::` casts
    found inside the path on top of the extraction.

    Returns `this` unchanged when no colon follows.
    """
    casts = []
    json_path = []
    escape = None

    while self._match(TokenType.COLON):
        start_index = self._index

        # Snowflake allows reserved keywords as json keys but advance_any() excludes TokenType.SELECT from any_tokens=True
        path = self._parse_column_ops(
            self._parse_field(any_token=True, tokens=(TokenType.SELECT,))
        )

        # The cast :: operator has a lower precedence than the extraction operator :, so
        # we rearrange the AST appropriately to avoid casting the JSON path
        while isinstance(path, exp.Cast):
            casts.append(path.to)
            path = path.this

        if casts:
            # The path text ends right before the first `::` token after the colon.
            # NOTE: `t` below shadows the module-level `typing` alias, but only inside
            # this generator expression.
            dcolon_offset = next(
                i
                for i, t in enumerate(self._tokens[start_index:])
                if t.token_type == TokenType.DCOLON
            )
            end_token = self._tokens[start_index + dcolon_offset - 1]
        else:
            end_token = self._prev

        if path:
            # Escape single quotes from Snowflake's colon extraction (e.g. col:"a'b") as
            # it'll roundtrip to a string literal in GET_PATH
            if isinstance(path, exp.Identifier) and path.quoted:
                escape = True

            # Use the raw SQL text of this path segment
            json_path.append(self._find_sql(self._tokens[start_index], end_token))

    # The VARIANT extract in Snowflake/Databricks is parsed as a JSONExtract; Snowflake uses the json_path in GET_PATH() while
    # Databricks transforms it back to the colon/dot notation
    if json_path:
        json_path_expr = self.dialect.to_json_path(exp.Literal.string(".".join(json_path)))

        if json_path_expr:
            json_path_expr.set("escape", escape)

        this = self.expression(
            exp.JSONExtract,
            this=this,
            expression=json_path_expr,
            variant_extract=True,
        )

        # Casts were collected outermost-first, so popping applies the innermost one first
        while casts:
            this = self.expression(exp.Cast, this=this, to=casts.pop())

    return this
def _parse_dcolon(self) -> t.Optional[exp.Expression]:
return self._parse_types()
def _parse_column_ops(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """
    Apply trailing column operators (every token in `COLUMN_OPERATORS`, e.g. `::`) and
    bracket accesses to `this`, then the colon-based variant extraction when
    `COLON_IS_VARIANT_EXTRACT` is set.
    """
    this = self._parse_bracket(this)

    while self._match_set(self.COLUMN_OPERATORS):
        op_token = self._prev.token_type
        # Operators mapped to None have no builder; they extend a qualified name instead
        op = self.COLUMN_OPERATORS.get(op_token)

        if op_token == TokenType.DCOLON:
            field = self._parse_dcolon()
            if not field:
                self.raise_error("Expected type")
        elif op and self._curr:
            field = self._parse_column_reference() or self._parse_bracket()
        else:
            field = self._parse_field(any_token=True, anonymous_func=True)

        if isinstance(field, exp.Func) and this:
            # bigquery allows function calls like x.y.count(...)
            # SAFE.SUBSTR(...)
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/functions-reference#function_call_rules
            this = exp.replace_tree(
                this,
                lambda n: (
                    self.expression(exp.Dot, this=n.args.get("table"), expression=n.this)
                    if n.table
                    else n.this
                )
                if isinstance(n, exp.Column)
                else n,
            )

        if op:
            this = op(self, this, field)
        elif isinstance(this, exp.Column) and not this.args.get("catalog"):
            # Shift the existing parts one level up: the new field becomes the column name,
            # the old name becomes the table, the old table the db, and the old db the catalog
            this = self.expression(
                exp.Column,
                comments=this.comments,
                this=field,
                table=this.this,
                db=this.args.get("table"),
                catalog=this.args.get("db"),
            )
        else:
            this = self.expression(exp.Dot, this=this, expression=field)

        this = self._parse_bracket(this)

    return self._parse_colon_as_variant_extract(this) if self.COLON_IS_VARIANT_EXTRACT else this
def _parse_primary(self) -> t.Optional[exp.Expression]:
    """
    Parse a primary expression: anything handled by `PRIMARY_PARSERS`, a number written
    with a leading dot (e.g. `.5`), or a parenthesized query / tuple / expression.

    Returns None when the next token starts none of these.
    """
    if self._match_set(self.PRIMARY_PARSERS):
        token_type = self._prev.token_type
        primary = self.PRIMARY_PARSERS[token_type](self, self._prev)

        if token_type == TokenType.STRING:
            # Adjacent string literals are combined into a single Concat
            expressions = [primary]
            while self._match(TokenType.STRING):
                expressions.append(exp.Literal.string(self._prev.text))

            if len(expressions) > 1:
                return self.expression(exp.Concat, expressions=expressions)

        return primary

    if self._match_pair(TokenType.DOT, TokenType.NUMBER):
        return exp.Literal.number(f"0.{self._prev.text}")

    if self._match(TokenType.L_PAREN):
        comments = self._prev_comments
        query = self._parse_select()

        if query:
            expressions = [query]
        else:
            expressions = self._parse_expressions()

        this = self._parse_query_modifiers(seq_get(expressions, 0))

        if not this and self._match(TokenType.R_PAREN, advance=False):
            # `()` is an empty tuple
            this = self.expression(exp.Tuple)
        elif isinstance(this, exp.UNWRAPPED_QUERIES):
            this = self._parse_subquery(this=this, parse_alias=False)
        elif isinstance(this, exp.Subquery):
            this = self._parse_subquery(
                this=self._parse_set_operations(this), parse_alias=False
            )
        elif len(expressions) > 1 or self._prev.token_type == TokenType.COMMA:
            # Several items, or a trailing comma after a single one, form a tuple
            this = self.expression(exp.Tuple, expressions=expressions)
        else:
            this = self.expression(exp.Paren, this=this)

        if this:
            this.add_comments(comments)

        self._match_r_paren(expression=this)
        return this

    return None
def _parse_field(
self,
any_token: bool = False,
tokens: t.Optional[t.Collection[TokenType]] = None,
anonymous_func: bool = False,
) -> t.Optional[exp.Expression]:
if anonymous_func:
field = (
self._parse_function(anonymous=anonymous_func, any_token=any_token)
or self._parse_primary()
)
else:
field = self._parse_primary() or self._parse_function(
anonymous=anonymous_func, any_token=any_token
)
return field or self._parse_id_var(any_token=any_token, tokens=tokens)
def _parse_function(
    self,
    functions: t.Optional[t.Dict[str, t.Callable]] = None,
    anonymous: bool = False,
    optional_parens: bool = True,
    any_token: bool = False,
) -> t.Optional[exp.Expression]:
    """
    Parse a function call, optionally wrapped in the `{fn <function>}` escape syntax.

    All arguments are forwarded unchanged to `_parse_function_call`.
    """
    # This allows us to also parse {fn <function>} syntax (Snowflake, MySQL support this)
    # See: https://community.snowflake.com/s/article/SQL-Escape-Sequences
    uses_fn_escape = (
        self._match(TokenType.L_BRACE, advance=False)
        and self._next
        and self._next.text.upper() == "FN"
    )
    if uses_fn_escape:
        # Skip over both the `{` and the `fn` tokens
        self._advance(2)

    func = self._parse_function_call(
        functions=functions,
        anonymous=anonymous,
        optional_parens=optional_parens,
        any_token=any_token,
    )

    if uses_fn_escape:
        self._match(TokenType.R_BRACE)

    return func
def _parse_function_call(
self,
functions: t.Optional[t.Dict[str, t.Callable]] = None,
anonymous: bool = False,
optional_parens: bool = True,
any_token: bool = False,
) -> t.Optional[exp.Expression]:
if not self._curr:
return None
comments = self._curr.comments
token_type = self._curr.token_type
this = self._curr.text
upper = this.upper()
parser = self.NO_PAREN_FUNCTION_PARSERS.get(upper)
if optional_parens and parser and token_type not in self.INVALID_FUNC_NAME_TOKENS:
self._advance()
return self._parse_window(parser(self))
if not self._next or self._next.token_type != TokenType.L_PAREN:
if optional_parens and token_type in self.NO_PAREN_FUNCTIONS:
self._advance()
return self.expression(self.NO_PAREN_FUNCTIONS[token_type])
return None
if any_token:
if token_type in self.RESERVED_TOKENS:
return None
elif token_type not in self.FUNC_TOKENS:
return None
self._advance(2)
parser = self.FUNCTION_PARSERS.get(upper)
if parser and not anonymous:
this = parser(self)
else:
subquery_predicate = self.SUBQUERY_PREDICATES.get(token_type)
if subquery_predicate and self._curr.token_type in (TokenType.SELECT, TokenType.WITH):
this = self.expression(
subquery_predicate, comments=comments, this=self._parse_select()
)
self._match_r_paren()
return this
if functions is None:
functions = self.FUNCTIONS
function = functions.get(upper)
alias = upper in self.FUNCTIONS_WITH_ALIASED_ARGS
args = self._parse_csv(lambda: self._parse_lambda(alias=alias))
if alias:
args = self._kv_to_prop_eq(args)
if function and not anonymous:
if "dialect" in function.__code__.co_varnames:
func = function(args, dialect=self.dialect)
else:
func = function(args)
func = self.validate_expression(func, args)
if not self.dialect.NORMALIZE_FUNCTIONS:
func.meta["name"] = this
this = func
else:
if token_type == TokenType.IDENTIFIER:
this = exp.Identifier(this=this, quoted=True)
this = self.expression(exp.Anonymous, this=this, expressions=args)
if isinstance(this, exp.Expression):
this.add_comments(comments)
self._match_r_paren(this)
return self._parse_window(this)
def _to_prop_eq(self, expression: exp.Expression, index: int) -> exp.Expression:
return expression
def _kv_to_prop_eq(self, expressions: t.List[exp.Expression]) -> t.List[exp.Expression]:
transformed = []
for index, e in enumerate(expressions):
if isinstance(e, self.KEY_VALUE_DEFINITIONS):
if isinstance(e, exp.Alias):
e = self.expression(exp.PropertyEQ, this=e.args.get("alias"), expression=e.this)
if not isinstance(e, exp.PropertyEQ):
e = self.expression(
exp.PropertyEQ, this=exp.to_identifier(e.this.name), expression=e.expression
)
if isinstance(e.this, exp.Column):
e.this.replace(e.this.this)
else:
e = self._to_prop_eq(e, index)
transformed.append(e)
return transformed
def _parse_user_defined_function_expression(self) -> t.Optional[exp.Expression]:
return self._parse_statement()
def _parse_function_parameter(self) -> t.Optional[exp.Expression]:
return self._parse_column_def(self._parse_id_var())
def _parse_user_defined_function(
    self, kind: t.Optional[TokenType] = None
) -> t.Optional[exp.Expression]:
    """
    Parse a possibly dotted function name and, if a parenthesis follows, its parameter
    list into an `exp.UserDefinedFunction`.

    Args:
        kind: not used by this implementation.

    Returns:
        The bare (dotted) name when no parameter list follows, otherwise the UDF node.
    """
    name = self._parse_id_var()

    while self._match(TokenType.DOT):
        name = self.expression(exp.Dot, this=name, expression=self._parse_id_var())

    if self._match(TokenType.L_PAREN):
        parameters = self._parse_csv(self._parse_function_parameter)
        self._match_r_paren()
        return self.expression(
            exp.UserDefinedFunction, this=name, expressions=parameters, wrapped=True
        )

    return name
def _parse_introducer(self, token: Token) -> exp.Introducer | exp.Identifier:
    """
    Build an `exp.Introducer` from `token` and the primary expression that follows it.

    When no primary expression follows, the token text is returned as an identifier.
    """
    literal = self._parse_primary()
    if not literal:
        return self.expression(exp.Identifier, this=token.text)
    return self.expression(exp.Introducer, this=token.text, expression=literal)
def _parse_session_parameter(self) -> exp.SessionParameter:
    """
    Parse a session parameter, optionally written as `<kind>.<name>`.

    With the dotted form, the name of the first part is stored as `kind` and the part
    after the dot becomes the parameter itself.
    """
    kind = None
    parameter = self._parse_id_var() or self._parse_primary()

    if parameter and self._match(TokenType.DOT):
        kind = parameter.name
        parameter = self._parse_var() or self._parse_primary()

    return self.expression(exp.SessionParameter, this=parameter, kind=kind)
def _parse_lambda_arg(self) -> t.Optional[exp.Expression]:
return self._parse_id_var()
def _parse_lambda(self, alias: bool = False) -> t.Optional[exp.Expression]:
    """
    Parse a function argument, which may be a lambda.

    A lambda is recognized when an argument list (bare or parenthesized) is followed by
    one of the `LAMBDAS` tokens. Otherwise the parser rewinds and reads a regular
    argument: `DISTINCT <exprs>` or a select / expression, followed by the optional
    null-handling, HAVING MAX, ORDER and LIMIT modifiers.

    Args:
        alias: forwarded to `_parse_select_or_expression`.
    """
    start = self._index

    if not self._match(TokenType.L_PAREN):
        lambda_args = [self._parse_lambda_arg()]
    else:
        lambda_args = t.cast(
            t.List[t.Optional[exp.Expression]], self._parse_csv(self._parse_lambda_arg)
        )
        if not self._match(TokenType.R_PAREN):
            self._retreat(start)

    if self._match_set(self.LAMBDAS):
        build_lambda = self.LAMBDAS[self._prev.token_type]
        return build_lambda(self, lambda_args)

    # Not a lambda after all: rewind and parse a regular argument
    self._retreat(start)

    argument: t.Optional[exp.Expression]
    if self._match(TokenType.DISTINCT):
        argument = self.expression(
            exp.Distinct, expressions=self._parse_csv(self._parse_assignment)
        )
    else:
        argument = self._parse_select_or_expression(alias=alias)

    argument = self._parse_respect_or_ignore_nulls(argument)
    argument = self._parse_having_max(argument)
    argument = self._parse_order(argument)
    return self._parse_limit(argument)
def _parse_schema(self, this: t.Optional[exp.Expression] = None) -> t.Optional[exp.Expression]:
    """
    Parse a parenthesized list of constraints / field definitions into an `exp.Schema`.

    Returns `this` unchanged when no parenthesis follows, or when the parenthesis opens
    a query (in which case the parser rewinds so the query can be parsed by the caller).
    """
    start = self._index

    if self._match(TokenType.L_PAREN):
        # Disambiguate between schema and subquery/CTE, e.g. in INSERT INTO table (<expr>),
        # expr can be of both types
        if not self._match_set(self.SELECT_START_TOKENS):
            definitions = self._parse_csv(
                lambda: self._parse_constraint() or self._parse_field_def()
            )
            self._match_r_paren()
            return self.expression(exp.Schema, this=this, expressions=definitions)

        self._retreat(start)

    return this
def _parse_field_def(self) -> t.Optional[exp.Expression]:
return self._parse_column_def(self._parse_field(any_token=True))
def _parse_column_def(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """
    Parse the type and constraints that follow a column name into an `exp.ColumnDef`.

    Args:
        this: the already-parsed column name.

    Returns:
        The column definition, or the name itself when neither a type nor any
        constraint follows it.
    """
    # column defs are not really columns, they're identifiers
    if isinstance(this, exp.Column):
        this = this.this

    kind = self._parse_types(schema=True)

    if self._match_text_seq("FOR", "ORDINALITY"):
        return self.expression(exp.ColumnDef, this=this, ordinality=True)

    constraints: t.List[exp.Expression] = []

    # Computed columns: `<name> AS <expr>` (only when no type was given), or the
    # ALIAS / MATERIALIZED keywords
    if (not kind and self._match(TokenType.ALIAS)) or self._match_texts(
        ("ALIAS", "MATERIALIZED")
    ):
        persisted = self._prev.text.upper() == "MATERIALIZED"
        constraint_kind = exp.ComputedColumnConstraint(
            this=self._parse_assignment(),
            persisted=persisted or self._match_text_seq("PERSISTED"),
            not_null=self._match_pair(TokenType.NOT, TokenType.NULL),
        )
        constraints.append(self.expression(exp.ColumnConstraint, kind=constraint_kind))
    elif kind and self._match_pair(TokenType.ALIAS, TokenType.L_PAREN, advance=False):
        # `<name> <type> AS (<expr>)` is recorded as a transform constraint
        self._match(TokenType.ALIAS)
        constraints.append(
            self.expression(
                exp.ColumnConstraint,
                kind=exp.TransformColumnConstraint(this=self._parse_field()),
            )
        )

    # Collect every remaining column constraint
    while True:
        constraint = self._parse_column_constraint()
        if not constraint:
            break
        constraints.append(constraint)

    if not kind and not constraints:
        return this

    return self.expression(exp.ColumnDef, this=this, kind=kind, constraints=constraints)
def _parse_auto_increment(
    self,
) -> exp.GeneratedAsIdentityColumnConstraint | exp.AutoIncrementColumnConstraint:
    """
    Parse the optional arguments of an auto-increment constraint, written either as
    `(<start>, <increment>)` or as `START <start> [INCREMENT] <increment>`.

    Returns:
        A `GeneratedAsIdentityColumnConstraint` when both a start and an increment were
        parsed, otherwise a plain `AutoIncrementColumnConstraint`.
    """
    start = increment = None

    if self._match(TokenType.L_PAREN, advance=False):
        bounds = self._parse_wrapped_csv(self._parse_bitwise)
        start = seq_get(bounds, 0)
        increment = seq_get(bounds, 1)
    elif self._match_text_seq("START"):
        start = self._parse_bitwise()
        self._match_text_seq("INCREMENT")
        increment = self._parse_bitwise()

    if not (start and increment):
        return exp.AutoIncrementColumnConstraint()

    return exp.GeneratedAsIdentityColumnConstraint(start=start, increment=increment)
def _parse_auto_property(self) -> t.Optional[exp.AutoRefreshProperty]:
if not self._match_text_seq("REFRESH"):
self._retreat(self._index - 1)
return None
return self.expression(exp.AutoRefreshProperty, this=self._parse_var(upper=True))
def _parse_compress(self) -> exp.CompressColumnConstraint:
    """
    Parse the argument of a COMPRESS constraint.

    A parenthesized, comma-separated list is parsed when the next token is `(`;
    otherwise a single bitwise-level expression is parsed.
    """
    if self._match(TokenType.L_PAREN, advance=False):
        values = self._parse_wrapped_csv(self._parse_bitwise)
    else:
        values = self._parse_bitwise()

    return self.expression(exp.CompressColumnConstraint, this=values)
def _parse_generated_as_identity(
    self,
) -> (
    exp.GeneratedAsIdentityColumnConstraint
    | exp.ComputedColumnConstraint
    | exp.GeneratedAsRowColumnConstraint
):
    """
    Parse what follows a GENERATED keyword in a column definition.

    Handled shapes (as matched below):
        - `[BY DEFAULT [ON NULL] | ALWAYS] [AS] ROW [START | END] [HIDDEN]`
        - `[BY DEFAULT [ON NULL] | ALWAYS] [AS] [IDENTITY] [( ... )]`

    Returns:
        A `GeneratedAsRowColumnConstraint` for the ROW form, otherwise the
        `GeneratedAsIdentityColumnConstraint` populated with whatever options were found.
    """
    # `this=False` records BY DEFAULT, `this=True` records ALWAYS (also used when neither is given)
    if self._match_text_seq("BY", "DEFAULT"):
        on_null = self._match_pair(TokenType.ON, TokenType.NULL)
        this = self.expression(
            exp.GeneratedAsIdentityColumnConstraint, this=False, on_null=on_null
        )
    else:
        self._match_text_seq("ALWAYS")
        this = self.expression(exp.GeneratedAsIdentityColumnConstraint, this=True)

    self._match(TokenType.ALIAS)

    if self._match_text_seq("ROW"):
        start = self._match_text_seq("START")
        if not start:
            self._match(TokenType.END)
        hidden = self._match_text_seq("HIDDEN")
        return self.expression(exp.GeneratedAsRowColumnConstraint, start=start, hidden=hidden)

    identity = self._match_text_seq("IDENTITY")

    if self._match(TokenType.L_PAREN):
        # Each option is optional, but they are only recognized in this order
        if self._match(TokenType.START_WITH):
            this.set("start", self._parse_bitwise())
        if self._match_text_seq("INCREMENT", "BY"):
            this.set("increment", self._parse_bitwise())
        if self._match_text_seq("MINVALUE"):
            this.set("minvalue", self._parse_bitwise())
        if self._match_text_seq("MAXVALUE"):
            this.set("maxvalue", self._parse_bitwise())

        if self._match_text_seq("CYCLE"):
            this.set("cycle", True)
        elif self._match_text_seq("NO", "CYCLE"):
            this.set("cycle", False)

        if not identity:
            # Without IDENTITY, the parenthesized content is stored as a generic expression
            this.set("expression", self._parse_range())
        elif not this.args.get("start") and self._match(TokenType.NUMBER, advance=False):
            # Positional form: IDENTITY (<start>, <increment>)
            args = self._parse_csv(self._parse_bitwise)
            this.set("start", seq_get(args, 0))
            this.set("increment", seq_get(args, 1))

        self._match_r_paren()

    return this
def _parse_inline(self) -> exp.InlineLengthColumnConstraint:
    """Parse `[LENGTH] <expr>` into an `InlineLengthColumnConstraint`."""
    self._match_text_seq("LENGTH")
    length = self._parse_bitwise()
    return self.expression(exp.InlineLengthColumnConstraint, this=length)
def _parse_not_constraint(self) -> t.Optional[exp.Expression]:
if self._match_text_seq("NULL"):
return self.expression(exp.NotNullColumnConstraint)
if self._match_text_seq("CASESPECIFIC"):
return self.expression(exp.CaseSpecificColumnConstraint, not_=True)
if self._match_text_seq("FOR", "REPLICATION"):
return self.expression(exp.NotForReplicationColumnConstraint)
# Unconsume the `NOT` token
self._retreat(self._index - 1)
return None
def _parse_column_constraint(self) -> t.Optional[exp.Expression]:
    """
    Parse a single column constraint, optionally prefixed by `CONSTRAINT <name>`.

    Returns:
        A `ColumnConstraint` wrapping the kind produced by the matching entry of
        `CONSTRAINT_PARSERS`. If no constraint keyword follows, or if the upcoming `WITH`
        introduces one of `PROCEDURE_OPTIONS`, only the (possibly empty) name is returned.
    """
    name = self._match(TokenType.CONSTRAINT) and self._parse_id_var()

    # `WITH <procedure option>` is not a constraint; leave it for the caller
    if (
        self._match(TokenType.WITH, advance=False)
        and self._next
        and self._next.text.upper() in self.PROCEDURE_OPTIONS
    ):
        return name

    if not self._match_texts(self.CONSTRAINT_PARSERS):
        return name

    return self.expression(
        exp.ColumnConstraint,
        this=name,
        kind=self.CONSTRAINT_PARSERS[self._prev.text.upper()](self),
    )
def _parse_constraint(self) -> t.Optional[exp.Expression]:
    """
    Parse a schema-level constraint.

    `CONSTRAINT <name> <constraints...>` yields an `exp.Constraint`; without the
    `CONSTRAINT` keyword, a single unnamed constraint limited to
    `SCHEMA_UNNAMED_CONSTRAINTS` is attempted instead.
    """
    if self._match(TokenType.CONSTRAINT):
        return self.expression(
            exp.Constraint,
            this=self._parse_id_var(),
            expressions=self._parse_unnamed_constraints(),
        )

    return self._parse_unnamed_constraint(constraints=self.SCHEMA_UNNAMED_CONSTRAINTS)
def _parse_unnamed_constraints(self) -> t.List[exp.Expression]:
constraints = []
while True:
constraint = self._parse_unnamed_constraint() or self._parse_function()
if not constraint:
break
constraints.append(constraint)
return constraints
def _parse_unnamed_constraint(
    self, constraints: t.Optional[t.Collection[str]] = None
) -> t.Optional[exp.Expression]:
    """
    Parse one constraint that is not introduced by `CONSTRAINT <name>`.

    Args:
        constraints: Keywords to accept; defaults to every key of `CONSTRAINT_PARSERS`.

    Returns:
        The result of the matching parser, or None if the next token is a quoted
        identifier or none of the accepted keywords follows.
    """
    if self._match(TokenType.IDENTIFIER, advance=False):
        return None

    accepted = constraints or self.CONSTRAINT_PARSERS
    if not self._match_texts(accepted):
        return None

    keyword = self._prev.text.upper()
    if keyword not in self.CONSTRAINT_PARSERS:
        self.raise_error(f"No parser found for schema constraint {keyword}.")

    return self.CONSTRAINT_PARSERS[keyword](self)
def _parse_unique_key(self) -> t.Optional[exp.Expression]:
return self._parse_id_var(any_token=False)
def _parse_unique(self) -> exp.UniqueColumnConstraint:
    """
    Parse the remainder of a UNIQUE constraint.

    Consumes, in order: an optional `KEY`, an optional `NULLS NOT DISTINCT`, an optional
    key name with schema, an optional `USING <index type>` and an optional ON CONFLICT
    clause.
    """
    self._match_text_seq("KEY")

    nulls_not_distinct = self._match_text_seq("NULLS", "NOT", "DISTINCT")
    key_schema = self._parse_schema(self._parse_unique_key())
    index_type = self._match(TokenType.USING) and self._advance_any() and self._prev.text
    conflict = self._parse_on_conflict()

    return self.expression(
        exp.UniqueColumnConstraint,
        nulls=nulls_not_distinct,
        this=key_schema,
        index_type=index_type,
        on_conflict=conflict,
    )
def _parse_key_constraint_options(self) -> t.List[str]:
    """
    Parse the options trailing a key constraint and return them as strings.

    `ON <event> <action>` clauses are rendered as `"ON <event> <action>"`, where the
    action is one of NO ACTION, CASCADE, RESTRICT, SET NULL or SET DEFAULT. Any other
    option is looked up in `KEY_CONSTRAINT_OPTIONS`; parsing stops at the first token
    that is neither.
    """
    parsed = []

    while self._curr:
        if not self._match(TokenType.ON):
            option = self._parse_var_from_options(
                self.KEY_CONSTRAINT_OPTIONS, raise_unmatched=False
            )
            if not option:
                break

            parsed.append(option.name)
            continue

        # The token right after ON names the triggering event
        event = self._advance_any() and self._prev.text

        if self._match_text_seq("NO", "ACTION"):
            action = "NO ACTION"
        elif self._match_text_seq("CASCADE"):
            action = "CASCADE"
        elif self._match_text_seq("RESTRICT"):
            action = "RESTRICT"
        elif self._match_pair(TokenType.SET, TokenType.NULL):
            action = "SET NULL"
        elif self._match_pair(TokenType.SET, TokenType.DEFAULT):
            action = "SET DEFAULT"
        else:
            action = None
            self.raise_error("Invalid key constraint")

        parsed.append(f"ON {event} {action}")

    return parsed
def _parse_references(self, match: bool = True) -> t.Optional[exp.Reference]:
    """
    Parse a REFERENCES clause into an `exp.Reference`.

    Args:
        match: Whether the `REFERENCES` keyword must be consumed first. When it is
            required but absent, None is returned.
    """
    if match and not self._match(TokenType.REFERENCES):
        return None

    table = self._parse_table(schema=True)
    key_options = self._parse_key_constraint_options()

    return self.expression(exp.Reference, this=table, expressions=None, options=key_options)
def _parse_foreign_key(self) -> exp.ForeignKey:
    """
    Parse the body of a FOREIGN KEY constraint.

    Consumes the wrapped list of key columns, an optional REFERENCES clause, and any
    number of `ON {DELETE | UPDATE} <action>` clauses. Each action is stored on the
    resulting node under the lowercased event name (`delete` / `update`).

    Raises:
        ParseError (via `raise_error`): if `ON` is not followed by DELETE or UPDATE, or
        if `SET` is not followed by NULL or DEFAULT.
    """
    expressions = self._parse_wrapped_id_vars()
    reference = self._parse_references()
    options = {}

    while self._match(TokenType.ON):
        if not self._match_set((TokenType.DELETE, TokenType.UPDATE)):
            self.raise_error("Expected DELETE or UPDATE")

        kind = self._prev.text.lower()

        if self._match_text_seq("NO", "ACTION"):
            action = "NO ACTION"
        elif self._match(TokenType.SET):
            # Without this check a stray token after SET would leave `_prev` on the SET
            # token itself and silently produce the action "SET SET"
            if not self._match_set((TokenType.NULL, TokenType.DEFAULT)):
                self.raise_error("Expected NULL or DEFAULT after SET")
            action = "SET " + self._prev.text.upper()
        else:
            # Single-keyword actions (e.g. CASCADE, RESTRICT) are taken verbatim
            self._advance()
            action = self._prev.text.upper()

        options[kind] = action

    return self.expression(
        exp.ForeignKey,
        expressions=expressions,
        reference=reference,
        **options,  # type: ignore
    )
def _parse_primary_key_part(self) -> t.Optional[exp.Expression]:
return self._parse_field()
def _parse_period_for_system_time(self) -> t.Optional[exp.PeriodForSystemTimeConstraint]:
    """
    Parse a wrapped pair of identifiers into a `PeriodForSystemTimeConstraint`.

    If the `TIMESTAMP_SNAPSHOT` token does not follow, the parser steps back one token
    and None is returned.
    """
    if self._match(TokenType.TIMESTAMP_SNAPSHOT):
        period_columns = self._parse_wrapped_id_vars()
        return self.expression(
            exp.PeriodForSystemTimeConstraint,
            this=seq_get(period_columns, 0),
            expression=seq_get(period_columns, 1),
        )

    self._retreat(self._index - 1)
    return None
def _parse_primary_key(
    self, wrapped_optional: bool = False, in_props: bool = False
) -> exp.PrimaryKeyColumnConstraint | exp.PrimaryKey:
    """
    Parse a PRIMARY KEY constraint, in either its column-level or table-level form.

    Args:
        wrapped_optional: Forwarded as `optional` when parsing the wrapped column list.
        in_props: When set, the table-level form is always parsed, even without `(`.

    Returns:
        A `PrimaryKeyColumnConstraint` when no column list follows (and `in_props` is
        unset), otherwise a `PrimaryKey` with its columns and trailing options.
    """
    descending = (
        self._match_set((TokenType.ASC, TokenType.DESC))
        and self._prev.token_type == TokenType.DESC
    )

    if not (in_props or self._match(TokenType.L_PAREN, advance=False)):
        return self.expression(exp.PrimaryKeyColumnConstraint, desc=descending)

    key_parts = self._parse_wrapped_csv(
        self._parse_primary_key_part, optional=wrapped_optional
    )
    key_options = self._parse_key_constraint_options()

    return self.expression(exp.PrimaryKey, expressions=key_parts, options=key_options)
def _parse_bracket_key_value(self, is_map: bool = False) -> t.Optional[exp.Expression]:
return self._parse_slice(self._parse_alias(self._parse_assignment(), explicit=True))
def _parse_odbc_datetime_literal(self) -> exp.Expression:
    """
    Parses a datetime literal written in ODBC escape format. The literal is parsed into
    the corresponding expression type, e.g. `{d'yyyy-mm-dd'}` becomes a `Date`
    expression, the same result as parsing `DATE('yyyy-mm-dd')`.

    The opening brace is expected to have been consumed by the caller.

    Reference:
    https://learn.microsoft.com/en-us/sql/odbc/reference/develop-app/date-time-and-timestamp-literals
    """
    self._match(TokenType.VAR)
    literal_class = self.ODBC_DATETIME_LITERALS[self._prev.text.lower()]
    literal = self.expression(exp_class=literal_class, this=self._parse_string())

    closed = self._match(TokenType.R_BRACE)
    if not closed:
        self.raise_error("Expected }")

    return literal
def _parse_bracket(self, this: t.Optional[exp.Expression] = None) -> t.Optional[exp.Expression]:
    """
    Parse a `[...]` or `{...}` construct that may follow `this`.

    Depending on the opening token and on `this`, the result is an ODBC datetime
    literal, a struct, an array constructor, or a subscript (`exp.Bracket`) applied to
    `this`. The method recurses so that chained brackets are all consumed.

    Returns:
        `this` unchanged if no opening bracket / brace follows, otherwise the parsed node.
    """
    if not self._match_set((TokenType.L_BRACKET, TokenType.L_BRACE)):
        return this

    bracket_kind = self._prev.token_type
    # `{d '...'}`-style ODBC literal: a brace directly followed by a known literal prefix
    if (
        bracket_kind == TokenType.L_BRACE
        and self._curr
        and self._curr.token_type == TokenType.VAR
        and self._curr.text.lower() in self.ODBC_DATETIME_LITERALS
    ):
        return self._parse_odbc_datetime_literal()

    expressions = self._parse_csv(
        lambda: self._parse_bracket_key_value(is_map=bracket_kind == TokenType.L_BRACE)
    )

    # The closing token must pair with the opening one
    if bracket_kind == TokenType.L_BRACKET and not self._match(TokenType.R_BRACKET):
        self.raise_error("Expected ]")
    elif bracket_kind == TokenType.L_BRACE and not self._match(TokenType.R_BRACE):
        self.raise_error("Expected }")

    # https://duckdb.org/docs/sql/data_types/struct.html#creating-structs
    if bracket_kind == TokenType.L_BRACE:
        this = self.expression(exp.Struct, expressions=self._kv_to_prop_eq(expressions))
    elif not this:
        # A bare `[...]` with nothing in front of it is an array literal
        this = build_array_constructor(
            exp.Array, args=expressions, bracket_kind=bracket_kind, dialect=self.dialect
        )
    else:
        # `NAME[...]` where NAME is a known array constructor builds that constructor
        constructor_type = self.ARRAY_CONSTRUCTORS.get(this.name.upper())
        if constructor_type:
            return build_array_constructor(
                constructor_type,
                args=expressions,
                bracket_kind=bracket_kind,
                dialect=self.dialect,
            )

        # Otherwise it is a subscript; shift indices by the dialect's index offset
        expressions = apply_index_offset(this, expressions, -self.dialect.INDEX_OFFSET)
        this = self.expression(exp.Bracket, this=this, expressions=expressions)

    self._add_comments(this)
    return self._parse_bracket(this)
def _parse_slice(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """Wrap `this` in an `exp.Slice` if a colon follows; otherwise return it unchanged."""
    if not self._match(TokenType.COLON):
        return this

    upper_bound = self._parse_assignment()
    return self.expression(exp.Slice, this=this, expression=upper_bound)
def _parse_case(self) -> t.Optional[exp.Expression]:
    """
    Parse the body of a CASE expression (the CASE keyword is already consumed).

    Grammar handled: `[<operand>] {WHEN <cond> [THEN] <result>}... [ELSE <default>] END`.
    """
    # Capture comments before any further token is consumed
    case_comments = self._prev_comments
    operand = self._parse_assignment()

    branches = []
    while self._match(TokenType.WHEN):
        condition = self._parse_assignment()
        self._match(TokenType.THEN)
        outcome = self._parse_assignment()
        branches.append(self.expression(exp.If, this=condition, true=outcome))

    fallback = self._parse_assignment() if self._match(TokenType.ELSE) else None

    if not self._match(TokenType.END):
        # `ELSE interval END` gets parsed as an INTERVAL whose value is `END`; in that
        # case the default is really a column named "interval"
        swallowed_end = (
            isinstance(fallback, exp.Interval) and fallback.this.sql().upper() == "END"
        )
        if swallowed_end:
            fallback = exp.column("interval")
        else:
            self.raise_error("Expected END after CASE", self._prev)

    return self.expression(
        exp.Case, comments=case_comments, this=operand, ifs=branches, default=fallback
    )
def _parse_if(self) -> t.Optional[exp.Expression]:
    """
    Parse an IF construct (the IF keyword is already consumed).

    Supports the function form `IF(<args>)` and the statement form
    `IF <cond> [THEN] <true> [ELSE <false>] [END]`. When `NO_PAREN_IF_COMMANDS` is set
    and IF is the first token, the statement is parsed as a command instead. Returns
    None, after rewinding to the IF token, if no condition can be parsed.
    """
    if self._match(TokenType.L_PAREN):
        arguments = self._parse_csv(self._parse_assignment)
        node = self.validate_expression(exp.If.from_arg_list(arguments), arguments)
        self._match_r_paren()
        return node

    # Position of the IF token itself
    if_index = self._index - 1

    if self.NO_PAREN_IF_COMMANDS and if_index == 0:
        return self._parse_as_command(self._prev)

    condition = self._parse_assignment()
    if not condition:
        self._retreat(if_index)
        return None

    self._match(TokenType.THEN)
    when_true = self._parse_assignment()

    when_false = None
    if self._match(TokenType.ELSE):
        when_false = self._parse_assignment()

    self._match(TokenType.END)
    return self.expression(exp.If, this=condition, true=when_true, false=when_false)
def _parse_next_value_for(self) -> t.Optional[exp.Expression]:
if not self._match_text_seq("VALUE", "FOR"):
self._retreat(self._index - 1)
return None
return self.expression(
exp.NextValueFor,
this=self._parse_column(),
order=self._match(TokenType.OVER) and self._parse_wrapped(self._parse_order),
)
def _parse_extract(self) -> exp.Extract | exp.Anonymous:
    """
    Parse the arguments of EXTRACT: `<part> FROM <expr>` or `<part>, <expr>`.

    An error is raised through `raise_error` when neither FROM nor a comma separates
    the two arguments.
    """
    part = self._parse_function() or self._parse_var_or_string(upper=True)

    # The comma is only tried when FROM is absent
    if not (self._match(TokenType.FROM) or self._match(TokenType.COMMA)):
        self.raise_error("Expected FROM or comma after EXTRACT", self._prev)

    return self.expression(exp.Extract, this=part, expression=self._parse_bitwise())
def _parse_gap_fill(self) -> exp.GapFill:
    """
    Parse the arguments of GAP_FILL: `[TABLE] <table> [,] <arg>, ...`.

    The table is passed as the first positional argument, followed by the remaining
    comma-separated arguments.
    """
    self._match(TokenType.TABLE)
    table = self._parse_table()
    self._match(TokenType.COMMA)

    arguments = [table]
    arguments.extend(self._parse_csv(self._parse_lambda))

    return self.validate_expression(exp.GapFill.from_arg_list(arguments), arguments)
def _parse_cast(self, strict: bool, safe: t.Optional[bool] = None) -> exp.Expression:
    """
    Parse the arguments of a CAST-like function.

    Args:
        strict: Build an `exp.Cast` when True, an `exp.TryCast` otherwise.
        safe: Forwarded as the `safe` arg of the resulting expression.

    Returns:
        - `exp.CastToStrType` for the `CAST(<expr>, '<type>')` form,
        - `exp.StrToDate` / `exp.StrToTime` when a FORMAT clause targets a temporal type,
        - otherwise the `exp.Cast` / `exp.TryCast` node.
    """
    this = self._parse_assignment()

    if not self._match(TokenType.ALIAS):
        if self._match(TokenType.COMMA):
            return self.expression(exp.CastToStrType, this=this, to=self._parse_string())

        self.raise_error("Expected AS after CAST")

    fmt = None
    to = self._parse_types()

    if self._match(TokenType.FORMAT):
        fmt_string = self._parse_string()
        fmt = self._parse_at_time_zone(fmt_string)

        if not to:
            to = exp.DataType.build(exp.DataType.Type.UNKNOWN)
        if to.this in exp.DataType.TEMPORAL_TYPES:
            # A formatted cast to a temporal type is represented as string-to-date/time
            # parsing, with the format translated via the dialect's mappings
            this = self.expression(
                exp.StrToDate if to.this == exp.DataType.Type.DATE else exp.StrToTime,
                this=this,
                format=exp.Literal.string(
                    format_time(
                        fmt_string.this if fmt_string else "",
                        self.dialect.FORMAT_MAPPING or self.dialect.TIME_MAPPING,
                        self.dialect.FORMAT_TRIE or self.dialect.TIME_TRIE,
                    )
                ),
                safe=safe,
            )

            # Carry over the zone of `FORMAT '...' AT TIME ZONE <zone>`
            if isinstance(fmt, exp.AtTimeZone) and isinstance(this, exp.StrToTime):
                this.set("zone", fmt.args["zone"])
            return this
    elif not to:
        self.raise_error("Expected TYPE after CAST")
    elif isinstance(to, exp.Identifier):
        # An identifier in type position is built as a user-defined type
        to = exp.DataType.build(to.name, udt=True)
    elif to.this == exp.DataType.Type.CHAR:
        if self._match(TokenType.CHARACTER_SET):
            to = self.expression(exp.CharacterSet, this=self._parse_var_or_string())

    return self.expression(
        exp.Cast if strict else exp.TryCast,
        this=this,
        to=to,
        format=fmt,
        safe=safe,
        action=self._parse_var_from_options(self.CAST_ACTIONS, raise_unmatched=False),
    )
def _parse_string_agg(self) -> exp.GroupConcat:
    """
    Parse the arguments of a STRING_AGG / LISTAGG-style aggregate into `exp.GroupConcat`.

    Handles an optional leading DISTINCT, an optional `ON OVERFLOW` clause, an inline
    `ORDER BY ... [LIMIT n]` before the closing paren, and a trailing
    `WITHIN GROUP (ORDER BY ...)`.
    """
    if self._match(TokenType.DISTINCT):
        # Only the first argument is wrapped in DISTINCT; the rest are parsed as usual
        args: t.List[t.Optional[exp.Expression]] = [
            self.expression(exp.Distinct, expressions=[self._parse_assignment()])
        ]
        if self._match(TokenType.COMMA):
            args.extend(self._parse_csv(self._parse_assignment))
    else:
        args = self._parse_csv(self._parse_assignment)  # type: ignore

    if self._match_text_seq("ON", "OVERFLOW"):
        # trino: LISTAGG(expression [, separator] [ON OVERFLOW overflow_behavior])
        if self._match_text_seq("ERROR"):
            on_overflow: t.Optional[exp.Expression] = exp.var("ERROR")
        else:
            self._match_text_seq("TRUNCATE")
            on_overflow = self.expression(
                exp.OverflowTruncateBehavior,
                this=self._parse_string(),
                # WITH COUNT is assumed unless WITHOUT COUNT is spelled out
                with_count=(
                    self._match_text_seq("WITH", "COUNT")
                    or not self._match_text_seq("WITHOUT", "COUNT")
                ),
            )
    else:
        on_overflow = None

    # Remember the position so the closing paren can be handed back to the caller
    index = self._index
    if not self._match(TokenType.R_PAREN) and args:
        # postgres: STRING_AGG([DISTINCT] expression, separator [ORDER BY expression1 {ASC | DESC} [, ...]])
        # bigquery: STRING_AGG([DISTINCT] expression [, separator] [ORDER BY key [{ASC | DESC}] [, ... ]] [LIMIT n])
        args[-1] = self._parse_limit(this=self._parse_order(this=args[-1]))
        return self.expression(exp.GroupConcat, this=args[0], separator=seq_get(args, 1))

    # Checks if we can parse an order clause: WITHIN GROUP (ORDER BY <order_by_expression_list> [ASC | DESC]).
    # This is done "manually", instead of letting _parse_window parse it into an exp.WithinGroup node, so that
    # the STRING_AGG call is parsed like in MySQL / SQLite and can thus be transpiled more easily to them.
    if not self._match_text_seq("WITHIN", "GROUP"):
        self._retreat(index)
        return self.validate_expression(exp.GroupConcat.from_arg_list(args), args)

    # The corresponding match_r_paren will be called in parse_function (caller)
    self._match_l_paren()

    return self.expression(
        exp.GroupConcat,
        this=self._parse_order(this=seq_get(args, 0)),
        separator=seq_get(args, 1),
        on_overflow=on_overflow,
    )
def _parse_convert(
    self, strict: bool, safe: t.Optional[bool] = None
) -> t.Optional[exp.Expression]:
    """
    Parse the arguments of CONVERT: `<expr> USING <charset>` or `<expr>, <type>`.

    Args:
        strict: Build an `exp.Cast` when True, an `exp.TryCast` otherwise.
        safe: Forwarded as the `safe` arg of the resulting expression.
    """
    value = self._parse_bitwise()

    target: t.Optional[exp.Expression] = None
    if self._match(TokenType.USING):
        target = self.expression(exp.CharacterSet, this=self._parse_var())
    elif self._match(TokenType.COMMA):
        target = self._parse_types()

    cast_class = exp.Cast if strict else exp.TryCast
    return self.expression(cast_class, this=value, to=target, safe=safe)
def _parse_decode(self) -> t.Optional[exp.Decode | exp.Case]:
"""
There are generally two variants of the DECODE function:
- DECODE(bin, charset)
- DECODE(expression, search, result [, search, result] ... [, default])
The second variant will always be parsed into a CASE expression. Note that NULL
needs special treatment, since we need to explicitly check for it with `IS NULL`,
instead of relying on pattern matching.
"""
args = self._parse_csv(self._parse_assignment)
if len(args) < 3:
return self.expression(exp.Decode, this=seq_get(args, 0), charset=seq_get(args, 1))
expression, *expressions = args
if not expression:
return None
ifs = []
for search, result in zip(expressions[::2], expressions[1::2]):
if not search or not result:
return None
if isinstance(search, exp.Literal):
ifs.append(
exp.If(this=exp.EQ(this=expression.copy(), expression=search), true=result)
)
elif isinstance(search, exp.Null):
ifs.append(
exp.If(this=exp.Is(this=expression.copy(), expression=exp.Null()), true=result)
)
else:
cond = exp.or_(
exp.EQ(this=expression.copy(), expression=search),
exp.and_(
exp.Is(this=expression.copy(), expression=exp.Null()),
exp.Is(this=search.copy(), expression=exp.Null()),
copy=False,
),
copy=False,
)
ifs.append(exp.If(this=cond, true=result))
return exp.Case(ifs=ifs, default=expressions[-1] if len(expressions) % 2 == 1 else None)
def _parse_json_key_value(self) -> t.Optional[exp.JSONKeyValue]:
self._match_text_seq("KEY")
key = self._parse_column()
self._match_set(self.JSON_KEY_VALUE_SEPARATOR_TOKENS)
self._match_text_seq("VALUE")
value = self._parse_bitwise()
if not key and not value:
return None
return self.expression(exp.JSONKeyValue, this=key, expression=value)
def _parse_format_json(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
if not this or not self._match_text_seq("FORMAT", "JSON"):
return this
return self.expression(exp.FormatJson, this=this)
def _parse_on_condition(self) -> t.Optional[exp.OnCondition]:
# MySQL uses "X ON EMPTY Y ON ERROR" (e.g. JSON_VALUE) while Oracle uses the opposite (e.g. JSON_EXISTS)
if self.dialect.ON_CONDITION_EMPTY_BEFORE_ERROR:
empty = self._parse_on_handling("EMPTY", *self.ON_CONDITION_TOKENS)
error = self._parse_on_handling("ERROR", *self.ON_CONDITION_TOKENS)
else:
error = self._parse_on_handling("ERROR", *self.ON_CONDITION_TOKENS)
empty = self._parse_on_handling("EMPTY", *self.ON_CONDITION_TOKENS)
null = self._parse_on_handling("NULL", *self.ON_CONDITION_TOKENS)
if not empty and not error and not null:
return None
return self.expression(
exp.OnCondition,
empty=empty,
error=error,
null=null,
)
def _parse_on_handling(
self, on: str, *values: str
) -> t.Optional[str] | t.Optional[exp.Expression]:
# Parses the "X ON Y" or "DEFAULT <expr> ON Y syntax, e.g. NULL ON NULL (Oracle, T-SQL, MySQL)
for value in values:
if self._match_text_seq(value, "ON", on):
return f"{value} ON {on}"
index = self._index
if self._match(TokenType.DEFAULT):
default_value = self._parse_bitwise()
if self._match_text_seq("ON", on):
return default_value
self._retreat(index)
return None
@t.overload
def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

@t.overload
def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

def _parse_json_object(self, agg=False):
    """
    Parse the arguments of JSON_OBJECT (or JSON_OBJECTAGG when `agg` is set).

    Consumes, in order: either `*` or a comma-separated list of key/value entries, an
    optional `{NULL | ABSENT} ON NULL` clause, an optional `{WITH | WITHOUT} UNIQUE [KEYS]`
    clause, an optional `RETURNING <type> [FORMAT JSON]` and an optional `ENCODING <var>`.
    """
    star = self._parse_star()
    if star:
        entries = [star]
    else:
        entries = self._parse_csv(lambda: self._parse_format_json(self._parse_json_key_value()))

    null_handling = self._parse_on_handling("NULL", "NULL", "ABSENT")

    # Tri-state: True (WITH UNIQUE), False (WITHOUT UNIQUE) or None (unspecified)
    if self._match_text_seq("WITH", "UNIQUE"):
        unique_keys = True
    elif self._match_text_seq("WITHOUT", "UNIQUE"):
        unique_keys = False
    else:
        unique_keys = None

    self._match_text_seq("KEYS")

    return_type = self._match_text_seq("RETURNING") and self._parse_format_json(
        self._parse_type()
    )
    encoding = self._match_text_seq("ENCODING") and self._parse_var()

    node_class = exp.JSONObjectAgg if agg else exp.JSONObject
    return self.expression(
        node_class,
        expressions=entries,
        null_handling=null_handling,
        unique_keys=unique_keys,
        return_type=return_type,
        encoding=encoding,
    )
# Note: this is currently incomplete; it only implements the "JSON_value_column" part
def _parse_json_column_def(self) -> exp.JSONColumnDef:
    """
    Parse one column definition of a JSON_TABLE COLUMNS clause.

    Either `NESTED [PATH <string>] <schema>` or `<name> <type> [PATH <string>]`.
    """
    if self._match_text_seq("NESTED"):
        column_name = None
        column_type = None
        is_nested = True
    else:
        column_name = self._parse_id_var()
        column_type = self._parse_types(allow_identifiers=False)
        is_nested = None

    json_path = self._match_text_seq("PATH") and self._parse_string()
    # A nested schema is only parsed for the NESTED form
    nested_schema = is_nested and self._parse_json_schema()

    return self.expression(
        exp.JSONColumnDef,
        this=column_name,
        kind=column_type,
        path=json_path,
        nested_schema=nested_schema,
    )
def _parse_json_schema(self) -> exp.JSONSchema:
    """Parse `[COLUMNS] (<column def>, ...)` into an `exp.JSONSchema`."""
    self._match_text_seq("COLUMNS")
    column_defs = self._parse_wrapped_csv(self._parse_json_column_def, optional=True)
    return self.expression(exp.JSONSchema, expressions=column_defs)
def _parse_json_table(self) -> exp.JSONTable:
    """
    Parse the arguments of JSON_TABLE.

    Consumes, in order: the source expression (optionally `FORMAT JSON`), an optional
    `, <path string>`, optional `{ERROR | NULL} ON ERROR` and `{ERROR | NULL} ON EMPTY`
    clauses, and the column schema.
    """
    source = self._parse_format_json(self._parse_bitwise())
    json_path = self._match(TokenType.COMMA) and self._parse_string()
    on_error = self._parse_on_handling("ERROR", "ERROR", "NULL")
    on_empty = self._parse_on_handling("EMPTY", "ERROR", "NULL")
    columns = self._parse_json_schema()

    return exp.JSONTable(
        this=source,
        schema=columns,
        path=json_path,
        error_handling=on_error,
        empty_handling=on_empty,
    )
def _parse_match_against(self) -> exp.MatchAgainst:
    """
    Parse `MATCH (<columns>) AGAINST (<string> [<search modifier>])`, starting right
    after the opening paren of MATCH. The final closing paren is not consumed here.
    """
    columns = self._parse_csv(self._parse_column)

    self._match_text_seq(")", "AGAINST", "(")

    search_string = self._parse_string()

    modifier = None
    if self._match_text_seq("IN", "NATURAL", "LANGUAGE", "MODE"):
        modifier = "IN NATURAL LANGUAGE MODE"
        if self._match_text_seq("WITH", "QUERY", "EXPANSION"):
            modifier = f"{modifier} WITH QUERY EXPANSION"
    elif self._match_text_seq("IN", "BOOLEAN", "MODE"):
        modifier = "IN BOOLEAN MODE"
    elif self._match_text_seq("WITH", "QUERY", "EXPANSION"):
        modifier = "WITH QUERY EXPANSION"

    return self.expression(
        exp.MatchAgainst, this=search_string, expressions=columns, modifier=modifier
    )
# https://learn.microsoft.com/en-us/sql/t-sql/functions/openjson-transact-sql?view=sql-server-ver16
def _parse_open_json(self) -> exp.OpenJSON:
    """
    Parse the arguments of OPENJSON: `<expr> [, <path>]`, optionally followed by
    `) WITH (<column def>, ...)`.
    """

    def _parse_open_json_column_def() -> exp.OpenJSONColumnDef:
        # <name> <type> [<path string>] [AS JSON]
        column = self._parse_field(any_token=True)
        column_type = self._parse_types()
        column_path = self._parse_string()
        as_json = self._match_pair(TokenType.ALIAS, TokenType.JSON)

        return self.expression(
            exp.OpenJSONColumnDef,
            this=column,
            kind=column_type,
            path=column_path,
            as_json=as_json,
        )

    source = self._parse_bitwise()
    json_path = self._match(TokenType.COMMA) and self._parse_string()

    column_defs = None
    if self._match_pair(TokenType.R_PAREN, TokenType.WITH):
        self._match_l_paren()
        column_defs = self._parse_csv(_parse_open_json_column_def)

    return self.expression(exp.OpenJSON, this=source, path=json_path, expressions=column_defs)
def _parse_position(self, haystack_first: bool = False) -> exp.StrPosition:
    """
    Parse the arguments of a string-position function.

    Supports `<needle> IN <haystack>` as well as the comma-separated form, in which
    the third argument (if any) is the start position.

    Args:
        haystack_first: Whether the comma-separated form lists the haystack before
            the needle.
    """
    args = self._parse_csv(self._parse_bitwise)

    if self._match(TokenType.IN):
        return self.expression(
            exp.StrPosition, this=self._parse_bitwise(), substr=seq_get(args, 0)
        )

    first, second = seq_get(args, 0), seq_get(args, 1)
    haystack, needle = (first, second) if haystack_first else (second, first)

    return self.expression(
        exp.StrPosition, this=haystack, substr=needle, position=seq_get(args, 2)
    )
def _parse_predict(self) -> exp.Predict:
    """
    Parse `[MODEL] <model> [,] [TABLE] <table> [, <params>]` into an `exp.Predict`.
    """
    self._match_text_seq("MODEL")
    model = self._parse_table()

    self._match(TokenType.COMMA)
    self._match_text_seq("TABLE")
    input_table = self._parse_table()

    params = self._match(TokenType.COMMA) and self._parse_bitwise()

    return self.expression(
        exp.Predict,
        this=model,
        expression=input_table,
        params_struct=params,
    )
def _parse_join_hint(self, func_name: str) -> exp.JoinHint:
    """Parse a comma-separated table list into a `JoinHint` named after `func_name` (uppercased)."""
    tables = self._parse_csv(self._parse_table)
    hint_name = func_name.upper()
    return exp.JoinHint(this=hint_name, expressions=tables)
def _parse_substring(self) -> exp.Substring:
    """
    Parse the arguments of SUBSTRING, including the keyword form.

    `FROM <int>` and `FOR <int>` operands are appended to the positional arguments; a
    `FOR` without a preceding start position implies a start of 1.
    """
    # Postgres supports the form: substring(string [from int] [for int])
    # https://www.postgresql.org/docs/9.1/functions-string.html @ Table 9-6
    parts = t.cast(t.List[t.Optional[exp.Expression]], self._parse_csv(self._parse_bitwise))

    has_from = self._match(TokenType.FROM)
    if has_from:
        parts.append(self._parse_bitwise())

    has_for = self._match(TokenType.FOR)
    if has_for:
        if len(parts) == 1:
            parts.append(exp.Literal.number(1))
        parts.append(self._parse_bitwise())

    return self.validate_expression(exp.Substring.from_arg_list(parts), parts)
def _parse_trim(self) -> exp.Trim:
    """
    Parse the arguments of TRIM:
    `[<trim type>] <expr> [{FROM | ,} <expr>] [COLLATE <collation>]`.

    The two operands are swapped when they are separated by FROM, or when the parser's
    `TRIM_PATTERN_FIRST` flag is set.
    """
    # https://www.w3resource.com/sql/character-functions/trim.php
    # https://docs.oracle.com/javadb/10.8.3.0/ref/rreftrimfunc.html
    position = self._prev.text.upper() if self._match_texts(self.TRIM_TYPES) else None

    target = self._parse_bitwise()
    pattern = None

    if self._match_set((TokenType.FROM, TokenType.COMMA)):
        swap = self._prev.token_type == TokenType.FROM or self.TRIM_PATTERN_FIRST
        pattern = self._parse_bitwise()
        if swap:
            target, pattern = pattern, target

    collation = self._parse_bitwise() if self._match(TokenType.COLLATE) else None

    return self.expression(
        exp.Trim, this=target, position=position, expression=pattern, collation=collation
    )
def _parse_window_clause(self) -> t.Optional[t.List[exp.Expression]]:
    """Parse `WINDOW <named window>, ...`; yields a falsy value if WINDOW does not follow."""
    has_window = self._match(TokenType.WINDOW)
    if not has_window:
        return has_window

    return self._parse_csv(self._parse_named_window)
def _parse_named_window(self) -> t.Optional[exp.Expression]:
return self._parse_window(self._parse_id_var(), alias=True)
def _parse_respect_or_ignore_nulls(
self, this: t.Optional[exp.Expression]
) -> t.Optional[exp.Expression]:
if self._match_text_seq("IGNORE", "NULLS"):
return self.expression(exp.IgnoreNulls, this=this)
if self._match_text_seq("RESPECT", "NULLS"):
return self.expression(exp.RespectNulls, this=this)
return this
def _parse_having_max(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
    """
    Parse an optional `HAVING {MAX | MIN} <column>` suffix into an `exp.HavingMax`.

    The `max` arg is False only when the keyword after HAVING is MIN. `this` is returned
    unchanged if HAVING does not follow.
    """
    if not self._match(TokenType.HAVING):
        return this

    self._match_texts(("MAX", "MIN"))
    is_max = self._prev.text.upper() != "MIN"

    return self.expression(
        exp.HavingMax, this=this, expression=self._parse_column(), max=is_max
    )
def _parse_window(
    self, this: t.Optional[exp.Expression], alias: bool = False
) -> t.Optional[exp.Expression]:
    """
    Parse the optional suffixes that may follow a function call and turn it into a window.

    Handled in order: `FILTER (WHERE ...)`, `WITHIN GROUP (ORDER BY ...)`,
    `IGNORE | RESPECT NULLS`, and finally the window itself (e.g. `OVER ...`).

    Args:
        this: The expression (typically a function) the window applies to, or the window
            name when `alias` is set.
        alias: Whether a named window definition (`<name> AS (...)`) is being parsed
            instead of an `OVER`-style suffix.

    Returns:
        `this` (possibly wrapped by Filter / WithinGroup / IgnoreNulls / RespectNulls) if
        no window follows, otherwise an `exp.Window`.
    """
    func = this
    # Comments of the original function are moved onto the Window node further below
    comments = func.comments if isinstance(func, exp.Expression) else None

    if self._match_pair(TokenType.FILTER, TokenType.L_PAREN):
        self._match(TokenType.WHERE)
        this = self.expression(
            exp.Filter, this=this, expression=self._parse_where(skip_where_token=True)
        )
        self._match_r_paren()

    # T-SQL allows the OVER (...) syntax after WITHIN GROUP.
    # https://learn.microsoft.com/en-us/sql/t-sql/functions/percentile-disc-transact-sql?view=sql-server-ver16
    if self._match_text_seq("WITHIN", "GROUP"):
        order = self._parse_wrapped(self._parse_order)
        this = self.expression(exp.WithinGroup, this=this, expression=order)

    # SQL spec defines an optional [ { IGNORE | RESPECT } NULLS ] OVER
    # Some dialects choose to implement and some do not.
    # https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html

    # There is some code above in _parse_lambda that handles
    #   SELECT FIRST_VALUE(TABLE.COLUMN IGNORE|RESPECT NULLS) OVER ...

    # The below changes handle
    #   SELECT FIRST_VALUE(TABLE.COLUMN) IGNORE|RESPECT NULLS OVER ...

    # Oracle allows both formats
    #   (https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/img_text/first_value.html)
    #   and Snowflake chose to do the same for familiarity
    #   https://docs.snowflake.com/en/sql-reference/functions/first_value.html#usage-notes
    if isinstance(this, exp.AggFunc):
        ignore_respect = this.find(exp.IgnoreNulls, exp.RespectNulls)

        # Hoist an IGNORE/RESPECT NULLS found inside the aggregate so that it wraps it
        if ignore_respect and ignore_respect is not this:
            ignore_respect.replace(ignore_respect.this)
            this = self.expression(ignore_respect.__class__, this=this)

    this = self._parse_respect_or_ignore_nulls(this)

    # bigquery select from window x AS (partition by ...)
    if alias:
        over = None
        self._match(TokenType.ALIAS)
    elif not self._match_set(self.WINDOW_BEFORE_PAREN_TOKENS):
        return this
    else:
        over = self._prev.text.upper()

    if comments and isinstance(func, exp.Expression):
        func.pop_comments()

    # No paren: the window is referenced by name, e.g. `OVER w`
    if not self._match(TokenType.L_PAREN):
        return self.expression(
            exp.Window,
            comments=comments,
            this=this,
            alias=self._parse_id_var(False),
            over=over,
        )

    window_alias = self._parse_id_var(any_token=False, tokens=self.WINDOW_ALIAS_TOKENS)

    first = self._match(TokenType.FIRST)
    if self._match_text_seq("LAST"):
        first = False

    partition, order = self._parse_partition_and_order()
    kind = self._match_set((TokenType.ROWS, TokenType.RANGE)) and self._prev.text

    if kind:
        # Frame: {ROWS | RANGE} [BETWEEN] <start> [AND] <end>
        self._match(TokenType.BETWEEN)
        start = self._parse_window_spec()
        self._match(TokenType.AND)
        end = self._parse_window_spec()

        spec = self.expression(
            exp.WindowSpec,
            kind=kind,
            start=start["value"],
            start_side=start["side"],
            end=end["value"],
            end_side=end["side"],
        )
    else:
        spec = None

    self._match_r_paren()

    window = self.expression(
        exp.Window,
        comments=comments,
        this=this,
        partition_by=partition,
        order=order,
        spec=spec,
        alias=window_alias,
        over=over,
        first=first,
    )

    # This covers Oracle's FIRST/LAST syntax: aggregate KEEP (...) OVER (...)
    if self._match_set(self.WINDOW_BEFORE_PAREN_TOKENS, advance=False):
        return self._parse_window(window, alias=alias)

    return window
def _parse_partition_and_order(
self,
) -> t.Tuple[t.List[exp.Expression], t.Optional[exp.Expression]]:
return self._parse_partition_by(), self._parse_order()
def _parse_window_spec(self) -> t.Dict[str, t.Optional[str | exp.Expression]]:
    """
    Parse one boundary of a window frame.

    Returns:
        A dict with the keys `"value"` (`"UNBOUNDED"`, `"CURRENT ROW"` or a parsed
        expression) and `"side"` (the matched `WINDOW_SIDES` keyword, if any).
    """
    self._match(TokenType.BETWEEN)

    if self._match_text_seq("UNBOUNDED"):
        value = "UNBOUNDED"
    elif self._match_text_seq("CURRENT", "ROW"):
        value = "CURRENT ROW"
    else:
        value = self._parse_bitwise()

    side = self._match_texts(self.WINDOW_SIDES) and self._prev.text

    return {"value": value, "side": side}
def _parse_alias(
    self, this: t.Optional[exp.Expression], explicit: bool = False
) -> t.Optional[exp.Expression]:
    """
    Parse an optional alias following `this`.

    Args:
        this: The expression being aliased.
        explicit: When set, an alias is only parsed if it is introduced by the ALIAS token.

    Returns:
        An `exp.Aliases` node for a parenthesized alias list, an `exp.Alias` node for a
        single alias, or `this` unchanged when no alias is found.
    """
    any_token = self._match(TokenType.ALIAS)

    # Work on a copy: the list is extended in place below, which would otherwise mutate
    # the very list object referenced by `self._prev_comments`
    comments = list(self._prev_comments or [])

    if explicit and not any_token:
        return this

    if self._match(TokenType.L_PAREN):
        aliases = self.expression(
            exp.Aliases,
            comments=comments,
            this=this,
            expressions=self._parse_csv(lambda: self._parse_id_var(any_token)),
        )
        self._match_r_paren(aliases)
        return aliases

    alias = self._parse_id_var(any_token, tokens=self.ALIAS_TOKENS) or (
        self.STRING_ALIASES and self._parse_string_as_identifier()
    )

    if alias:
        comments.extend(alias.pop_comments())
        this = self.expression(exp.Alias, comments=comments, this=this, alias=alias)
        column = this.this

        # Moves the comment next to the alias in `expr /* comment */ AS alias`
        if not this.comments and column and column.comments:
            this.comments = column.pop_comments()

    return this
def _parse_id_var(
    self,
    any_token: bool = True,
    tokens: t.Optional[t.Collection[TokenType]] = None,
) -> t.Optional[exp.Expression]:
    """
    Parse an identifier, falling back to treating another token as an unquoted name.

    Args:
        any_token: Whether any non-reserved token may be used as the identifier.
        tokens: Token types accepted as identifiers; defaults to `ID_VAR_TOKENS`.

    Returns:
        The parsed identifier (marked as quoted if it came from a string token), or a
        falsy value if nothing matched.
    """
    identifier = self._parse_identifier()
    if identifier:
        return identifier

    consumed = (any_token and self._advance_any()) or self._match_set(
        tokens or self.ID_VAR_TOKENS
    )
    if not consumed:
        return identifier

    is_quoted = self._prev.token_type == TokenType.STRING
    return self.expression(exp.Identifier, this=self._prev.text, quoted=is_quoted)
def _parse_string(self) -> t.Optional[exp.Expression]:
if self._match_set(self.STRING_PARSERS):
return self.STRING_PARSERS[self._prev.token_type](self, self._prev)
return self._parse_placeholder()
def _parse_string_as_identifier(self) -> t.Optional[exp.Identifier]:
    """Turn the next string token, if any, into a quoted identifier."""
    text = self._match(TokenType.STRING) and self._prev.text
    return exp.to_identifier(text, quoted=True)
def _parse_number(self) -> t.Optional[exp.Expression]:
if self._match_set(self.NUMERIC_PARSERS):
return self.NUMERIC_PARSERS[self._prev.token_type](self, self._prev)
return self._parse_placeholder()
def _parse_identifier(self) -> t.Optional[exp.Expression]:
    """Parse an IDENTIFIER token into a quoted `exp.Identifier`, falling back to a placeholder."""
    if not self._match(TokenType.IDENTIFIER):
        return self._parse_placeholder()

    return self.expression(exp.Identifier, this=self._prev.text, quoted=True)
def _parse_var(
    self,
    any_token: bool = False,
    tokens: t.Optional[t.Collection[TokenType]] = None,
    upper: bool = False,
) -> t.Optional[exp.Expression]:
    """
    Parse the next token into an `exp.Var`.

    Args:
        any_token: Whether any non-reserved token is accepted.
        tokens: Additional token types to accept besides VAR.
        upper: Whether to uppercase the token text.

    Returns:
        The `exp.Var`, or the result of `_parse_placeholder` if no token qualifies.
    """
    matched = (
        (any_token and self._advance_any())
        or self._match(TokenType.VAR)
        or (tokens and self._match_set(tokens))
    )
    if not matched:
        return self._parse_placeholder()

    return self.expression(
        exp.Var, this=self._prev.text.upper() if upper else self._prev.text
    )
def _advance_any(self, ignore_reserved: bool = False) -> t.Optional[Token]:
    """Consume the current token unless it is reserved.

    Args:
        ignore_reserved: When True, tokens in `RESERVED_TOKENS` are consumed too.

    Returns:
        The token that was consumed, or None if there is no current token or
        it is reserved and `ignore_reserved` is False.
    """
    token = self._curr
    if not token:
        return None
    if not ignore_reserved and token.token_type in self.RESERVED_TOKENS:
        return None

    self._advance()
    return self._prev
def _parse_var_or_string(self, upper: bool = False) -> t.Optional[exp.Expression]:
    """Parse a string if possible, otherwise any token as a variable.

    `upper` is forwarded to `_parse_var` and does not affect the string path.
    """
    string = self._parse_string()
    if string:
        return string
    return self._parse_var(any_token=True, upper=upper)
def _parse_primary_or_var(self) -> t.Optional[exp.Expression]:
    """Parse a primary expression if possible, otherwise any token as a variable."""
    primary = self._parse_primary()
    if primary:
        return primary
    return self._parse_var(any_token=True)
def _parse_null(self) -> t.Optional[exp.Expression]:
    """Parse any token in `NULL_TOKENS` with the NULL primary parser, else try a placeholder."""
    if not self._match_set(self.NULL_TOKENS):
        return self._parse_placeholder()

    null_parser = self.PRIMARY_PARSERS[TokenType.NULL]
    return null_parser(self, self._prev)
def _parse_boolean(self) -> t.Optional[exp.Expression]:
    """Parse a TRUE or FALSE token with its primary parser, else try a placeholder."""
    # TRUE is tried before FALSE, mirroring the order of the explicit checks.
    for boolean_type in (TokenType.TRUE, TokenType.FALSE):
        if self._match(boolean_type):
            return self.PRIMARY_PARSERS[boolean_type](self, self._prev)

    return self._parse_placeholder()
def _parse_star(self) -> t.Optional[exp.Expression]:
    """Parse a STAR token with its primary parser, else try a placeholder."""
    if not self._match(TokenType.STAR):
        return self._parse_placeholder()

    star_parser = self.PRIMARY_PARSERS[TokenType.STAR]
    return star_parser(self, self._prev)
def _parse_parameter(self) -> exp.Parameter:
    """Build an `exp.Parameter` around an identifier, or a primary/variable as fallback."""
    target = self._parse_identifier() or self._parse_primary_or_var()
    return self.expression(exp.Parameter, this=target)
def _parse_placeholder(self) -> t.Optional[exp.Expression]:
    """Parse a placeholder using the parser registered for the current token type.

    Returns:
        The parsed placeholder, or None when the token is not a placeholder
        token or its parser produced nothing (in which case the consumed
        token is given back by stepping one position backwards).
    """
    if not self._match_set(self.PLACEHOLDER_PARSERS):
        return None

    parsed = self.PLACEHOLDER_PARSERS[self._prev.token_type](self)
    if not parsed:
        # Undo the token consumed by `_match_set` above.
        self._advance(-1)
        return None

    return parsed
def _parse_star_op(self, *keywords: str) -> t.Optional[t.List[exp.Expression]]:
    """Parse the operand(s) of a star modifier introduced by one of `keywords`.

    Returns:
        None if no keyword matches. Otherwise the parenthesized, comma-separated
        expressions, or a one-element list for a bare expression (None if that
        single expression is falsy).
    """
    if not self._match_texts(keywords):
        return None

    if self._match(TokenType.L_PAREN, advance=False):
        return self._parse_wrapped_csv(self._parse_expression)

    single = self._parse_expression()
    if not single:
        return None
    return [single]
def _parse_csv(
    self, parse_method: t.Callable, sep: TokenType = TokenType.COMMA
) -> t.List[exp.Expression]:
    """Parse a `sep`-separated list of items with `parse_method`.

    Items for which `parse_method` returns None are dropped from the result.
    After each separator, `_add_comments` is called with the item parsed just
    before it.
    """
    latest = parse_method()
    parsed = [] if latest is None else [latest]

    while self._match(sep):
        self._add_comments(latest)
        latest = parse_method()
        if latest is None:
            continue
        parsed.append(latest)

    return parsed
def _parse_tokens(
    self, parse_method: t.Callable, expressions: t.Dict
) -> t.Optional[exp.Expression]:
    """Parse a left-associative chain of binary operators.

    Args:
        parse_method: Parses a single operand.
        expressions: Maps operator token types to the expression class to build.

    Returns:
        The first operand, folded with each following (operator, operand) pair
        into nested expressions from left to right.
    """
    left = parse_method()

    while self._match_set(expressions):
        # Read operator details before the right operand moves the cursor on.
        node_type = expressions[self._prev.token_type]
        operator_comments = self._prev_comments
        right = parse_method()
        left = self.expression(
            node_type,
            this=left,
            comments=operator_comments,
            expression=right,
        )

    return left
def _parse_wrapped_id_vars(self, optional: bool = False) -> t.List[exp.Expression]:
    """Parse a parenthesized, comma-separated list of id vars.

    `optional` is forwarded to `_parse_wrapped_csv`.
    """
    id_vars = self._parse_wrapped_csv(self._parse_id_var, optional=optional)
    return id_vars
def _parse_wrapped_csv(
    self, parse_method: t.Callable, sep: TokenType = TokenType.COMMA, optional: bool = False
) -> t.List[exp.Expression]:
    """Parse a `sep`-separated list that is wrapped in parentheses.

    `optional` is forwarded to `_parse_wrapped`, which decides whether the
    opening parenthesis may be missing.
    """

    def parse_items() -> t.List[exp.Expression]:
        return self._parse_csv(parse_method, sep=sep)

    return self._parse_wrapped(parse_items, optional=optional)
def _parse_wrapped(self, parse_method: t.Callable, optional: bool = False) -> t.Any:
    """Run `parse_method` on input that is (optionally) enclosed in parentheses.

    Args:
        parse_method: Zero-argument callable producing the wrapped content.
        optional: When False, a missing "(" is reported through `raise_error`.

    Returns:
        Whatever `parse_method` returns. A closing ")" is only required when an
        opening one was consumed.
    """
    opened = self._match(TokenType.L_PAREN)
    if not (opened or optional):
        self.raise_error("Expecting (")

    inner = parse_method()

    if opened:
        self._match_r_paren()
    return inner
def _parse_expressions(self) -> t.List[exp.Expression]:
    """Parse a comma-separated list of expressions."""
    expressions = self._parse_csv(self._parse_expression)
    return expressions
def _parse_select_or_expression(self, alias: bool = False) -> t.Optional[exp.Expression]:
    """Parse a SELECT if one is present, otherwise an expression with set operations.

    Args:
        alias: Selects the fallback parser: `_parse_expression` when True,
            `_parse_assignment` when False.
    """
    select = self._parse_select()
    if select:
        return select

    operand = self._parse_expression() if alias else self._parse_assignment()
    return self._parse_set_operations(operand)
def _parse_ddl_select(self) -> t.Optional[exp.Expression]:
    """Parse a nested SELECT (without subquery alias), then set operations and query modifiers."""
    select = self._parse_select(nested=True, parse_subquery_alias=False)
    combined = self._parse_set_operations(select)
    return self._parse_query_modifiers(combined)
def _parse_transaction(self) -> exp.Transaction | exp.Command:
    """Parse the remainder of a transaction-start statement.

    Accepts an optional kind from `TRANSACTION_KIND`, an optional
    TRANSACTION/WORK keyword, and a comma-separated list of modes, each made
    of consecutive VAR tokens joined by single spaces.
    """
    kind = self._prev.text if self._match_texts(self.TRANSACTION_KIND) else None

    self._match_texts(("TRANSACTION", "WORK"))

    modes = []
    while True:
        words = []
        while self._match(TokenType.VAR):
            words.append(self._prev.text)

        if words:
            modes.append(" ".join(words))
        if not self._match(TokenType.COMMA):
            break

    return self.expression(exp.Transaction, this=kind, modes=modes)
def _parse_commit_or_rollback(self) -> exp.Commit | exp.Rollback:
    """Parse the remainder of a COMMIT or ROLLBACK statement.

    The statement kind is taken from the previously consumed token. Only the
    savepoint is stored on `exp.Rollback`, and only the chain flag on
    `exp.Commit`.
    """
    is_rollback = self._prev.token_type == TokenType.ROLLBACK

    self._match_texts(("TRANSACTION", "WORK"))

    savepoint = None
    if self._match_text_seq("TO"):
        self._match_text_seq("SAVEPOINT")
        savepoint = self._parse_id_var()

    chain = None
    if self._match(TokenType.AND):
        # AND [NO] CHAIN: the flag is False only when NO is present.
        chain = not self._match_text_seq("NO")
        self._match_text_seq("CHAIN")

    if not is_rollback:
        return self.expression(exp.Commit, chain=chain)

    return self.expression(exp.Rollback, savepoint=savepoint)
def _parse_refresh(self) -> exp.Refresh:
    """Parse REFRESH [TABLE] followed by a string or a table reference."""
    self._match(TokenType.TABLE)
    target = self._parse_string() or self._parse_table()
    return self.expression(exp.Refresh, this=target)
def _parse_add_column(self) -> t.Optional[exp.Expression]:
    """Parse `ADD [COLUMN] <field def> [FIRST | AFTER <column>]`.

    Returns:
        None when the input does not start with ADD; otherwise the parsed
        field definition (possibly falsy), annotated with its "exists" flag
        and, when given, its column position.
    """
    if not self._match_text_seq("ADD"):
        return None

    self._match(TokenType.COLUMN)
    exists_column = self._parse_exists(not_=True)
    column_def = self._parse_field_def()

    if not column_def:
        return column_def

    column_def.set("exists", exists_column)

    # https://docs.databricks.com/delta/update-schema.html#explicitly-update-schema-to-add-columns
    if self._match_texts(("FIRST", "AFTER")):
        position = self._prev.text
        column_position = self.expression(
            exp.ColumnPosition, this=self._parse_column(), position=position
        )
        column_def.set("position", column_position)

    return column_def
def _parse_drop_column(self) -> t.Optional[exp.Drop | exp.Command]:
    """Parse a DROP action, defaulting its kind to "COLUMN" when none is set.

    Returns the falsy match/parse result unchanged when there is no DROP
    token or nothing was parsed; an `exp.Command` result is returned untouched.
    """
    parsed = self._match(TokenType.DROP) and self._parse_drop()
    if not parsed or isinstance(parsed, exp.Command):
        return parsed

    parsed.set("kind", parsed.args.get("kind", "COLUMN"))
    return parsed
# https://docs.aws.amazon.com/athena/latest/ug/alter-table-drop-partition.html
def _parse_drop_partition(self, exists: t.Optional[bool] = None) -> exp.DropPartition:
    """Parse a comma-separated list of partitions into an `exp.DropPartition`.

    Args:
        exists: Stored as-is in the node's "exists" arg.
    """
    partitions = self._parse_csv(self._parse_partition)
    return self.expression(exp.DropPartition, expressions=partitions, exists=exists)
def _parse_alter_table_add(self) -> t.List[exp.Expression]:
    """Parse the actions of an `ALTER TABLE ... ADD` clause.

    Handles, in order: constraint additions, a (possibly parenthesized) list
    of field definitions after a single ADD, `ADD COLUMNS <schema>`, and a
    list of individual `ADD [COLUMN]` items.
    """
    # One token before the current position, so the column paths below can
    # match the ADD keyword themselves.
    rewind_to = self._index - 1

    if self._match_set(self.ADD_CONSTRAINT_TOKENS, advance=False):

        def parse_add_constraint() -> exp.Expression:
            return self.expression(
                exp.AddConstraint, expressions=self._parse_csv(self._parse_constraint)
            )

        return self._parse_csv(parse_add_constraint)

    self._retreat(rewind_to)
    if not self.ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN and self._match_text_seq("ADD"):
        return self._parse_wrapped_csv(self._parse_field_def, optional=True)

    if self._match_text_seq("ADD", "COLUMNS"):
        schema = self._parse_schema()
        return [schema] if schema else []

    return self._parse_wrapped_csv(self._parse_add_column, optional=True)
def _parse_alter_table_alter(self) -> t.Optional[exp.Expression]:
    """Parse the action following `ALTER TABLE ... ALTER`.

    A keyword registered in `ALTER_ALTER_PARSERS` is delegated to its parser.
    Otherwise the input is treated as `ALTER [COLUMN] <column> ...` and one of
    the supported column alterations is parsed, ending with the data type
    change form as the default.
    """
    if self._match_texts(self.ALTER_ALTER_PARSERS):
        return self.ALTER_ALTER_PARSERS[self._prev.text.upper()](self)

    # Many dialects support the ALTER [COLUMN] syntax, so if there is no
    # keyword after ALTER we default to parsing this statement
    self._match(TokenType.COLUMN)
    column = self._parse_field(any_token=True)

    def alter_column(**kwargs: t.Any) -> exp.Expression:
        return self.expression(exp.AlterColumn, this=column, **kwargs)

    if self._match_pair(TokenType.DROP, TokenType.DEFAULT):
        return alter_column(drop=True)
    if self._match_pair(TokenType.SET, TokenType.DEFAULT):
        return alter_column(default=self._parse_assignment())
    if self._match(TokenType.COMMENT):
        return alter_column(comment=self._parse_string())
    if self._match_text_seq("DROP", "NOT", "NULL"):
        return alter_column(drop=True, allow_null=True)
    if self._match_text_seq("SET", "NOT", "NULL"):
        return alter_column(allow_null=False)

    # [SET DATA] [TYPE] <type> [COLLATE <term>] [USING <expression>]
    self._match_text_seq("SET", "DATA")
    self._match_text_seq("TYPE")
    return alter_column(
        dtype=self._parse_types(),
        collate=self._match(TokenType.COLLATE) and self._parse_term(),
        using=self._match(TokenType.USING) and self._parse_assignment(),
    )
def _parse_alter_diststyle(self) -> exp.AlterDistStyle:
    """Parse a dist style: ALL, EVEN or AUTO, or else `[KEY DISTKEY] <column>`."""
    if not self._match_texts(("ALL", "EVEN", "AUTO")):
        self._match_text_seq("KEY", "DISTKEY")
        return self.expression(exp.AlterDistStyle, this=self._parse_column())

    style = exp.var(self._prev.text.upper())
    return self.expression(exp.AlterDistStyle, this=style)
def _parse_alter_sortkey(self, compound: t.Optional[bool] = None) -> exp.AlterSortKey:
    """Parse a sort key change: a parenthesized id list, or AUTO / NONE.

    Args:
        compound: Stored on the node; when truthy, a SORTKEY keyword is
            consumed first if present.

    Note:
        In the non-parenthesized form the node is built from the previously
        consumed token's text whether or not AUTO/NONE actually matched.
    """
    if compound:
        self._match_text_seq("SORTKEY")

    if self._match(TokenType.L_PAREN, advance=False):
        keys = self._parse_wrapped_id_vars()
        return self.expression(exp.AlterSortKey, expressions=keys, compound=compound)

    self._match_texts(("AUTO", "NONE"))
    mode = exp.var(self._prev.text.upper())
    return self.expression(exp.AlterSortKey, this=mode, compound=compound)
def _parse_alter_table_drop(self) -> t.List[exp.Expression]:
    """Parse the actions of an `ALTER TABLE ... DROP` clause.

    Returns a list of partition drops when a PARTITION token follows the
    optional exists clause; otherwise rewinds and parses a list of column drops.
    """
    # One token before the current position, so `_parse_drop_column` can
    # match the DROP token itself after the rewind.
    rewind_to = self._index - 1

    partition_exists = self._parse_exists()
    if self._match(TokenType.PARTITION, advance=False):

        def parse_partition_drop() -> exp.DropPartition:
            return self._parse_drop_partition(exists=partition_exists)

        return self._parse_csv(parse_partition_drop)

    self._retreat(rewind_to)
    return self._parse_csv(self._parse_drop_column)
def _parse_alter_table_rename(self) -> t.Optional[exp.AlterRename | exp.RenameColumn]:
    """Parse `RENAME COLUMN <old> TO <new>` or `RENAME [TO] <table>`.

    Returns:
        An `exp.RenameColumn` or `exp.AlterRename`; None when the column form
        is missing the old column, the TO keyword, or the new column.
    """
    if not self._match(TokenType.COLUMN):
        self._match_text_seq("TO")
        return self.expression(exp.AlterRename, this=self._parse_table(schema=True))

    exists = self._parse_exists()
    old_column = self._parse_column()
    to = self._match_text_seq("TO")
    new_column = self._parse_column()

    if any(part is None for part in (old_column, to, new_column)):
        return None

    return self.expression(exp.RenameColumn, this=old_column, to=new_column, exists=exists)
def _parse_alter_table_set(self) -> exp.AlterSet:
    """Parse the options of an `ALTER TABLE ... SET` clause into an `exp.AlterSet`.

    The branches are tried in order and the first match decides which arg of
    the node is populated. When nothing matches, an optional SERDE field is
    parsed and the remaining properties are stored under "expressions".
    """
    node = self.expression(exp.AlterSet)

    starts_wrapped_list = self._match(TokenType.L_PAREN, advance=False) or self._match_text_seq(
        "TABLE", "PROPERTIES"
    )
    if starts_wrapped_list:
        node.set("expressions", self._parse_wrapped_csv(self._parse_assignment))
    elif self._match_text_seq("FILESTREAM_ON", advance=False):
        # Peeked only, so the keyword is still available to the assignment parser.
        node.set("expressions", [self._parse_assignment()])
    elif self._match_texts(("LOGGED", "UNLOGGED")):
        node.set("option", exp.var(self._prev.text.upper()))
    elif self._match_text_seq("WITHOUT") and self._match_texts(("CLUSTER", "OIDS")):
        node.set("option", exp.var(f"WITHOUT {self._prev.text.upper()}"))
    elif self._match_text_seq("LOCATION"):
        node.set("location", self._parse_field())
    elif self._match_text_seq("ACCESS", "METHOD"):
        node.set("access_method", self._parse_field())
    elif self._match_text_seq("TABLESPACE"):
        node.set("tablespace", self._parse_field())
    elif self._match_text_seq("FILE", "FORMAT") or self._match_text_seq("FILEFORMAT"):
        node.set("file_format", [self._parse_field()])
    elif self._match_text_seq("STAGE_FILE_FORMAT"):
        node.set("file_format", self._parse_wrapped_options())
    elif self._match_text_seq("STAGE_COPY_OPTIONS"):
        node.set("copy_options", self._parse_wrapped_options())
    elif self._match_text_seq("TAG") or self._match_text_seq("TAGS"):
        node.set("tag", self._parse_csv(self._parse_assignment))
    else:
        if self._match_text_seq("SERDE"):
            node.set("serde", self._parse_field())

        node.set("expressions", [self._parse_properties()])

    return node
def _parse_alter(self) -> exp.Alter | exp.Command:
    """Parse an ALTER statement, falling back to an opaque command.

    Returns:
        An `exp.Alter` when the altered object kind is in `ALTERABLES`, a
        parser is registered in `ALTER_PARSERS` for the action keyword, that
        parser yields actions, and no tokens are left over. In every other
        case the statement is re-read as an `exp.Command` from its start token.
    """
    start = self._prev

    alter_token = self._match_set(self.ALTERABLES) and self._prev
    if not alter_token:
        return self._parse_as_command(start)

    exists = self._parse_exists()
    only = self._match_text_seq("ONLY")
    this = self._parse_table(schema=True)

    cluster = None
    if self._match(TokenType.ON):
        cluster = self._parse_on_property()

    if self._next:
        self._advance()

    # The action keyword is looked up from the token that was just consumed.
    action_parser = self.ALTER_PARSERS.get(self._prev.text.upper()) if self._prev else None
    if not action_parser:
        return self._parse_as_command(start)

    actions = ensure_list(action_parser(self))
    not_valid = self._match_text_seq("NOT", "VALID")
    options = self._parse_csv(self._parse_property)

    if self._curr or not actions:
        return self._parse_as_command(start)

    return self.expression(
        exp.Alter,
        this=this,
        kind=alter_token.text.upper(),
        exists=exists,
        actions=actions,
        only=only,
        options=options,
        cluster=cluster,
        not_valid=not_valid,
    )
def _parse_merge(self) -> exp.Merge:
    """Parse `MERGE [INTO] <target> [USING] <source> [ON] <condition> <whens> [returning]`.

    Every keyword is matched optionally; a missing one does not raise here.
    """
    self._match(TokenType.INTO)
    target = self._parse_table()

    if target and self._match(TokenType.ALIAS, advance=False):
        target.set("alias", self._parse_table_alias())

    self._match(TokenType.USING)
    using = self._parse_table()

    self._match(TokenType.ON)
    on = self._parse_assignment()

    whens = self._parse_when_matched()
    returning = self._parse_returning()

    return self.expression(
        exp.Merge,
        this=target,
        using=using,
        on=on,
        expressions=whens,
        returning=returning,
    )
def _parse_when_matched(self) -> t.List[exp.When]:
    """Parse consecutive `WHEN [NOT] MATCHED ... THEN <action>` clauses.

    For each clause: `matched` is False only when NOT is present; `source` is
    False for BY TARGET and otherwise the result of matching BY SOURCE;
    `condition` is the expression after AND, if any. The action is an INSERT,
    an UPDATE, a DELETE (kept as a Var), or an option from `CONFLICT_ACTIONS`.
    """
    whens = []

    while self._match(TokenType.WHEN):
        then: t.Optional[exp.Expression]

        matched = not self._match(TokenType.NOT)
        self._match_text_seq("MATCHED")

        if self._match_text_seq("BY", "TARGET"):
            source = False
        else:
            source = self._match_text_seq("BY", "SOURCE")

        condition = None
        if self._match(TokenType.AND):
            condition = self._parse_assignment()

        self._match(TokenType.THEN)

        if self._match(TokenType.INSERT):
            star = self._parse_star()
            if star:
                then = self.expression(exp.Insert, this=star)
            else:
                row = exp.var("ROW") if self._match_text_seq("ROW") else self._parse_value()
                values = self._match_text_seq("VALUES") and self._parse_value()
                then = self.expression(exp.Insert, this=row, expression=values)
        elif self._match(TokenType.UPDATE):
            star = self._parse_star()
            if star:
                then = self.expression(exp.Update, expressions=star)
            else:
                assignments = self._match(TokenType.SET) and self._parse_csv(
                    self._parse_equality
                )
                then = self.expression(exp.Update, expressions=assignments)
        elif self._match(TokenType.DELETE):
            then = self.expression(exp.Var, this=self._prev.text)
        else:
            then = self._parse_var_from_options(self.CONFLICT_ACTIONS)

        when = self.expression(
            exp.When,
            matched=matched,
            source=source,
            condition=condition,
            then=then,
        )
        whens.append(when)

    return whens
def _parse_show(self) -> t.Optional[exp.Expression]:
    """Dispatch to a registered SHOW parser, or fall back to an opaque command."""
    show_parser = self._find_parser(self.SHOW_PARSERS, self.SHOW_TRIE)
    if not show_parser:
        return self._parse_as_command(self._prev)

    return show_parser(self)
def _parse_set_item_assignment(
    self, kind: t.Optional[str] = None
) -> t.Optional[exp.Expression]:
    """Parse a single `<name> [= | TO] <value>` item of a SET statement.

    Args:
        kind: Optional scope keyword stored on the item. For "GLOBAL" and
            "SESSION", a following TRANSACTION keyword switches to
            `_parse_set_transaction`.

    Returns:
        An `exp.SetItem` wrapping an `exp.EQ`, or None (after rewinding) when
        there is no left-hand side or a required delimiter is missing.
    """
    start_index = self._index

    if kind in ("GLOBAL", "SESSION") and self._match_text_seq("TRANSACTION"):
        return self._parse_set_transaction(global_=kind == "GLOBAL")

    left = self._parse_primary() or self._parse_column()
    has_delimiter = self._match_texts(("=", "TO"))

    delimiter_missing = self.SET_REQUIRES_ASSIGNMENT_DELIMITER and not has_delimiter
    if not left or delimiter_missing:
        self._retreat(start_index)
        return None

    right = self._parse_statement() or self._parse_id_var()
    if isinstance(right, (exp.Column, exp.Identifier)):
        # Bare names on the right-hand side are kept as variables.
        right = exp.var(right.name)

    assignment = self.expression(exp.EQ, this=left, expression=right)
    return self.expression(exp.SetItem, this=assignment, kind=kind)
def _parse_set_transaction(self, global_: bool = False) -> exp.Expression:
    """Parse `[TRANSACTION] <characteristic>[, ...]` into a TRANSACTION set item.

    Args:
        global_: Stored in the node's "global" arg.
    """
    self._match_text_seq("TRANSACTION")

    def parse_characteristic() -> t.Optional[exp.Var]:
        return self._parse_var_from_options(self.TRANSACTION_CHARACTERISTICS)

    characteristics = self._parse_csv(parse_characteristic)

    # "global" is a Python keyword, so it has to be passed through a dict.
    return self.expression(
        exp.SetItem,
        expressions=characteristics,
        kind="TRANSACTION",
        **{"global": global_},  # type: ignore
    )
def _parse_set_item(self) -> t.Optional[exp.Expression]:
    """Parse one SET item with a registered SET parser, or as a plain assignment."""
    set_parser = self._find_parser(self.SET_PARSERS, self.SET_TRIE)
    if set_parser:
        return set_parser(self)

    return self._parse_set_item_assignment(kind=None)
def _parse_set(self, unset: bool = False, tag: bool = False) -> exp.Set | exp.Command:
    """Parse a comma-separated list of SET items.

    Args:
        unset: Stored in the node's "unset" arg.
        tag: Stored in the node's "tag" arg.

    Returns:
        An `exp.Set` when all tokens were consumed; otherwise the parser
        rewinds and the statement is re-read as an `exp.Command`.
    """
    start_index = self._index
    items = self._parse_csv(self._parse_set_item)
    set_ = self.expression(exp.Set, expressions=items, unset=unset, tag=tag)

    if not self._curr:
        return set_

    # Leftover tokens: give up on structured parsing.
    self._retreat(start_index)
    return self._parse_as_command(self._prev)
def _parse_var_from_options(
    self, options: OPTIONS_TYPE, raise_unmatched: bool = True
) -> t.Optional[exp.Var]:
    """Parse a (possibly multi-word) option out of an options table.

    Args:
        options: Maps an upper-cased leading keyword to its allowed
            continuations; each continuation is a string or a sequence of strings.
        raise_unmatched: When True, an unknown option is reported through
            `raise_error` before rewinding.

    Returns:
        An `exp.var` holding the leading keyword plus the first matching
        continuation, or the keyword alone when its continuation list is
        empty. None when there is no current token, the keyword is not in
        `options`, or none of its continuations matched.
    """
    first = self._curr
    if not first:
        return None

    option = first.text.upper()
    continuations = options.get(option)

    start_index = self._index
    self._advance()

    for keywords in continuations or []:
        if isinstance(keywords, str):
            keywords = (keywords,)

        if self._match_text_seq(*keywords):
            return exp.var(f"{option} {' '.join(keywords)}")

    # No continuation matched. That is only acceptable when the keyword is
    # known and has an empty continuation list.
    if continuations or continuations is None:
        if raise_unmatched:
            self.raise_error(f"Unknown option {option}")

        self._retreat(start_index)
        return None

    return exp.var(option)
def _parse_as_command(self, start: Token) -> exp.Command:
    """Consume all remaining tokens and return the statement as an `exp.Command`.

    The SQL text from `start` to the last token is split after
    `len(start.text)` characters into the command's `this` and `expression`.
    """
    while self._curr:
        self._advance()

    sql = self._find_sql(start, self._prev)
    keyword_length = len(start.text)
    self._warn_unsupported()

    return exp.Command(this=sql[:keyword_length], expression=sql[keyword_length:])
def _parse_dict_property(self, this: str) -> exp.DictProperty:
    """Parse `(<kind> [(<key> <value> ...)])` into an `exp.DictProperty`.

    Args:
        this: Stored as the node's `this` arg.

    The inner list is read as key/value pairs until a pair has neither a key
    nor a value.
    """
    settings = []

    self._match_l_paren()
    kind = self._parse_id_var()

    if self._match(TokenType.L_PAREN):
        while True:
            key = self._parse_id_var()
            value = self._parse_primary()

            if not key and value is None:
                break

            sub_property = self.expression(exp.DictSubProperty, this=key, value=value)
            settings.append(sub_property)

        # The closing paren of the inner settings list is not enforced.
        self._match(TokenType.R_PAREN)

    self._match_r_paren()

    kind_name = kind.this if kind else None
    return self.expression(
        exp.DictProperty,
        this=this,
        kind=kind_name,
        settings=settings,
    )
def _parse_dict_range(self, this: str) -> exp.DictRange:
    """Parse `([MIN <value>] [MAX] <value>)` into an `exp.DictRange`.

    Args:
        this: Stored as the node's `this` arg.

    When MIN is absent, only the upper bound is parsed and the lower bound
    defaults to the number literal 0.
    """
    self._match_l_paren()

    if self._match_text_seq("MIN"):
        lower = self._parse_var() or self._parse_primary()
        self._match_text_seq("MAX")
        upper = self._parse_var() or self._parse_primary()
    else:
        upper = self._parse_var() or self._parse_primary()
        lower = exp.Literal.number(0)

    self._match_r_paren()
    return self.expression(exp.DictRange, this=this, min=lower, max=upper)
def _parse_comprehension(
    self, this: t.Optional[exp.Expression]
) -> t.Optional[exp.Comprehension]:
    """Parse `<column> IN <column> [IF <condition>]` following `this`.

    Returns:
        An `exp.Comprehension`, or None when no IN token follows the first
        column. In that case the parser rewinds to one token *before* the
        position at which this method was entered.
    """
    start_index = self._index
    expression = self._parse_column()

    if not self._match(TokenType.IN):
        self._retreat(start_index - 1)
        return None

    iterator = self._parse_column()

    condition = None
    if self._match_text_seq("IF"):
        condition = self._parse_assignment()

    return self.expression(
        exp.Comprehension,
        this=this,
        expression=expression,
        iterator=iterator,
        condition=condition,
    )
def _parse_heredoc(self) -> t.Optional[exp.Heredoc]:
    """Parse a heredoc, either pre-tokenized or delimited by `$[tag]$`.

    Returns:
        An `exp.Heredoc`; None when the input does not start a heredoc, or
        when no closing delimiter is found and `raise_error` did not raise.
    """
    if self._match(TokenType.HEREDOC_STRING):
        return self.expression(exp.Heredoc, this=self._prev.text)

    if not self._match_text_seq("$"):
        return None

    # Token texts that make up the delimiter: "$", optionally a tag, then "$".
    delimiter = ["$"]
    tag_text = None

    if self._is_connected():
        self._advance()
        delimiter.append(self._prev.text.upper())
    else:
        self.raise_error("No closing $ found")

    opening_tag = delimiter[-1]
    if opening_tag != "$":
        if self._is_connected() and self._match_text_seq("$"):
            tag_text = opening_tag
            delimiter.append("$")
        else:
            self.raise_error("No closing $ found")

    body_start = self._curr

    # Scan forward until the same delimiter sequence shows up again.
    while self._curr:
        if self._match_text_seq(*delimiter, advance=False):
            body = self._find_sql(body_start, self._prev)
            self._advance(len(delimiter))
            return self.expression(exp.Heredoc, this=body, tag=tag_text)

        self._advance()

    self.raise_error(f"No closing {''.join(delimiter)} found")
    return None
def _find_parser(
    self, parsers: t.Dict[str, t.Callable], trie: t.Dict
) -> t.Optional[t.Callable]:
    """Find the parser registered for the longest-needed run of upcoming words.

    Tokens are consumed one at a time and looked up in `trie` until the
    accumulated words form a registered key or can no longer lead to one.

    Args:
        parsers: Maps space-joined, upper-cased keys to parser callables.
        trie: Trie over the words of the keys in `parsers`.

    Returns:
        The matching parser, with its key's tokens consumed. None, with the
        position restored, when there is no current token, the words do not
        match, or the input ends while they are still only a prefix of a key.
    """
    if not self._curr:
        return None

    index = self._index
    this = []

    # Loop only while a token is available. With an unconditional loop, a run
    # that was merely a prefix of a registered key at the end of the input
    # dereferenced `self._curr` after it had been exhausted.
    while self._curr:
        # The current token might be multiple words
        curr = self._curr.text.upper()
        key = curr.split(" ")
        this.append(curr)

        self._advance()
        result, trie = in_trie(trie, key)
        if result == TrieResult.FAILED:
            break

        if result == TrieResult.EXISTS:
            return parsers[" ".join(this)]

    self._retreat(index)
    return None
def _match(self, token_type, advance=True, expression=None):
    """Check whether the current token has type `token_type`.

    Args:
        token_type: The token type to compare against.
        advance: When True, consume the token on a match.
        expression: Passed to `_add_comments` on a match.

    Returns:
        True on a match, None otherwise (including at end of input).
    """
    current = self._curr
    if not current or current.token_type != token_type:
        return None

    if advance:
        self._advance()
    self._add_comments(expression)
    return True
def _match_set(self, types, advance=True):
    """Check whether the current token's type is contained in `types`.

    Returns True on a match (consuming the token when `advance` is True) and
    None otherwise, including at end of input.
    """
    current = self._curr
    if not current or current.token_type not in types:
        return None

    if advance:
        self._advance()
    return True
def _match_pair(self, token_type_a, token_type_b, advance=True):
    """Check whether the current and next tokens have the given types, in order.

    Returns True on a match (consuming both tokens when `advance` is True)
    and None otherwise, including when fewer than two tokens remain.
    """
    first, second = self._curr, self._next
    if not first or not second:
        return None

    if first.token_type != token_type_a or second.token_type != token_type_b:
        return None

    if advance:
        self._advance(2)
    return True
def _match_l_paren(self, expression: t.Optional[exp.Expression] = None) -> None:
    """Consume a "(" token, reporting an error through `raise_error` if it is missing.

    `expression` is forwarded to `_match`.
    """
    found = self._match(TokenType.L_PAREN, expression=expression)
    if not found:
        self.raise_error("Expecting (")
def _match_r_paren(self, expression: t.Optional[exp.Expression] = None) -> None:
    """Consume a ")" token, reporting an error through `raise_error` if it is missing.

    `expression` is forwarded to `_match`.
    """
    found = self._match(TokenType.R_PAREN, expression=expression)
    if not found:
        self.raise_error("Expecting )")
def _match_texts(self, texts, advance=True):
    """Check whether the current token's upper-cased text is contained in `texts`.

    STRING tokens never match. Returns True on a match (consuming the token
    when `advance` is True) and None otherwise.
    """
    token = self._curr
    if not token or token.token_type == TokenType.STRING:
        return None
    if token.text.upper() not in texts:
        return None

    if advance:
        self._advance()
    return True
def _match_text_seq(self, *texts, advance=True):
    """Check whether the upcoming tokens spell out `texts`, one token per text.

    Token text is upper-cased before comparison, and STRING tokens never match.

    Returns:
        True when every text matched; the tokens stay consumed only if
        `advance` is True. None on the first mismatch, with the position restored.
    """
    start_index = self._index

    for expected in texts:
        token = self._curr
        is_match = (
            token
            and token.token_type != TokenType.STRING
            and token.text.upper() == expected
        )
        if not is_match:
            self._retreat(start_index)
            return None
        self._advance()

    if not advance:
        self._retreat(start_index)

    return True
def _replace_lambda(
    self, node: t.Optional[exp.Expression], expressions: t.List[exp.Expression]
) -> t.Optional[exp.Expression]:
    """Rewrite columns in `node` that refer to the lambda parameters in `expressions`.

    A column whose first part is named like a parameter is replaced by its
    dot/identifier form, wrapped in a cast when the parameter carries a "to"
    type. If the column sits inside a chain of `exp.Dot` nodes, the outermost
    Dot of that chain is replaced instead.

    Returns:
        The rewritten node, which is a new object when `node` itself was a
        matching column.
    """
    if not node:
        return node

    # False marks "parameter without a type"; None from .get() below means
    # "not a lambda parameter at all".
    lambda_types = {e.name: e.args.get("to") or False for e in expressions}

    for column in node.find_all(exp.Column):
        cast_to = lambda_types.get(column.parts[0].name)
        if cast_to is None:
            continue

        replacement = column.to_dot() if column.table else column.this

        if cast_to:
            replacement = self.expression(exp.Cast, this=replacement, to=cast_to)

        ancestor = column.parent

        while isinstance(ancestor, exp.Dot):
            if not isinstance(ancestor.parent, exp.Dot):
                ancestor.replace(replacement)
                break
            ancestor = ancestor.parent
        else:
            # Reached only when the loop did not break, i.e. no Dot chain
            # was replaced above.
            if column is node:
                node = replacement
            else:
                column.replace(replacement)

    return node
def _parse_truncate_table(self) -> t.Optional[exp.TruncateTable] | exp.Expression:
    """Parse a TRUNCATE statement, or hand over to function parsing.

    Returns:
        The result of `_parse_function` when TRUNCATE is followed by "(";
        an `exp.TruncateTable` when the whole statement was consumed;
        otherwise an `exp.Command` re-read from the start token.
    """
    start = self._prev

    # Not to be confused with TRUNCATE(number, decimals) function call
    if self._match(TokenType.L_PAREN):
        self._retreat(self._index - 2)
        return self._parse_function()

    # Clickhouse supports TRUNCATE DATABASE as well
    is_database = self._match(TokenType.DATABASE)

    self._match(TokenType.TABLE)

    exists = self._parse_exists(not_=False)

    def parse_target() -> t.Optional[exp.Expression]:
        return self._parse_table(schema=True, is_db_reference=is_database)

    expressions = self._parse_csv(parse_target)

    cluster = None
    if self._match(TokenType.ON):
        cluster = self._parse_on_property()

    identity = None
    for keyword in ("RESTART", "CONTINUE"):
        if self._match_text_seq(keyword, "IDENTITY"):
            identity = keyword
            break

    option = None
    if self._match_text_seq("CASCADE") or self._match_text_seq("RESTRICT"):
        option = self._prev.text

    partition = self._parse_partition()

    # Fallback case
    if self._curr:
        return self._parse_as_command(start)

    return self.expression(
        exp.TruncateTable,
        expressions=expressions,
        is_database=is_database,
        exists=exists,
        cluster=cluster,
        identity=identity,
        option=option,
        partition=partition,
    )
def _parse_with_operator(self) -> t.Optional[exp.Expression]:
    """Parse an ordered opclass expression, optionally followed by `WITH <operator>`."""
    ordered = self._parse_ordered(self._parse_opclass)

    if self._match(TokenType.WITH):
        operator = self._parse_var(any_token=True)
        return self.expression(exp.WithOperator, this=ordered, op=operator)

    return ordered
def _parse_wrapped_options(self) -> t.List[t.Optional[exp.Expression]]:
    """Parse `[=] [(] <option> [,] ... )` into a list of properties.

    Options are read until a ")" is consumed or the input ends. Both the
    leading "=" / "(" and the commas between options are optional.
    """
    self._match(TokenType.EQ)
    self._match(TokenType.L_PAREN)

    opts: t.List[t.Optional[exp.Expression]] = []
    while self._curr and not self._match(TokenType.R_PAREN):
        option: t.Optional[exp.Expression]
        if self._match_text_seq("FORMAT_NAME", "="):
            # The FORMAT_NAME can be set to an identifier for Snowflake and T-SQL,
            # so we parse it separately to use _parse_field()
            option = self.expression(
                exp.Property, this=exp.var("FORMAT_NAME"), value=self._parse_field()
            )
        else:
            option = self._parse_property()

        opts.append(option)
        self._match(TokenType.COMMA)

    return opts
def _parse_copy_parameters(self) -> t.List[exp.CopyParameter]:
    """Parse COPY parameters up to (but not including) a closing ")".

    Each parameter is a name followed by an optional "=" and/or alias token
    and a value. The value is a wrapped option list for names in
    `COPY_INTO_VARLEN_OPTIONS` that are followed by "(", a field for
    FILE_FORMAT, and an unquoted field otherwise.
    """
    # Parameters are comma-separated only for dialects that declare so;
    # with a separator of None the trailing match below never succeeds.
    separator = TokenType.COMMA if self.dialect.COPY_PARAMS_ARE_CSV else None

    parameters = []
    while self._curr and not self._match(TokenType.R_PAREN, advance=False):
        name = self._parse_var(any_token=True)
        name_text = self._prev.text.upper()

        # Different dialects might separate options and values by white space, "=" and "AS"
        self._match(TokenType.EQ)
        self._match(TokenType.ALIAS)

        parameter = self.expression(exp.CopyParameter, this=name)

        takes_option_list = name_text in self.COPY_INTO_VARLEN_OPTIONS and self._match(
            TokenType.L_PAREN, advance=False
        )
        if takes_option_list:
            # Snowflake FILE_FORMAT case, Databricks COPY & FORMAT options
            parameter.set("expressions", self._parse_wrapped_options())
        elif name_text == "FILE_FORMAT":
            # T-SQL's external file format case
            parameter.set("expression", self._parse_field())
        else:
            parameter.set("expression", self._parse_unquoted_field())

        parameters.append(parameter)
        self._match(separator)

    return parameters
def _parse_credentials(self) -> t.Optional[exp.Credentials]:
    """Parse the optional credential clauses of a COPY statement.

    Recognized in this order, each at most once: STORAGE_INTEGRATION,
    CREDENTIALS, ENCRYPTION, IAM_ROLE and REGION. An `exp.Credentials` node is
    returned even when none of them is present.
    """
    credentials = self.expression(exp.Credentials)

    if self._match_text_seq("STORAGE_INTEGRATION", "="):
        credentials.set("storage", self._parse_field())

    if self._match_text_seq("CREDENTIALS"):
        # Snowflake case: CREDENTIALS = (...), Redshift case: CREDENTIALS <string>
        if self._match(TokenType.EQ):
            value = self._parse_wrapped_options()
        else:
            value = self._parse_field()
        credentials.set("credentials", value)

    if self._match_text_seq("ENCRYPTION"):
        credentials.set("encryption", self._parse_wrapped_options())

    if self._match_text_seq("IAM_ROLE"):
        credentials.set("iam_role", self._parse_field())

    if self._match_text_seq("REGION"):
        credentials.set("region", self._parse_field())

    return credentials
def _parse_file_location(self) -> t.Optional[exp.Expression]:
    """Parse a file location; this implementation simply parses a field."""
    location = self._parse_field()
    return location
def _parse_copy(self) -> exp.Copy | exp.Command:
    """Parse a COPY statement, falling back to an opaque command.

    Returns:
        An `exp.Copy` when the whole statement was consumed, otherwise an
        `exp.Command` re-read from the start token. `kind` is truthy when FROM
        is matched or when TO is not matched.
    """
    start = self._prev

    self._match(TokenType.INTO)

    if self._match(TokenType.L_PAREN, advance=False):
        this = self._parse_select(nested=True, parse_subquery_alias=False)
    else:
        this = self._parse_table(schema=True)

    kind = self._match(TokenType.FROM) or not self._match_text_seq("TO")

    files = self._parse_csv(self._parse_file_location)
    credentials = self._parse_credentials()

    self._match_text_seq("WITH")

    params = self._parse_wrapped(self._parse_copy_parameters, optional=True)

    # Fallback case
    if self._curr:
        return self._parse_as_command(start)

    return self.expression(
        exp.Copy,
        this=this,
        kind=kind,
        credentials=credentials,
        files=files,
        params=params,
    )
def _parse_normalize(self) -> exp.Normalize:
    """Parse the arguments of NORMALIZE: an expression and an optional `, <form>`."""
    this = self._parse_bitwise()
    form = self._match(TokenType.COMMA) and self._parse_var()
    return self.expression(exp.Normalize, this=this, form=form)
def _parse_star_ops(self) -> t.Optional[exp.Expression]:
    """Parse what follows a star: a `COLUMNS(...)` call or star modifiers.

    Returns:
        The parsed function when the input continues with `COLUMNS (`, flagged
        with "unpack" if it is an `exp.Columns`. Otherwise an `exp.Star`
        carrying the optional EXCEPT/EXCLUDE, REPLACE and RENAME lists.
    """
    if self._match_text_seq("COLUMNS", "(", advance=False):
        this = self._parse_function()
        if isinstance(this, exp.Columns):
            this.set("unpack", True)
        return this

    # Parsed in this fixed order; "except" is a Python keyword, hence the dict.
    excluded = self._parse_star_op("EXCEPT", "EXCLUDE")
    replaced = self._parse_star_op("REPLACE")
    renamed = self._parse_star_op("RENAME")

    return self.expression(
        exp.Star,
        **{  # type: ignore
            "except": excluded,
            "replace": replaced,
            "rename": renamed,
        },
    )
def _parse_grant_privilege(self) -> t.Optional[exp.GrantPrivilege]:
    """Parse one privilege of a GRANT statement, with its optional column list.

    The privilege name is the space-joined, upper-cased text of all tokens up
    to the first token in `PRIVILEGE_FOLLOW_TOKENS`.
    """
    words = []

    # Keep consuming consecutive keywords until comma (end of this privilege) or ON
    # (end of privilege list) or L_PAREN (start of column list) are met
    while self._curr and not self._match_set(self.PRIVILEGE_FOLLOW_TOKENS, advance=False):
        words.append(self._curr.text.upper())
        self._advance()

    privilege = exp.var(" ".join(words))

    columns = None
    if self._match(TokenType.L_PAREN, advance=False):
        columns = self._parse_wrapped_csv(self._parse_column)

    return self.expression(exp.GrantPrivilege, this=privilege, expressions=columns)
def _parse_grant_principal(self) -> t.Optional[exp.GrantPrincipal]:
    """Parse `[ROLE | GROUP] <name>` into an `exp.GrantPrincipal`.

    Returns None when no name can be parsed. `kind` is the upper-cased keyword
    when ROLE or GROUP is present, and the falsy match result otherwise.
    """
    kind = self._match_texts(("ROLE", "GROUP")) and self._prev.text.upper()

    principal = self._parse_id_var()
    if principal:
        return self.expression(exp.GrantPrincipal, this=principal, kind=kind)

    return None
def _parse_grant(self) -> exp.Grant | exp.Command:
    """Parse a GRANT statement, falling back to an opaque command.

    Returns:
        An `exp.Grant` when the securable parses, TO is present and the whole
        statement was consumed; otherwise an `exp.Command` re-read from the
        start token.
    """
    start = self._prev

    privileges = self._parse_csv(self._parse_grant_privilege)

    self._match(TokenType.ON)
    kind = self._match_set(self.CREATABLES) and self._prev.text.upper()

    # Attempt to parse the securable e.g. MySQL allows names
    # such as "foo.*", "*.*" which are not easily parseable yet
    securable = self._try_parse(self._parse_table_parts)

    has_grantees = securable and self._match_text_seq("TO")
    if not has_grantees:
        return self._parse_as_command(start)

    principals = self._parse_csv(self._parse_grant_principal)

    grant_option = self._match_text_seq("WITH", "GRANT", "OPTION")

    if self._curr:
        return self._parse_as_command(start)

    return self.expression(
        exp.Grant,
        privileges=privileges,
        kind=kind,
        securable=securable,
        principals=principals,
        grant_option=grant_option,
    )
def _parse_overlay(self) -> exp.Overlay:
    """Parse `<expr> [PLACING <expr>] [FROM <expr>] [FOR <expr>]` into an `exp.Overlay`.

    A missing clause leaves the falsy keyword-match result in its arg.
    """
    this = self._parse_bitwise()
    placing = self._match_text_seq("PLACING") and self._parse_bitwise()
    start = self._match_text_seq("FROM") and self._parse_bitwise()
    length = self._match_text_seq("FOR") and self._parse_bitwise()

    # "from" and "for" are Python keywords, so the args go through a dict.
    return self.expression(
        exp.Overlay,
        **{  # type: ignore
            "this": this,
            "expression": placing,
            "from": start,
            "for": length,
        },
    )