Merging upstream version 10.4.2.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-13 15:01:55 +01:00 · 2025-02-13 15:01:55 +01:00 · 0c79f8b507
commit 0c79f8b507
parent de4e42d4d3
88 changed files with 1637 additions and 436 deletions
--- a/sqlglot/parser.py
+++ b/sqlglot/parser.py
@ -5,7 +5,7 @@ import typing as t

 from sqlglot import exp
 from sqlglot.errors import ErrorLevel, ParseError, concat_messages, merge_errors
-from sqlglot.helper import apply_index_offset, ensure_collection, seq_get
+from sqlglot.helper import apply_index_offset, ensure_collection, ensure_list, seq_get
 from sqlglot.tokens import Token, Tokenizer, TokenType
 from sqlglot.trie import in_trie, new_trie

@ -117,6 +117,7 @@ class Parser(metaclass=_Parser):
        TokenType.GEOMETRY,
        TokenType.HLLSKETCH,
        TokenType.HSTORE,
+        TokenType.PSEUDO_TYPE,
        TokenType.SUPER,
        TokenType.SERIAL,
        TokenType.SMALLSERIAL,
@ -153,6 +154,7 @@ class Parser(metaclass=_Parser):
        TokenType.CACHE,
        TokenType.CASCADE,
        TokenType.COLLATE,
+        TokenType.COLUMN,
        TokenType.COMMAND,
        TokenType.COMMIT,
        TokenType.COMPOUND,
@ -169,6 +171,7 @@ class Parser(metaclass=_Parser):
        TokenType.ESCAPE,
        TokenType.FALSE,
        TokenType.FIRST,
+        TokenType.FILTER,
        TokenType.FOLLOWING,
        TokenType.FORMAT,
        TokenType.FUNCTION,
@ -188,6 +191,7 @@ class Parser(metaclass=_Parser):
        TokenType.MERGE,
        TokenType.NATURAL,
        TokenType.NEXT,
+        TokenType.OFFSET,
        TokenType.ONLY,
        TokenType.OPTIONS,
        TokenType.ORDINALITY,
@ -222,12 +226,18 @@ class Parser(metaclass=_Parser):
        TokenType.PROPERTIES,
        TokenType.PROCEDURE,
        TokenType.VOLATILE,
+        TokenType.WINDOW,
        *SUBQUERY_PREDICATES,
        *TYPE_TOKENS,
        *NO_PAREN_FUNCTIONS,
    }

-    TABLE_ALIAS_TOKENS = ID_VAR_TOKENS - {TokenType.NATURAL, TokenType.APPLY}
+    TABLE_ALIAS_TOKENS = ID_VAR_TOKENS - {
+        TokenType.APPLY,
+        TokenType.NATURAL,
+        TokenType.OFFSET,
+        TokenType.WINDOW,
+    }

    UPDATE_ALIAS_TOKENS = TABLE_ALIAS_TOKENS - {TokenType.SET}

@ -257,6 +267,7 @@ class Parser(metaclass=_Parser):
        TokenType.TABLE,
        TokenType.TIMESTAMP,
        TokenType.TIMESTAMPTZ,
+        TokenType.WINDOW,
        *TYPE_TOKENS,
        *SUBQUERY_PREDICATES,
    }
@ -351,22 +362,27 @@ class Parser(metaclass=_Parser):
        TokenType.ARROW: lambda self, this, path: self.expression(
            exp.JSONExtract,
            this=this,
-            path=path,
+            expression=path,
        ),
        TokenType.DARROW: lambda self, this, path: self.expression(
            exp.JSONExtractScalar,
            this=this,
-            path=path,
+            expression=path,
        ),
        TokenType.HASH_ARROW: lambda self, this, path: self.expression(
            exp.JSONBExtract,
            this=this,
-            path=path,
+            expression=path,
        ),
        TokenType.DHASH_ARROW: lambda self, this, path: self.expression(
            exp.JSONBExtractScalar,
            this=this,
-            path=path,
+            expression=path,
+        ),
+        TokenType.PLACEHOLDER: lambda self, this, key: self.expression(
+            exp.JSONBContains,
+            this=this,
+            expression=key,
        ),
    }

@ -392,25 +408,27 @@ class Parser(metaclass=_Parser):
        exp.Ordered: lambda self: self._parse_ordered(),
        exp.Having: lambda self: self._parse_having(),
        exp.With: lambda self: self._parse_with(),
+        exp.Window: lambda self: self._parse_named_window(),
        "JOIN_TYPE": lambda self: self._parse_join_side_and_kind(),
    }

    STATEMENT_PARSERS = {
+        TokenType.ALTER: lambda self: self._parse_alter(),
+        TokenType.BEGIN: lambda self: self._parse_transaction(),
+        TokenType.CACHE: lambda self: self._parse_cache(),
+        TokenType.COMMIT: lambda self: self._parse_commit_or_rollback(),
        TokenType.CREATE: lambda self: self._parse_create(),
+        TokenType.DELETE: lambda self: self._parse_delete(),
        TokenType.DESCRIBE: lambda self: self._parse_describe(),
        TokenType.DROP: lambda self: self._parse_drop(),
+        TokenType.END: lambda self: self._parse_commit_or_rollback(),
        TokenType.INSERT: lambda self: self._parse_insert(),
        TokenType.LOAD_DATA: lambda self: self._parse_load_data(),
-        TokenType.UPDATE: lambda self: self._parse_update(),
-        TokenType.DELETE: lambda self: self._parse_delete(),
-        TokenType.CACHE: lambda self: self._parse_cache(),
-        TokenType.UNCACHE: lambda self: self._parse_uncache(),
-        TokenType.USE: lambda self: self.expression(exp.Use, this=self._parse_id_var()),
-        TokenType.BEGIN: lambda self: self._parse_transaction(),
-        TokenType.COMMIT: lambda self: self._parse_commit_or_rollback(),
-        TokenType.END: lambda self: self._parse_commit_or_rollback(),
-        TokenType.ROLLBACK: lambda self: self._parse_commit_or_rollback(),
        TokenType.MERGE: lambda self: self._parse_merge(),
+        TokenType.ROLLBACK: lambda self: self._parse_commit_or_rollback(),
+        TokenType.UNCACHE: lambda self: self._parse_uncache(),
+        TokenType.UPDATE: lambda self: self._parse_update(),
+        TokenType.USE: lambda self: self.expression(exp.Use, this=self._parse_id_var()),
    }

    UNARY_PARSERS = {
@ -441,6 +459,7 @@ class Parser(metaclass=_Parser):
        TokenType.HEX_STRING: lambda self, token: self.expression(exp.HexString, this=token.text),
        TokenType.BYTE_STRING: lambda self, token: self.expression(exp.ByteString, this=token.text),
        TokenType.INTRODUCER: lambda self, token: self._parse_introducer(token),
+        TokenType.NATIONAL: lambda self, token: self._parse_national(token),
        TokenType.SESSION_PARAMETER: lambda self, _: self._parse_session_parameter(),
    }

@ -454,6 +473,9 @@ class Parser(metaclass=_Parser):
        TokenType.ILIKE: lambda self, this: self._parse_escape(
            self.expression(exp.ILike, this=this, expression=self._parse_bitwise())
        ),
+        TokenType.IRLIKE: lambda self, this: self.expression(
+            exp.RegexpILike, this=this, expression=self._parse_bitwise()
+        ),
        TokenType.RLIKE: lambda self, this: self.expression(
            exp.RegexpLike, this=this, expression=self._parse_bitwise()
        ),
@ -535,8 +557,7 @@ class Parser(metaclass=_Parser):
        "group": lambda self: self._parse_group(),
        "having": lambda self: self._parse_having(),
        "qualify": lambda self: self._parse_qualify(),
-        "window": lambda self: self._match(TokenType.WINDOW)
-        and self._parse_window(self._parse_id_var(), alias=True),
+        "windows": lambda self: self._parse_window_clause(),
        "distribute": lambda self: self._parse_sort(TokenType.DISTRIBUTE_BY, exp.Distribute),
        "sort": lambda self: self._parse_sort(TokenType.SORT_BY, exp.Sort),
        "cluster": lambda self: self._parse_sort(TokenType.CLUSTER_BY, exp.Cluster),
@ -551,18 +572,18 @@ class Parser(metaclass=_Parser):
    MODIFIABLES = (exp.Subquery, exp.Subqueryable, exp.Table)

    CREATABLES = {
-        TokenType.TABLE,
-        TokenType.VIEW,
+        TokenType.COLUMN,
        TokenType.FUNCTION,
        TokenType.INDEX,
        TokenType.PROCEDURE,
        TokenType.SCHEMA,
+        TokenType.TABLE,
+        TokenType.VIEW,
    }

    TRANSACTION_KIND = {"DEFERRED", "IMMEDIATE", "EXCLUSIVE"}

    STRICT_CAST = True
-    LATERAL_FUNCTION_AS_VIEW = False

    __slots__ = (
        "error_level",
@ -782,13 +803,16 @@ class Parser(metaclass=_Parser):
        self._parse_query_modifiers(expression)
        return expression

-    def _parse_drop(self):
+    def _parse_drop(self, default_kind=None):
        temporary = self._match(TokenType.TEMPORARY)
        materialized = self._match(TokenType.MATERIALIZED)
        kind = self._match_set(self.CREATABLES) and self._prev.text
        if not kind:
-            self.raise_error(f"Expected {self.CREATABLES}")
-            return
+            if default_kind:
+                kind = default_kind
+            else:
+                self.raise_error(f"Expected {self.CREATABLES}")
+                return

        return self.expression(
            exp.Drop,
@ -876,7 +900,7 @@ class Parser(metaclass=_Parser):
        ) or self._match_pair(TokenType.STRING, TokenType.EQ, advance=False)

        if assignment:
-            key = self._parse_var() or self._parse_string()
+            key = self._parse_var_or_string()
            self._match(TokenType.EQ)
            return self.expression(exp.Property, this=key, value=self._parse_column())

@ -1152,18 +1176,32 @@ class Parser(metaclass=_Parser):
        elif (table or nested) and self._match(TokenType.L_PAREN):
            this = self._parse_table() if table else self._parse_select(nested=True)
            self._parse_query_modifiers(this)
+            this = self._parse_set_operations(this)
            self._match_r_paren()
-            this = self._parse_subquery(this)
+            # early return so that subquery unions aren't parsed again
+            # SELECT * FROM (SELECT 1) UNION ALL SELECT 1
+            # Union ALL should be a property of the top select node, not the subquery
+            return self._parse_subquery(this)
        elif self._match(TokenType.VALUES):
+            if self._curr.token_type == TokenType.L_PAREN:
+                # We don't consume the left paren because it's consumed in _parse_value
+                expressions = self._parse_csv(self._parse_value)
+            else:
+                # In presto we can have VALUES 1, 2 which results in 1 column & 2 rows.
+                # Source: https://prestodb.io/docs/current/sql/values.html
+                expressions = self._parse_csv(
+                    lambda: self.expression(exp.Tuple, expressions=[self._parse_conjunction()])
+                )
+
            this = self.expression(
                exp.Values,
-                expressions=self._parse_csv(self._parse_value),
+                expressions=expressions,
                alias=self._parse_table_alias(),
            )
        else:
            this = None

-        return self._parse_set_operations(this) if this else None
+        return self._parse_set_operations(this)

    def _parse_with(self, skip_with_token=False):
        if not skip_with_token and not self._match(TokenType.WITH):
@ -1201,11 +1239,12 @@ class Parser(metaclass=_Parser):
        alias = self._parse_id_var(
            any_token=any_token, tokens=alias_tokens or self.TABLE_ALIAS_TOKENS
        )
-        columns = None

        if self._match(TokenType.L_PAREN):
-            columns = self._parse_csv(lambda: self._parse_id_var(any_token))
+            columns = self._parse_csv(lambda: self._parse_column_def(self._parse_id_var()))
            self._match_r_paren()
+        else:
+            columns = None

        if not alias and not columns:
            return None
@ -1295,26 +1334,19 @@ class Parser(metaclass=_Parser):
                    expression=self._parse_function() or self._parse_id_var(any_token=False),
                )

-        columns = None
-        table_alias = None
-        if view or self.LATERAL_FUNCTION_AS_VIEW:
-            table_alias = self._parse_id_var(any_token=False)
-            if self._match(TokenType.ALIAS):
-                columns = self._parse_csv(self._parse_id_var)
+        if view:
+            table = self._parse_id_var(any_token=False)
+            columns = self._parse_csv(self._parse_id_var) if self._match(TokenType.ALIAS) else []
+            table_alias = self.expression(exp.TableAlias, this=table, columns=columns)
        else:
-            self._match(TokenType.ALIAS)
-            table_alias = self._parse_id_var(any_token=False)
-
-            if self._match(TokenType.L_PAREN):
-                columns = self._parse_csv(self._parse_id_var)
-                self._match_r_paren()
+            table_alias = self._parse_table_alias()

        expression = self.expression(
            exp.Lateral,
            this=this,
            view=view,
            outer=outer,
-            alias=self.expression(exp.TableAlias, this=table_alias, columns=columns),
+            alias=table_alias,
        )

        if outer_apply or cross_apply:
@ -1693,6 +1725,9 @@ class Parser(metaclass=_Parser):
        if negate:
            this = self.expression(exp.Not, this=this)

+        if self._match(TokenType.IS):
+            this = self._parse_is(this)
+
        return this

    def _parse_is(self, this):
@ -1796,6 +1831,10 @@ class Parser(metaclass=_Parser):
            return None

        type_token = self._prev.token_type
+
+        if type_token == TokenType.PSEUDO_TYPE:
+            return self.expression(exp.PseudoType, this=self._prev.text)
+
        nested = type_token in self.NESTED_TYPE_TOKENS
        is_struct = type_token == TokenType.STRUCT
        expressions = None
@ -1851,6 +1890,8 @@ class Parser(metaclass=_Parser):

            if value is None:
                value = exp.DataType(this=exp.DataType.Type.TIMESTAMP, expressions=expressions)
+        elif type_token == TokenType.INTERVAL:
+            value = self.expression(exp.Interval, unit=self._parse_var())

        if maybe_func and check_func:
            index2 = self._index
@ -1924,7 +1965,16 @@ class Parser(metaclass=_Parser):

    def _parse_primary(self):
        if self._match_set(self.PRIMARY_PARSERS):
-            return self.PRIMARY_PARSERS[self._prev.token_type](self, self._prev)
+            token_type = self._prev.token_type
+            primary = self.PRIMARY_PARSERS[token_type](self, self._prev)
+
+            if token_type == TokenType.STRING:
+                expressions = [primary]
+                while self._match(TokenType.STRING):
+                    expressions.append(exp.Literal.string(self._prev.text))
+                if len(expressions) > 1:
+                    return self.expression(exp.Concat, expressions=expressions)
+            return primary

        if self._match_pair(TokenType.DOT, TokenType.NUMBER):
            return exp.Literal.number(f"0.{self._prev.text}")
@ -2027,6 +2077,9 @@ class Parser(metaclass=_Parser):

        return self.expression(exp.Identifier, this=token.text)

+    def _parse_national(self, token):
+        return self.expression(exp.National, this=exp.Literal.string(token.text))
+
    def _parse_session_parameter(self):
        kind = None
        this = self._parse_id_var() or self._parse_primary()
@ -2051,7 +2104,9 @@ class Parser(metaclass=_Parser):

        if self._match(TokenType.L_PAREN):
            expressions = self._parse_csv(self._parse_id_var)
-            self._match(TokenType.R_PAREN)
+
+            if not self._match(TokenType.R_PAREN):
+                self._retreat(index)
        else:
            expressions = [self._parse_id_var()]

@ -2065,14 +2120,14 @@ class Parser(metaclass=_Parser):
                exp.Distinct, expressions=self._parse_csv(self._parse_conjunction)
            )
        else:
-            this = self._parse_conjunction()
+            this = self._parse_select_or_expression()

        if self._match(TokenType.IGNORE_NULLS):
            this = self.expression(exp.IgnoreNulls, this=this)
        else:
            self._match(TokenType.RESPECT_NULLS)

-        return self._parse_alias(self._parse_limit(self._parse_order(this)))
+        return self._parse_limit(self._parse_order(this))

    def _parse_schema(self, this=None):
        index = self._index
@ -2081,7 +2136,8 @@ class Parser(metaclass=_Parser):
            return this

        args = self._parse_csv(
-            lambda: self._parse_constraint() or self._parse_column_def(self._parse_field(True))
+            lambda: self._parse_constraint()
+            or self._parse_column_def(self._parse_field(any_token=True))
        )
        self._match_r_paren()
        return self.expression(exp.Schema, this=this, expressions=args)
@ -2120,7 +2176,7 @@ class Parser(metaclass=_Parser):
        elif self._match(TokenType.ENCODE):
            kind = self.expression(exp.EncodeColumnConstraint, this=self._parse_var())
        elif self._match(TokenType.DEFAULT):
-            kind = self.expression(exp.DefaultColumnConstraint, this=self._parse_conjunction())
+            kind = self.expression(exp.DefaultColumnConstraint, this=self._parse_bitwise())
        elif self._match_pair(TokenType.NOT, TokenType.NULL):
            kind = exp.NotNullColumnConstraint()
        elif self._match(TokenType.NULL):
@ -2211,7 +2267,10 @@ class Parser(metaclass=_Parser):
        if not self._match(TokenType.L_BRACKET):
            return this

-        expressions = self._parse_csv(self._parse_conjunction)
+        if self._match(TokenType.COLON):
+            expressions = [self.expression(exp.Slice, expression=self._parse_conjunction())]
+        else:
+            expressions = self._parse_csv(lambda: self._parse_slice(self._parse_conjunction()))

        if not this or this.name.upper() == "ARRAY":
            this = self.expression(exp.Array, expressions=expressions)
@ -2225,6 +2284,11 @@ class Parser(metaclass=_Parser):
        this.comments = self._prev_comments
        return self._parse_bracket(this)

+    def _parse_slice(self, this):
+        if self._match(TokenType.COLON):
+            return self.expression(exp.Slice, this=this, expression=self._parse_conjunction())
+        return this
+
    def _parse_case(self):
        ifs = []
        default = None
@ -2386,6 +2450,12 @@ class Parser(metaclass=_Parser):
            collation=collation,
        )

+    def _parse_window_clause(self):
+        return self._match(TokenType.WINDOW) and self._parse_csv(self._parse_named_window)
+
+    def _parse_named_window(self):
+        return self._parse_window(self._parse_id_var(), alias=True)
+
    def _parse_window(self, this, alias=False):
        if self._match(TokenType.FILTER):
            where = self._parse_wrapped(self._parse_where)
@ -2501,11 +2571,9 @@ class Parser(metaclass=_Parser):
        if identifier:
            return identifier

-        if any_token and self._curr and self._curr.token_type not in self.RESERVED_KEYWORDS:
-            self._advance()
-        elif not self._match_set(tokens or self.ID_VAR_TOKENS):
-            return None
-        return exp.Identifier(this=self._prev.text, quoted=False)
+        if (any_token and self._advance_any()) or self._match_set(tokens or self.ID_VAR_TOKENS):
+            return exp.Identifier(this=self._prev.text, quoted=False)
+        return None

    def _parse_string(self):
        if self._match(TokenType.STRING):
@ -2522,11 +2590,17 @@ class Parser(metaclass=_Parser):
            return self.expression(exp.Identifier, this=self._prev.text, quoted=True)
        return self._parse_placeholder()

-    def _parse_var(self):
-        if self._match(TokenType.VAR):
+    def _parse_var(self, any_token=False):
+        if (any_token and self._advance_any()) or self._match(TokenType.VAR):
            return self.expression(exp.Var, this=self._prev.text)
        return self._parse_placeholder()

+    def _advance_any(self):
+        if self._curr and self._curr.token_type not in self.RESERVED_KEYWORDS:
+            self._advance()
+            return self._prev
+        return None
+
    def _parse_var_or_string(self):
        return self._parse_var() or self._parse_string()

@ -2551,8 +2625,9 @@ class Parser(metaclass=_Parser):
        if self._match(TokenType.PLACEHOLDER):
            return self.expression(exp.Placeholder)
        elif self._match(TokenType.COLON):
-            self._advance()
-            return self.expression(exp.Placeholder, this=self._prev.text)
+            if self._match_set((TokenType.NUMBER, TokenType.VAR)):
+                return self.expression(exp.Placeholder, this=self._prev.text)
+            self._advance(-1)
        return None

    def _parse_except(self):
@ -2647,6 +2722,54 @@ class Parser(metaclass=_Parser):
            return self.expression(exp.Rollback, savepoint=savepoint)
        return self.expression(exp.Commit, chain=chain)

+    def _parse_add_column(self):
+        if not self._match_text_seq("ADD"):
+            return None
+
+        self._match(TokenType.COLUMN)
+        exists_column = self._parse_exists(not_=True)
+        expression = self._parse_column_def(self._parse_field(any_token=True))
+        expression.set("exists", exists_column)
+        return expression
+
+    def _parse_drop_column(self):
+        return self._match(TokenType.DROP) and self._parse_drop(default_kind="COLUMN")
+
+    def _parse_alter(self):
+        if not self._match(TokenType.TABLE):
+            return None
+
+        exists = self._parse_exists()
+        this = self._parse_table(schema=True)
+
+        actions = None
+        if self._match_text_seq("ADD", advance=False):
+            actions = self._parse_csv(self._parse_add_column)
+        elif self._match_text_seq("DROP", advance=False):
+            actions = self._parse_csv(self._parse_drop_column)
+        elif self._match_text_seq("ALTER"):
+            self._match(TokenType.COLUMN)
+            column = self._parse_field(any_token=True)
+
+            if self._match_pair(TokenType.DROP, TokenType.DEFAULT):
+                actions = self.expression(exp.AlterColumn, this=column, drop=True)
+            elif self._match_pair(TokenType.SET, TokenType.DEFAULT):
+                actions = self.expression(
+                    exp.AlterColumn, this=column, default=self._parse_conjunction()
+                )
+            else:
+                self._match_text_seq("SET", "DATA")
+                actions = self.expression(
+                    exp.AlterColumn,
+                    this=column,
+                    dtype=self._match_text_seq("TYPE") and self._parse_types(),
+                    collate=self._match(TokenType.COLLATE) and self._parse_term(),
+                    using=self._match(TokenType.USING) and self._parse_conjunction(),
+                )
+
+        actions = ensure_list(actions)
+        return self.expression(exp.AlterTable, this=this, exists=exists, actions=actions)
+
    def _parse_show(self):
        parser = self._find_parser(self.SHOW_PARSERS, self._show_trie)
        if parser:
@ -2782,7 +2905,7 @@ class Parser(metaclass=_Parser):
            return True
        return False

-    def _match_text_seq(self, *texts):
+    def _match_text_seq(self, *texts, advance=True):
        index = self._index
        for text in texts:
            if self._curr and self._curr.text.upper() == text:
@ -2790,6 +2913,10 @@ class Parser(metaclass=_Parser):
            else:
                self._retreat(index)
                return False
+
+        if not advance:
+            self._retreat(index)
+
        return True

    def _replace_columns_with_dots(self, this):