Merging upstream version 7.1.3.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-13 14:46:58 +01:00 · 2025-02-13 14:46:58 +01:00 · e6b3d2fe54
commit e6b3d2fe54
parent 964bd62de9
42 changed files with 1430 additions and 253 deletions
--- a/sqlglot/parser.py
+++ b/sqlglot/parser.py
@@ -135,11 +135,13 @@ class Parser:
        TokenType.BOTH,
        TokenType.BUCKET,
        TokenType.CACHE,
+        TokenType.CALL,
        TokenType.COLLATE,
        TokenType.COMMIT,
        TokenType.CONSTRAINT,
        TokenType.DEFAULT,
        TokenType.DELETE,
+        TokenType.DESCRIBE,
        TokenType.DETERMINISTIC,
        TokenType.EXECUTE,
        TokenType.ENGINE,
@@ -160,6 +162,7 @@ class Parser:
        TokenType.LAZY,
        TokenType.LANGUAGE,
        TokenType.LEADING,
+        TokenType.LOCAL,
        TokenType.LOCATION,
        TokenType.MATERIALIZED,
        TokenType.NATURAL,
@@ -176,6 +179,7 @@ class Parser:
        TokenType.REFERENCES,
        TokenType.RETURNS,
        TokenType.ROWS,
+        TokenType.SCHEMA,
        TokenType.SCHEMA_COMMENT,
        TokenType.SEED,
        TokenType.SEMI,
@@ -294,6 +298,11 @@ class Parser:

    COLUMN_OPERATORS = {
        TokenType.DOT: None,
+        TokenType.DCOLON: lambda self, this, to: self.expression(
+            exp.Cast,
+            this=this,
+            to=to,
+        ),
        TokenType.ARROW: lambda self, this, path: self.expression(
            exp.JSONExtract,
            this=this,
@@ -342,8 +351,10 @@ class Parser:

    STATEMENT_PARSERS = {
        TokenType.CREATE: lambda self: self._parse_create(),
+        TokenType.DESCRIBE: lambda self: self._parse_describe(),
        TokenType.DROP: lambda self: self._parse_drop(),
        TokenType.INSERT: lambda self: self._parse_insert(),
+        TokenType.LOAD_DATA: lambda self: self._parse_load_data(),
        TokenType.UPDATE: lambda self: self._parse_update(),
        TokenType.DELETE: lambda self: self._parse_delete(),
        TokenType.CACHE: lambda self: self._parse_cache(),
@@ -449,7 +460,14 @@ class Parser:

    MODIFIABLES = (exp.Subquery, exp.Subqueryable, exp.Table)

-    CREATABLES = {TokenType.TABLE, TokenType.VIEW, TokenType.FUNCTION, TokenType.INDEX, TokenType.PROCEDURE}
+    CREATABLES = {
+        TokenType.TABLE,
+        TokenType.VIEW,
+        TokenType.FUNCTION,
+        TokenType.INDEX,
+        TokenType.PROCEDURE,
+        TokenType.SCHEMA,
+    }

    STRICT_CAST = True

@@ -650,7 +668,7 @@ class Parser:
        materialized = self._match(TokenType.MATERIALIZED)
        kind = self._match_set(self.CREATABLES) and self._prev.text
        if not kind:
-            self.raise_error("Expected TABLE, VIEW, INDEX, FUNCTION, or PROCEDURE")
+            self.raise_error(f"Expected {self.CREATABLES}")
            return

        return self.expression(
@@ -677,7 +695,7 @@ class Parser:
        create_token = self._match_set(self.CREATABLES) and self._prev

        if not create_token:
-            self.raise_error("Expected TABLE, VIEW, INDEX, FUNCTION, or PROCEDURE")
+            self.raise_error(f"Expected {self.CREATABLES}")
            return

        exists = self._parse_exists(not_=True)
@@ -692,7 +710,7 @@ class Parser:
                expression = self._parse_select_or_expression()
        elif create_token.token_type == TokenType.INDEX:
            this = self._parse_index()
-        elif create_token.token_type in (TokenType.TABLE, TokenType.VIEW):
+        elif create_token.token_type in (TokenType.TABLE, TokenType.VIEW, TokenType.SCHEMA):
            this = self._parse_table(schema=True)
            properties = self._parse_properties()
            if self._match(TokenType.ALIAS):
@@ -836,19 +854,74 @@ class Parser:
            return self.expression(exp.Properties, expressions=properties)
        return None

+    def _parse_describe(self):
+        self._match(TokenType.TABLE)
+
+        return self.expression(exp.Describe, this=self._parse_id_var())
+
    def _parse_insert(self):
        overwrite = self._match(TokenType.OVERWRITE)
-        self._match(TokenType.INTO)
-        self._match(TokenType.TABLE)
+        local = self._match(TokenType.LOCAL)
+        if self._match_text("DIRECTORY"):
+            this = self.expression(
+                exp.Directory,
+                this=self._parse_var_or_string(),
+                local=local,
+                row_format=self._parse_row_format(),
+            )
+        else:
+            self._match(TokenType.INTO)
+            self._match(TokenType.TABLE)
+            this = self._parse_table(schema=True)
        return self.expression(
            exp.Insert,
-            this=self._parse_table(schema=True),
+            this=this,
            exists=self._parse_exists(),
            partition=self._parse_partition(),
            expression=self._parse_select(nested=True),
            overwrite=overwrite,
        )

+    def _parse_row_format(self):
+        if not self._match_pair(TokenType.ROW, TokenType.FORMAT):
+            return None
+
+        self._match_text("DELIMITED")
+
+        kwargs = {}
+
+        if self._match_text("FIELDS", "TERMINATED", "BY"):
+            kwargs["fields"] = self._parse_string()
+            if self._match_text("ESCAPED", "BY"):
+                kwargs["escaped"] = self._parse_string()
+        if self._match_text("COLLECTION", "ITEMS", "TERMINATED", "BY"):
+            kwargs["collection_items"] = self._parse_string()
+        if self._match_text("MAP", "KEYS", "TERMINATED", "BY"):
+            kwargs["map_keys"] = self._parse_string()
+        if self._match_text("LINES", "TERMINATED", "BY"):
+            kwargs["lines"] = self._parse_string()
+        if self._match_text("NULL", "DEFINED", "AS"):
+            kwargs["null"] = self._parse_string()
+        return self.expression(exp.RowFormat, **kwargs)
+
+    def _parse_load_data(self):
+        local = self._match(TokenType.LOCAL)
+        self._match_text("INPATH")
+        inpath = self._parse_string()
+        overwrite = self._match(TokenType.OVERWRITE)
+        self._match_pair(TokenType.INTO, TokenType.TABLE)
+
+        return self.expression(
+            exp.LoadData,
+            this=self._parse_table(schema=True),
+            local=local,
+            overwrite=overwrite,
+            inpath=inpath,
+            partition=self._parse_partition(),
+            input_format=self._match_text("INPUTFORMAT") and self._parse_string(),
+            serde=self._match_text("SERDE") and self._parse_string(),
+        )
+
    def _parse_delete(self):
        self._match(TokenType.FROM)

@@ -1484,6 +1557,14 @@ class Parser:

        if self._match_set(self.RANGE_PARSERS):
            this = self.RANGE_PARSERS[self._prev.token_type](self, this)
+        elif self._match(TokenType.ISNULL):
+            this = self.expression(exp.Is, this=this, expression=exp.Null())
+
+        # Postgres supports ISNULL and NOTNULL for conditions.
+        # https://blog.andreiavram.ro/postgresql-null-composite-type/
+        if self._match(TokenType.NOTNULL):
+            this = self.expression(exp.Is, this=this, expression=exp.Null())
+            this = self.expression(exp.Not, this=this)

        if negate:
            this = self.expression(exp.Not, this=this)
@@ -1582,12 +1663,6 @@ class Parser:
                return self._parse_column()
            return type_token

-        while self._match(TokenType.DCOLON):
-            type_token = self._parse_types()
-            if not type_token:
-                self.raise_error("Expected type")
-            this = self.expression(exp.Cast, this=this, to=type_token)
-
        return this

    def _parse_types(self):
@@ -1601,6 +1676,11 @@ class Parser:
        is_struct = type_token == TokenType.STRUCT
        expressions = None

+        if not nested and self._match_pair(TokenType.L_BRACKET, TokenType.R_BRACKET):
+            return exp.DataType(
+                this=exp.DataType.Type.ARRAY, expressions=[exp.DataType.build(type_token.value)], nested=True
+            )
+
        if self._match(TokenType.L_BRACKET):
            self._retreat(index)
            return None
@@ -1611,7 +1691,7 @@ class Parser:
            elif nested:
                expressions = self._parse_csv(self._parse_types)
            else:
-                expressions = self._parse_csv(self._parse_type)
+                expressions = self._parse_csv(self._parse_conjunction)

            if not expressions:
                self._retreat(index)
@@ -1677,8 +1757,17 @@ class Parser:
        this = self._parse_bracket(this)

        while self._match_set(self.COLUMN_OPERATORS):
-            op = self.COLUMN_OPERATORS.get(self._prev.token_type)
-            field = self._parse_star() or self._parse_function() or self._parse_id_var()
+            op_token = self._prev.token_type
+            op = self.COLUMN_OPERATORS.get(op_token)
+
+            if op_token == TokenType.DCOLON:
+                field = self._parse_types()
+                if not field:
+                    self.raise_error("Expected type")
+            elif op:
+                field = exp.Literal.string(self._advance() or self._prev.text)
+            else:
+                field = self._parse_star() or self._parse_function() or self._parse_id_var()

            if isinstance(field, exp.Func):
                # bigquery allows function calls like x.y.count(...)
@@ -1687,7 +1776,7 @@ class Parser:
                this = self._replace_columns_with_dots(this)

            if op:
-                this = op(self, this, exp.Literal.string(field.name))
+                this = op(self, this, field)
            elif isinstance(this, exp.Column) and not this.table:
                this = self.expression(exp.Column, this=field, table=this.this)
            else:
@@ -1808,11 +1897,10 @@ class Parser:
        if not self._match(TokenType.ARROW):
            self._retreat(index)

-            distinct = self._match(TokenType.DISTINCT)
-            this = self._parse_conjunction()
-
-            if distinct:
-                this = self.expression(exp.Distinct, this=this)
+            if self._match(TokenType.DISTINCT):
+                this = self.expression(exp.Distinct, expressions=self._parse_csv(self._parse_conjunction))
+            else:
+                this = self._parse_conjunction()

            if self._match(TokenType.IGNORE_NULLS):
                this = self.expression(exp.IgnoreNulls, this=this)
@@ -2112,6 +2200,8 @@ class Parser:
            this = self.expression(exp.Filter, this=this, expression=self._parse_where())
            self._match_r_paren()

+        # T-SQL allows the OVER (...) syntax after WITHIN GROUP.
+        # https://learn.microsoft.com/en-us/sql/t-sql/functions/percentile-disc-transact-sql?view=sql-server-ver16
        if self._match(TokenType.WITHIN_GROUP):
            self._match_l_paren()
            this = self.expression(
@@ -2120,7 +2210,6 @@ class Parser:
                expression=self._parse_order(),
            )
            self._match_r_paren()
-            return this

        # SQL spec defines an optional [ { IGNORE | RESPECT } NULLS ] OVER
        # Some dialects choose to implement and some do not.
@@ -2366,6 +2455,16 @@ class Parser:
        if not self._match(TokenType.R_PAREN):
            self.raise_error("Expecting )")

+    def _match_text(self, *texts):
+        index = self._index
+        for text in texts:
+            if self._curr and self._curr.text.upper() == text:
+                self._advance()
+            else:
+                self._retreat(index)
+                return False
+        return True
+
    def _replace_columns_with_dots(self, this):
        if isinstance(this, exp.Dot):
            exp.replace_children(this, self._replace_columns_with_dots)