1
0
Fork 0

Merging upstream version 25.8.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
Daniel Baumann 2025-02-13 21:52:10 +01:00
parent 1d73cb497c
commit 50df8dea29
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
61 changed files with 50550 additions and 50354 deletions

View file

@@ -322,6 +322,7 @@ class BigQuery(Dialect):
"ANY TYPE": TokenType.VARIANT,
"BEGIN": TokenType.COMMAND,
"BEGIN TRANSACTION": TokenType.BEGIN,
"BYTEINT": TokenType.INT,
"BYTES": TokenType.BINARY,
"CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
"DATETIME": TokenType.TIMESTAMP,

View file

@@ -81,6 +81,14 @@ def _build_count_if(args: t.List) -> exp.CountIf | exp.CombinedAggFunc:
return exp.CombinedAggFunc(this="countIf", expressions=args, parts=("count", "If"))
def _build_str_to_date(args: t.List) -> exp.Cast | exp.Anonymous:
if len(args) == 3:
return exp.Anonymous(this="STR_TO_DATE", expressions=args)
strtodate = exp.StrToDate.from_arg_list(args)
return exp.cast(strtodate, exp.DataType.build(exp.DataType.Type.DATETIME))
def _datetime_delta_sql(name: str) -> t.Callable[[Generator, DATETIME_DELTA], str]:
def _delta_sql(self: Generator, expression: DATETIME_DELTA) -> str:
if not expression.unit:
@@ -181,6 +189,7 @@ class ClickHouse(Dialect):
"MAP": parser.build_var_map,
"MATCH": exp.RegexpLike.from_arg_list,
"RANDCANONICAL": exp.Rand.from_arg_list,
"STR_TO_DATE": _build_str_to_date,
"TUPLE": exp.Struct.from_arg_list,
"TIMESTAMP_SUB": build_date_delta(exp.TimestampSub, default_unit=None),
"TIMESTAMPSUB": build_date_delta(exp.TimestampSub, default_unit=None),
@@ -836,6 +845,24 @@ class ClickHouse(Dialect):
"NAMED COLLECTION",
}
def strtodate_sql(self, expression: exp.StrToDate) -> str:
strtodate_sql = self.function_fallback_sql(expression)
if not isinstance(expression.parent, exp.Cast):
# StrToDate returns DATEs in other dialects (eg. postgres), so
# this branch aims to improve the transpilation to clickhouse
return f"CAST({strtodate_sql} AS DATE)"
return strtodate_sql
def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
this = expression.this
if isinstance(this, exp.StrToDate) and expression.to == exp.DataType.build("datetime"):
return self.sql(this)
return super().cast_sql(expression, safe_prefix=safe_prefix)
def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str:
this = self.json_path_part(expression.this)
return str(int(this) + 1) if is_int(this) else this

View file

@@ -158,7 +158,7 @@ def _struct_sql(self: DuckDB.Generator, expression: exp.Struct) -> str:
def _datatype_sql(self: DuckDB.Generator, expression: exp.DataType) -> str:
if expression.is_type("array"):
return f"{self.expressions(expression, flat=True)}[]"
return f"{self.expressions(expression, flat=True)}[{self.expressions(expression, key='values', flat=True)}]"
# Type TIMESTAMP / TIME WITH TIME ZONE does not support any modifiers
if expression.is_type("timestamptz", "timetz"):
@@ -186,9 +186,14 @@ def _unix_to_time_sql(self: DuckDB.Generator, expression: exp.UnixToTime) -> str
return self.func("TO_TIMESTAMP", exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)))
WRAPPED_JSON_EXTRACT_EXPRESSIONS = (exp.Binary, exp.Bracket, exp.In)
def _arrow_json_extract_sql(self: DuckDB.Generator, expression: JSON_EXTRACT_TYPE) -> str:
arrow_sql = arrow_json_extract_sql(self, expression)
if not expression.same_parent and isinstance(expression.parent, (exp.Binary, exp.Bracket)):
if not expression.same_parent and isinstance(
expression.parent, WRAPPED_JSON_EXTRACT_EXPRESSIONS
):
arrow_sql = self.wrap(arrow_sql)
return arrow_sql
@@ -238,14 +243,15 @@ class DuckDB(Dialect):
"POSITIONAL": TokenType.POSITIONAL,
"SIGNED": TokenType.INT,
"STRING": TokenType.TEXT,
"UBIGINT": TokenType.UBIGINT,
"UINTEGER": TokenType.UINT,
"USMALLINT": TokenType.USMALLINT,
"UTINYINT": TokenType.UTINYINT,
"SUMMARIZE": TokenType.SUMMARIZE,
"TIMESTAMP_S": TokenType.TIMESTAMP_S,
"TIMESTAMP_MS": TokenType.TIMESTAMP_MS,
"TIMESTAMP_NS": TokenType.TIMESTAMP_NS,
"TIMESTAMP_US": TokenType.TIMESTAMP,
"UBIGINT": TokenType.UBIGINT,
"UINTEGER": TokenType.UINT,
"USMALLINT": TokenType.USMALLINT,
"UTINYINT": TokenType.UTINYINT,
"VARCHAR": TokenType.TEXT,
}
KEYWORDS.pop("/*+")
@@ -744,10 +750,9 @@ class DuckDB(Dialect):
def generateseries_sql(self, expression: exp.GenerateSeries) -> str:
# GENERATE_SERIES(a, b) -> [a, b], RANGE(a, b) -> [a, b)
if expression.args.get("is_end_exclusive"):
expression.set("is_end_exclusive", None)
return rename_func("RANGE")(self, expression)
return super().generateseries_sql(expression)
return self.function_fallback_sql(expression)
def bracket_sql(self, expression: exp.Bracket) -> str:
this = expression.this

View file

@@ -168,16 +168,13 @@ def _serial_to_generated(expression: exp.Expression) -> exp.Expression:
def _build_generate_series(args: t.List) -> exp.GenerateSeries:
# The goal is to convert step values like '1 day' or INTERVAL '1 day' into INTERVAL '1' day
# Note: postgres allows calls with just two arguments -- the "step" argument defaults to 1
step = seq_get(args, 2)
if step is None:
# Postgres allows calls with just two arguments -- the "step" argument defaults to 1
return exp.GenerateSeries.from_arg_list(args)
if step.is_string:
args[2] = exp.to_interval(step.this)
elif isinstance(step, exp.Interval) and not step.args.get("unit"):
args[2] = exp.to_interval(step.this.this)
if step is not None:
if step.is_string:
args[2] = exp.to_interval(step.this)
elif isinstance(step, exp.Interval) and not step.args.get("unit"):
args[2] = exp.to_interval(step.this.this)
return exp.GenerateSeries.from_arg_list(args)

View file

@@ -393,9 +393,6 @@ class Presto(Dialect):
TRANSFORMS = {
**generator.Generator.TRANSFORMS,
exp.AnyValue: rename_func("ARBITRARY"),
exp.ApproxDistinct: lambda self, e: self.func(
"APPROX_DISTINCT", e.this, e.args.get("accuracy")
),
exp.ApproxQuantile: rename_func("APPROX_PERCENTILE"),
exp.ArgMax: rename_func("MAX_BY"),
exp.ArgMin: rename_func("MIN_BY"),

View file

@@ -223,7 +223,7 @@ class SQLite(Dialect):
exp.select(exp.alias_("value", column_alias)).from_(expression).subquery()
)
else:
sql = super().generateseries_sql(expression)
sql = self.function_fallback_sql(expression)
return sql

View file

@@ -322,6 +322,15 @@ def _build_with_arg_as_text(
return _parse
def _build_json_query(args: t.List, dialect: Dialect) -> exp.JSONExtract:
if len(args) == 1:
# The default value for path is '$'. As a result, if you don't provide a
# value for path, JSON_QUERY returns the input expression.
args.append(exp.Literal.string("$"))
return parser.build_extract_json_with_path(exp.JSONExtract)(args, dialect)
def _json_extract_sql(
self: TSQL.Generator, expression: exp.JSONExtract | exp.JSONExtractScalar
) -> str:
@@ -510,7 +519,7 @@ class TSQL(Dialect):
"GETDATE": exp.CurrentTimestamp.from_arg_list,
"HASHBYTES": _build_hashbytes,
"ISNULL": exp.Coalesce.from_arg_list,
"JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
"JSON_QUERY": _build_json_query,
"JSON_VALUE": parser.build_extract_json_with_path(exp.JSONExtractScalar),
"LEN": _build_with_arg_as_text(exp.Length),
"LEFT": _build_with_arg_as_text(exp.Left),
@@ -790,6 +799,7 @@ class TSQL(Dialect):
PARSE_JSON_NAME = None
EXPRESSIONS_WITHOUT_NESTED_CTES = {
exp.Create,
exp.Delete,
exp.Insert,
exp.Intersect,
@@ -989,31 +999,32 @@ class TSQL(Dialect):
kind = expression.kind
exists = expression.args.pop("exists", None)
if kind == "VIEW":
expression.this.set("catalog", None)
sql = super().create_sql(expression)
like_property = expression.find(exp.LikeProperty)
if like_property:
ctas_expression = like_property.this
else:
ctas_expression = expression.expression
if kind == "VIEW":
expression.this.set("catalog", None)
with_ = expression.args.get("with")
if ctas_expression and with_:
# We've already preprocessed the Create expression to bubble up any nested CTEs,
# but CREATE VIEW actually requires the WITH clause to come after it so we need
# to amend the AST by moving the CTEs to the CREATE VIEW statement's query.
ctas_expression.set("with", with_.pop())
sql = super().create_sql(expression)
table = expression.find(exp.Table)
# Convert CTAS statement to SELECT .. INTO ..
if kind == "TABLE" and ctas_expression:
ctas_with = ctas_expression.args.get("with")
if ctas_with:
ctas_with = ctas_with.pop()
if isinstance(ctas_expression, exp.UNWRAPPED_QUERIES):
ctas_expression = ctas_expression.subquery()
select_into = exp.select("*").from_(exp.alias_(ctas_expression, "temp", table=True))
select_into.set("into", exp.Into(this=table))
select_into.set("with", ctas_with)
if like_property:
select_into.limit(0, copy=False)

View file

@@ -1439,6 +1439,11 @@ class Describe(Expression):
arg_types = {"this": True, "style": False, "kind": False, "expressions": False}
# https://duckdb.org/docs/guides/meta/summarize.html
class Summarize(Expression):
arg_types = {"this": True, "table": False}
class Kill(Expression):
arg_types = {"this": True, "kind": False}

View file

@@ -105,12 +105,6 @@ class Generator(metaclass=_Generator):
exp.InlineLengthColumnConstraint: lambda self, e: f"INLINE LENGTH {self.sql(e, 'this')}",
exp.InputModelProperty: lambda self, e: f"INPUT{self.sql(e, 'this')}",
exp.IntervalSpan: lambda self, e: f"{self.sql(e, 'this')} TO {self.sql(e, 'expression')}",
exp.JSONExtract: lambda self, e: self.func(
"JSON_EXTRACT", e.this, e.expression, *e.expressions
),
exp.JSONExtractScalar: lambda self, e: self.func(
"JSON_EXTRACT_SCALAR", e.this, e.expression, *e.expressions
),
exp.LanguageProperty: lambda self, e: self.naked_property(e),
exp.LocationProperty: lambda self, e: self.naked_property(e),
exp.LogProperty: lambda _, e: f"{'NO ' if e.args.get('no') else ''}LOG",
@@ -146,7 +140,6 @@ class Generator(metaclass=_Generator):
exp.TemporaryProperty: lambda *_: "TEMPORARY",
exp.TagColumnConstraint: lambda self, e: f"TAG ({self.expressions(e, flat=True)})",
exp.TitleColumnConstraint: lambda self, e: f"TITLE {self.sql(e, 'this')}",
exp.Timestamp: lambda self, e: self.func("TIMESTAMP", e.this, e.args.get("zone")),
exp.ToMap: lambda self, e: f"MAP {self.sql(e, 'this')}",
exp.ToTableProperty: lambda self, e: f"TO {self.sql(e.this)}",
exp.TransformModelProperty: lambda self, e: self.func("TRANSFORM", *e.expressions),
@@ -1846,7 +1839,7 @@ class Generator(metaclass=_Generator):
return f"{this} {kind} {expr}"
def tuple_sql(self, expression: exp.Tuple) -> str:
return f"({self.expressions(expression, flat=True)})"
return f"({self.expressions(expression, dynamic=True, new_line=True, skip_first=True, skip_last=True)})"
def update_sql(self, expression: exp.Update) -> str:
this = self.sql(expression, "this")
@@ -2994,9 +2987,6 @@ class Generator(metaclass=_Generator):
zone = self.sql(expression, "this")
return f"CURRENT_DATE({zone})" if zone else "CURRENT_DATE"
def currenttimestamp_sql(self, expression: exp.CurrentTimestamp) -> str:
return self.func("CURRENT_TIMESTAMP", expression.this)
def collate_sql(self, expression: exp.Collate) -> str:
if self.COLLATE_IS_FUNC:
return self.function_fallback_sql(expression)
@@ -3354,7 +3344,9 @@ class Generator(metaclass=_Generator):
return f"{self.normalize_func(name)}{prefix}{self.format_args(*args)}{suffix}"
def format_args(self, *args: t.Optional[str | exp.Expression]) -> str:
arg_sqls = tuple(self.sql(arg) for arg in args if arg is not None)
arg_sqls = tuple(
self.sql(arg) for arg in args if arg is not None and not isinstance(arg, bool)
)
if self.pretty and self.too_wide(arg_sqls):
return self.indent("\n" + ",\n".join(arg_sqls) + "\n", skip_first=True, skip_last=True)
return ", ".join(arg_sqls)
@@ -3397,12 +3389,8 @@ class Generator(metaclass=_Generator):
return sep.join(sql for sql in (self.sql(e) for e in expressions) if sql)
num_sqls = len(expressions)
# These are calculated once in case we have the leading_comma / pretty option set, correspondingly
if self.pretty and not self.leading_comma:
stripped_sep = sep.strip()
result_sqls = []
for i, e in enumerate(expressions):
sql = self.sql(e, comment=False)
if not sql:
@@ -3415,7 +3403,7 @@ class Generator(metaclass=_Generator):
result_sqls.append(f"{sep if i > 0 else ''}{prefix}{sql}{comments}")
else:
result_sqls.append(
f"{prefix}{sql}{stripped_sep if i + 1 < num_sqls else ''}{comments}"
f"{prefix}{sql}{(sep.rstrip() if comments else sep) if i + 1 < num_sqls else ''}{comments}"
)
else:
result_sqls.append(f"{prefix}{sql}{comments}{sep if i + 1 < num_sqls else ''}")
@@ -3424,7 +3412,7 @@ class Generator(metaclass=_Generator):
if new_line:
result_sqls.insert(0, "")
result_sqls.append("")
result_sql = "\n".join(result_sqls)
result_sql = "\n".join(s.rstrip() for s in result_sqls)
else:
result_sql = "".join(result_sqls)
return (
@@ -3761,10 +3749,6 @@ class Generator(metaclass=_Generator):
return self.function_fallback_sql(expression)
def generateseries_sql(self, expression: exp.GenerateSeries) -> str:
expression.set("is_end_exclusive", None)
return self.function_fallback_sql(expression)
def struct_sql(self, expression: exp.Struct) -> str:
expression.set(
"expressions",
@@ -4027,9 +4011,6 @@ class Generator(metaclass=_Generator):
return self.func(self.PARSE_JSON_NAME, expression.this, expression.expression)
def length_sql(self, expression: exp.Length) -> str:
return self.func("LENGTH", expression.this)
def rand_sql(self, expression: exp.Rand) -> str:
lower = self.sql(expression, "lower")
upper = self.sql(expression, "upper")
@@ -4038,17 +4019,6 @@ class Generator(metaclass=_Generator):
return f"({upper} - {lower}) * {self.func('RAND', expression.this)} + {lower}"
return self.func("RAND", expression.this)
def strtodate_sql(self, expression: exp.StrToDate) -> str:
return self.func("STR_TO_DATE", expression.this, expression.args.get("format"))
def strtotime_sql(self, expression: exp.StrToTime) -> str:
return self.func(
"STR_TO_TIME",
expression.this,
expression.args.get("format"),
expression.args.get("zone"),
)
def changes_sql(self, expression: exp.Changes) -> str:
information = self.sql(expression, "information")
information = f"INFORMATION => {information}"
@@ -4067,3 +4037,7 @@ class Generator(metaclass=_Generator):
fill_pattern = "' '"
return self.func(f"{prefix}PAD", expression.this, expression.expression, fill_pattern)
def summarize_sql(self, expression: exp.Summarize) -> str:
table = " TABLE" if expression.args.get("table") else ""
return f"SUMMARIZE{table} {self.sql(expression.this)}"

View file

@@ -179,8 +179,9 @@ def apply_index_offset(
if not expression.type:
annotate_types(expression)
if t.cast(exp.DataType, expression.type).this in exp.DataType.INTEGER_TYPES:
logger.warning("Applying array index offset (%s)", offset)
logger.info("Applying array index offset (%s)", offset)
expression = simplify(expression + offset)
return [expression]

View file

@@ -393,6 +393,7 @@ class Parser(metaclass=_Parser):
TokenType.COMMIT,
TokenType.CONSTRAINT,
TokenType.COPY,
TokenType.CUBE,
TokenType.DEFAULT,
TokenType.DELETE,
TokenType.DESC,
@@ -673,7 +674,7 @@ class Parser(metaclass=_Parser):
exp.Cluster: lambda self: self._parse_sort(exp.Cluster, TokenType.CLUSTER_BY),
exp.Column: lambda self: self._parse_column(),
exp.Condition: lambda self: self._parse_assignment(),
exp.DataType: lambda self: self._parse_types(allow_identifiers=False),
exp.DataType: lambda self: self._parse_types(allow_identifiers=False, schema=True),
exp.Expression: lambda self: self._parse_expression(),
exp.From: lambda self: self._parse_from(joins=True),
exp.Group: lambda self: self._parse_group(),
@@ -2825,12 +2826,14 @@ class Parser(metaclass=_Parser):
this = self._parse_derived_table_values()
elif from_:
this = exp.select("*").from_(from_.this, copy=False)
elif self._match(TokenType.SUMMARIZE):
table = self._match(TokenType.TABLE)
this = self._parse_select() or self._parse_string() or self._parse_table()
return self.expression(exp.Summarize, this=this, table=table)
else:
this = None
if parse_set_operation:
return self._parse_set_operations(this)
return this
return self._parse_set_operations(this) if parse_set_operation else this
def _parse_with(self, skip_with_token: bool = False) -> t.Optional[exp.With]:
if not skip_with_token and not self._match(TokenType.WITH):
@@ -3825,7 +3828,7 @@ class Parser(metaclass=_Parser):
while True:
expressions = self._parse_csv(
lambda: None
if self._match(TokenType.ROLLUP, advance=False)
if self._match_set((TokenType.CUBE, TokenType.ROLLUP), advance=False)
else self._parse_assignment()
)
if expressions:
@@ -4613,7 +4616,11 @@ class Parser(metaclass=_Parser):
matched_array = False
values = self._parse_csv(self._parse_assignment) or None
if values and not schema:
if (
values
and not schema
and this.is_type(exp.DataType.Type.ARRAY, exp.DataType.Type.MAP)
):
self._retreat(index)
break

View file

@@ -364,6 +364,7 @@ class TokenType(AutoName):
STORAGE_INTEGRATION = auto()
STRAIGHT_JOIN = auto()
STRUCT = auto()
SUMMARIZE = auto()
TABLE_SAMPLE = auto()
TAG = auto()
TEMPORARY = auto()