from __future__ import annotations

import typing as t

from sqlglot import exp
from sqlglot.dialects.dialect import rename_func, unit_to_var
from sqlglot.dialects.hive import _build_with_ignore_nulls
from sqlglot.dialects.spark2 import Spark2, temporary_storage_provider, _build_as_cast
from sqlglot.helper import ensure_list, seq_get
from sqlglot.transforms import (
    ctas_with_tmp_tables_to_create_tmp_view,
    remove_unique_constraints,
    preprocess,
    move_partitioned_by_to_schema_columns,
)


def _build_datediff(args: t.List) -> exp.Expression:
    """
    Although Spark docs don't mention the "unit" argument, Spark3 added support for
    it at some point. Databricks also supports this variant (see below).

    For example, in spark-sql (v3.3.1):

    - SELECT DATEDIFF('2020-01-01', '2020-01-05') results in -4
    - SELECT DATEDIFF(day, '2020-01-01', '2020-01-05') results in 4

    See also:

    - https://docs.databricks.com/sql/language-manual/functions/datediff3.html
    - https://docs.databricks.com/sql/language-manual/functions/datediff.html
    """
    unit = None
    this = seq_get(args, 0)
    expression = seq_get(args, 1)

    if len(args) == 3:
        unit = this
        this = args[2]

    return exp.DateDiff(
        this=exp.TsOrDsToDate(this=this), expression=exp.TsOrDsToDate(this=expression), unit=unit
    )
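
# A usage sketch (assumes sqlglot's top-level parse_one helper): both call shapes
# normalize to the same exp.DateDiff node, with the end date stored in `this`:
#
#     from sqlglot import parse_one
#     parse_one("SELECT DATEDIFF('2020-01-05', '2020-01-01')", read="spark")
#     parse_one("SELECT DATEDIFF(day, '2020-01-01', '2020-01-05')", read="spark")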


def _build_dateadd(args: t.List) -> exp.Expression:
    expression = seq_get(args, 1)

    if len(args) == 2:
        # DATE_ADD(startDate, numDays INTEGER)
        # https://docs.databricks.com/en/sql/language-manual/functions/date_add.html
        return exp.TsOrDsAdd(
            this=seq_get(args, 0), expression=expression, unit=exp.Literal.string("DAY")
        )

    # DATE_ADD / DATEADD / TIMESTAMPADD(unit, value integer, expr)
    # https://docs.databricks.com/en/sql/language-manual/functions/date_add3.html
    return exp.TimestampAdd(this=seq_get(args, 2), expression=expression, unit=seq_get(args, 0))
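
# A usage sketch (assumes sqlglot's top-level parse_one helper): the arity selects
# the node type, so the unit defaults to DAY only in the 2-arg form:
#
#     from sqlglot import parse_one
#     parse_one("SELECT DATE_ADD('2020-01-01', 5)", read="spark")         # exp.TsOrDsAdd
#     parse_one("SELECT DATE_ADD(MONTH, 1, '2020-01-01')", read="spark")  # exp.TimestampAdd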


def _normalize_partition(e: exp.Expression) -> exp.Expression:
    """Normalize the expressions in PARTITION BY (<expression>, <expression>, ...)"""
    if isinstance(e, str):
        return exp.to_identifier(e)
    if isinstance(e, exp.Literal):
        return exp.to_identifier(e.name)
    return e
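
# A normalization sketch: a quoted partition column such as PARTITIONED BY ('ds')
# arrives here as an exp.Literal, which is rewritten to the identifier `ds` so it
# round-trips like the unquoted PARTITIONED BY (ds) form.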


def _dateadd_sql(self: Spark.Generator, expression: exp.TsOrDsAdd | exp.TimestampAdd) -> str:
    if not expression.unit or (
        isinstance(expression, exp.TsOrDsAdd) and expression.text("unit").upper() == "DAY"
    ):
        # Coming from Hive/Spark2 DATE_ADD or roundtripping the 2-arg version of Spark3/DB
        return self.func("DATE_ADD", expression.this, expression.expression)

    this = self.func(
        "DATE_ADD",
        unit_to_var(expression),
        expression.expression,
        expression.this,
    )

    if isinstance(expression, exp.TsOrDsAdd):
        # The 3 arg version of DATE_ADD produces a timestamp in Spark3/DB but possibly not
        # in other dialects
        return_type = expression.return_type
        if not return_type.is_type(exp.DataType.Type.TIMESTAMP, exp.DataType.Type.DATETIME):
            this = f"CAST({this} AS {return_type})"

    return this
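
# A generation sketch: exp.TsOrDsAdd with a DAY unit (or no unit) renders as the
# legacy 2-arg DATE_ADD(this, expression); any other unit renders as the 3-arg form,
# wrapped in a CAST back to the node's return type when that type is not a timestamp:
#
#     DATE_ADD(col, 5)                          <- DAY unit or no unit
#     CAST(DATE_ADD(MONTH, 1, col) AS DATE)     <- exp.TsOrDsAdd returning DATE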


class Spark(Spark2):
    class Tokenizer(Spark2.Tokenizer):
        RAW_STRINGS = [
            (prefix + q, q)
            for q in t.cast(t.List[str], Spark2.Tokenizer.QUOTES)
            for prefix in ("r", "R")
        ]
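
        # A tokenizer sketch: with these entries, r'a\b' and R"a\b" lex as raw
        # strings, so the backslash reaches the AST without escape processing.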

    class Parser(Spark2.Parser):
        FUNCTIONS = {
            **Spark2.Parser.FUNCTIONS,
            "ANY_VALUE": _build_with_ignore_nulls(exp.AnyValue),
            "DATE_ADD": _build_dateadd,
            "DATEADD": _build_dateadd,
            "TIMESTAMPADD": _build_dateadd,
            "DATEDIFF": _build_datediff,
            "TIMESTAMP_LTZ": _build_as_cast("TIMESTAMP_LTZ"),
            "TIMESTAMP_NTZ": _build_as_cast("TIMESTAMP_NTZ"),
            "TRY_ELEMENT_AT": lambda args: exp.Bracket(
                this=seq_get(args, 0), expressions=ensure_list(seq_get(args, 1)), safe=True
            ),
        }

        def _parse_generated_as_identity(
            self,
        ) -> (
            exp.GeneratedAsIdentityColumnConstraint
            | exp.ComputedColumnConstraint
            | exp.GeneratedAsRowColumnConstraint
        ):
            this = super()._parse_generated_as_identity()
            if this.expression:
                return self.expression(exp.ComputedColumnConstraint, this=this.expression)
            return this
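
        # A parsing sketch: `x INT GENERATED ALWAYS AS (y + 1)` carries an inner
        # expression, so it becomes exp.ComputedColumnConstraint; identity columns
        # (GENERATED ALWAYS AS IDENTITY) fall through to the parent constraint.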

    class Generator(Spark2.Generator):
        SUPPORTS_TO_NUMBER = True

        TYPE_MAPPING = {
            **Spark2.Generator.TYPE_MAPPING,
            exp.DataType.Type.MONEY: "DECIMAL(15, 4)",
            exp.DataType.Type.SMALLMONEY: "DECIMAL(6, 4)",
            exp.DataType.Type.UNIQUEIDENTIFIER: "STRING",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP_LTZ",
            exp.DataType.Type.TIMESTAMPNTZ: "TIMESTAMP_NTZ",
        }

        TRANSFORMS = {
            **Spark2.Generator.TRANSFORMS,
            exp.ArrayConstructCompact: lambda self, e: self.func(
                "ARRAY_COMPACT", self.func("ARRAY", *e.expressions)
            ),
            exp.Create: preprocess(
                [
                    remove_unique_constraints,
                    lambda e: ctas_with_tmp_tables_to_create_tmp_view(
                        e, temporary_storage_provider
                    ),
                    move_partitioned_by_to_schema_columns,
                ]
            ),
            exp.PartitionedByProperty: lambda self,
            e: f"PARTITIONED BY {self.wrap(self.expressions(sqls=[_normalize_partition(e) for e in e.this.expressions], skip_first=True))}",
            exp.StartsWith: rename_func("STARTSWITH"),
            exp.TsOrDsAdd: _dateadd_sql,
            exp.TimestampAdd: _dateadd_sql,
            exp.TryCast: lambda self, e: (
                self.trycast_sql(e) if e.args.get("safe") else self.cast_sql(e)
            ),
        }
        TRANSFORMS.pop(exp.AnyValue)
        TRANSFORMS.pop(exp.DateDiff)
        TRANSFORMS.pop(exp.Group)

        def bracket_sql(self, expression: exp.Bracket) -> str:
            if expression.args.get("safe"):
                key = seq_get(self.bracket_offset_expressions(expression), 0)
                return self.func("TRY_ELEMENT_AT", expression.this, key)

            return super().bracket_sql(expression)
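
        # A round-trip sketch: TRY_ELEMENT_AT(arr, 1) parses into a safe
        # exp.Bracket (see the Parser above) and regenerates here as
        # TRY_ELEMENT_AT, while a plain arr[1] defers to the parent rendering.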

        def computedcolumnconstraint_sql(self, expression: exp.ComputedColumnConstraint) -> str:
            return f"GENERATED ALWAYS AS ({self.sql(expression, 'this')})"

        def anyvalue_sql(self, expression: exp.AnyValue) -> str:
            return self.function_fallback_sql(expression)

        def datediff_sql(self, expression: exp.DateDiff) -> str:
            end = self.sql(expression, "this")
            start = self.sql(expression, "expression")

            if expression.unit:
                return self.func("DATEDIFF", unit_to_var(expression), start, end)

            return self.func("DATEDIFF", end, start)
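
        # A round-trip sketch (assumes sqlglot's top-level transpile helper): the
        # unit-ful form regenerates with the unit leading, e.g.
        #
        #     import sqlglot
        #     sqlglot.transpile("SELECT DATEDIFF(day, x, y)", read="spark", write="spark")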