Merging upstream version 16.2.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-13 16:00:51 +01:00
parent c12f551e31
commit 718a80b164
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
106 changed files with 41940 additions and 40162 deletions

sqlglot/dialects/dialect.py

@@ -25,6 +25,8 @@ class Dialects(str, Enum):
     BIGQUERY = "bigquery"
     CLICKHOUSE = "clickhouse"
+    DATABRICKS = "databricks"
+    DRILL = "drill"
     DUCKDB = "duckdb"
     HIVE = "hive"
     MYSQL = "mysql"
@@ -38,11 +40,9 @@ class Dialects(str, Enum):
     SQLITE = "sqlite"
     STARROCKS = "starrocks"
     TABLEAU = "tableau"
+    TERADATA = "teradata"
     TRINO = "trino"
     TSQL = "tsql"
-    DATABRICKS = "databricks"
-    DRILL = "drill"
-    TERADATA = "teradata"
 
 
 class _Dialect(type):
@@ -76,16 +76,19 @@ class _Dialect(type):
         enum = Dialects.__members__.get(clsname.upper())
         cls.classes[enum.value if enum is not None else clsname.lower()] = klass
 
-        klass.time_trie = new_trie(klass.time_mapping)
-        klass.inverse_time_mapping = {v: k for k, v in klass.time_mapping.items()}
-        klass.inverse_time_trie = new_trie(klass.inverse_time_mapping)
+        klass.TIME_TRIE = new_trie(klass.TIME_MAPPING)
+        klass.FORMAT_TRIE = (
+            new_trie(klass.FORMAT_MAPPING) if klass.FORMAT_MAPPING else klass.TIME_TRIE
+        )
+        klass.INVERSE_TIME_MAPPING = {v: k for k, v in klass.TIME_MAPPING.items()}
+        klass.INVERSE_TIME_TRIE = new_trie(klass.INVERSE_TIME_MAPPING)
 
         klass.tokenizer_class = getattr(klass, "Tokenizer", Tokenizer)
         klass.parser_class = getattr(klass, "Parser", Parser)
         klass.generator_class = getattr(klass, "Generator", Generator)
 
-        klass.quote_start, klass.quote_end = list(klass.tokenizer_class._QUOTES.items())[0]
-        klass.identifier_start, klass.identifier_end = list(
+        klass.QUOTE_START, klass.QUOTE_END = list(klass.tokenizer_class._QUOTES.items())[0]
+        klass.IDENTIFIER_START, klass.IDENTIFIER_END = list(
             klass.tokenizer_class._IDENTIFIERS.items()
         )[0]
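
Note: the metaclass derives the canonical quote and identifier delimiters by taking the first entry of the tokenizer's maps. A standalone sketch of that autofill step (the _QUOTES value here is illustrative, not sqlglot's):

    _QUOTES = {"'": "'", '"': '"'}
    QUOTE_START, QUOTE_END = list(_QUOTES.items())[0]
    assert (QUOTE_START, QUOTE_END) == ("'", "'")
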
@@ -99,43 +102,80 @@ class _Dialect(type):
                 (None, None),
             )
 
-        klass.bit_start, klass.bit_end = get_start_end(TokenType.BIT_STRING)
-        klass.hex_start, klass.hex_end = get_start_end(TokenType.HEX_STRING)
-        klass.byte_start, klass.byte_end = get_start_end(TokenType.BYTE_STRING)
-        klass.raw_start, klass.raw_end = get_start_end(TokenType.RAW_STRING)
+        klass.BIT_START, klass.BIT_END = get_start_end(TokenType.BIT_STRING)
+        klass.HEX_START, klass.HEX_END = get_start_end(TokenType.HEX_STRING)
+        klass.BYTE_START, klass.BYTE_END = get_start_end(TokenType.BYTE_STRING)
+        klass.RAW_START, klass.RAW_END = get_start_end(TokenType.RAW_STRING)
 
-        klass.tokenizer_class.identifiers_can_start_with_digit = (
-            klass.identifiers_can_start_with_digit
-        )
+        dialect_properties = {
+            **{
+                k: v
+                for k, v in vars(klass).items()
+                if not callable(v) and not isinstance(v, classmethod) and not k.startswith("__")
+            },
+            "STRING_ESCAPE": klass.tokenizer_class.STRING_ESCAPES[0],
+            "IDENTIFIER_ESCAPE": klass.tokenizer_class.IDENTIFIER_ESCAPES[0],
+        }
+
+        # Pass required dialect properties to the tokenizer, parser and generator classes
+        for subclass in (klass.tokenizer_class, klass.parser_class, klass.generator_class):
+            for name, value in dialect_properties.items():
+                if hasattr(subclass, name):
+                    setattr(subclass, name, value)
+
+        if not klass.STRICT_STRING_CONCAT:
+            klass.parser_class.BITWISE[TokenType.DPIPE] = exp.SafeDPipe
 
         return klass
 
 
 class Dialect(metaclass=_Dialect):
-    index_offset = 0
-    unnest_column_only = False
-    alias_post_tablesample = False
-    identifiers_can_start_with_digit = False
-    normalize_functions: t.Optional[str] = "upper"
-    null_ordering = "nulls_are_small"
+    # Determines the base index offset for arrays
+    INDEX_OFFSET = 0
 
-    date_format = "'%Y-%m-%d'"
-    dateint_format = "'%Y%m%d'"
-    time_format = "'%Y-%m-%d %H:%M:%S'"
-    time_mapping: t.Dict[str, str] = {}
+    # If true unnest table aliases are considered only as column aliases
+    UNNEST_COLUMN_ONLY = False
 
-    # autofilled
-    quote_start = None
-    quote_end = None
-    identifier_start = None
-    identifier_end = None
+    # Determines whether or not the table alias comes after tablesample
+    ALIAS_POST_TABLESAMPLE = False
 
-    time_trie = None
-    inverse_time_mapping = None
-    inverse_time_trie = None
-    tokenizer_class = None
-    parser_class = None
-    generator_class = None
+    # Determines whether or not an unquoted identifier can start with a digit
+    IDENTIFIERS_CAN_START_WITH_DIGIT = False
+
+    # Determines whether or not CONCAT's arguments must be strings
+    STRICT_STRING_CONCAT = False
+
+    # Determines how function names are going to be normalized
+    NORMALIZE_FUNCTIONS: bool | str = "upper"
+
+    # Indicates the default null ordering method to use if not explicitly set
+    # Options are: "nulls_are_small", "nulls_are_large", "nulls_are_last"
+    NULL_ORDERING = "nulls_are_small"
+
+    DATE_FORMAT = "'%Y-%m-%d'"
+    DATEINT_FORMAT = "'%Y%m%d'"
+    TIME_FORMAT = "'%Y-%m-%d %H:%M:%S'"
+
+    # Custom time mappings in which the key represents dialect time format
+    # and the value represents a python time format
+    TIME_MAPPING: t.Dict[str, str] = {}
+
+    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_model_rules_date_time
+    # https://docs.teradata.com/r/Teradata-Database-SQL-Functions-Operators-Expressions-and-Predicates/March-2017/Data-Type-Conversions/Character-to-DATE-Conversion/Forcing-a-FORMAT-on-CAST-for-Converting-Character-to-DATE
+    # special syntax cast(x as date format 'yyyy') defaults to time_mapping
+    FORMAT_MAPPING: t.Dict[str, str] = {}
+
+    # Autofilled
+    tokenizer_class = Tokenizer
+    parser_class = Parser
+    generator_class = Generator
+
+    # A trie of the time_mapping keys
+    TIME_TRIE: t.Dict = {}
+    FORMAT_TRIE: t.Dict = {}
+
+    INVERSE_TIME_MAPPING: t.Dict[str, str] = {}
+    INVERSE_TIME_TRIE: t.Dict = {}
 
     def __eq__(self, other: t.Any) -> bool:
         return type(self) == other
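
Note: two things change in this hunk. First, the ad-hoc copy of identifiers_can_start_with_digit is generalized: every non-callable, non-dunder class attribute of the dialect is pushed onto the tokenizer, parser, and generator, but only where the target class already declares an attribute of that name. A standalone sketch of the mechanism (stand-in classes, not sqlglot's):

    class _Gen:
        NULL_ORDERING = "nulls_are_small"  # declared, so it gets overwritten

    class _MyDialect:
        NULL_ORDERING = "nulls_are_large"
        UNRELATED = True  # not declared on _Gen, so it is skipped

    props = {
        k: v
        for k, v in vars(_MyDialect).items()
        if not callable(v) and not isinstance(v, classmethod) and not k.startswith("__")
    }
    for name, value in props.items():
        if hasattr(_Gen, name):
            setattr(_Gen, name, value)

    assert _Gen.NULL_ORDERING == "nulls_are_large" and not hasattr(_Gen, "UNRELATED")

Second, the dialect settings become UPPERCASE class attributes, so a dialect customizes behavior by overriding them directly; the tries and inverse mappings are still autofilled by the metaclass. A minimal sketch (MyDialect is hypothetical):

    from sqlglot.dialects.dialect import Dialect

    class MyDialect(Dialect):
        NULL_ORDERING = "nulls_are_large"
        TIME_MAPPING = {"yyyy": "%Y", "MM": "%m", "dd": "%d"}

    assert MyDialect.TIME_TRIE  # built by _Dialect.__new__
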
@@ -164,20 +204,13 @@ class Dialect(metaclass=_Dialect):
     ) -> t.Optional[exp.Expression]:
         if isinstance(expression, str):
             return exp.Literal.string(
-                format_time(
-                    expression[1:-1],  # the time formats are quoted
-                    cls.time_mapping,
-                    cls.time_trie,
-                )
+                # the time formats are quoted
+                format_time(expression[1:-1], cls.TIME_MAPPING, cls.TIME_TRIE)
             )
+
         if expression and expression.is_string:
-            return exp.Literal.string(
-                format_time(
-                    expression.this,
-                    cls.time_mapping,
-                    cls.time_trie,
-                )
-            )
+            return exp.Literal.string(format_time(expression.this, cls.TIME_MAPPING, cls.TIME_TRIE))
+
         return expression
 
     def parse(self, sql: str, **opts) -> t.List[t.Optional[exp.Expression]]:
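
Note: Dialect.format_time turns a (quoted) dialect-specific format string into a Python strftime literal via sqlglot.time.format_time. The conversion in isolation, with an illustrative mapping:

    from sqlglot.time import format_time

    print(format_time("yyyy-MM-dd", {"yyyy": "%Y", "MM": "%m", "dd": "%d"}))  # %Y-%m-%d
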
@@ -200,48 +233,14 @@ class Dialect(metaclass=_Dialect):
     @property
     def tokenizer(self) -> Tokenizer:
         if not hasattr(self, "_tokenizer"):
-            self._tokenizer = self.tokenizer_class()  # type: ignore
+            self._tokenizer = self.tokenizer_class()
 
         return self._tokenizer
 
     def parser(self, **opts) -> Parser:
-        return self.parser_class(  # type: ignore
-            **{
-                "index_offset": self.index_offset,
-                "unnest_column_only": self.unnest_column_only,
-                "alias_post_tablesample": self.alias_post_tablesample,
-                "null_ordering": self.null_ordering,
-                **opts,
-            },
-        )
+        return self.parser_class(**opts)
 
     def generator(self, **opts) -> Generator:
-        return self.generator_class(  # type: ignore
-            **{
-                "quote_start": self.quote_start,
-                "quote_end": self.quote_end,
-                "bit_start": self.bit_start,
-                "bit_end": self.bit_end,
-                "hex_start": self.hex_start,
-                "hex_end": self.hex_end,
-                "byte_start": self.byte_start,
-                "byte_end": self.byte_end,
-                "raw_start": self.raw_start,
-                "raw_end": self.raw_end,
-                "identifier_start": self.identifier_start,
-                "identifier_end": self.identifier_end,
-                "string_escape": self.tokenizer_class.STRING_ESCAPES[0],
-                "identifier_escape": self.tokenizer_class.IDENTIFIER_ESCAPES[0],
-                "index_offset": self.index_offset,
-                "time_mapping": self.inverse_time_mapping,
-                "time_trie": self.inverse_time_trie,
-                "unnest_column_only": self.unnest_column_only,
-                "alias_post_tablesample": self.alias_post_tablesample,
-                "identifiers_can_start_with_digit": self.identifiers_can_start_with_digit,
-                "normalize_functions": self.normalize_functions,
-                "null_ordering": self.null_ordering,
-                **opts,
-            }
-        )
+        return self.generator_class(**opts)
 
 
 DialectType = t.Union[str, Dialect, t.Type[Dialect], None]
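
Note: because every setting now lives on the tokenizer/parser/generator classes themselves (propagated by the metaclass at class-creation time), parser() and generator() no longer have to thread a dozen constructor options through. Round-tripping is unchanged, e.g.:

    from sqlglot.dialects.dialect import Dialect

    duckdb = Dialect.get_or_raise("duckdb")()
    ast = duckdb.parse("SELECT 1 AS x")[0]
    print(duckdb.generate(ast))  # SELECT 1 AS x
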
@@ -279,10 +278,7 @@ def inline_array_sql(self: Generator, expression: exp.Array) -> str:
 def no_ilike_sql(self: Generator, expression: exp.ILike) -> str:
     return self.like_sql(
-        exp.Like(
-            this=exp.Lower(this=expression.this),
-            expression=expression.args["expression"],
-        )
+        exp.Like(this=exp.Lower(this=expression.this), expression=expression.expression)
     )
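
Note: the helper still compiles ILIKE down to a case-insensitive LIKE by lowering only the left-hand side; the consolidated exp.Like construction is behavior-preserving. Roughly, for a dialect that routes exp.ILike through it (hive assumed here):

    import sqlglot

    print(sqlglot.transpile("x ILIKE '%A%'", read="postgres", write="hive")[0])
    # LOWER(x) LIKE '%A%'
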
@@ -359,6 +355,7 @@ def var_map_sql(
     for key, value in zip(keys.expressions, values.expressions):
         args.append(self.sql(key))
         args.append(self.sql(value))
+
     return self.func(map_func_name, *args)
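
Note: var_map_sql interleaves the key and value arrays into a single variadic call, e.g. MAP(ARRAY('a', 'b'), ARRAY(1, 2)) -> MAP('a', 1, 'b', 2). The zip step in isolation:

    keys, values = ["'a'", "'b'"], ["1", "2"]
    args = []
    for key, value in zip(keys, values):
        args.append(key)
        args.append(value)
    print(f"MAP({', '.join(args)})")  # MAP('a', 1, 'b', 2)
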
@@ -381,7 +378,7 @@ def format_time_lambda(
             this=seq_get(args, 0),
             format=Dialect[dialect].format_time(
                 seq_get(args, 1)
-                or (Dialect[dialect].time_format if default is True else default or None)
+                or (Dialect[dialect].TIME_FORMAT if default is True else default or None)
             ),
         )
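
Note: with default=True the parsed format falls back to the dialect's TIME_FORMAT when the SQL omits the format argument. A sketch of how dialect modules use the factory (hive-style registration, assumed):

    from sqlglot import exp
    from sqlglot.dialects.dialect import format_time_lambda

    parse_str_to_unix = format_time_lambda(exp.StrToUnix, "hive", True)
    node = parse_str_to_unix([])  # no args: format defaults to hive's TIME_FORMAT
    print(node.args["format"])    # '%Y-%m-%d %H:%M:%S'
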
@@ -437,9 +434,7 @@ def parse_date_delta_with_interval(
             expression = exp.Literal.number(expression.this)
 
         return expression_class(
-            this=args[0],
-            expression=expression,
-            unit=exp.Literal.string(interval.text("unit")),
+            this=args[0], expression=expression, unit=exp.Literal.string(interval.text("unit"))
         )
 
     return func
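
Note: this is the parser behind INTERVAL-style date arithmetic such as MySQL's DATE_ADD; the reflowed constructor call above is purely cosmetic. Assuming MySQL routes DATE_ADD through it:

    import sqlglot

    ast = sqlglot.parse_one("DATE_ADD(d, INTERVAL 1 DAY)", read="mysql")
    print(type(ast).__name__, ast.args["unit"])  # DateAdd 'DAY'
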
@@ -462,9 +457,7 @@ def timestamptrunc_sql(self: Generator, expression: exp.TimestampTrunc) -> str:
 def locate_to_strposition(args: t.List) -> exp.Expression:
     return exp.StrPosition(
-        this=seq_get(args, 1),
-        substr=seq_get(args, 0),
-        position=seq_get(args, 2),
+        this=seq_get(args, 1), substr=seq_get(args, 0), position=seq_get(args, 2)
     )
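
Note: LOCATE's SQL signature is (substr, haystack[, position]) while exp.StrPosition keeps the haystack in `this`, hence the swapped seq_get indexes. Assuming MySQL maps LOCATE through this helper:

    import sqlglot

    ast = sqlglot.parse_one("LOCATE('a', col, 2)", read="mysql")
    print(ast.this, ast.args["substr"])  # col 'a'
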
@@ -546,13 +539,21 @@ def ts_or_ds_to_date_sql(dialect: str) -> t.Callable:
     def _ts_or_ds_to_date_sql(self: Generator, expression: exp.TsOrDsToDate) -> str:
         _dialect = Dialect.get_or_raise(dialect)
         time_format = self.format_time(expression)
-        if time_format and time_format not in (_dialect.time_format, _dialect.date_format):
+        if time_format and time_format not in (_dialect.TIME_FORMAT, _dialect.DATE_FORMAT):
             return f"CAST({str_to_time_sql(self, expression)} AS DATE)"
         return f"CAST({self.sql(expression, 'this')} AS DATE)"
 
     return _ts_or_ds_to_date_sql
 
 
+def concat_to_dpipe_sql(self: Generator, expression: exp.Concat | exp.SafeConcat) -> str:
+    this, *rest_args = expression.expressions
+    for arg in rest_args:
+        this = exp.DPipe(this=this, expression=arg)
+
+    return self.sql(this)
+
+
 # Spark, DuckDB use (almost) the same naming scheme for the output columns of the PIVOT operator
 def pivot_column_names(aggregations: t.List[exp.Expression], dialect: DialectType) -> t.List[str]:
     names = []
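
Note: the new concat_to_dpipe_sql left-folds CONCAT's argument list into nested exp.DPipe nodes, so CONCAT(a, b, c) renders as a || b || c in dialects that register this transform (sqlite assumed here):

    import sqlglot

    print(sqlglot.transpile("SELECT CONCAT(a, b, c)", write="sqlite")[0])
    # SELECT a || b || c
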