
Adding upstream version 15.2.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann, 2025-02-13 15:58:03 +01:00
parent bb75596aa9
commit 577b79f5a7
Signed by: daniel (GPG key ID: FBB4F0E80A80222F)
81 changed files with 40321 additions and 37940 deletions

sqlglot/dialects/hive.py

@@ -9,6 +9,7 @@ from sqlglot.dialects.dialect import (
     create_with_partitions_sql,
     format_time_lambda,
     if_sql,
+    left_to_substring_sql,
     locate_to_strposition,
     max_or_greatest,
     min_or_least,
@@ -17,6 +18,7 @@ from sqlglot.dialects.dialect import (
     no_safe_divide_sql,
     no_trycast_sql,
     rename_func,
+    right_to_substring_sql,
     strposition_to_locate_sql,
     struct_extract_sql,
     timestrtotime_sql,
@@ -89,7 +91,7 @@ def _json_format_sql(self: generator.Generator, expression: exp.JSONFormat) -> str
         annotate_types(this)
 
-    if this.type.is_type(exp.DataType.Type.JSON):
+    if this.type.is_type("json"):
         return self.sql(this)
 
     return self.func("TO_JSON", this, expression.args.get("options"))
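Per the hunk above, an argument that already carries a JSON type is emitted as-is, while anything else is wrapped in TO_JSON. A minimal sketch of that behavior, assuming a sqlglot build from around this version and using presto as the source dialect (exact output strings may vary):

    import sqlglot

    # A value already typed as JSON should pass through unchanged.
    print(sqlglot.transpile("SELECT JSON_FORMAT(JSON '[1]')", read="presto", write="hive")[0])

    # An untyped value is wrapped in TO_JSON.
    print(sqlglot.transpile("SELECT JSON_FORMAT(x)", read="presto", write="hive")[0])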
@@ -149,6 +151,7 @@ def _to_date_sql(self: generator.Generator, expression: exp.TsOrDsToDate) -> str
 class Hive(Dialect):
     alias_post_tablesample = True
+    identifiers_can_start_with_digit = True
 
     time_mapping = {
         "y": "%Y",
@@ -190,7 +193,6 @@ class Hive(Dialect):
         IDENTIFIERS = ["`"]
         STRING_ESCAPES = ["\\"]
         ENCODE = "utf-8"
-        IDENTIFIER_CAN_START_WITH_DIGIT = True
 
         KEYWORDS = {
             **tokens.Tokenizer.KEYWORDS,
@@ -276,6 +278,39 @@ class Hive(Dialect):
             "cluster": lambda self: self._parse_sort(exp.Cluster, "CLUSTER", "BY"),
         }
 
+        def _parse_types(
+            self, check_func: bool = False, schema: bool = False
+        ) -> t.Optional[exp.Expression]:
+            """
+            Spark (and most likely Hive) treats casts to CHAR(length) and VARCHAR(length) as casts to
+            STRING in all contexts except for schema definitions. For example, this is in Spark v3.4.0:
+
+                spark-sql (default)> select cast(1234 as varchar(2));
+                23/06/06 15:51:18 WARN CharVarcharUtils: The Spark cast operator does not support
+                char/varchar type and simply treats them as string type. Please use string type
+                directly to avoid confusion. Otherwise, you can set spark.sql.legacy.charVarcharAsString
+                to true, so that Spark treat them as string type as same as Spark 3.0 and earlier
+
+                1234
+                Time taken: 4.265 seconds, Fetched 1 row(s)
+
+            This shows that Spark doesn't truncate the value into '12', which is inconsistent with
+            what other dialects (e.g. postgres) do, so we need to drop the length to transpile correctly.
+
+            Reference: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
+            """
+            this = super()._parse_types(check_func=check_func, schema=schema)
+
+            if this and not schema:
+                return this.transform(
+                    lambda node: node.replace(exp.DataType.build("text"))
+                    if isinstance(node, exp.DataType) and node.is_type("char", "varchar")
+                    else node,
+                    copy=False,
+                )
+
+            return this
+
     class Generator(generator.Generator):
         LIMIT_FETCH = "LIMIT"
         TABLESAMPLE_WITH_METHOD = False
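A quick way to exercise the _parse_types override added above, assuming sqlglot at roughly this version is installed (Spark inherits from the Hive dialect here, so reading as spark goes through this parser); the expected-output comments only restate what the docstring claims, not verified strings:

    import sqlglot

    # Outside a schema, the length is dropped and the cast target becomes STRING.
    print(sqlglot.transpile("SELECT CAST(1234 AS VARCHAR(2))", read="spark", write="spark")[0])
    # e.g. SELECT CAST(1234 AS STRING)

    # Inside a schema definition, CHAR/VARCHAR keep their length.
    print(sqlglot.transpile("CREATE TABLE t (col VARCHAR(2))", read="spark", write="spark")[0])
    # e.g. CREATE TABLE t (col VARCHAR(2))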
@@ -323,6 +358,7 @@ class Hive(Dialect):
             exp.JSONExtract: rename_func("GET_JSON_OBJECT"),
             exp.JSONExtractScalar: rename_func("GET_JSON_OBJECT"),
             exp.JSONFormat: _json_format_sql,
+            exp.Left: left_to_substring_sql,
             exp.Map: var_map_sql,
             exp.Max: max_or_greatest,
             exp.Min: min_or_least,
@@ -332,6 +368,7 @@ class Hive(Dialect):
             exp.ApproxQuantile: rename_func("PERCENTILE_APPROX"),
             exp.RegexpLike: lambda self, e: self.binary(e, "RLIKE"),
             exp.RegexpSplit: rename_func("SPLIT"),
+            exp.Right: right_to_substring_sql,
             exp.SafeDivide: no_safe_divide_sql,
             exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
             exp.SetAgg: rename_func("COLLECT_SET"),
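For reference, the two helpers wired up in these hunks (left_to_substring_sql and right_to_substring_sql, imported at the top of this diff) rewrite LEFT and RIGHT, which Hive lacks, into SUBSTRING calls. A hedged sketch of the intended transpilation; the exact SUBSTRING arithmetic in the output may differ between versions:

    import sqlglot

    # LEFT(x, 2) maps onto a SUBSTRING starting at position 1.
    print(sqlglot.transpile("SELECT LEFT(x, 2)", write="hive")[0])

    # RIGHT(x, 2) needs its start position computed from LENGTH(x).
    print(sqlglot.transpile("SELECT RIGHT(x, 2)", write="hive")[0])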