Merging upstream version 25.5.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-13 21:41:14 +01:00 · 2025-02-13 21:41:14 +01:00 · 029b9c2c73
commit 029b9c2c73
parent 298e7a8147
136 changed files with 80990 additions and 72541 deletions
--- a/sqlglot/dialects/presto.py
+++ b/sqlglot/dialects/presto.py
@ -173,6 +173,35 @@ def _unix_to_time_sql(self: Presto.Generator, expression: exp.UnixToTime) -> str
    return f"FROM_UNIXTIME(CAST({timestamp} AS DOUBLE) / POW(10, {scale}))"


+def _jsonextract_sql(self: Presto.Generator, expression: exp.JSONExtract) -> str:
+    """Render an `exp.JSONExtract` as `JSON_EXTRACT(...)` or as dotted access.
+
+    The dialect setting `variant_extract_is_json_extract` (default `True`) controls
+    the output for nodes that carry the `variant_extract` arg: when the setting is
+    falsy, the JSON path keys are appended to the rendered `this` as `.key` segments
+    instead of emitting a `JSON_EXTRACT` call. Nodes without `variant_extract` always
+    produce `JSON_EXTRACT`.
+
+    Path segments that are not `exp.JSONPathKey` are reported via `self.unsupported`
+    and left out of the generated access chain.
+    """
+    is_json_extract = self.dialect.settings.get("variant_extract_is_json_extract", True)
+
+    # Generate JSON_EXTRACT unless the user has configured that a Snowflake / Databricks
+    # VARIANT extract (e.g. col:x.y) should map to dot notation (i.e. ROW access) in Presto/Trino
+    if not expression.args.get("variant_extract") or is_json_extract:
+        return self.func(
+            "JSON_EXTRACT", expression.this, expression.expression, *expression.expressions
+        )
+
+    this = self.sql(expression, "this")
+
+    # Convert the JSONPath extraction `JSON_EXTRACT(col, '$.x.y')` to a ROW access col.x.y
+    segments = []
+    # The first path element is skipped -- presumably the JSONPath root (`$`); confirm
+    # against the JSONPath node layout.
+    for path_key in expression.expression.expressions[1:]:
+        if not isinstance(path_key, exp.JSONPathKey):
+            # Cannot transpile subscripts, wildcards etc. to dot notation
+            self.unsupported(f"Cannot transpile JSONPath segment '{path_key}' to ROW access")
+            continue
+        key = path_key.this
+        # Keys that do not match SAFE_IDENTIFIER_RE are wrapped in double quotes.
+        # NOTE(review): a double quote embedded in the key is not escaped here.
+        if not exp.SAFE_IDENTIFIER_RE.match(key):
+            key = f'"{key}"'
+        segments.append(f".{key}")
+
+    expr = "".join(segments)
+
+    return f"{this}{expr}"
+
+
 def _to_int(expression: exp.Expression) -> exp.Expression:
    if not expression.type:
        from sqlglot.optimizer.annotate_types import annotate_types
@ -227,7 +256,7 @@ class Presto(Dialect):
            "TDIGEST": TokenType.TDIGEST,
            "HYPERLOGLOG": TokenType.HLLSKETCH,
        }
-
+        KEYWORDS.pop("/*+")
        KEYWORDS.pop("QUALIFY")

    class Parser(parser.Parser):
@ -305,6 +334,7 @@ class Presto(Dialect):
        MULTI_ARG_DISTINCT = False
        SUPPORTS_TO_NUMBER = False
        HEX_FUNC = "TO_HEX"
+        PARSE_JSON_NAME = "JSON_PARSE"

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
@ -389,7 +419,7 @@ class Presto(Dialect):
            exp.If: if_sql(),
            exp.ILike: no_ilike_sql,
            exp.Initcap: _initcap_sql,
-            exp.ParseJSON: rename_func("JSON_PARSE"),
+            exp.JSONExtract: _jsonextract_sql,
            exp.Last: _first_last_sql,
            exp.LastValue: _first_last_sql,
            exp.LastDay: lambda self, e: self.func("LAST_DAY_OF_MONTH", e.this),
@ -448,9 +478,6 @@ class Presto(Dialect):
                [transforms.remove_within_group_for_percentiles]
            ),
            exp.Xor: bool_xor_sql,
-            exp.MD5: lambda self, e: self.func(
-                "LOWER", self.func("TO_HEX", self.func("MD5", self.sql(e, "this")))
-            ),
            exp.MD5Digest: rename_func("MD5"),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
@ -517,6 +544,19 @@ class Presto(Dialect):
            "with",
        }

+        def md5_sql(self, expression: exp.MD5) -> str:
+            """Generate `LOWER(TO_HEX(MD5(<arg>)))` for an `exp.MD5` node.
+
+            If the argument has no type yet, it is run through `annotate_types` first.
+            An argument whose type is one of `exp.DataType.TEXT_TYPES` is wrapped in
+            `exp.Encode` with charset `'utf-8'` before being passed to `MD5`.
+            """
+            this = expression.this
+
+            if not this.type:
+                # Imported here rather than at module level -- presumably to avoid a
+                # circular import between the dialect and the optimizer; confirm.
+                from sqlglot.optimizer.annotate_types import annotate_types
+
+                this = annotate_types(this)
+
+            # Text input is encoded to bytes first -- presumably because Presto's MD5
+            # operates on binary input; verify against the Presto function reference.
+            if this.is_type(*exp.DataType.TEXT_TYPES):
+                this = exp.Encode(this=this, charset=exp.Literal.string("utf-8"))
+
+            return self.func("LOWER", self.func("TO_HEX", self.func("MD5", self.sql(this))))
+
        def strtounix_sql(self, expression: exp.StrToUnix) -> str:
            # Since `TO_UNIXTIME` requires a `TIMESTAMP`, we need to parse the argument into one.
            # To do this, we first try to `DATE_PARSE` it, but since this can fail when there's a