
Adding upstream version 15.2.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann, 2025-02-13 15:58:03 +01:00
parent bb75596aa9
commit 577b79f5a7
Signed by: daniel (GPG key ID: FBB4F0E80A80222F)
81 changed files with 40321 additions and 37940 deletions

sqlglot/dialects/hive.py

@@ -9,6 +9,7 @@ from sqlglot.dialects.dialect import (
     create_with_partitions_sql,
     format_time_lambda,
     if_sql,
+    left_to_substring_sql,
     locate_to_strposition,
     max_or_greatest,
     min_or_least,
@@ -17,6 +18,7 @@ from sqlglot.dialects.dialect import (
     no_safe_divide_sql,
     no_trycast_sql,
     rename_func,
+    right_to_substring_sql,
     strposition_to_locate_sql,
     struct_extract_sql,
     timestrtotime_sql,
@@ -89,7 +91,7 @@ def _json_format_sql(self: generator.Generator, expression: exp.JSONFormat) -> str
         annotate_types(this)
 
-    if this.type.is_type(exp.DataType.Type.JSON):
+    if this.type.is_type("json"):
         return self.sql(this)
 
     return self.func("TO_JSON", this, expression.args.get("options"))
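Per the hunk above, an argument that already carries a JSON type is emitted as-is, while anything else is wrapped in TO_JSON. A minimal sketch of that behavior, assuming a sqlglot build from around this version and using presto as the source dialect (exact output strings may vary):

    import sqlglot

    # A value already typed as JSON should pass through unchanged.
    print(sqlglot.transpile("SELECT JSON_FORMAT(JSON '[1]')", read="presto", write="hive")[0])

    # An untyped value is wrapped in TO_JSON.
    print(sqlglot.transpile("SELECT JSON_FORMAT(x)", read="presto", write="hive")[0])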
@@ -149,6 +151,7 @@ def _to_date_sql(self: generator.Generator, expression: exp.TsOrDsToDate) -> str
 class Hive(Dialect):
     alias_post_tablesample = True
+    identifiers_can_start_with_digit = True
 
     time_mapping = {
         "y": "%Y",
@@ -190,7 +193,6 @@ class Hive(Dialect):
         IDENTIFIERS = ["`"]
         STRING_ESCAPES = ["\\"]
         ENCODE = "utf-8"
-        IDENTIFIER_CAN_START_WITH_DIGIT = True
 
         KEYWORDS = {
             **tokens.Tokenizer.KEYWORDS,
@@ -276,6 +278,39 @@ class Hive(Dialect):
             "cluster": lambda self: self._parse_sort(exp.Cluster, "CLUSTER", "BY"),
         }
 
+        def _parse_types(
+            self, check_func: bool = False, schema: bool = False
+        ) -> t.Optional[exp.Expression]:
+            """
+            Spark (and most likely Hive) treats casts to CHAR(length) and VARCHAR(length) as casts to
+            STRING in all contexts except for schema definitions. For example, this is in Spark v3.4.0:
+
+                spark-sql (default)> select cast(1234 as varchar(2));
+                23/06/06 15:51:18 WARN CharVarcharUtils: The Spark cast operator does not support
+                char/varchar type and simply treats them as string type. Please use string type
+                directly to avoid confusion. Otherwise, you can set spark.sql.legacy.charVarcharAsString
+                to true, so that Spark treat them as string type as same as Spark 3.0 and earlier
+
+                1234
+                Time taken: 4.265 seconds, Fetched 1 row(s)
+
+            This shows that Spark doesn't truncate the value into '12', which is inconsistent with
+            what other dialects (e.g. postgres) do, so we need to drop the length to transpile correctly.
+
+            Reference: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
+            """
+            this = super()._parse_types(check_func=check_func, schema=schema)
+
+            if this and not schema:
+                return this.transform(
+                    lambda node: node.replace(exp.DataType.build("text"))
+                    if isinstance(node, exp.DataType) and node.is_type("char", "varchar")
+                    else node,
+                    copy=False,
+                )
+
+            return this
+
     class Generator(generator.Generator):
         LIMIT_FETCH = "LIMIT"
         TABLESAMPLE_WITH_METHOD = False
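A quick way to exercise the _parse_types override added above, assuming sqlglot at roughly this version is installed (Spark inherits from the Hive dialect here, so reading as spark goes through this parser); the expected-output comments only restate what the docstring claims, not verified strings:

    import sqlglot

    # Outside a schema, the length is dropped and the cast target becomes STRING.
    print(sqlglot.transpile("SELECT CAST(1234 AS VARCHAR(2))", read="spark", write="spark")[0])
    # e.g. SELECT CAST(1234 AS STRING)

    # Inside a schema definition, CHAR/VARCHAR keep their length.
    print(sqlglot.transpile("CREATE TABLE t (col VARCHAR(2))", read="spark", write="spark")[0])
    # e.g. CREATE TABLE t (col VARCHAR(2))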
@@ -323,6 +358,7 @@ class Hive(Dialect):
             exp.JSONExtract: rename_func("GET_JSON_OBJECT"),
             exp.JSONExtractScalar: rename_func("GET_JSON_OBJECT"),
             exp.JSONFormat: _json_format_sql,
+            exp.Left: left_to_substring_sql,
             exp.Map: var_map_sql,
             exp.Max: max_or_greatest,
             exp.Min: min_or_least,
@@ -332,6 +368,7 @@ class Hive(Dialect):
             exp.ApproxQuantile: rename_func("PERCENTILE_APPROX"),
             exp.RegexpLike: lambda self, e: self.binary(e, "RLIKE"),
             exp.RegexpSplit: rename_func("SPLIT"),
+            exp.Right: right_to_substring_sql,
             exp.SafeDivide: no_safe_divide_sql,
             exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
             exp.SetAgg: rename_func("COLLECT_SET"),
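For reference, the two helpers wired up in these hunks (left_to_substring_sql and right_to_substring_sql, imported at the top of this diff) rewrite LEFT and RIGHT, which Hive lacks, into SUBSTRING calls. A hedged sketch of the intended transpilation; the exact SUBSTRING arithmetic in the output may differ between versions:

    import sqlglot

    # LEFT(x, 2) maps onto a SUBSTRING starting at position 1.
    print(sqlglot.transpile("SELECT LEFT(x, 2)", write="hive")[0])

    # RIGHT(x, 2) needs its start position computed from LENGTH(x).
    print(sqlglot.transpile("SELECT RIGHT(x, 2)", write="hive")[0])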