2025-02-13 14:53:05 +01:00
|
|
|
from __future__ import annotations
|
|
|
|
|
2025-02-13 15:46:19 +01:00
|
|
|
import typing as t
|
|
|
|
|
2025-02-13 14:53:05 +01:00
|
|
|
from sqlglot import exp, generator, parser, tokens, transforms
|
2025-02-13 06:15:54 +01:00
|
|
|
from sqlglot.dialects.dialect import (
|
|
|
|
Dialect,
|
|
|
|
approx_count_distinct_sql,
|
2025-02-13 14:40:43 +01:00
|
|
|
create_with_partitions_sql,
|
2025-02-13 06:15:54 +01:00
|
|
|
format_time_lambda,
|
|
|
|
if_sql,
|
2025-02-13 15:58:40 +01:00
|
|
|
left_to_substring_sql,
|
2025-02-13 14:54:32 +01:00
|
|
|
locate_to_strposition,
|
2025-02-13 15:48:10 +01:00
|
|
|
max_or_greatest,
|
2025-02-13 15:43:32 +01:00
|
|
|
min_or_least,
|
2025-02-13 06:15:54 +01:00
|
|
|
no_ilike_sql,
|
|
|
|
no_recursive_cte_sql,
|
|
|
|
no_safe_divide_sql,
|
|
|
|
no_trycast_sql,
|
2025-02-13 20:45:52 +01:00
|
|
|
regexp_extract_sql,
|
2025-02-13 20:46:55 +01:00
|
|
|
regexp_replace_sql,
|
2025-02-13 06:15:54 +01:00
|
|
|
rename_func,
|
2025-02-13 15:58:40 +01:00
|
|
|
right_to_substring_sql,
|
2025-02-13 15:05:06 +01:00
|
|
|
strposition_to_locate_sql,
|
2025-02-13 06:15:54 +01:00
|
|
|
struct_extract_sql,
|
2025-02-13 15:01:55 +01:00
|
|
|
timestrtotime_sql,
|
2025-02-13 14:45:11 +01:00
|
|
|
var_map_sql,
|
2025-02-13 06:15:54 +01:00
|
|
|
)
|
2025-02-13 14:53:05 +01:00
|
|
|
from sqlglot.helper import seq_get
|
|
|
|
from sqlglot.parser import parse_var_map
|
2025-02-13 14:54:32 +01:00
|
|
|
from sqlglot.tokens import TokenType
|
2025-02-13 06:15:54 +01:00
|
|
|
|
2025-02-13 14:50:31 +01:00
|
|
|
# (FuncType, Multiplier)
# Maps a date/time unit to the Hive function that implements the delta and the
# number of that function's base units per one source unit (e.g. adding 1 YEAR
# is ADD_MONTHS of 12 months, adding 1 WEEK is DATE_ADD of 7 days).
DATE_DELTA_INTERVAL = {
    "YEAR": ("ADD_MONTHS", 12),
    "MONTH": ("ADD_MONTHS", 1),
    "QUARTER": ("ADD_MONTHS", 3),
    "WEEK": ("DATE_ADD", 7),
    "DAY": ("DATE_ADD", 1),
}

# SQL suffix appended to an epoch-seconds difference in order to express it in
# the given sub-day unit (empty string means seconds need no scaling).
TIME_DIFF_FACTOR = {
    "MILLISECOND": " * 1000",
    "SECOND": "",
    "MINUTE": " / 60",
    "HOUR": " / 3600",
}

# Units whose differences are computed in months (via MONTHS_BETWEEN) rather
# than days (via DATEDIFF).
DIFF_MONTH_SWITCH = ("YEAR", "QUARTER", "MONTH")
|
|
|
|
|
|
|
|
|
2025-02-13 15:52:09 +01:00
|
|
|
def _add_date_sql(self: generator.Generator, expression: exp.DateAdd | exp.DateSub) -> str:
    """Render DateAdd / DateSub as Hive's ADD_MONTHS or DATE_ADD."""
    unit_name = expression.text("unit").upper()
    func_name, scale = DATE_DELTA_INTERVAL.get(unit_name, ("DATE_ADD", 1))

    # Hive has no dedicated subtraction variant: subtracting is adding a
    # negative amount.
    if isinstance(expression, exp.DateSub):
        scale = -scale

    increment = expression.expression
    if increment.is_number:
        # Literal amounts: fold the unit scaling directly into the literal.
        amount: exp.Expression = exp.Literal.number(int(expression.text("expression")) * scale)
    else:
        amount = increment.copy()
        if scale != 1:
            # Non-literal amounts are scaled with an explicit multiplication.
            amount = exp.Mul(  # type: ignore
                this=amount, expression=exp.Literal.number(scale)
            )

    return self.func(func_name, expression.this, amount)
|
2025-02-13 14:50:31 +01:00
|
|
|
|
|
|
|
|
2025-02-13 15:46:19 +01:00
|
|
|
def _date_diff_sql(self: generator.Generator, expression: exp.DateDiff) -> str:
    """Render DateDiff: epoch-seconds arithmetic for sub-day units, otherwise
    MONTHS_BETWEEN (month-based units) or DATEDIFF (day-based units)."""
    unit = expression.text("unit").upper()

    factor = TIME_DIFF_FACTOR.get(unit)
    if factor is not None:
        # Sub-day unit: compute the difference in epoch seconds and scale it.
        lhs = self.sql(expression, "this")
        rhs = self.sql(expression, "expression")
        seconds = f"UNIX_TIMESTAMP({lhs}) - UNIX_TIMESTAMP({rhs})"
        return f"({seconds}){factor}" if factor else seconds

    func_name = "MONTHS_BETWEEN" if unit in DIFF_MONTH_SWITCH else "DATEDIFF"
    # per_unit > 1 (e.g. 12 months per YEAR, 7 days per WEEK) needs a divisor.
    _, per_unit = DATE_DELTA_INTERVAL.get(unit, ("", 1))
    scale_sql = f" / {per_unit}" if per_unit > 1 else ""
    base_sql = f"{func_name}({self.format_args(expression.this, expression.expression)})"

    return f"{base_sql}{scale_sql}"
|
|
|
|
|
2025-02-13 06:15:54 +01:00
|
|
|
|
2025-02-13 15:53:39 +01:00
|
|
|
def _json_format_sql(self: generator.Generator, expression: exp.JSONFormat) -> str:
    """Render JSONFormat as TO_JSON, normalizing casts of JSON string literals."""
    this = expression.this

    is_json_string_cast = (
        isinstance(this, exp.Cast) and this.is_type("json") and this.this.is_string
    )
    if not is_json_string_cast:
        return self.func("TO_JSON", this, expression.args.get("options"))

    # Since FROM_JSON requires a nested type, we always wrap the json string with
    # an array to ensure that "naked" strings like "'a'" will be handled correctly
    wrapped_json = exp.Literal.string(f"[{this.this.name}]")

    from_json = self.func("FROM_JSON", wrapped_json, self.func("SCHEMA_OF_JSON", wrapped_json))
    to_json = self.func("TO_JSON", from_json)

    # This strips the [, ] delimiters of the dummy array printed by TO_JSON
    return self.func("REGEXP_EXTRACT", to_json, "'^.(.*).$'", "1")
|
|
|
|
|
|
|
|
|
|
|
|
def _array_sort_sql(self: generator.Generator, expression: exp.ArraySort) -> str:
    """Render ArraySort as Hive's SORT_ARRAY (natural order only)."""
    if expression.expression:
        # SORT_ARRAY takes no comparator; warn and drop it.
        self.unsupported("Hive SORT_ARRAY does not support a comparator")
    array_sql = self.sql(expression, "this")
    return f"SORT_ARRAY({array_sql})"
|
|
|
|
|
|
|
|
|
2025-02-13 15:46:19 +01:00
|
|
|
def _property_sql(self: generator.Generator, expression: exp.Property) -> str:
    """Render a property as 'key'=value — Hive quotes the key only."""
    key = expression.name
    value_sql = self.sql(expression, "value")
    return f"'{key}'={value_sql}"
|
2025-02-13 06:15:54 +01:00
|
|
|
|
|
|
|
|
2025-02-13 15:53:39 +01:00
|
|
|
def _str_to_unix_sql(self: generator.Generator, expression: exp.StrToUnix) -> str:
    """Render StrToUnix as UNIX_TIMESTAMP, omitting the format when it equals Hive's default."""
    fmt = _time_format(self, expression)
    return self.func("UNIX_TIMESTAMP", expression.this, fmt)
|
2025-02-13 06:15:54 +01:00
|
|
|
|
|
|
|
|
2025-02-13 15:53:39 +01:00
|
|
|
def _str_to_date_sql(self: generator.Generator, expression: exp.StrToDate) -> str:
    """Render StrToDate as CAST(... AS DATE)."""
    value_sql = self.sql(expression, "this")
    fmt = self.format_time(expression)
    # CAST only understands the default layouts; other formats are round-tripped
    # through UNIX_TIMESTAMP / FROM_UNIXTIME first.
    if fmt not in (Hive.TIME_FORMAT, Hive.DATE_FORMAT):
        value_sql = f"FROM_UNIXTIME(UNIX_TIMESTAMP({value_sql}, {fmt}))"
    return f"CAST({value_sql} AS DATE)"
|
|
|
|
|
|
|
|
|
2025-02-13 15:53:39 +01:00
|
|
|
def _str_to_time_sql(self: generator.Generator, expression: exp.StrToTime) -> str:
    """Render StrToTime as CAST(... AS TIMESTAMP)."""
    value_sql = self.sql(expression, "this")
    fmt = self.format_time(expression)
    # CAST only understands the default layouts; other formats are round-tripped
    # through UNIX_TIMESTAMP / FROM_UNIXTIME first.
    if fmt not in (Hive.TIME_FORMAT, Hive.DATE_FORMAT):
        value_sql = f"FROM_UNIXTIME(UNIX_TIMESTAMP({value_sql}, {fmt}))"
    return f"CAST({value_sql} AS TIMESTAMP)"
|
|
|
|
|
|
|
|
|
2025-02-13 15:46:19 +01:00
|
|
|
def _time_format(
    self: generator.Generator, expression: exp.UnixToStr | exp.StrToUnix
) -> t.Optional[str]:
    """Return the expression's time format, or None when it is Hive's default
    (so callers can omit the format argument entirely)."""
    fmt = self.format_time(expression)
    return None if fmt == Hive.TIME_FORMAT else fmt
|
|
|
|
|
|
|
|
|
2025-02-13 15:46:19 +01:00
|
|
|
def _time_to_str(self: generator.Generator, expression: exp.TimeToStr) -> str:
    """Render TimeToStr as Hive's DATE_FORMAT."""
    value_sql = self.sql(expression, "this")
    fmt = self.format_time(expression)
    return f"DATE_FORMAT({value_sql}, {fmt})"
|
|
|
|
|
|
|
|
|
2025-02-13 15:46:19 +01:00
|
|
|
def _to_date_sql(self: generator.Generator, expression: exp.TsOrDsToDate) -> str:
    """Render TsOrDsToDate as TO_DATE, passing the format only when non-default."""
    value_sql = self.sql(expression, "this")
    fmt = self.format_time(expression)
    if fmt and fmt not in (Hive.TIME_FORMAT, Hive.DATE_FORMAT):
        return f"TO_DATE({value_sql}, {fmt})"
    return f"TO_DATE({value_sql})"
|
|
|
|
|
|
|
|
|
|
|
|
class Hive(Dialect):
    """Hive SQL dialect: tokenizer, parser and generator settings."""

    # Hive allows `TABLESAMPLE` to appear before the table alias.
    ALIAS_POST_TABLESAMPLE = True
    IDENTIFIERS_CAN_START_WITH_DIGIT = True

    # https://spark.apache.org/docs/latest/sql-ref-identifier.html#description
    RESOLVES_IDENTIFIERS_AS_UPPERCASE = None

    # Hive/Java time format tokens mapped to their strftime-style equivalents.
    TIME_MAPPING = {
        "y": "%Y",
        "Y": "%Y",
        "YYYY": "%Y",
        "yyyy": "%Y",
        "YY": "%y",
        "yy": "%y",
        "MMMM": "%B",
        "MMM": "%b",
        "MM": "%m",
        "M": "%-m",
        "dd": "%d",
        "d": "%-d",
        "HH": "%H",
        "H": "%-H",
        "hh": "%I",
        "h": "%-I",
        "mm": "%M",
        "m": "%-M",
        "ss": "%S",
        "s": "%-S",
        "SSSSSS": "%f",
        "a": "%p",
        "DD": "%j",
        "D": "%-j",
        "E": "%a",
        "EE": "%a",
        "EEE": "%a",
        "EEEE": "%A",
    }

    # Default literal formats (quoted, in Hive's own pattern syntax).
    DATE_FORMAT = "'yyyy-MM-dd'"
    DATEINT_FORMAT = "'yyyyMMdd'"
    TIME_FORMAT = "'yyyy-MM-dd HH:mm:ss'"

    class Tokenizer(tokens.Tokenizer):
        # Hive strings may use single or double quotes; identifiers use backticks.
        QUOTES = ["'", '"']
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]
        ENCODE = "utf-8"

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            # Hive shell-style resource/maintenance statements are opaque commands.
            "ADD ARCHIVE": TokenType.COMMAND,
            "ADD ARCHIVES": TokenType.COMMAND,
            "ADD FILE": TokenType.COMMAND,
            "ADD FILES": TokenType.COMMAND,
            "ADD JAR": TokenType.COMMAND,
            "ADD JARS": TokenType.COMMAND,
            "MSCK REPAIR": TokenType.COMMAND,
            "REFRESH": TokenType.COMMAND,
            "WITH SERDEPROPERTIES": TokenType.SERDE_PROPERTIES,
        }

        # Numeric literal suffixes (e.g. 1L, 1.5BD) and the types they denote.
        NUMERIC_LITERALS = {
            "L": "BIGINT",
            "S": "SMALLINT",
            "Y": "TINYINT",
            "D": "DOUBLE",
            "F": "FLOAT",
            "BD": "DECIMAL",
        }

    class Parser(parser.Parser):
        LOG_DEFAULTS_TO_LN = True
        STRICT_CAST = False

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "BASE64": exp.ToBase64.from_arg_list,
            "COLLECT_LIST": exp.ArrayAgg.from_arg_list,
            "COLLECT_SET": exp.SetAgg.from_arg_list,
            # Hive's DATE_ADD/DATE_SUB always operate in days.
            "DATE_ADD": lambda args: exp.TsOrDsAdd(
                this=seq_get(args, 0), expression=seq_get(args, 1), unit=exp.Literal.string("DAY")
            ),
            "DATE_FORMAT": lambda args: format_time_lambda(exp.TimeToStr, "hive")(
                [
                    exp.TimeStrToTime(this=seq_get(args, 0)),
                    seq_get(args, 1),
                ]
            ),
            # DATE_SUB is modeled as TsOrDsAdd with a negated amount.
            "DATE_SUB": lambda args: exp.TsOrDsAdd(
                this=seq_get(args, 0),
                expression=exp.Mul(this=seq_get(args, 1), expression=exp.Literal.number(-1)),
                unit=exp.Literal.string("DAY"),
            ),
            "DATEDIFF": lambda args: exp.DateDiff(
                this=exp.TsOrDsToDate(this=seq_get(args, 0)),
                expression=exp.TsOrDsToDate(this=seq_get(args, 1)),
            ),
            "DAY": lambda args: exp.Day(this=exp.TsOrDsToDate(this=seq_get(args, 0))),
            "FROM_UNIXTIME": format_time_lambda(exp.UnixToStr, "hive", True),
            "GET_JSON_OBJECT": exp.JSONExtractScalar.from_arg_list,
            "LOCATE": locate_to_strposition,
            "MAP": parse_var_map,
            "MONTH": lambda args: exp.Month(this=exp.TsOrDsToDate.from_arg_list(args)),
            "PERCENTILE": exp.Quantile.from_arg_list,
            "PERCENTILE_APPROX": exp.ApproxQuantile.from_arg_list,
            "REGEXP_EXTRACT": lambda args: exp.RegexpExtract(
                this=seq_get(args, 0), expression=seq_get(args, 1), group=seq_get(args, 2)
            ),
            "SIZE": exp.ArraySize.from_arg_list,
            "SPLIT": exp.RegexpSplit.from_arg_list,
            "TO_DATE": format_time_lambda(exp.TsOrDsToDate, "hive"),
            "TO_JSON": exp.JSONFormat.from_arg_list,
            "UNBASE64": exp.FromBase64.from_arg_list,
            "UNIX_TIMESTAMP": format_time_lambda(exp.StrToUnix, "hive", True),
            "YEAR": lambda args: exp.Year(this=exp.TsOrDsToDate.from_arg_list(args)),
        }

        NO_PAREN_FUNCTION_PARSERS = {
            **parser.Parser.NO_PAREN_FUNCTION_PARSERS,
            "TRANSFORM": lambda self: self._parse_transform(),
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "WITH SERDEPROPERTIES": lambda self: exp.SerdeProperties(
                expressions=self._parse_wrapped_csv(self._parse_property)
            ),
        }

        def _parse_transform(self) -> t.Optional[exp.Transform | exp.QueryTransform]:
            """Parse TRANSFORM(...) — either the plain function or Hive's
            TRANSFORM ... USING 'script' query-transform clause.

            Returns None (after rewinding one token) when TRANSFORM is not
            followed by an opening paren, so it can be parsed as something else.
            """
            if not self._match(TokenType.L_PAREN, advance=False):
                self._retreat(self._index - 1)
                return None

            args = self._parse_wrapped_csv(self._parse_lambda)
            row_format_before = self._parse_row_format(match_row=True)

            record_writer = None
            if self._match_text_seq("RECORDWRITER"):
                record_writer = self._parse_string()

            # Without USING this is just the TRANSFORM(...) function call.
            if not self._match(TokenType.USING):
                return exp.Transform.from_arg_list(args)

            command_script = self._parse_string()

            # Optional `AS (schema)` for the script's output columns.
            self._match(TokenType.ALIAS)
            schema = self._parse_schema()

            row_format_after = self._parse_row_format(match_row=True)
            record_reader = None
            if self._match_text_seq("RECORDREADER"):
                record_reader = self._parse_string()

            return self.expression(
                exp.QueryTransform,
                expressions=args,
                command_script=command_script,
                schema=schema,
                row_format_before=row_format_before,
                record_writer=record_writer,
                row_format_after=row_format_after,
                record_reader=record_reader,
            )

        def _parse_types(
            self, check_func: bool = False, schema: bool = False
        ) -> t.Optional[exp.Expression]:
            """
            Spark (and most likely Hive) treats casts to CHAR(length) and VARCHAR(length) as casts to
            STRING in all contexts except for schema definitions. For example, this is in Spark v3.4.0:

            spark-sql (default)> select cast(1234 as varchar(2));
            23/06/06 15:51:18 WARN CharVarcharUtils: The Spark cast operator does not support
            char/varchar type and simply treats them as string type. Please use string type
            directly to avoid confusion. Otherwise, you can set spark.sql.legacy.charVarcharAsString
            to true, so that Spark treat them as string type as same as Spark 3.0 and earlier

            1234
            Time taken: 4.265 seconds, Fetched 1 row(s)

            This shows that Spark doesn't truncate the value into '12', which is inconsistent with
            what other dialects (e.g. postgres) do, so we need to drop the length to transpile correctly.

            Reference: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
            """
            this = super()._parse_types(check_func=check_func, schema=schema)

            if this and not schema:
                # Outside schema definitions, rewrite CHAR/VARCHAR nodes to TEXT.
                return this.transform(
                    lambda node: node.replace(exp.DataType.build("text"))
                    if isinstance(node, exp.DataType) and node.is_type("char", "varchar")
                    else node,
                    copy=False,
                )

            return this

    class Generator(generator.Generator):
        LIMIT_FETCH = "LIMIT"
        TABLESAMPLE_WITH_METHOD = False
        TABLESAMPLE_SIZE_IS_PERCENT = True
        JOIN_HINTS = False
        TABLE_HINTS = False
        QUERY_HINTS = False
        INDEX_ON = "ON TABLE"
        EXTRACT_ALLOWS_QUOTES = False

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIT: "BOOLEAN",
            exp.DataType.Type.DATETIME: "TIMESTAMP",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIME: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.VARBINARY: "BINARY",
        }

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.Group: transforms.preprocess([transforms.unalias_group]),
            exp.Select: transforms.preprocess(
                [
                    transforms.eliminate_qualify,
                    transforms.eliminate_distinct_on,
                    transforms.unnest_to_explode,
                ]
            ),
            exp.Property: _property_sql,
            exp.ApproxDistinct: approx_count_distinct_sql,
            exp.ArrayConcat: rename_func("CONCAT"),
            # Hive's CONCAT_WS takes the separator first.
            exp.ArrayJoin: lambda self, e: self.func("CONCAT_WS", e.expression, e.this),
            exp.ArraySize: rename_func("SIZE"),
            exp.ArraySort: _array_sort_sql,
            exp.With: no_recursive_cte_sql,
            exp.DateAdd: _add_date_sql,
            exp.DateDiff: _date_diff_sql,
            exp.DateStrToDate: rename_func("TO_DATE"),
            exp.DateSub: _add_date_sql,
            exp.DateToDi: lambda self, e: f"CAST(DATE_FORMAT({self.sql(e, 'this')}, {Hive.DATEINT_FORMAT}) AS INT)",
            exp.DiToDate: lambda self, e: f"TO_DATE(CAST({self.sql(e, 'this')} AS STRING), {Hive.DATEINT_FORMAT})",
            exp.FileFormatProperty: lambda self, e: f"STORED AS {self.sql(e, 'this') if isinstance(e.this, exp.InputOutputFormat) else e.name.upper()}",
            exp.FromBase64: rename_func("UNBASE64"),
            exp.If: if_sql,
            exp.ILike: no_ilike_sql,
            exp.IsNan: rename_func("ISNAN"),
            exp.JSONExtract: rename_func("GET_JSON_OBJECT"),
            exp.JSONExtractScalar: rename_func("GET_JSON_OBJECT"),
            exp.JSONFormat: _json_format_sql,
            exp.Left: left_to_substring_sql,
            exp.Map: var_map_sql,
            exp.Max: max_or_greatest,
            exp.MD5Digest: lambda self, e: self.func("UNHEX", self.func("MD5", e.this)),
            exp.Min: min_or_least,
            exp.MonthsBetween: lambda self, e: self.func("MONTHS_BETWEEN", e.this, e.expression),
            exp.VarMap: var_map_sql,
            exp.Create: create_with_partitions_sql,
            exp.Quantile: rename_func("PERCENTILE"),
            exp.ApproxQuantile: rename_func("PERCENTILE_APPROX"),
            exp.RegexpExtract: regexp_extract_sql,
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: lambda self, e: self.binary(e, "RLIKE"),
            exp.RegexpSplit: rename_func("SPLIT"),
            exp.Right: right_to_substring_sql,
            exp.SafeDivide: no_safe_divide_sql,
            exp.SchemaCommentProperty: lambda self, e: self.naked_property(e),
            exp.SetAgg: rename_func("COLLECT_SET"),
            # \\Q quotes the separator so SPLIT treats it literally, not as a regex.
            exp.Split: lambda self, e: f"SPLIT({self.sql(e, 'this')}, CONCAT('\\\\Q', {self.sql(e, 'expression')}))",
            exp.StrPosition: strposition_to_locate_sql,
            exp.StrToDate: _str_to_date_sql,
            exp.StrToTime: _str_to_time_sql,
            exp.StrToUnix: _str_to_unix_sql,
            exp.StructExtract: struct_extract_sql,
            exp.TimeStrToDate: rename_func("TO_DATE"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.TimeStrToUnix: rename_func("UNIX_TIMESTAMP"),
            exp.TimeToStr: _time_to_str,
            exp.TimeToUnix: rename_func("UNIX_TIMESTAMP"),
            exp.ToBase64: rename_func("BASE64"),
            exp.TsOrDiToDi: lambda self, e: f"CAST(SUBSTR(REPLACE(CAST({self.sql(e, 'this')} AS STRING), '-', ''), 1, 8) AS INT)",
            exp.TsOrDsAdd: lambda self, e: f"DATE_ADD({self.sql(e, 'this')}, {self.sql(e, 'expression')})",
            exp.TsOrDsToDate: _to_date_sql,
            exp.TryCast: no_trycast_sql,
            exp.UnixToStr: lambda self, e: self.func(
                "FROM_UNIXTIME", e.this, _time_format(self, e)
            ),
            exp.UnixToTime: rename_func("FROM_UNIXTIME"),
            exp.UnixToTimeStr: rename_func("FROM_UNIXTIME"),
            exp.PartitionedByProperty: lambda self, e: f"PARTITIONED BY {self.sql(e, 'this')}",
            exp.SerdeProperties: lambda self, e: self.properties(e, prefix="WITH SERDEPROPERTIES"),
            exp.NumberToStr: rename_func("FORMAT_NUMBER"),
            exp.LastDateOfMonth: rename_func("LAST_DAY"),
            exp.National: lambda self, e: self.national_sql(e, prefix=""),
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.FileFormatProperty: exp.Properties.Location.POST_SCHEMA,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        def rowformatserdeproperty_sql(self, expression: exp.RowFormatSerdeProperty) -> str:
            """Render ROW FORMAT SERDE, with optional trailing serde properties."""
            serde_props = self.sql(expression, "serde_properties")
            serde_props = f" {serde_props}" if serde_props else ""
            return f"ROW FORMAT SERDE {self.sql(expression, 'this')}{serde_props}"

        def arrayagg_sql(self, expression: exp.ArrayAgg) -> str:
            """Render ArrayAgg as COLLECT_LIST, dropping any ORDER BY wrapper."""
            return self.func(
                "COLLECT_LIST",
                expression.this.this if isinstance(expression.this, exp.Order) else expression.this,
            )

        def with_properties(self, properties: exp.Properties) -> str:
            """Render trailing table properties under a TBLPROPERTIES clause."""
            return self.properties(properties, prefix=self.seg("TBLPROPERTIES"))

        def datatype_sql(self, expression: exp.DataType) -> str:
            """Render a data type, normalizing a few Hive-specific cases:
            parameterless VARCHAR/NVARCHAR become STRING, temporal types drop
            their parameters, and FLOAT(n) widens to DOUBLE when n > 32.
            """
            if (
                expression.this in (exp.DataType.Type.VARCHAR, exp.DataType.Type.NVARCHAR)
                and not expression.expressions
            ):
                expression = exp.DataType.build("text")
            elif expression.this in exp.DataType.TEMPORAL_TYPES:
                expression = exp.DataType.build(expression.this)
            elif expression.is_type("float"):
                size_expression = expression.find(exp.DataTypeSize)
                if size_expression:
                    size = int(size_expression.name)
                    expression = (
                        exp.DataType.build("float") if size <= 32 else exp.DataType.build("double")
                    )

            return super().datatype_sql(expression)
|