
Merging upstream version 10.5.10.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-13 15:07:05 +01:00
parent 8588db6332
commit 4d496b7a6a
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
43 changed files with 1384 additions and 356 deletions

sqlglot/dialects/__init__.py

@@ -1,3 +1,64 @@
+"""
+## Dialects
+
+One of the core abstractions in SQLGlot is the concept of a "dialect". The `Dialect` class essentially implements a
+"SQLGlot dialect", which aims to be as generic and ANSI-compliant as possible. It relies on the base `Tokenizer`,
+`Parser` and `Generator` classes to achieve this goal, so these need to be very lenient when it comes to consuming
+SQL code.
+
+However, there are cases where the syntax of different SQL dialects varies wildly, even for common tasks. One such
+example is date/time functions, which can be hard to deal with. For this reason, it's sometimes necessary to
+override the base dialect in order to specialize its behavior. This can be easily done in SQLGlot: supporting new
+dialects is as simple as subclassing from `Dialect` and overriding its various components (e.g. the `Parser` class)
+in order to implement the target behavior.
+
+### Implementing a custom Dialect
+
+Consider the following example:
+
+```python
+from sqlglot import exp
+from sqlglot.dialects.dialect import Dialect
+from sqlglot.generator import Generator
+from sqlglot.tokens import Tokenizer, TokenType
+
+
+class Custom(Dialect):
+    class Tokenizer(Tokenizer):
+        QUOTES = ["'", '"']
+        IDENTIFIERS = ["`"]
+
+        KEYWORDS = {
+            **Tokenizer.KEYWORDS,
+            "INT64": TokenType.BIGINT,
+            "FLOAT64": TokenType.DOUBLE,
+        }
+
+    class Generator(Generator):
+        TRANSFORMS = {exp.Array: lambda self, e: f"[{self.expressions(e)}]"}
+
+        TYPE_MAPPING = {
+            exp.DataType.Type.TINYINT: "INT64",
+            exp.DataType.Type.SMALLINT: "INT64",
+            exp.DataType.Type.INT: "INT64",
+            exp.DataType.Type.BIGINT: "INT64",
+            exp.DataType.Type.DECIMAL: "NUMERIC",
+            exp.DataType.Type.FLOAT: "FLOAT64",
+            exp.DataType.Type.DOUBLE: "FLOAT64",
+            exp.DataType.Type.BOOLEAN: "BOOL",
+            exp.DataType.Type.TEXT: "STRING",
+        }
+```
+
+This is a typical example of adding a new dialect implementation in SQLGlot: we specify its identifier and string
+delimiters, as well as what tokens it uses for its types and how they're associated with SQLGlot types. Since
+the `Expression` classes are common to all dialects supported in SQLGlot, we may also need to override the generation
+logic for some expressions; this is usually done by adding new entries to the `TRANSFORMS` mapping.
+
+----
+"""
+
 from sqlglot.dialects.bigquery import BigQuery
 from sqlglot.dialects.clickhouse import ClickHouse
 from sqlglot.dialects.databricks import Databricks
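
As a quick sanity check of the example above (a minimal sketch, not part of the diff): `Dialect` subclasses are auto-registered under their lowercased class name, so once `Custom` is defined it can be referenced as `"custom"`. The input dialect and the expected output shown here are illustrative.

```python
import sqlglot

# Assumes the Custom dialect from the docstring above has already been defined.
# Dialect subclasses register themselves under their lowercased class name.
sql = sqlglot.transpile(
    "SELECT CAST(x AS BIGINT), ARRAY(1, 2)",
    read="spark",
    write="custom",
)[0]
print(sql)  # Expected: SELECT CAST(x AS INT64), [1, 2]
```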

sqlglot/dialects/bigquery.py

@@ -124,7 +124,6 @@ class BigQuery(Dialect):
             "FLOAT64": TokenType.DOUBLE,
             "INT64": TokenType.BIGINT,
             "NOT DETERMINISTIC": TokenType.VOLATILE,
-            "QUALIFY": TokenType.QUALIFY,
             "UNKNOWN": TokenType.NULL,
         }
         KEYWORDS.pop("DIV")
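
As a small illustration of these keyword mappings (a sketch, not part of the diff): BigQuery's `INT64`/`FLOAT64` tokenize to sqlglot's generic `BIGINT`/`DOUBLE` types, so they transpile cleanly to dialects that use the generic names.

```python
import sqlglot

# INT64 is read as a BIGINT token, so a generic target renders it as BIGINT.
print(sqlglot.transpile("SELECT CAST(x AS INT64)", read="bigquery", write="duckdb")[0])
# Expected: SELECT CAST(x AS BIGINT)
```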

sqlglot/dialects/clickhouse.py

@@ -73,13 +73,8 @@ class ClickHouse(Dialect):
             return this
 
-        def _parse_position(self) -> exp.Expression:
-            this = super()._parse_position()
-            # clickhouse position args are swapped
-            substr = this.this
-            this.args["this"] = this.args.get("substr")
-            this.args["substr"] = substr
-            return this
+        def _parse_position(self, haystack_first: bool = False) -> exp.Expression:
+            return super()._parse_position(haystack_first=True)
 
         # https://clickhouse.com/docs/en/sql-reference/statements/select/with/
         def _parse_cte(self) -> exp.Expression:
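
The rewrite delegates the argument swap to the base parser via `haystack_first` instead of mutating the expression after the fact. A sketch of the intended effect (column names illustrative):

```python
import sqlglot
from sqlglot import exp

# ClickHouse's position(haystack, needle) puts the haystack first, so the
# parser should normalize it into StrPosition with "this" = haystack.
ast = sqlglot.parse_one("SELECT position(haystack, needle)", read="clickhouse")
pos = ast.find(exp.StrPosition)
print(pos.this.sql(), pos.args["substr"].sql())  # Expected: haystack needle
```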

sqlglot/dialects/mysql.py

@@ -124,6 +124,8 @@ class MySQL(Dialect):
             **tokens.Tokenizer.KEYWORDS,
             "MEDIUMTEXT": TokenType.MEDIUMTEXT,
             "LONGTEXT": TokenType.LONGTEXT,
+            "MEDIUMBLOB": TokenType.MEDIUMBLOB,
+            "LONGBLOB": TokenType.LONGBLOB,
             "START": TokenType.BEGIN,
             "SEPARATOR": TokenType.SEPARATOR,
             "_ARMSCII8": TokenType.INTRODUCER,
@@ -459,6 +461,8 @@ class MySQL(Dialect):
         TYPE_MAPPING = generator.Generator.TYPE_MAPPING.copy()
         TYPE_MAPPING.pop(exp.DataType.Type.MEDIUMTEXT)
         TYPE_MAPPING.pop(exp.DataType.Type.LONGTEXT)
+        TYPE_MAPPING.pop(exp.DataType.Type.MEDIUMBLOB)
+        TYPE_MAPPING.pop(exp.DataType.Type.LONGBLOB)
 
         WITH_PROPERTIES: t.Set[t.Type[exp.Property]] = set()
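
Together, the new tokens and the `TYPE_MAPPING.pop(...)` calls mean MySQL both recognizes these blob types and renders them natively, while other dialects fall back to the base generator's generic mapping. A sketch (the round-trip output is what the change implies, not a verified result):

```python
import sqlglot

# LONGBLOB should survive a MySQL -> MySQL round trip unchanged, since the
# MySQL generator no longer maps it to a generic type.
print(sqlglot.transpile("CREATE TABLE t (c LONGBLOB)", read="mysql", write="mysql")[0])
# Expected: CREATE TABLE t (c LONGBLOB)
```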

sqlglot/dialects/snowflake.py

@@ -194,7 +194,8 @@ class Snowflake(Dialect):
         KEYWORDS = {
             **tokens.Tokenizer.KEYWORDS,
-            "QUALIFY": TokenType.QUALIFY,
+            "EXCLUDE": TokenType.EXCEPT,
+            "RENAME": TokenType.REPLACE,
             "TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
             "TIMESTAMP_NTZ": TokenType.TIMESTAMP,
             "TIMESTAMP_TZ": TokenType.TIMESTAMPTZ,
@@ -232,6 +233,11 @@ class Snowflake(Dialect):
             exp.DataType.Type.TIMESTAMP: "TIMESTAMPNTZ",
         }
 
+        STAR_MAPPING = {
+            "except": "EXCLUDE",
+            "replace": "RENAME",
+        }
+
         ROOT_PROPERTIES = {
             exp.PartitionedByProperty,
             exp.ReturnsProperty,
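
`STAR_MAPPING` controls how `SELECT *` modifiers are spelled on output, so Snowflake emits `EXCLUDE`/`RENAME` where, for example, BigQuery uses `EXCEPT`/`REPLACE`. A sketch of the expected transpilation:

```python
import sqlglot

# BigQuery star modifiers re-spelled per Snowflake's STAR_MAPPING.
print(sqlglot.transpile(
    "SELECT * EXCEPT (a) REPLACE (b AS c) FROM t",
    read="bigquery",
    write="snowflake",
)[0])
# Expected: SELECT * EXCLUDE (a) RENAME (b AS c) FROM t
```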

sqlglot/dialects/tsql.py

@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import re
+import typing as t
 
 from sqlglot import exp, generator, parser, tokens
 from sqlglot.dialects.dialect import Dialect, parse_date_delta, rename_func
@@ -251,6 +252,7 @@ class TSQL(Dialect):
             "NTEXT": TokenType.TEXT,
             "NVARCHAR(MAX)": TokenType.TEXT,
             "PRINT": TokenType.COMMAND,
+            "PROC": TokenType.PROCEDURE,
             "REAL": TokenType.FLOAT,
             "ROWVERSION": TokenType.ROWVERSION,
             "SMALLDATETIME": TokenType.DATETIME,
@@ -263,6 +265,11 @@ class TSQL(Dialect):
             "XML": TokenType.XML,
         }
 
+        # TSQL allows @, # to appear as a variable/identifier prefix
+        SINGLE_TOKENS = tokens.Tokenizer.SINGLE_TOKENS.copy()
+        SINGLE_TOKENS.pop("@")
+        SINGLE_TOKENS.pop("#")
+
     class Parser(parser.Parser):
         FUNCTIONS = {
             **parser.Parser.FUNCTIONS,  # type: ignore
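
Popping `@` and `#` from `SINGLE_TOKENS` keeps those characters attached to the identifier that follows, instead of emitting them as standalone tokens. A tokenizer-level sketch (the exact token texts are illustrative):

```python
from sqlglot.dialects.tsql import TSQL

# "@var" and "#tmp" should now tokenize as single tokens rather than
# "@" + "var" and "#" + "tmp".
tokens = TSQL().tokenizer.tokenize("SELECT @var, #tmp")
print([token.text for token in tokens])
```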
@@ -293,26 +300,82 @@ class TSQL(Dialect):
             DataType.Type.NCHAR,
         }
 
         # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-temporary#create-a-temporary-table
         TABLE_PREFIX_TOKENS = {TokenType.HASH, TokenType.PARAMETER}
 
+        RETURNS_TABLE_TOKENS = parser.Parser.ID_VAR_TOKENS - {  # type: ignore
+            TokenType.TABLE,
+            *parser.Parser.TYPE_TOKENS,  # type: ignore
+        }
+
-        def _parse_convert(self, strict):
+        STATEMENT_PARSERS = {
+            **parser.Parser.STATEMENT_PARSERS,  # type: ignore
+            TokenType.END: lambda self: self._parse_command(),
+        }
+
+        def _parse_system_time(self) -> t.Optional[exp.Expression]:
+            if not self._match_text_seq("FOR", "SYSTEM_TIME"):
+                return None
+
+            if self._match_text_seq("AS", "OF"):
+                system_time = self.expression(
+                    exp.SystemTime, this=self._parse_bitwise(), kind="AS OF"
+                )
+            elif self._match_set((TokenType.FROM, TokenType.BETWEEN)):
+                kind = self._prev.text
+                this = self._parse_bitwise()
+                self._match_texts(("TO", "AND"))
+                expression = self._parse_bitwise()
+                system_time = self.expression(
+                    exp.SystemTime, this=this, expression=expression, kind=kind
+                )
+            elif self._match_text_seq("CONTAINED", "IN"):
+                args = self._parse_wrapped_csv(self._parse_bitwise)
+                system_time = self.expression(
+                    exp.SystemTime,
+                    this=seq_get(args, 0),
+                    expression=seq_get(args, 1),
+                    kind="CONTAINED IN",
+                )
+            elif self._match(TokenType.ALL):
+                system_time = self.expression(exp.SystemTime, kind="ALL")
+            else:
+                system_time = None
+                self.raise_error("Unable to parse FOR SYSTEM_TIME clause")
+
+            return system_time
+
+        def _parse_table_parts(self, schema: bool = False) -> exp.Expression:
+            table = super()._parse_table_parts(schema=schema)
+            table.set("system_time", self._parse_system_time())
+            return table
+
+        def _parse_returns(self) -> exp.Expression:
+            table = self._parse_id_var(any_token=False, tokens=self.RETURNS_TABLE_TOKENS)
+            returns = super()._parse_returns()
+            returns.set("table", table)
+            return returns
+
+        def _parse_convert(self, strict: bool) -> t.Optional[exp.Expression]:
             to = self._parse_types()
             self._match(TokenType.COMMA)
             this = self._parse_conjunction()
 
+            if not to or not this:
+                return None
+
             # Retrieve length of datatype and override to default if not specified
             if seq_get(to.expressions, 0) is None and to.this in self.VAR_LENGTH_DATATYPES:
                 to = exp.DataType.build(to.this, expressions=[exp.Literal.number(30)], nested=False)
 
             # Check whether a conversion with format is applicable
             if self._match(TokenType.COMMA):
-                format_val = self._parse_number().name
-                if format_val not in TSQL.convert_format_mapping:
+                format_val = self._parse_number()
+                format_val_name = format_val.name if format_val else ""
+
+                if format_val_name not in TSQL.convert_format_mapping:
                     raise ValueError(
-                        f"CONVERT function at T-SQL does not support format style {format_val}"
+                        f"CONVERT function at T-SQL does not support format style {format_val_name}"
                     )
-                format_norm = exp.Literal.string(TSQL.convert_format_mapping[format_val])
+
+                format_norm = exp.Literal.string(TSQL.convert_format_mapping[format_val_name])
 
                 # Check whether the convert entails a string to date format
                 if to.this == DataType.Type.DATE:
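
For illustration (a sketch, not part of the diff): the style handling above resolves T-SQL's numeric format styles through `TSQL.convert_format_mapping`, so a `CONVERT` with a style code becomes a format-aware expression rather than a bare cast. Style 120 and the Spark target are illustrative choices, and the exact output depends on the target dialect.

```python
import sqlglot

# CONVERT with style 120 (ODBC canonical, yyyy-mm-dd hh:mi:ss) should carry
# its format through to the target dialect instead of a plain cast.
print(sqlglot.transpile(
    "SELECT CONVERT(VARCHAR(20), GETDATE(), 120)",
    read="tsql",
    write="spark",
)[0])
```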
@@ -333,6 +396,21 @@ class TSQL(Dialect):
             # Entails a simple cast without any format requirement
             return self.expression(exp.Cast if strict else exp.TryCast, this=this, to=to)
 
+        def _parse_user_defined_function(
+            self, kind: t.Optional[TokenType] = None
+        ) -> t.Optional[exp.Expression]:
+            this = super()._parse_user_defined_function(kind=kind)
+
+            if (
+                kind == TokenType.FUNCTION
+                or isinstance(this, exp.UserDefinedFunction)
+                or self._match(TokenType.ALIAS, advance=False)
+            ):
+                return this
+
+            expressions = self._parse_csv(self._parse_udf_kwarg)
+            return self.expression(exp.UserDefinedFunction, this=this, expressions=expressions)
 
     class Generator(generator.Generator):
         TYPE_MAPPING = {
             **generator.Generator.TYPE_MAPPING,  # type: ignore
@@ -354,3 +432,27 @@ class TSQL(Dialect):
             exp.TimeToStr: _format_sql,
             exp.GroupConcat: _string_agg_sql,
         }
 
+        TRANSFORMS.pop(exp.ReturnsProperty)
+
+        def systemtime_sql(self, expression: exp.SystemTime) -> str:
+            kind = expression.args["kind"]
+            if kind == "ALL":
+                return "FOR SYSTEM_TIME ALL"
+
+            start = self.sql(expression, "this")
+            if kind == "AS OF":
+                return f"FOR SYSTEM_TIME AS OF {start}"
+
+            end = self.sql(expression, "expression")
+            if kind == "FROM":
+                return f"FOR SYSTEM_TIME FROM {start} TO {end}"
+            if kind == "BETWEEN":
+                return f"FOR SYSTEM_TIME BETWEEN {start} AND {end}"
+
+            return f"FOR SYSTEM_TIME CONTAINED IN ({start}, {end})"
+
+        def returnsproperty_sql(self, expression: exp.ReturnsProperty) -> str:
+            table = expression.args.get("table")
+            table = f"{table} " if table else ""
+            return f"RETURNS {table}{self.sql(expression, 'this')}"