Merging upstream version 10.5.10.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent
8588db6332
commit
4d496b7a6a
43 changed files with 1384 additions and 356 deletions
|
@ -1,3 +1,64 @@
|
|||
"""
|
||||
## Dialects
|
||||
|
||||
One of the core abstractions in SQLGlot is the concept of a "dialect". The `Dialect` class essentially implements a
|
||||
"SQLGlot dialect", which aims to be as generic and ANSI-compliant as possible. It relies on the base `Tokenizer`,
|
||||
`Parser` and `Generator` classes to achieve this goal, so these need to be very lenient when it comes to consuming
|
||||
SQL code.
|
||||
|
||||
However, there are cases where the syntax of different SQL dialects varies wildly, even for common tasks. One such
|
||||
example is the date/time functions, which can be hard to deal with. For this reason, it's sometimes necessary to
|
||||
override the base dialect in order to specialize its behavior. This can be easily done in SQLGlot: supporting new
|
||||
dialects is as simple as subclassing from `Dialect` and overriding its various components (e.g. the `Parser` class),
|
||||
in order to implement the target behavior.
|
||||
|
||||
|
||||
### Implementing a custom Dialect
|
||||
|
||||
Consider the following example:
|
||||
|
||||
```python
|
||||
from sqlglot import exp
|
||||
from sqlglot.dialects.dialect import Dialect
|
||||
from sqlglot.generator import Generator
|
||||
from sqlglot.tokens import Tokenizer, TokenType
|
||||
|
||||
|
||||
class Custom(Dialect):
|
||||
class Tokenizer(Tokenizer):
|
||||
QUOTES = ["'", '"']
|
||||
IDENTIFIERS = ["`"]
|
||||
|
||||
KEYWORDS = {
|
||||
**Tokenizer.KEYWORDS,
|
||||
"INT64": TokenType.BIGINT,
|
||||
"FLOAT64": TokenType.DOUBLE,
|
||||
}
|
||||
|
||||
class Generator(Generator):
|
||||
TRANSFORMS = {exp.Array: lambda self, e: f"[{self.expressions(e)}]"}
|
||||
|
||||
TYPE_MAPPING = {
|
||||
exp.DataType.Type.TINYINT: "INT64",
|
||||
exp.DataType.Type.SMALLINT: "INT64",
|
||||
exp.DataType.Type.INT: "INT64",
|
||||
exp.DataType.Type.BIGINT: "INT64",
|
||||
exp.DataType.Type.DECIMAL: "NUMERIC",
|
||||
exp.DataType.Type.FLOAT: "FLOAT64",
|
||||
exp.DataType.Type.DOUBLE: "FLOAT64",
|
||||
exp.DataType.Type.BOOLEAN: "BOOL",
|
||||
exp.DataType.Type.TEXT: "STRING",
|
||||
}
|
||||
```
|
||||
|
||||
This is a typical example of adding a new dialect implementation in SQLGlot: we specify its identifier and string
|
||||
delimiters, as well as what tokens it uses for its types and how they're associated with SQLGlot types. Since
|
||||
the `Expression` classes are shared by all dialects supported in SQLGlot, we may also need to override the generation
|
||||
logic for some expressions; this is usually done by adding new entries to the `TRANSFORMS` mapping.
|
||||
|
||||
----
|
||||
"""
|
||||
|
||||
from sqlglot.dialects.bigquery import BigQuery
|
||||
from sqlglot.dialects.clickhouse import ClickHouse
|
||||
from sqlglot.dialects.databricks import Databricks
|
||||
|
|
|
@ -124,7 +124,6 @@ class BigQuery(Dialect):
|
|||
"FLOAT64": TokenType.DOUBLE,
|
||||
"INT64": TokenType.BIGINT,
|
||||
"NOT DETERMINISTIC": TokenType.VOLATILE,
|
||||
"QUALIFY": TokenType.QUALIFY,
|
||||
"UNKNOWN": TokenType.NULL,
|
||||
}
|
||||
KEYWORDS.pop("DIV")
|
||||
|
|
|
@ -73,13 +73,8 @@ class ClickHouse(Dialect):
|
|||
|
||||
return this
|
||||
|
||||
def _parse_position(self) -> exp.Expression:
|
||||
this = super()._parse_position()
|
||||
# clickhouse position args are swapped
|
||||
substr = this.this
|
||||
this.args["this"] = this.args.get("substr")
|
||||
this.args["substr"] = substr
|
||||
return this
|
||||
# ClickHouse override of the base position() parser: the caller-supplied
# `haystack_first` value is ignored and True is always forwarded to the base
# implementation.
# NOTE(review): presumably because ClickHouse's position() takes its arguments
# in swapped order compared to the base dialect — confirm against the base parser.
def _parse_position(self, haystack_first: bool = False) -> exp.Expression:
|
||||
return super()._parse_position(haystack_first=True)
|
||||
|
||||
# https://clickhouse.com/docs/en/sql-reference/statements/select/with/
|
||||
def _parse_cte(self) -> exp.Expression:
|
||||
|
|
|
@ -124,6 +124,8 @@ class MySQL(Dialect):
|
|||
**tokens.Tokenizer.KEYWORDS,
|
||||
"MEDIUMTEXT": TokenType.MEDIUMTEXT,
|
||||
"LONGTEXT": TokenType.LONGTEXT,
|
||||
"MEDIUMBLOB": TokenType.MEDIUMBLOB,
|
||||
"LONGBLOB": TokenType.LONGBLOB,
|
||||
"START": TokenType.BEGIN,
|
||||
"SEPARATOR": TokenType.SEPARATOR,
|
||||
"_ARMSCII8": TokenType.INTRODUCER,
|
||||
|
@ -459,6 +461,8 @@ class MySQL(Dialect):
|
|||
TYPE_MAPPING = generator.Generator.TYPE_MAPPING.copy()
|
||||
TYPE_MAPPING.pop(exp.DataType.Type.MEDIUMTEXT)
|
||||
TYPE_MAPPING.pop(exp.DataType.Type.LONGTEXT)
|
||||
TYPE_MAPPING.pop(exp.DataType.Type.MEDIUMBLOB)
|
||||
TYPE_MAPPING.pop(exp.DataType.Type.LONGBLOB)
|
||||
|
||||
WITH_PROPERTIES: t.Set[t.Type[exp.Property]] = set()
|
||||
|
||||
|
|
|
@ -194,7 +194,8 @@ class Snowflake(Dialect):
|
|||
|
||||
KEYWORDS = {
|
||||
**tokens.Tokenizer.KEYWORDS,
|
||||
"QUALIFY": TokenType.QUALIFY,
|
||||
"EXCLUDE": TokenType.EXCEPT,
|
||||
"RENAME": TokenType.REPLACE,
|
||||
"TIMESTAMP_LTZ": TokenType.TIMESTAMPLTZ,
|
||||
"TIMESTAMP_NTZ": TokenType.TIMESTAMP,
|
||||
"TIMESTAMP_TZ": TokenType.TIMESTAMPTZ,
|
||||
|
@ -232,6 +233,11 @@ class Snowflake(Dialect):
|
|||
exp.DataType.Type.TIMESTAMP: "TIMESTAMPNTZ",
|
||||
}
|
||||
|
||||
STAR_MAPPING = {
|
||||
"except": "EXCLUDE",
|
||||
"replace": "RENAME",
|
||||
}
|
||||
|
||||
ROOT_PROPERTIES = {
|
||||
exp.PartitionedByProperty,
|
||||
exp.ReturnsProperty,
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import typing as t
|
||||
|
||||
from sqlglot import exp, generator, parser, tokens
|
||||
from sqlglot.dialects.dialect import Dialect, parse_date_delta, rename_func
|
||||
|
@ -251,6 +252,7 @@ class TSQL(Dialect):
|
|||
"NTEXT": TokenType.TEXT,
|
||||
"NVARCHAR(MAX)": TokenType.TEXT,
|
||||
"PRINT": TokenType.COMMAND,
|
||||
"PROC": TokenType.PROCEDURE,
|
||||
"REAL": TokenType.FLOAT,
|
||||
"ROWVERSION": TokenType.ROWVERSION,
|
||||
"SMALLDATETIME": TokenType.DATETIME,
|
||||
|
@ -263,6 +265,11 @@ class TSQL(Dialect):
|
|||
"XML": TokenType.XML,
|
||||
}
|
||||
|
||||
# TSQL allows @, # to appear as a variable/identifier prefix
|
||||
SINGLE_TOKENS = tokens.Tokenizer.SINGLE_TOKENS.copy()
|
||||
SINGLE_TOKENS.pop("@")
|
||||
SINGLE_TOKENS.pop("#")
|
||||
|
||||
class Parser(parser.Parser):
|
||||
FUNCTIONS = {
|
||||
**parser.Parser.FUNCTIONS, # type: ignore
|
||||
|
@ -293,26 +300,82 @@ class TSQL(Dialect):
|
|||
DataType.Type.NCHAR,
|
||||
}
|
||||
|
||||
# https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-temporary#create-a-temporary-table
|
||||
TABLE_PREFIX_TOKENS = {TokenType.HASH, TokenType.PARAMETER}
|
||||
RETURNS_TABLE_TOKENS = parser.Parser.ID_VAR_TOKENS - { # type: ignore
|
||||
TokenType.TABLE,
|
||||
*parser.Parser.TYPE_TOKENS, # type: ignore
|
||||
}
|
||||
|
||||
def _parse_convert(self, strict):
|
||||
STATEMENT_PARSERS = {
|
||||
**parser.Parser.STATEMENT_PARSERS, # type: ignore
|
||||
TokenType.END: lambda self: self._parse_command(),
|
||||
}
|
||||
|
||||
def _parse_system_time(self) -> t.Optional[exp.Expression]:
    """Parse an optional T-SQL ``FOR SYSTEM_TIME`` clause.

    Handles the forms ``AS OF <expr>``, ``FROM <expr> TO <expr>``,
    ``BETWEEN <expr> AND <expr>``, ``CONTAINED IN (<expr>, <expr>)`` and
    ``ALL``.

    Returns:
        An ``exp.SystemTime`` node whose ``kind`` is one of ``"AS OF"``,
        ``"FROM"``, ``"BETWEEN"``, ``"CONTAINED IN"`` or ``"ALL"``; ``None``
        when the upcoming tokens are not ``FOR SYSTEM_TIME``.
    """
    if not self._match_text_seq("FOR", "SYSTEM_TIME"):
        return None

    if self._match_text_seq("AS", "OF"):
        system_time = self.expression(
            exp.SystemTime, this=self._parse_bitwise(), kind="AS OF"
        )
    elif self._match_set((TokenType.FROM, TokenType.BETWEEN)):
        # The generator compares `kind` against the upper-case literals
        # "FROM" / "BETWEEN", so normalize the matched keyword here.
        # NOTE(review): assumes the token text may carry the casing used in
        # the input SQL — confirm against the tokenizer.
        kind = self._prev.text.upper()
        this = self._parse_bitwise()
        # The separator is optional and either spelling is accepted.
        self._match_texts(("TO", "AND"))
        expression = self._parse_bitwise()
        system_time = self.expression(
            exp.SystemTime, this=this, expression=expression, kind=kind
        )
    elif self._match_text_seq("CONTAINED", "IN"):
        args = self._parse_wrapped_csv(self._parse_bitwise)
        system_time = self.expression(
            exp.SystemTime,
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            kind="CONTAINED IN",
        )
    elif self._match(TokenType.ALL):
        system_time = self.expression(exp.SystemTime, kind="ALL")
    else:
        # Keep the result defined in case raise_error returns instead of
        # raising. NOTE(review): depends on the parser's error handling — confirm.
        system_time = None
        self.raise_error("Unable to parse FOR SYSTEM_TIME clause")

    return system_time
|
||||
|
||||
def _parse_table_parts(self, schema: bool = False) -> exp.Expression:
    """Parse a table reference and attach any trailing ``FOR SYSTEM_TIME`` clause.

    The base parser's result is extended with a ``system_time`` arg holding
    whatever ``_parse_system_time`` returns (``None`` when no clause follows).
    """
    parsed_table = super()._parse_table_parts(schema=schema)
    temporal_clause = self._parse_system_time()
    parsed_table.set("system_time", temporal_clause)
    return parsed_table
|
||||
|
||||
def _parse_returns(self) -> exp.Expression:
    """Parse a ``RETURNS`` clause, capturing an optional leading identifier.

    An identifier restricted to ``RETURNS_TABLE_TOKENS`` is parsed first; the
    rest is delegated to the base parser and the identifier is stored on the
    result under the ``table`` arg.
    """
    table_var = self._parse_id_var(any_token=False, tokens=self.RETURNS_TABLE_TOKENS)
    returns_node = super()._parse_returns()
    returns_node.set("table", table_var)
    return returns_node
|
||||
|
||||
def _parse_convert(self, strict: bool) -> t.Optional[exp.Expression]:
|
||||
to = self._parse_types()
|
||||
self._match(TokenType.COMMA)
|
||||
this = self._parse_conjunction()
|
||||
|
||||
if not to or not this:
|
||||
return None
|
||||
|
||||
# Retrieve length of datatype and override to default if not specified
|
||||
if seq_get(to.expressions, 0) is None and to.this in self.VAR_LENGTH_DATATYPES:
|
||||
to = exp.DataType.build(to.this, expressions=[exp.Literal.number(30)], nested=False)
|
||||
|
||||
# Check whether a conversion with format is applicable
|
||||
if self._match(TokenType.COMMA):
|
||||
format_val = self._parse_number().name
|
||||
if format_val not in TSQL.convert_format_mapping:
|
||||
format_val = self._parse_number()
|
||||
format_val_name = format_val.name if format_val else ""
|
||||
|
||||
if format_val_name not in TSQL.convert_format_mapping:
|
||||
raise ValueError(
|
||||
f"CONVERT function at T-SQL does not support format style {format_val}"
|
||||
f"CONVERT function at T-SQL does not support format style {format_val_name}"
|
||||
)
|
||||
format_norm = exp.Literal.string(TSQL.convert_format_mapping[format_val])
|
||||
|
||||
format_norm = exp.Literal.string(TSQL.convert_format_mapping[format_val_name])
|
||||
|
||||
# Check whether the convert entails a string to date format
|
||||
if to.this == DataType.Type.DATE:
|
||||
|
@ -333,6 +396,21 @@ class TSQL(Dialect):
|
|||
# Entails a simple cast without any format requirement
|
||||
return self.expression(exp.Cast if strict else exp.TryCast, this=this, to=to)
|
||||
|
||||
def _parse_user_defined_function(
    self, kind: t.Optional[TokenType] = None
) -> t.Optional[exp.Expression]:
    """Parse a user-defined function/procedure header.

    The base parser's result is returned unchanged when ``kind`` is
    ``TokenType.FUNCTION``, when the result is already an
    ``exp.UserDefinedFunction``, or when the next token is ``ALIAS``.
    Otherwise a comma-separated list of UDF kwargs is parsed and wrapped,
    together with the base result, in a new ``exp.UserDefinedFunction``.
    """
    parsed = super()._parse_user_defined_function(kind=kind)

    # Guard clauses are checked in the same order as the original `or` chain.
    if kind == TokenType.FUNCTION:
        return parsed
    if isinstance(parsed, exp.UserDefinedFunction):
        return parsed
    if self._match(TokenType.ALIAS, advance=False):
        return parsed

    params = self._parse_csv(self._parse_udf_kwarg)
    return self.expression(exp.UserDefinedFunction, this=parsed, expressions=params)
|
||||
|
||||
class Generator(generator.Generator):
|
||||
TYPE_MAPPING = {
|
||||
**generator.Generator.TYPE_MAPPING, # type: ignore
|
||||
|
@ -354,3 +432,27 @@ class TSQL(Dialect):
|
|||
exp.TimeToStr: _format_sql,
|
||||
exp.GroupConcat: _string_agg_sql,
|
||||
}
|
||||
|
||||
TRANSFORMS.pop(exp.ReturnsProperty)
|
||||
|
||||
def systemtime_sql(self, expression: exp.SystemTime) -> str:
    """Render an ``exp.SystemTime`` node as a T-SQL ``FOR SYSTEM_TIME`` clause.

    Args:
        expression: node whose ``kind`` arg selects the form (``ALL``,
            ``AS OF``, ``FROM``, ``BETWEEN``; anything else renders as
            ``CONTAINED IN``) and whose ``this`` / ``expression`` args hold
            the operands.

    Returns:
        The SQL text of the clause.

    Raises:
        KeyError: if the node has no ``kind`` arg.
    """
    # Match the kind case-insensitively: the parser takes it from the matched
    # token's text, so "from" / "between" must not fall through to the
    # CONTAINED IN rendering below.
    kind = str(expression.args["kind"]).upper()
    if kind == "ALL":
        return "FOR SYSTEM_TIME ALL"

    start = self.sql(expression, "this")
    if kind == "AS OF":
        return f"FOR SYSTEM_TIME AS OF {start}"

    end = self.sql(expression, "expression")
    if kind == "FROM":
        return f"FOR SYSTEM_TIME FROM {start} TO {end}"
    if kind == "BETWEEN":
        return f"FOR SYSTEM_TIME BETWEEN {start} AND {end}"

    return f"FOR SYSTEM_TIME CONTAINED IN ({start}, {end})"
|
||||
|
||||
def returnsproperty_sql(self, expression: exp.ReturnsProperty) -> str:
    """Render a ``RETURNS`` property, prefixing the optional ``table`` arg.

    When the node carries a truthy ``table`` arg, its string form followed by
    a space is emitted between ``RETURNS`` and the rendered ``this`` arg.
    """
    table_name = expression.args.get("table")
    prefix = f"{table_name} " if table_name else ""
    rendered = self.sql(expression, "this")
    return f"RETURNS {prefix}{rendered}"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue