Merging upstream version 20.3.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
2025-02-13 21:17:51 +01:00 · 2025-02-13 21:17:51 +01:00 · 4d9376ba93
commit 4d9376ba93
parent 2945bcc4f7
132 changed files with 55125 additions and 51576 deletions
--- a/sqlglot/dialects/dialect.py
+++ b/sqlglot/dialects/dialect.py
@ -21,11 +21,14 @@ DATE_ADD_OR_SUB = t.Union[exp.DateAdd, exp.TsOrDsAdd, exp.DateSub]


 class Dialects(str, Enum):
+    """Dialects supported by SQLGlot."""
+
    DIALECT = ""

    BIGQUERY = "bigquery"
    CLICKHOUSE = "clickhouse"
    DATABRICKS = "databricks"
+    DORIS = "doris"
    DRILL = "drill"
    DUCKDB = "duckdb"
    HIVE = "hive"
@ -43,16 +46,22 @@ class Dialects(str, Enum):
    TERADATA = "teradata"
    TRINO = "trino"
    TSQL = "tsql"
-    Doris = "doris"


 class NormalizationStrategy(str, AutoName):
    """Specifies the strategy according to which identifiers should be normalized."""

-    LOWERCASE = auto()  # Unquoted identifiers are lowercased
-    UPPERCASE = auto()  # Unquoted identifiers are uppercased
-    CASE_SENSITIVE = auto()  # Always case-sensitive, regardless of quotes
-    CASE_INSENSITIVE = auto()  # Always case-insensitive, regardless of quotes
+    LOWERCASE = auto()
+    """Unquoted identifiers are lowercased."""
+
+    UPPERCASE = auto()
+    """Unquoted identifiers are uppercased."""
+
+    CASE_SENSITIVE = auto()
+    """Always case-sensitive, regardless of quotes."""
+
+    CASE_INSENSITIVE = auto()
+    """Always case-insensitive, regardless of quotes."""


 class _Dialect(type):
@ -117,6 +126,7 @@ class _Dialect(type):
        klass.BIT_START, klass.BIT_END = get_start_end(TokenType.BIT_STRING)
        klass.HEX_START, klass.HEX_END = get_start_end(TokenType.HEX_STRING)
        klass.BYTE_START, klass.BYTE_END = get_start_end(TokenType.BYTE_STRING)
+        klass.UNICODE_START, klass.UNICODE_END = get_start_end(TokenType.UNICODE_STRING)

        if enum not in ("", "bigquery"):
            klass.generator_class.SELECT_KINDS = ()
@ -131,74 +141,84 @@ class _Dialect(type):


 class Dialect(metaclass=_Dialect):
-    # Determines the base index offset for arrays
    INDEX_OFFSET = 0
+    """Determines the base index offset for arrays."""
+
+    WEEK_OFFSET = 0
+    """Determines the first day of the week used by `DATE_TRUNC(week)`. Defaults to 0 (Monday); -1 would be Sunday."""

-    # If true unnest table aliases are considered only as column aliases
    UNNEST_COLUMN_ONLY = False
+    """Determines whether or not `UNNEST` table aliases are treated as column aliases."""

-    # Determines whether or not the table alias comes after tablesample
    ALIAS_POST_TABLESAMPLE = False
+    """Determines whether or not the table alias comes after tablesample."""

-    # Specifies the strategy according to which identifiers should be normalized.
    NORMALIZATION_STRATEGY = NormalizationStrategy.LOWERCASE
+    """Specifies the strategy according to which identifiers should be normalized."""

-    # Determines whether or not an unquoted identifier can start with a digit
    IDENTIFIERS_CAN_START_WITH_DIGIT = False
+    """Determines whether or not an unquoted identifier can start with a digit."""

-    # Determines whether or not the DPIPE token ('||') is a string concatenation operator
    DPIPE_IS_STRING_CONCAT = True
+    """Determines whether or not the DPIPE token (`||`) is a string concatenation operator."""

-    # Determines whether or not CONCAT's arguments must be strings
    STRICT_STRING_CONCAT = False
+    """Determines whether or not `CONCAT`'s arguments must be strings."""

-    # Determines whether or not user-defined data types are supported
    SUPPORTS_USER_DEFINED_TYPES = True
+    """Determines whether or not user-defined data types are supported."""

-    # Determines whether or not SEMI/ANTI JOINs are supported
    SUPPORTS_SEMI_ANTI_JOIN = True
+    """Determines whether or not `SEMI` or `ANTI` joins are supported."""

-    # Determines how function names are going to be normalized
    NORMALIZE_FUNCTIONS: bool | str = "upper"
+    """Determines how function names are going to be normalized."""

-    # Determines whether the base comes first in the LOG function
    LOG_BASE_FIRST = True
+    """Determines whether the base comes first in the `LOG` function."""

-    # Indicates the default null ordering method to use if not explicitly set
-    # Options are: "nulls_are_small", "nulls_are_large", "nulls_are_last"
    NULL_ORDERING = "nulls_are_small"
+    """
+    Indicates the default `NULL` ordering method to use if not explicitly set.
+    Possible values: `"nulls_are_small"`, `"nulls_are_large"`, `"nulls_are_last"`
+    """

-    # Whether the behavior of a / b depends on the types of a and b.
-    # False means a / b is always float division.
-    # True means a / b is integer division if both a and b are integers.
    TYPED_DIVISION = False
+    """
+    Whether the behavior of `a / b` depends on the types of `a` and `b`.
+    False means `a / b` is always float division.
+    True means `a / b` is integer division if both `a` and `b` are integers.
+    """

-    # False means 1 / 0 throws an error.
-    # True means 1 / 0 returns null.
    SAFE_DIVISION = False
+    """Determines whether division by zero throws an error (`False`) or returns NULL (`True`)."""

-    # A NULL arg in CONCAT yields NULL by default, but in some dialects it yields an empty string
    CONCAT_COALESCE = False
+    """A `NULL` arg in `CONCAT` yields `NULL` by default, but in some dialects it yields an empty string."""

    DATE_FORMAT = "'%Y-%m-%d'"
    DATEINT_FORMAT = "'%Y%m%d'"
    TIME_FORMAT = "'%Y-%m-%d %H:%M:%S'"

-    # Custom time mappings in which the key represents dialect time format
-    # and the value represents a python time format
    TIME_MAPPING: t.Dict[str, str] = {}
+    """Associates this dialect's time formats with their equivalent Python `strftime` format."""

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_model_rules_date_time
    # https://docs.teradata.com/r/Teradata-Database-SQL-Functions-Operators-Expressions-and-Predicates/March-2017/Data-Type-Conversions/Character-to-DATE-Conversion/Forcing-a-FORMAT-on-CAST-for-Converting-Character-to-DATE
-    # special syntax cast(x as date format 'yyyy') defaults to time_mapping
    FORMAT_MAPPING: t.Dict[str, str] = {}
+    """
+    Helper which is used for parsing the special syntax `CAST(x AS DATE FORMAT 'yyyy')`.
+    If empty, the corresponding trie will be constructed off of `TIME_MAPPING`.
+    """

-    # Mapping of an unescaped escape sequence to the corresponding character
    ESCAPE_SEQUENCES: t.Dict[str, str] = {}
+    """Mapping of an unescaped escape sequence to the corresponding character."""

-    # Columns that are auto-generated by the engine corresponding to this dialect
-    # Such columns may be excluded from SELECT * queries, for example
    PSEUDOCOLUMNS: t.Set[str] = set()
+    """
+    Columns that are auto-generated by the engine corresponding to this dialect.
+    For example, such columns may be excluded from `SELECT *` queries.
+    """

    # --- Autofilled ---

@ -221,13 +241,15 @@ class Dialect(metaclass=_Dialect):
    IDENTIFIER_START = '"'
    IDENTIFIER_END = '"'

-    # Delimiters for bit, hex and byte literals
+    # Delimiters for bit, hex, byte and unicode literals
    BIT_START: t.Optional[str] = None
    BIT_END: t.Optional[str] = None
    HEX_START: t.Optional[str] = None
    HEX_END: t.Optional[str] = None
    BYTE_START: t.Optional[str] = None
    BYTE_END: t.Optional[str] = None
+    UNICODE_START: t.Optional[str] = None
+    UNICODE_END: t.Optional[str] = None

    @classmethod
    def get_or_raise(cls, dialect: DialectType) -> Dialect:
@ -275,6 +297,7 @@ class Dialect(metaclass=_Dialect):
    def format_time(
        cls, expression: t.Optional[str | exp.Expression]
    ) -> t.Optional[exp.Expression]:
+        """Converts a time format in this dialect to its equivalent Python `strftime` format."""
        if isinstance(expression, str):
            return exp.Literal.string(
                # the time formats are quoted
@ -306,9 +329,9 @@ class Dialect(metaclass=_Dialect):
        """
        Transforms an identifier in a way that resembles how it'd be resolved by this dialect.

-        For example, an identifier like FoO would be resolved as foo in Postgres, because it
+        For example, an identifier like `FoO` would be resolved as `foo` in Postgres, because it
        lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so
-        it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive,
+        it would resolve it as `FOO`. If it was quoted, it'd need to be treated as case-sensitive,
        and so any normalization would be prohibited in order to avoid "breaking" the identifier.

        There are also dialects like Spark, which are case-insensitive even when quotes are
@ -356,8 +379,8 @@ class Dialect(metaclass=_Dialect):
        Args:
            text: The text to check.
            identify:
-                "always" or `True`: Always returns true.
-                "safe": True if the identifier is case-insensitive.
+                `"always"` or `True`: Always returns `True`.
+                `"safe"`: Only returns `True` if the identifier is case-insensitive.

        Returns:
            Whether or not the given text can be identified.
@ -371,6 +394,14 @@ class Dialect(metaclass=_Dialect):
        return False

    def quote_identifier(self, expression: E, identify: bool = True) -> E:
+        """
+        Adds quotes to a given identifier.
+
+        Args:
+            expression: The expression of interest. If it's not an `Identifier`, this method is a no-op.
+            identify: If set to `False`, the quotes will only be added if the identifier is deemed
+                "unsafe", with respect to its characters and this dialect's normalization strategy.
+        """
        if isinstance(expression, exp.Identifier):
            name = expression.this
            expression.set(