Merging upstream version 11.7.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-13 15:52:09 +01:00
parent 0c053462ae
commit 8d96084fad
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
144 changed files with 44104 additions and 39367 deletions

sqlglot/tokens.py

@@ -87,6 +87,7 @@ class TokenType(AutoName):
FLOAT = auto()
DOUBLE = auto()
DECIMAL = auto()
BIGDECIMAL = auto()
CHAR = auto()
NCHAR = auto()
VARCHAR = auto()
@@ -214,6 +215,7 @@ class TokenType(AutoName):
ISNULL = auto()
JOIN = auto()
JOIN_MARKER = auto()
KEEP = auto()
LANGUAGE = auto()
LATERAL = auto()
LAZY = auto()
@@ -231,6 +233,7 @@ class TokenType(AutoName):
MOD = auto()
NATURAL = auto()
NEXT = auto()
NEXT_VALUE_FOR = auto()
NO_ACTION = auto()
NOTNULL = auto()
NULL = auto()
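
Note on the new enum members: TokenType builds on AutoName, so each member's value should simply mirror its own name, just like the existing entries. A minimal sketch (not part of the diff):

from sqlglot.tokens import TokenType

# AutoName generates the member name as the value, so the additions
# behave exactly like the members that were already there.
assert TokenType.BIGDECIMAL.value == "BIGDECIMAL"
assert TokenType.KEEP.value == "KEEP"
assert TokenType.NEXT_VALUE_FOR.value == "NEXT_VALUE_FOR"
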
@@ -315,7 +318,7 @@ class TokenType(AutoName):
class Token:
__slots__ = ("token_type", "text", "line", "col", "comments")
__slots__ = ("token_type", "text", "line", "col", "end", "comments")
@classmethod
def number(cls, number: int) -> Token:
@@ -343,22 +346,29 @@ class Token:
text: str,
line: int = 1,
col: int = 1,
end: int = 0,
comments: t.List[str] = [],
) -> None:
self.token_type = token_type
self.text = text
self.line = line
self.col = col - len(text)
self.col = self.col if self.col > 1 else 1
size = len(text)
self.col = col
self.end = end if end else size
self.comments = comments
@property
def start(self) -> int:
"""Returns the start of the token."""
return self.end - len(self.text)
def __repr__(self) -> str:
attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
return f"<Token {attributes}>"
class _Tokenizer(type):
def __new__(cls, clsname, bases, attrs): # type: ignore
def __new__(cls, clsname, bases, attrs):
klass = super().__new__(cls, clsname, bases, attrs)
klass._QUOTES = {
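
The Token changes above add an `end` offset and a derived `start` property; a minimal sketch of the new fields, assuming the constructor shown in this hunk:

from sqlglot.tokens import Token, TokenType

# `end` is the exclusive end offset of the token within the SQL text;
# `start` is derived from it as end - len(text), per the property above.
tok = Token(TokenType.VAR, text="foo", line=1, col=8, end=10)
assert tok.end == 10
assert tok.start == 7  # 10 - len("foo")
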
@@ -433,25 +443,25 @@ class Tokenizer(metaclass=_Tokenizer):
"#": TokenType.HASH,
}
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
IDENTIFIER_ESCAPES = ['"']
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
STRING_ESCAPES = ["'"]
VAR_SINGLE_TOKENS: t.Set[str] = set()
_COMMENTS: t.Dict[str, str] = {}
_BIT_STRINGS: t.Dict[str, str] = {}
_BYTE_STRINGS: t.Dict[str, str] = {}
_HEX_STRINGS: t.Dict[str, str] = {}
_IDENTIFIERS: t.Dict[str, str] = {}
_IDENTIFIER_ESCAPES: t.Set[str] = set()
_QUOTES: t.Dict[str, str] = {}
_STRING_ESCAPES: t.Set[str] = set()
IDENTIFIER_ESCAPES = ['"']
_IDENTIFIER_ESCAPES: t.Set[str] = set()
KEYWORDS = {
KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
**{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
**{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
"{{+": TokenType.BLOCK_START,
@@ -553,6 +563,7 @@ class Tokenizer(metaclass=_Tokenizer):
"IS": TokenType.IS,
"ISNULL": TokenType.ISNULL,
"JOIN": TokenType.JOIN,
"KEEP": TokenType.KEEP,
"LATERAL": TokenType.LATERAL,
"LAZY": TokenType.LAZY,
"LEADING": TokenType.LEADING,
@@ -565,6 +576,7 @@ class Tokenizer(metaclass=_Tokenizer):
"MERGE": TokenType.MERGE,
"NATURAL": TokenType.NATURAL,
"NEXT": TokenType.NEXT,
"NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
"NO ACTION": TokenType.NO_ACTION,
"NOT": TokenType.NOT,
"NOTNULL": TokenType.NOTNULL,
@@ -632,6 +644,7 @@ class Tokenizer(metaclass=_Tokenizer):
"UPDATE": TokenType.UPDATE,
"USE": TokenType.USE,
"USING": TokenType.USING,
"UUID": TokenType.UUID,
"VALUES": TokenType.VALUES,
"VIEW": TokenType.VIEW,
"VOLATILE": TokenType.VOLATILE,
@@ -661,6 +674,8 @@ class Tokenizer(metaclass=_Tokenizer):
"INT8": TokenType.BIGINT,
"DEC": TokenType.DECIMAL,
"DECIMAL": TokenType.DECIMAL,
"BIGDECIMAL": TokenType.BIGDECIMAL,
"BIGNUMERIC": TokenType.BIGDECIMAL,
"MAP": TokenType.MAP,
"NULLABLE": TokenType.NULLABLE,
"NUMBER": TokenType.DECIMAL,
@@ -742,7 +757,7 @@ class Tokenizer(metaclass=_Tokenizer):
ENCODE: t.Optional[str] = None
COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
KEYWORD_TRIE = None # autofilled
KEYWORD_TRIE: t.Dict = {} # autofilled
IDENTIFIER_CAN_START_WITH_DIGIT = False
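
The keyword additions in the hunks above (KEEP, NEXT VALUE FOR, UUID, and the BIGDECIMAL/BIGNUMERIC type names) are all matched through this autofilled trie; a hedged sketch of the expected tokens with the base tokenizer:

from sqlglot.tokens import Tokenizer, TokenType

tokens = Tokenizer().tokenize("SELECT NEXT VALUE FOR seq, CAST(x AS BIGNUMERIC)")
types = {t.token_type for t in tokens}
# "NEXT VALUE FOR" should surface as one NEXT_VALUE_FOR token, and both
# BIGNUMERIC and BIGDECIMAL map to the new BIGDECIMAL token type.
assert TokenType.NEXT_VALUE_FOR in types
assert TokenType.BIGDECIMAL in types
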
@@ -776,19 +791,28 @@ class Tokenizer(metaclass=_Tokenizer):
self._col = 1
self._comments: t.List[str] = []
self._char = None
self._end = None
self._peek = None
self._char = ""
self._end = False
self._peek = ""
self._prev_token_line = -1
self._prev_token_comments: t.List[str] = []
self._prev_token_type = None
self._prev_token_type: t.Optional[TokenType] = None
def tokenize(self, sql: str) -> t.List[Token]:
"""Returns a list of tokens corresponding to the SQL string `sql`."""
self.reset()
self.sql = sql
self.size = len(sql)
self._scan()
try:
self._scan()
except Exception as e:
start = self._current - 50
end = self._current + 50
start = start if start > 0 else 0
end = end if end < self.size else self.size - 1
context = self.sql[start:end]
raise ValueError(f"Error tokenizing '{context}'") from e
return self.tokens
def _scan(self, until: t.Optional[t.Callable] = None) -> None:
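
tokenize() now wraps scanning so that a failure reports roughly 50 characters of surrounding SQL; a hedged illustration of the behaviour (the exact message text depends on the input):

from sqlglot.tokens import Tokenizer

try:
    # An unterminated string literal makes the scanner raise; tokenize()
    # re-raises it as a ValueError carrying nearby SQL for context.
    Tokenizer().tokenize("SELECT 'unterminated")
except ValueError as e:
    print(e)  # message embeds up to ~50 characters around the failure point
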
@@ -810,9 +834,12 @@ class Tokenizer(metaclass=_Tokenizer):
if until and until():
break
if self.tokens:
self.tokens[-1].comments.extend(self._comments)
def _chars(self, size: int) -> str:
if size == 1:
return self._char # type: ignore
return self._char
start = self._current - 1
end = start + size
if end <= self.size:
@@ -821,17 +848,15 @@ class Tokenizer(metaclass=_Tokenizer):
def _advance(self, i: int = 1) -> None:
if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
self._set_new_line()
self._col = 1
self._line += 1
else:
self._col += i
self._col += i
self._current += i
self._end = self._current >= self.size # type: ignore
self._char = self.sql[self._current - 1] # type: ignore
self._peek = self.sql[self._current] if self._current < self.size else "" # type: ignore
def _set_new_line(self) -> None:
self._col = 1
self._line += 1
self._end = self._current >= self.size
self._char = self.sql[self._current - 1]
self._peek = "" if self._end else self.sql[self._current]
@property
def _text(self) -> str:
@@ -840,13 +865,14 @@ class Tokenizer(metaclass=_Tokenizer):
def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
self._prev_token_line = self._line
self._prev_token_comments = self._comments
self._prev_token_type = token_type # type: ignore
self._prev_token_type = token_type
self.tokens.append(
Token(
token_type,
self._text if text is None else text,
self._line,
self._col,
self._current,
self._comments,
)
)
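
Since _add now forwards self._current, tokens produced by the tokenizer carry usable offsets; a hedged check for plain keywords and identifiers (quoted strings store their text without delimiters, so they would not round-trip this way):

from sqlglot.tokens import Tokenizer

sql = "SELECT foo FROM bar"
for tok in Tokenizer().tokenize(sql):
    # For these tokens the start/end offsets slice the token text
    # straight back out of the original statement.
    assert sql[tok.start : tok.end] == tok.text
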
@@ -881,7 +907,7 @@ class Tokenizer(metaclass=_Tokenizer):
if skip:
result = 1
else:
result, trie = in_trie(trie, char.upper()) # type: ignore
result, trie = in_trie(trie, char.upper())
if result == 0:
break
@@ -910,7 +936,7 @@ class Tokenizer(metaclass=_Tokenizer):
if not word:
if self._char in self.SINGLE_TOKENS:
self._add(self.SINGLE_TOKENS[self._char], text=self._char) # type: ignore
self._add(self.SINGLE_TOKENS[self._char], text=self._char)
return
self._scan_var()
return
@@ -927,29 +953,31 @@ class Tokenizer(metaclass=_Tokenizer):
self._add(self.KEYWORDS[word], text=word)
def _scan_comment(self, comment_start: str) -> bool:
if comment_start not in self._COMMENTS: # type: ignore
if comment_start not in self._COMMENTS:
return False
comment_start_line = self._line
comment_start_size = len(comment_start)
comment_end = self._COMMENTS[comment_start] # type: ignore
comment_end = self._COMMENTS[comment_start]
if comment_end:
comment_end_size = len(comment_end)
# Skip the comment's start delimiter
self._advance(comment_start_size)
comment_end_size = len(comment_end)
while not self._end and self._chars(comment_end_size) != comment_end:
self._advance()
self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) # type: ignore
self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
self._advance(comment_end_size - 1)
else:
while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
self._advance()
self._comments.append(self._text[comment_start_size:]) # type: ignore
self._comments.append(self._text[comment_start_size:])
# Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
# Multiple consecutive comments are preserved by appending them to the current comments list.
if comment_start_line == self._prev_token_line or self._end:
if comment_start_line == self._prev_token_line:
self.tokens[-1].comments.extend(self._comments)
self._comments = []
self._prev_token_line = self._line
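
As the comment in this hunk notes, a trailing comment attaches to the token before it and a leading comment to the token after it; a small sketch of inspecting that attachment:

from sqlglot.tokens import Tokenizer

sql = "SELECT 1 /* trailing */\n/* leading */ SELECT 2"
for tok in Tokenizer().tokenize(sql):
    # Expect "trailing" on the literal 1 and "leading" on the second SELECT.
    print(tok.token_type, tok.comments)
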
@@ -958,7 +986,7 @@ class Tokenizer(metaclass=_Tokenizer):
def _scan_number(self) -> None:
if self._char == "0":
peek = self._peek.upper() # type: ignore
peek = self._peek.upper()
if peek == "B":
return self._scan_bits()
elif peek == "X":
@@ -968,7 +996,7 @@ class Tokenizer(metaclass=_Tokenizer):
scientific = 0
while True:
if self._peek.isdigit(): # type: ignore
if self._peek.isdigit():
self._advance()
elif self._peek == "." and not decimal:
decimal = True
@@ -976,24 +1004,23 @@ class Tokenizer(metaclass=_Tokenizer):
elif self._peek in ("-", "+") and scientific == 1:
scientific += 1
self._advance()
elif self._peek.upper() == "E" and not scientific: # type: ignore
elif self._peek.upper() == "E" and not scientific:
scientific += 1
self._advance()
elif self._peek.isidentifier(): # type: ignore
elif self._peek.isidentifier():
number_text = self._text
literal = []
literal = ""
while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: # type: ignore
literal.append(self._peek.upper()) # type: ignore
while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
literal += self._peek.upper()
self._advance()
literal = "".join(literal) # type: ignore
token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal)) # type: ignore
token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
if token_type:
self._add(TokenType.NUMBER, number_text)
self._add(TokenType.DCOLON, "::")
return self._add(token_type, literal) # type: ignore
return self._add(token_type, literal)
elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
return self._add(TokenType.VAR)
@@ -1020,7 +1047,7 @@ class Tokenizer(metaclass=_Tokenizer):
def _extract_value(self) -> str:
while True:
char = self._peek.strip() # type: ignore
char = self._peek.strip()
if char and char not in self.SINGLE_TOKENS:
self._advance()
else:
@@ -1029,35 +1056,35 @@ class Tokenizer(metaclass=_Tokenizer):
return self._text
def _scan_string(self, quote: str) -> bool:
quote_end = self._QUOTES.get(quote) # type: ignore
quote_end = self._QUOTES.get(quote)
if quote_end is None:
return False
self._advance(len(quote))
text = self._extract_string(quote_end)
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text # type: ignore
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
return True
# X'1234, b'0110', E'\\\\\' etc.
def _scan_formatted_string(self, string_start: str) -> bool:
if string_start in self._HEX_STRINGS: # type: ignore
delimiters = self._HEX_STRINGS # type: ignore
if string_start in self._HEX_STRINGS:
delimiters = self._HEX_STRINGS
token_type = TokenType.HEX_STRING
base = 16
elif string_start in self._BIT_STRINGS: # type: ignore
delimiters = self._BIT_STRINGS # type: ignore
elif string_start in self._BIT_STRINGS:
delimiters = self._BIT_STRINGS
token_type = TokenType.BIT_STRING
base = 2
elif string_start in self._BYTE_STRINGS: # type: ignore
delimiters = self._BYTE_STRINGS # type: ignore
elif string_start in self._BYTE_STRINGS:
delimiters = self._BYTE_STRINGS
token_type = TokenType.BYTE_STRING
base = None
else:
return False
self._advance(len(string_start))
string_end = delimiters.get(string_start)
string_end = delimiters[string_start]
text = self._extract_string(string_end)
if base is None:
@@ -1083,20 +1110,20 @@ class Tokenizer(metaclass=_Tokenizer):
self._advance()
if self._char == identifier_end:
if identifier_end_is_escape and self._peek == identifier_end:
text += identifier_end # type: ignore
text += identifier_end
self._advance()
continue
break
text += self._char # type: ignore
text += self._char
self._add(TokenType.IDENTIFIER, text)
def _scan_var(self) -> None:
while True:
char = self._peek.strip() # type: ignore
if char and char not in self.SINGLE_TOKENS:
char = self._peek.strip()
if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
self._advance()
else:
break
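
VAR_SINGLE_TOKENS lets a dialect keep selected single-character tokens inside a variable name instead of breaking on them; the base set is empty, so this is a hypothetical subclass just to show the hook:

from sqlglot.tokens import Tokenizer

class MyTokenizer(Tokenizer):  # hypothetical dialect tokenizer
    # Characters listed here may still be defined as SINGLE_TOKENS elsewhere,
    # but _scan_var will now consume them as part of a variable name.
    VAR_SINGLE_TOKENS = {"$"}
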
@@ -1115,9 +1142,9 @@ class Tokenizer(metaclass=_Tokenizer):
self._peek == delimiter or self._peek in self._STRING_ESCAPES
):
if self._peek == delimiter:
text += self._peek # type: ignore
text += self._peek
else:
text += self._char + self._peek # type: ignore
text += self._char + self._peek
if self._current + 1 < self.size:
self._advance(2)
@@ -1131,7 +1158,7 @@ class Tokenizer(metaclass=_Tokenizer):
if self._end:
raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
text += self._char # type: ignore
text += self._char
self._advance()
return text