
Merging upstream version 12.2.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-13 15:53:39 +01:00
parent fffa0d5761
commit 62b2b24d3b
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
100 changed files with 35022 additions and 30936 deletions


@@ -84,6 +84,10 @@ class TokenType(AutoName):
     UINT = auto()
     BIGINT = auto()
     UBIGINT = auto()
+    INT128 = auto()
+    UINT128 = auto()
+    INT256 = auto()
+    UINT256 = auto()
     FLOAT = auto()
     DOUBLE = auto()
     DECIMAL = auto()
@@ -774,8 +778,6 @@ class Tokenizer(metaclass=_Tokenizer):
         "_end",
         "_peek",
         "_prev_token_line",
-        "_prev_token_comments",
-        "_prev_token_type",
     )

     def __init__(self) -> None:
@@ -795,8 +797,6 @@ class Tokenizer(metaclass=_Tokenizer):
         self._end = False
         self._peek = ""
         self._prev_token_line = -1
-        self._prev_token_comments: t.List[str] = []
-        self._prev_token_type: t.Optional[TokenType] = None

     def tokenize(self, sql: str) -> t.List[Token]:
         """Returns a list of tokens corresponding to the SQL string `sql`."""
@@ -846,7 +846,7 @@ class Tokenizer(metaclass=_Tokenizer):
             return self.sql[start:end]
         return ""

-    def _advance(self, i: int = 1) -> None:
+    def _advance(self, i: int = 1, alnum: bool = False) -> None:
         if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
             self._col = 1
             self._line += 1
@@ -858,14 +858,30 @@ class Tokenizer(metaclass=_Tokenizer):
         self._char = self.sql[self._current - 1]
         self._peek = "" if self._end else self.sql[self._current]

+        if alnum and self._char.isalnum():
+            _col = self._col
+            _current = self._current
+            _end = self._end
+            _peek = self._peek
+
+            while _peek.isalnum():
+                _col += 1
+                _current += 1
+                _end = _current >= self.size
+                _peek = "" if _end else self.sql[_current]
+
+            self._col = _col
+            self._current = _current
+            self._end = _end
+            self._peek = _peek
+            self._char = self.sql[_current - 1]
+
     @property
     def _text(self) -> str:
         return self.sql[self._start : self._current]

     def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
         self._prev_token_line = self._line
-        self._prev_token_comments = self._comments
-        self._prev_token_type = token_type
         self.tokens.append(
             Token(
                 token_type,
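
The fast path added to `_advance` above hoists `_col`, `_current`, `_end` and `_peek` into locals, spins through the alphanumeric run, and writes the cursor state back once. A standalone sketch of that locals-over-attributes micro-optimization (the `Cursor`, `attr_scan` and `local_scan` names are illustrative, not from this diff):

import timeit

class Cursor:
    def __init__(self, text: str) -> None:
        self.text = text
        self.pos = 0

def attr_scan(c: Cursor) -> int:
    # Attribute lookups on every iteration, like advancing one char at a time.
    c.pos = 0
    while c.pos < len(c.text) and c.text[c.pos].isalnum():
        c.pos += 1
    return c.pos

def local_scan(c: Cursor) -> int:
    # Hoist the hot attributes into locals, write back once at the end.
    text, pos = c.text, 0
    while pos < len(text) and text[pos].isalnum():
        pos += 1
    c.pos = pos
    return pos

c = Cursor("abc123" * 1000 + " tail")
assert attr_scan(c) == local_scan(c)
print("attrs :", timeit.timeit(lambda: attr_scan(c), number=500))
print("locals:", timeit.timeit(lambda: local_scan(c), number=500))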
@@ -966,13 +982,13 @@ class Tokenizer(metaclass=_Tokenizer):
                 comment_end_size = len(comment_end)
                 while not self._end and self._chars(comment_end_size) != comment_end:
-                    self._advance()
+                    self._advance(alnum=True)

                 self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
                 self._advance(comment_end_size - 1)
             else:
                 while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
-                    self._advance()
+                    self._advance(alnum=True)

                 self._comments.append(self._text[comment_start_size:])

         # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
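
The comment kept as context above describes the attachment rule: a leading comment goes to the token that follows it, a trailing comment to the token before it. A quick probe of that behavior, assuming sqlglot is installed and `Token.comments` carries the attached comments:

from sqlglot.tokens import Tokenizer

# The trailing comment is expected to land on the preceding token, the "1".
for tok in Tokenizer().tokenize("SELECT 1 /* note */"):
    print(tok.token_type.name, repr(tok.text), tok.comments)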
@@ -988,9 +1004,9 @@ class Tokenizer(metaclass=_Tokenizer):
         if self._char == "0":
             peek = self._peek.upper()
             if peek == "B":
-                return self._scan_bits()
+                return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER)
             elif peek == "X":
-                return self._scan_hex()
+                return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER)

         decimal = False
         scientific = 0
@@ -1033,7 +1049,9 @@ class Tokenizer(metaclass=_Tokenizer):

         self._advance()
         value = self._extract_value()
         try:
-            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
+            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
+            int(value, 2)
+            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
         except ValueError:
             self._add(TokenType.IDENTIFIER)
@@ -1041,7 +1059,9 @@ class Tokenizer(metaclass=_Tokenizer):

         self._advance()
         value = self._extract_value()
         try:
-            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
+            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
+            int(value, 16)
+            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
         except ValueError:
             self._add(TokenType.IDENTIFIER)
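
With the two changes above, `0b...`/`0x...` literals are only routed through `_scan_bits`/`_scan_hex` when the dialect actually defines bit/hex string delimiters, and a validated literal keeps its raw digits (minus the `0b`/`0x` prefix) instead of being rewritten as a decimal integer. A rough check, assuming the base tokenizer defines no bit or hex string syntax and therefore falls back to plain tokens:

from sqlglot.tokens import Tokenizer

for sql in ("SELECT 0b101", "SELECT 0x1F"):
    print(sql, "->", [(t.token_type.name, t.text) for t in Tokenizer().tokenize(sql)])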
@@ -1049,7 +1069,7 @@ class Tokenizer(metaclass=_Tokenizer):
         while True:
             char = self._peek.strip()
             if char and char not in self.SINGLE_TOKENS:
-                self._advance()
+                self._advance(alnum=True)
             else:
                 break
@@ -1066,7 +1086,7 @@ class Tokenizer(metaclass=_Tokenizer):
             self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
             return True

-    # X'1234, b'0110', E'\\\\\' etc.
+    # X'1234', b'0110', E'\\\\\' etc.
     def _scan_formatted_string(self, string_start: str) -> bool:
         if string_start in self._HEX_STRINGS:
             delimiters = self._HEX_STRINGS
@@ -1087,60 +1107,43 @@ class Tokenizer(metaclass=_Tokenizer):
         string_end = delimiters[string_start]
         text = self._extract_string(string_end)

-        if base is None:
-            self._add(token_type, text)
-        else:
+        if base:
             try:
-                self._add(token_type, f"{int(text, base)}")
+                int(text, base)
             except:
                 raise RuntimeError(
                     f"Numeric string contains invalid characters from {self._line}:{self._start}"
                 )
+
+        self._add(token_type, text)

         return True

     def _scan_identifier(self, identifier_end: str) -> None:
-        text = ""
-        identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES
-
-        while True:
-            if self._end:
-                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
-
-            self._advance()
-            if self._char == identifier_end:
-                if identifier_end_is_escape and self._peek == identifier_end:
-                    text += identifier_end
-                    self._advance()
-                    continue
-
-                break
-
-            text += self._char
-
+        self._advance()
+        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
         self._add(TokenType.IDENTIFIER, text)

     def _scan_var(self) -> None:
         while True:
             char = self._peek.strip()
             if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
-                self._advance()
+                self._advance(alnum=True)
             else:
                 break
         self._add(
             TokenType.VAR
-            if self._prev_token_type == TokenType.PARAMETER
+            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
             else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
         )

-    def _extract_string(self, delimiter: str) -> str:
+    def _extract_string(self, delimiter: str, escapes=None) -> str:
         text = ""
         delim_size = len(delimiter)
+        escapes = self._STRING_ESCAPES if escapes is None else escapes

         while True:
-            if self._char in self._STRING_ESCAPES and (
-                self._peek == delimiter or self._peek in self._STRING_ESCAPES
-            ):
+            if self._char in escapes and (self._peek == delimiter or self._peek in escapes):
                 if self._peek == delimiter:
                     text += self._peek
                 else:
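
In the hunk above, `_scan_var` now consults the last emitted token instead of the deleted `_prev_token_type` slot when deciding whether a keyword-shaped name should stay a `VAR`. A small probe, assuming `@` maps to `TokenType.PARAMETER` in the base tokenizer, so the trailing `select` is expected to stay a VAR rather than become the SELECT keyword:

from sqlglot.tokens import Tokenizer

print([(t.token_type.name, t.text) for t in Tokenizer().tokenize("SELECT @select")])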
@@ -1158,7 +1161,9 @@ class Tokenizer(metaclass=_Tokenizer):
                 if self._end:
                     raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")

-                text += self._char
-                self._advance()
+                current = self._current - 1
+                self._advance(alnum=True)
+                text += self.sql[current : self._current - 1]

         return text
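
`_scan_identifier` is now a thin wrapper over `_extract_string`, which takes an optional escape set: identifiers pass `_IDENTIFIER_ESCAPES`, while strings keep `_STRING_ESCAPES` via the `None` default. A quick probe, assuming `"` is both the identifier quote and its escape character in the base tokenizer, so the doubled quote should collapse to a single one in the token text:

from sqlglot.tokens import Tokenizer

print([(t.token_type.name, t.text) for t in Tokenizer().tokenize('SELECT "a""b"')])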