Merging upstream version 11.7.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-13 15:52:09 +01:00
parent 0c053462ae
commit 8d96084fad
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
144 changed files with 44104 additions and 39367 deletions

sqlglot/tokens.py

@@ -87,6 +87,7 @@ class TokenType(AutoName):
FLOAT = auto()
DOUBLE = auto()
DECIMAL = auto()
BIGDECIMAL = auto()
CHAR = auto()
NCHAR = auto()
VARCHAR = auto()
@@ -214,6 +215,7 @@ class TokenType(AutoName):
ISNULL = auto()
JOIN = auto()
JOIN_MARKER = auto()
KEEP = auto()
LANGUAGE = auto()
LATERAL = auto()
LAZY = auto()
@@ -231,6 +233,7 @@ class TokenType(AutoName):
MOD = auto()
NATURAL = auto()
NEXT = auto()
NEXT_VALUE_FOR = auto()
NO_ACTION = auto()
NOTNULL = auto()
NULL = auto()
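
Note on the new enum members: TokenType builds on AutoName, so each member's value should simply mirror its own name, just like the existing entries. A minimal sketch (not part of the diff):

from sqlglot.tokens import TokenType

# AutoName generates the member name as the value, so the additions
# behave exactly like the members that were already there.
assert TokenType.BIGDECIMAL.value == "BIGDECIMAL"
assert TokenType.KEEP.value == "KEEP"
assert TokenType.NEXT_VALUE_FOR.value == "NEXT_VALUE_FOR"
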
@@ -315,7 +318,7 @@ class TokenType(AutoName):
class Token:
__slots__ = ("token_type", "text", "line", "col", "comments")
__slots__ = ("token_type", "text", "line", "col", "end", "comments")
@classmethod
def number(cls, number: int) -> Token:
@@ -343,22 +346,29 @@ class Token:
text: str,
line: int = 1,
col: int = 1,
end: int = 0,
comments: t.List[str] = [],
) -> None:
self.token_type = token_type
self.text = text
self.line = line
self.col = col - len(text)
self.col = self.col if self.col > 1 else 1
size = len(text)
self.col = col
self.end = end if end else size
self.comments = comments
@property
def start(self) -> int:
"""Returns the start of the token."""
return self.end - len(self.text)
def __repr__(self) -> str:
attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
return f"<Token {attributes}>"
class _Tokenizer(type):
def __new__(cls, clsname, bases, attrs): # type: ignore
def __new__(cls, clsname, bases, attrs):
klass = super().__new__(cls, clsname, bases, attrs)
klass._QUOTES = {
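
The Token changes above add an `end` offset and a derived `start` property; a minimal sketch of the new fields, assuming the constructor shown in this hunk:

from sqlglot.tokens import Token, TokenType

# `end` is the exclusive end offset of the token within the SQL text;
# `start` is derived from it as end - len(text), per the property above.
tok = Token(TokenType.VAR, text="foo", line=1, col=8, end=10)
assert tok.end == 10
assert tok.start == 7  # 10 - len("foo")
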
@@ -433,25 +443,25 @@ class Tokenizer(metaclass=_Tokenizer):
"#": TokenType.HASH,
}
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
IDENTIFIER_ESCAPES = ['"']
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
STRING_ESCAPES = ["'"]
VAR_SINGLE_TOKENS: t.Set[str] = set()
_COMMENTS: t.Dict[str, str] = {}
_BIT_STRINGS: t.Dict[str, str] = {}
_BYTE_STRINGS: t.Dict[str, str] = {}
_HEX_STRINGS: t.Dict[str, str] = {}
_IDENTIFIERS: t.Dict[str, str] = {}
_IDENTIFIER_ESCAPES: t.Set[str] = set()
_QUOTES: t.Dict[str, str] = {}
_STRING_ESCAPES: t.Set[str] = set()
IDENTIFIER_ESCAPES = ['"']
_IDENTIFIER_ESCAPES: t.Set[str] = set()
KEYWORDS = {
KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
**{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
**{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
"{{+": TokenType.BLOCK_START,
@@ -553,6 +563,7 @@ class Tokenizer(metaclass=_Tokenizer):
"IS": TokenType.IS,
"ISNULL": TokenType.ISNULL,
"JOIN": TokenType.JOIN,
"KEEP": TokenType.KEEP,
"LATERAL": TokenType.LATERAL,
"LAZY": TokenType.LAZY,
"LEADING": TokenType.LEADING,
@@ -565,6 +576,7 @@ class Tokenizer(metaclass=_Tokenizer):
"MERGE": TokenType.MERGE,
"NATURAL": TokenType.NATURAL,
"NEXT": TokenType.NEXT,
"NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
"NO ACTION": TokenType.NO_ACTION,
"NOT": TokenType.NOT,
"NOTNULL": TokenType.NOTNULL,
@@ -632,6 +644,7 @@ class Tokenizer(metaclass=_Tokenizer):
"UPDATE": TokenType.UPDATE,
"USE": TokenType.USE,
"USING": TokenType.USING,
"UUID": TokenType.UUID,
"VALUES": TokenType.VALUES,
"VIEW": TokenType.VIEW,
"VOLATILE": TokenType.VOLATILE,
@@ -661,6 +674,8 @@ class Tokenizer(metaclass=_Tokenizer):
"INT8": TokenType.BIGINT,
"DEC": TokenType.DECIMAL,
"DECIMAL": TokenType.DECIMAL,
"BIGDECIMAL": TokenType.BIGDECIMAL,
"BIGNUMERIC": TokenType.BIGDECIMAL,
"MAP": TokenType.MAP,
"NULLABLE": TokenType.NULLABLE,
"NUMBER": TokenType.DECIMAL,
@@ -742,7 +757,7 @@ class Tokenizer(metaclass=_Tokenizer):
ENCODE: t.Optional[str] = None
COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
KEYWORD_TRIE = None # autofilled
KEYWORD_TRIE: t.Dict = {} # autofilled
IDENTIFIER_CAN_START_WITH_DIGIT = False
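
The keyword additions in the hunks above (KEEP, NEXT VALUE FOR, UUID, and the BIGDECIMAL/BIGNUMERIC type names) are all matched through this autofilled trie; a hedged sketch of the expected tokens with the base tokenizer:

from sqlglot.tokens import Tokenizer, TokenType

tokens = Tokenizer().tokenize("SELECT NEXT VALUE FOR seq, CAST(x AS BIGNUMERIC)")
types = {t.token_type for t in tokens}
# "NEXT VALUE FOR" should surface as one NEXT_VALUE_FOR token, and both
# BIGNUMERIC and BIGDECIMAL map to the new BIGDECIMAL token type.
assert TokenType.NEXT_VALUE_FOR in types
assert TokenType.BIGDECIMAL in types
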
@@ -776,19 +791,28 @@ class Tokenizer(metaclass=_Tokenizer):
self._col = 1
self._comments: t.List[str] = []
self._char = None
self._end = None
self._peek = None
self._char = ""
self._end = False
self._peek = ""
self._prev_token_line = -1
self._prev_token_comments: t.List[str] = []
self._prev_token_type = None
self._prev_token_type: t.Optional[TokenType] = None
def tokenize(self, sql: str) -> t.List[Token]:
"""Returns a list of tokens corresponding to the SQL string `sql`."""
self.reset()
self.sql = sql
self.size = len(sql)
self._scan()
try:
self._scan()
except Exception as e:
start = self._current - 50
end = self._current + 50
start = start if start > 0 else 0
end = end if end < self.size else self.size - 1
context = self.sql[start:end]
raise ValueError(f"Error tokenizing '{context}'") from e
return self.tokens
def _scan(self, until: t.Optional[t.Callable] = None) -> None:
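
tokenize() now wraps scanning so that a failure reports roughly 50 characters of surrounding SQL; a hedged illustration of the behaviour (the exact message text depends on the input):

from sqlglot.tokens import Tokenizer

try:
    # An unterminated string literal makes the scanner raise; tokenize()
    # re-raises it as a ValueError carrying nearby SQL for context.
    Tokenizer().tokenize("SELECT 'unterminated")
except ValueError as e:
    print(e)  # message embeds up to ~50 characters around the failure point
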
@@ -810,9 +834,12 @@ class Tokenizer(metaclass=_Tokenizer):
if until and until():
break
if self.tokens:
self.tokens[-1].comments.extend(self._comments)
def _chars(self, size: int) -> str:
if size == 1:
return self._char # type: ignore
return self._char
start = self._current - 1
end = start + size
if end <= self.size:
@@ -821,17 +848,15 @@ class Tokenizer(metaclass=_Tokenizer):
def _advance(self, i: int = 1) -> None:
if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
self._set_new_line()
self._col = 1
self._line += 1
else:
self._col += i
self._col += i
self._current += i
self._end = self._current >= self.size # type: ignore
self._char = self.sql[self._current - 1] # type: ignore
self._peek = self.sql[self._current] if self._current < self.size else "" # type: ignore
def _set_new_line(self) -> None:
self._col = 1
self._line += 1
self._end = self._current >= self.size
self._char = self.sql[self._current - 1]
self._peek = "" if self._end else self.sql[self._current]
@property
def _text(self) -> str:
@@ -840,13 +865,14 @@ class Tokenizer(metaclass=_Tokenizer):
def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
self._prev_token_line = self._line
self._prev_token_comments = self._comments
self._prev_token_type = token_type # type: ignore
self._prev_token_type = token_type
self.tokens.append(
Token(
token_type,
self._text if text is None else text,
self._line,
self._col,
self._current,
self._comments,
)
)
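
Since _add now forwards self._current, tokens produced by the tokenizer carry usable offsets; a hedged check for plain keywords and identifiers (quoted strings store their text without delimiters, so they would not round-trip this way):

from sqlglot.tokens import Tokenizer

sql = "SELECT foo FROM bar"
for tok in Tokenizer().tokenize(sql):
    # For these tokens the start/end offsets slice the token text
    # straight back out of the original statement.
    assert sql[tok.start : tok.end] == tok.text
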
@@ -881,7 +907,7 @@ class Tokenizer(metaclass=_Tokenizer):
if skip:
result = 1
else:
result, trie = in_trie(trie, char.upper()) # type: ignore
result, trie = in_trie(trie, char.upper())
if result == 0:
break
@@ -910,7 +936,7 @@ class Tokenizer(metaclass=_Tokenizer):
if not word:
if self._char in self.SINGLE_TOKENS:
self._add(self.SINGLE_TOKENS[self._char], text=self._char) # type: ignore
self._add(self.SINGLE_TOKENS[self._char], text=self._char)
return
self._scan_var()
return
@@ -927,29 +953,31 @@ class Tokenizer(metaclass=_Tokenizer):
self._add(self.KEYWORDS[word], text=word)
def _scan_comment(self, comment_start: str) -> bool:
if comment_start not in self._COMMENTS: # type: ignore
if comment_start not in self._COMMENTS:
return False
comment_start_line = self._line
comment_start_size = len(comment_start)
comment_end = self._COMMENTS[comment_start] # type: ignore
comment_end = self._COMMENTS[comment_start]
if comment_end:
comment_end_size = len(comment_end)
# Skip the comment's start delimiter
self._advance(comment_start_size)
comment_end_size = len(comment_end)
while not self._end and self._chars(comment_end_size) != comment_end:
self._advance()
self._comments.append(self._text[comment_start_size : -comment_end_size + 1]) # type: ignore
self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
self._advance(comment_end_size - 1)
else:
while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
self._advance()
self._comments.append(self._text[comment_start_size:]) # type: ignore
self._comments.append(self._text[comment_start_size:])
# Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
# Multiple consecutive comments are preserved by appending them to the current comments list.
if comment_start_line == self._prev_token_line or self._end:
if comment_start_line == self._prev_token_line:
self.tokens[-1].comments.extend(self._comments)
self._comments = []
self._prev_token_line = self._line
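
As the comment in this hunk notes, a trailing comment attaches to the token before it and a leading comment to the token after it; a small sketch of inspecting that attachment:

from sqlglot.tokens import Tokenizer

sql = "SELECT 1 /* trailing */\n/* leading */ SELECT 2"
for tok in Tokenizer().tokenize(sql):
    # Expect "trailing" on the literal 1 and "leading" on the second SELECT.
    print(tok.token_type, tok.comments)
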
@@ -958,7 +986,7 @@ class Tokenizer(metaclass=_Tokenizer):
def _scan_number(self) -> None:
if self._char == "0":
peek = self._peek.upper() # type: ignore
peek = self._peek.upper()
if peek == "B":
return self._scan_bits()
elif peek == "X":
@@ -968,7 +996,7 @@ class Tokenizer(metaclass=_Tokenizer):
scientific = 0
while True:
if self._peek.isdigit(): # type: ignore
if self._peek.isdigit():
self._advance()
elif self._peek == "." and not decimal:
decimal = True
@@ -976,24 +1004,23 @@ class Tokenizer(metaclass=_Tokenizer):
elif self._peek in ("-", "+") and scientific == 1:
scientific += 1
self._advance()
elif self._peek.upper() == "E" and not scientific: # type: ignore
elif self._peek.upper() == "E" and not scientific:
scientific += 1
self._advance()
elif self._peek.isidentifier(): # type: ignore
elif self._peek.isidentifier():
number_text = self._text
literal = []
literal = ""
while self._peek.strip() and self._peek not in self.SINGLE_TOKENS: # type: ignore
literal.append(self._peek.upper()) # type: ignore
while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
literal += self._peek.upper()
self._advance()
literal = "".join(literal) # type: ignore
token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal)) # type: ignore
token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))
if token_type:
self._add(TokenType.NUMBER, number_text)
self._add(TokenType.DCOLON, "::")
return self._add(token_type, literal) # type: ignore
return self._add(token_type, literal)
elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
return self._add(TokenType.VAR)
@@ -1020,7 +1047,7 @@ class Tokenizer(metaclass=_Tokenizer):
def _extract_value(self) -> str:
while True:
char = self._peek.strip() # type: ignore
char = self._peek.strip()
if char and char not in self.SINGLE_TOKENS:
self._advance()
else:
@@ -1029,35 +1056,35 @@ class Tokenizer(metaclass=_Tokenizer):
return self._text
def _scan_string(self, quote: str) -> bool:
quote_end = self._QUOTES.get(quote) # type: ignore
quote_end = self._QUOTES.get(quote)
if quote_end is None:
return False
self._advance(len(quote))
text = self._extract_string(quote_end)
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text # type: ignore
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
return True
# X'1234, b'0110', E'\\\\\' etc.
def _scan_formatted_string(self, string_start: str) -> bool:
if string_start in self._HEX_STRINGS: # type: ignore
delimiters = self._HEX_STRINGS # type: ignore
if string_start in self._HEX_STRINGS:
delimiters = self._HEX_STRINGS
token_type = TokenType.HEX_STRING
base = 16
elif string_start in self._BIT_STRINGS: # type: ignore
delimiters = self._BIT_STRINGS # type: ignore
elif string_start in self._BIT_STRINGS:
delimiters = self._BIT_STRINGS
token_type = TokenType.BIT_STRING
base = 2
elif string_start in self._BYTE_STRINGS: # type: ignore
delimiters = self._BYTE_STRINGS # type: ignore
elif string_start in self._BYTE_STRINGS:
delimiters = self._BYTE_STRINGS
token_type = TokenType.BYTE_STRING
base = None
else:
return False
self._advance(len(string_start))
string_end = delimiters.get(string_start)
string_end = delimiters[string_start]
text = self._extract_string(string_end)
if base is None:
@@ -1083,20 +1110,20 @@ class Tokenizer(metaclass=_Tokenizer):
self._advance()
if self._char == identifier_end:
if identifier_end_is_escape and self._peek == identifier_end:
text += identifier_end # type: ignore
text += identifier_end
self._advance()
continue
break
text += self._char # type: ignore
text += self._char
self._add(TokenType.IDENTIFIER, text)
def _scan_var(self) -> None:
while True:
char = self._peek.strip() # type: ignore
if char and char not in self.SINGLE_TOKENS:
char = self._peek.strip()
if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
self._advance()
else:
break
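
VAR_SINGLE_TOKENS lets a dialect keep selected single-character tokens inside a variable name instead of breaking on them; the base set is empty, so this is a hypothetical subclass just to show the hook:

from sqlglot.tokens import Tokenizer

class MyTokenizer(Tokenizer):  # hypothetical dialect tokenizer
    # Characters listed here may still be defined as SINGLE_TOKENS elsewhere,
    # but _scan_var will now consume them as part of a variable name.
    VAR_SINGLE_TOKENS = {"$"}
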
@@ -1115,9 +1142,9 @@ class Tokenizer(metaclass=_Tokenizer):
self._peek == delimiter or self._peek in self._STRING_ESCAPES
):
if self._peek == delimiter:
text += self._peek # type: ignore
text += self._peek
else:
text += self._char + self._peek # type: ignore
text += self._char + self._peek
if self._current + 1 < self.size:
self._advance(2)
@@ -1131,7 +1158,7 @@ class Tokenizer(metaclass=_Tokenizer):
if self._end:
raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
text += self._char # type: ignore
text += self._char
self._advance()
return text