Merging upstream version 12.2.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
parent fffa0d5761
commit 62b2b24d3b
100 changed files with 35022 additions and 30936 deletions
@@ -84,6 +84,10 @@ class TokenType(AutoName):
     UINT = auto()
     BIGINT = auto()
     UBIGINT = auto()
+    INT128 = auto()
+    UINT128 = auto()
+    INT256 = auto()
+    UINT256 = auto()
     FLOAT = auto()
     DOUBLE = auto()
     DECIMAL = auto()
@@ -774,8 +778,6 @@ class Tokenizer(metaclass=_Tokenizer):
         "_end",
         "_peek",
         "_prev_token_line",
-        "_prev_token_comments",
-        "_prev_token_type",
     )
 
     def __init__(self) -> None:
@@ -795,8 +797,6 @@ class Tokenizer(metaclass=_Tokenizer):
         self._end = False
         self._peek = ""
         self._prev_token_line = -1
-        self._prev_token_comments: t.List[str] = []
-        self._prev_token_type: t.Optional[TokenType] = None
 
     def tokenize(self, sql: str) -> t.List[Token]:
         """Returns a list of tokens corresponding to the SQL string `sql`."""
@@ -846,7 +846,7 @@ class Tokenizer(metaclass=_Tokenizer):
             return self.sql[start:end]
         return ""
 
-    def _advance(self, i: int = 1) -> None:
+    def _advance(self, i: int = 1, alnum: bool = False) -> None:
         if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
             self._col = 1
             self._line += 1
@@ -858,14 +858,30 @@ class Tokenizer(metaclass=_Tokenizer):
         self._char = self.sql[self._current - 1]
         self._peek = "" if self._end else self.sql[self._current]
 
+        if alnum and self._char.isalnum():
+            _col = self._col
+            _current = self._current
+            _end = self._end
+            _peek = self._peek
+
+            while _peek.isalnum():
+                _col += 1
+                _current += 1
+                _end = _current >= self.size
+                _peek = "" if _end else self.sql[_current]
+
+            self._col = _col
+            self._current = _current
+            self._end = _end
+            self._peek = _peek
+            self._char = self.sql[_current - 1]
+
     @property
     def _text(self) -> str:
         return self.sql[self._start : self._current]
 
     def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
         self._prev_token_line = self._line
-        self._prev_token_comments = self._comments
-        self._prev_token_type = token_type
         self.tokens.append(
             Token(
                 token_type,
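The `alnum` fast path above is a batching optimization: once the character just consumed is alphanumeric, `_advance` walks to the end of the run using plain local variables and writes the cursor fields (`_col`, `_current`, `_end`, `_peek`, `_char`) back only once, so callers that loop over long identifiers, numbers, or comments no longer pay one attribute-heavy `_advance` call per character. A minimal standalone sketch of the same idea; the `Cursor` class and its names below are illustrative, not sqlglot's API:

# Minimal sketch of the alnum fast path; `Cursor` is illustrative, not sqlglot's API.
class Cursor:
    def __init__(self, sql: str) -> None:
        self.sql = sql
        self.pos = 0  # index of the next character to read

    def advance(self, alnum: bool = False) -> str:
        char = self.sql[self.pos]
        self.pos += 1
        if alnum and char.isalnum():
            # Batch-consume the rest of the alphanumeric run with a local
            # variable, touching the instance attribute only once at the end.
            pos = self.pos
            while pos < len(self.sql) and self.sql[pos].isalnum():
                pos += 1
            self.pos = pos
        return char


cursor = Cursor("SELECT 12345 FROM tbl")
cursor.advance(alnum=True)   # consumes the whole "SELECT" run in one call
print(cursor.pos)            # 6 (now pointing at the space after SELECT)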
@@ -966,13 +982,13 @@ class Tokenizer(metaclass=_Tokenizer):
 
             comment_end_size = len(comment_end)
             while not self._end and self._chars(comment_end_size) != comment_end:
-                self._advance()
+                self._advance(alnum=True)
 
             self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
             self._advance(comment_end_size - 1)
         else:
             while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
-                self._advance()
+                self._advance(alnum=True)
             self._comments.append(self._text[comment_start_size:])
 
         # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
@@ -988,9 +1004,9 @@ class Tokenizer(metaclass=_Tokenizer):
         if self._char == "0":
             peek = self._peek.upper()
             if peek == "B":
-                return self._scan_bits()
+                return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER)
             elif peek == "X":
-                return self._scan_hex()
+                return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER)
 
         decimal = False
         scientific = 0
@@ -1033,7 +1049,9 @@ class Tokenizer(metaclass=_Tokenizer):
         self._advance()
         value = self._extract_value()
         try:
-            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
+            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
+            int(value, 2)
+            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
         except ValueError:
             self._add(TokenType.IDENTIFIER)
 
@@ -1041,7 +1059,9 @@ class Tokenizer(metaclass=_Tokenizer):
         self._advance()
         value = self._extract_value()
         try:
-            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
+            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
+            int(value, 16)
+            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
         except ValueError:
             self._add(TokenType.IDENTIFIER)
 
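Both `_scan_bits` and `_scan_hex` change in the same way: the `0b...`/`0x...` text is now only validated with `int(value, base)`, the token keeps the raw digits with the prefix stripped (instead of the old decimal re-rendering via the f-string), and anything that fails the conversion falls back to an IDENTIFIER token. A rough standalone illustration of that decision logic, using plain strings rather than sqlglot's `TokenType` members:

def classify_prefixed_literal(value):
    """Mimic the new fallback: validate the digits, keep the raw text."""
    if value[:2].lower() == "0b":
        base, token = 2, "BIT_STRING"
    elif value[:2].lower() == "0x":
        base, token = 16, "HEX_STRING"
    else:
        return "NUMBER", value
    try:
        int(value, base)            # validation only; the parsed value is discarded
        return token, value[2:]     # drop the 0b / 0x prefix, keep the digits as written
    except ValueError:
        return "IDENTIFIER", value  # not a valid literal for that base


print(classify_prefixed_literal("0b1010"))  # ('BIT_STRING', '1010')
print(classify_prefixed_literal("0xBEEF"))  # ('HEX_STRING', 'BEEF')
print(classify_prefixed_literal("0b1021"))  # ('IDENTIFIER', '0b1021')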
@@ -1049,7 +1069,7 @@ class Tokenizer(metaclass=_Tokenizer):
         while True:
             char = self._peek.strip()
             if char and char not in self.SINGLE_TOKENS:
-                self._advance()
+                self._advance(alnum=True)
             else:
                 break
 
@@ -1066,7 +1086,7 @@ class Tokenizer(metaclass=_Tokenizer):
         self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
         return True
 
-    # X'1234, b'0110', E'\\\\\' etc.
+    # X'1234', b'0110', E'\\\\\' etc.
     def _scan_formatted_string(self, string_start: str) -> bool:
         if string_start in self._HEX_STRINGS:
             delimiters = self._HEX_STRINGS
@@ -1087,60 +1107,43 @@ class Tokenizer(metaclass=_Tokenizer):
         string_end = delimiters[string_start]
         text = self._extract_string(string_end)
 
-        if base is None:
-            self._add(token_type, text)
-        else:
+        if base:
             try:
-                self._add(token_type, f"{int(text, base)}")
+                int(text, base)
             except:
                 raise RuntimeError(
                     f"Numeric string contains invalid characters from {self._line}:{self._start}"
                 )
+
+        self._add(token_type, text)
         return True
 
     def _scan_identifier(self, identifier_end: str) -> None:
-        text = ""
-        identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES
-
-        while True:
-            if self._end:
-                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
-
-            self._advance()
-            if self._char == identifier_end:
-                if identifier_end_is_escape and self._peek == identifier_end:
-                    text += identifier_end
-                    self._advance()
-                    continue
-
-                break
-
-            text += self._char
-
+        self._advance()
+        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
         self._add(TokenType.IDENTIFIER, text)
 
     def _scan_var(self) -> None:
         while True:
             char = self._peek.strip()
             if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
-                self._advance()
+                self._advance(alnum=True)
             else:
                 break
 
         self._add(
             TokenType.VAR
-            if self._prev_token_type == TokenType.PARAMETER
+            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
             else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
         )
 
-    def _extract_string(self, delimiter: str) -> str:
+    def _extract_string(self, delimiter: str, escapes=None) -> str:
         text = ""
         delim_size = len(delimiter)
+        escapes = self._STRING_ESCAPES if escapes is None else escapes
 
         while True:
-            if self._char in self._STRING_ESCAPES and (
-                self._peek == delimiter or self._peek in self._STRING_ESCAPES
-            ):
+            if self._char in escapes and (self._peek == delimiter or self._peek in escapes):
                 if self._peek == delimiter:
                     text += self._peek
                 else:
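The last part of this hunk is what makes the `_scan_identifier` rewrite possible: `_extract_string` grows an `escapes` parameter that defaults to `self._STRING_ESCAPES`, so quoted identifiers can reuse the same scanning loop by passing `self._IDENTIFIER_ESCAPES` (where the closing quote is typically escaped by doubling it). A condensed, standalone sketch of that convention; `extract_quoted` and its arguments are made up for illustration and only cover the escaped-delimiter case, not escape-of-escape:

def extract_quoted(sql, start, delimiter, escapes):
    """Read up to the closing delimiter; an escape char can double the delimiter."""
    text = []
    i = start
    while i < len(sql):
        char = sql[i]
        if char in escapes and i + 1 < len(sql) and sql[i + 1] == delimiter:
            text.append(delimiter)  # escaped delimiter: keep one copy, skip both chars
            i += 2
            continue
        if char == delimiter:
            return "".join(text)    # unescaped closing delimiter ends the scan
        text.append(char)
        i += 1
    raise RuntimeError(f"Missing {delimiter}")


# Strings often escape with a backslash; quoted identifiers often escape by doubling the quote.
print(extract_quoted("o\\'clock' AS t", 0, "'", {"\\"}))        # o'clock
print(extract_quoted('my ""odd"" col" FROM t', 0, '"', {'"'}))  # my "odd" col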
@@ -1158,7 +1161,9 @@ class Tokenizer(metaclass=_Tokenizer):
 
                 if self._end:
                     raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
-                text += self._char
-                self._advance()
+
+                current = self._current - 1
+                self._advance(alnum=True)
+                text += self.sql[current : self._current - 1]
 
         return text
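This final hunk applies the same batching inside `_extract_string` itself: rather than appending one character per loop iteration, it records the position before `_advance(alnum=True)` and appends the whole slice that was skipped. A tiny plain-Python comparison of the two copying styles; the literal and variable names are made up:

sql = "SELECT 'averylongliteralvalue1234' FROM t"
start = sql.index("'") + 1

# Old style: one string concatenation per character.
i, per_char = start, ""
while sql[i] != "'":
    per_char += sql[i]
    i += 1

# New style: skip the whole alphanumeric run, then take a single slice.
j = start
while sql[j].isalnum():
    j += 1
batched = sql[start:j]

assert per_char == batched == "averylongliteralvalue1234"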