
Merging upstream version 6.1.1.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-02-13 08:04:41 +01:00
parent 3c6d649c90
commit 08ecea3adf
Signed by: daniel
GPG key ID: FBB4F0E80A80222F
61 changed files with 1844 additions and 1555 deletions

@@ -38,6 +38,7 @@ class TokenType(AutoName):
DARROW = auto()
HASH_ARROW = auto()
DHASH_ARROW = auto()
LR_ARROW = auto()
ANNOTATION = auto()
DOLLAR = auto()
@@ -53,6 +54,7 @@ class TokenType(AutoName):
TABLE = auto()
VAR = auto()
BIT_STRING = auto()
HEX_STRING = auto()
# types
BOOLEAN = auto()
@@ -78,10 +80,17 @@ class TokenType(AutoName):
UUID = auto()
GEOGRAPHY = auto()
NULLABLE = auto()
GEOMETRY = auto()
HLLSKETCH = auto()
SUPER = auto()
SERIAL = auto()
SMALLSERIAL = auto()
BIGSERIAL = auto()
# keywords
ADD_FILE = auto()
ALIAS = auto()
ALWAYS = auto()
ALL = auto()
ALTER = auto()
ANALYZE = auto()
@@ -92,11 +101,12 @@ class TokenType(AutoName):
AUTO_INCREMENT = auto()
BEGIN = auto()
BETWEEN = auto()
BOTH = auto()
BUCKET = auto()
BY_DEFAULT = auto()
CACHE = auto()
CALL = auto()
CASE = auto()
CAST = auto()
CHARACTER_SET = auto()
CHECK = auto()
CLUSTER_BY = auto()
@@ -104,7 +114,6 @@ class TokenType(AutoName):
COMMENT = auto()
COMMIT = auto()
CONSTRAINT = auto()
CONVERT = auto()
CREATE = auto()
CROSS = auto()
CUBE = auto()
@@ -127,22 +136,24 @@ class TokenType(AutoName):
EXCEPT = auto()
EXISTS = auto()
EXPLAIN = auto()
EXTRACT = auto()
FALSE = auto()
FETCH = auto()
FILTER = auto()
FINAL = auto()
FIRST = auto()
FOLLOWING = auto()
FOR = auto()
FOREIGN_KEY = auto()
FORMAT = auto()
FULL = auto()
FUNCTION = auto()
FROM = auto()
GENERATED = auto()
GROUP_BY = auto()
GROUPING_SETS = auto()
HAVING = auto()
HINT = auto()
IDENTITY = auto()
IF = auto()
IGNORE_NULLS = auto()
ILIKE = auto()
@@ -159,12 +170,14 @@ class TokenType(AutoName):
JOIN = auto()
LATERAL = auto()
LAZY = auto()
LEADING = auto()
LEFT = auto()
LIKE = auto()
LIMIT = auto()
LOCATION = auto()
MAP = auto()
MOD = auto()
NATURAL = auto()
NEXT = auto()
NO_ACTION = auto()
NULL = auto()
@@ -204,8 +217,10 @@ class TokenType(AutoName):
ROWS = auto()
SCHEMA_COMMENT = auto()
SELECT = auto()
SEPARATOR = auto()
SET = auto()
SHOW = auto()
SIMILAR_TO = auto()
SOME = auto()
SORT_BY = auto()
STORED = auto()
@@ -213,12 +228,11 @@ class TokenType(AutoName):
TABLE_FORMAT = auto()
TABLE_SAMPLE = auto()
TEMPORARY = auto()
TIME = auto()
TOP = auto()
THEN = auto()
TRUE = auto()
TRAILING = auto()
TRUNCATE = auto()
TRY_CAST = auto()
UNBOUNDED = auto()
UNCACHE = auto()
UNION = auto()
@@ -272,35 +286,32 @@ class _Tokenizer(type):
def __new__(cls, clsname, bases, attrs):
klass = super().__new__(cls, clsname, bases, attrs)
klass.QUOTES = dict(
(quote, quote) if isinstance(quote, str) else (quote[0], quote[1])
for quote in klass.QUOTES
)
klass.IDENTIFIERS = dict(
(identifier, identifier)
if isinstance(identifier, str)
else (identifier[0], identifier[1])
for identifier in klass.IDENTIFIERS
)
klass.COMMENTS = dict(
(comment, None) if isinstance(comment, str) else (comment[0], comment[1])
for comment in klass.COMMENTS
klass._QUOTES = cls._delimeter_list_to_dict(klass.QUOTES)
klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
klass._COMMENTS = dict(
(comment, None) if isinstance(comment, str) else (comment[0], comment[1]) for comment in klass.COMMENTS
)
klass.KEYWORD_TRIE = new_trie(
key.upper()
for key, value in {
**klass.KEYWORDS,
**{comment: TokenType.COMMENT for comment in klass.COMMENTS},
**{quote: TokenType.QUOTE for quote in klass.QUOTES},
**{comment: TokenType.COMMENT for comment in klass._COMMENTS},
**{quote: TokenType.QUOTE for quote in klass._QUOTES},
**{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
**{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
}.items()
if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
)
return klass
@staticmethod
def _delimeter_list_to_dict(list):
return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)
class Tokenizer(metaclass=_Tokenizer):
SINGLE_TOKENS = {
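The repeated dict comprehensions over QUOTES and IDENTIFIERS in __new__ are folded into the new _delimeter_list_to_dict helper, which also normalizes the added BIT_STRINGS and HEX_STRINGS lists: a bare string delimiter maps to itself, while a (start, end) pair maps start to end. A minimal sketch of the mapping it builds (the input list below is made up for illustration):

    # Hypothetical mix of a bare string and a (start, end) pair:
    _Tokenizer._delimeter_list_to_dict(["'", ("x'", "'")])
    # -> {"'": "'", "x'": "'"}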
@@ -339,6 +350,10 @@ class Tokenizer(metaclass=_Tokenizer):
QUOTES = ["'"]
BIT_STRINGS = []
HEX_STRINGS = []
IDENTIFIERS = ['"']
ESCAPE = "'"
@@ -357,6 +372,7 @@ class Tokenizer(metaclass=_Tokenizer):
"->>": TokenType.DARROW,
"#>": TokenType.HASH_ARROW,
"#>>": TokenType.DHASH_ARROW,
"<->": TokenType.LR_ARROW,
"ADD ARCHIVE": TokenType.ADD_FILE,
"ADD ARCHIVES": TokenType.ADD_FILE,
"ADD FILE": TokenType.ADD_FILE,
@@ -374,12 +390,12 @@ class Tokenizer(metaclass=_Tokenizer):
"AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
"BEGIN": TokenType.BEGIN,
"BETWEEN": TokenType.BETWEEN,
"BOTH": TokenType.BOTH,
"BUCKET": TokenType.BUCKET,
"CALL": TokenType.CALL,
"CACHE": TokenType.CACHE,
"UNCACHE": TokenType.UNCACHE,
"CASE": TokenType.CASE,
"CAST": TokenType.CAST,
"CHARACTER SET": TokenType.CHARACTER_SET,
"CHECK": TokenType.CHECK,
"CLUSTER BY": TokenType.CLUSTER_BY,
@@ -387,7 +403,6 @@ class Tokenizer(metaclass=_Tokenizer):
"COMMENT": TokenType.SCHEMA_COMMENT,
"COMMIT": TokenType.COMMIT,
"CONSTRAINT": TokenType.CONSTRAINT,
"CONVERT": TokenType.CONVERT,
"CREATE": TokenType.CREATE,
"CROSS": TokenType.CROSS,
"CUBE": TokenType.CUBE,
@@ -408,7 +423,6 @@ class Tokenizer(metaclass=_Tokenizer):
"EXCEPT": TokenType.EXCEPT,
"EXISTS": TokenType.EXISTS,
"EXPLAIN": TokenType.EXPLAIN,
"EXTRACT": TokenType.EXTRACT,
"FALSE": TokenType.FALSE,
"FETCH": TokenType.FETCH,
"FILTER": TokenType.FILTER,
@@ -437,10 +451,12 @@ class Tokenizer(metaclass=_Tokenizer):
"JOIN": TokenType.JOIN,
"LATERAL": TokenType.LATERAL,
"LAZY": TokenType.LAZY,
"LEADING": TokenType.LEADING,
"LEFT": TokenType.LEFT,
"LIKE": TokenType.LIKE,
"LIMIT": TokenType.LIMIT,
"LOCATION": TokenType.LOCATION,
"NATURAL": TokenType.NATURAL,
"NEXT": TokenType.NEXT,
"NO ACTION": TokenType.NO_ACTION,
"NOT": TokenType.NOT,
@@ -490,8 +506,8 @@ class Tokenizer(metaclass=_Tokenizer):
"TEMPORARY": TokenType.TEMPORARY,
"THEN": TokenType.THEN,
"TRUE": TokenType.TRUE,
"TRAILING": TokenType.TRAILING,
"TRUNCATE": TokenType.TRUNCATE,
"TRY_CAST": TokenType.TRY_CAST,
"UNBOUNDED": TokenType.UNBOUNDED,
"UNION": TokenType.UNION,
"UNNEST": TokenType.UNNEST,
@@ -626,14 +642,12 @@ class Tokenizer(metaclass=_Tokenizer):
break
white_space = self.WHITE_SPACE.get(self._char)
identifier_end = self.IDENTIFIERS.get(self._char)
identifier_end = self._IDENTIFIERS.get(self._char)
if white_space:
if white_space == TokenType.BREAK:
self._col = 1
self._line += 1
elif self._char == "0" and self._peek == "x":
self._scan_hex()
elif self._char.isdigit():
self._scan_number()
elif identifier_end:
@@ -666,9 +680,7 @@ class Tokenizer(metaclass=_Tokenizer):
text = self._text if text is None else text
self.tokens.append(Token(token_type, text, self._line, self._col))
if token_type in self.COMMANDS and (
len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON
):
if token_type in self.COMMANDS and (len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON):
self._start = self._current
while not self._end and self._peek != ";":
self._advance()
@@ -725,6 +737,8 @@ class Tokenizer(metaclass=_Tokenizer):
if self._scan_string(word):
return
if self._scan_numeric_string(word):
return
if self._scan_comment(word):
return
@@ -732,10 +746,10 @@ class Tokenizer(metaclass=_Tokenizer):
self._add(self.KEYWORDS[word.upper()])
def _scan_comment(self, comment_start):
if comment_start not in self.COMMENTS:
if comment_start not in self._COMMENTS:
return False
comment_end = self.COMMENTS[comment_start]
comment_end = self._COMMENTS[comment_start]
if comment_end:
comment_end_size = len(comment_end)
@@ -749,15 +763,18 @@ class Tokenizer(metaclass=_Tokenizer):
return True
def _scan_annotation(self):
while (
not self._end
and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK
and self._peek != ","
):
while not self._end and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK and self._peek != ",":
self._advance()
self._add(TokenType.ANNOTATION, self._text[1:])
def _scan_number(self):
if self._char == "0":
peek = self._peek.upper()
if peek == "B":
return self._scan_bits()
elif peek == "X":
return self._scan_hex()
decimal = False
scientific = 0
@@ -788,57 +805,71 @@ class Tokenizer(metaclass=_Tokenizer):
else:
return self._add(TokenType.NUMBER)
def _scan_bits(self):
self._advance()
value = self._extract_value()
try:
self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
except ValueError:
self._add(TokenType.IDENTIFIER)
def _scan_hex(self):
self._advance()
value = self._extract_value()
try:
self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
except ValueError:
self._add(TokenType.IDENTIFIER)
def _extract_value(self):
while True:
char = self._peek.strip()
if char and char not in self.SINGLE_TOKENS:
self._advance()
else:
break
try:
self._add(TokenType.BIT_STRING, f"{int(self._text, 16):b}")
except ValueError:
self._add(TokenType.IDENTIFIER)
return self._text
def _scan_string(self, quote):
quote_end = self.QUOTES.get(quote)
quote_end = self._QUOTES.get(quote)
if quote_end is None:
return False
text = ""
self._advance(len(quote))
quote_end_size = len(quote_end)
while True:
if self._char == self.ESCAPE and self._peek == quote_end:
text += quote
self._advance(2)
else:
if self._chars(quote_end_size) == quote_end:
if quote_end_size > 1:
self._advance(quote_end_size - 1)
break
if self._end:
raise RuntimeError(
f"Missing {quote} from {self._line}:{self._start}"
)
text += self._char
self._advance()
text = self._extract_string(quote_end)
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
text = text.replace("\\\\", "\\") if self.ESCAPE == "\\" else text
self._add(TokenType.STRING, text)
return True
def _scan_numeric_string(self, string_start):
if string_start in self._HEX_STRINGS:
delimiters = self._HEX_STRINGS
token_type = TokenType.HEX_STRING
base = 16
elif string_start in self._BIT_STRINGS:
delimiters = self._BIT_STRINGS
token_type = TokenType.BIT_STRING
base = 2
else:
return False
self._advance(len(string_start))
string_end = delimiters.get(string_start)
text = self._extract_string(string_end)
try:
self._add(token_type, f"{int(text, base)}")
except ValueError:
raise RuntimeError(f"Numeric string contains invalid characters from {self._line}:{self._start}")
return True
def _scan_identifier(self, identifier_end):
while self._peek != identifier_end:
if self._end:
raise RuntimeError(
f"Missing {identifier_end} from {self._line}:{self._start}"
)
raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
self._advance()
self._advance()
self._add(TokenType.IDENTIFIER, self._text[1:-1])
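The new _scan_numeric_string covers dialect-specific prefixed literals: the opening delimiter is looked up in _HEX_STRINGS or _BIT_STRINGS, the body is read with _extract_string, and the token text becomes the decimal rendering via int(text, base), with a RuntimeError if the body is not valid in that base. A rough illustration, assuming a dialect that declares HEX_STRINGS = [("x'", "'")] and BIT_STRINGS = [("b'", "'")] (the base Tokenizer above declares neither):

    # x'1F'   -> HEX_STRING token with text "31"  (int("1F", 16) == 31)
    # b'1010' -> BIT_STRING token with text "10"  (int("1010", 2) == 10)
    # x'zz'   -> RuntimeError: Numeric string contains invalid characters from <line>:<start>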
@@ -851,3 +882,24 @@ class Tokenizer(metaclass=_Tokenizer):
else:
break
self._add(self.KEYWORDS.get(self._text.upper(), TokenType.VAR))
def _extract_string(self, delimiter):
text = ""
delim_size = len(delimiter)
while True:
if self._char == self.ESCAPE and self._peek == delimiter:
text += delimiter
self._advance(2)
else:
if self._chars(delim_size) == delimiter:
if delim_size > 1:
self._advance(delim_size - 1)
break
if self._end:
raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
text += self._char
self._advance()
return text
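Together with the _scan_number dispatch above, bare 0x... and 0b... literals now go through _scan_hex and _scan_bits even in the base tokenizer and come out as decimal-valued HEX_STRING and BIT_STRING tokens. A minimal usage sketch, assuming the tokenizer's public tokenize() method (not shown in these hunks):

    from sqlglot.tokens import Tokenizer, TokenType

    # int() accepts the 0x/0b prefixes for bases 16 and 2, so the token text is
    # the decimal rendering of each literal.
    tokens = Tokenizer().tokenize("SELECT 0x1a, 0b1011")
    pairs = [(t.token_type, t.text) for t in tokens]
    # Expected to contain (TokenType.HEX_STRING, "26") and (TokenType.BIT_STRING, "11")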