Merging upstream version 6.1.1.
Signed-off-by: Daniel Baumann <daniel@debian.org>
This commit is contained in:
parent 3c6d649c90
commit 08ecea3adf
61 changed files with 1844 additions and 1555 deletions
sqlglot/tokens.py
@@ -38,6 +38,7 @@ class TokenType(AutoName):
    DARROW = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    ANNOTATION = auto()
    DOLLAR = auto()

@@ -53,6 +54,7 @@ class TokenType(AutoName):
    TABLE = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()

    # types
    BOOLEAN = auto()
@@ -78,10 +80,17 @@ class TokenType(AutoName):
    UUID = auto()
    GEOGRAPHY = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    HLLSKETCH = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()

    # keywords
    ADD_FILE = auto()
    ALIAS = auto()
    ALWAYS = auto()
    ALL = auto()
    ALTER = auto()
    ANALYZE = auto()
@@ -92,11 +101,12 @@ class TokenType(AutoName):
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BOTH = auto()
    BUCKET = auto()
    BY_DEFAULT = auto()
    CACHE = auto()
    CALL = auto()
    CASE = auto()
    CAST = auto()
    CHARACTER_SET = auto()
    CHECK = auto()
    CLUSTER_BY = auto()
@@ -104,7 +114,6 @@ class TokenType(AutoName):
    COMMENT = auto()
    COMMIT = auto()
    CONSTRAINT = auto()
    CONVERT = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
@@ -127,22 +136,24 @@ class TokenType(AutoName):
    EXCEPT = auto()
    EXISTS = auto()
    EXPLAIN = auto()
    EXTRACT = auto()
    FALSE = auto()
    FETCH = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOLLOWING = auto()
    FOR = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FULL = auto()
    FUNCTION = auto()
    FROM = auto()
    GENERATED = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IDENTITY = auto()
    IF = auto()
    IGNORE_NULLS = auto()
    ILIKE = auto()
@@ -159,12 +170,14 @@ class TokenType(AutoName):
    JOIN = auto()
    LATERAL = auto()
    LAZY = auto()
    LEADING = auto()
    LEFT = auto()
    LIKE = auto()
    LIMIT = auto()
    LOCATION = auto()
    MAP = auto()
    MOD = auto()
    NATURAL = auto()
    NEXT = auto()
    NO_ACTION = auto()
    NULL = auto()
@@ -204,8 +217,10 @@ class TokenType(AutoName):
    ROWS = auto()
    SCHEMA_COMMENT = auto()
    SELECT = auto()
    SEPARATOR = auto()
    SET = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORT_BY = auto()
    STORED = auto()
@@ -213,12 +228,11 @@ class TokenType(AutoName):
    TABLE_FORMAT = auto()
    TABLE_SAMPLE = auto()
    TEMPORARY = auto()
    TIME = auto()
    TOP = auto()
    THEN = auto()
    TRUE = auto()
    TRAILING = auto()
    TRUNCATE = auto()
    TRY_CAST = auto()
    UNBOUNDED = auto()
    UNCACHE = auto()
    UNION = auto()
@@ -272,35 +286,32 @@ class _Tokenizer(type):
    def __new__(cls, clsname, bases, attrs):
        klass = super().__new__(cls, clsname, bases, attrs)

        klass.QUOTES = dict(
            (quote, quote) if isinstance(quote, str) else (quote[0], quote[1])
            for quote in klass.QUOTES
        )

        klass.IDENTIFIERS = dict(
            (identifier, identifier)
            if isinstance(identifier, str)
            else (identifier[0], identifier[1])
            for identifier in klass.IDENTIFIERS
        )

        klass.COMMENTS = dict(
            (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
            for comment in klass.COMMENTS
        klass._QUOTES = cls._delimeter_list_to_dict(klass.QUOTES)
        klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
        klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
        klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
        klass._COMMENTS = dict(
            (comment, None) if isinstance(comment, str) else (comment[0], comment[1]) for comment in klass.COMMENTS
        )

        klass.KEYWORD_TRIE = new_trie(
            key.upper()
            for key, value in {
                **klass.KEYWORDS,
                **{comment: TokenType.COMMENT for comment in klass.COMMENTS},
                **{quote: TokenType.QUOTE for quote in klass.QUOTES},
                **{comment: TokenType.COMMENT for comment in klass._COMMENTS},
                **{quote: TokenType.QUOTE for quote in klass._QUOTES},
                **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
                **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
            }.items()
            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
        )

        return klass

    @staticmethod
    def _delimeter_list_to_dict(list):
        return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)


class Tokenizer(metaclass=_Tokenizer):
    SINGLE_TOKENS = {
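Note: the hunk above replaces three ad-hoc dict builds with a single _delimeter_list_to_dict helper that normalizes a delimiter list (bare strings or (start, end) pairs) into a start-to-end mapping, and the resulting _BIT_STRINGS/_HEX_STRINGS keys are also folded into KEYWORD_TRIE. A minimal standalone sketch of that normalization (the example lists are illustrative, not taken from this commit):

    def delimiter_list_to_dict(items):
        # same normalization as the _delimeter_list_to_dict staticmethod above
        return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in items)

    print(delimiter_list_to_dict(["'"]))                       # {"'": "'"}
    print(delimiter_list_to_dict([("x'", "'"), ("X'", "'")]))  # {"x'": "'", "X'": "'"}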
@@ -339,6 +350,10 @@ class Tokenizer(metaclass=_Tokenizer):

    QUOTES = ["'"]

    BIT_STRINGS = []

    HEX_STRINGS = []

    IDENTIFIERS = ['"']

    ESCAPE = "'"
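Note: BIT_STRINGS and HEX_STRINGS are new hooks and default to empty, so the base tokenizer is unchanged; a dialect tokenizer can declare its own delimiters and the metaclass turns them into the _BIT_STRINGS/_HEX_STRINGS lookups used further down. A hedged illustration (the delimiter pairs below are hypothetical, not part of this diff):

    from sqlglot.tokens import Tokenizer

    class MyDialectTokenizer(Tokenizer):
        # hypothetical delimiters for literals such as b'1010' and x'1F'
        BIT_STRINGS = [("b'", "'")]
        HEX_STRINGS = [("x'", "'")]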
@@ -357,6 +372,7 @@ class Tokenizer(metaclass=_Tokenizer):
        "->>": TokenType.DARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "ADD ARCHIVE": TokenType.ADD_FILE,
        "ADD ARCHIVES": TokenType.ADD_FILE,
        "ADD FILE": TokenType.ADD_FILE,
@@ -374,12 +390,12 @@ class Tokenizer(metaclass=_Tokenizer):
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "BOTH": TokenType.BOTH,
        "BUCKET": TokenType.BUCKET,
        "CALL": TokenType.CALL,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CAST": TokenType.CAST,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CHECK": TokenType.CHECK,
        "CLUSTER BY": TokenType.CLUSTER_BY,
@@ -387,7 +403,6 @@ class Tokenizer(metaclass=_Tokenizer):
        "COMMENT": TokenType.SCHEMA_COMMENT,
        "COMMIT": TokenType.COMMIT,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "CONVERT": TokenType.CONVERT,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
@@ -408,7 +423,6 @@ class Tokenizer(metaclass=_Tokenizer):
        "EXCEPT": TokenType.EXCEPT,
        "EXISTS": TokenType.EXISTS,
        "EXPLAIN": TokenType.EXPLAIN,
        "EXTRACT": TokenType.EXTRACT,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
@@ -437,10 +451,12 @@ class Tokenizer(metaclass=_Tokenizer):
        "JOIN": TokenType.JOIN,
        "LATERAL": TokenType.LATERAL,
        "LAZY": TokenType.LAZY,
        "LEADING": TokenType.LEADING,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOCATION": TokenType.LOCATION,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NO ACTION": TokenType.NO_ACTION,
        "NOT": TokenType.NOT,
@@ -490,8 +506,8 @@ class Tokenizer(metaclass=_Tokenizer):
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRAILING": TokenType.TRAILING,
        "TRUNCATE": TokenType.TRUNCATE,
        "TRY_CAST": TokenType.TRY_CAST,
        "UNBOUNDED": TokenType.UNBOUNDED,
        "UNION": TokenType.UNION,
        "UNNEST": TokenType.UNNEST,
@@ -626,14 +642,12 @@ class Tokenizer(metaclass=_Tokenizer):
                break

            white_space = self.WHITE_SPACE.get(self._char)
            identifier_end = self.IDENTIFIERS.get(self._char)
            identifier_end = self._IDENTIFIERS.get(self._char)

            if white_space:
                if white_space == TokenType.BREAK:
                    self._col = 1
                    self._line += 1
            elif self._char == "0" and self._peek == "x":
                self._scan_hex()
            elif self._char.isdigit():
                self._scan_number()
            elif identifier_end:
@@ -666,9 +680,7 @@ class Tokenizer(metaclass=_Tokenizer):
        text = self._text if text is None else text
        self.tokens.append(Token(token_type, text, self._line, self._col))

        if token_type in self.COMMANDS and (
            len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON
        ):
        if token_type in self.COMMANDS and (len(self.tokens) == 1 or self.tokens[-2].token_type == TokenType.SEMICOLON):
            self._start = self._current
            while not self._end and self._peek != ";":
                self._advance()
@@ -725,6 +737,8 @@ class Tokenizer(metaclass=_Tokenizer):

            if self._scan_string(word):
                return
            if self._scan_numeric_string(word):
                return
            if self._scan_comment(word):
                return

@@ -732,10 +746,10 @@ class Tokenizer(metaclass=_Tokenizer):
        self._add(self.KEYWORDS[word.upper()])

    def _scan_comment(self, comment_start):
        if comment_start not in self.COMMENTS:
        if comment_start not in self._COMMENTS:
            return False

        comment_end = self.COMMENTS[comment_start]
        comment_end = self._COMMENTS[comment_start]

        if comment_end:
            comment_end_size = len(comment_end)
@@ -749,15 +763,18 @@ class Tokenizer(metaclass=_Tokenizer):
        return True

    def _scan_annotation(self):
        while (
            not self._end
            and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK
            and self._peek != ","
        ):
        while not self._end and self.WHITE_SPACE.get(self._peek) != TokenType.BREAK and self._peek != ",":
            self._advance()
        self._add(TokenType.ANNOTATION, self._text[1:])

    def _scan_number(self):
        if self._char == "0":
            peek = self._peek.upper()
            if peek == "B":
                return self._scan_bits()
            elif peek == "X":
                return self._scan_hex()

        decimal = False
        scientific = 0

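Note: _scan_number now checks for 0b/0x prefixes itself and dispatches to _scan_bits/_scan_hex, which re-emit the literal as its decimal value (int() accepts the 0b/0x prefix when given an explicit base). A quick sketch of the conversions involved:

    # conversions used by _scan_bits / _scan_hex on the scanned text,
    # which still carries its 0b/0x prefix
    assert int("0b101", 2) == 5     # added as a BIT_STRING token with text "5"
    assert int("0x1F", 16) == 31    # added as a HEX_STRING token with text "31"
    # text that fails int() is added as an IDENTIFIER token instead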
@@ -788,57 +805,71 @@ class Tokenizer(metaclass=_Tokenizer):
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self):
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self):
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self):
        while True:
            char = self._peek.strip()
            if char and char not in self.SINGLE_TOKENS:
                self._advance()
            else:
                break
        try:
            self._add(TokenType.BIT_STRING, f"{int(self._text, 16):b}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

        return self._text

    def _scan_string(self, quote):
        quote_end = self.QUOTES.get(quote)
        quote_end = self._QUOTES.get(quote)
        if quote_end is None:
            return False

        text = ""
        self._advance(len(quote))
        quote_end_size = len(quote_end)

        while True:
            if self._char == self.ESCAPE and self._peek == quote_end:
                text += quote
                self._advance(2)
            else:
                if self._chars(quote_end_size) == quote_end:
                    if quote_end_size > 1:
                        self._advance(quote_end_size - 1)
                    break

                if self._end:
                    raise RuntimeError(
                        f"Missing {quote} from {self._line}:{self._start}"
                    )
                text += self._char
                self._advance()
        text = self._extract_string(quote_end)

        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
        text = text.replace("\\\\", "\\") if self.ESCAPE == "\\" else text
        self._add(TokenType.STRING, text)
        return True

    def _scan_numeric_string(self, string_start):
        if string_start in self._HEX_STRINGS:
            delimiters = self._HEX_STRINGS
            token_type = TokenType.HEX_STRING
            base = 16
        elif string_start in self._BIT_STRINGS:
            delimiters = self._BIT_STRINGS
            token_type = TokenType.BIT_STRING
            base = 2
        else:
            return False

        self._advance(len(string_start))
        string_end = delimiters.get(string_start)
        text = self._extract_string(string_end)

        try:
            self._add(token_type, f"{int(text, base)}")
        except ValueError:
            raise RuntimeError(f"Numeric string contains invalid characters from {self._line}:{self._start}")
        return True

    def _scan_identifier(self, identifier_end):
        while self._peek != identifier_end:
            if self._end:
                raise RuntimeError(
                    f"Missing {identifier_end} from {self._line}:{self._start}"
                )
                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")
            self._advance()
        self._advance()
        self._add(TokenType.IDENTIFIER, self._text[1:-1])
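Note: _scan_numeric_string is the new entry point for delimited bit/hex literals; it picks the delimiter table and base, extracts the body with _extract_string, and emits the value in decimal, raising RuntimeError instead of silently falling back when the digits are invalid. A standalone sketch of that dispatch, reusing the hypothetical x'/b' delimiters from the earlier example:

    hex_strings, bit_strings = {"x'": "'"}, {"b'": "'"}

    def numeric_string_value(start, body):
        if start in hex_strings:
            return f"{int(body, 16)}"   # becomes a HEX_STRING token's text
        if start in bit_strings:
            return f"{int(body, 2)}"    # becomes a BIT_STRING token's text
        raise ValueError("not a numeric string")

    print(numeric_string_value("x'", "1F"))   # "31"
    print(numeric_string_value("b'", "101"))  # "5"
    # numeric_string_value("x'", "GG") raises, mirroring the new error path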
@@ -851,3 +882,24 @@ class Tokenizer(metaclass=_Tokenizer):
            else:
                break
        self._add(self.KEYWORDS.get(self._text.upper(), TokenType.VAR))

    def _extract_string(self, delimiter):
        text = ""
        delim_size = len(delimiter)

        while True:
            if self._char == self.ESCAPE and self._peek == delimiter:
                text += delimiter
                self._advance(2)
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
                text += self._char
                self._advance()

        return text
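Note: _extract_string factors the shared delimiter/escape loop out of the old _scan_string body so quoted strings and the new numeric strings use the same scanning. A standalone sketch of the escape rule, assuming the default ESCAPE = "'" and a single-character "'" delimiter (the real method also raises on a missing delimiter, which this sketch omits):

    def extract(chars, delimiter="'", escape="'"):
        text, i = "", 0
        while True:
            if chars[i] == escape and i + 1 < len(chars) and chars[i + 1] == delimiter:
                text += delimiter      # escaped delimiter becomes a literal quote
                i += 2
            elif chars[i] == delimiter:
                return text            # closing delimiter reached
            else:
                text += chars[i]
                i += 1

    print(extract("it''s'"))  # it's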