Merging upstream version 15.0.0.
Signed-off-by: Daniel Baumann <daniel@debian.org>
parent 8deb804d23
commit fc63828ee4
167 changed files with 58268 additions and 51337 deletions
@@ -51,7 +51,6 @@ class TokenType(AutoName):
DOLLAR = auto()
PARAMETER = auto()
SESSION_PARAMETER = auto()
NATIONAL = auto()
DAMP = auto()

BLOCK_START = auto()
@@ -72,6 +71,8 @@ class TokenType(AutoName):
BIT_STRING = auto()
HEX_STRING = auto()
BYTE_STRING = auto()
NATIONAL_STRING = auto()
RAW_STRING = auto()

# types
BIT = auto()
@@ -110,6 +111,7 @@ class TokenType(AutoName):
TIMESTAMPTZ = auto()
TIMESTAMPLTZ = auto()
DATETIME = auto()
DATETIME64 = auto()
DATE = auto()
UUID = auto()
GEOGRAPHY = auto()
@@ -142,30 +144,22 @@ class TokenType(AutoName):
ARRAY = auto()
ASC = auto()
ASOF = auto()
AT_TIME_ZONE = auto()
AUTO_INCREMENT = auto()
BEGIN = auto()
BETWEEN = auto()
BOTH = auto()
BUCKET = auto()
BY_DEFAULT = auto()
CACHE = auto()
CASCADE = auto()
CASE = auto()
CHARACTER_SET = auto()
CLUSTER_BY = auto()
COLLATE = auto()
COMMAND = auto()
COMMENT = auto()
COMMIT = auto()
COMPOUND = auto()
CONSTRAINT = auto()
CREATE = auto()
CROSS = auto()
CUBE = auto()
CURRENT_DATE = auto()
CURRENT_DATETIME = auto()
CURRENT_ROW = auto()
CURRENT_TIME = auto()
CURRENT_TIMESTAMP = auto()
CURRENT_USER = auto()
@@ -174,8 +168,6 @@ class TokenType(AutoName):
DESC = auto()
DESCRIBE = auto()
DISTINCT = auto()
DISTINCT_FROM = auto()
DISTRIBUTE_BY = auto()
DIV = auto()
DROP = auto()
ELSE = auto()
@@ -189,7 +181,6 @@ class TokenType(AutoName):
FILTER = auto()
FINAL = auto()
FIRST = auto()
FOLLOWING = auto()
FOR = auto()
FOREIGN_KEY = auto()
FORMAT = auto()
@@ -203,7 +194,6 @@ class TokenType(AutoName):
HAVING = auto()
HINT = auto()
IF = auto()
IGNORE_NULLS = auto()
ILIKE = auto()
ILIKE_ANY = auto()
IN = auto()
@@ -222,36 +212,27 @@ class TokenType(AutoName):
KEEP = auto()
LANGUAGE = auto()
LATERAL = auto()
LAZY = auto()
LEADING = auto()
LEFT = auto()
LIKE = auto()
LIKE_ANY = auto()
LIMIT = auto()
LOAD_DATA = auto()
LOCAL = auto()
LOAD = auto()
LOCK = auto()
MAP = auto()
MATCH_RECOGNIZE = auto()
MATERIALIZED = auto()
MERGE = auto()
MOD = auto()
NATURAL = auto()
NEXT = auto()
NEXT_VALUE_FOR = auto()
NO_ACTION = auto()
NOTNULL = auto()
NULL = auto()
NULLS_FIRST = auto()
NULLS_LAST = auto()
OFFSET = auto()
ON = auto()
ONLY = auto()
OPTIONS = auto()
ORDER_BY = auto()
ORDERED = auto()
ORDINALITY = auto()
OUTER = auto()
OUT_OF = auto()
OVER = auto()
OVERLAPS = auto()
OVERWRITE = auto()
@@ -261,7 +242,6 @@ class TokenType(AutoName):
PIVOT = auto()
PLACEHOLDER = auto()
PRAGMA = auto()
PRECEDING = auto()
PRIMARY_KEY = auto()
PROCEDURE = auto()
PROPERTIES = auto()
@@ -271,7 +251,6 @@ class TokenType(AutoName):
RANGE = auto()
RECURSIVE = auto()
REPLACE = auto()
RESPECT_NULLS = auto()
RETURNING = auto()
REFERENCES = auto()
RIGHT = auto()
@@ -280,28 +259,23 @@ class TokenType(AutoName):
ROLLUP = auto()
ROW = auto()
ROWS = auto()
SEED = auto()
SELECT = auto()
SEMI = auto()
SEPARATOR = auto()
SERDE_PROPERTIES = auto()
SET = auto()
SETTINGS = auto()
SHOW = auto()
SIMILAR_TO = auto()
SOME = auto()
SORTKEY = auto()
SORT_BY = auto()
STRUCT = auto()
TABLE_SAMPLE = auto()
TEMPORARY = auto()
TOP = auto()
THEN = auto()
TRAILING = auto()
TRUE = auto()
UNBOUNDED = auto()
UNCACHE = auto()
UNION = auto()
UNLOGGED = auto()
UNNEST = auto()
UNPIVOT = auto()
UPDATE = auto()
@@ -314,15 +288,11 @@ class TokenType(AutoName):
WHERE = auto()
WINDOW = auto()
WITH = auto()
WITH_TIME_ZONE = auto()
WITH_LOCAL_TIME_ZONE = auto()
WITHIN_GROUP = auto()
WITHOUT_TIME_ZONE = auto()
UNIQUE = auto()


class Token:
__slots__ = ("token_type", "text", "line", "col", "end", "comments")
__slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")

@classmethod
def number(cls, number: int) -> Token:
@@ -350,22 +320,28 @@ class Token:
text: str,
line: int = 1,
col: int = 1,
start: int = 0,
end: int = 0,
comments: t.List[str] = [],
) -> None:
"""Token initializer.

Args:
token_type: The TokenType Enum.
text: The text of the token.
line: The line that the token ends on.
col: The column that the token ends on.
start: The start index of the token.
end: The ending index of the token.
"""
self.token_type = token_type
self.text = text
self.line = line
size = len(text)
self.col = col
self.end = end if end else size
self.start = start
self.end = end
self.comments = comments

@property
def start(self) -> int:
"""Returns the start of the token."""
return self.end - len(self.text)

def __repr__(self) -> str:
attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
return f"<Token {attributes}>"
@@ -375,15 +351,31 @@ class _Tokenizer(type):
def __new__(cls, clsname, bases, attrs):
klass = super().__new__(cls, clsname, bases, attrs)

klass._QUOTES = {
f"{prefix}{s}": e
for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items()
for prefix in (("",) if s[0].isalpha() else ("", "n", "N"))
def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
return dict(
(item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
)

def _quotes_to_format(
token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
) -> t.Dict[str, t.Tuple[str, TokenType]]:
return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}

klass._QUOTES = _convert_quotes(klass.QUOTES)
klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)

klass._FORMAT_STRINGS = {
**{
p + s: (e, TokenType.NATIONAL_STRING)
for s, e in klass._QUOTES.items()
for p in ("n", "N")
},
**_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
**_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
**_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
**_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
}
klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS)
klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)

klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
klass._COMMENTS = dict(
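For reference, a standalone sketch of what the two helpers in the hunk above compute (hypothetical names mirroring _convert_quotes and _quotes_to_format; the real code attaches the resulting dicts to the Tokenizer class):

import typing as t
from enum import Enum, auto

class TokenType(Enum):  # stub standing in for sqlglot's TokenType
    HEX_STRING = auto()

def convert_quotes(arr: t.List[t.Union[str, t.Tuple[str, str]]]) -> t.Dict[str, str]:
    # A bare string is both the start and end delimiter; a tuple is (start, end).
    return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr)

def quotes_to_format(token_type: TokenType, arr: t.List[t.Union[str, t.Tuple[str, str]]]) -> t.Dict[str, t.Tuple[str, TokenType]]:
    # Tag each end delimiter with the token type it should produce.
    return {k: (v, token_type) for k, v in convert_quotes(arr).items()}

print(convert_quotes(["'", ("$$", "$$")]))                  # {"'": "'", '$$': '$$'}
print(quotes_to_format(TokenType.HEX_STRING, [("x'", "'")]))  # {"x'": ("'", TokenType.HEX_STRING)}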
@@ -393,23 +385,17 @@ class _Tokenizer(type):

klass.KEYWORD_TRIE = new_trie(
key.upper()
for key in {
**klass.KEYWORDS,
**{comment: TokenType.COMMENT for comment in klass._COMMENTS},
**{quote: TokenType.QUOTE for quote in klass._QUOTES},
**{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
**{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
**{byte_string: TokenType.BYTE_STRING for byte_string in klass._BYTE_STRINGS},
}
for key in (
*klass.KEYWORDS,
*klass._COMMENTS,
*klass._QUOTES,
*klass._FORMAT_STRINGS,
)
if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
)

return klass

@staticmethod
def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)


class Tokenizer(metaclass=_Tokenizer):
SINGLE_TOKENS = {
@@ -450,6 +436,7 @@ class Tokenizer(metaclass=_Tokenizer):
BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
IDENTIFIER_ESCAPES = ['"']
QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
@@ -457,9 +444,7 @@ class Tokenizer(metaclass=_Tokenizer):
VAR_SINGLE_TOKENS: t.Set[str] = set()

_COMMENTS: t.Dict[str, str] = {}
_BIT_STRINGS: t.Dict[str, str] = {}
_BYTE_STRINGS: t.Dict[str, str] = {}
_HEX_STRINGS: t.Dict[str, str] = {}
_FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
_IDENTIFIERS: t.Dict[str, str] = {}
_IDENTIFIER_ESCAPES: t.Set[str] = set()
_QUOTES: t.Dict[str, str] = {}
@@ -495,30 +480,22 @@ class Tokenizer(metaclass=_Tokenizer):
"ANY": TokenType.ANY,
"ASC": TokenType.ASC,
"AS": TokenType.ALIAS,
"AT TIME ZONE": TokenType.AT_TIME_ZONE,
"AUTOINCREMENT": TokenType.AUTO_INCREMENT,
"AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
"BEGIN": TokenType.BEGIN,
"BETWEEN": TokenType.BETWEEN,
"BOTH": TokenType.BOTH,
"BUCKET": TokenType.BUCKET,
"BY DEFAULT": TokenType.BY_DEFAULT,
"CACHE": TokenType.CACHE,
"UNCACHE": TokenType.UNCACHE,
"CASE": TokenType.CASE,
"CASCADE": TokenType.CASCADE,
"CHARACTER SET": TokenType.CHARACTER_SET,
"CLUSTER BY": TokenType.CLUSTER_BY,
"COLLATE": TokenType.COLLATE,
"COLUMN": TokenType.COLUMN,
"COMMIT": TokenType.COMMIT,
"COMPOUND": TokenType.COMPOUND,
"CONSTRAINT": TokenType.CONSTRAINT,
"CREATE": TokenType.CREATE,
"CROSS": TokenType.CROSS,
"CUBE": TokenType.CUBE,
"CURRENT_DATE": TokenType.CURRENT_DATE,
"CURRENT ROW": TokenType.CURRENT_ROW,
"CURRENT_TIME": TokenType.CURRENT_TIME,
"CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
"CURRENT_USER": TokenType.CURRENT_USER,
@@ -528,8 +505,6 @@ class Tokenizer(metaclass=_Tokenizer):
"DESC": TokenType.DESC,
"DESCRIBE": TokenType.DESCRIBE,
"DISTINCT": TokenType.DISTINCT,
"DISTINCT FROM": TokenType.DISTINCT_FROM,
"DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
"DIV": TokenType.DIV,
"DROP": TokenType.DROP,
"ELSE": TokenType.ELSE,
@@ -544,18 +519,18 @@ class Tokenizer(metaclass=_Tokenizer):
"FIRST": TokenType.FIRST,
"FULL": TokenType.FULL,
"FUNCTION": TokenType.FUNCTION,
"FOLLOWING": TokenType.FOLLOWING,
"FOR": TokenType.FOR,
"FOREIGN KEY": TokenType.FOREIGN_KEY,
"FORMAT": TokenType.FORMAT,
"FROM": TokenType.FROM,
"GEOGRAPHY": TokenType.GEOGRAPHY,
"GEOMETRY": TokenType.GEOMETRY,
"GLOB": TokenType.GLOB,
"GROUP BY": TokenType.GROUP_BY,
"GROUPING SETS": TokenType.GROUPING_SETS,
"HAVING": TokenType.HAVING,
"IF": TokenType.IF,
"ILIKE": TokenType.ILIKE,
"IGNORE NULLS": TokenType.IGNORE_NULLS,
"IN": TokenType.IN,
"INDEX": TokenType.INDEX,
"INET": TokenType.INET,
@@ -569,34 +544,25 @@ class Tokenizer(metaclass=_Tokenizer):
"JOIN": TokenType.JOIN,
"KEEP": TokenType.KEEP,
"LATERAL": TokenType.LATERAL,
"LAZY": TokenType.LAZY,
"LEADING": TokenType.LEADING,
"LEFT": TokenType.LEFT,
"LIKE": TokenType.LIKE,
"LIMIT": TokenType.LIMIT,
"LOAD DATA": TokenType.LOAD_DATA,
"LOCAL": TokenType.LOCAL,
"MATERIALIZED": TokenType.MATERIALIZED,
"LOAD": TokenType.LOAD,
"LOCK": TokenType.LOCK,
"MERGE": TokenType.MERGE,
"NATURAL": TokenType.NATURAL,
"NEXT": TokenType.NEXT,
"NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
"NO ACTION": TokenType.NO_ACTION,
"NOT": TokenType.NOT,
"NOTNULL": TokenType.NOTNULL,
"NULL": TokenType.NULL,
"NULLS FIRST": TokenType.NULLS_FIRST,
"NULLS LAST": TokenType.NULLS_LAST,
"OBJECT": TokenType.OBJECT,
"OFFSET": TokenType.OFFSET,
"ON": TokenType.ON,
"ONLY": TokenType.ONLY,
"OPTIONS": TokenType.OPTIONS,
"OR": TokenType.OR,
"ORDER BY": TokenType.ORDER_BY,
"ORDINALITY": TokenType.ORDINALITY,
"OUTER": TokenType.OUTER,
"OUT OF": TokenType.OUT_OF,
"OVER": TokenType.OVER,
"OVERLAPS": TokenType.OVERLAPS,
"OVERWRITE": TokenType.OVERWRITE,
@@ -607,7 +573,6 @@ class Tokenizer(metaclass=_Tokenizer):
"PERCENT": TokenType.PERCENT,
"PIVOT": TokenType.PIVOT,
"PRAGMA": TokenType.PRAGMA,
"PRECEDING": TokenType.PRECEDING,
"PRIMARY KEY": TokenType.PRIMARY_KEY,
"PROCEDURE": TokenType.PROCEDURE,
"QUALIFY": TokenType.QUALIFY,
@@ -615,7 +580,6 @@ class Tokenizer(metaclass=_Tokenizer):
"RECURSIVE": TokenType.RECURSIVE,
"REGEXP": TokenType.RLIKE,
"REPLACE": TokenType.REPLACE,
"RESPECT NULLS": TokenType.RESPECT_NULLS,
"REFERENCES": TokenType.REFERENCES,
"RIGHT": TokenType.RIGHT,
"RLIKE": TokenType.RLIKE,
@@ -624,25 +588,20 @@ class Tokenizer(metaclass=_Tokenizer):
"ROW": TokenType.ROW,
"ROWS": TokenType.ROWS,
"SCHEMA": TokenType.SCHEMA,
"SEED": TokenType.SEED,
"SELECT": TokenType.SELECT,
"SEMI": TokenType.SEMI,
"SET": TokenType.SET,
"SETTINGS": TokenType.SETTINGS,
"SHOW": TokenType.SHOW,
"SIMILAR TO": TokenType.SIMILAR_TO,
"SOME": TokenType.SOME,
"SORTKEY": TokenType.SORTKEY,
"SORT BY": TokenType.SORT_BY,
"TABLE": TokenType.TABLE,
"TABLESAMPLE": TokenType.TABLE_SAMPLE,
"TEMP": TokenType.TEMPORARY,
"TEMPORARY": TokenType.TEMPORARY,
"THEN": TokenType.THEN,
"TRUE": TokenType.TRUE,
"TRAILING": TokenType.TRAILING,
"UNBOUNDED": TokenType.UNBOUNDED,
"UNION": TokenType.UNION,
"UNLOGGED": TokenType.UNLOGGED,
"UNNEST": TokenType.UNNEST,
"UNPIVOT": TokenType.UNPIVOT,
"UPDATE": TokenType.UPDATE,
@@ -656,10 +615,6 @@ class Tokenizer(metaclass=_Tokenizer):
"WHERE": TokenType.WHERE,
"WINDOW": TokenType.WINDOW,
"WITH": TokenType.WITH,
"WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
"WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE,
"WITHIN GROUP": TokenType.WITHIN_GROUP,
"WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
"APPLY": TokenType.APPLY,
"ARRAY": TokenType.ARRAY,
"BIT": TokenType.BIT,
@@ -718,15 +673,6 @@ class Tokenizer(metaclass=_Tokenizer):
"STRUCT": TokenType.STRUCT,
"VARIANT": TokenType.VARIANT,
"ALTER": TokenType.ALTER,
"ALTER AGGREGATE": TokenType.COMMAND,
"ALTER DEFAULT": TokenType.COMMAND,
"ALTER DOMAIN": TokenType.COMMAND,
"ALTER ROLE": TokenType.COMMAND,
"ALTER RULE": TokenType.COMMAND,
"ALTER SEQUENCE": TokenType.COMMAND,
"ALTER TYPE": TokenType.COMMAND,
"ALTER USER": TokenType.COMMAND,
"ALTER VIEW": TokenType.COMMAND,
"ANALYZE": TokenType.COMMAND,
"CALL": TokenType.COMMAND,
"COMMENT": TokenType.COMMENT,
@@ -790,7 +736,7 @@ class Tokenizer(metaclass=_Tokenizer):
self._start = 0
self._current = 0
self._line = 1
self._col = 1
self._col = 0
self._comments: t.List[str] = []

self._char = ""
@@ -803,13 +749,12 @@ class Tokenizer(metaclass=_Tokenizer):
self.reset()
self.sql = sql
self.size = len(sql)

try:
self._scan()
except Exception as e:
start = self._current - 50
end = self._current + 50
start = start if start > 0 else 0
end = end if end < self.size else self.size - 1
start = max(self._current - 50, 0)
end = min(self._current + 50, self.size - 1)
context = self.sql[start:end]
raise ValueError(f"Error tokenizing '{context}'") from e
@@ -834,17 +779,17 @@ class Tokenizer(metaclass=_Tokenizer):
if until and until():
break

if self.tokens:
if self.tokens and self._comments:
self.tokens[-1].comments.extend(self._comments)

def _chars(self, size: int) -> str:
if size == 1:
return self._char

start = self._current - 1
end = start + size
if end <= self.size:
return self.sql[start:end]
return ""

return self.sql[start:end] if end <= self.size else ""

def _advance(self, i: int = 1, alnum: bool = False) -> None:
if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
@@ -859,6 +804,7 @@ class Tokenizer(metaclass=_Tokenizer):
self._peek = "" if self._end else self.sql[self._current]

if alnum and self._char.isalnum():
# Here we use local variables instead of attributes for better performance
_col = self._col
_current = self._current
_end = self._end
@@ -885,11 +831,12 @@ class Tokenizer(metaclass=_Tokenizer):
self.tokens.append(
Token(
token_type,
self._text if text is None else text,
self._line,
self._col,
self._current,
self._comments,
text=self._text if text is None else text,
line=self._line,
col=self._col,
start=self._start,
end=self._current - 1,
comments=self._comments,
)
)
self._comments = []
@@ -929,6 +876,7 @@ class Tokenizer(metaclass=_Tokenizer):
break
if result == 2:
word = chars

size += 1
end = self._current - 1 + size
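The result == 2 check above comes from sqlglot's trie helpers: 2 means the accumulated characters form an exact keyword, 1 means they are only a prefix of a longer one. A small illustration, assuming sqlglot.trie exposes new_trie and in_trie as this module uses them:

from sqlglot.trie import in_trie, new_trie

trie = new_trie(["ORDER BY", "ORDINALITY"])
print(in_trie(trie, "ORD"))       # (1, node): prefix of a keyword, keep consuming characters
print(in_trie(trie, "ORDER BY"))  # (2, node): exact multi-word keyword match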
@@ -946,6 +894,7 @@ class Tokenizer(metaclass=_Tokenizer):
else:
skip = True
else:
char = ""
chars = " "

word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word
@@ -959,8 +908,6 @@ class Tokenizer(metaclass=_Tokenizer):

if self._scan_string(word):
return
if self._scan_formatted_string(word):
return
if self._scan_comment(word):
return
@@ -1004,9 +951,9 @@ class Tokenizer(metaclass=_Tokenizer):
if self._char == "0":
peek = self._peek.upper()
if peek == "B":
return self._scan_bits() if self._BIT_STRINGS else self._add(TokenType.NUMBER)
return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
elif peek == "X":
return self._scan_hex() if self._HEX_STRINGS else self._add(TokenType.NUMBER)
return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)

decimal = False
scientific = 0
@@ -1075,37 +1022,24 @@ class Tokenizer(metaclass=_Tokenizer):

return self._text

def _scan_string(self, quote: str) -> bool:
quote_end = self._QUOTES.get(quote)
if quote_end is None:
return False
def _scan_string(self, start: str) -> bool:
base = None
token_type = TokenType.STRING

self._advance(len(quote))
text = self._extract_string(quote_end)
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
return True
if start in self._QUOTES:
end = self._QUOTES[start]
elif start in self._FORMAT_STRINGS:
end, token_type = self._FORMAT_STRINGS[start]

# X'1234', b'0110', E'\\\\\' etc.
def _scan_formatted_string(self, string_start: str) -> bool:
if string_start in self._HEX_STRINGS:
delimiters = self._HEX_STRINGS
token_type = TokenType.HEX_STRING
base = 16
elif string_start in self._BIT_STRINGS:
delimiters = self._BIT_STRINGS
token_type = TokenType.BIT_STRING
base = 2
elif string_start in self._BYTE_STRINGS:
delimiters = self._BYTE_STRINGS
token_type = TokenType.BYTE_STRING
base = None
if token_type == TokenType.HEX_STRING:
base = 16
elif token_type == TokenType.BIT_STRING:
base = 2
else:
return False

self._advance(len(string_start))
string_end = delimiters[string_start]
text = self._extract_string(string_end)
self._advance(len(start))
text = self._extract_string(end)

if base:
try:
@@ -1114,6 +1048,8 @@ class Tokenizer(metaclass=_Tokenizer):
raise RuntimeError(
f"Numeric string contains invalid characters from {self._line}:{self._start}"
)
else:
text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text

self._add(token_type, text)
return True
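As a reading aid: the consolidated _scan_string above first checks the plain _QUOTES mapping and then falls back to _FORMAT_STRINGS, which carries both the end delimiter and the token type to emit. A hypothetical, self-contained illustration of that dispatch (stub mappings, not sqlglot code):

from enum import Enum, auto

class TokenType(Enum):  # stub standing in for sqlglot's TokenType
    STRING = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()

QUOTES = {"'": "'"}
FORMAT_STRINGS = {"b'": ("'", TokenType.BIT_STRING), "x'": ("'", TokenType.HEX_STRING)}

def classify(start: str):
    # Plain quotes yield ordinary strings; prefixed quotes carry their own token type.
    if start in QUOTES:
        return QUOTES[start], TokenType.STRING
    if start in FORMAT_STRINGS:
        return FORMAT_STRINGS[start]
    return None  # not a string start; the tokenizer would keep scanning

print(classify("'"))   # ("'", TokenType.STRING)
print(classify("x'"))  # ("'", TokenType.HEX_STRING)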