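"""SQL tokenizer: breaks a raw SQL string into a stream of Token objects (see Tokenizer.tokenize)."""
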
from __future__ import annotations

import typing as t

from enum import auto

from sqlglot.helper import AutoName
from sqlglot.trie import in_trie, new_trie


class TokenType(AutoName):
    L_PAREN = auto()
    R_PAREN = auto()
    L_BRACKET = auto()
    R_BRACKET = auto()
    L_BRACE = auto()
    R_BRACE = auto()
    COMMA = auto()
    DOT = auto()
    DASH = auto()
    PLUS = auto()
    COLON = auto()
    DCOLON = auto()
    SEMICOLON = auto()
    STAR = auto()
    BACKSLASH = auto()
    SLASH = auto()
    LT = auto()
    LTE = auto()
    GT = auto()
    GTE = auto()
    NOT = auto()
    EQ = auto()
    NEQ = auto()
    NULLSAFE_EQ = auto()
    AND = auto()
    OR = auto()
    AMP = auto()
    DPIPE = auto()
    PIPE = auto()
    CARET = auto()
    TILDA = auto()
    ARROW = auto()
    DARROW = auto()
    FARROW = auto()
    HASH = auto()
    HASH_ARROW = auto()
    DHASH_ARROW = auto()
    LR_ARROW = auto()
    LT_AT = auto()
    AT_GT = auto()
    DOLLAR = auto()
    PARAMETER = auto()
    SESSION_PARAMETER = auto()
    NATIONAL = auto()
    DAMP = auto()

    BLOCK_START = auto()
    BLOCK_END = auto()

    SPACE = auto()
    BREAK = auto()

    STRING = auto()
    NUMBER = auto()
    IDENTIFIER = auto()
    DATABASE = auto()
    COLUMN = auto()
    COLUMN_DEF = auto()
    SCHEMA = auto()
    TABLE = auto()
    VAR = auto()
    BIT_STRING = auto()
    HEX_STRING = auto()
    BYTE_STRING = auto()

    # types
    BIT = auto()
    BOOLEAN = auto()
    TINYINT = auto()
    UTINYINT = auto()
    SMALLINT = auto()
    USMALLINT = auto()
    INT = auto()
    UINT = auto()
    BIGINT = auto()
    UBIGINT = auto()
    FLOAT = auto()
    DOUBLE = auto()
    DECIMAL = auto()
    BIGDECIMAL = auto()
    CHAR = auto()
    NCHAR = auto()
    VARCHAR = auto()
    NVARCHAR = auto()
    TEXT = auto()
    MEDIUMTEXT = auto()
    LONGTEXT = auto()
    MEDIUMBLOB = auto()
    LONGBLOB = auto()
    BINARY = auto()
    VARBINARY = auto()
    JSON = auto()
    JSONB = auto()
    TIME = auto()
    TIMESTAMP = auto()
    TIMESTAMPTZ = auto()
    TIMESTAMPLTZ = auto()
    DATETIME = auto()
    DATE = auto()
    UUID = auto()
    GEOGRAPHY = auto()
    NULLABLE = auto()
    GEOMETRY = auto()
    HLLSKETCH = auto()
    HSTORE = auto()
    SUPER = auto()
    SERIAL = auto()
    SMALLSERIAL = auto()
    BIGSERIAL = auto()
    XML = auto()
    UNIQUEIDENTIFIER = auto()
    MONEY = auto()
    SMALLMONEY = auto()
    ROWVERSION = auto()
    IMAGE = auto()
    VARIANT = auto()
    OBJECT = auto()
    INET = auto()

    # keywords
    ALIAS = auto()
    ALTER = auto()
    ALWAYS = auto()
    ALL = auto()
    ANTI = auto()
    ANY = auto()
    APPLY = auto()
    ARRAY = auto()
    ASC = auto()
    ASOF = auto()
    AT_TIME_ZONE = auto()
    AUTO_INCREMENT = auto()
    BEGIN = auto()
    BETWEEN = auto()
    BOTH = auto()
    BUCKET = auto()
    BY_DEFAULT = auto()
    CACHE = auto()
    CASCADE = auto()
    CASE = auto()
    CHARACTER_SET = auto()
    CLUSTER_BY = auto()
    COLLATE = auto()
    COMMAND = auto()
    COMMENT = auto()
    COMMIT = auto()
    COMPOUND = auto()
    CONSTRAINT = auto()
    CREATE = auto()
    CROSS = auto()
    CUBE = auto()
    CURRENT_DATE = auto()
    CURRENT_DATETIME = auto()
    CURRENT_ROW = auto()
    CURRENT_TIME = auto()
    CURRENT_TIMESTAMP = auto()
    CURRENT_USER = auto()
    DEFAULT = auto()
    DELETE = auto()
    DESC = auto()
    DESCRIBE = auto()
    DISTINCT = auto()
    DISTINCT_FROM = auto()
    DISTRIBUTE_BY = auto()
    DIV = auto()
    DROP = auto()
    ELSE = auto()
    END = auto()
    ESCAPE = auto()
    EXCEPT = auto()
    EXECUTE = auto()
    EXISTS = auto()
    FALSE = auto()
    FETCH = auto()
    FILTER = auto()
    FINAL = auto()
    FIRST = auto()
    FOLLOWING = auto()
    FOR = auto()
    FOREIGN_KEY = auto()
    FORMAT = auto()
    FROM = auto()
    FULL = auto()
    FUNCTION = auto()
    GLOB = auto()
    GLOBAL = auto()
    GROUP_BY = auto()
    GROUPING_SETS = auto()
    HAVING = auto()
    HINT = auto()
    IF = auto()
    IGNORE_NULLS = auto()
    ILIKE = auto()
    ILIKE_ANY = auto()
    IN = auto()
    INDEX = auto()
    INNER = auto()
    INSERT = auto()
    INTERSECT = auto()
    INTERVAL = auto()
    INTO = auto()
    INTRODUCER = auto()
    IRLIKE = auto()
    IS = auto()
    ISNULL = auto()
    JOIN = auto()
    JOIN_MARKER = auto()
    KEEP = auto()
    LANGUAGE = auto()
    LATERAL = auto()
    LAZY = auto()
    LEADING = auto()
    LEFT = auto()
    LIKE = auto()
    LIKE_ANY = auto()
    LIMIT = auto()
    LOAD_DATA = auto()
    LOCAL = auto()
    MAP = auto()
    MATCH_RECOGNIZE = auto()
    MATERIALIZED = auto()
    MERGE = auto()
    MOD = auto()
    NATURAL = auto()
    NEXT = auto()
    NEXT_VALUE_FOR = auto()
    NO_ACTION = auto()
    NOTNULL = auto()
    NULL = auto()
    NULLS_FIRST = auto()
    NULLS_LAST = auto()
    OFFSET = auto()
    ON = auto()
    ONLY = auto()
    OPTIONS = auto()
    ORDER_BY = auto()
    ORDERED = auto()
    ORDINALITY = auto()
    OUTER = auto()
    OUT_OF = auto()
    OVER = auto()
    OVERLAPS = auto()
    OVERWRITE = auto()
    PARTITION = auto()
    PARTITION_BY = auto()
    PERCENT = auto()
    PIVOT = auto()
    PLACEHOLDER = auto()
    PRAGMA = auto()
    PRECEDING = auto()
    PRIMARY_KEY = auto()
    PROCEDURE = auto()
    PROPERTIES = auto()
    PSEUDO_TYPE = auto()
    QUALIFY = auto()
    QUOTE = auto()
    RANGE = auto()
    RECURSIVE = auto()
    REPLACE = auto()
    RESPECT_NULLS = auto()
    RETURNING = auto()
    REFERENCES = auto()
    RIGHT = auto()
    RLIKE = auto()
    ROLLBACK = auto()
    ROLLUP = auto()
    ROW = auto()
    ROWS = auto()
    SEED = auto()
    SELECT = auto()
    SEMI = auto()
    SEPARATOR = auto()
    SERDE_PROPERTIES = auto()
    SET = auto()
    SHOW = auto()
    SIMILAR_TO = auto()
    SOME = auto()
    SORTKEY = auto()
    SORT_BY = auto()
    STRUCT = auto()
    TABLE_SAMPLE = auto()
    TEMPORARY = auto()
    TOP = auto()
    THEN = auto()
    TRAILING = auto()
    TRUE = auto()
    UNBOUNDED = auto()
    UNCACHE = auto()
    UNION = auto()
    UNLOGGED = auto()
    UNNEST = auto()
    UNPIVOT = auto()
    UPDATE = auto()
    USE = auto()
    USING = auto()
    VALUES = auto()
    VIEW = auto()
    VOLATILE = auto()
    WHEN = auto()
    WHERE = auto()
    WINDOW = auto()
    WITH = auto()
    WITH_TIME_ZONE = auto()
    WITH_LOCAL_TIME_ZONE = auto()
    WITHIN_GROUP = auto()
    WITHOUT_TIME_ZONE = auto()
    UNIQUE = auto()


class Token:
    __slots__ = ("token_type", "text", "line", "col", "end", "comments")

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
"""Returns an VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        end: int = 0,
        comments: t.List[str] = [],
    ) -> None:
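        """
        Args:
            token_type: the TokenType of this token.
            text: the raw text of the token.
            line: the line the token ends on.
            col: the column the token ends on.
            end: the index in the source text just past the token (defaults to len(text)).
            comments: any comments attached to this token.
        """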
        self.token_type = token_type
        self.text = text
        self.line = line
        size = len(text)
        self.col = col
        self.end = end if end else size
        self.comments = comments

    @property
    def start(self) -> int:
        """Returns the start of the token."""
        return self.end - len(self.text)

    def __repr__(self) -> str:
        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
        return f"<Token {attributes}>"


class _Tokenizer(type):
    def __new__(cls, clsname, bases, attrs):
        klass = super().__new__(cls, clsname, bases, attrs)

        klass._QUOTES = {
            f"{prefix}{s}": e
            for s, e in cls._delimeter_list_to_dict(klass.QUOTES).items()
            for prefix in (("",) if s[0].isalpha() else ("", "n", "N"))
        }
        klass._BIT_STRINGS = cls._delimeter_list_to_dict(klass.BIT_STRINGS)
        klass._HEX_STRINGS = cls._delimeter_list_to_dict(klass.HEX_STRINGS)
        klass._BYTE_STRINGS = cls._delimeter_list_to_dict(klass.BYTE_STRINGS)
        klass._IDENTIFIERS = cls._delimeter_list_to_dict(klass.IDENTIFIERS)
        klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
        klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
        klass._COMMENTS = dict(
            (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
            for comment in klass.COMMENTS
        )
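
        # Only entries that need multi-character lookahead go into the trie below:
        # keys containing a space (e.g. "ORDER BY") or a SINGLE_TOKENS character
        # (e.g. "!=", "/*+", quote and comment delimiters). Plain one-word keywords
        # are resolved through KEYWORDS in _scan_var instead.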
        klass.KEYWORD_TRIE = new_trie(
            key.upper()
            for key in {
                **klass.KEYWORDS,
                **{comment: TokenType.COMMENT for comment in klass._COMMENTS},
                **{quote: TokenType.QUOTE for quote in klass._QUOTES},
                **{bit_string: TokenType.BIT_STRING for bit_string in klass._BIT_STRINGS},
                **{hex_string: TokenType.HEX_STRING for hex_string in klass._HEX_STRINGS},
                **{byte_string: TokenType.BYTE_STRING for byte_string in klass._BYTE_STRINGS},
            }
            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
        )

        return klass

    @staticmethod
    def _delimeter_list_to_dict(list: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
        return dict((item, item) if isinstance(item, str) else (item[0], item[1]) for item in list)


class Tokenizer(metaclass=_Tokenizer):
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDA,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        # used for breaking a var like x'y' but nothing else
        # the token type doesn't matter
        "'": TokenType.QUOTE,
        "`": TokenType.IDENTIFIER,
        '"': TokenType.IDENTIFIER,
        "#": TokenType.HASH,
    }

    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
    IDENTIFIER_ESCAPES = ['"']
    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
    STRING_ESCAPES = ["'"]
    VAR_SINGLE_TOKENS: t.Set[str] = set()

    _COMMENTS: t.Dict[str, str] = {}
    _BIT_STRINGS: t.Dict[str, str] = {}
    _BYTE_STRINGS: t.Dict[str, str] = {}
    _HEX_STRINGS: t.Dict[str, str] = {}
    _IDENTIFIERS: t.Dict[str, str] = {}
    _IDENTIFIER_ESCAPES: t.Set[str] = set()
    _QUOTES: t.Dict[str, str] = {}
    _STRING_ESCAPES: t.Set[str] = set()

    KEYWORDS: t.Dict[t.Optional[str], TokenType] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        "{{+": TokenType.BLOCK_START,
        "{{-": TokenType.BLOCK_START,
        "+}}": TokenType.BLOCK_END,
        "-}}": TokenType.BLOCK_END,
        "/*+": TokenType.HINT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "||": TokenType.DPIPE,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "ALL": TokenType.ALL,
        "ALWAYS": TokenType.ALWAYS,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "AT TIME ZONE": TokenType.AT_TIME_ZONE,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "BOTH": TokenType.BOTH,
        "BUCKET": TokenType.BUCKET,
        "BY DEFAULT": TokenType.BY_DEFAULT,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CASCADE": TokenType.CASCADE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "COMPOUND": TokenType.COMPOUND,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT ROW": TokenType.CURRENT_ROW,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTINCT FROM": TokenType.DISTINCT_FROM,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOLLOWING": TokenType.FOLLOWING,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "IF": TokenType.IF,
        "ILIKE": TokenType.ILIKE,
        "IGNORE NULLS": TokenType.IGNORE_NULLS,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "LATERAL": TokenType.LATERAL,
        "LAZY": TokenType.LAZY,
        "LEADING": TokenType.LEADING,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD DATA": TokenType.LOAD_DATA,
        "LOCAL": TokenType.LOCAL,
        "MATERIALIZED": TokenType.MATERIALIZED,
        "MERGE": TokenType.MERGE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NEXT VALUE FOR": TokenType.NEXT_VALUE_FOR,
        "NO ACTION": TokenType.NO_ACTION,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "NULLS FIRST": TokenType.NULLS_FIRST,
        "NULLS LAST": TokenType.NULLS_LAST,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "ONLY": TokenType.ONLY,
        "OPTIONS": TokenType.OPTIONS,
        "OR": TokenType.OR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUTER": TokenType.OUTER,
        "OUT OF": TokenType.OUT_OF,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRECEDING": TokenType.PRECEDING,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "REPLACE": TokenType.REPLACE,
        "RESPECT NULLS": TokenType.RESPECT_NULLS,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SEED": TokenType.SEED,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SET": TokenType.SET,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORTKEY": TokenType.SORTKEY,
        "SORT BY": TokenType.SORT_BY,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "TRAILING": TokenType.TRAILING,
        "UNBOUNDED": TokenType.UNBOUNDED,
        "UNION": TokenType.UNION,
        "UNLOGGED": TokenType.UNLOGGED,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "WITH TIME ZONE": TokenType.WITH_TIME_ZONE,
        "WITH LOCAL TIME ZONE": TokenType.WITH_LOCAL_TIME_ZONE,
        "WITHIN GROUP": TokenType.WITHIN_GROUP,
        "WITHOUT TIME ZONE": TokenType.WITHOUT_TIME_ZONE,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.BIGINT,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "UNIQUE": TokenType.UNIQUE,
        "STRUCT": TokenType.STRUCT,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ALTER AGGREGATE": TokenType.COMMAND,
        "ALTER DEFAULT": TokenType.COMMAND,
        "ALTER DOMAIN": TokenType.COMMAND,
        "ALTER ROLE": TokenType.COMMAND,
        "ALTER RULE": TokenType.COMMAND,
        "ALTER SEQUENCE": TokenType.COMMAND,
        "ALTER TYPE": TokenType.COMMAND,
        "ALTER USER": TokenType.COMMAND,
        "ALTER VIEW": TokenType.COMMAND,
        "ANALYZE": TokenType.COMMAND,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "COPY": TokenType.COMMAND,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.COMMAND,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "TRUNCATE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
    }

    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
        " ": TokenType.SPACE,
        "\t": TokenType.SPACE,
        "\n": TokenType.BREAK,
        "\r": TokenType.BREAK,
        "\r\n": TokenType.BREAK,
    }

    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.Dict[str, str] = {}
    ENCODE: t.Optional[str] = None

    COMMENTS = ["--", ("/*", "*/"), ("{#", "#}")]
    KEYWORD_TRIE: t.Dict = {}  # autofilled

    IDENTIFIER_CAN_START_WITH_DIGIT = False

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "_prev_token_comments",
        "_prev_token_type",
    )

    def __init__(self) -> None:
        self.reset()

    def reset(self) -> None:
        self.sql = ""
        self.size = 0
        self.tokens: t.List[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 1
        self._comments: t.List[str] = []

        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1
        self._prev_token_comments: t.List[str] = []
        self._prev_token_type: t.Optional[TokenType] = None

    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        self.reset()
        self.sql = sql
        self.size = len(sql)
        try:
            self._scan()
        except Exception as e:
            start = self._current - 50
            end = self._current + 50
            start = start if start > 0 else 0
            end = end if end < self.size else self.size - 1
            context = self.sql[start:end]
            raise ValueError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
        while self.size and not self._end:
            self._start = self._current
            self._advance()

            if self._char is None:
                break

            if self._char not in self.WHITE_SPACE:
                if self._char.isdigit():
                    self._scan_number()
                elif self._char in self._IDENTIFIERS:
                    self._scan_identifier(self._IDENTIFIERS[self._char])
                else:
                    self._scan_keywords()

            if until and until():
                break

        if self.tokens:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        if size == 1:
            return self._char
        start = self._current - 1
        end = start + size
        if end <= self.size:
            return self.sql[start:end]
        return ""
    def _advance(self, i: int = 1) -> None:
        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
            self._col = 1
            self._line += 1
        else:
            self._col += i

        self._current += i
        self._end = self._current >= self.size
        self._char = self.sql[self._current - 1]
        self._peek = "" if self._end else self.sql[self._current]

    @property
    def _text(self) -> str:
        return self.sql[self._start : self._current]

    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        self._prev_token_line = self._line
        self._prev_token_comments = self._comments
        self._prev_token_type = token_type
        self.tokens.append(
            Token(
                token_type,
                self._text if text is None else text,
                self._line,
                self._col,
                self._current,
                self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.COMMANDS
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(lambda: self._peek == ";")
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)
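
    # Greedily match the longest keyword starting at the current character by walking
    # KEYWORD_TRIE; if nothing matches, fall back to a single-character token or a var.
    # A matched word is then dispatched as a string, formatted string or comment before
    # finally being emitted as a keyword token.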
    def _scan_keywords(self) -> None:
        size = 0
        word = None
        chars = self._text
        char = chars
        prev_space = False
        skip = False
        trie = self.KEYWORD_TRIE
        single_token = char in self.SINGLE_TOKENS

        while chars:
            if skip:
                result = 1
            else:
                result, trie = in_trie(trie, char.upper())

            if result == 0:
                break
            if result == 2:
                word = chars
            size += 1
            end = self._current - 1 + size

            if end < self.size:
                char = self.sql[end]
                single_token = single_token or char in self.SINGLE_TOKENS
                is_space = char in self.WHITE_SPACE

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                chars = " "

        word = None if not single_token and chars[-1] not in self.WHITE_SPACE else word

        if not word:
            if self._char in self.SINGLE_TOKENS:
                self._add(self.SINGLE_TOKENS[self._char], text=self._char)
                return
            self._scan_var()
            return

        if self._scan_string(word):
            return
        if self._scan_formatted_string(word):
            return
        if self._scan_comment(word):
            return

        self._advance(size - 1)
        word = word.upper()
        self._add(self.KEYWORDS[word], text=word)

    def _scan_comment(self, comment_start: str) -> bool:
        if comment_start not in self._COMMENTS:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self._COMMENTS[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_end_size = len(comment_end)
            while not self._end and self._chars(comment_end_size) != comment_end:
                self._advance()

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
                self._advance()
            self._comments.append(self._text[comment_start_size:])

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        if self._char == "0":
            peek = self._peek.upper()
            if peek == "B":
                return self._scan_bits()
            elif peek == "X":
                return self._scan_hex()

        decimal = False
        scientific = 0

        while True:
            if self._peek.isdigit():
                self._advance()
            elif self._peek == "." and not decimal:
                decimal = True
                self._advance()
            elif self._peek in ("-", "+") and scientific == 1:
                scientific += 1
                self._advance()
            elif self._peek.upper() == "E" and not scientific:
                scientific += 1
                self._advance()
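            # Anything else that looks like an identifier is treated as a numeric
            # suffix, e.g. the "L" in Hive's 3L, resolved through NUMERIC_LITERALS.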
            elif self._peek.isidentifier():
                number_text = self._text
                literal = ""

                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
                    literal += self._peek.upper()
                    self._advance()

                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal))

                if token_type:
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif self.IDENTIFIER_CAN_START_WITH_DIGIT:
                    return self._add(TokenType.VAR)

                self._add(TokenType.NUMBER, number_text)
                return self._advance(-len(literal))
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.BIT_STRING, f"{int(value, 2)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            self._add(TokenType.HEX_STRING, f"{int(value, 16)}")
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        while True:
            char = self._peek.strip()
            if char and char not in self.SINGLE_TOKENS:
                self._advance()
            else:
                break

        return self._text

    def _scan_string(self, quote: str) -> bool:
        quote_end = self._QUOTES.get(quote)
        if quote_end is None:
            return False

        self._advance(len(quote))
        text = self._extract_string(quote_end)
        text = text.encode(self.ENCODE).decode(self.ENCODE) if self.ENCODE else text
        self._add(TokenType.NATIONAL if quote[0].upper() == "N" else TokenType.STRING, text)
        return True

    # X'1234', b'0110', E'\\\\\' etc.
    def _scan_formatted_string(self, string_start: str) -> bool:
        if string_start in self._HEX_STRINGS:
            delimiters = self._HEX_STRINGS
            token_type = TokenType.HEX_STRING
            base = 16
        elif string_start in self._BIT_STRINGS:
            delimiters = self._BIT_STRINGS
            token_type = TokenType.BIT_STRING
            base = 2
        elif string_start in self._BYTE_STRINGS:
            delimiters = self._BYTE_STRINGS
            token_type = TokenType.BYTE_STRING
            base = None
        else:
            return False

        self._advance(len(string_start))
        string_end = delimiters[string_start]
        text = self._extract_string(string_end)

        if base is None:
            self._add(token_type, text)
        else:
            try:
                self._add(token_type, f"{int(text, base)}")
            except:
                raise RuntimeError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        text = ""
        identifier_end_is_escape = identifier_end in self._IDENTIFIER_ESCAPES

        while True:
            if self._end:
                raise RuntimeError(f"Missing {identifier_end} from {self._line}:{self._start}")

            self._advance()
            if self._char == identifier_end:
                if identifier_end_is_escape and self._peek == identifier_end:
                    text += identifier_end
                    self._advance()
                    continue

                break

            text += self._char

        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        while True:
            char = self._peek.strip()
            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
                self._advance()
            else:
                break
        self._add(
            TokenType.VAR
            if self._prev_token_type == TokenType.PARAMETER
            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
        )
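
    # Reads characters until `delimiter` is found, honoring STRING_ESCAPES: an escape
    # character followed by the delimiter (or by another escape) is kept literally.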
    def _extract_string(self, delimiter: str) -> str:
        text = ""
        delim_size = len(delimiter)

        while True:
            if self._char in self._STRING_ESCAPES and (
                self._peek == delimiter or self._peek in self._STRING_ESCAPES
            ):
                if self._peek == delimiter:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    raise RuntimeError(f"Missing {delimiter} from {self._line}:{self._start}")
                text += self._char
                self._advance()

        return text