sqlglot.tokens
```python
from __future__ import annotations

import os
import typing as t

from sqlglot.errors import SqlglotError, TokenError
from sqlglot.token_type import TokenType
from sqlglot.trie import TrieResult, in_trie, new_trie

if t.TYPE_CHECKING:
    from sqlglot.dialects.dialect import DialectType


try:
    from sqlglotrs import (  # type: ignore
        Tokenizer as RsTokenizer,
        TokenizerDialectSettings as RsTokenizerDialectSettings,
        TokenizerSettings as RsTokenizerSettings,
    )

    USE_RS_TOKENIZER = os.environ.get("SQLGLOTRS_TOKENIZER", "1") == "1"
except ImportError:
    USE_RS_TOKENIZER = False


class Token:
    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")

    @classmethod
    def number(cls, number: int) -> Token:
        """Returns a NUMBER token with `number` as its text."""
        return cls(TokenType.NUMBER, str(number))

    @classmethod
    def string(cls, string: str) -> Token:
        """Returns a STRING token with `string` as its text."""
        return cls(TokenType.STRING, string)

    @classmethod
    def identifier(cls, identifier: str) -> Token:
        """Returns an IDENTIFIER token with `identifier` as its text."""
        return cls(TokenType.IDENTIFIER, identifier)

    @classmethod
    def var(cls, var: str) -> Token:
        """Returns a VAR token with `var` as its text."""
        return cls(TokenType.VAR, var)

    def __init__(
        self,
        token_type: TokenType,
        text: str,
        line: int = 1,
        col: int = 1,
        start: int = 0,
        end: int = 0,
        comments: t.Optional[t.List[str]] = None,
    ) -> None:
        """Token initializer.

        Args:
            token_type: The TokenType Enum.
            text: The text of the token.
            line: The line that the token ends on.
            col: The column that the token ends on.
            start: The start index of the token.
            end: The ending index of the token.
            comments: The comments to attach to the token.
        """
        self.token_type = token_type
        self.text = text
        self.line = line
        self.col = col
        self.start = start
        self.end = end
        self.comments = [] if comments is None else comments

    def __repr__(self) -> str:
        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
        return f"<Token {attributes}>"


class _Tokenizer(type):
    def __new__(cls, clsname, bases, attrs):
        klass = super().__new__(cls, clsname, bases, attrs)

        def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
            return dict(
                (item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
            )

        def _quotes_to_format(
            token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
        ) -> t.Dict[str, t.Tuple[str, TokenType]]:
            return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}

        klass._QUOTES = _convert_quotes(klass.QUOTES)
        klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)

        klass._FORMAT_STRINGS = {
            **{
                p + s: (e, TokenType.NATIONAL_STRING)
                for s, e in klass._QUOTES.items()
                for p in ("n", "N")
            },
            **_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
            **_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
            **_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
            **_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
            **_quotes_to_format(TokenType.HEREDOC_STRING, klass.HEREDOC_STRINGS),
        }

        klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
        klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
        klass._COMMENTS = {
            **dict(
                (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
                for comment in klass.COMMENTS
            ),
            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
        }

        klass._KEYWORD_TRIE = new_trie(
            key.upper()
            for key in (
                *klass.KEYWORDS,
                *klass._COMMENTS,
                *klass._QUOTES,
                *klass._FORMAT_STRINGS,
            )
            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
        )

        if USE_RS_TOKENIZER:
            settings = RsTokenizerSettings(
                white_space={k: v.name for k, v in klass.WHITE_SPACE.items()},
                single_tokens={k: v.name for k, v in klass.SINGLE_TOKENS.items()},
                keywords={k: v.name for k, v in klass.KEYWORDS.items()},
                numeric_literals=klass.NUMERIC_LITERALS,
                identifiers=klass._IDENTIFIERS,
                identifier_escapes=klass._IDENTIFIER_ESCAPES,
                string_escapes=klass._STRING_ESCAPES,
                quotes=klass._QUOTES,
                format_strings={k: (v1, v2.name) for k, (v1, v2) in klass._FORMAT_STRINGS.items()},
                has_bit_strings=bool(klass.BIT_STRINGS),
                has_hex_strings=bool(klass.HEX_STRINGS),
                comments=klass._COMMENTS,
                var_single_tokens=klass.VAR_SINGLE_TOKENS,
                commands={v.name for v in klass.COMMANDS},
                command_prefix_tokens={v.name for v in klass.COMMAND_PREFIX_TOKENS},
            )
            klass._RS_TOKENIZER = RsTokenizer(settings)
        else:
            klass._RS_TOKENIZER = None

        return klass


class Tokenizer(metaclass=_Tokenizer):
    SINGLE_TOKENS = {
        "(": TokenType.L_PAREN,
        ")": TokenType.R_PAREN,
        "[": TokenType.L_BRACKET,
        "]": TokenType.R_BRACKET,
        "{": TokenType.L_BRACE,
        "}": TokenType.R_BRACE,
        "&": TokenType.AMP,
        "^": TokenType.CARET,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
        ".": TokenType.DOT,
        "-": TokenType.DASH,
        "=": TokenType.EQ,
        ">": TokenType.GT,
        "<": TokenType.LT,
        "%": TokenType.MOD,
        "!": TokenType.NOT,
        "|": TokenType.PIPE,
        "+": TokenType.PLUS,
        ";": TokenType.SEMICOLON,
        "/": TokenType.SLASH,
        "\\": TokenType.BACKSLASH,
        "*": TokenType.STAR,
        "~": TokenType.TILDA,
        "?": TokenType.PLACEHOLDER,
        "@": TokenType.PARAMETER,
        # used for breaking a var like x'y' but nothing else
        # the token type doesn't matter
        "'": TokenType.QUOTE,
        "`": TokenType.IDENTIFIER,
        '"': TokenType.IDENTIFIER,
        "#": TokenType.HASH,
    }

    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
    RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
    HEREDOC_STRINGS: t.List[str | t.Tuple[str, str]] = []
    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
    IDENTIFIER_ESCAPES = ['"']
    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
    STRING_ESCAPES = ["'"]
    VAR_SINGLE_TOKENS: t.Set[str] = set()

    # Autofilled
    _COMMENTS: t.Dict[str, str] = {}
    _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
    _IDENTIFIERS: t.Dict[str, str] = {}
    _IDENTIFIER_ESCAPES: t.Set[str] = set()
    _QUOTES: t.Dict[str, str] = {}
    _STRING_ESCAPES: t.Set[str] = set()
    _KEYWORD_TRIE: t.Dict = {}
    _RS_TOKENIZER: t.Optional[t.Any] = None

    KEYWORDS: t.Dict[str, TokenType] = {
        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
        "/*+": TokenType.HINT,
        "==": TokenType.EQ,
        "::": TokenType.DCOLON,
        "||": TokenType.DPIPE,
        ">=": TokenType.GTE,
        "<=": TokenType.LTE,
        "<>": TokenType.NEQ,
        "!=": TokenType.NEQ,
        ":=": TokenType.COLON_EQ,
        "<=>": TokenType.NULLSAFE_EQ,
        "->": TokenType.ARROW,
        "->>": TokenType.DARROW,
        "=>": TokenType.FARROW,
        "#>": TokenType.HASH_ARROW,
        "#>>": TokenType.DHASH_ARROW,
        "<->": TokenType.LR_ARROW,
        "&&": TokenType.DAMP,
        "??": TokenType.DQMARK,
        "ALL": TokenType.ALL,
        "ALWAYS": TokenType.ALWAYS,
        "AND": TokenType.AND,
        "ANTI": TokenType.ANTI,
        "ANY": TokenType.ANY,
        "ASC": TokenType.ASC,
        "AS": TokenType.ALIAS,
        "ASOF": TokenType.ASOF,
        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
        "BEGIN": TokenType.BEGIN,
        "BETWEEN": TokenType.BETWEEN,
        "CACHE": TokenType.CACHE,
        "UNCACHE": TokenType.UNCACHE,
        "CASE": TokenType.CASE,
        "CHARACTER SET": TokenType.CHARACTER_SET,
        "CLUSTER BY": TokenType.CLUSTER_BY,
        "COLLATE": TokenType.COLLATE,
        "COLUMN": TokenType.COLUMN,
        "COMMIT": TokenType.COMMIT,
        "CONNECT BY": TokenType.CONNECT_BY,
        "CONSTRAINT": TokenType.CONSTRAINT,
        "CREATE": TokenType.CREATE,
        "CROSS": TokenType.CROSS,
        "CUBE": TokenType.CUBE,
        "CURRENT_DATE": TokenType.CURRENT_DATE,
        "CURRENT_TIME": TokenType.CURRENT_TIME,
        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
        "CURRENT_USER": TokenType.CURRENT_USER,
        "DATABASE": TokenType.DATABASE,
        "DEFAULT": TokenType.DEFAULT,
        "DELETE": TokenType.DELETE,
        "DESC": TokenType.DESC,
        "DESCRIBE": TokenType.DESCRIBE,
        "DISTINCT": TokenType.DISTINCT,
        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
        "DIV": TokenType.DIV,
        "DROP": TokenType.DROP,
        "ELSE": TokenType.ELSE,
        "END": TokenType.END,
        "ESCAPE": TokenType.ESCAPE,
        "EXCEPT": TokenType.EXCEPT,
        "EXECUTE": TokenType.EXECUTE,
        "EXISTS": TokenType.EXISTS,
        "FALSE": TokenType.FALSE,
        "FETCH": TokenType.FETCH,
        "FILTER": TokenType.FILTER,
        "FIRST": TokenType.FIRST,
        "FULL": TokenType.FULL,
        "FUNCTION": TokenType.FUNCTION,
        "FOR": TokenType.FOR,
        "FOREIGN KEY": TokenType.FOREIGN_KEY,
        "FORMAT": TokenType.FORMAT,
        "FROM": TokenType.FROM,
        "GEOGRAPHY": TokenType.GEOGRAPHY,
        "GEOMETRY": TokenType.GEOMETRY,
        "GLOB": TokenType.GLOB,
        "GROUP BY": TokenType.GROUP_BY,
        "GROUPING SETS": TokenType.GROUPING_SETS,
        "HAVING": TokenType.HAVING,
        "ILIKE": TokenType.ILIKE,
        "IN": TokenType.IN,
        "INDEX": TokenType.INDEX,
        "INET": TokenType.INET,
        "INNER": TokenType.INNER,
        "INSERT": TokenType.INSERT,
        "INTERVAL": TokenType.INTERVAL,
        "INTERSECT": TokenType.INTERSECT,
        "INTO": TokenType.INTO,
        "IS": TokenType.IS,
        "ISNULL": TokenType.ISNULL,
        "JOIN": TokenType.JOIN,
        "KEEP": TokenType.KEEP,
        "KILL": TokenType.KILL,
        "LATERAL": TokenType.LATERAL,
        "LEFT": TokenType.LEFT,
        "LIKE": TokenType.LIKE,
        "LIMIT": TokenType.LIMIT,
        "LOAD": TokenType.LOAD,
        "LOCK": TokenType.LOCK,
        "MERGE": TokenType.MERGE,
        "NATURAL": TokenType.NATURAL,
        "NEXT": TokenType.NEXT,
        "NOT": TokenType.NOT,
        "NOTNULL": TokenType.NOTNULL,
        "NULL": TokenType.NULL,
        "OBJECT": TokenType.OBJECT,
        "OFFSET": TokenType.OFFSET,
        "ON": TokenType.ON,
        "OR": TokenType.OR,
        "XOR": TokenType.XOR,
        "ORDER BY": TokenType.ORDER_BY,
        "ORDINALITY": TokenType.ORDINALITY,
        "OUTER": TokenType.OUTER,
        "OVER": TokenType.OVER,
        "OVERLAPS": TokenType.OVERLAPS,
        "OVERWRITE": TokenType.OVERWRITE,
        "PARTITION": TokenType.PARTITION,
        "PARTITION BY": TokenType.PARTITION_BY,
        "PARTITIONED BY": TokenType.PARTITION_BY,
        "PARTITIONED_BY": TokenType.PARTITION_BY,
        "PERCENT": TokenType.PERCENT,
        "PIVOT": TokenType.PIVOT,
        "PRAGMA": TokenType.PRAGMA,
        "PRIMARY KEY": TokenType.PRIMARY_KEY,
        "PROCEDURE": TokenType.PROCEDURE,
        "QUALIFY": TokenType.QUALIFY,
        "RANGE": TokenType.RANGE,
        "RECURSIVE": TokenType.RECURSIVE,
        "REGEXP": TokenType.RLIKE,
        "REPLACE": TokenType.REPLACE,
        "RETURNING": TokenType.RETURNING,
        "REFERENCES": TokenType.REFERENCES,
        "RIGHT": TokenType.RIGHT,
        "RLIKE": TokenType.RLIKE,
        "ROLLBACK": TokenType.ROLLBACK,
        "ROLLUP": TokenType.ROLLUP,
        "ROW": TokenType.ROW,
        "ROWS": TokenType.ROWS,
        "SCHEMA": TokenType.SCHEMA,
        "SELECT": TokenType.SELECT,
        "SEMI": TokenType.SEMI,
        "SET": TokenType.SET,
        "SETTINGS": TokenType.SETTINGS,
        "SHOW": TokenType.SHOW,
        "SIMILAR TO": TokenType.SIMILAR_TO,
        "SOME": TokenType.SOME,
        "SORT BY": TokenType.SORT_BY,
        "START WITH": TokenType.START_WITH,
        "TABLE": TokenType.TABLE,
        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
        "TEMP": TokenType.TEMPORARY,
        "TEMPORARY": TokenType.TEMPORARY,
        "THEN": TokenType.THEN,
        "TRUE": TokenType.TRUE,
        "UNION": TokenType.UNION,
        "UNKNOWN": TokenType.UNKNOWN,
        "UNNEST": TokenType.UNNEST,
        "UNPIVOT": TokenType.UNPIVOT,
        "UPDATE": TokenType.UPDATE,
        "USE": TokenType.USE,
        "USING": TokenType.USING,
        "UUID": TokenType.UUID,
        "VALUES": TokenType.VALUES,
        "VIEW": TokenType.VIEW,
        "VOLATILE": TokenType.VOLATILE,
        "WHEN": TokenType.WHEN,
        "WHERE": TokenType.WHERE,
        "WINDOW": TokenType.WINDOW,
        "WITH": TokenType.WITH,
        "APPLY": TokenType.APPLY,
        "ARRAY": TokenType.ARRAY,
        "BIT": TokenType.BIT,
        "BOOL": TokenType.BOOLEAN,
        "BOOLEAN": TokenType.BOOLEAN,
        "BYTE": TokenType.TINYINT,
        "MEDIUMINT": TokenType.MEDIUMINT,
        "INT1": TokenType.TINYINT,
        "TINYINT": TokenType.TINYINT,
        "INT16": TokenType.SMALLINT,
        "SHORT": TokenType.SMALLINT,
        "SMALLINT": TokenType.SMALLINT,
        "INT128": TokenType.INT128,
        "HUGEINT": TokenType.INT128,
        "INT2": TokenType.SMALLINT,
        "INTEGER": TokenType.INT,
        "INT": TokenType.INT,
        "INT4": TokenType.INT,
        "INT32": TokenType.INT,
        "INT64": TokenType.BIGINT,
        "LONG": TokenType.BIGINT,
        "BIGINT": TokenType.BIGINT,
        "INT8": TokenType.TINYINT,
        "DEC": TokenType.DECIMAL,
        "DECIMAL": TokenType.DECIMAL,
        "BIGDECIMAL": TokenType.BIGDECIMAL,
        "BIGNUMERIC": TokenType.BIGDECIMAL,
        "MAP": TokenType.MAP,
        "NULLABLE": TokenType.NULLABLE,
        "NUMBER": TokenType.DECIMAL,
        "NUMERIC": TokenType.DECIMAL,
        "FIXED": TokenType.DECIMAL,
        "REAL": TokenType.FLOAT,
        "FLOAT": TokenType.FLOAT,
        "FLOAT4": TokenType.FLOAT,
        "FLOAT8": TokenType.DOUBLE,
        "DOUBLE": TokenType.DOUBLE,
        "DOUBLE PRECISION": TokenType.DOUBLE,
        "JSON": TokenType.JSON,
        "CHAR": TokenType.CHAR,
        "CHARACTER": TokenType.CHAR,
        "NCHAR": TokenType.NCHAR,
        "VARCHAR": TokenType.VARCHAR,
        "VARCHAR2": TokenType.VARCHAR,
        "NVARCHAR": TokenType.NVARCHAR,
        "NVARCHAR2": TokenType.NVARCHAR,
        "STR": TokenType.TEXT,
        "STRING": TokenType.TEXT,
        "TEXT": TokenType.TEXT,
        "LONGTEXT": TokenType.LONGTEXT,
        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
        "TINYTEXT": TokenType.TINYTEXT,
        "CLOB": TokenType.TEXT,
        "LONGVARCHAR": TokenType.TEXT,
        "BINARY": TokenType.BINARY,
        "BLOB": TokenType.VARBINARY,
        "LONGBLOB": TokenType.LONGBLOB,
        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
        "TINYBLOB": TokenType.TINYBLOB,
        "BYTEA": TokenType.VARBINARY,
        "VARBINARY": TokenType.VARBINARY,
        "TIME": TokenType.TIME,
        "TIMETZ": TokenType.TIMETZ,
        "TIMESTAMP": TokenType.TIMESTAMP,
        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
        "DATE": TokenType.DATE,
        "DATETIME": TokenType.DATETIME,
        "INT4RANGE": TokenType.INT4RANGE,
        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
        "INT8RANGE": TokenType.INT8RANGE,
        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
        "NUMRANGE": TokenType.NUMRANGE,
        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
        "TSRANGE": TokenType.TSRANGE,
        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
        "TSTZRANGE": TokenType.TSTZRANGE,
        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
        "DATERANGE": TokenType.DATERANGE,
        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
        "UNIQUE": TokenType.UNIQUE,
        "STRUCT": TokenType.STRUCT,
        "VARIANT": TokenType.VARIANT,
        "ALTER": TokenType.ALTER,
        "ANALYZE": TokenType.COMMAND,
        "CALL": TokenType.COMMAND,
        "COMMENT": TokenType.COMMENT,
        "COPY": TokenType.COMMAND,
        "EXPLAIN": TokenType.COMMAND,
        "GRANT": TokenType.COMMAND,
        "OPTIMIZE": TokenType.COMMAND,
        "PREPARE": TokenType.COMMAND,
        "TRUNCATE": TokenType.COMMAND,
        "VACUUM": TokenType.COMMAND,
        "USER-DEFINED": TokenType.USERDEFINED,
        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
    }

    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
        " ": TokenType.SPACE,
        "\t": TokenType.SPACE,
        "\n": TokenType.BREAK,
        "\r": TokenType.BREAK,
    }

    COMMANDS = {
        TokenType.COMMAND,
        TokenType.EXECUTE,
        TokenType.FETCH,
        TokenType.SHOW,
    }

    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}

    # handle numeric literals like in hive (3L = BIGINT)
    NUMERIC_LITERALS: t.Dict[str, str] = {}

    COMMENTS = ["--", ("/*", "*/")]

    __slots__ = (
        "sql",
        "size",
        "tokens",
        "dialect",
        "_start",
        "_current",
        "_line",
        "_col",
        "_comments",
        "_char",
        "_end",
        "_peek",
        "_prev_token_line",
        "_rs_dialect_settings",
    )

    def __init__(self, dialect: DialectType = None) -> None:
        from sqlglot.dialects import Dialect

        self.dialect = Dialect.get_or_raise(dialect)

        if USE_RS_TOKENIZER:
            self._rs_dialect_settings = RsTokenizerDialectSettings(
                escape_sequences=self.dialect.ESCAPE_SEQUENCES,
                identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            )

        self.reset()

    def reset(self) -> None:
        self.sql = ""
        self.size = 0
        self.tokens: t.List[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: t.List[str] = []

        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1

    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        if USE_RS_TOKENIZER:
            return self.tokenize_rs(sql)

        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens

    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
        while self.size and not self._end:
            current = self._current

            # skip spaces inline rather than iteratively call advance()
            # for performance reasons
            while current < self.size:
                char = self.sql[current]

                if char.isspace() and (char == " " or char == "\t"):
                    current += 1
                else:
                    break

            n = current - self._current
            self._start = current
            self._advance(n if n > 1 else 1)

            if self._char is None:
                break

            if not self._char.isspace():
                if self._char.isdigit():
                    self._scan_number()
                elif self._char in self._IDENTIFIERS:
                    self._scan_identifier(self._IDENTIFIERS[self._char])
                else:
                    self._scan_keywords()

            if until and until():
                break

        if self.tokens and self._comments:
            self.tokens[-1].comments.extend(self._comments)

    def _chars(self, size: int) -> str:
        if size == 1:
            return self._char

        start = self._current - 1
        end = start + size

        return self.sql[start:end] if end <= self.size else ""

    def _advance(self, i: int = 1, alnum: bool = False) -> None:
        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
            # Ensures we don't count an extra line if we get a \r\n line break sequence
            if self._char == "\r" and self._peek == "\n":
                i = 2
                self._start += 1

            self._col = 1
            self._line += 1
        else:
            self._col += i

        self._current += i
        self._end = self._current >= self.size
        self._char = self.sql[self._current - 1]
        self._peek = "" if self._end else self.sql[self._current]

        if alnum and self._char.isalnum():
            # Here we use local variables instead of attributes for better performance
            _col = self._col
            _current = self._current
            _end = self._end
            _peek = self._peek

            while _peek.isalnum():
                _col += 1
                _current += 1
                _end = _current >= self.size
                _peek = "" if _end else self.sql[_current]

            self._col = _col
            self._current = _current
            self._end = _end
            self._peek = _peek
            self._char = self.sql[_current - 1]

    @property
    def _text(self) -> str:
        return self.sql[self._start : self._current]

    def peek(self, i: int = 0) -> str:
        i = self._current + i
        if i < self.size:
            return self.sql[i]
        return ""

    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
        self._prev_token_line = self._line

        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []

        self.tokens.append(
            Token(
                token_type,
                text=self._text if text is None else text,
                line=self._line,
                col=self._col,
                start=self._start,
                end=self._current - 1,
                comments=self._comments,
            )
        )
        self._comments = []

        # If we have either a semicolon or a begin token before the command's token, we'll parse
        # whatever follows the command's token as a string
        if (
            token_type in self.COMMANDS
            and self._peek != ";"
            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
        ):
            start = self._current
            tokens = len(self.tokens)
            self._scan(lambda: self._peek == ";")
            self.tokens = self.tokens[:tokens]
            text = self.sql[start : self._current].strip()
            if text:
                self._add(TokenType.STRING, text)

    def _scan_keywords(self) -> None:
        size = 0
        word = None
        chars = self._text
        char = chars
        prev_space = False
        skip = False
        trie = self._KEYWORD_TRIE
        single_token = char in self.SINGLE_TOKENS

        while chars:
            if skip:
                result = TrieResult.PREFIX
            else:
                result, trie = in_trie(trie, char.upper())

            if result == TrieResult.FAILED:
                break
            if result == TrieResult.EXISTS:
                word = chars

            end = self._current + size
            size += 1

            if end < self.size:
                char = self.sql[end]
                single_token = single_token or char in self.SINGLE_TOKENS
                is_space = char.isspace()

                if not is_space or not prev_space:
                    if is_space:
                        char = " "
                    chars += char
                    prev_space = is_space
                    skip = False
                else:
                    skip = True
            else:
                char = ""
                break

        if word:
            if self._scan_string(word):
                return
            if self._scan_comment(word):
                return
            if prev_space or single_token or not char:
                self._advance(size - 1)
                word = word.upper()
                self._add(self.KEYWORDS[word], text=word)
                return

        if self._char in self.SINGLE_TOKENS:
            self._add(self.SINGLE_TOKENS[self._char], text=self._char)
            return

        self._scan_var()

    def _scan_comment(self, comment_start: str) -> bool:
        if comment_start not in self._COMMENTS:
            return False

        comment_start_line = self._line
        comment_start_size = len(comment_start)
        comment_end = self._COMMENTS[comment_start]

        if comment_end:
            # Skip the comment's start delimiter
            self._advance(comment_start_size)

            comment_end_size = len(comment_end)
            while not self._end and self._chars(comment_end_size) != comment_end:
                self._advance(alnum=True)

            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
            self._advance(comment_end_size - 1)
        else:
            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
                self._advance(alnum=True)
            self._comments.append(self._text[comment_start_size:])

        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
        # Multiple consecutive comments are preserved by appending them to the current comments list.
        if comment_start_line == self._prev_token_line:
            self.tokens[-1].comments.extend(self._comments)
            self._comments = []
            self._prev_token_line = self._line

        return True

    def _scan_number(self) -> None:
        if self._char == "0":
            peek = self._peek.upper()
            if peek == "B":
                return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
            elif peek == "X":
                return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)

        decimal = False
        scientific = 0

        while True:
            if self._peek.isdigit():
                self._advance()
            elif self._peek == "." and not decimal:
                after = self.peek(1)
                if after.isdigit() or not after.isalpha():
                    decimal = True
                    self._advance()
                else:
                    return self._add(TokenType.VAR)
            elif self._peek in ("-", "+") and scientific == 1:
                scientific += 1
                self._advance()
            elif self._peek.upper() == "E" and not scientific:
                scientific += 1
                self._advance()
            elif self._peek.isidentifier():
                number_text = self._text
                literal = ""

                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
                    literal += self._peek
                    self._advance()

                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal.upper(), ""))

                if token_type:
                    self._add(TokenType.NUMBER, number_text)
                    self._add(TokenType.DCOLON, "::")
                    return self._add(token_type, literal)
                elif self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
                    return self._add(TokenType.VAR)

                self._advance(-len(literal))
                return self._add(TokenType.NUMBER, number_text)
            else:
                return self._add(TokenType.NUMBER)

    def _scan_bits(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
            int(value, 2)
            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _scan_hex(self) -> None:
        self._advance()
        value = self._extract_value()
        try:
            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
            int(value, 16)
            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
        except ValueError:
            self._add(TokenType.IDENTIFIER)

    def _extract_value(self) -> str:
        while True:
            char = self._peek.strip()
            if char and char not in self.SINGLE_TOKENS:
                self._advance(alnum=True)
            else:
                break

        return self._text

    def _scan_string(self, start: str) -> bool:
        base = None
        token_type = TokenType.STRING

        if start in self._QUOTES:
            end = self._QUOTES[start]
        elif start in self._FORMAT_STRINGS:
            end, token_type = self._FORMAT_STRINGS[start]

            if token_type == TokenType.HEX_STRING:
                base = 16
            elif token_type == TokenType.BIT_STRING:
                base = 2
            elif token_type == TokenType.HEREDOC_STRING:
                self._advance()
                tag = "" if self._char == end else self._extract_string(end)
                end = f"{start}{tag}{end}"
        else:
            return False

        self._advance(len(start))
        text = self._extract_string(end)

        if base:
            try:
                int(text, base)
            except:
                raise TokenError(
                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
                )

        self._add(token_type, text)
        return True

    def _scan_identifier(self, identifier_end: str) -> None:
        self._advance()
        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
        self._add(TokenType.IDENTIFIER, text)

    def _scan_var(self) -> None:
        while True:
            char = self._peek.strip()
            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
                self._advance(alnum=True)
            else:
                break

        self._add(
            TokenType.VAR
            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
        )

    def _extract_string(self, delimiter: str, escapes=None) -> str:
        text = ""
        delim_size = len(delimiter)
        escapes = self._STRING_ESCAPES if escapes is None else escapes

        while True:
            if (
                self._char in escapes
                and (self._peek == delimiter or self._peek in escapes)
                and (self._char not in self._QUOTES or self._char == self._peek)
            ):
                if self._peek == delimiter:
                    text += self._peek
                else:
                    text += self._char + self._peek

                if self._current + 1 < self.size:
                    self._advance(2)
                else:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
            else:
                if self._chars(delim_size) == delimiter:
                    if delim_size > 1:
                        self._advance(delim_size - 1)
                    break

                if self._end:
                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")

                if (
                    self.dialect.ESCAPE_SEQUENCES
                    and self._peek
                    and self._char in self.STRING_ESCAPES
                ):
                    escaped_sequence = self.dialect.ESCAPE_SEQUENCES.get(self._char + self._peek)
                    if escaped_sequence:
                        self._advance(2)
                        text += escaped_sequence
                        continue

                current = self._current - 1
                self._advance(alnum=True)
                text += self.sql[current : self._current - 1]

        return text

    def tokenize_rs(self, sql: str) -> t.List[Token]:
        if not self._RS_TOKENIZER:
            raise SqlglotError("Rust tokenizer is not available")

        try:
            return [
                Token(
                    token_type=_ALL_TOKEN_TYPES[token.token_type.index],
                    text=token.text,
                    line=token.line,
                    col=token.col,
                    start=token.start,
                    end=token.end,
                    comments=token.comments,
                )
                for token in self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
            ]
        except Exception as e:
            raise TokenError(str(e))


_ALL_TOKEN_TYPES = list(TokenType)
```
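The Rust tokenizer (sqlglotrs) is used whenever it can be imported and the SQLGLOTRS_TOKENIZER environment variable is left at its default of "1". A minimal sketch of forcing the pure-Python tokenizer, assuming the variable is set before sqlglot is imported for the first time:

```python
# A minimal sketch, assuming you want to force the pure-Python tokenizer (e.g. for debugging).
# The flag is read at import time, so it must be set before sqlglot.tokens is first imported.
import os

os.environ["SQLGLOTRS_TOKENIZER"] = "0"

from sqlglot import tokens

print(tokens.USE_RS_TOKENIZER)  # False: sqlglotrs is missing or the flag above disabled it
```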
class Token:
Token(token_type: sqlglot.token_type.TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: Optional[List[str]] = None)
Token initializer.
Arguments:
- token_type: The TokenType Enum.
- text: The text of the token.
- line: The line that the token ends on.
- col: The column that the token ends on.
- start: The start index of the token.
- end: The ending index of the token.
- comments: The comments to attach to the token.
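For illustration, a small sketch of constructing a Token by hand with explicit position information; the repr shown in the comment is indicative only:

```python
# Illustrative only: a Token built directly rather than by the tokenizer.
from sqlglot.tokens import Token, TokenType

tok = Token(TokenType.SELECT, "SELECT", line=1, col=6, start=0, end=5)
print(tok)           # roughly: <Token token_type: TokenType.SELECT, text: SELECT, line: 1, col: 6, ...>
print(tok.comments)  # [] by default
```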
```python
@classmethod
def number(cls, number: int) -> Token:
    """Returns a NUMBER token with `number` as its text."""
    return cls(TokenType.NUMBER, str(number))
```
Returns a NUMBER token with `number` as its text.
```python
@classmethod
def string(cls, string: str) -> Token:
    """Returns a STRING token with `string` as its text."""
    return cls(TokenType.STRING, string)
```
Returns a STRING token with `string` as its text.
```python
@classmethod
def identifier(cls, identifier: str) -> Token:
    """Returns an IDENTIFIER token with `identifier` as its text."""
    return cls(TokenType.IDENTIFIER, identifier)
```
Returns an IDENTIFIER token with `identifier` as its text.
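The classmethods above simply wrap the constructor with a fixed token type; a short sketch with arbitrary example values:

```python
# The helpers only fix the token type; text becomes the stringified argument.
from sqlglot.tokens import Token, TokenType

assert Token.number(42).token_type is TokenType.NUMBER        # text == "42"
assert Token.string("hello").token_type is TokenType.STRING
assert Token.identifier("my_col").token_type is TokenType.IDENTIFIER
assert Token.var("x").token_type is TokenType.VAR
```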
class Tokenizer:
Tokenizer(dialect: Union[str, sqlglot.dialects.dialect.Dialect, Type[sqlglot.dialects.dialect.Dialect], NoneType] = None)
    def __init__(self, dialect: DialectType = None) -> None:
        from sqlglot.dialects import Dialect

        self.dialect = Dialect.get_or_raise(dialect)

        if USE_RS_TOKENIZER:
            self._rs_dialect_settings = RsTokenizerDialectSettings(
                escape_sequences=self.dialect.ESCAPE_SEQUENCES,
                identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
            )

        self.reset()
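As a usage sketch, the constructor accepts a dialect name, Dialect class, or Dialect instance and resolves it through Dialect.get_or_raise; the dialect name below is only an example:

from sqlglot.tokens import Tokenizer

generic = Tokenizer()                       # default, dialect-agnostic settings
snowflake = Tokenizer(dialect="snowflake")  # resolved by name; unknown names raise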
SINGLE_TOKENS =
{'(': <TokenType.L_PAREN: 'L_PAREN'>, ')': <TokenType.R_PAREN: 'R_PAREN'>, '[': <TokenType.L_BRACKET: 'L_BRACKET'>, ']': <TokenType.R_BRACKET: 'R_BRACKET'>, '{': <TokenType.L_BRACE: 'L_BRACE'>, '}': <TokenType.R_BRACE: 'R_BRACE'>, '&': <TokenType.AMP: 'AMP'>, '^': <TokenType.CARET: 'CARET'>, ':': <TokenType.COLON: 'COLON'>, ',': <TokenType.COMMA: 'COMMA'>, '.': <TokenType.DOT: 'DOT'>, '-': <TokenType.DASH: 'DASH'>, '=': <TokenType.EQ: 'EQ'>, '>': <TokenType.GT: 'GT'>, '<': <TokenType.LT: 'LT'>, '%': <TokenType.MOD: 'MOD'>, '!': <TokenType.NOT: 'NOT'>, '|': <TokenType.PIPE: 'PIPE'>, '+': <TokenType.PLUS: 'PLUS'>, ';': <TokenType.SEMICOLON: 'SEMICOLON'>, '/': <TokenType.SLASH: 'SLASH'>, '\\': <TokenType.BACKSLASH: 'BACKSLASH'>, '*': <TokenType.STAR: 'STAR'>, '~': <TokenType.TILDA: 'TILDA'>, '?': <TokenType.PLACEHOLDER: 'PLACEHOLDER'>, '@': <TokenType.PARAMETER: 'PARAMETER'>, "'": <TokenType.QUOTE: 'QUOTE'>, '`': <TokenType.IDENTIFIER: 'IDENTIFIER'>, '"': <TokenType.IDENTIFIER: 'IDENTIFIER'>, '#': <TokenType.HASH: 'HASH'>}
KEYWORDS: Dict[str, sqlglot.token_type.TokenType] =
{'{%': <TokenType.BLOCK_START: 'BLOCK_START'>, '{%+': <TokenType.BLOCK_START: 'BLOCK_START'>, '{%-': <TokenType.BLOCK_START: 'BLOCK_START'>, '%}': <TokenType.BLOCK_END: 'BLOCK_END'>, '+%}': <TokenType.BLOCK_END: 'BLOCK_END'>, '-%}': <TokenType.BLOCK_END: 'BLOCK_END'>, '{{+': <TokenType.BLOCK_START: 'BLOCK_START'>, '{{-': <TokenType.BLOCK_START: 'BLOCK_START'>, '+}}': <TokenType.BLOCK_END: 'BLOCK_END'>, '-}}': <TokenType.BLOCK_END: 'BLOCK_END'>, '/*+': <TokenType.HINT: 'HINT'>, '==': <TokenType.EQ: 'EQ'>, '::': <TokenType.DCOLON: 'DCOLON'>, '||': <TokenType.DPIPE: 'DPIPE'>, '>=': <TokenType.GTE: 'GTE'>, '<=': <TokenType.LTE: 'LTE'>, '<>': <TokenType.NEQ: 'NEQ'>, '!=': <TokenType.NEQ: 'NEQ'>, ':=': <TokenType.COLON_EQ: 'COLON_EQ'>, '<=>': <TokenType.NULLSAFE_EQ: 'NULLSAFE_EQ'>, '->': <TokenType.ARROW: 'ARROW'>, '->>': <TokenType.DARROW: 'DARROW'>, '=>': <TokenType.FARROW: 'FARROW'>, '#>': <TokenType.HASH_ARROW: 'HASH_ARROW'>, '#>>': <TokenType.DHASH_ARROW: 'DHASH_ARROW'>, '<->': <TokenType.LR_ARROW: 'LR_ARROW'>, '&&': <TokenType.DAMP: 'DAMP'>, '??': <TokenType.DQMARK: 'DQMARK'>, 'ALL': <TokenType.ALL: 'ALL'>, 'ALWAYS': <TokenType.ALWAYS: 'ALWAYS'>, 'AND': <TokenType.AND: 'AND'>, 'ANTI': <TokenType.ANTI: 'ANTI'>, 'ANY': <TokenType.ANY: 'ANY'>, 'ASC': <TokenType.ASC: 'ASC'>, 'AS': <TokenType.ALIAS: 'ALIAS'>, 'ASOF': <TokenType.ASOF: 'ASOF'>, 'AUTOINCREMENT': <TokenType.AUTO_INCREMENT: 'AUTO_INCREMENT'>, 'AUTO_INCREMENT': <TokenType.AUTO_INCREMENT: 'AUTO_INCREMENT'>, 'BEGIN': <TokenType.BEGIN: 'BEGIN'>, 'BETWEEN': <TokenType.BETWEEN: 'BETWEEN'>, 'CACHE': <TokenType.CACHE: 'CACHE'>, 'UNCACHE': <TokenType.UNCACHE: 'UNCACHE'>, 'CASE': <TokenType.CASE: 'CASE'>, 'CHARACTER SET': <TokenType.CHARACTER_SET: 'CHARACTER_SET'>, 'CLUSTER BY': <TokenType.CLUSTER_BY: 'CLUSTER_BY'>, 'COLLATE': <TokenType.COLLATE: 'COLLATE'>, 'COLUMN': <TokenType.COLUMN: 'COLUMN'>, 'COMMIT': <TokenType.COMMIT: 'COMMIT'>, 'CONNECT BY': <TokenType.CONNECT_BY: 'CONNECT_BY'>, 'CONSTRAINT': <TokenType.CONSTRAINT: 'CONSTRAINT'>, 'CREATE': <TokenType.CREATE: 'CREATE'>, 'CROSS': <TokenType.CROSS: 'CROSS'>, 'CUBE': <TokenType.CUBE: 'CUBE'>, 'CURRENT_DATE': <TokenType.CURRENT_DATE: 'CURRENT_DATE'>, 'CURRENT_TIME': <TokenType.CURRENT_TIME: 'CURRENT_TIME'>, 'CURRENT_TIMESTAMP': <TokenType.CURRENT_TIMESTAMP: 'CURRENT_TIMESTAMP'>, 'CURRENT_USER': <TokenType.CURRENT_USER: 'CURRENT_USER'>, 'DATABASE': <TokenType.DATABASE: 'DATABASE'>, 'DEFAULT': <TokenType.DEFAULT: 'DEFAULT'>, 'DELETE': <TokenType.DELETE: 'DELETE'>, 'DESC': <TokenType.DESC: 'DESC'>, 'DESCRIBE': <TokenType.DESCRIBE: 'DESCRIBE'>, 'DISTINCT': <TokenType.DISTINCT: 'DISTINCT'>, 'DISTRIBUTE BY': <TokenType.DISTRIBUTE_BY: 'DISTRIBUTE_BY'>, 'DIV': <TokenType.DIV: 'DIV'>, 'DROP': <TokenType.DROP: 'DROP'>, 'ELSE': <TokenType.ELSE: 'ELSE'>, 'END': <TokenType.END: 'END'>, 'ESCAPE': <TokenType.ESCAPE: 'ESCAPE'>, 'EXCEPT': <TokenType.EXCEPT: 'EXCEPT'>, 'EXECUTE': <TokenType.EXECUTE: 'EXECUTE'>, 'EXISTS': <TokenType.EXISTS: 'EXISTS'>, 'FALSE': <TokenType.FALSE: 'FALSE'>, 'FETCH': <TokenType.FETCH: 'FETCH'>, 'FILTER': <TokenType.FILTER: 'FILTER'>, 'FIRST': <TokenType.FIRST: 'FIRST'>, 'FULL': <TokenType.FULL: 'FULL'>, 'FUNCTION': <TokenType.FUNCTION: 'FUNCTION'>, 'FOR': <TokenType.FOR: 'FOR'>, 'FOREIGN KEY': <TokenType.FOREIGN_KEY: 'FOREIGN_KEY'>, 'FORMAT': <TokenType.FORMAT: 'FORMAT'>, 'FROM': <TokenType.FROM: 'FROM'>, 'GEOGRAPHY': <TokenType.GEOGRAPHY: 'GEOGRAPHY'>, 'GEOMETRY': <TokenType.GEOMETRY: 'GEOMETRY'>, 'GLOB': <TokenType.GLOB: 'GLOB'>, 'GROUP BY': <TokenType.GROUP_BY: 'GROUP_BY'>, 
'GROUPING SETS': <TokenType.GROUPING_SETS: 'GROUPING_SETS'>, 'HAVING': <TokenType.HAVING: 'HAVING'>, 'ILIKE': <TokenType.ILIKE: 'ILIKE'>, 'IN': <TokenType.IN: 'IN'>, 'INDEX': <TokenType.INDEX: 'INDEX'>, 'INET': <TokenType.INET: 'INET'>, 'INNER': <TokenType.INNER: 'INNER'>, 'INSERT': <TokenType.INSERT: 'INSERT'>, 'INTERVAL': <TokenType.INTERVAL: 'INTERVAL'>, 'INTERSECT': <TokenType.INTERSECT: 'INTERSECT'>, 'INTO': <TokenType.INTO: 'INTO'>, 'IS': <TokenType.IS: 'IS'>, 'ISNULL': <TokenType.ISNULL: 'ISNULL'>, 'JOIN': <TokenType.JOIN: 'JOIN'>, 'KEEP': <TokenType.KEEP: 'KEEP'>, 'KILL': <TokenType.KILL: 'KILL'>, 'LATERAL': <TokenType.LATERAL: 'LATERAL'>, 'LEFT': <TokenType.LEFT: 'LEFT'>, 'LIKE': <TokenType.LIKE: 'LIKE'>, 'LIMIT': <TokenType.LIMIT: 'LIMIT'>, 'LOAD': <TokenType.LOAD: 'LOAD'>, 'LOCK': <TokenType.LOCK: 'LOCK'>, 'MERGE': <TokenType.MERGE: 'MERGE'>, 'NATURAL': <TokenType.NATURAL: 'NATURAL'>, 'NEXT': <TokenType.NEXT: 'NEXT'>, 'NOT': <TokenType.NOT: 'NOT'>, 'NOTNULL': <TokenType.NOTNULL: 'NOTNULL'>, 'NULL': <TokenType.NULL: 'NULL'>, 'OBJECT': <TokenType.OBJECT: 'OBJECT'>, 'OFFSET': <TokenType.OFFSET: 'OFFSET'>, 'ON': <TokenType.ON: 'ON'>, 'OR': <TokenType.OR: 'OR'>, 'XOR': <TokenType.XOR: 'XOR'>, 'ORDER BY': <TokenType.ORDER_BY: 'ORDER_BY'>, 'ORDINALITY': <TokenType.ORDINALITY: 'ORDINALITY'>, 'OUTER': <TokenType.OUTER: 'OUTER'>, 'OVER': <TokenType.OVER: 'OVER'>, 'OVERLAPS': <TokenType.OVERLAPS: 'OVERLAPS'>, 'OVERWRITE': <TokenType.OVERWRITE: 'OVERWRITE'>, 'PARTITION': <TokenType.PARTITION: 'PARTITION'>, 'PARTITION BY': <TokenType.PARTITION_BY: 'PARTITION_BY'>, 'PARTITIONED BY': <TokenType.PARTITION_BY: 'PARTITION_BY'>, 'PARTITIONED_BY': <TokenType.PARTITION_BY: 'PARTITION_BY'>, 'PERCENT': <TokenType.PERCENT: 'PERCENT'>, 'PIVOT': <TokenType.PIVOT: 'PIVOT'>, 'PRAGMA': <TokenType.PRAGMA: 'PRAGMA'>, 'PRIMARY KEY': <TokenType.PRIMARY_KEY: 'PRIMARY_KEY'>, 'PROCEDURE': <TokenType.PROCEDURE: 'PROCEDURE'>, 'QUALIFY': <TokenType.QUALIFY: 'QUALIFY'>, 'RANGE': <TokenType.RANGE: 'RANGE'>, 'RECURSIVE': <TokenType.RECURSIVE: 'RECURSIVE'>, 'REGEXP': <TokenType.RLIKE: 'RLIKE'>, 'REPLACE': <TokenType.REPLACE: 'REPLACE'>, 'RETURNING': <TokenType.RETURNING: 'RETURNING'>, 'REFERENCES': <TokenType.REFERENCES: 'REFERENCES'>, 'RIGHT': <TokenType.RIGHT: 'RIGHT'>, 'RLIKE': <TokenType.RLIKE: 'RLIKE'>, 'ROLLBACK': <TokenType.ROLLBACK: 'ROLLBACK'>, 'ROLLUP': <TokenType.ROLLUP: 'ROLLUP'>, 'ROW': <TokenType.ROW: 'ROW'>, 'ROWS': <TokenType.ROWS: 'ROWS'>, 'SCHEMA': <TokenType.SCHEMA: 'SCHEMA'>, 'SELECT': <TokenType.SELECT: 'SELECT'>, 'SEMI': <TokenType.SEMI: 'SEMI'>, 'SET': <TokenType.SET: 'SET'>, 'SETTINGS': <TokenType.SETTINGS: 'SETTINGS'>, 'SHOW': <TokenType.SHOW: 'SHOW'>, 'SIMILAR TO': <TokenType.SIMILAR_TO: 'SIMILAR_TO'>, 'SOME': <TokenType.SOME: 'SOME'>, 'SORT BY': <TokenType.SORT_BY: 'SORT_BY'>, 'START WITH': <TokenType.START_WITH: 'START_WITH'>, 'TABLE': <TokenType.TABLE: 'TABLE'>, 'TABLESAMPLE': <TokenType.TABLE_SAMPLE: 'TABLE_SAMPLE'>, 'TEMP': <TokenType.TEMPORARY: 'TEMPORARY'>, 'TEMPORARY': <TokenType.TEMPORARY: 'TEMPORARY'>, 'THEN': <TokenType.THEN: 'THEN'>, 'TRUE': <TokenType.TRUE: 'TRUE'>, 'UNION': <TokenType.UNION: 'UNION'>, 'UNKNOWN': <TokenType.UNKNOWN: 'UNKNOWN'>, 'UNNEST': <TokenType.UNNEST: 'UNNEST'>, 'UNPIVOT': <TokenType.UNPIVOT: 'UNPIVOT'>, 'UPDATE': <TokenType.UPDATE: 'UPDATE'>, 'USE': <TokenType.USE: 'USE'>, 'USING': <TokenType.USING: 'USING'>, 'UUID': <TokenType.UUID: 'UUID'>, 'VALUES': <TokenType.VALUES: 'VALUES'>, 'VIEW': <TokenType.VIEW: 'VIEW'>, 'VOLATILE': <TokenType.VOLATILE: 'VOLATILE'>, 
'WHEN': <TokenType.WHEN: 'WHEN'>, 'WHERE': <TokenType.WHERE: 'WHERE'>, 'WINDOW': <TokenType.WINDOW: 'WINDOW'>, 'WITH': <TokenType.WITH: 'WITH'>, 'APPLY': <TokenType.APPLY: 'APPLY'>, 'ARRAY': <TokenType.ARRAY: 'ARRAY'>, 'BIT': <TokenType.BIT: 'BIT'>, 'BOOL': <TokenType.BOOLEAN: 'BOOLEAN'>, 'BOOLEAN': <TokenType.BOOLEAN: 'BOOLEAN'>, 'BYTE': <TokenType.TINYINT: 'TINYINT'>, 'MEDIUMINT': <TokenType.MEDIUMINT: 'MEDIUMINT'>, 'INT1': <TokenType.TINYINT: 'TINYINT'>, 'TINYINT': <TokenType.TINYINT: 'TINYINT'>, 'INT16': <TokenType.SMALLINT: 'SMALLINT'>, 'SHORT': <TokenType.SMALLINT: 'SMALLINT'>, 'SMALLINT': <TokenType.SMALLINT: 'SMALLINT'>, 'INT128': <TokenType.INT128: 'INT128'>, 'HUGEINT': <TokenType.INT128: 'INT128'>, 'INT2': <TokenType.SMALLINT: 'SMALLINT'>, 'INTEGER': <TokenType.INT: 'INT'>, 'INT': <TokenType.INT: 'INT'>, 'INT4': <TokenType.INT: 'INT'>, 'INT32': <TokenType.INT: 'INT'>, 'INT64': <TokenType.BIGINT: 'BIGINT'>, 'LONG': <TokenType.BIGINT: 'BIGINT'>, 'BIGINT': <TokenType.BIGINT: 'BIGINT'>, 'INT8': <TokenType.TINYINT: 'TINYINT'>, 'DEC': <TokenType.DECIMAL: 'DECIMAL'>, 'DECIMAL': <TokenType.DECIMAL: 'DECIMAL'>, 'BIGDECIMAL': <TokenType.BIGDECIMAL: 'BIGDECIMAL'>, 'BIGNUMERIC': <TokenType.BIGDECIMAL: 'BIGDECIMAL'>, 'MAP': <TokenType.MAP: 'MAP'>, 'NULLABLE': <TokenType.NULLABLE: 'NULLABLE'>, 'NUMBER': <TokenType.DECIMAL: 'DECIMAL'>, 'NUMERIC': <TokenType.DECIMAL: 'DECIMAL'>, 'FIXED': <TokenType.DECIMAL: 'DECIMAL'>, 'REAL': <TokenType.FLOAT: 'FLOAT'>, 'FLOAT': <TokenType.FLOAT: 'FLOAT'>, 'FLOAT4': <TokenType.FLOAT: 'FLOAT'>, 'FLOAT8': <TokenType.DOUBLE: 'DOUBLE'>, 'DOUBLE': <TokenType.DOUBLE: 'DOUBLE'>, 'DOUBLE PRECISION': <TokenType.DOUBLE: 'DOUBLE'>, 'JSON': <TokenType.JSON: 'JSON'>, 'CHAR': <TokenType.CHAR: 'CHAR'>, 'CHARACTER': <TokenType.CHAR: 'CHAR'>, 'NCHAR': <TokenType.NCHAR: 'NCHAR'>, 'VARCHAR': <TokenType.VARCHAR: 'VARCHAR'>, 'VARCHAR2': <TokenType.VARCHAR: 'VARCHAR'>, 'NVARCHAR': <TokenType.NVARCHAR: 'NVARCHAR'>, 'NVARCHAR2': <TokenType.NVARCHAR: 'NVARCHAR'>, 'STR': <TokenType.TEXT: 'TEXT'>, 'STRING': <TokenType.TEXT: 'TEXT'>, 'TEXT': <TokenType.TEXT: 'TEXT'>, 'LONGTEXT': <TokenType.LONGTEXT: 'LONGTEXT'>, 'MEDIUMTEXT': <TokenType.MEDIUMTEXT: 'MEDIUMTEXT'>, 'TINYTEXT': <TokenType.TINYTEXT: 'TINYTEXT'>, 'CLOB': <TokenType.TEXT: 'TEXT'>, 'LONGVARCHAR': <TokenType.TEXT: 'TEXT'>, 'BINARY': <TokenType.BINARY: 'BINARY'>, 'BLOB': <TokenType.VARBINARY: 'VARBINARY'>, 'LONGBLOB': <TokenType.LONGBLOB: 'LONGBLOB'>, 'MEDIUMBLOB': <TokenType.MEDIUMBLOB: 'MEDIUMBLOB'>, 'TINYBLOB': <TokenType.TINYBLOB: 'TINYBLOB'>, 'BYTEA': <TokenType.VARBINARY: 'VARBINARY'>, 'VARBINARY': <TokenType.VARBINARY: 'VARBINARY'>, 'TIME': <TokenType.TIME: 'TIME'>, 'TIMETZ': <TokenType.TIMETZ: 'TIMETZ'>, 'TIMESTAMP': <TokenType.TIMESTAMP: 'TIMESTAMP'>, 'TIMESTAMPTZ': <TokenType.TIMESTAMPTZ: 'TIMESTAMPTZ'>, 'TIMESTAMPLTZ': <TokenType.TIMESTAMPLTZ: 'TIMESTAMPLTZ'>, 'DATE': <TokenType.DATE: 'DATE'>, 'DATETIME': <TokenType.DATETIME: 'DATETIME'>, 'INT4RANGE': <TokenType.INT4RANGE: 'INT4RANGE'>, 'INT4MULTIRANGE': <TokenType.INT4MULTIRANGE: 'INT4MULTIRANGE'>, 'INT8RANGE': <TokenType.INT8RANGE: 'INT8RANGE'>, 'INT8MULTIRANGE': <TokenType.INT8MULTIRANGE: 'INT8MULTIRANGE'>, 'NUMRANGE': <TokenType.NUMRANGE: 'NUMRANGE'>, 'NUMMULTIRANGE': <TokenType.NUMMULTIRANGE: 'NUMMULTIRANGE'>, 'TSRANGE': <TokenType.TSRANGE: 'TSRANGE'>, 'TSMULTIRANGE': <TokenType.TSMULTIRANGE: 'TSMULTIRANGE'>, 'TSTZRANGE': <TokenType.TSTZRANGE: 'TSTZRANGE'>, 'TSTZMULTIRANGE': <TokenType.TSTZMULTIRANGE: 'TSTZMULTIRANGE'>, 'DATERANGE': <TokenType.DATERANGE: 
'DATERANGE'>, 'DATEMULTIRANGE': <TokenType.DATEMULTIRANGE: 'DATEMULTIRANGE'>, 'UNIQUE': <TokenType.UNIQUE: 'UNIQUE'>, 'STRUCT': <TokenType.STRUCT: 'STRUCT'>, 'VARIANT': <TokenType.VARIANT: 'VARIANT'>, 'ALTER': <TokenType.ALTER: 'ALTER'>, 'ANALYZE': <TokenType.COMMAND: 'COMMAND'>, 'CALL': <TokenType.COMMAND: 'COMMAND'>, 'COMMENT': <TokenType.COMMENT: 'COMMENT'>, 'COPY': <TokenType.COMMAND: 'COMMAND'>, 'EXPLAIN': <TokenType.COMMAND: 'COMMAND'>, 'GRANT': <TokenType.COMMAND: 'COMMAND'>, 'OPTIMIZE': <TokenType.COMMAND: 'COMMAND'>, 'PREPARE': <TokenType.COMMAND: 'COMMAND'>, 'TRUNCATE': <TokenType.COMMAND: 'COMMAND'>, 'VACUUM': <TokenType.COMMAND: 'COMMAND'>, 'USER-DEFINED': <TokenType.USERDEFINED: 'USERDEFINED'>, 'FOR VERSION': <TokenType.VERSION_SNAPSHOT: 'VERSION_SNAPSHOT'>, 'FOR TIMESTAMP': <TokenType.TIMESTAMP_SNAPSHOT: 'TIMESTAMP_SNAPSHOT'>}
WHITE_SPACE: Dict[Optional[str], sqlglot.token_type.TokenType] =
{' ': <TokenType.SPACE: 'SPACE'>, '\t': <TokenType.SPACE: 'SPACE'>, '\n': <TokenType.BREAK: 'BREAK'>, '\r': <TokenType.BREAK: 'BREAK'>}
COMMANDS =
{<TokenType.EXECUTE: 'EXECUTE'>, <TokenType.SHOW: 'SHOW'>, <TokenType.FETCH: 'FETCH'>, <TokenType.COMMAND: 'COMMAND'>}
def reset(self) -> None:
    def reset(self) -> None:
        self.sql = ""
        self.size = 0
        self.tokens: t.List[Token] = []
        self._start = 0
        self._current = 0
        self._line = 1
        self._col = 0
        self._comments: t.List[str] = []

        self._char = ""
        self._end = False
        self._peek = ""
        self._prev_token_line = -1
    def tokenize(self, sql: str) -> t.List[Token]:
        """Returns a list of tokens corresponding to the SQL string `sql`."""
        if USE_RS_TOKENIZER:
            return self.tokenize_rs(sql)

        self.reset()
        self.sql = sql
        self.size = len(sql)

        try:
            self._scan()
        except Exception as e:
            start = max(self._current - 50, 0)
            end = min(self._current + 50, self.size - 1)
            context = self.sql[start:end]
            raise TokenError(f"Error tokenizing '{context}'") from e

        return self.tokens
Returns a list of tokens corresponding to the SQL string `sql`.
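For example, a minimal call with the generic dialect; the token types noted in the comment are the expected defaults, not a guaranteed output format:

from sqlglot.tokens import Tokenizer

tokens = Tokenizer().tokenize("SELECT a FROM b")
print([(token.token_type, token.text) for token in tokens])
# Roughly: [(TokenType.SELECT, 'SELECT'), (TokenType.VAR, 'a'),
#           (TokenType.FROM, 'FROM'), (TokenType.VAR, 'b')]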
    def tokenize_rs(self, sql: str) -> t.List[Token]:
        if not self._RS_TOKENIZER:
            raise SqlglotError("Rust tokenizer is not available")

        try:
            return [
                Token(
                    token_type=_ALL_TOKEN_TYPES[token.token_type.index],
                    text=token.text,
                    line=token.line,
                    col=token.col,
                    start=token.start,
                    end=token.end,
                    comments=token.comments,
                )
                for token in self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
            ]
        except Exception as e:
            raise TokenError(str(e))
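tokenize_rs is only usable when the optional sqlglotrs extension could be imported; otherwise it raises. A hedged sketch of falling back to the pure-Python scanner (note that tokenize already performs this dispatch automatically):

from sqlglot.errors import SqlglotError
from sqlglot.tokens import Tokenizer

tokenizer = Tokenizer()
try:
    tokens = tokenizer.tokenize_rs("SELECT 1")
except SqlglotError:
    # Rust tokenizer unavailable (e.g. sqlglotrs not installed); use the
    # regular entry point, which handles the Python fallback itself.
    tokens = tokenizer.tokenize("SELECT 1")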