
sqlglot.tokens

  1from __future__ import annotations
  2
  3import os
  4import typing as t
  5
  6from sqlglot.errors import SqlglotError, TokenError
  7from sqlglot.token_type import TokenType
  8from sqlglot.trie import TrieResult, in_trie, new_trie
  9
 10if t.TYPE_CHECKING:
 11    from sqlglot.dialects.dialect import DialectType
 12
 13
 14try:
 15    from sqlglotrs import (  # type: ignore
 16        Tokenizer as RsTokenizer,
 17        TokenizerDialectSettings as RsTokenizerDialectSettings,
 18        TokenizerSettings as RsTokenizerSettings,
 19    )
 20
 21    USE_RS_TOKENIZER = os.environ.get("SQLGLOTRS_TOKENIZER", "1") == "1"
 22except ImportError:
 23    USE_RS_TOKENIZER = False
 24
 25
 26class Token:
 27    __slots__ = ("token_type", "text", "line", "col", "start", "end", "comments")
 28
 29    @classmethod
 30    def number(cls, number: int) -> Token:
 31        """Returns a NUMBER token with `number` as its text."""
 32        return cls(TokenType.NUMBER, str(number))
 33
 34    @classmethod
 35    def string(cls, string: str) -> Token:
 36        """Returns a STRING token with `string` as its text."""
 37        return cls(TokenType.STRING, string)
 38
 39    @classmethod
 40    def identifier(cls, identifier: str) -> Token:
 41        """Returns an IDENTIFIER token with `identifier` as its text."""
 42        return cls(TokenType.IDENTIFIER, identifier)
 43
 44    @classmethod
 45    def var(cls, var: str) -> Token:
 46        """Returns a VAR token with `var` as its text."""
 47        return cls(TokenType.VAR, var)
 48
 49    def __init__(
 50        self,
 51        token_type: TokenType,
 52        text: str,
 53        line: int = 1,
 54        col: int = 1,
 55        start: int = 0,
 56        end: int = 0,
 57        comments: t.Optional[t.List[str]] = None,
 58    ) -> None:
 59        """Token initializer.
 60
 61        Args:
 62            token_type: The TokenType Enum.
 63            text: The text of the token.
 64            line: The line that the token ends on.
 65            col: The column that the token ends on.
 66            start: The start index of the token.
 67            end: The ending index of the token.
 68            comments: The comments to attach to the token.
 69        """
 70        self.token_type = token_type
 71        self.text = text
 72        self.line = line
 73        self.col = col
 74        self.start = start
 75        self.end = end
 76        self.comments = [] if comments is None else comments
 77
 78    def __repr__(self) -> str:
 79        attributes = ", ".join(f"{k}: {getattr(self, k)}" for k in self.__slots__)
 80        return f"<Token {attributes}>"
 81
 82
 83class _Tokenizer(type):
 84    def __new__(cls, clsname, bases, attrs):
 85        klass = super().__new__(cls, clsname, bases, attrs)
 86
 87        def _convert_quotes(arr: t.List[str | t.Tuple[str, str]]) -> t.Dict[str, str]:
 88            return dict(
 89                (item, item) if isinstance(item, str) else (item[0], item[1]) for item in arr
 90            )
 91
 92        def _quotes_to_format(
 93            token_type: TokenType, arr: t.List[str | t.Tuple[str, str]]
 94        ) -> t.Dict[str, t.Tuple[str, TokenType]]:
 95            return {k: (v, token_type) for k, v in _convert_quotes(arr).items()}
 96
 97        klass._QUOTES = _convert_quotes(klass.QUOTES)
 98        klass._IDENTIFIERS = _convert_quotes(klass.IDENTIFIERS)
 99
100        klass._FORMAT_STRINGS = {
101            **{
102                p + s: (e, TokenType.NATIONAL_STRING)
103                for s, e in klass._QUOTES.items()
104                for p in ("n", "N")
105            },
106            **_quotes_to_format(TokenType.BIT_STRING, klass.BIT_STRINGS),
107            **_quotes_to_format(TokenType.BYTE_STRING, klass.BYTE_STRINGS),
108            **_quotes_to_format(TokenType.HEX_STRING, klass.HEX_STRINGS),
109            **_quotes_to_format(TokenType.RAW_STRING, klass.RAW_STRINGS),
110            **_quotes_to_format(TokenType.HEREDOC_STRING, klass.HEREDOC_STRINGS),
111        }
112
113        klass._STRING_ESCAPES = set(klass.STRING_ESCAPES)
114        klass._IDENTIFIER_ESCAPES = set(klass.IDENTIFIER_ESCAPES)
115        klass._COMMENTS = {
116            **dict(
117                (comment, None) if isinstance(comment, str) else (comment[0], comment[1])
118                for comment in klass.COMMENTS
119            ),
120            "{#": "#}",  # Ensure Jinja comments are tokenized correctly in all dialects
121        }
122
123        klass._KEYWORD_TRIE = new_trie(
124            key.upper()
125            for key in (
126                *klass.KEYWORDS,
127                *klass._COMMENTS,
128                *klass._QUOTES,
129                *klass._FORMAT_STRINGS,
130            )
131            if " " in key or any(single in key for single in klass.SINGLE_TOKENS)
132        )
133
134        if USE_RS_TOKENIZER:
135            settings = RsTokenizerSettings(
136                white_space={k: v.name for k, v in klass.WHITE_SPACE.items()},
137                single_tokens={k: v.name for k, v in klass.SINGLE_TOKENS.items()},
138                keywords={k: v.name for k, v in klass.KEYWORDS.items()},
139                numeric_literals=klass.NUMERIC_LITERALS,
140                identifiers=klass._IDENTIFIERS,
141                identifier_escapes=klass._IDENTIFIER_ESCAPES,
142                string_escapes=klass._STRING_ESCAPES,
143                quotes=klass._QUOTES,
144                format_strings={k: (v1, v2.name) for k, (v1, v2) in klass._FORMAT_STRINGS.items()},
145                has_bit_strings=bool(klass.BIT_STRINGS),
146                has_hex_strings=bool(klass.HEX_STRINGS),
147                comments=klass._COMMENTS,
148                var_single_tokens=klass.VAR_SINGLE_TOKENS,
149                commands={v.name for v in klass.COMMANDS},
150                command_prefix_tokens={v.name for v in klass.COMMAND_PREFIX_TOKENS},
151            )
152            klass._RS_TOKENIZER = RsTokenizer(settings)
153        else:
154            klass._RS_TOKENIZER = None
155
156        return klass
157
158
159class Tokenizer(metaclass=_Tokenizer):
160    SINGLE_TOKENS = {
161        "(": TokenType.L_PAREN,
162        ")": TokenType.R_PAREN,
163        "[": TokenType.L_BRACKET,
164        "]": TokenType.R_BRACKET,
165        "{": TokenType.L_BRACE,
166        "}": TokenType.R_BRACE,
167        "&": TokenType.AMP,
168        "^": TokenType.CARET,
169        ":": TokenType.COLON,
170        ",": TokenType.COMMA,
171        ".": TokenType.DOT,
172        "-": TokenType.DASH,
173        "=": TokenType.EQ,
174        ">": TokenType.GT,
175        "<": TokenType.LT,
176        "%": TokenType.MOD,
177        "!": TokenType.NOT,
178        "|": TokenType.PIPE,
179        "+": TokenType.PLUS,
180        ";": TokenType.SEMICOLON,
181        "/": TokenType.SLASH,
182        "\\": TokenType.BACKSLASH,
183        "*": TokenType.STAR,
184        "~": TokenType.TILDA,
185        "?": TokenType.PLACEHOLDER,
186        "@": TokenType.PARAMETER,
187        # used for breaking a var like x'y' but nothing else
188        # the token type doesn't matter
189        "'": TokenType.QUOTE,
190        "`": TokenType.IDENTIFIER,
191        '"': TokenType.IDENTIFIER,
192        "#": TokenType.HASH,
193    }
194
195    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
196    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
197    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
198    RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
199    HEREDOC_STRINGS: t.List[str | t.Tuple[str, str]] = []
200    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
201    IDENTIFIER_ESCAPES = ['"']
202    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
203    STRING_ESCAPES = ["'"]
204    VAR_SINGLE_TOKENS: t.Set[str] = set()
205
206    # Autofilled
207    _COMMENTS: t.Dict[str, str] = {}
208    _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
209    _IDENTIFIERS: t.Dict[str, str] = {}
210    _IDENTIFIER_ESCAPES: t.Set[str] = set()
211    _QUOTES: t.Dict[str, str] = {}
212    _STRING_ESCAPES: t.Set[str] = set()
213    _KEYWORD_TRIE: t.Dict = {}
214    _RS_TOKENIZER: t.Optional[t.Any] = None
215
216    KEYWORDS: t.Dict[str, TokenType] = {
217        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
218        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
219        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
220        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
221        "/*+": TokenType.HINT,
222        "==": TokenType.EQ,
223        "::": TokenType.DCOLON,
224        "||": TokenType.DPIPE,
225        ">=": TokenType.GTE,
226        "<=": TokenType.LTE,
227        "<>": TokenType.NEQ,
228        "!=": TokenType.NEQ,
229        ":=": TokenType.COLON_EQ,
230        "<=>": TokenType.NULLSAFE_EQ,
231        "->": TokenType.ARROW,
232        "->>": TokenType.DARROW,
233        "=>": TokenType.FARROW,
234        "#>": TokenType.HASH_ARROW,
235        "#>>": TokenType.DHASH_ARROW,
236        "<->": TokenType.LR_ARROW,
237        "&&": TokenType.DAMP,
238        "??": TokenType.DQMARK,
239        "ALL": TokenType.ALL,
240        "ALWAYS": TokenType.ALWAYS,
241        "AND": TokenType.AND,
242        "ANTI": TokenType.ANTI,
243        "ANY": TokenType.ANY,
244        "ASC": TokenType.ASC,
245        "AS": TokenType.ALIAS,
246        "ASOF": TokenType.ASOF,
247        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
248        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
249        "BEGIN": TokenType.BEGIN,
250        "BETWEEN": TokenType.BETWEEN,
251        "CACHE": TokenType.CACHE,
252        "UNCACHE": TokenType.UNCACHE,
253        "CASE": TokenType.CASE,
254        "CHARACTER SET": TokenType.CHARACTER_SET,
255        "CLUSTER BY": TokenType.CLUSTER_BY,
256        "COLLATE": TokenType.COLLATE,
257        "COLUMN": TokenType.COLUMN,
258        "COMMIT": TokenType.COMMIT,
259        "CONNECT BY": TokenType.CONNECT_BY,
260        "CONSTRAINT": TokenType.CONSTRAINT,
261        "CREATE": TokenType.CREATE,
262        "CROSS": TokenType.CROSS,
263        "CUBE": TokenType.CUBE,
264        "CURRENT_DATE": TokenType.CURRENT_DATE,
265        "CURRENT_TIME": TokenType.CURRENT_TIME,
266        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
267        "CURRENT_USER": TokenType.CURRENT_USER,
268        "DATABASE": TokenType.DATABASE,
269        "DEFAULT": TokenType.DEFAULT,
270        "DELETE": TokenType.DELETE,
271        "DESC": TokenType.DESC,
272        "DESCRIBE": TokenType.DESCRIBE,
273        "DISTINCT": TokenType.DISTINCT,
274        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
275        "DIV": TokenType.DIV,
276        "DROP": TokenType.DROP,
277        "ELSE": TokenType.ELSE,
278        "END": TokenType.END,
279        "ESCAPE": TokenType.ESCAPE,
280        "EXCEPT": TokenType.EXCEPT,
281        "EXECUTE": TokenType.EXECUTE,
282        "EXISTS": TokenType.EXISTS,
283        "FALSE": TokenType.FALSE,
284        "FETCH": TokenType.FETCH,
285        "FILTER": TokenType.FILTER,
286        "FIRST": TokenType.FIRST,
287        "FULL": TokenType.FULL,
288        "FUNCTION": TokenType.FUNCTION,
289        "FOR": TokenType.FOR,
290        "FOREIGN KEY": TokenType.FOREIGN_KEY,
291        "FORMAT": TokenType.FORMAT,
292        "FROM": TokenType.FROM,
293        "GEOGRAPHY": TokenType.GEOGRAPHY,
294        "GEOMETRY": TokenType.GEOMETRY,
295        "GLOB": TokenType.GLOB,
296        "GROUP BY": TokenType.GROUP_BY,
297        "GROUPING SETS": TokenType.GROUPING_SETS,
298        "HAVING": TokenType.HAVING,
299        "ILIKE": TokenType.ILIKE,
300        "IN": TokenType.IN,
301        "INDEX": TokenType.INDEX,
302        "INET": TokenType.INET,
303        "INNER": TokenType.INNER,
304        "INSERT": TokenType.INSERT,
305        "INTERVAL": TokenType.INTERVAL,
306        "INTERSECT": TokenType.INTERSECT,
307        "INTO": TokenType.INTO,
308        "IS": TokenType.IS,
309        "ISNULL": TokenType.ISNULL,
310        "JOIN": TokenType.JOIN,
311        "KEEP": TokenType.KEEP,
312        "KILL": TokenType.KILL,
313        "LATERAL": TokenType.LATERAL,
314        "LEFT": TokenType.LEFT,
315        "LIKE": TokenType.LIKE,
316        "LIMIT": TokenType.LIMIT,
317        "LOAD": TokenType.LOAD,
318        "LOCK": TokenType.LOCK,
319        "MERGE": TokenType.MERGE,
320        "NATURAL": TokenType.NATURAL,
321        "NEXT": TokenType.NEXT,
322        "NOT": TokenType.NOT,
323        "NOTNULL": TokenType.NOTNULL,
324        "NULL": TokenType.NULL,
325        "OBJECT": TokenType.OBJECT,
326        "OFFSET": TokenType.OFFSET,
327        "ON": TokenType.ON,
328        "OR": TokenType.OR,
329        "XOR": TokenType.XOR,
330        "ORDER BY": TokenType.ORDER_BY,
331        "ORDINALITY": TokenType.ORDINALITY,
332        "OUTER": TokenType.OUTER,
333        "OVER": TokenType.OVER,
334        "OVERLAPS": TokenType.OVERLAPS,
335        "OVERWRITE": TokenType.OVERWRITE,
336        "PARTITION": TokenType.PARTITION,
337        "PARTITION BY": TokenType.PARTITION_BY,
338        "PARTITIONED BY": TokenType.PARTITION_BY,
339        "PARTITIONED_BY": TokenType.PARTITION_BY,
340        "PERCENT": TokenType.PERCENT,
341        "PIVOT": TokenType.PIVOT,
342        "PRAGMA": TokenType.PRAGMA,
343        "PRIMARY KEY": TokenType.PRIMARY_KEY,
344        "PROCEDURE": TokenType.PROCEDURE,
345        "QUALIFY": TokenType.QUALIFY,
346        "RANGE": TokenType.RANGE,
347        "RECURSIVE": TokenType.RECURSIVE,
348        "REGEXP": TokenType.RLIKE,
349        "REPLACE": TokenType.REPLACE,
350        "RETURNING": TokenType.RETURNING,
351        "REFERENCES": TokenType.REFERENCES,
352        "RIGHT": TokenType.RIGHT,
353        "RLIKE": TokenType.RLIKE,
354        "ROLLBACK": TokenType.ROLLBACK,
355        "ROLLUP": TokenType.ROLLUP,
356        "ROW": TokenType.ROW,
357        "ROWS": TokenType.ROWS,
358        "SCHEMA": TokenType.SCHEMA,
359        "SELECT": TokenType.SELECT,
360        "SEMI": TokenType.SEMI,
361        "SET": TokenType.SET,
362        "SETTINGS": TokenType.SETTINGS,
363        "SHOW": TokenType.SHOW,
364        "SIMILAR TO": TokenType.SIMILAR_TO,
365        "SOME": TokenType.SOME,
366        "SORT BY": TokenType.SORT_BY,
367        "START WITH": TokenType.START_WITH,
368        "TABLE": TokenType.TABLE,
369        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
370        "TEMP": TokenType.TEMPORARY,
371        "TEMPORARY": TokenType.TEMPORARY,
372        "THEN": TokenType.THEN,
373        "TRUE": TokenType.TRUE,
374        "UNION": TokenType.UNION,
375        "UNKNOWN": TokenType.UNKNOWN,
376        "UNNEST": TokenType.UNNEST,
377        "UNPIVOT": TokenType.UNPIVOT,
378        "UPDATE": TokenType.UPDATE,
379        "USE": TokenType.USE,
380        "USING": TokenType.USING,
381        "UUID": TokenType.UUID,
382        "VALUES": TokenType.VALUES,
383        "VIEW": TokenType.VIEW,
384        "VOLATILE": TokenType.VOLATILE,
385        "WHEN": TokenType.WHEN,
386        "WHERE": TokenType.WHERE,
387        "WINDOW": TokenType.WINDOW,
388        "WITH": TokenType.WITH,
389        "APPLY": TokenType.APPLY,
390        "ARRAY": TokenType.ARRAY,
391        "BIT": TokenType.BIT,
392        "BOOL": TokenType.BOOLEAN,
393        "BOOLEAN": TokenType.BOOLEAN,
394        "BYTE": TokenType.TINYINT,
395        "MEDIUMINT": TokenType.MEDIUMINT,
396        "INT1": TokenType.TINYINT,
397        "TINYINT": TokenType.TINYINT,
398        "INT16": TokenType.SMALLINT,
399        "SHORT": TokenType.SMALLINT,
400        "SMALLINT": TokenType.SMALLINT,
401        "INT128": TokenType.INT128,
402        "HUGEINT": TokenType.INT128,
403        "INT2": TokenType.SMALLINT,
404        "INTEGER": TokenType.INT,
405        "INT": TokenType.INT,
406        "INT4": TokenType.INT,
407        "INT32": TokenType.INT,
408        "INT64": TokenType.BIGINT,
409        "LONG": TokenType.BIGINT,
410        "BIGINT": TokenType.BIGINT,
411        "INT8": TokenType.TINYINT,
412        "DEC": TokenType.DECIMAL,
413        "DECIMAL": TokenType.DECIMAL,
414        "BIGDECIMAL": TokenType.BIGDECIMAL,
415        "BIGNUMERIC": TokenType.BIGDECIMAL,
416        "MAP": TokenType.MAP,
417        "NULLABLE": TokenType.NULLABLE,
418        "NUMBER": TokenType.DECIMAL,
419        "NUMERIC": TokenType.DECIMAL,
420        "FIXED": TokenType.DECIMAL,
421        "REAL": TokenType.FLOAT,
422        "FLOAT": TokenType.FLOAT,
423        "FLOAT4": TokenType.FLOAT,
424        "FLOAT8": TokenType.DOUBLE,
425        "DOUBLE": TokenType.DOUBLE,
426        "DOUBLE PRECISION": TokenType.DOUBLE,
427        "JSON": TokenType.JSON,
428        "CHAR": TokenType.CHAR,
429        "CHARACTER": TokenType.CHAR,
430        "NCHAR": TokenType.NCHAR,
431        "VARCHAR": TokenType.VARCHAR,
432        "VARCHAR2": TokenType.VARCHAR,
433        "NVARCHAR": TokenType.NVARCHAR,
434        "NVARCHAR2": TokenType.NVARCHAR,
435        "STR": TokenType.TEXT,
436        "STRING": TokenType.TEXT,
437        "TEXT": TokenType.TEXT,
438        "LONGTEXT": TokenType.LONGTEXT,
439        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
440        "TINYTEXT": TokenType.TINYTEXT,
441        "CLOB": TokenType.TEXT,
442        "LONGVARCHAR": TokenType.TEXT,
443        "BINARY": TokenType.BINARY,
444        "BLOB": TokenType.VARBINARY,
445        "LONGBLOB": TokenType.LONGBLOB,
446        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
447        "TINYBLOB": TokenType.TINYBLOB,
448        "BYTEA": TokenType.VARBINARY,
449        "VARBINARY": TokenType.VARBINARY,
450        "TIME": TokenType.TIME,
451        "TIMETZ": TokenType.TIMETZ,
452        "TIMESTAMP": TokenType.TIMESTAMP,
453        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
454        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
455        "DATE": TokenType.DATE,
456        "DATETIME": TokenType.DATETIME,
457        "INT4RANGE": TokenType.INT4RANGE,
458        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
459        "INT8RANGE": TokenType.INT8RANGE,
460        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
461        "NUMRANGE": TokenType.NUMRANGE,
462        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
463        "TSRANGE": TokenType.TSRANGE,
464        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
465        "TSTZRANGE": TokenType.TSTZRANGE,
466        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
467        "DATERANGE": TokenType.DATERANGE,
468        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
469        "UNIQUE": TokenType.UNIQUE,
470        "STRUCT": TokenType.STRUCT,
471        "VARIANT": TokenType.VARIANT,
472        "ALTER": TokenType.ALTER,
473        "ANALYZE": TokenType.COMMAND,
474        "CALL": TokenType.COMMAND,
475        "COMMENT": TokenType.COMMENT,
476        "COPY": TokenType.COMMAND,
477        "EXPLAIN": TokenType.COMMAND,
478        "GRANT": TokenType.COMMAND,
479        "OPTIMIZE": TokenType.COMMAND,
480        "PREPARE": TokenType.COMMAND,
481        "TRUNCATE": TokenType.COMMAND,
482        "VACUUM": TokenType.COMMAND,
483        "USER-DEFINED": TokenType.USERDEFINED,
484        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
485        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
486    }
487
488    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
489        " ": TokenType.SPACE,
490        "\t": TokenType.SPACE,
491        "\n": TokenType.BREAK,
492        "\r": TokenType.BREAK,
493    }
494
495    COMMANDS = {
496        TokenType.COMMAND,
497        TokenType.EXECUTE,
498        TokenType.FETCH,
499        TokenType.SHOW,
500    }
501
502    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
503
504    # handle numeric literals like in hive (3L = BIGINT)
505    NUMERIC_LITERALS: t.Dict[str, str] = {}
506
507    COMMENTS = ["--", ("/*", "*/")]
508
509    __slots__ = (
510        "sql",
511        "size",
512        "tokens",
513        "dialect",
514        "_start",
515        "_current",
516        "_line",
517        "_col",
518        "_comments",
519        "_char",
520        "_end",
521        "_peek",
522        "_prev_token_line",
523        "_rs_dialect_settings",
524    )
525
526    def __init__(self, dialect: DialectType = None) -> None:
527        from sqlglot.dialects import Dialect
528
529        self.dialect = Dialect.get_or_raise(dialect)
530
531        if USE_RS_TOKENIZER:
532            self._rs_dialect_settings = RsTokenizerDialectSettings(
533                escape_sequences=self.dialect.ESCAPE_SEQUENCES,
534                identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
535            )
536
537        self.reset()
538
539    def reset(self) -> None:
540        self.sql = ""
541        self.size = 0
542        self.tokens: t.List[Token] = []
543        self._start = 0
544        self._current = 0
545        self._line = 1
546        self._col = 0
547        self._comments: t.List[str] = []
548
549        self._char = ""
550        self._end = False
551        self._peek = ""
552        self._prev_token_line = -1
553
554    def tokenize(self, sql: str) -> t.List[Token]:
555        """Returns a list of tokens corresponding to the SQL string `sql`."""
556        if USE_RS_TOKENIZER:
557            return self.tokenize_rs(sql)
558
559        self.reset()
560        self.sql = sql
561        self.size = len(sql)
562
563        try:
564            self._scan()
565        except Exception as e:
566            start = max(self._current - 50, 0)
567            end = min(self._current + 50, self.size - 1)
568            context = self.sql[start:end]
569            raise TokenError(f"Error tokenizing '{context}'") from e
570
571        return self.tokens
572
573    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
574        while self.size and not self._end:
575            current = self._current
576
577            # skip spaces inline rather than iteratively call advance()
578            # for performance reasons
579            while current < self.size:
580                char = self.sql[current]
581
582                if char.isspace() and (char == " " or char == "\t"):
583                    current += 1
584                else:
585                    break
586
587            n = current - self._current
588            self._start = current
589            self._advance(n if n > 1 else 1)
590
591            if self._char is None:
592                break
593
594            if not self._char.isspace():
595                if self._char.isdigit():
596                    self._scan_number()
597                elif self._char in self._IDENTIFIERS:
598                    self._scan_identifier(self._IDENTIFIERS[self._char])
599                else:
600                    self._scan_keywords()
601
602            if until and until():
603                break
604
605        if self.tokens and self._comments:
606            self.tokens[-1].comments.extend(self._comments)
607
608    def _chars(self, size: int) -> str:
609        if size == 1:
610            return self._char
611
612        start = self._current - 1
613        end = start + size
614
615        return self.sql[start:end] if end <= self.size else ""
616
617    def _advance(self, i: int = 1, alnum: bool = False) -> None:
618        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
619            # Ensures we don't count an extra line if we get a \r\n line break sequence
620            if self._char == "\r" and self._peek == "\n":
621                i = 2
622                self._start += 1
623
624            self._col = 1
625            self._line += 1
626        else:
627            self._col += i
628
629        self._current += i
630        self._end = self._current >= self.size
631        self._char = self.sql[self._current - 1]
632        self._peek = "" if self._end else self.sql[self._current]
633
634        if alnum and self._char.isalnum():
635            # Here we use local variables instead of attributes for better performance
636            _col = self._col
637            _current = self._current
638            _end = self._end
639            _peek = self._peek
640
641            while _peek.isalnum():
642                _col += 1
643                _current += 1
644                _end = _current >= self.size
645                _peek = "" if _end else self.sql[_current]
646
647            self._col = _col
648            self._current = _current
649            self._end = _end
650            self._peek = _peek
651            self._char = self.sql[_current - 1]
652
653    @property
654    def _text(self) -> str:
655        return self.sql[self._start : self._current]
656
657    def peek(self, i: int = 0) -> str:
658        i = self._current + i
659        if i < self.size:
660            return self.sql[i]
661        return ""
662
663    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
664        self._prev_token_line = self._line
665
666        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
667            self.tokens[-1].comments.extend(self._comments)
668            self._comments = []
669
670        self.tokens.append(
671            Token(
672                token_type,
673                text=self._text if text is None else text,
674                line=self._line,
675                col=self._col,
676                start=self._start,
677                end=self._current - 1,
678                comments=self._comments,
679            )
680        )
681        self._comments = []
682
683        # If we have either a semicolon or a begin token before the command's token, we'll parse
684        # whatever follows the command's token as a string
685        if (
686            token_type in self.COMMANDS
687            and self._peek != ";"
688            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
689        ):
690            start = self._current
691            tokens = len(self.tokens)
692            self._scan(lambda: self._peek == ";")
693            self.tokens = self.tokens[:tokens]
694            text = self.sql[start : self._current].strip()
695            if text:
696                self._add(TokenType.STRING, text)
697
698    def _scan_keywords(self) -> None:
699        size = 0
700        word = None
701        chars = self._text
702        char = chars
703        prev_space = False
704        skip = False
705        trie = self._KEYWORD_TRIE
706        single_token = char in self.SINGLE_TOKENS
707
708        while chars:
709            if skip:
710                result = TrieResult.PREFIX
711            else:
712                result, trie = in_trie(trie, char.upper())
713
714            if result == TrieResult.FAILED:
715                break
716            if result == TrieResult.EXISTS:
717                word = chars
718
719            end = self._current + size
720            size += 1
721
722            if end < self.size:
723                char = self.sql[end]
724                single_token = single_token or char in self.SINGLE_TOKENS
725                is_space = char.isspace()
726
727                if not is_space or not prev_space:
728                    if is_space:
729                        char = " "
730                    chars += char
731                    prev_space = is_space
732                    skip = False
733                else:
734                    skip = True
735            else:
736                char = ""
737                break
738
739        if word:
740            if self._scan_string(word):
741                return
742            if self._scan_comment(word):
743                return
744            if prev_space or single_token or not char:
745                self._advance(size - 1)
746                word = word.upper()
747                self._add(self.KEYWORDS[word], text=word)
748                return
749
750        if self._char in self.SINGLE_TOKENS:
751            self._add(self.SINGLE_TOKENS[self._char], text=self._char)
752            return
753
754        self._scan_var()
755
756    def _scan_comment(self, comment_start: str) -> bool:
757        if comment_start not in self._COMMENTS:
758            return False
759
760        comment_start_line = self._line
761        comment_start_size = len(comment_start)
762        comment_end = self._COMMENTS[comment_start]
763
764        if comment_end:
765            # Skip the comment's start delimiter
766            self._advance(comment_start_size)
767
768            comment_end_size = len(comment_end)
769            while not self._end and self._chars(comment_end_size) != comment_end:
770                self._advance(alnum=True)
771
772            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
773            self._advance(comment_end_size - 1)
774        else:
775            while not self._end and not self.WHITE_SPACE.get(self._peek) is TokenType.BREAK:
776                self._advance(alnum=True)
777            self._comments.append(self._text[comment_start_size:])
778
779        # Leading comment is attached to the succeeding token, whilst trailing comment to the preceding.
780        # Multiple consecutive comments are preserved by appending them to the current comments list.
781        if comment_start_line == self._prev_token_line:
782            self.tokens[-1].comments.extend(self._comments)
783            self._comments = []
784            self._prev_token_line = self._line
785
786        return True
787
788    def _scan_number(self) -> None:
789        if self._char == "0":
790            peek = self._peek.upper()
791            if peek == "B":
792                return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
793            elif peek == "X":
794                return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)
795
796        decimal = False
797        scientific = 0
798
799        while True:
800            if self._peek.isdigit():
801                self._advance()
802            elif self._peek == "." and not decimal:
803                after = self.peek(1)
804                if after.isdigit() or not after.isalpha():
805                    decimal = True
806                    self._advance()
807                else:
808                    return self._add(TokenType.VAR)
809            elif self._peek in ("-", "+") and scientific == 1:
810                scientific += 1
811                self._advance()
812            elif self._peek.upper() == "E" and not scientific:
813                scientific += 1
814                self._advance()
815            elif self._peek.isidentifier():
816                number_text = self._text
817                literal = ""
818
819                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
820                    literal += self._peek
821                    self._advance()
822
823                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal.upper(), ""))
824
825                if token_type:
826                    self._add(TokenType.NUMBER, number_text)
827                    self._add(TokenType.DCOLON, "::")
828                    return self._add(token_type, literal)
829                elif self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
830                    return self._add(TokenType.VAR)
831
832                self._advance(-len(literal))
833                return self._add(TokenType.NUMBER, number_text)
834            else:
835                return self._add(TokenType.NUMBER)
836
837    def _scan_bits(self) -> None:
838        self._advance()
839        value = self._extract_value()
840        try:
841            # If `value` can't be converted to a binary, fallback to tokenizing it as an identifier
842            int(value, 2)
843            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
844        except ValueError:
845            self._add(TokenType.IDENTIFIER)
846
847    def _scan_hex(self) -> None:
848        self._advance()
849        value = self._extract_value()
850        try:
851            # If `value` can't be converted to a hex, fallback to tokenizing it as an identifier
852            int(value, 16)
853            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
854        except ValueError:
855            self._add(TokenType.IDENTIFIER)
856
857    def _extract_value(self) -> str:
858        while True:
859            char = self._peek.strip()
860            if char and char not in self.SINGLE_TOKENS:
861                self._advance(alnum=True)
862            else:
863                break
864
865        return self._text
866
867    def _scan_string(self, start: str) -> bool:
868        base = None
869        token_type = TokenType.STRING
870
871        if start in self._QUOTES:
872            end = self._QUOTES[start]
873        elif start in self._FORMAT_STRINGS:
874            end, token_type = self._FORMAT_STRINGS[start]
875
876            if token_type == TokenType.HEX_STRING:
877                base = 16
878            elif token_type == TokenType.BIT_STRING:
879                base = 2
880            elif token_type == TokenType.HEREDOC_STRING:
881                self._advance()
882                tag = "" if self._char == end else self._extract_string(end)
883                end = f"{start}{tag}{end}"
884        else:
885            return False
886
887        self._advance(len(start))
888        text = self._extract_string(end)
889
890        if base:
891            try:
892                int(text, base)
893            except:
894                raise TokenError(
895                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
896                )
897
898        self._add(token_type, text)
899        return True
900
901    def _scan_identifier(self, identifier_end: str) -> None:
902        self._advance()
903        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
904        self._add(TokenType.IDENTIFIER, text)
905
906    def _scan_var(self) -> None:
907        while True:
908            char = self._peek.strip()
909            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
910                self._advance(alnum=True)
911            else:
912                break
913
914        self._add(
915            TokenType.VAR
916            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
917            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
918        )
919
920    def _extract_string(self, delimiter: str, escapes=None) -> str:
921        text = ""
922        delim_size = len(delimiter)
923        escapes = self._STRING_ESCAPES if escapes is None else escapes
924
925        while True:
926            if (
927                self._char in escapes
928                and (self._peek == delimiter or self._peek in escapes)
929                and (self._char not in self._QUOTES or self._char == self._peek)
930            ):
931                if self._peek == delimiter:
932                    text += self._peek
933                else:
934                    text += self._char + self._peek
935
936                if self._current + 1 < self.size:
937                    self._advance(2)
938                else:
939                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
940            else:
941                if self._chars(delim_size) == delimiter:
942                    if delim_size > 1:
943                        self._advance(delim_size - 1)
944                    break
945
946                if self._end:
947                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
948
949                if (
950                    self.dialect.ESCAPE_SEQUENCES
951                    and self._peek
952                    and self._char in self.STRING_ESCAPES
953                ):
954                    escaped_sequence = self.dialect.ESCAPE_SEQUENCES.get(self._char + self._peek)
955                    if escaped_sequence:
956                        self._advance(2)
957                        text += escaped_sequence
958                        continue
959
960                current = self._current - 1
961                self._advance(alnum=True)
962                text += self.sql[current : self._current - 1]
963
964        return text
965
966    def tokenize_rs(self, sql: str) -> t.List[Token]:
967        if not self._RS_TOKENIZER:
968            raise SqlglotError("Rust tokenizer is not available")
969
970        try:
971            return [
972                Token(
973                    token_type=_ALL_TOKEN_TYPES[token.token_type.index],
974                    text=token.text,
975                    line=token.line,
976                    col=token.col,
977                    start=token.start,
978                    end=token.end,
979                    comments=token.comments,
980                )
981                for token in self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
982            ]
983        except Exception as e:
984            raise TokenError(str(e))
985
986
987_ALL_TOKEN_TYPES = list(TokenType)
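
As the try/except at the top of the module shows, the Rust tokenizer from sqlglotrs is used whenever it can be imported and the SQLGLOTRS_TOKENIZER environment variable does not disable it. A minimal sketch of forcing the pure-Python tokenizer instead; the variable has to be set before sqlglot.tokens is imported for the first time, because USE_RS_TOKENIZER is evaluated at import time:

    import os

    # USE_RS_TOKENIZER is computed when sqlglot.tokens is first imported,
    # so the environment variable must be set before that import happens.
    os.environ["SQLGLOTRS_TOKENIZER"] = "0"

    from sqlglot import tokens

    print(tokens.USE_RS_TOKENIZER)  # False -> the pure-Python Tokenizer is used
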
class Token:
Token( token_type: sqlglot.token_type.TokenType, text: str, line: int = 1, col: int = 1, start: int = 0, end: int = 0, comments: Optional[List[str]] = None)

Token initializer.

Arguments:
  • token_type: The TokenType Enum.
  • text: The text of the token.
  • line: The line that the token ends on.
  • col: The column that the token ends on.
  • start: The start index of the token.
  • end: The ending index of the token.
  • comments: The comments to attach to the token.
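
A minimal sketch of constructing a Token directly; the values below are illustrative rather than produced by a real tokenizer run:

    from sqlglot.tokens import Token
    from sqlglot.token_type import TokenType

    # "SELECT" spans indices 0..5 and ends at column 6 of line 1
    token = Token(TokenType.SELECT, "SELECT", line=1, col=6, start=0, end=5)
    print(token.token_type, token.text)  # TokenType.SELECT SELECT
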
@classmethod
def number(cls, number: int) -> Token:

Returns a NUMBER token with number as its text.

@classmethod
def string(cls, string: str) -> Token:

Returns a STRING token with string as its text.

@classmethod
def identifier(cls, identifier: str) -> Token:

Returns an IDENTIFIER token with identifier as its text.

@classmethod
def var(cls, var: str) -> Token:

Returns a VAR token with var as its text.
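
A short sketch of the factory classmethods above; each returns a Token whose positional metadata is left at the defaults (line 1, col 1):

    from sqlglot.tokens import Token

    num = Token.number(42)           # NUMBER token with text "42"
    text = Token.string("hello")     # STRING token
    ident = Token.identifier("col")  # IDENTIFIER token
    var = Token.var("x")             # VAR token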

token_type
text
line
col
start
end
comments
class Tokenizer:
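
A minimal usage sketch, assuming the default dialect; tokenize() returns the list of Token objects for the given SQL string:

    from sqlglot.tokens import Tokenizer

    tokenizer = Tokenizer()
    for token in tokenizer.tokenize("SELECT a FROM t"):
        print(token.token_type, token.text)
    # TokenType.SELECT SELECT
    # TokenType.VAR a
    # TokenType.FROM FROM
    # TokenType.VAR t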
160class Tokenizer(metaclass=_Tokenizer):
161    SINGLE_TOKENS = {
162        "(": TokenType.L_PAREN,
163        ")": TokenType.R_PAREN,
164        "[": TokenType.L_BRACKET,
165        "]": TokenType.R_BRACKET,
166        "{": TokenType.L_BRACE,
167        "}": TokenType.R_BRACE,
168        "&": TokenType.AMP,
169        "^": TokenType.CARET,
170        ":": TokenType.COLON,
171        ",": TokenType.COMMA,
172        ".": TokenType.DOT,
173        "-": TokenType.DASH,
174        "=": TokenType.EQ,
175        ">": TokenType.GT,
176        "<": TokenType.LT,
177        "%": TokenType.MOD,
178        "!": TokenType.NOT,
179        "|": TokenType.PIPE,
180        "+": TokenType.PLUS,
181        ";": TokenType.SEMICOLON,
182        "/": TokenType.SLASH,
183        "\\": TokenType.BACKSLASH,
184        "*": TokenType.STAR,
185        "~": TokenType.TILDA,
186        "?": TokenType.PLACEHOLDER,
187        "@": TokenType.PARAMETER,
188        # used for breaking a var like x'y' but nothing else
189        # the token type doesn't matter
190        "'": TokenType.QUOTE,
191        "`": TokenType.IDENTIFIER,
192        '"': TokenType.IDENTIFIER,
193        "#": TokenType.HASH,
194    }
195
196    BIT_STRINGS: t.List[str | t.Tuple[str, str]] = []
197    BYTE_STRINGS: t.List[str | t.Tuple[str, str]] = []
198    HEX_STRINGS: t.List[str | t.Tuple[str, str]] = []
199    RAW_STRINGS: t.List[str | t.Tuple[str, str]] = []
200    HEREDOC_STRINGS: t.List[str | t.Tuple[str, str]] = []
201    IDENTIFIERS: t.List[str | t.Tuple[str, str]] = ['"']
202    IDENTIFIER_ESCAPES = ['"']
203    QUOTES: t.List[t.Tuple[str, str] | str] = ["'"]
204    STRING_ESCAPES = ["'"]
205    VAR_SINGLE_TOKENS: t.Set[str] = set()
206
207    # Autofilled
208    _COMMENTS: t.Dict[str, str] = {}
209    _FORMAT_STRINGS: t.Dict[str, t.Tuple[str, TokenType]] = {}
210    _IDENTIFIERS: t.Dict[str, str] = {}
211    _IDENTIFIER_ESCAPES: t.Set[str] = set()
212    _QUOTES: t.Dict[str, str] = {}
213    _STRING_ESCAPES: t.Set[str] = set()
214    _KEYWORD_TRIE: t.Dict = {}
215    _RS_TOKENIZER: t.Optional[t.Any] = None
216
217    KEYWORDS: t.Dict[str, TokenType] = {
218        **{f"{{%{postfix}": TokenType.BLOCK_START for postfix in ("", "+", "-")},
219        **{f"{prefix}%}}": TokenType.BLOCK_END for prefix in ("", "+", "-")},
220        **{f"{{{{{postfix}": TokenType.BLOCK_START for postfix in ("+", "-")},
221        **{f"{prefix}}}}}": TokenType.BLOCK_END for prefix in ("+", "-")},
222        "/*+": TokenType.HINT,
223        "==": TokenType.EQ,
224        "::": TokenType.DCOLON,
225        "||": TokenType.DPIPE,
226        ">=": TokenType.GTE,
227        "<=": TokenType.LTE,
228        "<>": TokenType.NEQ,
229        "!=": TokenType.NEQ,
230        ":=": TokenType.COLON_EQ,
231        "<=>": TokenType.NULLSAFE_EQ,
232        "->": TokenType.ARROW,
233        "->>": TokenType.DARROW,
234        "=>": TokenType.FARROW,
235        "#>": TokenType.HASH_ARROW,
236        "#>>": TokenType.DHASH_ARROW,
237        "<->": TokenType.LR_ARROW,
238        "&&": TokenType.DAMP,
239        "??": TokenType.DQMARK,
240        "ALL": TokenType.ALL,
241        "ALWAYS": TokenType.ALWAYS,
242        "AND": TokenType.AND,
243        "ANTI": TokenType.ANTI,
244        "ANY": TokenType.ANY,
245        "ASC": TokenType.ASC,
246        "AS": TokenType.ALIAS,
247        "ASOF": TokenType.ASOF,
248        "AUTOINCREMENT": TokenType.AUTO_INCREMENT,
249        "AUTO_INCREMENT": TokenType.AUTO_INCREMENT,
250        "BEGIN": TokenType.BEGIN,
251        "BETWEEN": TokenType.BETWEEN,
252        "CACHE": TokenType.CACHE,
253        "UNCACHE": TokenType.UNCACHE,
254        "CASE": TokenType.CASE,
255        "CHARACTER SET": TokenType.CHARACTER_SET,
256        "CLUSTER BY": TokenType.CLUSTER_BY,
257        "COLLATE": TokenType.COLLATE,
258        "COLUMN": TokenType.COLUMN,
259        "COMMIT": TokenType.COMMIT,
260        "CONNECT BY": TokenType.CONNECT_BY,
261        "CONSTRAINT": TokenType.CONSTRAINT,
262        "CREATE": TokenType.CREATE,
263        "CROSS": TokenType.CROSS,
264        "CUBE": TokenType.CUBE,
265        "CURRENT_DATE": TokenType.CURRENT_DATE,
266        "CURRENT_TIME": TokenType.CURRENT_TIME,
267        "CURRENT_TIMESTAMP": TokenType.CURRENT_TIMESTAMP,
268        "CURRENT_USER": TokenType.CURRENT_USER,
269        "DATABASE": TokenType.DATABASE,
270        "DEFAULT": TokenType.DEFAULT,
271        "DELETE": TokenType.DELETE,
272        "DESC": TokenType.DESC,
273        "DESCRIBE": TokenType.DESCRIBE,
274        "DISTINCT": TokenType.DISTINCT,
275        "DISTRIBUTE BY": TokenType.DISTRIBUTE_BY,
276        "DIV": TokenType.DIV,
277        "DROP": TokenType.DROP,
278        "ELSE": TokenType.ELSE,
279        "END": TokenType.END,
280        "ESCAPE": TokenType.ESCAPE,
281        "EXCEPT": TokenType.EXCEPT,
282        "EXECUTE": TokenType.EXECUTE,
283        "EXISTS": TokenType.EXISTS,
284        "FALSE": TokenType.FALSE,
285        "FETCH": TokenType.FETCH,
286        "FILTER": TokenType.FILTER,
287        "FIRST": TokenType.FIRST,
288        "FULL": TokenType.FULL,
289        "FUNCTION": TokenType.FUNCTION,
290        "FOR": TokenType.FOR,
291        "FOREIGN KEY": TokenType.FOREIGN_KEY,
292        "FORMAT": TokenType.FORMAT,
293        "FROM": TokenType.FROM,
294        "GEOGRAPHY": TokenType.GEOGRAPHY,
295        "GEOMETRY": TokenType.GEOMETRY,
296        "GLOB": TokenType.GLOB,
297        "GROUP BY": TokenType.GROUP_BY,
298        "GROUPING SETS": TokenType.GROUPING_SETS,
299        "HAVING": TokenType.HAVING,
300        "ILIKE": TokenType.ILIKE,
301        "IN": TokenType.IN,
302        "INDEX": TokenType.INDEX,
303        "INET": TokenType.INET,
304        "INNER": TokenType.INNER,
305        "INSERT": TokenType.INSERT,
306        "INTERVAL": TokenType.INTERVAL,
307        "INTERSECT": TokenType.INTERSECT,
308        "INTO": TokenType.INTO,
309        "IS": TokenType.IS,
310        "ISNULL": TokenType.ISNULL,
311        "JOIN": TokenType.JOIN,
312        "KEEP": TokenType.KEEP,
313        "KILL": TokenType.KILL,
314        "LATERAL": TokenType.LATERAL,
315        "LEFT": TokenType.LEFT,
316        "LIKE": TokenType.LIKE,
317        "LIMIT": TokenType.LIMIT,
318        "LOAD": TokenType.LOAD,
319        "LOCK": TokenType.LOCK,
320        "MERGE": TokenType.MERGE,
321        "NATURAL": TokenType.NATURAL,
322        "NEXT": TokenType.NEXT,
323        "NOT": TokenType.NOT,
324        "NOTNULL": TokenType.NOTNULL,
325        "NULL": TokenType.NULL,
326        "OBJECT": TokenType.OBJECT,
327        "OFFSET": TokenType.OFFSET,
328        "ON": TokenType.ON,
329        "OR": TokenType.OR,
330        "XOR": TokenType.XOR,
331        "ORDER BY": TokenType.ORDER_BY,
332        "ORDINALITY": TokenType.ORDINALITY,
333        "OUTER": TokenType.OUTER,
334        "OVER": TokenType.OVER,
335        "OVERLAPS": TokenType.OVERLAPS,
336        "OVERWRITE": TokenType.OVERWRITE,
337        "PARTITION": TokenType.PARTITION,
338        "PARTITION BY": TokenType.PARTITION_BY,
339        "PARTITIONED BY": TokenType.PARTITION_BY,
340        "PARTITIONED_BY": TokenType.PARTITION_BY,
341        "PERCENT": TokenType.PERCENT,
342        "PIVOT": TokenType.PIVOT,
343        "PRAGMA": TokenType.PRAGMA,
344        "PRIMARY KEY": TokenType.PRIMARY_KEY,
345        "PROCEDURE": TokenType.PROCEDURE,
346        "QUALIFY": TokenType.QUALIFY,
347        "RANGE": TokenType.RANGE,
348        "RECURSIVE": TokenType.RECURSIVE,
349        "REGEXP": TokenType.RLIKE,
350        "REPLACE": TokenType.REPLACE,
351        "RETURNING": TokenType.RETURNING,
352        "REFERENCES": TokenType.REFERENCES,
353        "RIGHT": TokenType.RIGHT,
354        "RLIKE": TokenType.RLIKE,
355        "ROLLBACK": TokenType.ROLLBACK,
356        "ROLLUP": TokenType.ROLLUP,
357        "ROW": TokenType.ROW,
358        "ROWS": TokenType.ROWS,
359        "SCHEMA": TokenType.SCHEMA,
360        "SELECT": TokenType.SELECT,
361        "SEMI": TokenType.SEMI,
362        "SET": TokenType.SET,
363        "SETTINGS": TokenType.SETTINGS,
364        "SHOW": TokenType.SHOW,
365        "SIMILAR TO": TokenType.SIMILAR_TO,
366        "SOME": TokenType.SOME,
367        "SORT BY": TokenType.SORT_BY,
368        "START WITH": TokenType.START_WITH,
369        "TABLE": TokenType.TABLE,
370        "TABLESAMPLE": TokenType.TABLE_SAMPLE,
371        "TEMP": TokenType.TEMPORARY,
372        "TEMPORARY": TokenType.TEMPORARY,
373        "THEN": TokenType.THEN,
374        "TRUE": TokenType.TRUE,
375        "UNION": TokenType.UNION,
376        "UNKNOWN": TokenType.UNKNOWN,
377        "UNNEST": TokenType.UNNEST,
378        "UNPIVOT": TokenType.UNPIVOT,
379        "UPDATE": TokenType.UPDATE,
380        "USE": TokenType.USE,
381        "USING": TokenType.USING,
382        "UUID": TokenType.UUID,
383        "VALUES": TokenType.VALUES,
384        "VIEW": TokenType.VIEW,
385        "VOLATILE": TokenType.VOLATILE,
386        "WHEN": TokenType.WHEN,
387        "WHERE": TokenType.WHERE,
388        "WINDOW": TokenType.WINDOW,
389        "WITH": TokenType.WITH,
390        "APPLY": TokenType.APPLY,
391        "ARRAY": TokenType.ARRAY,
392        "BIT": TokenType.BIT,
393        "BOOL": TokenType.BOOLEAN,
394        "BOOLEAN": TokenType.BOOLEAN,
395        "BYTE": TokenType.TINYINT,
396        "MEDIUMINT": TokenType.MEDIUMINT,
397        "INT1": TokenType.TINYINT,
398        "TINYINT": TokenType.TINYINT,
399        "INT16": TokenType.SMALLINT,
400        "SHORT": TokenType.SMALLINT,
401        "SMALLINT": TokenType.SMALLINT,
402        "INT128": TokenType.INT128,
403        "HUGEINT": TokenType.INT128,
404        "INT2": TokenType.SMALLINT,
405        "INTEGER": TokenType.INT,
406        "INT": TokenType.INT,
407        "INT4": TokenType.INT,
408        "INT32": TokenType.INT,
409        "INT64": TokenType.BIGINT,
410        "LONG": TokenType.BIGINT,
411        "BIGINT": TokenType.BIGINT,
412        "INT8": TokenType.TINYINT,
413        "DEC": TokenType.DECIMAL,
414        "DECIMAL": TokenType.DECIMAL,
415        "BIGDECIMAL": TokenType.BIGDECIMAL,
416        "BIGNUMERIC": TokenType.BIGDECIMAL,
417        "MAP": TokenType.MAP,
418        "NULLABLE": TokenType.NULLABLE,
419        "NUMBER": TokenType.DECIMAL,
420        "NUMERIC": TokenType.DECIMAL,
421        "FIXED": TokenType.DECIMAL,
422        "REAL": TokenType.FLOAT,
423        "FLOAT": TokenType.FLOAT,
424        "FLOAT4": TokenType.FLOAT,
425        "FLOAT8": TokenType.DOUBLE,
426        "DOUBLE": TokenType.DOUBLE,
427        "DOUBLE PRECISION": TokenType.DOUBLE,
428        "JSON": TokenType.JSON,
429        "CHAR": TokenType.CHAR,
430        "CHARACTER": TokenType.CHAR,
431        "NCHAR": TokenType.NCHAR,
432        "VARCHAR": TokenType.VARCHAR,
433        "VARCHAR2": TokenType.VARCHAR,
434        "NVARCHAR": TokenType.NVARCHAR,
435        "NVARCHAR2": TokenType.NVARCHAR,
436        "STR": TokenType.TEXT,
437        "STRING": TokenType.TEXT,
438        "TEXT": TokenType.TEXT,
439        "LONGTEXT": TokenType.LONGTEXT,
440        "MEDIUMTEXT": TokenType.MEDIUMTEXT,
441        "TINYTEXT": TokenType.TINYTEXT,
442        "CLOB": TokenType.TEXT,
443        "LONGVARCHAR": TokenType.TEXT,
444        "BINARY": TokenType.BINARY,
445        "BLOB": TokenType.VARBINARY,
446        "LONGBLOB": TokenType.LONGBLOB,
447        "MEDIUMBLOB": TokenType.MEDIUMBLOB,
448        "TINYBLOB": TokenType.TINYBLOB,
449        "BYTEA": TokenType.VARBINARY,
450        "VARBINARY": TokenType.VARBINARY,
451        "TIME": TokenType.TIME,
452        "TIMETZ": TokenType.TIMETZ,
453        "TIMESTAMP": TokenType.TIMESTAMP,
454        "TIMESTAMPTZ": TokenType.TIMESTAMPTZ,
455        "TIMESTAMPLTZ": TokenType.TIMESTAMPLTZ,
456        "DATE": TokenType.DATE,
457        "DATETIME": TokenType.DATETIME,
458        "INT4RANGE": TokenType.INT4RANGE,
459        "INT4MULTIRANGE": TokenType.INT4MULTIRANGE,
460        "INT8RANGE": TokenType.INT8RANGE,
461        "INT8MULTIRANGE": TokenType.INT8MULTIRANGE,
462        "NUMRANGE": TokenType.NUMRANGE,
463        "NUMMULTIRANGE": TokenType.NUMMULTIRANGE,
464        "TSRANGE": TokenType.TSRANGE,
465        "TSMULTIRANGE": TokenType.TSMULTIRANGE,
466        "TSTZRANGE": TokenType.TSTZRANGE,
467        "TSTZMULTIRANGE": TokenType.TSTZMULTIRANGE,
468        "DATERANGE": TokenType.DATERANGE,
469        "DATEMULTIRANGE": TokenType.DATEMULTIRANGE,
470        "UNIQUE": TokenType.UNIQUE,
471        "STRUCT": TokenType.STRUCT,
472        "VARIANT": TokenType.VARIANT,
473        "ALTER": TokenType.ALTER,
474        "ANALYZE": TokenType.COMMAND,
475        "CALL": TokenType.COMMAND,
476        "COMMENT": TokenType.COMMENT,
477        "COPY": TokenType.COMMAND,
478        "EXPLAIN": TokenType.COMMAND,
479        "GRANT": TokenType.COMMAND,
480        "OPTIMIZE": TokenType.COMMAND,
481        "PREPARE": TokenType.COMMAND,
482        "TRUNCATE": TokenType.COMMAND,
483        "VACUUM": TokenType.COMMAND,
484        "USER-DEFINED": TokenType.USERDEFINED,
485        "FOR VERSION": TokenType.VERSION_SNAPSHOT,
486        "FOR TIMESTAMP": TokenType.TIMESTAMP_SNAPSHOT,
487    }
488
489    WHITE_SPACE: t.Dict[t.Optional[str], TokenType] = {
490        " ": TokenType.SPACE,
491        "\t": TokenType.SPACE,
492        "\n": TokenType.BREAK,
493        "\r": TokenType.BREAK,
494    }
495
496    COMMANDS = {
497        TokenType.COMMAND,
498        TokenType.EXECUTE,
499        TokenType.FETCH,
500        TokenType.SHOW,
501    }
502
503    COMMAND_PREFIX_TOKENS = {TokenType.SEMICOLON, TokenType.BEGIN}
504
505    # handle numeric literals like in hive (3L = BIGINT)
506    NUMERIC_LITERALS: t.Dict[str, str] = {}
507
508    COMMENTS = ["--", ("/*", "*/")]
509
510    __slots__ = (
511        "sql",
512        "size",
513        "tokens",
514        "dialect",
515        "_start",
516        "_current",
517        "_line",
518        "_col",
519        "_comments",
520        "_char",
521        "_end",
522        "_peek",
523        "_prev_token_line",
524        "_rs_dialect_settings",
525    )
526
527    def __init__(self, dialect: DialectType = None) -> None:
528        from sqlglot.dialects import Dialect
529
530        self.dialect = Dialect.get_or_raise(dialect)
531
532        if USE_RS_TOKENIZER:
533            self._rs_dialect_settings = RsTokenizerDialectSettings(
534                escape_sequences=self.dialect.ESCAPE_SEQUENCES,
535                identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
536            )
537
538        self.reset()
539
540    def reset(self) -> None:
541        self.sql = ""
542        self.size = 0
543        self.tokens: t.List[Token] = []
544        self._start = 0
545        self._current = 0
546        self._line = 1
547        self._col = 0
548        self._comments: t.List[str] = []
549
550        self._char = ""
551        self._end = False
552        self._peek = ""
553        self._prev_token_line = -1
554
555    def tokenize(self, sql: str) -> t.List[Token]:
556        """Returns a list of tokens corresponding to the SQL string `sql`."""
557        if USE_RS_TOKENIZER:
558            return self.tokenize_rs(sql)
559
560        self.reset()
561        self.sql = sql
562        self.size = len(sql)
563
564        try:
565            self._scan()
566        except Exception as e:
567            start = max(self._current - 50, 0)
568            end = min(self._current + 50, self.size - 1)
569            context = self.sql[start:end]
570            raise TokenError(f"Error tokenizing '{context}'") from e
571
572        return self.tokens
573
574    def _scan(self, until: t.Optional[t.Callable] = None) -> None:
575        while self.size and not self._end:
576            current = self._current
577
578            # skip spaces inline rather than iteratively calling _advance(),
579            # for performance reasons
580            while current < self.size:
581                char = self.sql[current]
582
583                if char.isspace() and (char == " " or char == "\t"):
584                    current += 1
585                else:
586                    break
587
588            n = current - self._current
589            self._start = current
590            self._advance(n if n > 1 else 1)
591
592            if self._char is None:
593                break
594
595            if not self._char.isspace():
596                if self._char.isdigit():
597                    self._scan_number()
598                elif self._char in self._IDENTIFIERS:
599                    self._scan_identifier(self._IDENTIFIERS[self._char])
600                else:
601                    self._scan_keywords()
602
603            if until and until():
604                break
605
606        if self.tokens and self._comments:
607            self.tokens[-1].comments.extend(self._comments)
608
609    def _chars(self, size: int) -> str:
610        if size == 1:
611            return self._char
612
613        start = self._current - 1
614        end = start + size
615
616        return self.sql[start:end] if end <= self.size else ""
617
618    def _advance(self, i: int = 1, alnum: bool = False) -> None:
619        if self.WHITE_SPACE.get(self._char) is TokenType.BREAK:
620            # Ensures we don't count an extra line if we get a \r\n line break sequence
621            if self._char == "\r" and self._peek == "\n":
622                i = 2
623                self._start += 1
624
625            self._col = 1
626            self._line += 1
627        else:
628            self._col += i
629
630        self._current += i
631        self._end = self._current >= self.size
632        self._char = self.sql[self._current - 1]
633        self._peek = "" if self._end else self.sql[self._current]
634
635        if alnum and self._char.isalnum():
636            # Here we use local variables instead of attributes for better performance
637            _col = self._col
638            _current = self._current
639            _end = self._end
640            _peek = self._peek
641
642            while _peek.isalnum():
643                _col += 1
644                _current += 1
645                _end = _current >= self.size
646                _peek = "" if _end else self.sql[_current]
647
648            self._col = _col
649            self._current = _current
650            self._end = _end
651            self._peek = _peek
652            self._char = self.sql[_current - 1]
653
654    @property
655    def _text(self) -> str:
656        return self.sql[self._start : self._current]
657
658    def peek(self, i: int = 0) -> str:
659        i = self._current + i
660        if i < self.size:
661            return self.sql[i]
662        return ""
663
664    def _add(self, token_type: TokenType, text: t.Optional[str] = None) -> None:
665        self._prev_token_line = self._line
666
667        if self._comments and token_type == TokenType.SEMICOLON and self.tokens:
668            self.tokens[-1].comments.extend(self._comments)
669            self._comments = []
670
671        self.tokens.append(
672            Token(
673                token_type,
674                text=self._text if text is None else text,
675                line=self._line,
676                col=self._col,
677                start=self._start,
678                end=self._current - 1,
679                comments=self._comments,
680            )
681        )
682        self._comments = []
683
684        # If we have either a semicolon or a begin token before the command's token, we'll parse
685        # whatever follows the command's token as a string
686        if (
687            token_type in self.COMMANDS
688            and self._peek != ";"
689            and (len(self.tokens) == 1 or self.tokens[-2].token_type in self.COMMAND_PREFIX_TOKENS)
690        ):
691            start = self._current
692            tokens = len(self.tokens)
693            self._scan(lambda: self._peek == ";")
694            self.tokens = self.tokens[:tokens]
695            text = self.sql[start : self._current].strip()
696            if text:
697                self._add(TokenType.STRING, text)
698
699    def _scan_keywords(self) -> None:
700        size = 0
701        word = None
702        chars = self._text
703        char = chars
704        prev_space = False
705        skip = False
706        trie = self._KEYWORD_TRIE
707        single_token = char in self.SINGLE_TOKENS
708
709        while chars:
710            if skip:
711                result = TrieResult.PREFIX
712            else:
713                result, trie = in_trie(trie, char.upper())
714
715            if result == TrieResult.FAILED:
716                break
717            if result == TrieResult.EXISTS:
718                word = chars
719
720            end = self._current + size
721            size += 1
722
723            if end < self.size:
724                char = self.sql[end]
725                single_token = single_token or char in self.SINGLE_TOKENS
726                is_space = char.isspace()
727
728                if not is_space or not prev_space:
729                    if is_space:
730                        char = " "
731                    chars += char
732                    prev_space = is_space
733                    skip = False
734                else:
735                    skip = True
736            else:
737                char = ""
738                break
739
740        if word:
741            if self._scan_string(word):
742                return
743            if self._scan_comment(word):
744                return
745            if prev_space or single_token or not char:
746                self._advance(size - 1)
747                word = word.upper()
748                self._add(self.KEYWORDS[word], text=word)
749                return
750
751        if self._char in self.SINGLE_TOKENS:
752            self._add(self.SINGLE_TOKENS[self._char], text=self._char)
753            return
754
755        self._scan_var()
756
757    def _scan_comment(self, comment_start: str) -> bool:
758        if comment_start not in self._COMMENTS:
759            return False
760
761        comment_start_line = self._line
762        comment_start_size = len(comment_start)
763        comment_end = self._COMMENTS[comment_start]
764
765        if comment_end:
766            # Skip the comment's start delimiter
767            self._advance(comment_start_size)
768
769            comment_end_size = len(comment_end)
770            while not self._end and self._chars(comment_end_size) != comment_end:
771                self._advance(alnum=True)
772
773            self._comments.append(self._text[comment_start_size : -comment_end_size + 1])
774            self._advance(comment_end_size - 1)
775        else:
776            while not self._end and self.WHITE_SPACE.get(self._peek) is not TokenType.BREAK:
777                self._advance(alnum=True)
778            self._comments.append(self._text[comment_start_size:])
779
780        # A leading comment is attached to the succeeding token, while a trailing comment goes to the preceding one.
781        # Multiple consecutive comments are preserved by appending them to the current comments list.
782        if comment_start_line == self._prev_token_line:
783            self.tokens[-1].comments.extend(self._comments)
784            self._comments = []
785            self._prev_token_line = self._line
786
787        return True
788
789    def _scan_number(self) -> None:
790        if self._char == "0":
791            peek = self._peek.upper()
792            if peek == "B":
793                return self._scan_bits() if self.BIT_STRINGS else self._add(TokenType.NUMBER)
794            elif peek == "X":
795                return self._scan_hex() if self.HEX_STRINGS else self._add(TokenType.NUMBER)
796
797        decimal = False
798        scientific = 0
799
800        while True:
801            if self._peek.isdigit():
802                self._advance()
803            elif self._peek == "." and not decimal:
804                after = self.peek(1)
805                if after.isdigit() or not after.isalpha():
806                    decimal = True
807                    self._advance()
808                else:
809                    return self._add(TokenType.VAR)
810            elif self._peek in ("-", "+") and scientific == 1:
811                scientific += 1
812                self._advance()
813            elif self._peek.upper() == "E" and not scientific:
814                scientific += 1
815                self._advance()
816            elif self._peek.isidentifier():
817                number_text = self._text
818                literal = ""
819
820                while self._peek.strip() and self._peek not in self.SINGLE_TOKENS:
821                    literal += self._peek
822                    self._advance()
823
824                token_type = self.KEYWORDS.get(self.NUMERIC_LITERALS.get(literal.upper(), ""))
825
826                if token_type:
827                    self._add(TokenType.NUMBER, number_text)
828                    self._add(TokenType.DCOLON, "::")
829                    return self._add(token_type, literal)
830                elif self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
831                    return self._add(TokenType.VAR)
832
833                self._advance(-len(literal))
834                return self._add(TokenType.NUMBER, number_text)
835            else:
836                return self._add(TokenType.NUMBER)
837
838    def _scan_bits(self) -> None:
839        self._advance()
840        value = self._extract_value()
841        try:
842            # If `value` can't be parsed as a binary integer, fall back to tokenizing it as an identifier
843            int(value, 2)
844            self._add(TokenType.BIT_STRING, value[2:])  # Drop the 0b
845        except ValueError:
846            self._add(TokenType.IDENTIFIER)
847
848    def _scan_hex(self) -> None:
849        self._advance()
850        value = self._extract_value()
851        try:
852            # If `value` can't be parsed as a hex integer, fall back to tokenizing it as an identifier
853            int(value, 16)
854            self._add(TokenType.HEX_STRING, value[2:])  # Drop the 0x
855        except ValueError:
856            self._add(TokenType.IDENTIFIER)
857
858    def _extract_value(self) -> str:
859        while True:
860            char = self._peek.strip()
861            if char and char not in self.SINGLE_TOKENS:
862                self._advance(alnum=True)
863            else:
864                break
865
866        return self._text
867
868    def _scan_string(self, start: str) -> bool:
869        base = None
870        token_type = TokenType.STRING
871
872        if start in self._QUOTES:
873            end = self._QUOTES[start]
874        elif start in self._FORMAT_STRINGS:
875            end, token_type = self._FORMAT_STRINGS[start]
876
877            if token_type == TokenType.HEX_STRING:
878                base = 16
879            elif token_type == TokenType.BIT_STRING:
880                base = 2
881            elif token_type == TokenType.HEREDOC_STRING:
882                self._advance()
883                tag = "" if self._char == end else self._extract_string(end)
884                end = f"{start}{tag}{end}"
885        else:
886            return False
887
888        self._advance(len(start))
889        text = self._extract_string(end)
890
891        if base:
892            try:
893                int(text, base)
894            except:
895                raise TokenError(
896                    f"Numeric string contains invalid characters from {self._line}:{self._start}"
897                )
898
899        self._add(token_type, text)
900        return True
901
902    def _scan_identifier(self, identifier_end: str) -> None:
903        self._advance()
904        text = self._extract_string(identifier_end, self._IDENTIFIER_ESCAPES)
905        self._add(TokenType.IDENTIFIER, text)
906
907    def _scan_var(self) -> None:
908        while True:
909            char = self._peek.strip()
910            if char and (char in self.VAR_SINGLE_TOKENS or char not in self.SINGLE_TOKENS):
911                self._advance(alnum=True)
912            else:
913                break
914
915        self._add(
916            TokenType.VAR
917            if self.tokens and self.tokens[-1].token_type == TokenType.PARAMETER
918            else self.KEYWORDS.get(self._text.upper(), TokenType.VAR)
919        )
920
921    def _extract_string(self, delimiter: str, escapes=None) -> str:
922        text = ""
923        delim_size = len(delimiter)
924        escapes = self._STRING_ESCAPES if escapes is None else escapes
925
926        while True:
927            if (
928                self._char in escapes
929                and (self._peek == delimiter or self._peek in escapes)
930                and (self._char not in self._QUOTES or self._char == self._peek)
931            ):
932                if self._peek == delimiter:
933                    text += self._peek
934                else:
935                    text += self._char + self._peek
936
937                if self._current + 1 < self.size:
938                    self._advance(2)
939                else:
940                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._current}")
941            else:
942                if self._chars(delim_size) == delimiter:
943                    if delim_size > 1:
944                        self._advance(delim_size - 1)
945                    break
946
947                if self._end:
948                    raise TokenError(f"Missing {delimiter} from {self._line}:{self._start}")
949
950                if (
951                    self.dialect.ESCAPE_SEQUENCES
952                    and self._peek
953                    and self._char in self.STRING_ESCAPES
954                ):
955                    escaped_sequence = self.dialect.ESCAPE_SEQUENCES.get(self._char + self._peek)
956                    if escaped_sequence:
957                        self._advance(2)
958                        text += escaped_sequence
959                        continue
960
961                current = self._current - 1
962                self._advance(alnum=True)
963                text += self.sql[current : self._current - 1]
964
965        return text
966
967    def tokenize_rs(self, sql: str) -> t.List[Token]:
968        if not self._RS_TOKENIZER:
969            raise SqlglotError("Rust tokenizer is not available")
970
971        try:
972            return [
973                Token(
974                    token_type=_ALL_TOKEN_TYPES[token.token_type.index],
975                    text=token.text,
976                    line=token.line,
977                    col=token.col,
978                    start=token.start,
979                    end=token.end,
980                    comments=token.comments,
981                )
982                for token in self._RS_TOKENIZER.tokenize(sql, self._rs_dialect_settings)
983            ]
984        except Exception as e:
985            raise TokenError(str(e))
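
A minimal usage sketch, assuming sqlglot is installed and importable, of driving the tokenizer defined above; the attributes printed are the ones carried by Token:

    from sqlglot.tokens import Tokenizer

    tokenizer = Tokenizer()  # dialect=None resolves to the default dialect
    for token in tokenizer.tokenize("SELECT a + 1 AS b FROM t -- trailing note"):
        print(token.token_type, repr(token.text), token.line, token.col)

The trailing "--" comment produces no token of its own: per _scan_comment, a comment that starts on the same line as the previous token is appended to that token's comments list.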
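
The COMMANDS handling inside _add means that when a statement's leading keyword maps to TokenType.COMMAND (EXPLAIN does in the default keyword table), everything up to the next semicolon is re-captured as a single STRING token instead of being tokenized normally. A sketch of the effect, under the same assumption that sqlglot is importable:

    from sqlglot.tokens import Tokenizer

    tokens = Tokenizer().tokenize("EXPLAIN SELECT 1; SELECT 2")
    # Expected shape: COMMAND("EXPLAIN"), STRING("SELECT 1"), SEMICOLON(";"),
    # followed by the second statement tokenized as usual.
    print([(t.token_type, t.text) for t in tokens])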
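
NUMERIC_LITERALS is empty by default; dialect tokenizers can override it to recognize suffixed numeric literals, as the comment next to it notes for Hive's 3L. A sketch of how _scan_number then splits such a literal into NUMBER, "::" and the mapped type keyword; the one-entry mapping below is illustrative, not any dialect's full table:

    from sqlglot.tokens import Tokenizer

    class SuffixTokenizer(Tokenizer):
        # Hypothetical minimal mapping: treat a trailing "L" as a BIGINT suffix.
        NUMERIC_LITERALS = {"L": "BIGINT"}

    tokens = SuffixTokenizer().tokenize("SELECT 3L")
    # Expected shape: SELECT, NUMBER("3"), DCOLON("::"), BIGINT("L"),
    # i.e. the suffixed literal is rewritten as a cast for the parser.
    print([(t.token_type, t.text) for t in tokens])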