sqlglot.dialects.bigquery
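The listing below is the full source of the module, which registers the BigQuery dialect (tokenizer, parser and generator) with sqlglot. As a quick, illustrative sketch of how the dialect is typically exercised through sqlglot's public API (the query and target dialect here are arbitrary examples, not part of this module):

import sqlglot

# Parse BigQuery SQL into a sqlglot AST, then render it for another dialect.
# The "bigquery" key is registered by the Dialect subclass defined in this module.
ast = sqlglot.parse_one("SELECT col FROM `proj.dataset.tbl`", read="bigquery")
print(sqlglot.transpile("SELECT col FROM `proj.dataset.tbl`", read="bigquery", write="duckdb")[0])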
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    str_position_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import E, Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY x + 1
# ORDER by z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$' i.e all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR etc follow this return type convention:
    +---------+---------+---------+------------+---------+
    |  INPUT  |  INT64  | NUMERIC | BIGNUMERIC | FLOAT64  |
    +---------+---------+---------+------------+---------+
    | OUTPUT  | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64  |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(this=expr_type(this=seq_get(args, 1)), format=seq_get(args, 0))

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains | exp.Anonymous:
    if len(args) == 3:
        return exp.Anonymous(this="CONTAINS_SUBSTRING", expressions=args)

    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr)


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    EXPAND_ALIAS_REFS_EARLY_ONLY_IN_GROUP_BY = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Left,
                exp.Right,
                exp.Lower,
                exp.Upper,
                exp.Pad,
                exp.Trim,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Substring,
            )
        },
        exp.Concat: lambda self, e: self._annotate_by_args(e, "expressions"),
        exp.Sign: lambda self, e: self._annotate_by_args(e, "this"),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

        return expression

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "CONTAINS_SUBSTRING": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=exp.Literal.string(str(seq_get(args, 1))),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "STRPOS": exp.StrPosition.from_arg_list,
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
            "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
            "MAKE_INTERVAL": lambda self: self._parse_make_interval(),
        }
        FUNCTION_PARSERS.pop("TRIM")

        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
        }

        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                while self._match(TokenType.DASH, advance=False) and self._next:
                    text = ""
                    while self._is_connected() and self._curr.token_type != TokenType.DOT:
                        self._advance()
                        text += self._prev.text
                    table_name += text

                this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set("catalog", exp.Identifier(this=parts[0]))
                        table.set("db", exp.Identifier(this=parts[1]))
                else:
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set("db", exp.Identifier(this=parts[0]))
                        table.set("this", exp.Identifier(this=parts[1]))

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                table.set("this", exp.Identifier(this=info_schema_view, quoted=True))
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...

        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr)

                # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
                    unnest.set("explode_array", True)

            return unnest

        def _parse_make_interval(self):
            expr = exp.MakeInterval()

            for arg_key in expr.arg_types:
                value = self._parse_lambda()

                if not value:
                    break

                # Non-named arguments are filled sequentially, (optionally) followed by named arguments
                # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2)
                if isinstance(value, exp.Kwarg):
                    arg_key = value.this.name

                expr.set(arg_key, value)

                self._match(TokenType.COMMA)

            return expr

    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.DateTrunc: lambda self, e: self.func(
                "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone")
            ),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: rename_func("STRING_AGG"),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrPosition: str_position_sql,
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToDatetime: rename_func("DATETIME"),
            exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.UUID: "STRING",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def mod_sql(self, expression: exp.Mod) -> str:
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            this = expression.this
            if isinstance(this, exp.TsOrDsToDatetime):
                func_name = "FORMAT_DATETIME"
            elif isinstance(this, exp.TsOrDsToTimestamp):
                func_name = "FORMAT_TIMESTAMP"
            else:
                func_name = "FORMAT_DATE"

            time_expr = (
                this
                if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
                else expression
            )
            return self.func(func_name, self.format_time(expression), time_expr.this)

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def version_sql(self, expression: exp.Version) -> str:
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)

        def contains_sql(self, expression: exp.Contains) -> str:
            this = expression.this
            expr = expression.expression

            if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower):
                this = this.this
                expr = expr.this

            return self.func("CONTAINS_SUBSTRING", this, expr)
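A small, hedged example of one behavior implemented above (see _build_to_hex and the exp.MD5 transform in the generator): BigQuery's common TO_HEX(MD5(..)) pattern is parsed into a single exp.MD5 node so it can be transpiled cleanly. The query below is illustrative and assumes a sqlglot version whose behavior matches this source.

import sqlglot
from sqlglot import exp

# TO_HEX(MD5(..)) collapses into exp.MD5 during parsing (see _build_to_hex), and
# generating BigQuery SQL again re-expands it via the exp.MD5 transform.
tree = sqlglot.parse_one("SELECT TO_HEX(MD5(name)) FROM users", read="bigquery")
assert tree.find(exp.MD5) is not None
print(tree.sql(dialect="bigquery"))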
1010 "is", 1011 "join", 1012 "lateral", 1013 "left", 1014 "like", 1015 "limit", 1016 "lookup", 1017 "merge", 1018 "natural", 1019 "new", 1020 "no", 1021 "not", 1022 "null", 1023 "nulls", 1024 "of", 1025 "on", 1026 "or", 1027 "order", 1028 "outer", 1029 "over", 1030 "partition", 1031 "preceding", 1032 "proto", 1033 "qualify", 1034 "range", 1035 "recursive", 1036 "respect", 1037 "right", 1038 "rollup", 1039 "rows", 1040 "select", 1041 "set", 1042 "some", 1043 "struct", 1044 "tablesample", 1045 "then", 1046 "to", 1047 "treat", 1048 "true", 1049 "unbounded", 1050 "union", 1051 "unnest", 1052 "using", 1053 "when", 1054 "where", 1055 "window", 1056 "with", 1057 "within", 1058 } 1059 1060 def mod_sql(self, expression: exp.Mod) -> str: 1061 this = expression.this 1062 expr = expression.expression 1063 return self.func( 1064 "MOD", 1065 this.unnest() if isinstance(this, exp.Paren) else this, 1066 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1067 ) 1068 1069 def column_parts(self, expression: exp.Column) -> str: 1070 if expression.meta.get("quoted_column"): 1071 # If a column reference is of the form `dataset.table`.name, we need 1072 # to preserve the quoted table path, otherwise the reference breaks 1073 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1074 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1075 return f"{table_path}.{self.sql(expression, 'this')}" 1076 1077 return super().column_parts(expression) 1078 1079 def table_parts(self, expression: exp.Table) -> str: 1080 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1081 # we need to make sure the correct quoting is used in each case. 1082 # 1083 # For example, if there is a CTE x that clashes with a schema name, then the former will 1084 # return the table y in that schema, whereas the latter will return the CTE's y column: 1085 # 1086 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1087 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1088 if expression.meta.get("quoted_table"): 1089 table_parts = ".".join(p.name for p in expression.parts) 1090 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1091 1092 return super().table_parts(expression) 1093 1094 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1095 this = expression.this 1096 if isinstance(this, exp.TsOrDsToDatetime): 1097 func_name = "FORMAT_DATETIME" 1098 elif isinstance(this, exp.TsOrDsToTimestamp): 1099 func_name = "FORMAT_TIMESTAMP" 1100 else: 1101 func_name = "FORMAT_DATE" 1102 1103 time_expr = ( 1104 this 1105 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1106 else expression 1107 ) 1108 return self.func(func_name, self.format_time(expression), time_expr.this) 1109 1110 def eq_sql(self, expression: exp.EQ) -> str: 1111 # Operands of = cannot be NULL in BigQuery 1112 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1113 if not isinstance(expression.parent, exp.Update): 1114 return "NULL" 1115 1116 return self.binary(expression, "=") 1117 1118 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1119 parent = expression.parent 1120 1121 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1122 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
1123 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1124 return self.func( 1125 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1126 ) 1127 1128 return super().attimezone_sql(expression) 1129 1130 def trycast_sql(self, expression: exp.TryCast) -> str: 1131 return self.cast_sql(expression, safe_prefix="SAFE_") 1132 1133 def bracket_sql(self, expression: exp.Bracket) -> str: 1134 this = expression.this 1135 expressions = expression.expressions 1136 1137 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1138 arg = expressions[0] 1139 if arg.type is None: 1140 from sqlglot.optimizer.annotate_types import annotate_types 1141 1142 arg = annotate_types(arg) 1143 1144 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1145 # BQ doesn't support bracket syntax with string values for structs 1146 return f"{self.sql(this)}.{arg.name}" 1147 1148 expressions_sql = self.expressions(expression, flat=True) 1149 offset = expression.args.get("offset") 1150 1151 if offset == 0: 1152 expressions_sql = f"OFFSET({expressions_sql})" 1153 elif offset == 1: 1154 expressions_sql = f"ORDINAL({expressions_sql})" 1155 elif offset is not None: 1156 self.unsupported(f"Unsupported array offset: {offset}") 1157 1158 if expression.args.get("safe"): 1159 expressions_sql = f"SAFE_{expressions_sql}" 1160 1161 return f"{self.sql(this)}[{expressions_sql}]" 1162 1163 def in_unnest_op(self, expression: exp.Unnest) -> str: 1164 return self.sql(expression) 1165 1166 def version_sql(self, expression: exp.Version) -> str: 1167 if expression.name == "TIMESTAMP": 1168 expression.set("this", "SYSTEM_TIME") 1169 return super().version_sql(expression) 1170 1171 def contains_sql(self, expression: exp.Contains) -> str: 1172 this = expression.this 1173 expr = expression.expression 1174 1175 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1176 this = this.this 1177 expr = expr.this 1178 1179 return self.func("CONTAINS_SUBSTRING", this, expr)
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
Whether the base comes first in the LOG function. Possible values: True, False, None (two arguments are not supported by LOG).
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (SELECT 1 AS id, 2 AS my_id) SELECT id AS my_id FROM data WHERE my_id = 1 GROUP BY my_id HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- Clickhouse, which will forward the alias across the whole query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
Whether alias reference expansion before qualification should only happen for the GROUP BY clause.
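As a rough illustration of the alias-forwarding behaviour described above, the example query can be run through sqlglot's qualification step. This is a hedged sketch: it assumes the public sqlglot.optimizer.qualify.qualify entry point and that no explicit schema is needed here because the CTE defines its own columns; the exact resolved SQL depends on the installed sqlglot version.

import sqlglot
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1
"""

# Under the BigQuery dialect, the SELECT alias is expected to be forwarded into
# GROUP BY and HAVING, i.e. resolved back to the underlying "id" column.
expression = sqlglot.parse_one(sql, read="bigquery")
print(qualify(expression, dialect="bigquery").sql(dialect="bigquery"))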
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
- "upper" or True: Convert names to uppercase.
- "lower": Convert names to lowercase.
- False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
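For illustration, the dialect's TIME_MAPPING can be inspected directly. A minimal sketch; the entries themselves depend on the installed sqlglot version:

from sqlglot.dialects.bigquery import BigQuery

# TIME_MAPPING associates BigQuery format tokens with Python strftime tokens.
for bq_token, py_token in list(BigQuery.TIME_MAPPING.items())[:5]:
    print(f"{bq_token!r} -> {py_token!r}")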
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
def normalize_identifier(self, expression: E) -> E:
    if (
        isinstance(expression, exp.Identifier)
        and self.normalization_strategy is not NormalizationStrategy.CASE_SENSITIVE
    ):
        parent = expression.parent
        while isinstance(parent, exp.Dot):
            parent = parent.parent

        # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
        # by default. The following check uses a heuristic to detect tables based on whether
        # they are qualified. This should generally be correct, because tables in BigQuery
        # must be qualified with at least a dataset, unless @@dataset_id is set.
        case_sensitive = (
            isinstance(parent, exp.UserDefinedFunction)
            or (
                isinstance(parent, exp.Table)
                and parent.db
                and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
            )
            or expression.meta.get("is_table")
        )
        if not case_sensitive:
            expression.set("this", expression.this.lower())

    return expression
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system, for example they may always be case-sensitive in Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
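A minimal sketch of this method for BigQuery, where unqualified identifiers are lowercased while identifiers flagged as table names keep their case (per the heuristic in the source above). It assumes Dialect.get_or_raise returns a dialect instance, as in recent sqlglot versions:

from sqlglot import exp
from sqlglot.dialects.dialect import Dialect

bigquery = Dialect.get_or_raise("bigquery")

# A bare identifier (e.g. a column or CTE reference) is case-insensitive in BigQuery,
# so normalization lowercases it.
ident = exp.to_identifier("FoO")
print(bigquery.normalize_identifier(ident).sql(dialect="bigquery"))

# An identifier marked as a table name via its meta is left untouched,
# because table names are case-sensitive by default.
tbl = exp.to_identifier("FoO")
tbl.meta["is_table"] = True
print(bigquery.normalize_identifier(tbl).sql(dialect="bigquery"))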
Mapping of an escaped sequence (e.g. the two-character sequence \n) to its unescaped version (the corresponding literal character, such as a newline).
Inherited Members
- sqlglot.dialects.dialect.Dialect
- Dialect
- INDEX_OFFSET
- ALIAS_POST_TABLESAMPLE
- TABLESAMPLE_SIZE_IS_PERCENT
- IDENTIFIERS_CAN_START_WITH_DIGIT
- DPIPE_IS_STRING_CONCAT
- STRICT_STRING_CONCAT
- COPY_PARAMS_ARE_CSV
- NULL_ORDERING
- TYPED_DIVISION
- SAFE_DIVISION
- CONCAT_COALESCE
- DATE_FORMAT
- DATEINT_FORMAT
- TIME_FORMAT
- PREFER_CTE_ALIAS_COLUMN
- SUPPORTS_ORDER_BY_ALL
- HAS_DISTINCT_ARRAY_CONSTRUCTORS
- SUPPORTS_FIXED_SIZE_ARRAYS
- STRICT_JSON_PATH_SYNTAX
- ON_CONDITION_EMPTY_BEFORE_ERROR
- ARRAY_AGG_INCLUDES_NULLS
- REGEXP_EXTRACT_DEFAULT_GROUP
- CREATABLE_KIND_MAPPING
- DATE_PART_MAPPING
- TYPE_TO_EXPRESSIONS
- get_or_raise
- format_time
- settings
- case_sensitive
- can_identify
- quote_identifier
- to_json_path
- parse
- parse_into
- generate
- transpile
- tokenize
- tokenizer
- jsonpath_tokenizer
- parser
- generator
class Tokenizer(tokens.Tokenizer):
    QUOTES = ["'", '"', '"""', "'''"]
    COMMENTS = ["--", "#", ("/*", "*/")]
    IDENTIFIERS = ["`"]
    STRING_ESCAPES = ["\\"]

    HEX_STRINGS = [("0x", ""), ("0X", "")]

    BYTE_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
    ]

    RAW_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
    ]

    KEYWORDS = {
        **tokens.Tokenizer.KEYWORDS,
        "ANY TYPE": TokenType.VARIANT,
        "BEGIN": TokenType.COMMAND,
        "BEGIN TRANSACTION": TokenType.BEGIN,
        "BYTEINT": TokenType.INT,
        "BYTES": TokenType.BINARY,
        "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
        "DATETIME": TokenType.TIMESTAMP,
        "DECLARE": TokenType.COMMAND,
        "ELSEIF": TokenType.COMMAND,
        "EXCEPTION": TokenType.COMMAND,
        "FLOAT64": TokenType.DOUBLE,
        "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
        "MODEL": TokenType.MODEL,
        "NOT DETERMINISTIC": TokenType.VOLATILE,
        "RECORD": TokenType.STRUCT,
        "TIMESTAMP": TokenType.TIMESTAMPTZ,
    }
    KEYWORDS.pop("DIV")
    KEYWORDS.pop("VALUES")
    KEYWORDS.pop("/*+")
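A small sketch exercising some of the BigQuery-specific tokenizer settings above (backtick identifiers, raw and byte string prefixes, hex literals, "#" comments); the resulting token types are not asserted here since they can vary across sqlglot versions:

import sqlglot

sql = "SELECT r'raw\\n', b'bytes', 0xFF, `my-project.dataset.tbl` FROM t # trailing comment"
for token in sqlglot.tokenize(sql, read="bigquery"):
    print(token.token_type, repr(token.text))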
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- NESTED_COMMENTS
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
466 class Parser(parser.Parser): 467 PREFIXED_PIVOT_COLUMNS = True 468 LOG_DEFAULTS_TO_LN = True 469 SUPPORTS_IMPLICIT_UNNEST = True 470 471 FUNCTIONS = { 472 **parser.Parser.FUNCTIONS, 473 "CONTAINS_SUBSTRING": _build_contains_substring, 474 "DATE": _build_date, 475 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 476 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 477 "DATE_TRUNC": lambda args: exp.DateTrunc( 478 unit=exp.Literal.string(str(seq_get(args, 1))), 479 this=seq_get(args, 0), 480 zone=seq_get(args, 2), 481 ), 482 "DATETIME": _build_datetime, 483 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 484 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 485 "DIV": binary_from_function(exp.IntDiv), 486 "EDIT_DISTANCE": _build_levenshtein, 487 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 488 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 489 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 490 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 491 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 492 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 493 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 494 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 495 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 496 "MD5": exp.MD5Digest.from_arg_list, 497 "TO_HEX": _build_to_hex, 498 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 499 [seq_get(args, 1), seq_get(args, 0)] 500 ), 501 "PARSE_TIMESTAMP": _build_parse_timestamp, 502 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 503 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 504 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 505 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 506 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 507 ), 508 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 509 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 510 "SPLIT": lambda args: exp.Split( 511 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 512 this=seq_get(args, 0), 513 expression=seq_get(args, 1) or exp.Literal.string(","), 514 ), 515 "STRPOS": exp.StrPosition.from_arg_list, 516 "TIME": _build_time, 517 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 518 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 519 "TIMESTAMP": _build_timestamp, 520 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 521 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 522 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 523 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 524 ), 525 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 526 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 527 ), 528 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 529 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 530 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 531 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 532 } 533 534 FUNCTION_PARSERS = { 535 **parser.Parser.FUNCTION_PARSERS, 536 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 537 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 538 } 539 
FUNCTION_PARSERS.pop("TRIM") 540 541 NO_PAREN_FUNCTIONS = { 542 **parser.Parser.NO_PAREN_FUNCTIONS, 543 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 544 } 545 546 NESTED_TYPE_TOKENS = { 547 *parser.Parser.NESTED_TYPE_TOKENS, 548 TokenType.TABLE, 549 } 550 551 PROPERTY_PARSERS = { 552 **parser.Parser.PROPERTY_PARSERS, 553 "NOT DETERMINISTIC": lambda self: self.expression( 554 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 555 ), 556 "OPTIONS": lambda self: self._parse_with_property(), 557 } 558 559 CONSTRAINT_PARSERS = { 560 **parser.Parser.CONSTRAINT_PARSERS, 561 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 562 } 563 564 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 565 RANGE_PARSERS.pop(TokenType.OVERLAPS) 566 567 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 568 569 STATEMENT_PARSERS = { 570 **parser.Parser.STATEMENT_PARSERS, 571 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 572 TokenType.END: lambda self: self._parse_as_command(self._prev), 573 TokenType.FOR: lambda self: self._parse_for_in(), 574 } 575 576 BRACKET_OFFSETS = { 577 "OFFSET": (0, False), 578 "ORDINAL": (1, False), 579 "SAFE_OFFSET": (0, True), 580 "SAFE_ORDINAL": (1, True), 581 } 582 583 def _parse_for_in(self) -> exp.ForIn: 584 this = self._parse_range() 585 self._match_text_seq("DO") 586 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 587 588 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 589 this = super()._parse_table_part(schema=schema) or self._parse_number() 590 591 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 592 if isinstance(this, exp.Identifier): 593 table_name = this.name 594 while self._match(TokenType.DASH, advance=False) and self._next: 595 text = "" 596 while self._is_connected() and self._curr.token_type != TokenType.DOT: 597 self._advance() 598 text += self._prev.text 599 table_name += text 600 601 this = exp.Identifier(this=table_name, quoted=this.args.get("quoted")) 602 elif isinstance(this, exp.Literal): 603 table_name = this.name 604 605 if self._is_connected() and self._parse_var(any_token=True): 606 table_name += self._prev.text 607 608 this = exp.Identifier(this=table_name, quoted=True) 609 610 return this 611 612 def _parse_table_parts( 613 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 614 ) -> exp.Table: 615 table = super()._parse_table_parts( 616 schema=schema, is_db_reference=is_db_reference, wildcard=True 617 ) 618 619 # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here 620 if not table.catalog: 621 if table.db: 622 parts = table.db.split(".") 623 if len(parts) == 2 and not table.args["db"].quoted: 624 table.set("catalog", exp.Identifier(this=parts[0])) 625 table.set("db", exp.Identifier(this=parts[1])) 626 else: 627 parts = table.name.split(".") 628 if len(parts) == 2 and not table.this.quoted: 629 table.set("db", exp.Identifier(this=parts[0])) 630 table.set("this", exp.Identifier(this=parts[1])) 631 632 if isinstance(table.this, exp.Identifier) and any("." 
in p.name for p in table.parts): 633 alias = table.this 634 catalog, db, this, *rest = ( 635 exp.to_identifier(p, quoted=True) 636 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 637 ) 638 639 if rest and this: 640 this = exp.Dot.build([this, *rest]) # type: ignore 641 642 table = exp.Table( 643 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 644 ) 645 table.meta["quoted_table"] = True 646 else: 647 alias = None 648 649 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 650 # dataset, so if the project identifier is omitted we need to fix the ast so that 651 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 652 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 653 # views, because it would seem like the "catalog" part is set, when it'd actually 654 # be the region/dataset. Merging the two identifiers into a single one is done to 655 # avoid producing a 4-part Table reference, which would cause issues in the schema 656 # module, when there are 3-part table names mixed with information schema views. 657 # 658 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 659 table_parts = table.parts 660 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 661 # We need to alias the table here to avoid breaking existing qualified columns. 662 # This is expected to be safe, because if there's an actual alias coming up in 663 # the token stream, it will overwrite this one. If there isn't one, we are only 664 # exposing the name that can be used to reference the view explicitly (a no-op). 665 exp.alias_( 666 table, 667 t.cast(exp.Identifier, alias or table_parts[-1]), 668 table=True, 669 copy=False, 670 ) 671 672 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 673 table.set("this", exp.Identifier(this=info_schema_view, quoted=True)) 674 table.set("db", seq_get(table_parts, -3)) 675 table.set("catalog", seq_get(table_parts, -4)) 676 677 return table 678 679 def _parse_column(self) -> t.Optional[exp.Expression]: 680 column = super()._parse_column() 681 if isinstance(column, exp.Column): 682 parts = column.parts 683 if any("." in p.name for p in parts): 684 catalog, db, table, this, *rest = ( 685 exp.to_identifier(p, quoted=True) 686 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 687 ) 688 689 if rest and this: 690 this = exp.Dot.build([this, *rest]) # type: ignore 691 692 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 693 column.meta["quoted_column"] = True 694 695 return column 696 697 @t.overload 698 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 699 700 @t.overload 701 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 
702 703 def _parse_json_object(self, agg=False): 704 json_object = super()._parse_json_object() 705 array_kv_pair = seq_get(json_object.expressions, 0) 706 707 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 708 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 709 if ( 710 array_kv_pair 711 and isinstance(array_kv_pair.this, exp.Array) 712 and isinstance(array_kv_pair.expression, exp.Array) 713 ): 714 keys = array_kv_pair.this.expressions 715 values = array_kv_pair.expression.expressions 716 717 json_object.set( 718 "expressions", 719 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 720 ) 721 722 return json_object 723 724 def _parse_bracket( 725 self, this: t.Optional[exp.Expression] = None 726 ) -> t.Optional[exp.Expression]: 727 bracket = super()._parse_bracket(this) 728 729 if this is bracket: 730 return bracket 731 732 if isinstance(bracket, exp.Bracket): 733 for expression in bracket.expressions: 734 name = expression.name.upper() 735 736 if name not in self.BRACKET_OFFSETS: 737 break 738 739 offset, safe = self.BRACKET_OFFSETS[name] 740 bracket.set("offset", offset) 741 bracket.set("safe", safe) 742 expression.replace(expression.expressions[0]) 743 744 return bracket 745 746 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 747 unnest = super()._parse_unnest(with_alias=with_alias) 748 749 if not unnest: 750 return None 751 752 unnest_expr = seq_get(unnest.expressions, 0) 753 if unnest_expr: 754 from sqlglot.optimizer.annotate_types import annotate_types 755 756 unnest_expr = annotate_types(unnest_expr) 757 758 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 759 # in contrast to other dialects such as DuckDB which flattens only the array by default 760 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 761 array_elem.is_type(exp.DataType.Type.STRUCT) 762 for array_elem in unnest_expr._type.expressions 763 ): 764 unnest.set("explode_array", True) 765 766 return unnest 767 768 def _parse_make_interval(self): 769 expr = exp.MakeInterval() 770 771 for arg_key in expr.arg_types: 772 value = self._parse_lambda() 773 774 if not value: 775 break 776 777 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 778 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 779 if isinstance(value, exp.Kwarg): 780 arg_key = value.this.name 781 782 expr.set(arg_key, value) 783 784 self._match(TokenType.COMMA) 785 786 return expr
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
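A hedged sketch of a couple of the parser behaviours configured above (the FUNCTIONS mapping and the dash-separated table-name handling in _parse_table_part); the exact ASTs and rendered SQL depend on the installed sqlglot version:

import sqlglot
from sqlglot import exp

# TIMESTAMP_MICROS(...) is parsed into the dialect-agnostic UnixToTime expression.
ast = sqlglot.parse_one("SELECT TIMESTAMP_MICROS(1719000000000000)", read="bigquery")
print(ast.find(exp.UnixToTime))

# Dash-separated project names are folded into a single table identifier.
table = sqlglot.parse_one("SELECT * FROM my-project.dataset.tbl", read="bigquery").find(exp.Table)
print(table.sql(dialect="bigquery"))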
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- INTERVAL_VARS
- ALIAS_TOKENS
- ARRAY_CONSTRUCTORS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- OPERATION_MODIFIERS
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- errors
- sql
788 class Generator(generator.Generator): 789 INTERVAL_ALLOWS_PLURAL_FORM = False 790 JOIN_HINTS = False 791 QUERY_HINTS = False 792 TABLE_HINTS = False 793 LIMIT_FETCH = "LIMIT" 794 RENAME_TABLE_WITH_DB = False 795 NVL2_SUPPORTED = False 796 UNNEST_WITH_ORDINALITY = False 797 COLLATE_IS_FUNC = True 798 LIMIT_ONLY_LITERALS = True 799 SUPPORTS_TABLE_ALIAS_COLUMNS = False 800 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 801 JSON_KEY_VALUE_PAIR_SEP = "," 802 NULL_ORDERING_SUPPORTED = False 803 IGNORE_NULLS_IN_FUNC = True 804 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 805 CAN_IMPLEMENT_ARRAY_ANY = True 806 SUPPORTS_TO_NUMBER = False 807 NAMED_PLACEHOLDER_TOKEN = "@" 808 HEX_FUNC = "TO_HEX" 809 WITH_PROPERTIES_PREFIX = "OPTIONS" 810 SUPPORTS_EXPLODING_PROJECTIONS = False 811 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 812 SUPPORTS_UNIX_SECONDS = True 813 814 TRANSFORMS = { 815 **generator.Generator.TRANSFORMS, 816 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 817 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 818 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 819 exp.Array: inline_array_unless_query, 820 exp.ArrayContains: _array_contains_sql, 821 exp.ArrayFilter: filter_array_using_unnest, 822 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 823 exp.CollateProperty: lambda self, e: ( 824 f"DEFAULT COLLATE {self.sql(e, 'this')}" 825 if e.args.get("default") 826 else f"COLLATE {self.sql(e, 'this')}" 827 ), 828 exp.Commit: lambda *_: "COMMIT TRANSACTION", 829 exp.CountIf: rename_func("COUNTIF"), 830 exp.Create: _create_sql, 831 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 832 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 833 exp.DateDiff: lambda self, e: self.func( 834 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 835 ), 836 exp.DateFromParts: rename_func("DATE"), 837 exp.DateStrToDate: datestrtodate_sql, 838 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 839 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 840 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 841 exp.DateTrunc: lambda self, e: self.func( 842 "DATE_TRUNC", e.this, e.text("unit"), e.args.get("zone") 843 ), 844 exp.FromTimeZone: lambda self, e: self.func( 845 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 846 ), 847 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 848 exp.GroupConcat: rename_func("STRING_AGG"), 849 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 850 exp.If: if_sql(false_value="NULL"), 851 exp.ILike: no_ilike_sql, 852 exp.IntDiv: rename_func("DIV"), 853 exp.Int64: rename_func("INT64"), 854 exp.JSONFormat: rename_func("TO_JSON_STRING"), 855 exp.Levenshtein: _levenshtein_sql, 856 exp.Max: max_or_greatest, 857 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 858 exp.MD5Digest: rename_func("MD5"), 859 exp.Min: min_or_least, 860 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 861 exp.RegexpExtract: lambda self, e: self.func( 862 "REGEXP_EXTRACT", 863 e.this, 864 e.expression, 865 e.args.get("position"), 866 e.args.get("occurrence"), 867 ), 868 exp.RegexpExtractAll: lambda self, e: self.func( 869 "REGEXP_EXTRACT_ALL", e.this, e.expression 870 ), 871 exp.RegexpReplace: regexp_replace_sql, 872 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 873 exp.ReturnsProperty: _returnsproperty_sql, 874 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 875 exp.Select: transforms.preprocess( 876 [ 877 transforms.explode_to_unnest(), 
878 transforms.unqualify_unnest, 879 transforms.eliminate_distinct_on, 880 _alias_ordered_group, 881 transforms.eliminate_semi_and_anti_joins, 882 ] 883 ), 884 exp.SHA: rename_func("SHA1"), 885 exp.SHA2: sha256_sql, 886 exp.StabilityProperty: lambda self, e: ( 887 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 888 ), 889 exp.String: rename_func("STRING"), 890 exp.StrPosition: str_position_sql, 891 exp.StrToDate: _str_to_datetime_sql, 892 exp.StrToTime: _str_to_datetime_sql, 893 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 894 exp.TimeFromParts: rename_func("TIME"), 895 exp.TimestampFromParts: rename_func("DATETIME"), 896 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 897 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 898 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 899 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 900 exp.TimeStrToTime: timestrtotime_sql, 901 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 902 exp.TsOrDsAdd: _ts_or_ds_add_sql, 903 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 904 exp.TsOrDsToTime: rename_func("TIME"), 905 exp.TsOrDsToDatetime: rename_func("DATETIME"), 906 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 907 exp.Unhex: rename_func("FROM_HEX"), 908 exp.UnixDate: rename_func("UNIX_DATE"), 909 exp.UnixToTime: _unix_to_time_sql, 910 exp.Uuid: lambda *_: "GENERATE_UUID()", 911 exp.Values: _derived_table_values_to_unnest, 912 exp.VariancePop: rename_func("VAR_POP"), 913 } 914 915 SUPPORTED_JSON_PATH_PARTS = { 916 exp.JSONPathKey, 917 exp.JSONPathRoot, 918 exp.JSONPathSubscript, 919 } 920 921 TYPE_MAPPING = { 922 **generator.Generator.TYPE_MAPPING, 923 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 924 exp.DataType.Type.BIGINT: "INT64", 925 exp.DataType.Type.BINARY: "BYTES", 926 exp.DataType.Type.BOOLEAN: "BOOL", 927 exp.DataType.Type.CHAR: "STRING", 928 exp.DataType.Type.DECIMAL: "NUMERIC", 929 exp.DataType.Type.DOUBLE: "FLOAT64", 930 exp.DataType.Type.FLOAT: "FLOAT64", 931 exp.DataType.Type.INT: "INT64", 932 exp.DataType.Type.NCHAR: "STRING", 933 exp.DataType.Type.NVARCHAR: "STRING", 934 exp.DataType.Type.SMALLINT: "INT64", 935 exp.DataType.Type.TEXT: "STRING", 936 exp.DataType.Type.TIMESTAMP: "DATETIME", 937 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 938 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 939 exp.DataType.Type.TINYINT: "INT64", 940 exp.DataType.Type.ROWVERSION: "BYTES", 941 exp.DataType.Type.UUID: "STRING", 942 exp.DataType.Type.VARBINARY: "BYTES", 943 exp.DataType.Type.VARCHAR: "STRING", 944 exp.DataType.Type.VARIANT: "ANY TYPE", 945 } 946 947 PROPERTIES_LOCATION = { 948 **generator.Generator.PROPERTIES_LOCATION, 949 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 950 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 951 } 952 953 # WINDOW comes after QUALIFY 954 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 955 AFTER_HAVING_MODIFIER_TRANSFORMS = { 956 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 957 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 958 } 959 960 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 961 RESERVED_KEYWORDS = { 962 "all", 963 "and", 964 "any", 965 "array", 966 "as", 967 "asc", 968 "assert_rows_modified", 969 "at", 970 "between", 971 "by", 972 "case", 973 "cast", 974 "collate", 975 "contains", 976 "create", 977 "cross", 978 "cube", 979 "current", 980 "default", 981 "define", 982 "desc", 983 "distinct", 
984 "else", 985 "end", 986 "enum", 987 "escape", 988 "except", 989 "exclude", 990 "exists", 991 "extract", 992 "false", 993 "fetch", 994 "following", 995 "for", 996 "from", 997 "full", 998 "group", 999 "grouping", 1000 "groups", 1001 "hash", 1002 "having", 1003 "if", 1004 "ignore", 1005 "in", 1006 "inner", 1007 "intersect", 1008 "interval", 1009 "into", 1010 "is", 1011 "join", 1012 "lateral", 1013 "left", 1014 "like", 1015 "limit", 1016 "lookup", 1017 "merge", 1018 "natural", 1019 "new", 1020 "no", 1021 "not", 1022 "null", 1023 "nulls", 1024 "of", 1025 "on", 1026 "or", 1027 "order", 1028 "outer", 1029 "over", 1030 "partition", 1031 "preceding", 1032 "proto", 1033 "qualify", 1034 "range", 1035 "recursive", 1036 "respect", 1037 "right", 1038 "rollup", 1039 "rows", 1040 "select", 1041 "set", 1042 "some", 1043 "struct", 1044 "tablesample", 1045 "then", 1046 "to", 1047 "treat", 1048 "true", 1049 "unbounded", 1050 "union", 1051 "unnest", 1052 "using", 1053 "when", 1054 "where", 1055 "window", 1056 "with", 1057 "within", 1058 } 1059 1060 def mod_sql(self, expression: exp.Mod) -> str: 1061 this = expression.this 1062 expr = expression.expression 1063 return self.func( 1064 "MOD", 1065 this.unnest() if isinstance(this, exp.Paren) else this, 1066 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1067 ) 1068 1069 def column_parts(self, expression: exp.Column) -> str: 1070 if expression.meta.get("quoted_column"): 1071 # If a column reference is of the form `dataset.table`.name, we need 1072 # to preserve the quoted table path, otherwise the reference breaks 1073 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1074 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1075 return f"{table_path}.{self.sql(expression, 'this')}" 1076 1077 return super().column_parts(expression) 1078 1079 def table_parts(self, expression: exp.Table) -> str: 1080 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1081 # we need to make sure the correct quoting is used in each case. 
1082 # 1083 # For example, if there is a CTE x that clashes with a schema name, then the former will 1084 # return the table y in that schema, whereas the latter will return the CTE's y column: 1085 # 1086 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1087 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1088 if expression.meta.get("quoted_table"): 1089 table_parts = ".".join(p.name for p in expression.parts) 1090 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1091 1092 return super().table_parts(expression) 1093 1094 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1095 this = expression.this 1096 if isinstance(this, exp.TsOrDsToDatetime): 1097 func_name = "FORMAT_DATETIME" 1098 elif isinstance(this, exp.TsOrDsToTimestamp): 1099 func_name = "FORMAT_TIMESTAMP" 1100 else: 1101 func_name = "FORMAT_DATE" 1102 1103 time_expr = ( 1104 this 1105 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1106 else expression 1107 ) 1108 return self.func(func_name, self.format_time(expression), time_expr.this) 1109 1110 def eq_sql(self, expression: exp.EQ) -> str: 1111 # Operands of = cannot be NULL in BigQuery 1112 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1113 if not isinstance(expression.parent, exp.Update): 1114 return "NULL" 1115 1116 return self.binary(expression, "=") 1117 1118 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1119 parent = expression.parent 1120 1121 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1122 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1123 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1124 return self.func( 1125 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1126 ) 1127 1128 return super().attimezone_sql(expression) 1129 1130 def trycast_sql(self, expression: exp.TryCast) -> str: 1131 return self.cast_sql(expression, safe_prefix="SAFE_") 1132 1133 def bracket_sql(self, expression: exp.Bracket) -> str: 1134 this = expression.this 1135 expressions = expression.expressions 1136 1137 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1138 arg = expressions[0] 1139 if arg.type is None: 1140 from sqlglot.optimizer.annotate_types import annotate_types 1141 1142 arg = annotate_types(arg) 1143 1144 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1145 # BQ doesn't support bracket syntax with string values for structs 1146 return f"{self.sql(this)}.{arg.name}" 1147 1148 expressions_sql = self.expressions(expression, flat=True) 1149 offset = expression.args.get("offset") 1150 1151 if offset == 0: 1152 expressions_sql = f"OFFSET({expressions_sql})" 1153 elif offset == 1: 1154 expressions_sql = f"ORDINAL({expressions_sql})" 1155 elif offset is not None: 1156 self.unsupported(f"Unsupported array offset: {offset}") 1157 1158 if expression.args.get("safe"): 1159 expressions_sql = f"SAFE_{expressions_sql}" 1160 1161 return f"{self.sql(this)}[{expressions_sql}]" 1162 1163 def in_unnest_op(self, expression: exp.Unnest) -> str: 1164 return self.sql(expression) 1165 1166 def version_sql(self, expression: exp.Version) -> str: 1167 if expression.name == "TIMESTAMP": 1168 expression.set("this", "SYSTEM_TIME") 1169 return super().version_sql(expression) 1170 1171 def contains_sql(self, expression: exp.Contains) -> str: 1172 this = expression.this 1173 expr 
= expression.expression 1174 1175 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1176 this = this.this 1177 expr = expr.this 1178 1179 return self.func("CONTAINS_SUBSTRING", this, expr)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
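A short sketch of the generator settings above in action, covering TYPE_MAPPING and the SAFE_ prefix used by trycast_sql; outputs are indicative and may vary slightly across sqlglot versions:

import sqlglot

# VARCHAR maps to STRING per TYPE_MAPPING (precision parameters are dropped by the Cast transform).
print(sqlglot.transpile("SELECT CAST(x AS VARCHAR(10))", write="bigquery")[0])

# TRY_CAST read from DuckDB is rendered with the SAFE_ prefix, and INT maps to INT64.
print(sqlglot.transpile("SELECT TRY_CAST(x AS INT)", read="duckdb", write="bigquery")[0])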
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ARRAY_SIZE_DIM_REQUIRED
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- pad_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- transformcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- cast_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- try_sql
- log_sql
- use_sql
- binary
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql