# sqlglot.dialects.bigquery
1from __future__ import annotations 2 3import logging 4import re 5import typing as t 6 7from sqlglot import exp, generator, parser, tokens, transforms 8from sqlglot._typing import E 9from sqlglot.dialects.dialect import ( 10 Dialect, 11 NormalizationStrategy, 12 arg_max_or_min_no_count, 13 binary_from_function, 14 date_add_interval_sql, 15 datestrtodate_sql, 16 format_time_lambda, 17 if_sql, 18 inline_array_sql, 19 json_keyvalue_comma_sql, 20 max_or_greatest, 21 min_or_least, 22 no_ilike_sql, 23 parse_date_delta_with_interval, 24 regexp_replace_sql, 25 rename_func, 26 timestrtotime_sql, 27 ts_or_ds_add_cast, 28 ts_or_ds_to_date_sql, 29) 30from sqlglot.helper import seq_get, split_num_words 31from sqlglot.tokens import TokenType 32 33logger = logging.getLogger("sqlglot") 34 35 36def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str: 37 if not expression.find_ancestor(exp.From, exp.Join): 38 return self.values_sql(expression) 39 40 alias = expression.args.get("alias") 41 42 structs = [ 43 exp.Struct( 44 expressions=[ 45 exp.alias_(value, column_name) 46 for value, column_name in zip( 47 t.expressions, 48 alias.columns 49 if alias and alias.columns 50 else (f"_c{i}" for i in range(len(t.expressions))), 51 ) 52 ] 53 ) 54 for t in expression.find_all(exp.Tuple) 55 ] 56 57 return self.unnest_sql(exp.Unnest(expressions=[exp.Array(expressions=structs)])) 58 59 60def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str: 61 this = expression.this 62 if isinstance(this, exp.Schema): 63 this = f"{this.this} <{self.expressions(this)}>" 64 else: 65 this = self.sql(this) 66 return f"RETURNS {this}" 67 68 69def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str: 70 kind = expression.args["kind"] 71 returns = expression.find(exp.ReturnsProperty) 72 73 if kind.upper() == "FUNCTION" and returns and returns.args.get("is_table"): 74 expression.set("kind", "TABLE FUNCTION") 75 76 if 
isinstance(expression.expression, (exp.Subquery, exp.Literal)): 77 expression.set("expression", expression.expression.this) 78 79 return self.create_sql(expression) 80 81 return self.create_sql(expression) 82 83 84def _unqualify_unnest(expression: exp.Expression) -> exp.Expression: 85 """Remove references to unnest table aliases since bigquery doesn't allow them. 86 87 These are added by the optimizer's qualify_column step. 88 """ 89 from sqlglot.optimizer.scope import find_all_in_scope 90 91 if isinstance(expression, exp.Select): 92 unnest_aliases = { 93 unnest.alias 94 for unnest in find_all_in_scope(expression, exp.Unnest) 95 if isinstance(unnest.parent, (exp.From, exp.Join)) 96 } 97 if unnest_aliases: 98 for column in expression.find_all(exp.Column): 99 if column.table in unnest_aliases: 100 column.set("table", None) 101 elif column.db in unnest_aliases: 102 column.set("db", None) 103 104 return expression 105 106 107# https://issuetracker.google.com/issues/162294746 108# workaround for bigquery bug when grouping by an expression and then ordering 109# WITH x AS (SELECT 1 y) 110# SELECT y + 1 z 111# FROM x 112# GROUP BY x + 1 113# ORDER by z 114def _alias_ordered_group(expression: exp.Expression) -> exp.Expression: 115 if isinstance(expression, exp.Select): 116 group = expression.args.get("group") 117 order = expression.args.get("order") 118 119 if group and order: 120 aliases = { 121 select.this: select.args["alias"] 122 for select in expression.selects 123 if isinstance(select, exp.Alias) 124 } 125 126 for e in group.expressions: 127 alias = aliases.get(e) 128 129 if alias: 130 e.replace(exp.column(alias)) 131 132 return expression 133 134 135def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression: 136 """BigQuery doesn't allow column names when defining a CTE, so we try to push them down.""" 137 if isinstance(expression, exp.CTE) and expression.alias_column_names: 138 cte_query = expression.this 139 140 if cte_query.is_star: 141 
logger.warning( 142 "Can't push down CTE column names for star queries. Run the query through" 143 " the optimizer or use 'qualify' to expand the star projections first." 144 ) 145 return expression 146 147 column_names = expression.alias_column_names 148 expression.args["alias"].set("columns", None) 149 150 for name, select in zip(column_names, cte_query.selects): 151 to_replace = select 152 153 if isinstance(select, exp.Alias): 154 select = select.this 155 156 # Inner aliases are shadowed by the CTE column names 157 to_replace.replace(exp.alias_(select, name)) 158 159 return expression 160 161 162def _parse_timestamp(args: t.List) -> exp.StrToTime: 163 this = format_time_lambda(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)]) 164 this.set("zone", seq_get(args, 2)) 165 return this 166 167 168def _parse_date(args: t.List) -> exp.Date | exp.DateFromParts: 169 expr_type = exp.DateFromParts if len(args) == 3 else exp.Date 170 return expr_type.from_arg_list(args) 171 172 173def _parse_to_hex(args: t.List) -> exp.Hex | exp.MD5: 174 # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation 175 arg = seq_get(args, 0) 176 return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.Hex(this=arg) 177 178 179def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str: 180 return self.sql( 181 exp.Exists( 182 this=exp.select("1") 183 .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"])) 184 .where(exp.column("_col").eq(expression.right)) 185 ) 186 ) 187 188 189def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str: 190 return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression)) 191 192 193def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str: 194 expression.this.replace(exp.cast(expression.this, "TIMESTAMP", copy=True)) 195 
expression.expression.replace(exp.cast(expression.expression, "TIMESTAMP", copy=True)) 196 unit = expression.args.get("unit") or "DAY" 197 return self.func("DATE_DIFF", expression.this, expression.expression, unit) 198 199 200def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str: 201 scale = expression.args.get("scale") 202 timestamp = self.sql(expression, "this") 203 if scale in (None, exp.UnixToTime.SECONDS): 204 return f"TIMESTAMP_SECONDS({timestamp})" 205 if scale == exp.UnixToTime.MILLIS: 206 return f"TIMESTAMP_MILLIS({timestamp})" 207 if scale == exp.UnixToTime.MICROS: 208 return f"TIMESTAMP_MICROS({timestamp})" 209 if scale == exp.UnixToTime.NANOS: 210 # We need to cast to INT64 because that's what BQ expects 211 return f"TIMESTAMP_MICROS(CAST({timestamp} / 1000 AS INT64))" 212 213 self.unsupported(f"Unsupported scale for timestamp: {scale}.") 214 return "" 215 216 217class BigQuery(Dialect): 218 UNNEST_COLUMN_ONLY = True 219 SUPPORTS_USER_DEFINED_TYPES = False 220 SUPPORTS_SEMI_ANTI_JOIN = False 221 LOG_BASE_FIRST = False 222 223 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 224 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 225 226 # bigquery udfs are case sensitive 227 NORMALIZE_FUNCTIONS = False 228 229 TIME_MAPPING = { 230 "%D": "%m/%d/%y", 231 } 232 233 ESCAPE_SEQUENCES = { 234 "\\a": "\a", 235 "\\b": "\b", 236 "\\f": "\f", 237 "\\n": "\n", 238 "\\r": "\r", 239 "\\t": "\t", 240 "\\v": "\v", 241 } 242 243 FORMAT_MAPPING = { 244 "DD": "%d", 245 "MM": "%m", 246 "MON": "%b", 247 "MONTH": "%B", 248 "YYYY": "%Y", 249 "YY": "%y", 250 "HH": "%I", 251 "HH12": "%I", 252 "HH24": "%H", 253 "MI": "%M", 254 "SS": "%S", 255 "SSSSS": "%f", 256 "TZH": "%z", 257 } 258 259 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 260 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 261 
PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"} 262 263 def normalize_identifier(self, expression: E) -> E: 264 if isinstance(expression, exp.Identifier): 265 parent = expression.parent 266 while isinstance(parent, exp.Dot): 267 parent = parent.parent 268 269 # In BigQuery, CTEs aren't case-sensitive, but table names are (by default, at least). 270 # The following check is essentially a heuristic to detect tables based on whether or 271 # not they're qualified. It also avoids normalizing UDFs, because they're case-sensitive. 272 if ( 273 not isinstance(parent, exp.UserDefinedFunction) 274 and not (isinstance(parent, exp.Table) and parent.db) 275 and not expression.meta.get("is_table") 276 ): 277 expression.set("this", expression.this.lower()) 278 279 return expression 280 281 class Tokenizer(tokens.Tokenizer): 282 QUOTES = ["'", '"', '"""', "'''"] 283 COMMENTS = ["--", "#", ("/*", "*/")] 284 IDENTIFIERS = ["`"] 285 STRING_ESCAPES = ["\\"] 286 287 HEX_STRINGS = [("0x", ""), ("0X", "")] 288 289 BYTE_STRINGS = [ 290 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 291 ] 292 293 RAW_STRINGS = [ 294 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 295 ] 296 297 KEYWORDS = { 298 **tokens.Tokenizer.KEYWORDS, 299 "ANY TYPE": TokenType.VARIANT, 300 "BEGIN": TokenType.COMMAND, 301 "BEGIN TRANSACTION": TokenType.BEGIN, 302 "BYTES": TokenType.BINARY, 303 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 304 "DECLARE": TokenType.COMMAND, 305 "FLOAT64": TokenType.DOUBLE, 306 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 307 "MODEL": TokenType.MODEL, 308 "NOT DETERMINISTIC": TokenType.VOLATILE, 309 "RECORD": TokenType.STRUCT, 310 "TIMESTAMP": TokenType.TIMESTAMPTZ, 311 } 312 KEYWORDS.pop("DIV") 313 314 class Parser(parser.Parser): 315 PREFIXED_PIVOT_COLUMNS = True 316 317 LOG_DEFAULTS_TO_LN = True 318 319 FUNCTIONS = { 320 **parser.Parser.FUNCTIONS, 321 "DATE": _parse_date, 322 "DATE_ADD": 
parse_date_delta_with_interval(exp.DateAdd), 323 "DATE_SUB": parse_date_delta_with_interval(exp.DateSub), 324 "DATE_TRUNC": lambda args: exp.DateTrunc( 325 unit=exp.Literal.string(str(seq_get(args, 1))), 326 this=seq_get(args, 0), 327 ), 328 "DATETIME_ADD": parse_date_delta_with_interval(exp.DatetimeAdd), 329 "DATETIME_SUB": parse_date_delta_with_interval(exp.DatetimeSub), 330 "DIV": binary_from_function(exp.IntDiv), 331 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 332 "MD5": exp.MD5Digest.from_arg_list, 333 "TO_HEX": _parse_to_hex, 334 "PARSE_DATE": lambda args: format_time_lambda(exp.StrToDate, "bigquery")( 335 [seq_get(args, 1), seq_get(args, 0)] 336 ), 337 "PARSE_TIMESTAMP": _parse_timestamp, 338 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 339 "REGEXP_EXTRACT": lambda args: exp.RegexpExtract( 340 this=seq_get(args, 0), 341 expression=seq_get(args, 1), 342 position=seq_get(args, 2), 343 occurrence=seq_get(args, 3), 344 group=exp.Literal.number(1) if re.compile(args[1].name).groups == 1 else None, 345 ), 346 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 347 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 348 "SPLIT": lambda args: exp.Split( 349 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 350 this=seq_get(args, 0), 351 expression=seq_get(args, 1) or exp.Literal.string(","), 352 ), 353 "TIME_ADD": parse_date_delta_with_interval(exp.TimeAdd), 354 "TIME_SUB": parse_date_delta_with_interval(exp.TimeSub), 355 "TIMESTAMP_ADD": parse_date_delta_with_interval(exp.TimestampAdd), 356 "TIMESTAMP_SUB": parse_date_delta_with_interval(exp.TimestampSub), 357 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 358 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 359 ), 360 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 361 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 362 ), 363 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime( 364 
this=seq_get(args, 0), scale=exp.UnixToTime.SECONDS 365 ), 366 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 367 } 368 369 FUNCTION_PARSERS = { 370 **parser.Parser.FUNCTION_PARSERS, 371 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 372 } 373 FUNCTION_PARSERS.pop("TRIM") 374 375 NO_PAREN_FUNCTIONS = { 376 **parser.Parser.NO_PAREN_FUNCTIONS, 377 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 378 } 379 380 NESTED_TYPE_TOKENS = { 381 *parser.Parser.NESTED_TYPE_TOKENS, 382 TokenType.TABLE, 383 } 384 385 ID_VAR_TOKENS = { 386 *parser.Parser.ID_VAR_TOKENS, 387 TokenType.VALUES, 388 } 389 390 PROPERTY_PARSERS = { 391 **parser.Parser.PROPERTY_PARSERS, 392 "NOT DETERMINISTIC": lambda self: self.expression( 393 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 394 ), 395 "OPTIONS": lambda self: self._parse_with_property(), 396 } 397 398 CONSTRAINT_PARSERS = { 399 **parser.Parser.CONSTRAINT_PARSERS, 400 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 401 } 402 403 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 404 RANGE_PARSERS.pop(TokenType.OVERLAPS, None) 405 406 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 407 408 STATEMENT_PARSERS = { 409 **parser.Parser.STATEMENT_PARSERS, 410 TokenType.END: lambda self: self._parse_as_command(self._prev), 411 TokenType.FOR: lambda self: self._parse_for_in(), 412 } 413 414 BRACKET_OFFSETS = { 415 "OFFSET": (0, False), 416 "ORDINAL": (1, False), 417 "SAFE_OFFSET": (0, True), 418 "SAFE_ORDINAL": (1, True), 419 } 420 421 def _parse_for_in(self) -> exp.ForIn: 422 this = self._parse_range() 423 self._match_text_seq("DO") 424 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 425 426 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 427 this = super()._parse_table_part(schema=schema) or self._parse_number() 428 429 # 
https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 430 if isinstance(this, exp.Identifier): 431 table_name = this.name 432 while self._match(TokenType.DASH, advance=False) and self._next: 433 self._advance(2) 434 table_name += f"-{self._prev.text}" 435 436 this = exp.Identifier(this=table_name, quoted=this.args.get("quoted")) 437 elif isinstance(this, exp.Literal): 438 table_name = this.name 439 440 if self._is_connected() and self._parse_var(any_token=True): 441 table_name += self._prev.text 442 443 this = exp.Identifier(this=table_name, quoted=True) 444 445 return this 446 447 def _parse_table_parts(self, schema: bool = False) -> exp.Table: 448 table = super()._parse_table_parts(schema=schema) 449 if isinstance(table.this, exp.Identifier) and "." in table.name: 450 catalog, db, this, *rest = ( 451 t.cast(t.Optional[exp.Expression], exp.to_identifier(x)) 452 for x in split_num_words(table.name, ".", 3) 453 ) 454 455 if rest and this: 456 this = exp.Dot.build(t.cast(t.List[exp.Expression], [this, *rest])) 457 458 table = exp.Table(this=this, db=db, catalog=catalog) 459 460 return table 461 462 def _parse_json_object(self) -> exp.JSONObject: 463 json_object = super()._parse_json_object() 464 array_kv_pair = seq_get(json_object.expressions, 0) 465 466 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 467 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 468 if ( 469 array_kv_pair 470 and isinstance(array_kv_pair.this, exp.Array) 471 and isinstance(array_kv_pair.expression, exp.Array) 472 ): 473 keys = array_kv_pair.this.expressions 474 values = array_kv_pair.expression.expressions 475 476 json_object.set( 477 "expressions", 478 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 479 ) 480 481 return json_object 482 483 def _parse_bracket(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]: 484 bracket = 
super()._parse_bracket(this) 485 486 if this is bracket: 487 return bracket 488 489 if isinstance(bracket, exp.Bracket): 490 for expression in bracket.expressions: 491 name = expression.name.upper() 492 493 if name not in self.BRACKET_OFFSETS: 494 break 495 496 offset, safe = self.BRACKET_OFFSETS[name] 497 bracket.set("offset", offset) 498 bracket.set("safe", safe) 499 expression.replace(expression.expressions[0]) 500 501 return bracket 502 503 class Generator(generator.Generator): 504 EXPLICIT_UNION = True 505 INTERVAL_ALLOWS_PLURAL_FORM = False 506 JOIN_HINTS = False 507 QUERY_HINTS = False 508 TABLE_HINTS = False 509 LIMIT_FETCH = "LIMIT" 510 RENAME_TABLE_WITH_DB = False 511 NVL2_SUPPORTED = False 512 UNNEST_WITH_ORDINALITY = False 513 COLLATE_IS_FUNC = True 514 LIMIT_ONLY_LITERALS = True 515 516 TRANSFORMS = { 517 **generator.Generator.TRANSFORMS, 518 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 519 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 520 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 521 exp.ArrayContains: _array_contains_sql, 522 exp.ArraySize: rename_func("ARRAY_LENGTH"), 523 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 524 exp.CollateProperty: lambda self, e: f"DEFAULT COLLATE {self.sql(e, 'this')}" 525 if e.args.get("default") 526 else f"COLLATE {self.sql(e, 'this')}", 527 exp.Create: _create_sql, 528 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 529 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 530 exp.DateDiff: lambda self, e: f"DATE_DIFF({self.sql(e, 'this')}, {self.sql(e, 'expression')}, {self.sql(e.args.get('unit', 'DAY'))})", 531 exp.DateFromParts: rename_func("DATE"), 532 exp.DateStrToDate: datestrtodate_sql, 533 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 534 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 535 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 536 exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, 
e.text("unit")), 537 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 538 exp.GroupConcat: rename_func("STRING_AGG"), 539 exp.Hex: rename_func("TO_HEX"), 540 exp.If: if_sql(false_value="NULL"), 541 exp.ILike: no_ilike_sql, 542 exp.IntDiv: rename_func("DIV"), 543 exp.JSONFormat: rename_func("TO_JSON_STRING"), 544 exp.JSONKeyValue: json_keyvalue_comma_sql, 545 exp.Max: max_or_greatest, 546 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 547 exp.MD5Digest: rename_func("MD5"), 548 exp.Min: min_or_least, 549 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 550 exp.RegexpExtract: lambda self, e: self.func( 551 "REGEXP_EXTRACT", 552 e.this, 553 e.expression, 554 e.args.get("position"), 555 e.args.get("occurrence"), 556 ), 557 exp.RegexpReplace: regexp_replace_sql, 558 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 559 exp.ReturnsProperty: _returnsproperty_sql, 560 exp.Select: transforms.preprocess( 561 [ 562 transforms.explode_to_unnest(), 563 _unqualify_unnest, 564 transforms.eliminate_distinct_on, 565 _alias_ordered_group, 566 transforms.eliminate_semi_and_anti_joins, 567 ] 568 ), 569 exp.SHA2: lambda self, e: self.func( 570 f"SHA256" if e.text("length") == "256" else "SHA512", e.this 571 ), 572 exp.StabilityProperty: lambda self, e: f"DETERMINISTIC" 573 if e.name == "IMMUTABLE" 574 else "NOT DETERMINISTIC", 575 exp.StrToDate: lambda self, e: f"PARSE_DATE({self.format_time(e)}, {self.sql(e, 'this')})", 576 exp.StrToTime: lambda self, e: self.func( 577 "PARSE_TIMESTAMP", self.format_time(e), e.this, e.args.get("zone") 578 ), 579 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 580 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 581 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 582 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 583 exp.TimeStrToTime: timestrtotime_sql, 584 exp.TimeToStr: lambda self, e: f"FORMAT_DATE({self.format_time(e)}, {self.sql(e, 'this')})", 585 exp.Trim: 
lambda self, e: self.func(f"TRIM", e.this, e.expression), 586 exp.TsOrDsAdd: _ts_or_ds_add_sql, 587 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 588 exp.TsOrDsToDate: ts_or_ds_to_date_sql("bigquery"), 589 exp.Unhex: rename_func("FROM_HEX"), 590 exp.UnixToTime: _unix_to_time_sql, 591 exp.Values: _derived_table_values_to_unnest, 592 exp.VariancePop: rename_func("VAR_POP"), 593 } 594 595 TYPE_MAPPING = { 596 **generator.Generator.TYPE_MAPPING, 597 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 598 exp.DataType.Type.BIGINT: "INT64", 599 exp.DataType.Type.BINARY: "BYTES", 600 exp.DataType.Type.BOOLEAN: "BOOL", 601 exp.DataType.Type.CHAR: "STRING", 602 exp.DataType.Type.DECIMAL: "NUMERIC", 603 exp.DataType.Type.DOUBLE: "FLOAT64", 604 exp.DataType.Type.FLOAT: "FLOAT64", 605 exp.DataType.Type.INT: "INT64", 606 exp.DataType.Type.NCHAR: "STRING", 607 exp.DataType.Type.NVARCHAR: "STRING", 608 exp.DataType.Type.SMALLINT: "INT64", 609 exp.DataType.Type.TEXT: "STRING", 610 exp.DataType.Type.TIMESTAMP: "DATETIME", 611 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 612 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 613 exp.DataType.Type.TINYINT: "INT64", 614 exp.DataType.Type.VARBINARY: "BYTES", 615 exp.DataType.Type.VARCHAR: "STRING", 616 exp.DataType.Type.VARIANT: "ANY TYPE", 617 } 618 619 PROPERTIES_LOCATION = { 620 **generator.Generator.PROPERTIES_LOCATION, 621 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 622 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 623 } 624 625 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 626 RESERVED_KEYWORDS = { 627 *generator.Generator.RESERVED_KEYWORDS, 628 "all", 629 "and", 630 "any", 631 "array", 632 "as", 633 "asc", 634 "assert_rows_modified", 635 "at", 636 "between", 637 "by", 638 "case", 639 "cast", 640 "collate", 641 "contains", 642 "create", 643 "cross", 644 "cube", 645 "current", 646 "default", 647 "define", 648 "desc", 649 "distinct", 650 "else", 651 "end", 652 "enum", 653 
"escape", 654 "except", 655 "exclude", 656 "exists", 657 "extract", 658 "false", 659 "fetch", 660 "following", 661 "for", 662 "from", 663 "full", 664 "group", 665 "grouping", 666 "groups", 667 "hash", 668 "having", 669 "if", 670 "ignore", 671 "in", 672 "inner", 673 "intersect", 674 "interval", 675 "into", 676 "is", 677 "join", 678 "lateral", 679 "left", 680 "like", 681 "limit", 682 "lookup", 683 "merge", 684 "natural", 685 "new", 686 "no", 687 "not", 688 "null", 689 "nulls", 690 "of", 691 "on", 692 "or", 693 "order", 694 "outer", 695 "over", 696 "partition", 697 "preceding", 698 "proto", 699 "qualify", 700 "range", 701 "recursive", 702 "respect", 703 "right", 704 "rollup", 705 "rows", 706 "select", 707 "set", 708 "some", 709 "struct", 710 "tablesample", 711 "then", 712 "to", 713 "treat", 714 "true", 715 "unbounded", 716 "union", 717 "unnest", 718 "using", 719 "when", 720 "where", 721 "window", 722 "with", 723 "within", 724 } 725 726 def eq_sql(self, expression: exp.EQ) -> str: 727 # Operands of = cannot be NULL in BigQuery 728 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 729 return "NULL" 730 731 return self.binary(expression, "=") 732 733 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 734 parent = expression.parent 735 736 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 737 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 
738 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 739 return self.func( 740 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 741 ) 742 743 return super().attimezone_sql(expression) 744 745 def trycast_sql(self, expression: exp.TryCast) -> str: 746 return self.cast_sql(expression, safe_prefix="SAFE_") 747 748 def cte_sql(self, expression: exp.CTE) -> str: 749 if expression.alias_column_names: 750 self.unsupported("Column names in CTE definition are not supported.") 751 return super().cte_sql(expression) 752 753 def array_sql(self, expression: exp.Array) -> str: 754 first_arg = seq_get(expression.expressions, 0) 755 if isinstance(first_arg, exp.Subqueryable): 756 return f"ARRAY{self.wrap(self.sql(first_arg))}" 757 758 return inline_array_sql(self, expression) 759 760 def bracket_sql(self, expression: exp.Bracket) -> str: 761 expressions = expression.expressions 762 expressions_sql = ", ".join(self.sql(e) for e in expressions) 763 offset = expression.args.get("offset") 764 765 if offset == 0: 766 expressions_sql = f"OFFSET({expressions_sql})" 767 elif offset == 1: 768 expressions_sql = f"ORDINAL({expressions_sql})" 769 else: 770 self.unsupported(f"Unsupported array offset: {offset}") 771 772 if expression.args.get("safe"): 773 expressions_sql = f"SAFE_{expressions_sql}" 774 775 return f"{self.sql(expression, 'this')}[{expressions_sql}]" 776 777 def transaction_sql(self, *_) -> str: 778 return "BEGIN TRANSACTION" 779 780 def commit_sql(self, *_) -> str: 781 return "COMMIT TRANSACTION" 782 783 def rollback_sql(self, *_) -> str: 784 return "ROLLBACK TRANSACTION" 785 786 def in_unnest_op(self, expression: exp.Unnest) -> str: 787 return self.sql(expression) 788 789 def except_op(self, expression: exp.Except) -> str: 790 if not expression.args.get("distinct", False): 791 self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery") 792 return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' 
ALL'}" 793 794 def intersect_op(self, expression: exp.Intersect) -> str: 795 if not expression.args.get("distinct", False): 796 self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery") 797 return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}" 798 799 def with_properties(self, properties: exp.Properties) -> str: 800 return self.properties(properties, prefix=self.seg("OPTIONS")) 801 802 def version_sql(self, expression: exp.Version) -> str: 803 if expression.name == "TIMESTAMP": 804 expression.set("this", "SYSTEM_TIME") 805 return super().version_sql(expression)
218class BigQuery(Dialect): 219 UNNEST_COLUMN_ONLY = True 220 SUPPORTS_USER_DEFINED_TYPES = False 221 SUPPORTS_SEMI_ANTI_JOIN = False 222 LOG_BASE_FIRST = False 223 224 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity 225 NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE 226 227 # bigquery udfs are case sensitive 228 NORMALIZE_FUNCTIONS = False 229 230 TIME_MAPPING = { 231 "%D": "%m/%d/%y", 232 } 233 234 ESCAPE_SEQUENCES = { 235 "\\a": "\a", 236 "\\b": "\b", 237 "\\f": "\f", 238 "\\n": "\n", 239 "\\r": "\r", 240 "\\t": "\t", 241 "\\v": "\v", 242 } 243 244 FORMAT_MAPPING = { 245 "DD": "%d", 246 "MM": "%m", 247 "MON": "%b", 248 "MONTH": "%B", 249 "YYYY": "%Y", 250 "YY": "%y", 251 "HH": "%I", 252 "HH12": "%I", 253 "HH24": "%H", 254 "MI": "%M", 255 "SS": "%S", 256 "SSSSS": "%f", 257 "TZH": "%z", 258 } 259 260 # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement 261 # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table 262 PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"} 263 264 def normalize_identifier(self, expression: E) -> E: 265 if isinstance(expression, exp.Identifier): 266 parent = expression.parent 267 while isinstance(parent, exp.Dot): 268 parent = parent.parent 269 270 # In BigQuery, CTEs aren't case-sensitive, but table names are (by default, at least). 271 # The following check is essentially a heuristic to detect tables based on whether or 272 # not they're qualified. It also avoids normalizing UDFs, because they're case-sensitive. 
273 if ( 274 not isinstance(parent, exp.UserDefinedFunction) 275 and not (isinstance(parent, exp.Table) and parent.db) 276 and not expression.meta.get("is_table") 277 ): 278 expression.set("this", expression.this.lower()) 279 280 return expression 281 282 class Tokenizer(tokens.Tokenizer): 283 QUOTES = ["'", '"', '"""', "'''"] 284 COMMENTS = ["--", "#", ("/*", "*/")] 285 IDENTIFIERS = ["`"] 286 STRING_ESCAPES = ["\\"] 287 288 HEX_STRINGS = [("0x", ""), ("0X", "")] 289 290 BYTE_STRINGS = [ 291 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 292 ] 293 294 RAW_STRINGS = [ 295 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 296 ] 297 298 KEYWORDS = { 299 **tokens.Tokenizer.KEYWORDS, 300 "ANY TYPE": TokenType.VARIANT, 301 "BEGIN": TokenType.COMMAND, 302 "BEGIN TRANSACTION": TokenType.BEGIN, 303 "BYTES": TokenType.BINARY, 304 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 305 "DECLARE": TokenType.COMMAND, 306 "FLOAT64": TokenType.DOUBLE, 307 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 308 "MODEL": TokenType.MODEL, 309 "NOT DETERMINISTIC": TokenType.VOLATILE, 310 "RECORD": TokenType.STRUCT, 311 "TIMESTAMP": TokenType.TIMESTAMPTZ, 312 } 313 KEYWORDS.pop("DIV") 314 315 class Parser(parser.Parser): 316 PREFIXED_PIVOT_COLUMNS = True 317 318 LOG_DEFAULTS_TO_LN = True 319 320 FUNCTIONS = { 321 **parser.Parser.FUNCTIONS, 322 "DATE": _parse_date, 323 "DATE_ADD": parse_date_delta_with_interval(exp.DateAdd), 324 "DATE_SUB": parse_date_delta_with_interval(exp.DateSub), 325 "DATE_TRUNC": lambda args: exp.DateTrunc( 326 unit=exp.Literal.string(str(seq_get(args, 1))), 327 this=seq_get(args, 0), 328 ), 329 "DATETIME_ADD": parse_date_delta_with_interval(exp.DatetimeAdd), 330 "DATETIME_SUB": parse_date_delta_with_interval(exp.DatetimeSub), 331 "DIV": binary_from_function(exp.IntDiv), 332 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 333 "MD5": exp.MD5Digest.from_arg_list, 334 "TO_HEX": _parse_to_hex, 335 "PARSE_DATE": 
lambda args: format_time_lambda(exp.StrToDate, "bigquery")( 336 [seq_get(args, 1), seq_get(args, 0)] 337 ), 338 "PARSE_TIMESTAMP": _parse_timestamp, 339 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 340 "REGEXP_EXTRACT": lambda args: exp.RegexpExtract( 341 this=seq_get(args, 0), 342 expression=seq_get(args, 1), 343 position=seq_get(args, 2), 344 occurrence=seq_get(args, 3), 345 group=exp.Literal.number(1) if re.compile(args[1].name).groups == 1 else None, 346 ), 347 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 348 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 349 "SPLIT": lambda args: exp.Split( 350 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 351 this=seq_get(args, 0), 352 expression=seq_get(args, 1) or exp.Literal.string(","), 353 ), 354 "TIME_ADD": parse_date_delta_with_interval(exp.TimeAdd), 355 "TIME_SUB": parse_date_delta_with_interval(exp.TimeSub), 356 "TIMESTAMP_ADD": parse_date_delta_with_interval(exp.TimestampAdd), 357 "TIMESTAMP_SUB": parse_date_delta_with_interval(exp.TimestampSub), 358 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 359 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 360 ), 361 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 362 this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS 363 ), 364 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime( 365 this=seq_get(args, 0), scale=exp.UnixToTime.SECONDS 366 ), 367 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 368 } 369 370 FUNCTION_PARSERS = { 371 **parser.Parser.FUNCTION_PARSERS, 372 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 373 } 374 FUNCTION_PARSERS.pop("TRIM") 375 376 NO_PAREN_FUNCTIONS = { 377 **parser.Parser.NO_PAREN_FUNCTIONS, 378 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 379 } 380 381 NESTED_TYPE_TOKENS = { 382 *parser.Parser.NESTED_TYPE_TOKENS, 383 TokenType.TABLE, 384 } 385 386 ID_VAR_TOKENS = { 
387 *parser.Parser.ID_VAR_TOKENS, 388 TokenType.VALUES, 389 } 390 391 PROPERTY_PARSERS = { 392 **parser.Parser.PROPERTY_PARSERS, 393 "NOT DETERMINISTIC": lambda self: self.expression( 394 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 395 ), 396 "OPTIONS": lambda self: self._parse_with_property(), 397 } 398 399 CONSTRAINT_PARSERS = { 400 **parser.Parser.CONSTRAINT_PARSERS, 401 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 402 } 403 404 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 405 RANGE_PARSERS.pop(TokenType.OVERLAPS, None) 406 407 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 408 409 STATEMENT_PARSERS = { 410 **parser.Parser.STATEMENT_PARSERS, 411 TokenType.END: lambda self: self._parse_as_command(self._prev), 412 TokenType.FOR: lambda self: self._parse_for_in(), 413 } 414 415 BRACKET_OFFSETS = { 416 "OFFSET": (0, False), 417 "ORDINAL": (1, False), 418 "SAFE_OFFSET": (0, True), 419 "SAFE_ORDINAL": (1, True), 420 } 421 422 def _parse_for_in(self) -> exp.ForIn: 423 this = self._parse_range() 424 self._match_text_seq("DO") 425 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 426 427 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 428 this = super()._parse_table_part(schema=schema) or self._parse_number() 429 430 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 431 if isinstance(this, exp.Identifier): 432 table_name = this.name 433 while self._match(TokenType.DASH, advance=False) and self._next: 434 self._advance(2) 435 table_name += f"-{self._prev.text}" 436 437 this = exp.Identifier(this=table_name, quoted=this.args.get("quoted")) 438 elif isinstance(this, exp.Literal): 439 table_name = this.name 440 441 if self._is_connected() and self._parse_var(any_token=True): 442 table_name += self._prev.text 443 444 this = exp.Identifier(this=table_name, quoted=True) 445 446 return this 447 448 def 
_parse_table_parts(self, schema: bool = False) -> exp.Table: 449 table = super()._parse_table_parts(schema=schema) 450 if isinstance(table.this, exp.Identifier) and "." in table.name: 451 catalog, db, this, *rest = ( 452 t.cast(t.Optional[exp.Expression], exp.to_identifier(x)) 453 for x in split_num_words(table.name, ".", 3) 454 ) 455 456 if rest and this: 457 this = exp.Dot.build(t.cast(t.List[exp.Expression], [this, *rest])) 458 459 table = exp.Table(this=this, db=db, catalog=catalog) 460 461 return table 462 463 def _parse_json_object(self) -> exp.JSONObject: 464 json_object = super()._parse_json_object() 465 array_kv_pair = seq_get(json_object.expressions, 0) 466 467 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 468 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 469 if ( 470 array_kv_pair 471 and isinstance(array_kv_pair.this, exp.Array) 472 and isinstance(array_kv_pair.expression, exp.Array) 473 ): 474 keys = array_kv_pair.this.expressions 475 values = array_kv_pair.expression.expressions 476 477 json_object.set( 478 "expressions", 479 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 480 ) 481 482 return json_object 483 484 def _parse_bracket(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]: 485 bracket = super()._parse_bracket(this) 486 487 if this is bracket: 488 return bracket 489 490 if isinstance(bracket, exp.Bracket): 491 for expression in bracket.expressions: 492 name = expression.name.upper() 493 494 if name not in self.BRACKET_OFFSETS: 495 break 496 497 offset, safe = self.BRACKET_OFFSETS[name] 498 bracket.set("offset", offset) 499 bracket.set("safe", safe) 500 expression.replace(expression.expressions[0]) 501 502 return bracket 503 504 class Generator(generator.Generator): 505 EXPLICIT_UNION = True 506 INTERVAL_ALLOWS_PLURAL_FORM = False 507 JOIN_HINTS = False 508 QUERY_HINTS = False 509 TABLE_HINTS = False 510 
LIMIT_FETCH = "LIMIT" 511 RENAME_TABLE_WITH_DB = False 512 NVL2_SUPPORTED = False 513 UNNEST_WITH_ORDINALITY = False 514 COLLATE_IS_FUNC = True 515 LIMIT_ONLY_LITERALS = True 516 517 TRANSFORMS = { 518 **generator.Generator.TRANSFORMS, 519 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 520 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 521 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 522 exp.ArrayContains: _array_contains_sql, 523 exp.ArraySize: rename_func("ARRAY_LENGTH"), 524 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 525 exp.CollateProperty: lambda self, e: f"DEFAULT COLLATE {self.sql(e, 'this')}" 526 if e.args.get("default") 527 else f"COLLATE {self.sql(e, 'this')}", 528 exp.Create: _create_sql, 529 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 530 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 531 exp.DateDiff: lambda self, e: f"DATE_DIFF({self.sql(e, 'this')}, {self.sql(e, 'expression')}, {self.sql(e.args.get('unit', 'DAY'))})", 532 exp.DateFromParts: rename_func("DATE"), 533 exp.DateStrToDate: datestrtodate_sql, 534 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 535 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 536 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 537 exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, e.text("unit")), 538 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 539 exp.GroupConcat: rename_func("STRING_AGG"), 540 exp.Hex: rename_func("TO_HEX"), 541 exp.If: if_sql(false_value="NULL"), 542 exp.ILike: no_ilike_sql, 543 exp.IntDiv: rename_func("DIV"), 544 exp.JSONFormat: rename_func("TO_JSON_STRING"), 545 exp.JSONKeyValue: json_keyvalue_comma_sql, 546 exp.Max: max_or_greatest, 547 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 548 exp.MD5Digest: rename_func("MD5"), 549 exp.Min: min_or_least, 550 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 551 exp.RegexpExtract: 
lambda self, e: self.func( 552 "REGEXP_EXTRACT", 553 e.this, 554 e.expression, 555 e.args.get("position"), 556 e.args.get("occurrence"), 557 ), 558 exp.RegexpReplace: regexp_replace_sql, 559 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 560 exp.ReturnsProperty: _returnsproperty_sql, 561 exp.Select: transforms.preprocess( 562 [ 563 transforms.explode_to_unnest(), 564 _unqualify_unnest, 565 transforms.eliminate_distinct_on, 566 _alias_ordered_group, 567 transforms.eliminate_semi_and_anti_joins, 568 ] 569 ), 570 exp.SHA2: lambda self, e: self.func( 571 f"SHA256" if e.text("length") == "256" else "SHA512", e.this 572 ), 573 exp.StabilityProperty: lambda self, e: f"DETERMINISTIC" 574 if e.name == "IMMUTABLE" 575 else "NOT DETERMINISTIC", 576 exp.StrToDate: lambda self, e: f"PARSE_DATE({self.format_time(e)}, {self.sql(e, 'this')})", 577 exp.StrToTime: lambda self, e: self.func( 578 "PARSE_TIMESTAMP", self.format_time(e), e.this, e.args.get("zone") 579 ), 580 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 581 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 582 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 583 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 584 exp.TimeStrToTime: timestrtotime_sql, 585 exp.TimeToStr: lambda self, e: f"FORMAT_DATE({self.format_time(e)}, {self.sql(e, 'this')})", 586 exp.Trim: lambda self, e: self.func(f"TRIM", e.this, e.expression), 587 exp.TsOrDsAdd: _ts_or_ds_add_sql, 588 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 589 exp.TsOrDsToDate: ts_or_ds_to_date_sql("bigquery"), 590 exp.Unhex: rename_func("FROM_HEX"), 591 exp.UnixToTime: _unix_to_time_sql, 592 exp.Values: _derived_table_values_to_unnest, 593 exp.VariancePop: rename_func("VAR_POP"), 594 } 595 596 TYPE_MAPPING = { 597 **generator.Generator.TYPE_MAPPING, 598 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 599 exp.DataType.Type.BIGINT: "INT64", 600 exp.DataType.Type.BINARY: "BYTES", 601 exp.DataType.Type.BOOLEAN: "BOOL", 602 exp.DataType.Type.CHAR: "STRING", 
603 exp.DataType.Type.DECIMAL: "NUMERIC", 604 exp.DataType.Type.DOUBLE: "FLOAT64", 605 exp.DataType.Type.FLOAT: "FLOAT64", 606 exp.DataType.Type.INT: "INT64", 607 exp.DataType.Type.NCHAR: "STRING", 608 exp.DataType.Type.NVARCHAR: "STRING", 609 exp.DataType.Type.SMALLINT: "INT64", 610 exp.DataType.Type.TEXT: "STRING", 611 exp.DataType.Type.TIMESTAMP: "DATETIME", 612 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 613 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 614 exp.DataType.Type.TINYINT: "INT64", 615 exp.DataType.Type.VARBINARY: "BYTES", 616 exp.DataType.Type.VARCHAR: "STRING", 617 exp.DataType.Type.VARIANT: "ANY TYPE", 618 } 619 620 PROPERTIES_LOCATION = { 621 **generator.Generator.PROPERTIES_LOCATION, 622 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 623 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 624 } 625 626 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 627 RESERVED_KEYWORDS = { 628 *generator.Generator.RESERVED_KEYWORDS, 629 "all", 630 "and", 631 "any", 632 "array", 633 "as", 634 "asc", 635 "assert_rows_modified", 636 "at", 637 "between", 638 "by", 639 "case", 640 "cast", 641 "collate", 642 "contains", 643 "create", 644 "cross", 645 "cube", 646 "current", 647 "default", 648 "define", 649 "desc", 650 "distinct", 651 "else", 652 "end", 653 "enum", 654 "escape", 655 "except", 656 "exclude", 657 "exists", 658 "extract", 659 "false", 660 "fetch", 661 "following", 662 "for", 663 "from", 664 "full", 665 "group", 666 "grouping", 667 "groups", 668 "hash", 669 "having", 670 "if", 671 "ignore", 672 "in", 673 "inner", 674 "intersect", 675 "interval", 676 "into", 677 "is", 678 "join", 679 "lateral", 680 "left", 681 "like", 682 "limit", 683 "lookup", 684 "merge", 685 "natural", 686 "new", 687 "no", 688 "not", 689 "null", 690 "nulls", 691 "of", 692 "on", 693 "or", 694 "order", 695 "outer", 696 "over", 697 "partition", 698 "preceding", 699 "proto", 700 "qualify", 701 "range", 702 "recursive", 
703 "respect", 704 "right", 705 "rollup", 706 "rows", 707 "select", 708 "set", 709 "some", 710 "struct", 711 "tablesample", 712 "then", 713 "to", 714 "treat", 715 "true", 716 "unbounded", 717 "union", 718 "unnest", 719 "using", 720 "when", 721 "where", 722 "window", 723 "with", 724 "within", 725 } 726 727 def eq_sql(self, expression: exp.EQ) -> str: 728 # Operands of = cannot be NULL in BigQuery 729 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 730 return "NULL" 731 732 return self.binary(expression, "=") 733 734 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 735 parent = expression.parent 736 737 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 738 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 739 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 740 return self.func( 741 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 742 ) 743 744 return super().attimezone_sql(expression) 745 746 def trycast_sql(self, expression: exp.TryCast) -> str: 747 return self.cast_sql(expression, safe_prefix="SAFE_") 748 749 def cte_sql(self, expression: exp.CTE) -> str: 750 if expression.alias_column_names: 751 self.unsupported("Column names in CTE definition are not supported.") 752 return super().cte_sql(expression) 753 754 def array_sql(self, expression: exp.Array) -> str: 755 first_arg = seq_get(expression.expressions, 0) 756 if isinstance(first_arg, exp.Subqueryable): 757 return f"ARRAY{self.wrap(self.sql(first_arg))}" 758 759 return inline_array_sql(self, expression) 760 761 def bracket_sql(self, expression: exp.Bracket) -> str: 762 expressions = expression.expressions 763 expressions_sql = ", ".join(self.sql(e) for e in expressions) 764 offset = expression.args.get("offset") 765 766 if offset == 0: 767 expressions_sql = f"OFFSET({expressions_sql})" 768 elif offset == 1: 769 expressions_sql = 
f"ORDINAL({expressions_sql})" 770 else: 771 self.unsupported(f"Unsupported array offset: {offset}") 772 773 if expression.args.get("safe"): 774 expressions_sql = f"SAFE_{expressions_sql}" 775 776 return f"{self.sql(expression, 'this')}[{expressions_sql}]" 777 778 def transaction_sql(self, *_) -> str: 779 return "BEGIN TRANSACTION" 780 781 def commit_sql(self, *_) -> str: 782 return "COMMIT TRANSACTION" 783 784 def rollback_sql(self, *_) -> str: 785 return "ROLLBACK TRANSACTION" 786 787 def in_unnest_op(self, expression: exp.Unnest) -> str: 788 return self.sql(expression) 789 790 def except_op(self, expression: exp.Except) -> str: 791 if not expression.args.get("distinct", False): 792 self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery") 793 return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}" 794 795 def intersect_op(self, expression: exp.Intersect) -> str: 796 if not expression.args.get("distinct", False): 797 self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery") 798 return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}" 799 800 def with_properties(self, properties: exp.Properties) -> str: 801 return self.properties(properties, prefix=self.seg("OPTIONS")) 802 803 def version_sql(self, expression: exp.Version) -> str: 804 if expression.name == "TIMESTAMP": 805 expression.set("this", "SYSTEM_TIME") 806 return super().version_sql(expression)
def normalize_identifier(self, expression: E) -> E:
    """Lowercase unquoted-style identifiers the way BigQuery resolves them.

    Only `exp.Identifier` nodes are touched; any other expression is returned
    unchanged. Identifiers that look like table references, UDF names, or are
    explicitly marked as tables are left alone, because those are
    case-sensitive in BigQuery.
    """
    if isinstance(expression, exp.Identifier):
        parent = expression.parent
        # Walk up through dotted paths (a.b.c) to find the real parent node.
        while isinstance(parent, exp.Dot):
            parent = parent.parent

        # In BigQuery, CTEs aren't case-sensitive, but table names are (by default, at least).
        # The following check is essentially a heuristic to detect tables based on whether or
        # not they're qualified. It also avoids normalizing UDFs, because they're case-sensitive.
        if (
            not isinstance(parent, exp.UserDefinedFunction)
            and not (isinstance(parent, exp.Table) and parent.db)
            and not expression.meta.get("is_table")
        ):
            expression.set("this", expression.this.lower())

    return expression
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, identifiers may always be case-sensitive on Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
Inherited Members
- sqlglot.dialects.dialect.Dialect
- Dialect
- INDEX_OFFSET
- ALIAS_POST_TABLESAMPLE
- IDENTIFIERS_CAN_START_WITH_DIGIT
- DPIPE_IS_STRING_CONCAT
- STRICT_STRING_CONCAT
- NULL_ORDERING
- TYPED_DIVISION
- SAFE_DIVISION
- CONCAT_COALESCE
- DATE_FORMAT
- DATEINT_FORMAT
- TIME_FORMAT
- get_or_raise
- format_time
- case_sensitive
- can_identify
- quote_identifier
- parse
- parse_into
- generate
- transpile
- tokenize
- tokenizer
- parser
- generator
class Tokenizer(tokens.Tokenizer):
    """Tokenizer for BigQuery's GoogleSQL lexical grammar."""

    # BigQuery accepts single, double, and triple-quoted string literals.
    QUOTES = ["'", '"', '"""', "'''"]
    COMMENTS = ["--", "#", ("/*", "*/")]
    # Identifiers are backtick-quoted.
    IDENTIFIERS = ["`"]
    STRING_ESCAPES = ["\\"]

    HEX_STRINGS = [("0x", ""), ("0X", "")]

    # b'...' / B"..." etc. — byte literals combine a prefix with any quote style.
    BYTE_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
    ]

    # r'...' / R"..." etc. — raw strings, same prefix-times-quote combination.
    RAW_STRINGS = [
        (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
    ]

    KEYWORDS = {
        **tokens.Tokenizer.KEYWORDS,
        "ANY TYPE": TokenType.VARIANT,
        "BEGIN": TokenType.COMMAND,
        "BEGIN TRANSACTION": TokenType.BEGIN,
        "BYTES": TokenType.BINARY,
        "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
        "DECLARE": TokenType.COMMAND,
        "FLOAT64": TokenType.DOUBLE,
        "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
        "MODEL": TokenType.MODEL,
        "NOT DETERMINISTIC": TokenType.VOLATILE,
        "RECORD": TokenType.STRUCT,
        # BigQuery's TIMESTAMP is timezone-aware, so it maps to TIMESTAMPTZ.
        "TIMESTAMP": TokenType.TIMESTAMPTZ,
    }
    # DIV is a function in BigQuery, not a keyword.
    KEYWORDS.pop("DIV")
class Parser(parser.Parser):
    """Parser for BigQuery's GoogleSQL syntax."""

    PREFIXED_PIVOT_COLUMNS = True

    # LOG(x) defaults to the natural logarithm in BigQuery.
    LOG_DEFAULTS_TO_LN = True

    FUNCTIONS = {
        **parser.Parser.FUNCTIONS,
        "DATE": _parse_date,
        "DATE_ADD": parse_date_delta_with_interval(exp.DateAdd),
        "DATE_SUB": parse_date_delta_with_interval(exp.DateSub),
        "DATE_TRUNC": lambda args: exp.DateTrunc(
            unit=exp.Literal.string(str(seq_get(args, 1))),
            this=seq_get(args, 0),
        ),
        "DATETIME_ADD": parse_date_delta_with_interval(exp.DatetimeAdd),
        "DATETIME_SUB": parse_date_delta_with_interval(exp.DatetimeSub),
        "DIV": binary_from_function(exp.IntDiv),
        "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
        "MD5": exp.MD5Digest.from_arg_list,
        "TO_HEX": _parse_to_hex,
        # BigQuery's PARSE_DATE takes (format, value); the canonical order is (value, format).
        "PARSE_DATE": lambda args: format_time_lambda(exp.StrToDate, "bigquery")(
            [seq_get(args, 1), seq_get(args, 0)]
        ),
        "PARSE_TIMESTAMP": _parse_timestamp,
        "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
        "REGEXP_EXTRACT": lambda args: exp.RegexpExtract(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            # A single capturing group means the extraction targets group 1.
            # NOTE(review): re.compile is applied to the raw pattern literal here; a
            # BigQuery (RE2) pattern that isn't valid Python regex would raise — confirm.
            group=exp.Literal.number(1) if re.compile(args[1].name).groups == 1 else None,
        ),
        # SHA256/SHA512 are both represented canonically as SHA2 with a length arg.
        "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
        "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
        "SPLIT": lambda args: exp.Split(
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
            this=seq_get(args, 0),
            expression=seq_get(args, 1) or exp.Literal.string(","),
        ),
        "TIME_ADD": parse_date_delta_with_interval(exp.TimeAdd),
        "TIME_SUB": parse_date_delta_with_interval(exp.TimeSub),
        "TIMESTAMP_ADD": parse_date_delta_with_interval(exp.TimestampAdd),
        "TIMESTAMP_SUB": parse_date_delta_with_interval(exp.TimestampSub),
        "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
        ),
        "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
        ),
        "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(
            this=seq_get(args, 0), scale=exp.UnixToTime.SECONDS
        ),
        "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
    }

    FUNCTION_PARSERS = {
        **parser.Parser.FUNCTION_PARSERS,
        "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
    }
    # TRIM isn't parsed specially in BigQuery — it's handled as a regular function.
    FUNCTION_PARSERS.pop("TRIM")

    NO_PAREN_FUNCTIONS = {
        **parser.Parser.NO_PAREN_FUNCTIONS,
        TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
    }

    NESTED_TYPE_TOKENS = {
        *parser.Parser.NESTED_TYPE_TOKENS,
        TokenType.TABLE,
    }

    ID_VAR_TOKENS = {
        *parser.Parser.ID_VAR_TOKENS,
        TokenType.VALUES,
    }

    PROPERTY_PARSERS = {
        **parser.Parser.PROPERTY_PARSERS,
        "NOT DETERMINISTIC": lambda self: self.expression(
            exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
        ),
        "OPTIONS": lambda self: self._parse_with_property(),
    }

    CONSTRAINT_PARSERS = {
        **parser.Parser.CONSTRAINT_PARSERS,
        "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
    }

    RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
    RANGE_PARSERS.pop(TokenType.OVERLAPS, None)

    # UNKNOWN doubles as a NULL token in BigQuery.
    NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

    STATEMENT_PARSERS = {
        **parser.Parser.STATEMENT_PARSERS,
        TokenType.END: lambda self: self._parse_as_command(self._prev),
        TokenType.FOR: lambda self: self._parse_for_in(),
    }

    # Maps array subscript operators to (index offset, SAFE_ variant flag).
    BRACKET_OFFSETS = {
        "OFFSET": (0, False),
        "ORDINAL": (1, False),
        "SAFE_OFFSET": (0, True),
        "SAFE_ORDINAL": (1, True),
    }

    def _parse_for_in(self) -> exp.ForIn:
        """Parse a `FOR ... IN ... DO <statement>` loop."""
        this = self._parse_range()
        self._match_text_seq("DO")
        return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

    def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
        """Parse one component of a table name, allowing dashes and leading digits.

        BigQuery project ids may contain dashes (e.g. `my-project.dataset.table`)
        and table parts may start with numbers, so the base parse is extended by
        consuming DASH-joined and number-prefixed fragments into one identifier.
        """
        this = super()._parse_table_part(schema=schema) or self._parse_number()

        # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
        if isinstance(this, exp.Identifier):
            table_name = this.name
            # Glue together dash-separated fragments: foo-bar-baz -> "foo-bar-baz".
            while self._match(TokenType.DASH, advance=False) and self._next:
                self._advance(2)
                table_name += f"-{self._prev.text}"

            this = exp.Identifier(this=table_name, quoted=this.args.get("quoted"))
        elif isinstance(this, exp.Literal):
            table_name = this.name

            # A number immediately followed by text (e.g. 123abc) is one identifier.
            if self._is_connected() and self._parse_var(any_token=True):
                table_name += self._prev.text

            this = exp.Identifier(this=table_name, quoted=True)

        return this

    def _parse_table_parts(self, schema: bool = False) -> exp.Table:
        """Parse a (possibly quoted) multi-part table name into catalog/db/table."""
        table = super()._parse_table_parts(schema=schema)
        # A single quoted identifier like `project.dataset.table` must be split up.
        if isinstance(table.this, exp.Identifier) and "." in table.name:
            catalog, db, this, *rest = (
                t.cast(t.Optional[exp.Expression], exp.to_identifier(x))
                for x in split_num_words(table.name, ".", 3)
            )

            if rest and this:
                this = exp.Dot.build(t.cast(t.List[exp.Expression], [this, *rest]))

            table = exp.Table(this=this, db=db, catalog=catalog)

        return table

    def _parse_json_object(self) -> exp.JSONObject:
        """Parse JSON_OBJECT, canonicalizing the two-array calling convention."""
        json_object = super()._parse_json_object()
        array_kv_pair = seq_get(json_object.expressions, 0)

        # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
        if (
            array_kv_pair
            and isinstance(array_kv_pair.this, exp.Array)
            and isinstance(array_kv_pair.expression, exp.Array)
        ):
            keys = array_kv_pair.this.expressions
            values = array_kv_pair.expression.expressions

            json_object.set(
                "expressions",
                [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
            )

        return json_object

    def _parse_bracket(self, this: t.Optional[exp.Expression]) -> t.Optional[exp.Expression]:
        """Parse array subscripts, unwrapping OFFSET/ORDINAL/SAFE_* wrappers.

        `arr[OFFSET(i)]` and friends are folded into a plain bracket with
        `offset`/`safe` args taken from BRACKET_OFFSETS, so the wrapper function
        call disappears from the tree.
        """
        bracket = super()._parse_bracket(this)

        if this is bracket:
            return bracket

        if isinstance(bracket, exp.Bracket):
            for expression in bracket.expressions:
                name = expression.name.upper()

                if name not in self.BRACKET_OFFSETS:
                    break

                offset, safe = self.BRACKET_OFFSETS[name]
                bracket.set("offset", offset)
                bracket.set("safe", safe)
                # Replace OFFSET(i)/ORDINAL(i) with the bare index expression i.
                expression.replace(expression.expressions[0])

        return bracket
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: Determines the amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- INTERVAL_VARS
- COMMENT_TABLE_ALIAS_TOKENS
- UPDATE_ALIAS_TOKENS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- MODIFIABLES
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- ALTER_TABLE_ADD_REQUIRED_FOR_EACH_COLUMN
- TABLESAMPLE_CSV
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- MODIFIERS_ATTACHED_TO_UNION
- UNION_MODIFIERS
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- errors
- sql
class Generator(generator.Generator):
    """Generates BigQuery (GoogleSQL) SQL strings from a SQLGlot syntax tree."""

    EXPLICIT_UNION = True
    INTERVAL_ALLOWS_PLURAL_FORM = False
    JOIN_HINTS = False
    QUERY_HINTS = False
    TABLE_HINTS = False
    LIMIT_FETCH = "LIMIT"
    RENAME_TABLE_WITH_DB = False
    NVL2_SUPPORTED = False
    UNNEST_WITH_ORDINALITY = False
    COLLATE_IS_FUNC = True
    LIMIT_ONLY_LITERALS = True

    TRANSFORMS = {
        **generator.Generator.TRANSFORMS,
        exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
        exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
        exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
        exp.ArrayContains: _array_contains_sql,
        exp.ArraySize: rename_func("ARRAY_LENGTH"),
        exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
        exp.CollateProperty: lambda self, e: f"DEFAULT COLLATE {self.sql(e, 'this')}"
        if e.args.get("default")
        else f"COLLATE {self.sql(e, 'this')}",
        exp.Create: _create_sql,
        exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
        exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
        exp.DateDiff: lambda self, e: f"DATE_DIFF({self.sql(e, 'this')}, {self.sql(e, 'expression')}, {self.sql(e.args.get('unit', 'DAY'))})",
        exp.DateFromParts: rename_func("DATE"),
        exp.DateStrToDate: datestrtodate_sql,
        exp.DateSub: date_add_interval_sql("DATE", "SUB"),
        exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
        exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
        exp.DateTrunc: lambda self, e: self.func("DATE_TRUNC", e.this, e.text("unit")),
        exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
        exp.GroupConcat: rename_func("STRING_AGG"),
        exp.Hex: rename_func("TO_HEX"),
        exp.If: if_sql(false_value="NULL"),
        exp.ILike: no_ilike_sql,
        exp.IntDiv: rename_func("DIV"),
        exp.JSONFormat: rename_func("TO_JSON_STRING"),
        exp.JSONKeyValue: json_keyvalue_comma_sql,
        exp.Max: max_or_greatest,
        # MD5 returns BYTES in BigQuery; wrap in TO_HEX to match the canonical hex string.
        exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
        exp.MD5Digest: rename_func("MD5"),
        exp.Min: min_or_least,
        exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
        exp.RegexpExtract: lambda self, e: self.func(
            "REGEXP_EXTRACT",
            e.this,
            e.expression,
            e.args.get("position"),
            e.args.get("occurrence"),
        ),
        exp.RegexpReplace: regexp_replace_sql,
        exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
        exp.ReturnsProperty: _returnsproperty_sql,
        exp.Select: transforms.preprocess(
            [
                transforms.explode_to_unnest(),
                _unqualify_unnest,
                transforms.eliminate_distinct_on,
                _alias_ordered_group,
                transforms.eliminate_semi_and_anti_joins,
            ]
        ),
        # Fixed: constant strings below had spurious f-prefixes (no placeholders).
        exp.SHA2: lambda self, e: self.func(
            "SHA256" if e.text("length") == "256" else "SHA512", e.this
        ),
        exp.StabilityProperty: lambda self, e: "DETERMINISTIC"
        if e.name == "IMMUTABLE"
        else "NOT DETERMINISTIC",
        exp.StrToDate: lambda self, e: f"PARSE_DATE({self.format_time(e)}, {self.sql(e, 'this')})",
        exp.StrToTime: lambda self, e: self.func(
            "PARSE_TIMESTAMP", self.format_time(e), e.this, e.args.get("zone")
        ),
        exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
        exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
        exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
        exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
        exp.TimeStrToTime: timestrtotime_sql,
        exp.TimeToStr: lambda self, e: f"FORMAT_DATE({self.format_time(e)}, {self.sql(e, 'this')})",
        exp.Trim: lambda self, e: self.func("TRIM", e.this, e.expression),
        exp.TsOrDsAdd: _ts_or_ds_add_sql,
        exp.TsOrDsDiff: _ts_or_ds_diff_sql,
        exp.TsOrDsToDate: ts_or_ds_to_date_sql("bigquery"),
        exp.Unhex: rename_func("FROM_HEX"),
        exp.UnixToTime: _unix_to_time_sql,
        exp.Values: _derived_table_values_to_unnest,
        exp.VariancePop: rename_func("VAR_POP"),
    }

    TYPE_MAPPING = {
        **generator.Generator.TYPE_MAPPING,
        exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
        exp.DataType.Type.BIGINT: "INT64",
        exp.DataType.Type.BINARY: "BYTES",
        exp.DataType.Type.BOOLEAN: "BOOL",
        exp.DataType.Type.CHAR: "STRING",
        exp.DataType.Type.DECIMAL: "NUMERIC",
        exp.DataType.Type.DOUBLE: "FLOAT64",
        exp.DataType.Type.FLOAT: "FLOAT64",
        exp.DataType.Type.INT: "INT64",
        exp.DataType.Type.NCHAR: "STRING",
        exp.DataType.Type.NVARCHAR: "STRING",
        exp.DataType.Type.SMALLINT: "INT64",
        exp.DataType.Type.TEXT: "STRING",
        exp.DataType.Type.TIMESTAMP: "DATETIME",
        exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
        exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
        exp.DataType.Type.TINYINT: "INT64",
        exp.DataType.Type.VARBINARY: "BYTES",
        exp.DataType.Type.VARCHAR: "STRING",
        exp.DataType.Type.VARIANT: "ANY TYPE",
    }

    PROPERTIES_LOCATION = {
        **generator.Generator.PROPERTIES_LOCATION,
        exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
        exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
    }

    # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
    RESERVED_KEYWORDS = {
        *generator.Generator.RESERVED_KEYWORDS,
        "all",
        "and",
        "any",
        "array",
        "as",
        "asc",
        "assert_rows_modified",
        "at",
        "between",
        "by",
        "case",
        "cast",
        "collate",
        "contains",
        "create",
        "cross",
        "cube",
        "current",
        "default",
        "define",
        "desc",
        "distinct",
        "else",
        "end",
        "enum",
        "escape",
        "except",
        "exclude",
        "exists",
        "extract",
        "false",
        "fetch",
        "following",
        "for",
        "from",
        "full",
        "group",
        "grouping",
        "groups",
        "hash",
        "having",
        "if",
        "ignore",
        "in",
        "inner",
        "intersect",
        "interval",
        "into",
        "is",
        "join",
        "lateral",
        "left",
        "like",
        "limit",
        "lookup",
        "merge",
        "natural",
        "new",
        "no",
        "not",
        "null",
        "nulls",
        "of",
        "on",
        "or",
        "order",
        "outer",
        "over",
        "partition",
        "preceding",
        "proto",
        "qualify",
        "range",
        "recursive",
        "respect",
        "right",
        "rollup",
        "rows",
        "select",
        "set",
        "some",
        "struct",
        "tablesample",
        "then",
        "to",
        "treat",
        "true",
        "unbounded",
        "union",
        "unnest",
        "using",
        "when",
        "where",
        "window",
        "with",
        "within",
    }

    def eq_sql(self, expression: exp.EQ) -> str:
        """Generate `=`, collapsing comparisons against NULL to a bare NULL."""
        # Operands of = cannot be NULL in BigQuery
        if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
            return "NULL"

        return self.binary(expression, "=")

    def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
        """Generate AT TIME ZONE via TIMESTAMP(DATETIME(...)), except inside CAST(.. AS STRING)."""
        parent = expression.parent

        # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
        # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
        if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
            return self.func(
                "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
            )

        return super().attimezone_sql(expression)

    def trycast_sql(self, expression: exp.TryCast) -> str:
        """TRY_CAST is spelled SAFE_CAST in BigQuery."""
        return self.cast_sql(expression, safe_prefix="SAFE_")

    def cte_sql(self, expression: exp.CTE) -> str:
        """Generate a CTE, warning that BigQuery can't alias CTE columns."""
        if expression.alias_column_names:
            self.unsupported("Column names in CTE definition are not supported.")
        return super().cte_sql(expression)

    def array_sql(self, expression: exp.Array) -> str:
        """Generate an array literal; ARRAY(<subquery>) when built from a query."""
        first_arg = seq_get(expression.expressions, 0)
        if isinstance(first_arg, exp.Subqueryable):
            return f"ARRAY{self.wrap(self.sql(first_arg))}"

        return inline_array_sql(self, expression)

    def bracket_sql(self, expression: exp.Bracket) -> str:
        """Generate an array subscript, wrapping the index in OFFSET/ORDINAL/SAFE_*."""
        expressions = expression.expressions
        expressions_sql = ", ".join(self.sql(e) for e in expressions)
        offset = expression.args.get("offset")

        if offset == 0:
            expressions_sql = f"OFFSET({expressions_sql})"
        elif offset == 1:
            expressions_sql = f"ORDINAL({expressions_sql})"
        else:
            self.unsupported(f"Unsupported array offset: {offset}")

        if expression.args.get("safe"):
            expressions_sql = f"SAFE_{expressions_sql}"

        return f"{self.sql(expression, 'this')}[{expressions_sql}]"

    def transaction_sql(self, *_) -> str:
        return "BEGIN TRANSACTION"

    def commit_sql(self, *_) -> str:
        return "COMMIT TRANSACTION"

    def rollback_sql(self, *_) -> str:
        return "ROLLBACK TRANSACTION"

    def in_unnest_op(self, expression: exp.Unnest) -> str:
        # No IN UNNEST(...) wrapping beyond the UNNEST itself.
        return self.sql(expression)

    def except_op(self, expression: exp.Except) -> str:
        """BigQuery only supports EXCEPT DISTINCT; warn on EXCEPT ALL."""
        if not expression.args.get("distinct", False):
            self.unsupported("EXCEPT without DISTINCT is not supported in BigQuery")
        return f"EXCEPT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

    def intersect_op(self, expression: exp.Intersect) -> str:
        """BigQuery only supports INTERSECT DISTINCT; warn on INTERSECT ALL."""
        if not expression.args.get("distinct", False):
            self.unsupported("INTERSECT without DISTINCT is not supported in BigQuery")
        return f"INTERSECT{' DISTINCT' if expression.args.get('distinct') else ' ALL'}"

    def with_properties(self, properties: exp.Properties) -> str:
        """Properties are rendered as an OPTIONS(...) clause in BigQuery."""
        return self.properties(properties, prefix=self.seg("OPTIONS"))

    def version_sql(self, expression: exp.Version) -> str:
        """Render time-travel clauses using FOR SYSTEM_TIME AS OF."""
        if expression.name == "TIMESTAMP":
            expression.set("this", "SYSTEM_TIME")
        return super().version_sql(expression)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether or not to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether or not to normalize identifiers to lowercase. Default: False.
- pad: Determines the pad size in a formatted string. Default: 2.
- indent: Determines the indentation size in a formatted string. Default: 2.
- normalize_functions: Whether or not to normalize all function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3.
- leading_comma: Whether the comma in select expressions is placed at the start of each line (leading) rather than at the end (trailing). This is only relevant when generating in pretty mode. Default: False.
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether or not to preserve comments in the output SQL code. Default: True
734 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 735 parent = expression.parent 736 737 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 738 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 739 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 740 return self.func( 741 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 742 ) 743 744 return super().attimezone_sql(expression)
761 def bracket_sql(self, expression: exp.Bracket) -> str: 762 expressions = expression.expressions 763 expressions_sql = ", ".join(self.sql(e) for e in expressions) 764 offset = expression.args.get("offset") 765 766 if offset == 0: 767 expressions_sql = f"OFFSET({expressions_sql})" 768 elif offset == 1: 769 expressions_sql = f"ORDINAL({expressions_sql})" 770 else: 771 self.unsupported(f"Unsupported array offset: {offset}") 772 773 if expression.args.get("safe"): 774 expressions_sql = f"SAFE_{expressions_sql}" 775 776 return f"{self.sql(expression, 'this')}[{expressions_sql}]"
Inherited Members
- sqlglot.generator.Generator
- Generator
- NULL_ORDERING_SUPPORTED
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SIZE_IS_PERCENT
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- COLUMN_JOIN_MARKS_SUPPORTED
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- STAR_MAPPING
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- KEY_VALUE_DEFINITONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- pad_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- transformcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- clone_sql
- describe_sql
- prepend_ctes
- with_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- except_sql
- fetch_sql
- filter_sql
- hint_sql
- index_sql
- identifier_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- intersect_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognize_sql
- query_modifiers
- offset_limit_modifiers
- after_having_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- union_sql
- union_op
- unnest_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- formatjson_sql
- jsonobject_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- aliases_sql
- add_sql
- and_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- cast_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- altercolumn_sql
- renametable_sql
- altertable_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- intdiv_sql
- dpipe_sql
- div_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mod_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- or_sql
- slice_sql
- sub_sql
- log_sql
- use_sql
- binary
- function_fallback_sql
- func
- format_args
- text_width
- format_time
- expressions
- op_expressions
- naked_property
- set_operation
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- merge_sql
- tochar_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- operator_sql