
Merging upstream version 26.15.0.

Signed-off-by: Daniel Baumann <daniel@debian.org>
Daniel Baumann 2025-04-21 09:50:04 +02:00
parent 2a79d9df75
commit de6539b796
58 changed files with 4878 additions and 4677 deletions


@@ -202,6 +202,67 @@ class TestAthena(Validator):
identify=True,
)
def test_create_table(self):
# There are two CREATE TABLE syntaxes.
# Both are handled by Athena's Hive engine, but creating an Iceberg table differs from creating a normal Hive table.
table_schema = exp.Schema(
this=exp.to_table("foo.bar"),
expressions=[
exp.ColumnDef(this=exp.to_identifier("a"), kind=exp.DataType.build("int")),
exp.ColumnDef(this=exp.to_identifier("b"), kind=exp.DataType.build("varchar")),
],
)
# Hive tables - CREATE EXTERNAL TABLE
ct_hive = exp.Create(
this=table_schema,
kind="TABLE",
properties=exp.Properties(
expressions=[
exp.ExternalProperty(),
exp.FileFormatProperty(this=exp.Literal.string("parquet")),
exp.LocationProperty(this=exp.Literal.string("s3://foo")),
exp.PartitionedByProperty(
this=exp.Schema(expressions=[exp.to_column("partition_col")])
),
]
),
)
self.assertEqual(
ct_hive.sql(dialect=self.dialect, identify=True),
"CREATE EXTERNAL TABLE `foo`.`bar` (`a` INT, `b` STRING) STORED AS PARQUET LOCATION 's3://foo' PARTITIONED BY (`partition_col`)",
)
# Iceberg tables - CREATE TABLE... TBLPROPERTIES ('table_type'='iceberg')
# no EXTERNAL keyword, and the 'table_type'='iceberg' property must be set
# ref: https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg-creating-tables.html#querying-iceberg-partitioning
ct_iceberg = exp.Create(
this=table_schema,
kind="TABLE",
properties=exp.Properties(
expressions=[
exp.FileFormatProperty(this=exp.Literal.string("parquet")),
exp.LocationProperty(this=exp.Literal.string("s3://foo")),
exp.PartitionedByProperty(
this=exp.Schema(
expressions=[
exp.to_column("partition_col"),
exp.PartitionedByBucket(
this=exp.to_column("a"), expression=exp.Literal.number(4)
),
]
)
),
exp.Property(this=exp.var("table_type"), value=exp.Literal.string("iceberg")),
]
),
)
self.assertEqual(
ct_iceberg.sql(dialect=self.dialect, identify=True),
"CREATE TABLE `foo`.`bar` (`a` INT, `b` STRING) STORED AS PARQUET LOCATION 's3://foo' PARTITIONED BY (`partition_col`, BUCKET(4, `a`)) TBLPROPERTIES ('table_type'='iceberg')",
)
def test_ctas(self):
# Hive tables use 'external_location' to specify the table location; Iceberg tables use 'location'
# In addition, Hive tables use 'partitioned_by' to specify the partition fields; Iceberg tables use 'partitioning'
@@ -223,7 +284,11 @@ class TestAthena(Validator):
)
self.assertEqual(
ctas_hive.sql(dialect=self.dialect, identify=True),
"CREATE TABLE \"foo\".\"bar\" WITH (format='parquet', external_location='s3://foo', partitioned_by=ARRAY['partition_col']) AS SELECT 1",
"CREATE TABLE \"foo\".\"bar\" WITH (format='parquet', external_location='s3://foo', partitioned_by=ARRAY['\"partition_col\"']) AS SELECT 1",
)
self.assertEqual(
ctas_hive.sql(dialect=self.dialect, identify=False),
"CREATE TABLE foo.bar WITH (format='parquet', external_location='s3://foo', partitioned_by=ARRAY['partition_col']) AS SELECT 1",
)
ctas_iceberg = exp.Create(
@@ -234,7 +299,14 @@ class TestAthena(Validator):
exp.Property(this=exp.var("table_type"), value=exp.Literal.string("iceberg")),
exp.LocationProperty(this=exp.Literal.string("s3://foo")),
exp.PartitionedByProperty(
- this=exp.Schema(expressions=[exp.to_column("partition_col")])
+ this=exp.Schema(
+     expressions=[
+         exp.to_column("partition_col"),
+         exp.PartitionedByBucket(
+             this=exp.to_column("a"), expression=exp.Literal.number(4)
+         ),
+     ]
+ )
),
]
),
@@ -242,5 +314,9 @@ class TestAthena(Validator):
)
self.assertEqual(
ctas_iceberg.sql(dialect=self.dialect, identify=True),
"CREATE TABLE \"foo\".\"bar\" WITH (table_type='iceberg', location='s3://foo', partitioning=ARRAY['partition_col']) AS SELECT 1",
"CREATE TABLE \"foo\".\"bar\" WITH (table_type='iceberg', location='s3://foo', partitioning=ARRAY['\"partition_col\"', 'BUCKET(\"a\", 4)']) AS SELECT 1",
)
self.assertEqual(
ctas_iceberg.sql(dialect=self.dialect, identify=False),
"CREATE TABLE foo.bar WITH (table_type='iceberg', location='s3://foo', partitioning=ARRAY['partition_col', 'BUCKET(a, 4)']) AS SELECT 1",
)


@@ -308,10 +308,6 @@ LANGUAGE js AS
"""SELECT JSON '"foo"' AS json_data""",
"""SELECT PARSE_JSON('"foo"') AS json_data""",
)
- self.validate_identity(
-     "SELECT * FROM UNNEST(x) WITH OFFSET EXCEPT DISTINCT SELECT * FROM UNNEST(y) WITH OFFSET",
-     "SELECT * FROM UNNEST(x) WITH OFFSET AS offset EXCEPT DISTINCT SELECT * FROM UNNEST(y) WITH OFFSET AS offset",
- )
self.validate_identity(
"SELECT * FROM (SELECT a, b, c FROM test) PIVOT(SUM(b) d, COUNT(*) e FOR c IN ('x', 'y'))",
"SELECT * FROM (SELECT a, b, c FROM test) PIVOT(SUM(b) AS d, COUNT(*) AS e FOR c IN ('x', 'y'))",
@@ -1519,8 +1515,8 @@ WHERE
self.validate_all(
"SELECT GENERATE_DATE_ARRAY('2016-10-05', '2016-10-08')",
write={
"duckdb": "SELECT CAST(GENERATE_SERIES(CAST('2016-10-05' AS DATE), CAST('2016-10-08' AS DATE), INTERVAL 1 DAY) AS DATE[])",
"bigquery": "SELECT GENERATE_DATE_ARRAY('2016-10-05', '2016-10-08', INTERVAL 1 DAY)",
"duckdb": "SELECT CAST(GENERATE_SERIES(CAST('2016-10-05' AS DATE), CAST('2016-10-08' AS DATE), INTERVAL '1' DAY) AS DATE[])",
"bigquery": "SELECT GENERATE_DATE_ARRAY('2016-10-05', '2016-10-08', INTERVAL '1' DAY)",
},
)
self.validate_all(
@@ -2424,3 +2420,16 @@ OPTIONS (
"SELECT 1 AS x UNION ALL STRICT CORRESPONDING BY (foo, bar) SELECT 2 AS x",
"SELECT 1 AS x UNION ALL BY NAME ON (foo, bar) SELECT 2 AS x",
)
def test_with_offset(self):
self.validate_identity(
"SELECT * FROM UNNEST(x) WITH OFFSET EXCEPT DISTINCT SELECT * FROM UNNEST(y) WITH OFFSET",
"SELECT * FROM UNNEST(x) WITH OFFSET AS offset EXCEPT DISTINCT SELECT * FROM UNNEST(y) WITH OFFSET AS offset",
)
for join_ops in ("LEFT", "RIGHT", "FULL", "NATURAL", "SEMI", "ANTI"):
with self.subTest(f"Testing {join_ops} in test_with_offset"):
self.validate_identity(
f"SELECT * FROM t1, UNNEST([1, 2]) AS hit WITH OFFSET {join_ops} JOIN foo",
f"SELECT * FROM t1, UNNEST([1, 2]) AS hit WITH OFFSET AS offset {join_ops} JOIN foo",
)
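
The same aliasing behavior, sketched as a plain transpile call:

    import sqlglot

    # BigQuery's implicit WITH OFFSET column gains an explicit alias on output
    sql = "SELECT * FROM t1, UNNEST([1, 2]) AS hit WITH OFFSET LEFT JOIN foo"
    print(sqlglot.transpile(sql, read="bigquery", write="bigquery")[0])
    # per the test above: SELECT * FROM t1, UNNEST([1, 2]) AS hit WITH OFFSET AS offset LEFT JOIN foo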


@@ -1034,7 +1034,7 @@ class TestDuckDB(Validator):
"clickhouse": "fromUnixTimestamp64Milli(CAST(x AS Nullable(Int64)))",
"duckdb": "EPOCH_MS(x)",
"mysql": "FROM_UNIXTIME(x / POWER(10, 3))",
"postgres": "TO_TIMESTAMP(CAST(x AS DOUBLE PRECISION) / 10 ^ 3)",
"postgres": "TO_TIMESTAMP(CAST(x AS DOUBLE PRECISION) / POWER(10, 3))",
"presto": "FROM_UNIXTIME(CAST(x AS DOUBLE) / POW(10, 3))",
"spark": "TIMESTAMP_MILLIS(x)",
},
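
A sketch of the same conversion outside the test harness (the postgres string is the one this diff updates):

    import sqlglot

    # DuckDB's EPOCH_MS(x) rendered for a few other dialects
    for dialect in ("postgres", "presto", "spark"):
        print(sqlglot.transpile("EPOCH_MS(x)", read="duckdb", write=dialect)[0])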


@@ -568,7 +568,7 @@ FROM json_data, field_ids""",
"x ^ y",
write={
"": "POWER(x, y)",
"postgres": "x ^ y",
"postgres": "POWER(x, y)",
},
)
self.validate_all(
@@ -765,7 +765,7 @@ FROM json_data, field_ids""",
"x / y ^ z",
write={
"": "x / POWER(y, z)",
"postgres": "x / y ^ z",
"postgres": "x / POWER(y, z)",
},
)
self.validate_all(
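
Sketched directly; the input is read with the postgres dialect so that '^' parses as exponentiation:

    import sqlglot

    # '^' (PostgreSQL exponentiation) is now rendered as POWER(...)
    print(sqlglot.transpile("x / y ^ z", read="postgres", write="postgres")[0])
    # per the test above: x / POWER(y, z)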


@@ -609,7 +609,7 @@ class TestSnowflake(Validator):
"hive": "POWER(x, 2)",
"mysql": "POWER(x, 2)",
"oracle": "POWER(x, 2)",
"postgres": "x ^ 2",
"postgres": "POWER(x, 2)",
"presto": "POWER(x, 2)",
"redshift": "POWER(x, 2)",
"snowflake": "POWER(x, 2)",
@@ -2563,3 +2563,12 @@ SINGLE = TRUE""",
"duckdb": f"SELECT LISTAGG({distinct}col, '|SEPARATOR|' ORDER BY col2) FROM t",
},
)
def test_rely_options(self):
for option in ("NORELY", "RELY"):
self.validate_identity(
f"CREATE TABLE t (col1 INT PRIMARY KEY {option}, col2 INT UNIQUE {option}, col3 INT NOT NULL FOREIGN KEY REFERENCES other_t (id) {option})"
)
self.validate_identity(
f"CREATE TABLE t (col1 INT, col2 INT, col3 INT, PRIMARY KEY (col1) {option}, UNIQUE (col1, col2) {option}, FOREIGN KEY (col3) REFERENCES other_t (id) {option})"
)
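
A sketch of the RELY/NORELY round trip these identities pin down:

    import sqlglot

    # Constraint-level RELY/NORELY should survive a snowflake round trip
    sql = "CREATE TABLE t (col1 INT PRIMARY KEY RELY, col2 INT UNIQUE NORELY)"
    print(sqlglot.transpile(sql, read="snowflake", write="snowflake")[0])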


@@ -56,7 +56,7 @@ class TestSpark(Validator):
"CREATE TABLE x USING ICEBERG PARTITIONED BY (MONTHS(y)) LOCATION 's3://z'",
write={
"duckdb": "CREATE TABLE x",
"presto": "CREATE TABLE x WITH (FORMAT='ICEBERG', PARTITIONED_BY=ARRAY['MONTHS'])",
"presto": "CREATE TABLE x WITH (FORMAT='ICEBERG', PARTITIONED_BY=ARRAY['MONTHS(y)'])",
"hive": "CREATE TABLE x STORED AS ICEBERG PARTITIONED BY (MONTHS(y)) LOCATION 's3://z'",
"spark": "CREATE TABLE x USING ICEBERG PARTITIONED BY (MONTHS(y)) LOCATION 's3://z'",
},
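
The fix above keeps the transform's argument when targeting Presto; a sketch:

    import sqlglot

    sql = "CREATE TABLE x USING ICEBERG PARTITIONED BY (MONTHS(y)) LOCATION 's3://z'"
    print(sqlglot.transpile(sql, read="spark", write="presto")[0])
    # per the updated test: CREATE TABLE x WITH (FORMAT='ICEBERG', PARTITIONED_BY=ARRAY['MONTHS(y)'])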


@@ -93,6 +93,16 @@ class TestTrino(Validator):
"CREATE TABLE foo.bar WITH (LOCATION='s3://bucket/foo/bar') AS SELECT 1"
)
# Hive connector syntax (partitioned_by)
self.validate_identity(
"CREATE TABLE foo (a VARCHAR, b INTEGER, c DATE) WITH (PARTITIONED_BY=ARRAY['a', 'b'])"
)
# Iceberg connector syntax (partitioning, can contain Iceberg transform expressions)
self.validate_identity(
"CREATE TABLE foo (a VARCHAR, b INTEGER, c DATE) WITH (PARTITIONING=ARRAY['a', 'bucket(4, b)', 'month(c)'])",
)
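
Both connector spellings, sketched as round trips through the trino dialect:

    import sqlglot

    for sql in (
        # Hive connector: plain column names under 'partitioned_by'
        "CREATE TABLE foo (a VARCHAR, b INTEGER, c DATE) WITH (PARTITIONED_BY=ARRAY['a', 'b'])",
        # Iceberg connector: 'partitioning' may carry transform expressions
        "CREATE TABLE foo (a VARCHAR, b INTEGER, c DATE) WITH (PARTITIONING=ARRAY['a', 'bucket(4, b)', 'month(c)'])",
    ):
        print(sqlglot.transpile(sql, read="trino", write="trino")[0])
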
def test_analyze(self):
self.validate_identity("ANALYZE tbl")
self.validate_identity("ANALYZE tbl WITH (prop1=val1, prop2=val2)")


@@ -124,6 +124,12 @@ SELECT CAST(CAST(`t`.`some_col` AS DATE) AS DATETIME) < CAST(CAST(`t`.`other_col
--------------------------------------
-- Remove redundant casts
--------------------------------------
CAST(CAST("foo" AS DECIMAL(4, 2)) AS DECIMAL(8, 4)) AS "x";
CAST(CAST("foo" AS DECIMAL(4, 2)) AS DECIMAL(8, 4)) AS "x";
CAST(CAST("foo" AS DECIMAL(4, 2)) AS DECIMAL(4, 2)) AS "x";
CAST("foo" AS DECIMAL(4, 2)) AS "x";
CAST(CAST('2023-01-01' AS DATE) AS DATE);
CAST('2023-01-01' AS DATE);
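
A sketch of the rule through the full optimizer (exact aliasing and quoting of the output may differ; the point is that only value-preserving outer casts are dropped):

    import sqlglot
    from sqlglot.optimizer import optimize

    # DATE -> DATE is redundant and removed; DECIMAL(4, 2) -> DECIMAL(8, 4) is not,
    # because the outer type differs from the inner cast's type
    print(optimize(sqlglot.parse_one("SELECT CAST(CAST('2023-01-01' AS DATE) AS DATE) AS x")).sql())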


@@ -67,3 +67,14 @@ SELECT x.a > _u_0.b FROM x CROSS JOIN (SELECT SUM(y.a) AS b FROM y) AS _u_0;
SELECT (SELECT MAX(t2.c1) AS c1 FROM t2 WHERE t2.c2 = t1.c2 AND t2.c3 <= TRUNC(t1.c3)) AS c FROM t1;
SELECT _u_0.c1 AS c FROM t1 LEFT JOIN (SELECT MAX(t2.c1) AS c1, t2.c2 AS _u_1, MAX(t2.c3) AS _u_2 FROM t2 WHERE TRUE AND TRUE GROUP BY t2.c2) AS _u_0 ON _u_0._u_1 = t1.c2 WHERE _u_0._u_2 <= TRUNC(t1.c3);
SELECT s.t AS t FROM s WHERE 1 IN (SELECT t.a AS a FROM t WHERE t.b > 1);
SELECT s.t AS t FROM s LEFT JOIN (SELECT t.a AS a FROM t WHERE t.b > 1 GROUP BY t.a) AS _u_0 ON 1 = _u_0.a WHERE NOT _u_0.a IS NULL;
# title: can't create GROUP BY clause with an aggregate
SELECT s.t FROM s WHERE 1 IN (SELECT MAX(t.a) AS t1 FROM t);
SELECT s.t FROM s LEFT JOIN (SELECT MAX(t.a) AS t1 FROM t) AS _u_0 ON 1 = _u_0.t1 WHERE NOT _u_0.t1 IS NULL;
# title: can't create GROUP BY clause with an aggregate (nested)
SELECT s.t FROM s WHERE 1 IN (SELECT MAX(t.a) + 1 AS t1 FROM t);
SELECT s.t FROM s LEFT JOIN (SELECT MAX(t.a) + 1 AS t1 FROM t) AS _u_0 ON 1 = _u_0.t1 WHERE NOT _u_0.t1 IS NULL
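
Sketched against the pass these fixtures exercise:

    import sqlglot
    from sqlglot.optimizer.unnest_subqueries import unnest_subqueries

    # An aggregate projection cannot be repeated in a GROUP BY, so the rewrite
    # joins on the aggregate's alias directly without adding a GROUP BY clause
    expr = sqlglot.parse_one("SELECT s.t FROM s WHERE 1 IN (SELECT MAX(t.a) AS t1 FROM t)")
    print(unnest_subqueries(expr).sql())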


@@ -533,6 +533,11 @@ class TestOptimizer(unittest.TestCase):
def test_simplify(self):
self.check_file("simplify", simplify)
# Ensure simplify mutates the AST properly
expression = parse_one("SELECT 1 + 2")
simplify(expression.selects[0])
self.assertEqual(expression.sql(), "SELECT 3")
expression = parse_one("SELECT a, c, b FROM table1 WHERE 1 = 1")
self.assertEqual(simplify(simplify(expression.find(exp.Where))).sql(), "WHERE TRUE")