from tests.dialects.test_dialect import Validator


class TestSpark(Validator):
    dialect = "spark"

    def test_ddl(self):
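        # validate_all parses the input SQL in this class's dialect ("spark")
        # and asserts that generating it for each dialect in `write` yields the
        # expected string. A rough equivalent (a sketch, using sqlglot's public
        # API) would be:
        #   sqlglot.transpile(sql, read="spark", write="duckdb")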
        self.validate_all(
            "CREATE TABLE db.example_table (col_a struct<struct_col_a:int, struct_col_b:string>)",
            write={
                "duckdb": "CREATE TABLE db.example_table (col_a STRUCT(struct_col_a INT, struct_col_b TEXT))",
                "presto": "CREATE TABLE db.example_table (col_a ROW(struct_col_a INTEGER, struct_col_b VARCHAR))",
                "hive": "CREATE TABLE db.example_table (col_a STRUCT<struct_col_a INT, struct_col_b STRING>)",
                "spark": "CREATE TABLE db.example_table (col_a STRUCT<struct_col_a: INT, struct_col_b: STRING>)",
            },
        )
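        # Nested structs keep their nesting; each dialect uses its own struct
        # syntax (BigQuery STRUCT<...>, DuckDB STRUCT(...), Presto ROW(...)).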
        self.validate_all(
            "CREATE TABLE db.example_table (col_a struct<struct_col_a:int, struct_col_b:struct<nested_col_a:string, nested_col_b:string>>)",
            write={
                "bigquery": "CREATE TABLE db.example_table (col_a STRUCT<struct_col_a INT64, struct_col_b STRUCT<nested_col_a STRING, nested_col_b STRING>>)",
                "duckdb": "CREATE TABLE db.example_table (col_a STRUCT(struct_col_a INT, struct_col_b STRUCT(nested_col_a TEXT, nested_col_b TEXT)))",
                "presto": "CREATE TABLE db.example_table (col_a ROW(struct_col_a INTEGER, struct_col_b ROW(nested_col_a VARCHAR, nested_col_b VARCHAR)))",
                "hive": "CREATE TABLE db.example_table (col_a STRUCT<struct_col_a INT, struct_col_b STRUCT<nested_col_a STRING, nested_col_b STRING>>)",
                "spark": "CREATE TABLE db.example_table (col_a STRUCT<struct_col_a: INT, struct_col_b: STRUCT<nested_col_a: STRING, nested_col_b: STRING>>)",
            },
        )
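        # Snowflake's ARRAY type is untyped, so the element type is dropped.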
        self.validate_all(
            "CREATE TABLE db.example_table (col_a array<int>, col_b array<array<int>>)",
            write={
                "bigquery": "CREATE TABLE db.example_table (col_a ARRAY<INT64>, col_b ARRAY<ARRAY<INT64>>)",
                "duckdb": "CREATE TABLE db.example_table (col_a INT[], col_b INT[][])",
                "presto": "CREATE TABLE db.example_table (col_a ARRAY(INTEGER), col_b ARRAY(ARRAY(INTEGER)))",
                "hive": "CREATE TABLE db.example_table (col_a ARRAY<INT>, col_b ARRAY<ARRAY<INT>>)",
                "spark": "CREATE TABLE db.example_table (col_a ARRAY<INT>, col_b ARRAY<ARRAY<INT>>)",
                "snowflake": "CREATE TABLE db.example_table (col_a ARRAY, col_b ARRAY)",
            },
        )
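        # DuckDB has no equivalent for these Iceberg table properties, so they
        # are dropped; Presto expresses them as WITH (...) table properties.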
        self.validate_all(
            "CREATE TABLE x USING ICEBERG PARTITIONED BY (MONTHS(y)) LOCATION 's3://z'",
            write={
                "duckdb": "CREATE TABLE x",
                "presto": "CREATE TABLE x WITH (TABLE_FORMAT='ICEBERG', PARTITIONED_BY=ARRAY['MONTHS'])",
                "hive": "CREATE TABLE x USING ICEBERG PARTITIONED BY (MONTHS(y)) LOCATION 's3://z'",
                "spark": "CREATE TABLE x USING ICEBERG PARTITIONED BY (MONTHS(y)) LOCATION 's3://z'",
            },
        )
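        # Spark prefers USING PARQUET where Hive keeps STORED AS PARQUET;
        # Presto uses a WITH (FORMAT=...) property instead.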
        self.validate_all(
            "CREATE TABLE test STORED AS PARQUET AS SELECT 1",
            write={
                "duckdb": "CREATE TABLE test AS SELECT 1",
                "presto": "CREATE TABLE test WITH (FORMAT='PARQUET') AS SELECT 1",
                "hive": "CREATE TABLE test STORED AS PARQUET AS SELECT 1",
                "spark": "CREATE TABLE test USING PARQUET AS SELECT 1",
            },
        )
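        # pretty=True asserts against the formatted (multi-line) output.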
        self.validate_all(
            """CREATE TABLE blah (col_a INT) COMMENT "Test comment: blah" PARTITIONED BY (date STRING) STORED AS ICEBERG TBLPROPERTIES('x' = '1')""",
            write={
                "duckdb": """CREATE TABLE blah (
  col_a INT
)""",  # Partition columns should exist in table
                "presto": """CREATE TABLE blah (
  col_a INTEGER,
  date VARCHAR
)
COMMENT='Test comment: blah'
WITH (
  PARTITIONED_BY=ARRAY['date'],
  FORMAT='ICEBERG',
  x='1'
)""",
                "hive": """CREATE TABLE blah (
  col_a INT
)
COMMENT 'Test comment: blah'
PARTITIONED BY (
  date STRING
)
STORED AS ICEBERG
TBLPROPERTIES (
  'x'='1'
)""",
                "spark": """CREATE TABLE blah (
  col_a INT
)
COMMENT 'Test comment: blah'
PARTITIONED BY (
  date STRING
)
USING ICEBERG
TBLPROPERTIES (
  'x'='1'
)""",
            },
            pretty=True,
        )
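        # CACHE TABLE is Spark-specific; the OPTIONS pair gains an `=` between
        # key and value, and AS is inserted before the SELECT.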
        self.validate_all(
            "CACHE TABLE testCache OPTIONS ('storageLevel' 'DISK_ONLY') SELECT * FROM testData",
            write={
                "spark": "CACHE TABLE testCache OPTIONS('storageLevel' = 'DISK_ONLY') AS SELECT * FROM testData",
            },
        )
        self.validate_all(
            "ALTER TABLE StudentInfo ADD COLUMNS (LastName STRING, DOB TIMESTAMP)",
            write={
                "spark": "ALTER TABLE StudentInfo ADD COLUMNS (LastName STRING, DOB TIMESTAMP)",
            },
        )
        self.validate_all(
            "ALTER TABLE StudentInfo DROP COLUMNS (LastName, DOB)",
            write={
                "spark": "ALTER TABLE StudentInfo DROP COLUMNS (LastName, DOB)",
            },
        )

    def test_to_date(self):
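        # 'yyyy-MM-dd' is the default TO_DATE format, so it can be omitted or
        # turned into a plain date cast; other formats must be preserved.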
        self.validate_all(
            "TO_DATE(x, 'yyyy-MM-dd')",
            write={
                "duckdb": "CAST(x AS DATE)",
                "hive": "TO_DATE(x)",
                "presto": "CAST(SUBSTR(CAST(x AS VARCHAR), 1, 10) AS DATE)",
                "spark": "TO_DATE(x)",
            },
        )
        self.validate_all(
            "TO_DATE(x, 'yyyy')",
            write={
                "duckdb": "CAST(STRPTIME(x, '%Y') AS DATE)",
                "hive": "TO_DATE(x, 'yyyy')",
                "presto": "CAST(DATE_PARSE(x, '%Y') AS DATE)",
                "spark": "TO_DATE(x, 'yyyy')",
            },
        )

    def test_hint(self):
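        # Spark keeps /*+ ... */ optimizer hints verbatim; dialects without a
        # hint syntax (here BigQuery) drop them.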
        self.validate_all(
            "SELECT /*+ COALESCE(3) */ * FROM x",
            write={
                "spark": "SELECT /*+ COALESCE(3) */ * FROM x",
                "bigquery": "SELECT * FROM x",
            },
        )
        self.validate_all(
            "SELECT /*+ COALESCE(3), REPARTITION(1) */ * FROM x",
            write={
                "spark": "SELECT /*+ COALESCE(3), REPARTITION(1) */ * FROM x",
                "bigquery": "SELECT * FROM x",
            },
        )
        self.validate_all(
            "SELECT /*+ BROADCAST(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ BROADCAST(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )
        self.validate_all(
            "SELECT /*+ BROADCASTJOIN(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ BROADCASTJOIN(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )
        self.validate_all(
            "SELECT /*+ MAPJOIN(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ MAPJOIN(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )
        self.validate_all(
            "SELECT /*+ MERGE(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ MERGE(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )
        self.validate_all(
            "SELECT /*+ SHUFFLEMERGE(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ SHUFFLEMERGE(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )
        self.validate_all(
            "SELECT /*+ MERGEJOIN(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ MERGEJOIN(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )
        self.validate_all(
            "SELECT /*+ SHUFFLE_HASH(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ SHUFFLE_HASH(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )
        self.validate_all(
            "SELECT /*+ SHUFFLE_REPLICATE_NL(table) */ cola FROM table",
            write={
                "spark": "SELECT /*+ SHUFFLE_REPLICATE_NL(table) */ cola FROM table",
                "bigquery": "SELECT cola FROM table",
            },
        )

    def test_spark(self):
        self.validate_identity("SELECT UNIX_TIMESTAMP()")
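        # ARRAY_SORT with a comparator lambda: Presto keeps the comparator,
        # while DuckDB and Hive only support a plain sort, so it is dropped.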
        self.validate_all(
            "ARRAY_SORT(x, (left, right) -> -1)",
            write={
                "duckdb": "ARRAY_SORT(x)",
                "presto": "ARRAY_SORT(x, (left, right) -> -1)",
                "hive": "SORT_ARRAY(x)",
                "spark": "ARRAY_SORT(x, (left, right) -> -1)",
            },
        )
        self.validate_all(
            "ARRAY(0, 1, 2)",
            write={
                "bigquery": "[0, 1, 2]",
                "duckdb": "LIST_VALUE(0, 1, 2)",
                "presto": "ARRAY[0, 1, 2]",
                "hive": "ARRAY(0, 1, 2)",
                "spark": "ARRAY(0, 1, 2)",
            },
        )
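        # Dialects differ in their default null ordering, so NULLS FIRST/LAST
        # is only emitted where it deviates from the target's default.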
        self.validate_all(
            "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname ASC NULLS LAST, lname",
            write={
                "clickhouse": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname, lname NULLS FIRST",
                "duckdb": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname NULLS LAST, lname",
                "postgres": "SELECT fname, lname, age FROM person ORDER BY age DESC, fname, lname NULLS FIRST",
                "presto": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname, lname NULLS FIRST",
                "hive": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname NULLS LAST, lname",
                "spark": "SELECT fname, lname, age FROM person ORDER BY age DESC NULLS FIRST, fname NULLS LAST, lname",
                "snowflake": "SELECT fname, lname, age FROM person ORDER BY age DESC, fname, lname NULLS FIRST",
            },
        )
        self.validate_all(
            "SELECT APPROX_COUNT_DISTINCT(a) FROM foo",
            write={
                "duckdb": "SELECT APPROX_COUNT_DISTINCT(a) FROM foo",
                "presto": "SELECT APPROX_DISTINCT(a) FROM foo",
                "hive": "SELECT APPROX_COUNT_DISTINCT(a) FROM foo",
                "spark": "SELECT APPROX_COUNT_DISTINCT(a) FROM foo",
            },
        )
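        # MONTH/YEAR on a string literal first coerces it to a date, using
        # whatever cast or parse idiom the target dialect has.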
        self.validate_all(
            "MONTH('2021-03-01')",
            write={
                "duckdb": "MONTH(CAST('2021-03-01' AS DATE))",
                "presto": "MONTH(CAST(SUBSTR(CAST('2021-03-01' AS VARCHAR), 1, 10) AS DATE))",
                "hive": "MONTH(TO_DATE('2021-03-01'))",
                "spark": "MONTH(TO_DATE('2021-03-01'))",
            },
        )
        self.validate_all(
            "YEAR('2021-03-01')",
            write={
                "duckdb": "YEAR(CAST('2021-03-01' AS DATE))",
                "presto": "YEAR(CAST(SUBSTR(CAST('2021-03-01' AS VARCHAR), 1, 10) AS DATE))",
                "hive": "YEAR(TO_DATE('2021-03-01'))",
                "spark": "YEAR(TO_DATE('2021-03-01'))",
            },
        )
        self.validate_all(
            "'\u6bdb'",
            write={
                "duckdb": "'毛'",
                "presto": "'毛'",
                "hive": "'毛'",
                "spark": "'毛'",
            },
        )
        self.validate_all(
            "SELECT LEFT(x, 2), RIGHT(x, 2)",
            write={
                "duckdb": "SELECT SUBSTRING(x, 1, 2), SUBSTRING(x, LENGTH(x) - 2 + 1, 2)",
                "presto": "SELECT SUBSTRING(x, 1, 2), SUBSTRING(x, LENGTH(x) - 2 + 1, 2)",
                "hive": "SELECT SUBSTRING(x, 1, 2), SUBSTRING(x, LENGTH(x) - 2 + 1, 2)",
                "spark": "SELECT SUBSTRING(x, 1, 2), SUBSTRING(x, LENGTH(x) - 2 + 1, 2)",
            },
        )
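        # MAP_FROM_ARRAYS maps onto each target's map constructor
        # (DuckDB/Presto/Hive MAP, Snowflake OBJECT_CONSTRUCT).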
        self.validate_all(
            "MAP_FROM_ARRAYS(ARRAY(1), c)",
            write={
                "duckdb": "MAP(LIST_VALUE(1), c)",
                "presto": "MAP(ARRAY[1], c)",
                "hive": "MAP(ARRAY(1), c)",
                "spark": "MAP_FROM_ARRAYS(ARRAY(1), c)",
                "snowflake": "OBJECT_CONSTRUCT([1], c)",
            },
        )
        self.validate_all(
            "SELECT ARRAY_SORT(x)",
            write={
                "duckdb": "SELECT ARRAY_SORT(x)",
                "presto": "SELECT ARRAY_SORT(x)",
                "hive": "SELECT SORT_ARRAY(x)",
                "spark": "SELECT ARRAY_SORT(x)",
            },
        )

    def test_iif(self):
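        # Spark has no IIF function; it is normalized to the equivalent IF.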
        self.validate_all(
            "SELECT IIF(cond, 'True', 'False')", write={"spark": "SELECT IF(cond, 'True', 'False')"}
        )