diff --git a/test/parser/spark/syntax/fixtures/selectAggregateFn.sql b/test/parser/spark/syntax/fixtures/selectAggregateFn.sql
new file mode 100644
index 0000000..2264746
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectAggregateFn.sql
@@ -0,0 +1,39 @@
+-- aggregate_function(input1 [, input2, ...]) FILTER (WHERE boolean_expression)
+
+-- { PERCENTILE_CONT | PERCENTILE_DISC }(percentile) WITHIN GROUP (ORDER BY { order_by_expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [ , ... ] }) FILTER (WHERE boolean_expression)
+
+CREATE OR REPLACE TEMPORARY VIEW basic_pays AS SELECT * FROM VALUES
+('Diane Murphy','Accounting',8435),
+('Mary Patterson','Accounting',9998),
+('Jeff Firrelli','Accounting',8992),
+('William Patterson','Accounting',8870),
+('Gerard Bondur','Accounting',11472),
+('Anthony Bow','Accounting',6627),
+('Leslie Jennings','IT',8113),
+('Leslie Thompson','IT',5186),
+('Julie Firrelli','Sales',9181),
+('Steve Patterson','Sales',9441),
+('Foon Yue Tseng','Sales',6660),
+('George Vanauf','Sales',10563),
+('Loui Bondur','SCM',10449),
+('Gerard Hernandez','SCM',6949),
+('Pamela Castillo','SCM',11303),
+('Larry Bott','SCM',11798),
+('Barry Jones','SCM',10586)
+AS basic_pays(employee_name, department, salary);
+
+SELECT * FROM basic_pays;
+
+SELECT
+    department,
+    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) AS pc1,
+    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary) FILTER (WHERE employee_name LIKE '%Bo%') AS pc2,
+    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) AS pc3,
+    percentile_cont(0.25) WITHIN GROUP (ORDER BY salary DESC) FILTER (WHERE employee_name LIKE '%Bo%') AS pc4,
+    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) AS pd1,
+    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary) FILTER (WHERE employee_name LIKE '%Bo%') AS pd2,
+    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) AS pd3,
+    percentile_disc(0.25) WITHIN GROUP (ORDER BY salary DESC) FILTER (WHERE employee_name LIKE '%Bo%') AS pd4
+FROM basic_pays
+GROUP BY department
+ORDER BY department;
diff --git a/test/parser/spark/syntax/fixtures/selectCET.sql b/test/parser/spark/syntax/fixtures/selectCET.sql
new file mode 100644
index 0000000..3f79194
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectCET.sql
@@ -0,0 +1,29 @@
+-- WITH common_table_expression [ , ... ]
+-- expression_name [ ( column_name [ , ... ] ) ] [ AS ] ( query )
+
+
+-- CTE with multiple column aliases
+WITH t(x, y) AS (SELECT 1, 2)
+SELECT * FROM t WHERE x = 1 AND y = 2;
+
+-- CTE in CTE definition
+WITH t AS (WITH t2 AS (SELECT 1) SELECT * FROM t2) SELECT * FROM t;
+
+-- CTE in subquery expression
+SELECT (WITH t AS (SELECT 1) SELECT * FROM t);
+
+-- CTE in CREATE VIEW statement
+CREATE VIEW v AS WITH t(a, b, c, d) AS (SELECT 1, 2, 3, 4) SELECT * FROM t;
+SELECT * FROM v;
+
+-- If a name conflict is detected in a nested CTE, an AnalysisException is thrown by default.
+-- With SET spark.sql.legacy.ctePrecedencePolicy = CORRECTED (which is recommended),
+-- inner CTE definitions take precedence over outer definitions.
+SET spark.sql.legacy.ctePrecedencePolicy = CORRECTED;
+WITH
+    t AS (SELECT 1),
+    t2 AS (
+        WITH t AS (SELECT 2)
+        SELECT * FROM t
+    )
+SELECT * FROM t2;
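+
+-- Additional example (ours, not from the Spark docs page): multiple CTE definitions
+-- at the same level, per the `WITH common_table_expression [ , ... ]` syntax above.
+WITH t1 AS (SELECT 1 AS c),
+     t2 AS (SELECT 2 AS c)
+SELECT * FROM t1 UNION ALL SELECT * FROM t2;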
diff --git a/test/parser/spark/syntax/fixtures/selectCase.sql b/test/parser/spark/syntax/fixtures/selectCase.sql
new file mode 100644
index 0000000..004b44b
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectCase.sql
@@ -0,0 +1,10 @@
+-- CASE [ expression ] { WHEN boolean_expression THEN then_expression } [ ... ]
+-- [ ELSE else_expression ]
+-- END
+
+
+SELECT id, CASE WHEN id > 200 THEN 'bigger' ELSE 'small' END FROM person;
+
+SELECT id, CASE id WHEN 100 THEN 'bigger' WHEN id > 300 THEN '300' ELSE 'small' END FROM person;
+
+SELECT * FROM person WHERE CASE 1 = 1 WHEN 100 THEN 'big' WHEN 200 THEN 'bigger' WHEN 300 THEN 'biggest' ELSE 'small' END = 'small';
diff --git a/test/parser/spark/syntax/fixtures/selectClusterBy.sql b/test/parser/spark/syntax/fixtures/selectClusterBy.sql
new file mode 100644
index 0000000..7e9b975
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectClusterBy.sql
@@ -0,0 +1,27 @@
+
+-- CLUSTER BY { expression [ , ... ] }
+
+CREATE TABLE person (name STRING, age INT);
+INSERT INTO person VALUES
+    ('Zen Hui', 25),
+    ('Anil B', 18),
+    ('Shone S', 16),
+    ('Mike A', 25),
+    ('John A', 18),
+    ('Jack N', 16);
+
+-- Reduce the number of shuffle partitions to 2 to illustrate the behavior of `CLUSTER BY`.
+-- It's easier to see the clustering and sorting behavior with fewer partitions.
+SET spark.sql.shuffle.partitions = 2;
+
+-- Select the rows with no ordering. Note that without any sort directive, the results
+-- of the query are not deterministic. It's included here to show the difference in behavior
+-- of a query when `CLUSTER BY` is not used vs. when it's used. The query below produces rows
+-- where the age column is not sorted.
+SELECT name, age FROM person;
+
+-- Produces rows clustered by age. Persons with the same age are clustered together.
+-- In the query below, persons with ages 18 and 25 are in the first partition and
+-- persons with age 16 are in the second partition. The rows are sorted by
+-- age within each partition.
+SELECT age, name FROM person CLUSTER BY age;
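+
+-- Additional example (ours, not from the Spark docs page): `CLUSTER BY` accepts
+-- multiple expressions, per the `CLUSTER BY { expression [ , ... ] }` syntax above.
+SELECT age, name FROM person CLUSTER BY age, name;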
diff --git a/test/parser/spark/syntax/fixtures/selectDistributeBy.sql b/test/parser/spark/syntax/fixtures/selectDistributeBy.sql
new file mode 100644
index 0000000..ca51386
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectDistributeBy.sql
@@ -0,0 +1,5 @@
+-- DISTRIBUTE BY { expression [ , ... ] }
+
+-- Produces rows clustered by age. Persons with the same age are clustered together.
+-- Unlike the `CLUSTER BY` clause, the rows are not sorted within a partition.
+SELECT age, name FROM person DISTRIBUTE BY age;
diff --git a/test/parser/spark/syntax/fixtures/selectExplain.sql b/test/parser/spark/syntax/fixtures/selectExplain.sql
new file mode 100644
index 0000000..b5500ef
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectExplain.sql
@@ -0,0 +1,14 @@
+-- EXPLAIN [ EXTENDED | CODEGEN | COST | FORMATTED ] statement
+
+-- Default output
+EXPLAIN select k, sum(v) from values (1, 2), (1, 3) t(k, v) group by k;
+
+-- Using EXTENDED
+EXPLAIN EXTENDED select k, sum(v) from values (1, 2), (1, 3) t(k, v) group by k;
+
+
+-- https://github.com/DTStack/dt-sql-parser/issues/131
+SELECT count(1) FROM tableName;
+
+-- https://github.com/DTStack/dt-sql-parser/issues/131
+DELETE FROM tableName WHERE (SELECT count(1) FROM tableName WHERE pt = '20230601') > 0 AND pt = '20230601';
diff --git a/test/parser/spark/syntax/fixtures/selectFile.sql b/test/parser/spark/syntax/fixtures/selectFile.sql
new file mode 100644
index 0000000..44a59b1
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectFile.sql
@@ -0,0 +1,10 @@
+-- file_format.`file_path`
+
+-- PARQUET file
+SELECT * FROM parquet.`examples/src/main/resources/users.parquet`;
+
+-- ORC file
+SELECT * FROM orc.`examples/src/main/resources/users.orc`;
+
+-- JSON file
+SELECT * FROM json.`examples/src/main/resources/people.json`;
diff --git a/test/parser/spark/syntax/fixtures/selectGroupBy.sql b/test/parser/spark/syntax/fixtures/selectGroupBy.sql
new file mode 100644
index 0000000..4104351
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectGroupBy.sql
@@ -0,0 +1,55 @@
+
+-- GROUP BY group_expression [ , group_expression [ , ... ] ] [ WITH { ROLLUP | CUBE } ]
+-- GROUP BY { group_expression | { ROLLUP | CUBE | GROUPING SETS } (grouping_set [ , ...]) } [ , ... ]

+-- aggregate_name ( [ DISTINCT ] expression [ , ... ] ) [ FILTER ( WHERE boolean_expression ) ]
+
+
+-- Sum of quantity per dealership. Group by `id`.
+SELECT id, sum(quantity) FROM dealer GROUP BY id ORDER BY id;
+
+-- Use column position in the GROUP BY clause.
+SELECT id, sum(quantity) FROM dealer GROUP BY 1 ORDER BY 1;
+
+-- Multiple aggregations.
+-- 1. Sum of quantity per dealership.
+-- 2. Max quantity per dealership.
+SELECT id, sum(quantity) AS sum, max(quantity) AS max FROM dealer GROUP BY id ORDER BY id;
+
+-- Count the number of distinct dealer cities per car_model.
+SELECT car_model, count(DISTINCT city) AS count FROM dealer GROUP BY car_model;
+
+-- Sum of only 'Honda Civic' and 'Honda CRV' quantities per dealership.
+SELECT id, sum(quantity) FILTER (
+        WHERE car_model IN ('Honda Civic', 'Honda CRV')
+    ) AS `sum(quantity)` FROM dealer
+    GROUP BY id ORDER BY id;
+
+-- Aggregations using multiple sets of grouping columns in a single statement.
+-- The following performs aggregations based on four sets of grouping columns.
+-- 1. city, car_model
+-- 2. city
+-- 3. car_model
+-- 4. Empty grouping set. Returns quantities for all cities and car models.
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+    GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ())
+    ORDER BY city;
+
+-- Group by processing with the `ROLLUP` clause.
+-- Equivalent to GROUP BY GROUPING SETS ((city, car_model), (city), ())
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+    GROUP BY city, car_model WITH ROLLUP
+    ORDER BY city, car_model;
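+
+-- Additional example (ours, not from the Spark docs page): the explicit
+-- GROUPING SETS form that the `WITH ROLLUP` query above is equivalent to.
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+    GROUP BY GROUPING SETS ((city, car_model), (city), ())
+    ORDER BY city, car_model;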
+
+-- Group by processing with the `CUBE` clause.
+-- Equivalent to GROUP BY GROUPING SETS ((city, car_model), (city), (car_model), ())
+SELECT city, car_model, sum(quantity) AS sum FROM dealer
+    GROUP BY city, car_model WITH CUBE
+    ORDER BY city, car_model;
+
+-- Select the first value in column `age`.
+SELECT FIRST(age) FROM person;
+
+-- Get the first value in column `age` ignoring nulls, the last value in column `id`,
+-- and the sum of column `id`.
+SELECT FIRST(age IGNORE NULLS), LAST(id), SUM(id) FROM person;
+
diff --git a/test/parser/spark/syntax/fixtures/selectHint.sql b/test/parser/spark/syntax/fixtures/selectHint.sql
new file mode 100644
index 0000000..630b886
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectHint.sql
@@ -0,0 +1,75 @@
+/*+ hint [ , ... ] */
+
+SELECT /*+ COALESCE(3) */ * FROM t;
+
+SELECT /*+ REPARTITION(3) */ * FROM t;
+
+SELECT /*+ REPARTITION(c) */ * FROM t;
+
+SELECT /*+ REPARTITION(3, c) */ * FROM t;
+
+SELECT /*+ REPARTITION_BY_RANGE(c) */ * FROM t;
+
+SELECT /*+ REPARTITION_BY_RANGE(3, c) */ * FROM t;
+
+SELECT /*+ REBALANCE */ * FROM t;
+
+SELECT /*+ REBALANCE(3) */ * FROM t;
+
+SELECT /*+ REBALANCE(c) */ * FROM t;
+
+SELECT /*+ REBALANCE(3, c) */ * FROM t;
+
+-- multiple partitioning hints
+EXPLAIN EXTENDED SELECT /*+ REPARTITION(100), COALESCE(500), REPARTITION_BY_RANGE(3, c) */ * FROM t;
+
+-- == Parsed Logical Plan ==
+-- 'UnresolvedHint REPARTITION, [100]
+-- +- 'UnresolvedHint COALESCE, [500]
+--    +- 'UnresolvedHint REPARTITION_BY_RANGE, [3, 'c]
+--       +- 'Project [*]
+--          +- 'UnresolvedRelation [t]
+
+-- == Analyzed Logical Plan ==
+-- name: string, c: int
+-- Repartition 100, true
+-- +- Repartition 500, false
+--    +- RepartitionByExpression [c#30 ASC NULLS FIRST], 3
+--       +- Project [name#29, c#30]
+--          +- SubqueryAlias spark_catalog.default.t
+--             +- Relation[name#29,c#30] parquet
+
+-- == Optimized Logical Plan ==
+-- Repartition 100, true
+-- +- Relation[name#29,c#30] parquet
+
+-- == Physical Plan ==
+-- Exchange RoundRobinPartitioning(100), false, [id=#121]
+-- +- *(1) ColumnarToRow
+--    +- FileScan parquet default.t[name#29,c#30] Batched: true, DataFilters: [], Format: Parquet,
+--       Location: CatalogFileIndex[file:/spark/spark-warehouse/t], PartitionFilters: [],
+--       PushedFilters: [], ReadSchema: struct
+
+-- Join Hints for broadcast join
+SELECT /*+ BROADCAST(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
+SELECT /*+ BROADCASTJOIN (t1) */ * FROM t1 left JOIN t2 ON t1.key = t2.key;
+SELECT /*+ MAPJOIN(t2) */ * FROM t1 right JOIN t2 ON t1.key = t2.key;
+
+-- Join Hints for shuffle sort merge join
+SELECT /*+ SHUFFLE_MERGE(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
+SELECT /*+ MERGEJOIN(t2) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
+SELECT /*+ MERGE(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
+
+-- Join Hints for shuffle hash join
+SELECT /*+ SHUFFLE_HASH(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
+
+-- Join Hints for shuffle-and-replicate nested loop join
+SELECT /*+ SHUFFLE_REPLICATE_NL(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
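+
+-- Additional example (ours, not from the Spark docs page): partitioning and join
+-- hints combined in one hint list, per the `/*+ hint [ , ... ] */` syntax above.
+SELECT /*+ REPARTITION(3), MERGE(t1) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;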
+
+-- When different join strategy hints are specified on both sides of a join, Spark
+-- prioritizes the BROADCAST hint over the MERGE hint over the SHUFFLE_HASH hint
+-- over the SHUFFLE_REPLICATE_NL hint.
+-- Spark will issue a warning for the following example:
+-- org.apache.spark.sql.catalyst.analysis.HintErrorLogger: Hint (strategy=merge)
+-- is overridden by another hint and will not take effect.
+SELECT /*+ BROADCAST(t1), MERGE(t1, t2) */ * FROM t1 INNER JOIN t2 ON t1.key = t2.key;
diff --git a/test/parser/spark/syntax/fixtures/selectHiving.sql b/test/parser/spark/syntax/fixtures/selectHiving.sql
new file mode 100644
index 0000000..3fdfcf5
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectHiving.sql
@@ -0,0 +1,32 @@
+-- HAVING boolean_expression
+
+
+CREATE TABLE dealer (id INT, city STRING, car_model STRING, quantity INT);
+INSERT INTO dealer VALUES
+    (100, 'Fremont', 'Honda Civic', 10),
+    (100, 'Fremont', 'Honda Accord', 15),
+    (100, 'Fremont', 'Honda CRV', 7),
+    (200, 'Dublin', 'Honda Civic', 20),
+    (200, 'Dublin', 'Honda Accord', 10),
+    (200, 'Dublin', 'Honda CRV', 3),
+    (300, 'San Jose', 'Honda Civic', 5),
+    (300, 'San Jose', 'Honda Accord', 8);
+
+-- `HAVING` clause referring to a column in `GROUP BY`.
+SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING city = 'Fremont';
+
+-- `HAVING` clause referring to an aggregate function.
+SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING sum(quantity) > 15;
+
+-- `HAVING` clause referring to an aggregate function by its alias.
+SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING sum > 15;
+
+-- `HAVING` clause referring to a different aggregate function than what is present in
+-- the `SELECT` list.
+SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING max(quantity) > 15;
+
+-- `HAVING` clause referring to a constant expression.
+SELECT city, sum(quantity) AS sum FROM dealer GROUP BY city HAVING 1 > 0 ORDER BY city;
+
+-- `HAVING` clause without a `GROUP BY` clause.
+SELECT sum(quantity) AS sum FROM dealer HAVING sum(quantity) > 10;
diff --git a/test/parser/spark/syntax/fixtures/selectInlineTable.sql b/test/parser/spark/syntax/fixtures/selectInlineTable.sql
new file mode 100644
index 0000000..6f3648f
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectInlineTable.sql
@@ -0,0 +1,10 @@
+-- VALUES ( expression [ , ... ] ) [ table_alias ]
+
+-- single row, without a table alias
+SELECT * FROM VALUES ("one", 1);
+
+-- three rows with a table alias
+SELECT * FROM VALUES ("one", 1), ("two", 2), ("three", null) AS data(a, b);
+
+-- complex types with a table alias
+SELECT * FROM VALUES ("one", array(0, 1)), ("two", array(2, 3)) AS data(a, b);
diff --git a/test/parser/spark/syntax/fixtures/selectJoin.sql b/test/parser/spark/syntax/fixtures/selectJoin.sql
new file mode 100644
index 0000000..2a03e37
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectJoin.sql
@@ -0,0 +1,27 @@
+-- relation { [ join_type ] JOIN [ LATERAL ] relation [ join_criteria ] | NATURAL join_type JOIN [ LATERAL ] relation }
+
+-- Use the employee and department tables to demonstrate different types of joins.
+SELECT * FROM employee;
+
+SELECT * FROM department;
+
+-- Use the employee and department tables to demonstrate an inner join.
+SELECT id, name, employee.deptno, deptname FROM employee INNER JOIN department ON employee.deptno = department.deptno;
+
+-- Use the employee and department tables to demonstrate a left join.
+SELECT id, name, employee.deptno, deptname FROM employee LEFT JOIN department ON employee.deptno = department.deptno;
+
+-- Use the employee and department tables to demonstrate a right join.
+SELECT id, name, employee.deptno, deptname FROM employee RIGHT JOIN department ON employee.deptno = department.deptno;
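+
+-- Additional example (ours, not from the Spark docs page): the optional OUTER
+-- keyword form of the left join above.
+SELECT id, name, employee.deptno, deptname FROM employee LEFT OUTER JOIN department ON employee.deptno = department.deptno;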
+
+-- Use the employee and department tables to demonstrate a full join.
+SELECT id, name, employee.deptno, deptname FROM employee FULL JOIN department ON employee.deptno = department.deptno;
+
+-- Use the employee and department tables to demonstrate a cross join.
+SELECT id, name, employee.deptno, deptname FROM employee CROSS JOIN department;
+
+-- Use the employee and department tables to demonstrate a semi join.
+SELECT * FROM employee SEMI JOIN department ON employee.deptno = department.deptno;
+
+-- Use the employee and department tables to demonstrate an anti join.
+SELECT * FROM employee ANTI JOIN department ON employee.deptno = department.deptno;
diff --git a/test/parser/spark/syntax/fixtures/selectLateralSubQuery.sql b/test/parser/spark/syntax/fixtures/selectLateralSubQuery.sql
new file mode 100644
index 0000000..98e12bc
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectLateralSubQuery.sql
@@ -0,0 +1,9 @@
+-- [ LATERAL ] primary_relation [ join_relation ]
+
+SELECT * FROM t1,
+    LATERAL (SELECT * FROM t2 WHERE t1.c1 = t2.c1);
+
+SELECT a, b, c FROM t1,
+    LATERAL (SELECT c1 + c2 AS a),
+    LATERAL (SELECT c1 - c2 AS b),
+    LATERAL (SELECT a * b AS c);
diff --git a/test/parser/spark/syntax/fixtures/selectLateralView.sql b/test/parser/spark/syntax/fixtures/selectLateralView.sql
new file mode 100644
index 0000000..7a49687
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectLateralView.sql
@@ -0,0 +1,14 @@
+-- LATERAL VIEW [ OUTER ] generator_function ( expression [ , ... ] ) [ table_alias ] AS column_alias [ , ... ]
+
+SELECT * FROM person
+    LATERAL VIEW EXPLODE(ARRAY(30, 60)) tableName AS c_age
+    LATERAL VIEW EXPLODE(ARRAY(40, 80)) AS d_age;
+
+SELECT c_age, COUNT(1) FROM person
+    LATERAL VIEW EXPLODE(ARRAY(30, 60)) AS c_age
+    LATERAL VIEW EXPLODE(ARRAY(40, 80)) AS d_age
+GROUP BY c_age;
+
+SELECT * FROM person LATERAL VIEW EXPLODE(ARRAY()) tableName AS c_age;
+
+SELECT * FROM person LATERAL VIEW OUTER EXPLODE(ARRAY()) tableName AS c_age;
diff --git a/test/parser/spark/syntax/fixtures/selectLike.sql b/test/parser/spark/syntax/fixtures/selectLike.sql
new file mode 100644
index 0000000..8826ab7
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectLike.sql
@@ -0,0 +1,29 @@
+-- [ NOT ] { LIKE search_pattern [ ESCAPE esc_char ] | [ RLIKE | REGEXP ] regex_pattern }
+
+-- [ NOT ] { LIKE quantifiers ( search_pattern [ , ... ]) }
+
+SELECT * FROM person WHERE name LIKE 'M%';
+
+SELECT * FROM person WHERE name LIKE 'M_ry';
+
+SELECT * FROM person WHERE name NOT LIKE 'M_ry';
+
+SELECT * FROM person WHERE name RLIKE 'M+';
+
+SELECT * FROM person WHERE name REGEXP 'M+';
+
+SELECT * FROM person WHERE name LIKE '%\_%';
+
+SELECT * FROM person WHERE name LIKE '%$_%' ESCAPE '$';
+
+SELECT * FROM person WHERE name LIKE ALL ('%an%', '%an');
+
+SELECT * FROM person WHERE name LIKE ANY ('%an%', '%an');
+
+SELECT * FROM person WHERE name LIKE SOME ('%an%', '%an');
+
+SELECT * FROM person WHERE name NOT LIKE ALL ('%an%', '%an');
+
+SELECT * FROM person WHERE name NOT LIKE ANY ('%an%', '%an');
+
+SELECT * FROM person WHERE name NOT LIKE SOME ('%an%', '%an');
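+
+-- Additional example (ours, not from the Spark docs page): RLIKE composes with NOT
+-- as well, per the `[ NOT ] { ... | [ RLIKE | REGEXP ] regex_pattern }` syntax above.
+SELECT * FROM person WHERE name NOT RLIKE 'M+';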
diff --git a/test/parser/spark/syntax/fixtures/selectLimit.sql b/test/parser/spark/syntax/fixtures/selectLimit.sql
new file mode 100644
index 0000000..507da9f
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectLimit.sql
@@ -0,0 +1,13 @@
+-- LIMIT { ALL | integer_expression }
+
+-- Select the first two rows.
+SELECT name, age FROM person ORDER BY name LIMIT 2;
+
+-- Specifying the ALL option on LIMIT returns all the rows.
+SELECT name, age FROM person ORDER BY name LIMIT ALL;
+
+-- A function expression as an input to LIMIT.
+SELECT name, age FROM person ORDER BY name LIMIT length('SPARK');
+
+-- A non-foldable expression as an input to LIMIT is not allowed (it parses, but fails analysis in Spark).
+SELECT name, age FROM person ORDER BY name LIMIT length(name);
diff --git a/test/parser/spark/syntax/fixtures/selectOffset.sql b/test/parser/spark/syntax/fixtures/selectOffset.sql
new file mode 100644
index 0000000..b44159c
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectOffset.sql
@@ -0,0 +1,13 @@
+-- OFFSET integer_expression
+
+-- Skip the first two rows.
+SELECT name, age FROM person ORDER BY name OFFSET 2;
+
+-- Skip the first two rows and return the next three rows.
+SELECT name, age FROM person ORDER BY name LIMIT 3 OFFSET 2;
+
+-- A function expression as an input to OFFSET.
+SELECT name, age FROM person ORDER BY name OFFSET length('SPARK');
+
+-- A non-foldable expression as an input to OFFSET is not allowed (it parses, but fails analysis in Spark).
+SELECT name, age FROM person ORDER BY name OFFSET length(name);
diff --git a/test/parser/spark/syntax/fixtures/selectOrderBy.sql b/test/parser/spark/syntax/fixtures/selectOrderBy.sql
new file mode 100644
index 0000000..4defec4
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectOrderBy.sql
@@ -0,0 +1,17 @@
+-- ORDER BY { expression [ sort_direction | nulls_sort_order ] [ , ... ] }
+
+-- Sort rows by age. By default, rows are sorted in ascending order with NULLS FIRST.
+SELECT name, age FROM person ORDER BY age;
+
+-- Sort rows in ascending order, keeping null values last.
+SELECT name, age FROM person ORDER BY age NULLS LAST;
+
+-- Sort rows by age in descending order, which defaults to NULLS LAST.
+SELECT name, age FROM person ORDER BY age DESC;
+
+-- Sort rows in descending order, keeping null values first.
+SELECT name, age FROM person ORDER BY age DESC NULLS FIRST;
+
+-- Sort rows based on more than one column, with each column having a different
+-- sort direction.
+SELECT * FROM person ORDER BY name ASC, age DESC;
diff --git a/test/parser/spark/syntax/fixtures/selectPivot.sql b/test/parser/spark/syntax/fixtures/selectPivot.sql
new file mode 100644
index 0000000..c58b85f
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectPivot.sql
@@ -0,0 +1,6 @@
+-- PIVOT ( { aggregate_expression [ AS aggregate_expression_alias ] } [ , ... ]
+--     FOR column_list IN ( expression_list ) )
+
+SELECT * FROM person PIVOT (SUM(age) AS a, AVG(class) AS c FOR name IN ('John' AS john, 'Mike' AS mike));
+
+SELECT * FROM person PIVOT (SUM(age) AS a, AVG(class) AS c FOR (name, age) IN (('John', 30) AS c1, ('Mike', 40) AS c2));
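+
+-- Additional example (ours, not from the Spark docs page): the AS aliases are
+-- optional, per the PIVOT syntax above.
+SELECT * FROM person PIVOT (SUM(age) FOR name IN ('John', 'Mike'));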
diff --git a/test/parser/spark/syntax/fixtures/selectSet.sql b/test/parser/spark/syntax/fixtures/selectSet.sql
new file mode 100644
index 0000000..0b85a56
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectSet.sql
@@ -0,0 +1,31 @@
+-- [ ( ] relation [ ) ] EXCEPT | MINUS [ ALL | DISTINCT ] [ ( ] relation [ ) ]
+
+-- Use the number1 and number2 tables to demonstrate the set operators on this page.
+SELECT * FROM number1;
+
+SELECT * FROM number2;
+
+SELECT c FROM number1 EXCEPT SELECT c FROM number2;
+
+SELECT c FROM number1 MINUS SELECT c FROM number2;
+
+SELECT c FROM number1 EXCEPT ALL (SELECT c FROM number2);
+
+SELECT c FROM number1 MINUS ALL (SELECT c FROM number2);
+
+-- [ ( ] relation [ ) ] INTERSECT [ ALL | DISTINCT ] [ ( ] relation [ ) ]
+
+(SELECT c FROM number1) INTERSECT (SELECT c FROM number2);
+
+(SELECT c FROM number1) INTERSECT DISTINCT (SELECT c FROM number2);
+
+(SELECT c FROM number1) INTERSECT ALL (SELECT c FROM number2);
+
+
+-- [ ( ] relation [ ) ] UNION [ ALL | DISTINCT ] [ ( ] relation [ ) ]
+
+(SELECT c FROM number1) UNION (SELECT c FROM number2);
+
+(SELECT c FROM number1) UNION DISTINCT (SELECT c FROM number2);
+
+SELECT c FROM number1 UNION ALL (SELECT c FROM number2);
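+
+-- Additional example (ours, not from the Spark docs page): DISTINCT is the default
+-- for EXCEPT, per the `EXCEPT | MINUS [ ALL | DISTINCT ]` syntax above; the
+-- explicit form:
+SELECT c FROM number1 EXCEPT DISTINCT SELECT c FROM number2;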
diff --git a/test/parser/spark/syntax/fixtures/selectSortBy.sql b/test/parser/spark/syntax/fixtures/selectSortBy.sql
new file mode 100644
index 0000000..5f22b0c
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectSortBy.sql
@@ -0,0 +1,20 @@
+-- SORT BY { expression [ sort_direction | nulls_sort_order ] [ , ... ] }
+
+-- Sort rows by `name` within each partition in ascending order.
+SELECT /*+ REPARTITION(zip_code) */ name, age, zip_code FROM person SORT BY name;
+
+-- Sort rows within each partition using a column position.
+SELECT /*+ REPARTITION(zip_code) */ name, age, zip_code FROM person SORT BY 1;
+
+-- Sort rows within each partition in ascending order, keeping null values last.
+SELECT /*+ REPARTITION(zip_code) */ age, name, zip_code FROM person SORT BY age NULLS LAST;
+
+-- Sort rows by age within each partition in descending order, which defaults to NULLS LAST.
+SELECT /*+ REPARTITION(zip_code) */ age, name, zip_code FROM person SORT BY age DESC;
+
+-- Sort rows by age within each partition in descending order, keeping null values first.
+SELECT /*+ REPARTITION(zip_code) */ age, name, zip_code FROM person SORT BY age DESC NULLS FIRST;
+
+-- Sort rows within each partition based on more than one column, with each column
+-- having a different sort direction.
+SELECT /*+ REPARTITION(zip_code) */ name, age, zip_code FROM person SORT BY name ASC, age DESC;
diff --git a/test/parser/spark/syntax/fixtures/selectTVF.sql b/test/parser/spark/syntax/fixtures/selectTVF.sql
new file mode 100644
index 0000000..2916a23
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectTVF.sql
@@ -0,0 +1,25 @@
+-- range call with end
+SELECT * FROM range(6 + cos(3));
+
+-- range call with start and end
+SELECT * FROM range(5, 10);
+
+-- range call with start, end, step, and numPartitions
+SELECT * FROM range(0, 10, 2, 200);
+
+-- range call with a table alias
+SELECT * FROM range(5, 8) AS test;
+
+SELECT explode(array(10, 20));
+
+SELECT inline(array(struct(1, 'a'), struct(2, 'b')));
+
+SELECT posexplode(array(10, 20));
+
+SELECT stack(2, 1, 2, 3);
+
+SELECT json_tuple('{"a":1, "b":2}', 'a', 'b');
+
+SELECT parse_url('http://spark.apache.org/path?query=1', 'HOST');
+
+SELECT * FROM test LATERAL VIEW explode (ARRAY(3, 4)) AS c2;
diff --git a/test/parser/spark/syntax/fixtures/selectTableSample.sql b/test/parser/spark/syntax/fixtures/selectTableSample.sql
new file mode 100644
index 0000000..aaa3ceb
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectTableSample.sql
@@ -0,0 +1,9 @@
+-- TABLESAMPLE ({ integer_expression | decimal_expression } PERCENT)
+--     | TABLESAMPLE ( integer_expression ROWS )
+--     | TABLESAMPLE ( BUCKET integer_expression OUT OF integer_expression )
+
+SELECT * FROM test TABLESAMPLE (50 PERCENT);
+
+SELECT * FROM test TABLESAMPLE (5 ROWS);
+
+SELECT * FROM test TABLESAMPLE (BUCKET 4 OUT OF 10);
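+
+-- Additional example (ours, not from the Spark docs page): a decimal percentage,
+-- per the `{ integer_expression | decimal_expression } PERCENT` syntax above.
+SELECT * FROM test TABLESAMPLE (25.5 PERCENT);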
diff --git a/test/parser/spark/syntax/fixtures/selectTransform.sql b/test/parser/spark/syntax/fixtures/selectTransform.sql
new file mode 100644
index 0000000..cd90040
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectTransform.sql
@@ -0,0 +1,51 @@
+-- SELECT TRANSFORM ( expression [ , ... ] )
+--     [ ROW FORMAT row_format ]
+--     [ RECORDWRITER record_writer_class ]
+--     USING command_or_script [ AS ( [ col_name [ col_type ] ] [ , ... ] ) ]
+--     [ ROW FORMAT row_format ]
+--     [ RECORDREADER record_reader_class ]
+
+-- With specified output columns, without data types
+SELECT TRANSFORM(zip_code, name, age)
+    USING 'cat' AS (a, b, c)
+FROM person
+WHERE zip_code > 94511;
+
+-- With specified output columns and data types
+SELECT TRANSFORM(zip_code, name, age)
+    USING 'cat' AS (a STRING, b STRING, c STRING)
+FROM person
+WHERE zip_code > 94511;
+
+-- Using ROW FORMAT DELIMITED
+SELECT TRANSFORM(name, age)
+    ROW FORMAT DELIMITED
+    FIELDS TERMINATED BY ','
+    LINES TERMINATED BY '\n'
+    NULL DEFINED AS 'NULL'
+    USING 'cat' AS (name_age string)
+    ROW FORMAT DELIMITED
+    FIELDS TERMINATED BY '@'
+    LINES TERMINATED BY '\n'
+    NULL DEFINED AS 'NULL'
+FROM person;
+
+-- Using a Hive SerDe
+SELECT TRANSFORM(zip_code, name, age)
+    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+    WITH SERDEPROPERTIES (
+        'field.delim' = '\t'
+    )
+    USING 'cat' AS (a STRING, b STRING, c STRING)
+    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
+    WITH SERDEPROPERTIES (
+        'field.delim' = '\t'
+    )
+FROM person
+WHERE zip_code > 94511;
+
+-- Schema-less mode
+SELECT TRANSFORM(zip_code, name, age)
+    USING 'cat'
+FROM person
+WHERE zip_code > 94500;
diff --git a/test/parser/spark/syntax/fixtures/selectUnPivot.sql b/test/parser/spark/syntax/fixtures/selectUnPivot.sql
new file mode 100644
index 0000000..15a04af
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectUnPivot.sql
@@ -0,0 +1,38 @@
+-- UNPIVOT [ { INCLUDE | EXCLUDE } NULLS ] (
+--     { single_value_column_unpivot | multi_value_column_unpivot }
+-- ) [[AS] alias]
+
+-- single_value_column_unpivot:
+--     values_column
+--     FOR name_column
+--     IN (unpivot_column [[AS] alias] [, ...])
+
+-- multi_value_column_unpivot:
+--     (values_column [, ...])
+--     FOR name_column
+--     IN ((unpivot_column [, ...]) [[AS] alias] [, ...])
+
+
+-- column names are used as unpivot columns
+SELECT * FROM sales_quarterly
+    UNPIVOT (
+        sales FOR quarter IN (q1, q2, q3, q4)
+    );
+
+-- NULL values are excluded by default; they can be included with INCLUDE NULLS.
+-- Unpivot columns can be aliased, and the unpivot result can be referenced
+-- via its alias.
+SELECT up.* FROM sales_quarterly
+    UNPIVOT INCLUDE NULLS (
+        sales FOR quarter IN (q1 AS Q1, q2 AS Q2, q3 AS Q3, q4 AS Q4)
+    ) AS up;
+
+-- multiple value columns can be unpivoted per row
+SELECT * FROM sales_quarterly
+    UNPIVOT EXCLUDE NULLS (
+        (first_quarter, second_quarter)
+        FOR half_of_the_year IN (
+            (q1, q2) AS H1,
+            (q3, q4) AS H2
+        )
+    );
diff --git a/test/parser/spark/syntax/fixtures/selectWhere.sql b/test/parser/spark/syntax/fixtures/selectWhere.sql
new file mode 100644
index 0000000..5b93e59
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectWhere.sql
@@ -0,0 +1,22 @@
+-- WHERE boolean_expression
+
+-- Comparison operator in the `WHERE` clause.
+SELECT * FROM person WHERE id > 200 ORDER BY id;
+
+-- Comparison and logical operators in the `WHERE` clause.
+SELECT * FROM person WHERE id = 200 OR id = 300 ORDER BY id;
+
+-- IS NULL expression in the `WHERE` clause.
+SELECT * FROM person WHERE id > 300 OR age IS NULL ORDER BY id;
+
+-- Function expression in the `WHERE` clause.
+SELECT * FROM person WHERE length(name) > 3 ORDER BY id;
+
+-- `BETWEEN` expression in the `WHERE` clause.
+SELECT * FROM person WHERE id BETWEEN 200 AND 300 ORDER BY id;
+
+-- Scalar subquery in the `WHERE` clause.
+SELECT * FROM person WHERE age > (SELECT avg(age) FROM person);
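+
+-- Additional example (ours, not from the Spark docs page): an IN subquery in the
+-- `WHERE` clause.
+SELECT * FROM person WHERE id IN (SELECT id FROM person WHERE age IS NOT NULL);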
+
+-- Correlated subquery in the `WHERE` clause.
+SELECT * FROM person AS parent WHERE EXISTS (SELECT 1 FROM person AS child WHERE parent.id = child.id AND child.age IS NULL);
diff --git a/test/parser/spark/syntax/fixtures/selectWindowFn.sql b/test/parser/spark/syntax/fixtures/selectWindowFn.sql
new file mode 100644
index 0000000..f36e241
--- /dev/null
+++ b/test/parser/spark/syntax/fixtures/selectWindowFn.sql
@@ -0,0 +1,31 @@
+-- window_function [ nulls_option ] OVER
+-- ( [ { PARTITION | DISTRIBUTE } BY partition_col_name = partition_col_val ( [ , ... ] ) ]
+--   { ORDER | SORT } BY expression [ ASC | DESC ] [ NULLS { FIRST | LAST } ] [ , ... ]
+--   [ window_frame ] )
+
+
+CREATE TABLE employees (name STRING, dept STRING, salary INT, age INT);
+
+INSERT INTO employees VALUES ("Lisa", "Sales", 10000, 35);
+INSERT INTO employees VALUES ("Evan", "Sales", 32000, 38);
+INSERT INTO employees VALUES ("Fred", "Engineering", 21000, 28);
+INSERT INTO employees VALUES ("Alex", "Sales", 30000, 33);
+INSERT INTO employees VALUES ("Tom", "Engineering", 23000, 33);
+INSERT INTO employees VALUES ("Jane", "Marketing", 29000, 28);
+INSERT INTO employees VALUES ("Jeff", "Marketing", 35000, 38);
+INSERT INTO employees VALUES ("Paul", "Engineering", 29000, 23);
+INSERT INTO employees VALUES ("Chloe", "Engineering", 23000, 25);
+
+SELECT * FROM employees;
+
+SELECT name, dept, salary, RANK() OVER (PARTITION BY dept ORDER BY salary) AS rank FROM employees;
+
+SELECT name, dept, salary, DENSE_RANK() OVER (PARTITION BY dept ORDER BY salary ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS dense_rank FROM employees;
+
+SELECT name, dept, age, CUME_DIST() OVER (PARTITION BY dept ORDER BY age RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cume_dist FROM employees;
+
+SELECT name, dept, salary, MIN(salary) OVER (PARTITION BY dept ORDER BY salary) AS min FROM employees;
+
+SELECT name, salary, LAG(salary) OVER (PARTITION BY dept ORDER BY salary) AS lag, LEAD(salary, 1, 0) OVER (PARTITION BY dept ORDER BY salary) AS lead FROM employees;
+
+SELECT id, v, LEAD(v, 0) IGNORE NULLS OVER w lead, LAG(v, 0) IGNORE NULLS OVER w lag, NTH_VALUE(v, 2) IGNORE NULLS OVER w nth_value, FIRST_VALUE(v) IGNORE NULLS OVER w first_value, LAST_VALUE(v) IGNORE NULLS OVER w last_value FROM test_ignore_null WINDOW w AS (ORDER BY id) ORDER BY id;
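+
+-- Additional example (ours, not from the Spark docs page): ROW_NUMBER over the
+-- same kind of window as the RANK example above.
+SELECT name, dept, salary, ROW_NUMBER() OVER (PARTITION BY dept ORDER BY salary) AS row_number FROM employees;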
diff --git a/test/parser/spark/syntax/selectStatement.test.ts b/test/parser/spark/syntax/selectStatement.test.ts
new file mode 100644
index 0000000..a19166e
--- /dev/null
+++ b/test/parser/spark/syntax/selectStatement.test.ts
@@ -0,0 +1,165 @@
+import SparkSQL from '../../../../src/parser/spark';
+import { readSQL } from '../../../helper';
+
+const parser = new SparkSQL();
+
+const features = {
+    selectAggregateFn: readSQL(__dirname, 'selectAggregateFn.sql'),
+    selectCase: readSQL(__dirname, 'selectCase.sql'),
+    selectCET: readSQL(__dirname, 'selectCET.sql'),
+    selectWindowFn: readSQL(__dirname, 'selectWindowFn.sql'),
+    selectWhere: readSQL(__dirname, 'selectWhere.sql'),
+    selectUnPivot: readSQL(__dirname, 'selectUnPivot.sql'),
+    selectTVF: readSQL(__dirname, 'selectTVF.sql'),
+    selectTransform: readSQL(__dirname, 'selectTransform.sql'),
+    selectTableSample: readSQL(__dirname, 'selectTableSample.sql'),
+    selectSortBy: readSQL(__dirname, 'selectSortBy.sql'),
+    selectPivot: readSQL(__dirname, 'selectPivot.sql'),
+    selectOrderBy: readSQL(__dirname, 'selectOrderBy.sql'),
+    selectOffset: readSQL(__dirname, 'selectOffset.sql'),
+    selectLimit: readSQL(__dirname, 'selectLimit.sql'),
+    selectLike: readSQL(__dirname, 'selectLike.sql'),
+    selectLateralView: readSQL(__dirname, 'selectLateralView.sql'),
+    selectLateralSubQuery: readSQL(__dirname, 'selectLateralSubQuery.sql'),
+    selectJoin: readSQL(__dirname, 'selectJoin.sql'),
+    selectInlineTable: readSQL(__dirname, 'selectInlineTable.sql'),
+    selectHiving: readSQL(__dirname, 'selectHiving.sql'),
+    selectHint: readSQL(__dirname, 'selectHint.sql'),
+    selectGroupBy: readSQL(__dirname, 'selectGroupBy.sql'),
+    selectFile: readSQL(__dirname, 'selectFile.sql'),
+    selectExplain: readSQL(__dirname, 'selectExplain.sql'),
+    selectDistributeBy: readSQL(__dirname, 'selectDistributeBy.sql'),
+    selectClusterBy: readSQL(__dirname, 'selectClusterBy.sql'),
+};
+
+describe('Spark select Syntax Tests', () => {
+    // Every statement in every fixture must validate without parse errors.
+    // Object.values iterates in insertion order, so fixtures run in the order
+    // they are declared in `features` above.
+    Object.values(features).forEach((sqls) => {
+        sqls.forEach((itemSql) => {
+            it(itemSql, () => {
+                expect(parser.validate(itemSql).length).toBe(0);
+            });
+        });
+    });
+});