feat(spark): support materialized view for spark sql (#262)

* feat(spark): support materialized view for spark sql

* fix(spark): code review update

* fix(spark): update spark  materilized view and zorder grammar

* test(spark): add syntaxSuggestion test of materialized view

---------

Co-authored-by: jialan <jialan@dtstack.com>
This commit is contained in:
JackWang032
2024-02-26 17:25:19 +08:00
committed by GitHub
parent 081ff7f067
commit 5ce89cb421
26 changed files with 10156 additions and 9101 deletions

View File

@ -0,0 +1,13 @@
-- ALTER MATERIALIZED VIEW view_identifier ENABLE|DISABLE REWRITE;
ALTER MATERIALIZED VIEW mv ENABLE REWRITE;
ALTER MATERIALIZED VIEW userDB.mv ENABLE REWRITE;
ALTER MATERIALIZED VIEW mv DISABLE REWRITE;
-- ALTER MATERIALIZED VIEW view_identifier SET TBLPROPERTIES ( property_name=property_value, ... );
ALTER MATERIALIZED VIEW mv SET TBLPROPERTIES ('mv.enableAutoRefresh'='true', 'mv.refreshInterval'='10min');
ALTER MATERIALIZED VIEW userDB.mv SET TBLPROPERTIES ('mv.enableAutoRefresh'='true', 'mv.refreshInterval'='10min');

View File

@ -0,0 +1,99 @@
/**
** Notes:
** 1. MATERIALIZED VIEW syntax has not been officially supported by Spark yet.
** 2. The support for the following syntax is based on the self-developed component of dtstack.
**/
-- CREATE MATERIALIZED VIEW [ IF NOT EXISTS ] view_identifier
-- [ USING data_source ]
-- [ OPTIONS ( key1=val1, key2=val2, ... ) ]
-- [ PARTITIONED BY ( col_name1, col_name2, ... ) ]
-- [ SKEWED BY ( col_name, col_name, ... )
-- ON ( ( col_value, col_value, ... ), ( col_value, col_value, ... ), ... )
-- [ STORED AS DIRECTORIES ] ]
-- [ CLUSTERED BY ( col_name3, col_name4, ... )
-- [ SORTED BY ( col_name [ ASC | DESC ], ... ) ]
-- INTO num_buckets BUCKETS ]
-- [ ROW FORMAT row_format ]
-- [ [ STORED AS file_format ]
-- | STORED BY 'storage.handler.class.name' [ WITH SERDEPROPERTIES (...) ] ) ]
-- [ LOCATION hdfs_path ]
-- [ COMMENT table_comment ]
-- [ TBLPROPERTIES (property_name=property_value, ...) ]
-- AS select_statement;
CREATE MATERIALIZED VIEW mv AS SELECT id FROM students;
CREATE MATERIALIZED VIEW userDB.mv AS SELECT id FROM students;
CREATE MATERIALIZED VIEW IF NOT EXISTS mv AS SELECT id FROM students;
-- Use data source
CREATE MATERIALIZED VIEW mv USING CSV AS SELECT id FROM students;
-- Use parquet data source with parquet storage options
CREATE MATERIALIZED VIEW mv
USING PARQUET
OPTIONS (
'parquet.bloom.filter.enabled'='true',
'parquet.bloom.filter.enabled#age'='false'
)
AS SELECT id, age FROM students;
CREATE MATERIALIZED VIEW mv
PARTITIONED BY (id)
AS SELECT id FROM students;
CREATE MATERIALIZED VIEW mv
SKEWED BY (id) ON (1,5,6)
AS SELECT id FROM students;
CREATE MATERIALIZED VIEW mv
SKEWED BY (id) ON (1,5,6) STORED AS DIRECTORIES
AS SELECT id FROM students;
-- Create bucketed materialized view
CREATE MATERIALIZED VIEW mv
CLUSTERED BY (id) SORTED BY (id) INTO 3 BUCKETS
AS SELECT id FROM students;
-- Use row format
CREATE MATERIALIZED VIEW mv
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = ".*"
)
STORED AS TEXTFILE
AS SELECT id FROM students;
-- Use file format with 'stored as'
CREATE MATERIALIZED VIEW mv
STORED AS TEXTFILE
AS SELECT id FROM students;
-- Use file format with 'stored by'
CREATE MATERIALIZED VIEW mv
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
"hbase.columns.mapping" = "cf:string",
"hbase.table.name" = "hbase_table_0"
)
AS SELECT id FROM students;
-- Specify view storage path
CREATE MATERIALIZED VIEW mv
STORED AS PARQUET
LOCATION 'hdfs://mv/'
AS SELECT id FROM students;
-- Add mv comment
CREATE MATERIALIZED VIEW mv
STORED AS PARQUET
LOCATION 'hdfs://mv/'
COMMENT 'A materialized view'
AS SELECT id FROM students;
-- Set refresh properties
CREATE MATERIALIZED VIEW mv
TBLPROPERTIES("mv.enableAutoRefresh"="true", "mv.refreshInterval"="10min")
AS SELECT id FROM students;

View File

@ -0,0 +1,7 @@
-- DROP MATERIALIZED VIEW [ IF EXISTS ] view_identifier
DROP MATERIALIZED VIEW mv;
DROP MATERIALIZED VIEW userDB.mv;
DROP MATERIALIZED VIEW IF EXISTS mv;

View File

@ -0,0 +1,7 @@
-- OPTIMIZE view_identifier [ WHERE where_expression ] ZORDER BY col1, col2...
OPTIMIZE students ZORDER BY id, name;
OPTIMIZE userDB.students ZORDER BY id, name;
OPTIMIZE students WHERE id=1 ZORDER BY id, name;

View File

@ -15,3 +15,9 @@ REFRESH FUNCTION db1.func1;
REFRESH TABLE tbl1;
REFRESH TABLE tempDB.view1;
-- REFRESH MATERIALIZED VIEW view_identifier
REFRESH MATERIALIZED VIEW mv;
REFRESH MATERIALIZED VIEW userDB.mv;

View File

@ -99,3 +99,29 @@ SHOW VIEWS IN global_temp;
SHOW VIEWS FROM default LIKE 'sam*';
SHOW VIEWS LIKE 'sam|suj|temp*';
-- SHOW MATERIALIZED VIEWS [ { FROM | IN } database_name ] [ LIKE? regex_pattern ];
SHOW MATERIALIZED VIEWS;
SHOW MATERIALIZED VIEWS IN userdb;
SHOW MATERIALIZED VIEWS FROM userdb;
SHOW MATERIALIZED VIEWS LIKE 'test_view1|test_view2';
SHOW MATERIALIZED VIEWS IN userdb LIKE 'test_view1|test_view2';
SHOW MATERIALIZED VIEWS FROM userdb LIKE 'test_view1|test_view2';
SHOW MATERIALIZED VIEWS "test_*";
SHOW MATERIALIZED VIEWS IN userdb "test_*";
-- SHOW CREATE MATERIALIZED VIEW view_identifier [ AS SERDE ];
SHOW CREATE MATERIALIZED VIEW mv;
SHOW CREATE MATERIALIZED VIEW userdb.mv;
SHOW CREATE MATERIALIZED VIEW mv AS SERDE;