feat(spark): support materialized view for spark sql (#262)

* feat(spark): support materialized view for spark sql

* fix(spark): code review update

* fix(spark): update spark materialized view and zorder grammar

* test(spark): add syntaxSuggestion test of materialized view

---------

Co-authored-by: jialan <jialan@dtstack.com>
This commit is contained in:
JackWang032
2024-02-26 17:25:19 +08:00
committed by GitHub
parent 081ff7f067
commit 5ce89cb421
26 changed files with 10156 additions and 9101 deletions

View File

@ -46,4 +46,22 @@ SELECT id, n FROM tbl GROUP BY ;
SELECT id, n FROM tbl ORDER BY name, i ;
SELECT id FROM tb1 GROUP BY ROLLUP( );
SELECT id FROM tb1 GROUP BY ROLLUP( );
CREATE MATERIALIZED VIEW db.mv;
DROP MATERIALIZED VIEW db.mv;
ALTER MATERIALIZED VIEW db.mv;
REFRESH MATERIALIZED VIEW db.mv;
SHOW CREATE MATERIALIZED VIEW db.mv;
SHOW MATERIALIZED VIEWS from db;
OPTIMIZE db.tb;
OPTIMIZE db.tb ZORDER BY ;
OPTIMIZE db.tb ZORDER BY name, i;

View File

@ -442,4 +442,157 @@ describe('Spark SQL Syntax Suggestion', () => {
expect(suggestion).not.toBeUndefined();
expect(suggestion?.wordRanges.map((token) => token.text)).toEqual([]);
});
test('Create materialized view', () => {
    // Caret sits right after the qualified view name on the CREATE statement line.
    const caretPosition: CaretPosition = { lineNumber: 51, column: 31 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.VIEW_CREATE
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['db', '.', 'mv']);
});
test('Drop materialized view', () => {
    // Caret sits right after the qualified view name on the DROP statement line.
    const caretPosition: CaretPosition = { lineNumber: 53, column: 29 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.VIEW
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['db', '.', 'mv']);
});
test('Alter materialized view', () => {
    // Caret sits right after the qualified view name on the ALTER statement line.
    const caretPosition: CaretPosition = { lineNumber: 55, column: 30 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.VIEW
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['db', '.', 'mv']);
});
test('Refresh materialized view', () => {
    // Caret sits right after the qualified view name on the REFRESH statement line.
    const caretPosition: CaretPosition = { lineNumber: 57, column: 32 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.VIEW
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['db', '.', 'mv']);
});
test('Show create materialized view', () => {
    // Caret sits right after the qualified view name on the SHOW CREATE statement line.
    const caretPosition: CaretPosition = { lineNumber: 59, column: 36 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.VIEW
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['db', '.', 'mv']);
});
test('Show all materialized from database', () => {
    // Caret sits right after the database name in SHOW MATERIALIZED VIEWS FROM db.
    const caretPosition: CaretPosition = { lineNumber: 61, column: 32 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.DATABASE
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['db']);
});
test('Optimize table', () => {
    // Caret sits right after the qualified table name on the OPTIMIZE statement line.
    const caretPosition: CaretPosition = { lineNumber: 63, column: 15 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.TABLE
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['db', '.', 'tb']);
});
test('Optimize table zorder by empty', () => {
    // Caret sits after ZORDER BY with no column typed yet, so no word ranges are expected.
    const caretPosition: CaretPosition = { lineNumber: 65, column: 26 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.COLUMN
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual([]);
});
test('Optimize table zorder by columns', () => {
    // Caret sits inside the second ZORDER BY column ('i'), which becomes the word range.
    const caretPosition: CaretPosition = { lineNumber: 67, column: 33 };
    const sqlWithOthersCommented = commentOtherLine(syntaxSql, caretPosition.lineNumber);
    const syntaxes = parser.getSuggestionAtCaretPosition(
        sqlWithOthersCommented,
        caretPosition
    )?.syntax;
    const suggestion = syntaxes?.find(
        (item) => item.syntaxContextType === SyntaxContextType.COLUMN
    );
    expect(suggestion).not.toBeUndefined();
    expect(suggestion?.wordRanges.map((t) => t.text)).toEqual(['i']);
});
});

View File

@ -19,7 +19,15 @@ describe('Spark SQL Token Suggestion', () => {
pos
)?.keywords;
expect(suggestion).toEqual(['TABLE', 'INDEX', 'VIEW', 'DATABASE', 'NAMESPACE', 'SCHEMA']);
expect(suggestion).toEqual([
'TABLE',
'INDEX',
'VIEW',
'MATERIALIZED',
'DATABASE',
'NAMESPACE',
'SCHEMA',
]);
});
test('After CREATE', () => {
@ -36,6 +44,7 @@ describe('Spark SQL Token Suggestion', () => {
'TEMPORARY',
'INDEX',
'ROLE',
'MATERIALIZED',
'FUNCTION',
'OR',
'GLOBAL',
@ -102,6 +111,7 @@ describe('Spark SQL Token Suggestion', () => {
'INDEX',
'ROLE',
'FUNCTION',
'MATERIALIZED',
'VIEW',
'TABLE',
'DATABASE',
@ -157,6 +167,7 @@ describe('Spark SQL Token Suggestion', () => {
'PRINCIPALS',
'ROLE',
'GRANT',
'MATERIALIZED',
'CATALOGS',
'FUNCTIONS',
'ALL',

View File

@ -4,12 +4,13 @@ import { readSQL } from 'test/helper';
const parser = new SparkSQL();
const features = {
alertDatabase: readSQL(__dirname, 'alertDatabase.sql'),
alertTable: readSQL(__dirname, 'alertTable.sql'),
alertView: readSQL(__dirname, 'alertView.sql'),
alterDatabase: readSQL(__dirname, 'alterDatabase.sql'),
altertTable: readSQL(__dirname, 'alterTable.sql'),
alterView: readSQL(__dirname, 'alterView.sql'),
alterMaterializedView: readSQL(__dirname, 'alterMaterializedView.sql'),
};
describe('SparkSQL Alert Syntax Tests', () => {
describe('SparkSQL Alter Syntax Tests', () => {
Object.keys(features).forEach((key) => {
features[key].forEach((sql) => {
it(sql, () => {

View File

@ -7,6 +7,7 @@ const features = {
createDatabase: readSQL(__dirname, 'createDatabase.sql'),
createFunction: readSQL(__dirname, 'createFunction.sql'),
createView: readSQL(__dirname, 'createView.sql'),
createMaterializedView: readSQL(__dirname, 'createMaterializedView.sql'),
};
describe('SparkSQL Create Syntax Tests', () => {

View File

@ -8,6 +8,7 @@ const features = {
dropFunction: readSQL(__dirname, 'dropFunction.sql'),
dropTable: readSQL(__dirname, 'dropTable.sql'),
dropView: readSQL(__dirname, 'dropView.sql'),
dropMaterializedView: readSQL(__dirname, 'dropMaterializedView.sql'),
};
describe('SparkSQL Drop Syntax Tests', () => {

View File

@ -0,0 +1,13 @@
-- Test fixtures: each statement below is parsed by the alter-syntax suite and must validate cleanly.
-- ALTER MATERIALIZED VIEW view_identifier ENABLE|DISABLE REWRITE;
ALTER MATERIALIZED VIEW mv ENABLE REWRITE;
ALTER MATERIALIZED VIEW userDB.mv ENABLE REWRITE;
ALTER MATERIALIZED VIEW mv DISABLE REWRITE;
-- ALTER MATERIALIZED VIEW view_identifier SET TBLPROPERTIES ( property_name=property_value, ... );
ALTER MATERIALIZED VIEW mv SET TBLPROPERTIES ('mv.enableAutoRefresh'='true', 'mv.refreshInterval'='10min');
ALTER MATERIALIZED VIEW userDB.mv SET TBLPROPERTIES ('mv.enableAutoRefresh'='true', 'mv.refreshInterval'='10min');

View File

@ -0,0 +1,99 @@
/**
** Notes:
** 1. MATERIALIZED VIEW syntax has not been officially supported by Spark yet.
** 2. The support for the following syntax is based on the self-developed component of dtstack.
**/
-- CREATE MATERIALIZED VIEW [ IF NOT EXISTS ] view_identifier
-- [ USING data_source ]
-- [ OPTIONS ( key1=val1, key2=val2, ... ) ]
-- [ PARTITIONED BY ( col_name1, col_name2, ... ) ]
-- [ SKEWED BY ( col_name, col_name, ... )
-- ON ( ( col_value, col_value, ... ), ( col_value, col_value, ... ), ... )
-- [ STORED AS DIRECTORIES ] ]
-- [ CLUSTERED BY ( col_name3, col_name4, ... )
-- [ SORTED BY ( col_name [ ASC | DESC ], ... ) ]
-- INTO num_buckets BUCKETS ]
-- [ ROW FORMAT row_format ]
-- [ [ STORED AS file_format ]
-- | STORED BY 'storage.handler.class.name' [ WITH SERDEPROPERTIES (...) ] ) ]
-- [ LOCATION hdfs_path ]
-- [ COMMENT table_comment ]
-- [ TBLPROPERTIES (property_name=property_value, ...) ]
-- AS select_statement;
CREATE MATERIALIZED VIEW mv AS SELECT id FROM students;
CREATE MATERIALIZED VIEW userDB.mv AS SELECT id FROM students;
CREATE MATERIALIZED VIEW IF NOT EXISTS mv AS SELECT id FROM students;
-- Use data source
CREATE MATERIALIZED VIEW mv USING CSV AS SELECT id FROM students;
-- Use parquet data source with parquet storage options
CREATE MATERIALIZED VIEW mv
USING PARQUET
OPTIONS (
'parquet.bloom.filter.enabled'='true',
'parquet.bloom.filter.enabled#age'='false'
)
AS SELECT id, age FROM students;
CREATE MATERIALIZED VIEW mv
PARTITIONED BY (id)
AS SELECT id FROM students;
CREATE MATERIALIZED VIEW mv
SKEWED BY (id) ON (1,5,6)
AS SELECT id FROM students;
CREATE MATERIALIZED VIEW mv
SKEWED BY (id) ON (1,5,6) STORED AS DIRECTORIES
AS SELECT id FROM students;
-- Create bucketed materialized view
CREATE MATERIALIZED VIEW mv
CLUSTERED BY (id) SORTED BY (id) INTO 3 BUCKETS
AS SELECT id FROM students;
-- Use row format
CREATE MATERIALIZED VIEW mv
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
"input.regex" = ".*"
)
STORED AS TEXTFILE
AS SELECT id FROM students;
-- Use file format with 'stored as'
CREATE MATERIALIZED VIEW mv
STORED AS TEXTFILE
AS SELECT id FROM students;
-- Use file format with 'stored by'
CREATE MATERIALIZED VIEW mv
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
"hbase.columns.mapping" = "cf:string",
"hbase.table.name" = "hbase_table_0"
)
AS SELECT id FROM students;
-- Specify view storage path
CREATE MATERIALIZED VIEW mv
STORED AS PARQUET
LOCATION 'hdfs://mv/'
AS SELECT id FROM students;
-- Add mv comment
CREATE MATERIALIZED VIEW mv
STORED AS PARQUET
LOCATION 'hdfs://mv/'
COMMENT 'A materialized view'
AS SELECT id FROM students;
-- Set refresh properties
CREATE MATERIALIZED VIEW mv
TBLPROPERTIES("mv.enableAutoRefresh"="true", "mv.refreshInterval"="10min")
AS SELECT id FROM students;

View File

@ -0,0 +1,7 @@
-- Test fixtures: each statement below is parsed by the drop-syntax suite and must validate cleanly.
-- DROP MATERIALIZED VIEW [ IF EXISTS ] view_identifier
DROP MATERIALIZED VIEW mv;
DROP MATERIALIZED VIEW userDB.mv;
DROP MATERIALIZED VIEW IF EXISTS mv;

View File

@ -0,0 +1,7 @@
-- Test fixtures: each statement below is parsed by the optimize-syntax suite and must validate cleanly.
-- OPTIMIZE view_identifier [ WHERE where_expression ] ZORDER BY col1, col2...
OPTIMIZE students ZORDER BY id, name;
OPTIMIZE userDB.students ZORDER BY id, name;
OPTIMIZE students WHERE id=1 ZORDER BY id, name;

View File

@ -15,3 +15,9 @@ REFRESH FUNCTION db1.func1;
REFRESH TABLE tbl1;
REFRESH TABLE tempDB.view1;
-- REFRESH MATERIALIZED VIEW view_identifier
REFRESH MATERIALIZED VIEW mv;
REFRESH MATERIALIZED VIEW userDB.mv;

View File

@ -99,3 +99,29 @@ SHOW VIEWS IN global_temp;
SHOW VIEWS FROM default LIKE 'sam*';
SHOW VIEWS LIKE 'sam|suj|temp*';
-- SHOW MATERIALIZED VIEWS [ { FROM | IN } database_name ] [ LIKE? regex_pattern ];
SHOW MATERIALIZED VIEWS;
SHOW MATERIALIZED VIEWS IN userdb;
SHOW MATERIALIZED VIEWS FROM userdb;
SHOW MATERIALIZED VIEWS LIKE 'test_view1|test_view2';
SHOW MATERIALIZED VIEWS IN userdb LIKE 'test_view1|test_view2';
SHOW MATERIALIZED VIEWS FROM userdb LIKE 'test_view1|test_view2';
SHOW MATERIALIZED VIEWS "test_*";
SHOW MATERIALIZED VIEWS IN userdb "test_*";
-- SHOW CREATE MATERIALIZED VIEW view_identifier [ AS SERDE ];
SHOW CREATE MATERIALIZED VIEW mv;
SHOW CREATE MATERIALIZED VIEW userdb.mv;
SHOW CREATE MATERIALIZED VIEW mv AS SERDE;

View File

@ -0,0 +1,16 @@
import SparkSQL from 'src/parser/spark';
import { readSQL } from 'test/helper';

const parser = new SparkSQL();

// Fixture statements for the OPTIMIZE ... ZORDER BY grammar.
// NOTE: key renamed from the copy-paste leftover 'set' to 'optimize'
// to match the fixture file it loads.
const features = {
    optimize: readSQL(__dirname, 'optimize.sql'),
};

describe('Spark Optimize Syntax Tests', () => {
    // Each fixture statement must parse without producing any validation errors.
    features.optimize.forEach((itemSql) => {
        it(itemSql, () => {
            expect(parser.validate(itemSql).length).toBe(0);
        });
    });
});