lava-oushudb-dt-sql-parser/src/grammar/spark/SparkSqlLexer.g4
JackWang032 5ce89cb421
feat(spark): support materialized view for spark sql (#262)
* feat(spark): support materialized view for spark sql

* fix(spark): code review update

* fix(spark): update spark  materilized view and zorder grammar

* test(spark): add syntaxSuggestion test of materialized view

---------

Co-authored-by: jialan <jialan@dtstack.com>
2024-02-26 17:25:19 +08:00

484 lines
16 KiB
ANTLR

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/**
* This file is an adaptation of spark's spark/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4 grammar.
* Reference: https://github.com/apache/spark/blob/master/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseLexer.g4
*/
// $antlr-format alignTrailingComments true, columnLimit 150, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine true, allowShortBlocksOnASingleLine true, minEmptyLines 0, alignSemicolons none, alignColons trailing
// $antlr-format singleLineOverrulesHangingColon true, alignLexerCommands true, alignLabels true, alignTrailers true
// $antlr-format spaceBeforeAssignmentOperators false, groupedAlignments true
lexer grammar SparkSqlLexer;
@members {
/**
* When true, parser should throw ParseException for unclosed bracketed comment.
*/
public has_unclosed_bracketed_comment = false;
/**
* This method will be called when the character stream ends and try to find out the
* unclosed bracketed comment.
* If the method be called, it means the end of the entire character stream match,
* and we set the flag and fail later.
*/
public markUnclosedComment() {
this.has_unclosed_bracketed_comment = true;
}
}
SEMICOLON: ';';
LEFT_PAREN : '(';
RIGHT_PAREN : ')';
COMMA : ',';
DOT : '.';
LEFT_BRACKET : '[';
RIGHT_BRACKET : ']';
// NOTE: If you add a new token in the list below, you should update the list of keywords
// and reserved tag in `docs/sql-ref-ansi-compliance.md#sql-keywords`, and
// modify `ParserUtils.toExprAlias()` which assumes all keywords are between `ADD` and `ZONE`.
//============================
// Start of the keywords list
//============================
//--SPARK-KEYWORD-LIST-START
KW_ADD : 'ADD';
KW_AFTER : 'AFTER';
KW_ALL : 'ALL';
KW_ALTER : 'ALTER';
KW_ALWAYS : 'ALWAYS';
KW_ANALYZE : 'ANALYZE';
KW_AND : 'AND';
KW_ANTI : 'ANTI';
KW_ANY : 'ANY';
KW_ANY_VALUE : 'ANY_VALUE';
KW_ARCHIVE : 'ARCHIVE';
KW_ARRAY : 'ARRAY';
KW_AS : 'AS';
KW_ASC : 'ASC';
KW_AT : 'AT';
KW_AUTHORIZATION : 'AUTHORIZATION';
KW_BETWEEN : 'BETWEEN';
KW_BIGINT : 'BIGINT';
KW_BINARY : 'BINARY';
KW_BOOLEAN : 'BOOLEAN';
KW_BOTH : 'BOTH';
KW_BUCKET : 'BUCKET';
KW_BUCKETS : 'BUCKETS';
KW_BY : 'BY';
KW_BYTE : 'BYTE';
KW_CACHE : 'CACHE';
KW_CASCADE : 'CASCADE';
KW_CASE : 'CASE';
KW_CAST : 'CAST';
KW_CATALOG : 'CATALOG';
KW_CATALOGS : 'CATALOGS';
KW_CHANGE : 'CHANGE';
KW_CHAR : 'CHAR';
KW_CHARACTER : 'CHARACTER';
KW_CHECK : 'CHECK';
KW_CLEAR : 'CLEAR';
KW_CLUSTER : 'CLUSTER';
KW_CLUSTERED : 'CLUSTERED';
KW_CODEGEN : 'CODEGEN';
KW_COLLATE : 'COLLATE';
KW_COLLECTION : 'COLLECTION';
KW_COLUMN : 'COLUMN';
KW_COLUMNS : 'COLUMNS';
KW_COMMENT : 'COMMENT';
KW_COMMIT : 'COMMIT';
KW_COMPACT : 'COMPACT';
KW_COMPACTIONS : 'COMPACTIONS';
KW_COMPUTE : 'COMPUTE';
KW_CONCATENATE : 'CONCATENATE';
KW_CONSTRAINT : 'CONSTRAINT';
KW_COST : 'COST';
KW_CREATE : 'CREATE';
KW_CROSS : 'CROSS';
KW_CUBE : 'CUBE';
KW_CURRENT : 'CURRENT';
KW_CURRENT_DATE : 'CURRENT_DATE';
KW_CURRENT_TIME : 'CURRENT_TIME';
KW_CURRENT_TIMESTAMP : 'CURRENT_TIMESTAMP';
KW_CURRENT_USER : 'CURRENT_USER';
KW_DAY : 'DAY';
KW_DAYS : 'DAYS';
KW_DAYOFYEAR : 'DAYOFYEAR';
KW_DATA : 'DATA';
KW_DATE : 'DATE';
KW_DATABASE : 'DATABASE';
KW_DATABASES : 'DATABASES';
KW_DATEADD : 'DATEADD';
KW_DATE_ADD : 'DATE_ADD';
KW_DATEDIFF : 'DATEDIFF';
KW_DATE_DIFF : 'DATE_DIFF';
KW_DBPROPERTIES : 'DBPROPERTIES';
KW_DEC : 'DEC';
KW_DECIMAL : 'DECIMAL';
KW_DECLARE : 'DECLARE';
KW_DEFAULT : 'DEFAULT';
KW_DEFINED : 'DEFINED';
KW_DELETE : 'DELETE';
KW_DELIMITED : 'DELIMITED';
KW_DESC : 'DESC';
KW_DESCRIBE : 'DESCRIBE';
KW_DFS : 'DFS';
KW_DIRECTORIES : 'DIRECTORIES';
KW_DIRECTORY : 'DIRECTORY';
KW_DISABLE : 'DISABLE';
KW_DISTINCT : 'DISTINCT';
KW_DISTRIBUTE : 'DISTRIBUTE';
KW_DIV : 'DIV';
KW_DOUBLE : 'DOUBLE';
KW_DROP : 'DROP';
KW_ELSE : 'ELSE';
KW_ENABLE : 'ENABLE';
KW_END : 'END';
KW_ESCAPE : 'ESCAPE';
KW_ESCAPED : 'ESCAPED';
KW_EXCEPT : 'EXCEPT';
KW_EXCHANGE : 'EXCHANGE';
KW_EXCLUDE : 'EXCLUDE';
KW_EXISTS : 'EXISTS';
KW_EXPLAIN : 'EXPLAIN';
KW_EXPORT : 'EXPORT';
KW_EXTENDED : 'EXTENDED';
KW_EXTERNAL : 'EXTERNAL';
KW_EXTRACT : 'EXTRACT';
KW_FALSE : 'FALSE';
KW_FETCH : 'FETCH';
KW_FIELDS : 'FIELDS';
KW_FILTER : 'FILTER';
KW_FILEFORMAT : 'FILEFORMAT';
KW_FIRST : 'FIRST';
KW_FLOAT : 'FLOAT';
KW_FOLLOWING : 'FOLLOWING';
KW_FOR : 'FOR';
KW_FOREIGN : 'FOREIGN';
KW_FORMAT : 'FORMAT';
KW_FORMATTED : 'FORMATTED';
KW_FROM : 'FROM';
KW_FULL : 'FULL';
KW_FUNCTION : 'FUNCTION';
KW_FUNCTIONS : 'FUNCTIONS';
KW_GENERATED : 'GENERATED';
KW_GLOBAL : 'GLOBAL';
KW_GRANT : 'GRANT';
KW_GROUP : 'GROUP';
KW_GROUPING : 'GROUPING';
KW_HAVING : 'HAVING';
KW_BINARY_HEX : 'X';
KW_HOUR : 'HOUR';
KW_HOURS : 'HOURS';
KW_IDENTIFIER : 'IDENTIFIER';
KW_IF : 'IF';
KW_IGNORE : 'IGNORE';
KW_IMPORT : 'IMPORT';
KW_IN : 'IN';
KW_INCLUDE : 'INCLUDE';
KW_INDEX : 'INDEX';
KW_INDEXES : 'INDEXES';
KW_INNER : 'INNER';
KW_INPATH : 'INPATH';
KW_INPUTFORMAT : 'INPUTFORMAT';
KW_INSERT : 'INSERT';
KW_INTERSECT : 'INTERSECT';
KW_INTERVAL : 'INTERVAL';
KW_INT : 'INT';
KW_INTEGER : 'INTEGER';
KW_INTO : 'INTO';
KW_IS : 'IS';
KW_ITEMS : 'ITEMS';
KW_JOIN : 'JOIN';
KW_KEYS : 'KEYS';
KW_LAST : 'LAST';
KW_LATERAL : 'LATERAL';
KW_LAZY : 'LAZY';
KW_LEADING : 'LEADING';
KW_LEFT : 'LEFT';
KW_LIKE : 'LIKE';
KW_ILIKE : 'ILIKE';
KW_LIMIT : 'LIMIT';
KW_LINES : 'LINES';
KW_LIST : 'LIST';
KW_LOAD : 'LOAD';
KW_LOCAL : 'LOCAL';
KW_LOCATION : 'LOCATION';
KW_LOCK : 'LOCK';
KW_LOCKS : 'LOCKS';
KW_LOGICAL : 'LOGICAL';
KW_LONG : 'LONG';
KW_MACRO : 'MACRO';
KW_MATERIALIZED : 'MATERIALIZED';
KW_MAP : 'MAP';
KW_MATCHED : 'MATCHED';
KW_MERGE : 'MERGE';
KW_MICROSECOND : 'MICROSECOND';
KW_MICROSECONDS : 'MICROSECONDS';
KW_MILLISECOND : 'MILLISECOND';
KW_MILLISECONDS : 'MILLISECONDS';
KW_MINUTE : 'MINUTE';
KW_MINUTES : 'MINUTES';
KW_MONTH : 'MONTH';
KW_MONTHS : 'MONTHS';
KW_MSCK : 'MSCK';
KW_NAME : 'NAME';
KW_NAMESPACE : 'NAMESPACE';
KW_NAMESPACES : 'NAMESPACES';
KW_NANOSECOND : 'NANOSECOND';
KW_NANOSECONDS : 'NANOSECONDS';
KW_NATURAL : 'NATURAL';
KW_NO : 'NO';
KW_NOSCAN : 'NOSCAN';
KW_NOT : 'NOT';
KW_NULL : 'NULL';
KW_NULLS : 'NULLS';
KW_NUMERIC : 'NUMERIC';
KW_OF : 'OF';
KW_OFFSET : 'OFFSET';
KW_ON : 'ON';
KW_ONLY : 'ONLY';
KW_OPTIMIZE : 'OPTIMIZE';
KW_OPTION : 'OPTION';
KW_OPTIONS : 'OPTIONS';
KW_OR : 'OR';
KW_ORDER : 'ORDER';
KW_OUT : 'OUT';
KW_OUTER : 'OUTER';
KW_OUTPUTFORMAT : 'OUTPUTFORMAT';
KW_OVER : 'OVER';
KW_OVERLAPS : 'OVERLAPS';
KW_OVERLAY : 'OVERLAY';
KW_OVERWRITE : 'OVERWRITE';
KW_PARTITION : 'PARTITION';
KW_PARTITIONED : 'PARTITIONED';
KW_PARTITIONS : 'PARTITIONS';
KW_PERCENTILE_CONT : 'PERCENTILE_CONT';
KW_PERCENTILE_DISC : 'PERCENTILE_DISC';
KW_PERCENTLIT : 'PERCENT';
KW_PIVOT : 'PIVOT';
KW_PLACING : 'PLACING';
KW_POSITION : 'POSITION';
KW_PRECEDING : 'PRECEDING';
KW_PRIMARY : 'PRIMARY';
KW_PRINCIPALS : 'PRINCIPALS';
KW_PROPERTIES : 'PROPERTIES';
KW_PURGE : 'PURGE';
KW_QUARTER : 'QUARTER';
KW_QUERY : 'QUERY';
KW_RANGE : 'RANGE';
KW_REAL : 'REAL';
KW_RECORDREADER : 'RECORDREADER';
KW_RECORDWRITER : 'RECORDWRITER';
KW_RECOVER : 'RECOVER';
KW_REDUCE : 'REDUCE';
KW_REFERENCES : 'REFERENCES';
KW_REFRESH : 'REFRESH';
KW_RENAME : 'RENAME';
KW_REPAIR : 'REPAIR';
KW_REPEATABLE : 'REPEATABLE';
KW_REPLACE : 'REPLACE';
KW_RESET : 'RESET';
KW_RESPECT : 'RESPECT';
KW_RESTRICT : 'RESTRICT';
KW_REWRITE : 'REWRITE';
KW_REVOKE : 'REVOKE';
KW_RIGHT : 'RIGHT';
KW_RLIKE : 'RLIKE';
KW_REGEXP : 'REGEXP';
KW_ROLE : 'ROLE';
KW_ROLES : 'ROLES';
KW_ROLLBACK : 'ROLLBACK';
KW_ROLLUP : 'ROLLUP';
KW_ROW : 'ROW';
KW_ROWS : 'ROWS';
KW_SECOND : 'SECOND';
KW_SECONDS : 'SECONDS';
KW_SCHEMA : 'SCHEMA';
KW_SCHEMAS : 'SCHEMAS';
KW_SELECT : 'SELECT';
KW_SEMI : 'SEMI';
KW_SEPARATED : 'SEPARATED';
KW_SERDE : 'SERDE';
KW_SERDEPROPERTIES : 'SERDEPROPERTIES';
KW_SESSION_USER : 'SESSION_USER';
KW_SET : 'SET';
KW_MINUS : 'MINUS';
KW_SETS : 'SETS';
KW_SHORT : 'SHORT';
KW_SHOW : 'SHOW';
KW_SINGLE : 'SINGLE';
KW_SKEWED : 'SKEWED';
KW_SMALLINT : 'SMALLINT';
KW_SOME : 'SOME';
KW_SORT : 'SORT';
KW_SORTED : 'SORTED';
KW_SOURCE : 'SOURCE';
KW_START : 'START';
KW_STATISTICS : 'STATISTICS';
KW_STORED : 'STORED';
KW_STRATIFY : 'STRATIFY';
KW_STRING : 'STRING';
KW_STRUCT : 'STRUCT';
KW_SUBSTR : 'SUBSTR';
KW_SUBSTRING : 'SUBSTRING';
KW_SYNC : 'SYNC';
KW_SYSTEM : 'SYSTEM';
KW_SYSTEM_TIME : 'SYSTEM_TIME';
KW_SYSTEM_VERSION : 'SYSTEM_VERSION';
KW_TABLE : 'TABLE';
KW_TABLES : 'TABLES';
KW_TABLESAMPLE : 'TABLESAMPLE';
KW_TARGET : 'TARGET';
KW_TBLPROPERTIES : 'TBLPROPERTIES';
KW_TEMPORARY : 'TEMPORARY';
KW_TERMINATED : 'TERMINATED';
KW_THEN : 'THEN';
KW_TIME : 'TIME';
KW_TIMEDIFF : 'TIMEDIFF';
KW_TIMESTAMP : 'TIMESTAMP';
KW_TIMESTAMP_LTZ : 'TIMESTAMP_LTZ';
KW_TIMESTAMP_NTZ : 'TIMESTAMP_NTZ';
KW_TIMESTAMPADD : 'TIMESTAMPADD';
KW_TIMESTAMPDIFF : 'TIMESTAMPDIFF';
KW_TINYINT : 'TINYINT';
KW_TO : 'TO';
KW_TOUCH : 'TOUCH';
KW_TRAILING : 'TRAILING';
KW_TRANSACTION : 'TRANSACTION';
KW_TRANSACTIONS : 'TRANSACTIONS';
KW_TRANSFORM : 'TRANSFORM';
KW_TRIM : 'TRIM';
KW_TRUE : 'TRUE';
KW_TRUNCATE : 'TRUNCATE';
KW_TRY_CAST : 'TRY_CAST';
KW_TYPE : 'TYPE';
KW_UNARCHIVE : 'UNARCHIVE';
KW_UNBOUNDED : 'UNBOUNDED';
KW_UNCACHE : 'UNCACHE';
KW_UNION : 'UNION';
KW_UNIQUE : 'UNIQUE';
KW_UNKNOWN : 'UNKNOWN';
KW_UNLOCK : 'UNLOCK';
KW_UNPIVOT : 'UNPIVOT';
KW_UNSET : 'UNSET';
KW_UPDATE : 'UPDATE';
KW_USE : 'USE';
KW_USER : 'USER';
KW_USING : 'USING';
KW_VALUES : 'VALUES';
KW_VARCHAR : 'VARCHAR';
KW_VAR : 'VAR';
KW_VARIABLE : 'VARIABLE';
KW_VERSION : 'VERSION';
KW_VIEW : 'VIEW';
KW_VIEWS : 'VIEWS';
KW_VOID : 'VOID';
KW_WEEK : 'WEEK';
KW_WEEKS : 'WEEKS';
KW_WHEN : 'WHEN';
KW_WHERE : 'WHERE';
KW_WINDOW : 'WINDOW';
KW_WITH : 'WITH';
KW_WITHIN : 'WITHIN';
KW_YEAR : 'YEAR';
KW_YEARS : 'YEARS';
KW_ZONE : 'ZONE';
KW_ZORDER : 'ZORDER';
//--SPARK-KEYWORD-LIST-END
//============================
// End of the keywords list
//============================
EQ : '=' | '==';
NSEQ : '<=>';
NEQ : '<>';
NEQJ : '!=';
LT : '<';
LTE : '<=' | '!>';
GT : '>';
GTE : '>=' | '!<';
NOT : '!';
PLUS : '+';
MINUS : '-';
ASTERISK : '*';
SLASH : '/';
PERCENT : '%';
TILDE : '~';
AMPERSAND : '&';
PIPE : '|';
CONCAT_PIPE : '||';
HAT : '^';
COLON : ':';
ARROW : '->';
FAT_ARROW : '=>';
HENT_START : '/*+';
HENT_END : '*/';
QUESTION : '?';
STRING_LITERAL: '\'' ( ~('\'' | '\\') | ('\\' .))* '\'' | 'R\'' (~'\'')* '\'' | 'R"' (~'"')* '"';
DOUBLEQUOTED_STRING: '"' ( ~('"' | '\\') | ('\\' .))* '"';
// NOTE: If you move a numeric literal, you should modify `ParserUtils.toExprAlias()`
// which assumes all numeric literals are between `BIGINT_LITERAL` and `BIGDECIMAL_LITERAL`.
BIGINT_LITERAL: DIGIT+ 'L';
SMALLINT_LITERAL: DIGIT+ 'S';
TINYINT_LITERAL: DIGIT+ 'Y';
INTEGER_VALUE: DIGIT+;
EXPONENT_VALUE: DIGIT+ EXPONENT | DECIMAL_DIGITS EXPONENT;
DECIMAL_VALUE: DECIMAL_DIGITS;
FLOAT_LITERAL: DIGIT+ EXPONENT? 'F' | DECIMAL_DIGITS EXPONENT? 'F';
DOUBLE_LITERAL: DIGIT+ EXPONENT? 'D' | DECIMAL_DIGITS EXPONENT? 'D';
BIGDECIMAL_LITERAL: DIGIT+ EXPONENT? 'BD' | DECIMAL_DIGITS EXPONENT? 'BD';
IDENTIFIER: (LETTER | DIGIT | '_')+;
BACKQUOTED_IDENTIFIER: '`' ( ~'`' | '``')* '`';
fragment DECIMAL_DIGITS: DIGIT+ '.' DIGIT* | '.' DIGIT+;
fragment EXPONENT: 'E' [+-]? DIGIT+;
fragment DIGIT: [0-9];
fragment LETTER: [A-Za-z];
SIMPLE_COMMENT: '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN);
BRACKETED_COMMENT:
'/*' (BRACKETED_COMMENT | .)*? ('*/' | {this.markUnclosedComment();} EOF) -> channel(HIDDEN);
WS: [ \r\n\t]+ -> channel(HIDDEN);
// Catch-all for anything we can't recognize.
// We use this to be able to ignore and recover all the text
// when splitting statements with DelimiterLexer
UNRECOGNIZED: .;