Refactor/basic parser (#182)

* feat: replace errorCollector with ParserErrorListener to collect lexer errors

* refactor: remove unused method in basicParser

* feat: correct splitSQLByStatement method

* feat: rename parse to parseWithCache and add new parse method

* refactor: rename parserTree to parseTree

* test: rename parserTree to parseTree

* refactor: rename parserError to parseError

* feat: export ErrorHandler type

* feat: use errorHandler from params

* test: basic parser unit tests

* style: lint via prettier
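
A rough usage sketch of the reworked API described above (illustrative only — the dialect class name, package entry point, and SQL text are assumptions, not part of this diff):

import { HiveSQL } from 'dt-sql-parser'; // entry point and class name assumed
import type { ParseError, SyntaxError, ErrorHandler } from 'dt-sql-parser';
import type { Token } from 'antlr4ts';

const hiveSql = new HiveSQL();

// validate() runs the cached parse and returns the collected ParseError list.
const errors: ParseError[] = hiveSql.validate('SELECT FROM tb1;');

// The new public parse() builds a fresh parse tree and reports both lexer and
// parser errors through the optional ErrorHandler callback.
const onError: ErrorHandler<Token> = (err: ParseError, original: SyntaxError<Token>) => {
    console.log(`${err.startLine}:${err.startCol} ${err.message}`);
};
const parseTree = hiveSql.parse('SELECT id FROM tb1;', onError);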
Hayden
2023-10-16 17:59:28 +08:00
committed by GitHub
parent 2f1325d4fc
commit f9dbd9fc23
22 changed files with 321 additions and 207 deletions


@@ -17,4 +17,4 @@ export * from './lib/trinosql/TrinoSqlVisitor';
export { SyntaxContextType } from './parser/common/basic-parser-types';
export type * from './parser/common/basic-parser-types';
export type { SyntaxError, ParserError } from './parser/common/parserErrorListener';
export type { SyntaxError, ParseError, ErrorHandler } from './parser/common/parseErrorListener';


@@ -71,10 +71,13 @@ export interface Suggestions<T = WordRange> {
}
export interface TextSlice {
/** start at 0 */
startIndex: number;
endIndex: number;
/** start at 1 */
startLine: number;
endLine: number;
/** start at 1 */
startColumn: number;
endColumn: number;
text: string;

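For clarity on the mixed coordinate conventions above (0-based character offsets, 1-based lines and columns), a small hedged sketch; the relative import path and whether endIndex/endColumn are inclusive are assumptions:

import type { TextSlice } from './basic-parser-types'; // path assumed relative to this file

const input = 'SELECT 1 FROM tb1;';
const slice: TextSlice = {
    startIndex: 0,   // 0-based offset into the raw input string
    endIndex: 17,    // assumed inclusive end offset
    startLine: 1,    // 1-based
    endLine: 1,
    startColumn: 1,  // 1-based
    endColumn: 18,   // assumed inclusive
    text: 'SELECT 1 FROM tb1;',
};

// With inclusive offsets, the statement text can be recovered from the input:
const recovered = input.slice(slice.startIndex, slice.endIndex + 1); // 'SELECT 1 FROM tb1;'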

@@ -17,11 +17,7 @@ import {
WordRange,
TextSlice,
} from './basic-parser-types';
import ParserErrorListener, {
ParserError,
ErrorHandler,
ParserErrorCollector,
} from './parserErrorListener';
import ParseErrorListener, { ParseError, ErrorHandler } from './parseErrorListener';
interface IParser<IParserRuleContext extends ParserRuleContext> extends Parser {
// Customized in our parser
@@ -40,13 +36,19 @@ export default abstract class BasicParser<
PRC extends ParserRuleContext = ParserRuleContext,
P extends IParser<PRC> = IParser<PRC>,
> {
/** members for cache start */
protected _charStreams: CodePointCharStream;
protected _lexer: L;
protected _tokenStream: CommonTokenStream;
protected _parser: P;
protected _parserTree: PRC;
protected _errorCollector: ParserErrorCollector = new ParserErrorCollector();
protected _parseTree: PRC;
protected _parsedInput: string = null;
protected _parseErrors: ParseError[] = [];
/** members for cache end */
private _errorHandler: ErrorHandler<any> = (error) => {
this._parseErrors.push(error);
};
/**
* PreferredRules for antlr4-c3
@@ -54,7 +56,7 @@ export default abstract class BasicParser<
protected abstract preferredRules: Set<number>;
/**
* Create a antrl4 Lexer instance
* Create an antlr4 Lexer instance.
* @param input source string
*/
protected abstract createLexerFormCharStream(charStreams: CodePointCharStream): L;
@@ -70,8 +72,7 @@ export default abstract class BasicParser<
* @param candidates candidate list
* @param allTokens all tokens from input
* @param caretTokenIndex tokenIndex of caretPosition
* @param tokenIndexOffset offset of the tokenIndex in the candidates
* compared to the tokenIndex in allTokens
* @param tokenIndexOffset offset of the tokenIndex in the candidates compared to the tokenIndex in allTokens
*/
protected abstract processCandidates(
candidates: CandidatesCollection,
@@ -86,38 +87,68 @@ export default abstract class BasicParser<
protected abstract get splitListener(): SplitListener;
/**
* Create an anltr4 lexer from input.
* Create an antlr4 lexer from input.
* @param input string
*/
public createLexer(input: string) {
public createLexer(input: string, errorListener?: ErrorHandler<any>) {
const charStreams = CharStreams.fromString(input.toUpperCase());
const lexer = this.createLexerFormCharStream(charStreams);
if (errorListener) {
lexer.removeErrorListeners();
lexer.addErrorListener(new ParseErrorListener(errorListener));
}
return lexer;
}
/**
* Create an anltr4 parser from input.
* Create an antlr4 parser from input.
* @param input string
*/
public createParser(input: string) {
const lexer = this.createLexer(input);
public createParser(input: string, errorListener?: ErrorHandler<any>) {
const lexer = this.createLexer(input, errorListener);
const tokenStream = new CommonTokenStream(lexer);
const parser = this.createParserFromTokenStream(tokenStream);
if (errorListener) {
parser.removeErrorListeners();
parser.addErrorListener(new ParseErrorListener(errorListener));
}
return parser;
}
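
A hedged sketch of the new optional errorListener parameter on createLexer/createParser: when supplied, the default console listeners are removed and a ParseErrorListener wrapping the callback is attached. The dialect class, entry point, and SQL text are assumptions:

import { HiveSQL } from 'dt-sql-parser'; // entry point and class name assumed
import type { ParseError } from 'dt-sql-parser';

const hiveSql = new HiveSQL();

const lexerErrors: ParseError[] = [];
// Tokenizing now reports problems through the callback rather than the console.
const lexer = hiveSql.createLexer('SELECT * FORM tb1;', (err) => lexerErrors.push(err));

const parserErrors: ParseError[] = [];
const parser = hiveSql.createParser('SELECT * FORM tb1;', (err) => parserErrors.push(err));
parser.program(); // syntax errors from both lexer and parser land in parserErrors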
/**
* Create an anltr4 parser from input.
* Parse input string and return parseTree.
* @param input string
* @param errorListener listens for parse errors and lexer errors.
* @returns parseTree
*/
public parse(input: string, errorListener?: ErrorHandler<any>) {
const parser = this.createParser(input, errorListener);
parser.buildParseTree = true;
return parser.program();
}
/**
* Create an antlr4 parser from input.
* And the instances will be cached.
* @param input string
*/
protected createParserWithCache(input: string): P {
this._parserTree = null;
private createParserWithCache(input: string): P {
this._parseTree = null;
this._charStreams = CharStreams.fromString(input.toUpperCase());
this._lexer = this.createLexerFormCharStream(this._charStreams);
this._lexer.removeErrorListeners();
this._lexer.addErrorListener(new ParseErrorListener(this._errorHandler));
this._tokenStream = new CommonTokenStream(this._lexer);
/**
* All tokens are generated in advance.
* This can cause performance degradation, but it seems necessary for now,
* because the tokens will be used multiple times.
*/
this._tokenStream.fill();
this._parser = this.createParserFromTokenStream(this._tokenStream);
@@ -132,28 +163,23 @@ export default abstract class BasicParser<
* unless the errorListener parameter is passed.
* @param input source string
* @param errorListener listen errors
* @returns parserTree
* @returns parseTree
*/
public parse(input: string, errorListener?: ErrorHandler<any>) {
private parseWithCache(input: string, errorListener?: ErrorHandler<any>) {
// Avoid parsing the same input repeatedly.
if (this._parsedInput === input && !errorListener) {
return this._parserTree;
return this._parseTree;
}
this._parseErrors = [];
const parser = this.createParserWithCache(input);
this._parsedInput = input;
parser.removeErrorListeners();
this._errorCollector.clear();
parser.addErrorListener(new ParseErrorListener(this._errorHandler));
parser.addErrorListener(this._errorCollector);
if (errorListener) {
parser.addErrorListener(new ParserErrorListener(errorListener));
}
this._parseTree = parser.program();
this._parserTree = parser.program();
return this._parserTree;
return this._parseTree;
}
/**
@@ -161,51 +187,34 @@ export default abstract class BasicParser<
* @param input source string
* @returns syntax errors
*/
public validate(input: string): ParserError[] {
this.parse(input);
const lexerError = [];
return lexerError.concat(this._errorCollector.parserErrors);
public validate(input: string): ParseError[] {
this.parseWithCache(input);
return this._parseErrors;
}
/**
* Get all Tokens of input string'<EOF>' is not included
* Get all Tokens of the input string; '<EOF>' is not included.
* @param input source string
* @returns Token[]
*/
public getAllTokens(input: string): Token[] {
this.parse(input);
this.parseWithCache(input);
let allTokens = this._tokenStream.getTokens();
if (allTokens[allTokens.length - 1].text === '<EOF>') {
allTokens = allTokens.slice(0, -1);
}
return allTokens;
}
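
A short usage sketch of getAllTokens; the returned objects are antlr4ts Tokens, so the usual text/line/charPositionInLine accessors apply (the instance and SQL text below are assumptions):

// Assuming `hiveSql` is an instance of a BasicParser subclass (e.g. new HiveSQL()).
const tokens = hiveSql.getAllTokens('SELECT id FROM tb1;');
tokens.forEach((token) => {
    // token is an antlr4ts Token; '<EOF>' has already been stripped above
    console.log(token.text, token.line, token.charPositionInLine);
});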
/**
* It convert tree to string, it's convenient to use in unit test.
* @param string input
*/
public parserTreeToString(input: string): string {
this.parse(input);
return this._parserTree.toStringTree(this._parser.ruleNames);
}
/**
* Get List-like style tree string
* @param parserTree ProgramRuleContext
*/
public toString(parserTree: PRC): string {
return parserTree.toStringTree(this._parser.ruleNames);
}
/**
* @param listener Listener instance extends ParserListener
* @param parserTree parser Tree
* @param parseTree parser Tree
*/
public listen<PTL extends ParseTreeListener = ParseTreeListener>(
listener: PTL,
parserTree: PRC
parseTree: PRC
) {
ParseTreeWalker.DEFAULT.walk(listener, parserTree);
ParseTreeWalker.DEFAULT.walk(listener, parseTree);
}
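
For illustration, a custom listener driven through listen(); the generated HiveSqlParserListener, its export location, and the rule method used below (enterTableName) are assumptions based on the Hive grammar referenced elsewhere in this diff:

import { HiveSQL } from 'dt-sql-parser'; // entry point and class name assumed
import type { HiveSqlParserListener } from 'dt-sql-parser'; // assumed export of the generated listener

const hiveSql = new HiveSQL();

class TableCollector implements HiveSqlParserListener {
    tables: string[] = [];
    // Generated antlr4ts listener methods are optional, so only the one we need is defined.
    enterTableName = (ctx: any) => {
        this.tables.push(ctx.text);
    };
}

const collector = new TableCollector();
const tree = hiveSql.parse('SELECT id FROM db.tb1;');
hiveSql.listen(collector, tree); // walks the tree with ParseTreeWalker.DEFAULT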
/**
@@ -214,9 +223,15 @@ export default abstract class BasicParser<
* @param input source string
*/
public splitSQLByStatement(input): TextSlice[] {
this.parse(input);
const errors = this.validate(input);
if (errors.length) {
return null;
}
const splitListener = this.splitListener;
this.listen(splitListener, this._parserTree);
// TODO: add splitListener to all sqlParser implementations and remove the following if
if (!splitListener) return null;
this.listen(splitListener, this._parseTree);
const res = splitListener.statementsContext.map((context) => {
const { start, stop } = context;
@@ -248,11 +263,11 @@ export default abstract class BasicParser<
// TODO: add splitListener to all sqlParser implementations and remove the following if
if (!splitListener) return null;
this.parse(input);
this.parseWithCache(input);
let sqlParserIns = this._parser;
const allTokens = this.getAllTokens(input);
let caretTokenIndex = findCaretTokenIndex(caretPosition, allTokens);
let c3Context: ParserRuleContext = this._parserTree;
let c3Context: ParserRuleContext = this._parseTree;
let tokenIndexOffset: number = 0;
if (!caretTokenIndex && caretTokenIndex !== 0) return null;
@@ -261,7 +276,7 @@ export default abstract class BasicParser<
* Split sql by statement.
* Try to collect candidates from the caret statement only.
*/
this.listen(splitListener, this._parserTree);
this.listen(splitListener, this._parseTree);
// If there are multiple statements.
if (splitListener.statementsContext.length > 1) {
@@ -282,7 +297,7 @@ export default abstract class BasicParser<
/**
* If caretStatementContext is not found and it follows all statements.
* Reparses part of the input following the penultimate statement.
* And c3 will collect candidates in the new parserTreeContext.
* And c3 will collect candidates in the new parseTreeContext.
*/
if (caretTokenIndex > lastStatementToken?.tokenIndex) {
/**
@@ -295,11 +310,14 @@ export default abstract class BasicParser<
const inputSlice = input.slice(lastStatementToken.startIndex);
const lexer = this.createLexer(inputSlice);
lexer.removeErrorListeners();
const tokenStream = new CommonTokenStream(lexer);
tokenStream.fill();
const parser = this.createParserFromTokenStream(tokenStream);
parser.removeErrorListeners();
parser.buildParseTree = true;
sqlParserIns = parser;
c3Context = parser.program();
}

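With the splitSQLByStatement change above, the method first validates the input and returns null when there are syntax errors or when the dialect has no splitListener yet; a minimal hedged sketch (instance and SQL text assumed):

// Assuming `hiveSql` is an instance of a BasicParser subclass (e.g. new HiveSQL()).
const slices = hiveSql.splitSQLByStatement('SELECT 1 FROM tb1; SELECT 2 FROM tb2;');
if (slices) {
    // One TextSlice per statement, with character offsets plus line/column ranges.
    slices.forEach((s) => console.log(s.text, `${s.startLine}-${s.endLine}`));
} else {
    // null: the input failed validation, or this dialect has no splitListener yet.
}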

@@ -0,0 +1,72 @@
import { Token, Recognizer, ANTLRErrorListener, RecognitionException } from 'antlr4ts';
import { ATNSimulator } from 'antlr4ts/atn/ATNSimulator';
/**
* Converted from {@link SyntaxError}.
*/
export interface ParseError {
startLine: number;
endLine: number;
startCol: number;
endCol: number;
message: string;
}
/**
* The type of error produced during lexing and parsing.
*/
export interface SyntaxError<T> {
recognizer: Recognizer<T, ATNSimulator>;
offendingSymbol: Token;
line: number;
charPositionInLine: number;
msg: string;
e: RecognitionException;
}
/**
* ErrorHandler is invoked whenever a parsing error is encountered,
* including both lexer errors and parser errors.
*/
export type ErrorHandler<T> = (parseError: ParseError, originalError: SyntaxError<T>) => void;
export default class ParseErrorListener implements ANTLRErrorListener<Token> {
private _errorHandler;
constructor(errorListener: ErrorHandler<Token>) {
this._errorHandler = errorListener;
}
syntaxError(
recognizer: Recognizer<Token, ATNSimulator>,
offendingSymbol,
line: number,
charPositionInLine: number,
msg: string,
e: RecognitionException
) {
let endCol = charPositionInLine + 1;
if (offendingSymbol && offendingSymbol.text !== null) {
endCol = charPositionInLine + offendingSymbol.text.length;
}
if (this._errorHandler) {
this._errorHandler(
{
startLine: line,
endLine: line,
startCol: charPositionInLine,
endCol: endCol,
message: msg,
},
{
e,
line,
msg,
recognizer,
offendingSymbol,
charPositionInLine,
}
);
}
}
}

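To show how the two callback arguments relate (a sketch only; the relative import path is assumed from the caller's location): ParseError carries simplified positions, while the original SyntaxError still exposes the offending token and recognizer.

import type { Token } from 'antlr4ts';
import type { ErrorHandler } from './parseErrorListener'; // path assumed

const collect: ErrorHandler<Token> = (parseError, originalError) => {
    console.log(`line ${parseError.startLine}, col ${parseError.startCol}: ${parseError.message}`);
    // The raw ANTLR context is still available for deeper inspection.
    console.log(originalError.offendingSymbol?.text, originalError.charPositionInLine);
};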

@@ -1,106 +0,0 @@
import { Token, Recognizer, ParserErrorListener, RecognitionException } from 'antlr4ts';
import { ATNSimulator } from 'antlr4ts/atn/ATNSimulator';
export interface ParserError {
startLine: number;
endLine: number;
startCol: number;
endCol: number;
message: string;
}
export interface SyntaxError<T> {
recognizer: Recognizer<T, ATNSimulator>;
offendingSymbol: Token;
line: number;
charPositionInLine: number;
msg: string;
e: RecognitionException;
}
export type ErrorHandler<T> = (err: ParserError, errOption: SyntaxError<T>) => void;
export class ParserErrorCollector implements ParserErrorListener {
private _parseErrors: ParserError[] = [];
private _syntaxErrors: SyntaxError<Token>[] = [];
syntaxError(
recognizer: Recognizer<Token, ATNSimulator>,
offendingSymbol: Token,
line: number,
charPositionInLine: number,
msg: string,
e: RecognitionException
) {
let endCol = charPositionInLine + 1;
if (offendingSymbol && offendingSymbol.text !== null) {
endCol = charPositionInLine + (offendingSymbol.text?.length ?? 0);
}
this._parseErrors.push({
startLine: line,
endLine: line,
startCol: charPositionInLine,
endCol: endCol,
message: msg,
});
this._syntaxErrors.push({
e,
line,
msg,
recognizer,
offendingSymbol,
charPositionInLine,
});
}
clear() {
this._parseErrors = [];
this._syntaxErrors = [];
}
get parserErrors() {
return this._parseErrors;
}
}
export default class CustomParserErrorListener implements ParserErrorListener {
private _errorHandler;
constructor(errorListener: ErrorHandler<Token>) {
this._errorHandler = errorListener;
}
syntaxError(
recognizer: Recognizer<Token, ATNSimulator>,
offendingSymbol: Token,
line: number,
charPositionInLine: number,
msg: string,
e: RecognitionException
) {
let endCol = charPositionInLine + 1;
if (offendingSymbol && offendingSymbol.text !== null) {
endCol = charPositionInLine + offendingSymbol.text.length;
}
if (this._errorHandler) {
this._errorHandler(
{
startLine: line,
endLine: line,
startCol: charPositionInLine,
endCol: endCol,
message: msg,
},
{
e,
line,
msg,
recognizer,
offendingSymbol,
charPositionInLine,
}
);
}
}
}


@@ -1,13 +1,7 @@ import { Token } from 'antlr4ts';
import { Token } from 'antlr4ts';
import { CandidatesCollection } from 'antlr4-c3';
import { HiveSqlLexer } from '../lib/hive/HiveSqlLexer';
import {
HiveSqlParser,
ProgramContext,
StatementContext,
ExplainStatementContext,
ExecStatementContext,
} from '../lib/hive/HiveSqlParser';
import { HiveSqlParser, ProgramContext, StatementContext } from '../lib/hive/HiveSqlParser';
import BasicParser from './common/basicParser';
import { HiveSqlParserListener } from '../lib/hive/HiveSqlParserListener';
import { SyntaxContextType, Suggestions, SyntaxSuggestion } from './common/basic-parser-types';