lava-oushudb-dt-sql-parser/src/parser/common/basicParser.ts
Hayden 3c7c59fb70
feat: optimize suggestion (#231)
* feat: optimize the strategy of finding the right range

* test: apply commentOtherLine util to all suggestion tests

* test: decomment suggestion test cases

* test: add suggestion test cases in multiple statements

* chore: improve comments

* test: update log info in test
2023-12-13 11:33:47 +08:00

import {
    Parser,
    Lexer,
    Token,
    CharStreams,
    CommonTokenStream,
    CodePointCharStream,
    ParserRuleContext,
} from 'antlr4ts';
import { ParseTreeWalker, ParseTreeListener } from 'antlr4ts/tree';
import { CandidatesCollection, CodeCompletionCore } from 'antlr4-c3';
import { findCaretTokenIndex } from './utils/findCaretTokenIndex';
import {
    CaretPosition,
    Suggestions,
    SyntaxSuggestion,
    WordRange,
    TextSlice,
} from './basic-parser-types';
import ParseErrorListener, { ParseError, ErrorListener } from './parseErrorListener';
import { ErrorStrategy } from './errorStrategy';

interface IParser<IParserRuleContext extends ParserRuleContext> extends Parser {
    // Customized in our parser
    program(): IParserRuleContext;
}

interface SplitListener extends ParseTreeListener {
    statementsContext: ParserRuleContext[];
}

/**
 * Custom Parser class. Subclasses need to extend it.
 *
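 * A minimal sketch of a concrete subclass (illustrative only — `MyLexer`,
 * `MyParser`, and `MyProgramContext` stand for hypothetical ANTLR4-generated
 * classes, not members of this repo):
 * @example
 * class MySQL extends BasicParser<MyLexer, MyProgramContext, MyParser> {
 *     protected preferredRules = new Set<number>();
 *     protected get splitListener() { return null; } // no split support yet
 *     protected createLexerFormCharStream(cs: CodePointCharStream) {
 *         return new MyLexer(cs);
 *     }
 *     protected createParserFromTokenStream(ts: CommonTokenStream) {
 *         return new MyParser(ts);
 *     }
 *     protected processCandidates(): Suggestions<Token> {
 *         return { syntax: [], keywords: [] };
 *     }
 * }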
 */
export default abstract class BasicParser<
    L extends Lexer = Lexer,
    PRC extends ParserRuleContext = ParserRuleContext,
    P extends IParser<PRC> = IParser<PRC>,
> {
    /** members for cache start */
    protected _charStreams: CodePointCharStream;
    protected _lexer: L;
    protected _tokenStream: CommonTokenStream;
    protected _parser: P;
    protected _parseTree: PRC;
    protected _parsedInput: string = null;
    protected _parseErrors: ParseError[] = [];
    /** members for cache end */

    private _errorListener: ErrorListener<any> = (error) => {
        this._parseErrors.push(error);
    };

    /**
     * PreferredRules for antlr4-c3
     */
    protected abstract preferredRules: Set<number>;

    /**
     * Create an antlr4 Lexer instance from a char stream.
     * @param charStreams char stream of the source string
     */
    protected abstract createLexerFormCharStream(charStreams: CodePointCharStream): L;

    /**
     * Create a Parser from a CommonTokenStream.
     * @param tokenStream CommonTokenStream
     */
    protected abstract createParserFromTokenStream(tokenStream: CommonTokenStream): P;

    /**
     * Convert candidates to suggestions.
     * @param candidates candidate list
     * @param allTokens all tokens from the input
     * @param caretTokenIndex token index of the caret position
     * @param tokenIndexOffset offset of a token index in the candidates
     * compared to the corresponding token index in allTokens
     */
    protected abstract processCandidates(
        candidates: CandidatesCollection,
        allTokens: Token[],
        caretTokenIndex: number,
        tokenIndexOffset: number
    ): Suggestions<Token>;

    /**
     * Get splitListener instance.
     */
    protected abstract get splitListener(): SplitListener;

    /**
     * Create an antlr4 lexer from input.
     * @param input source string
     * @param errorListener listener for lexer errors
     */
    public createLexer(input: string, errorListener?: ErrorListener<any>) {
        const charStreams = CharStreams.fromString(input.toUpperCase());
        const lexer = this.createLexerFormCharStream(charStreams);
        if (errorListener) {
            lexer.removeErrorListeners();
            lexer.addErrorListener(new ParseErrorListener(errorListener));
        }
        return lexer;
    }

    /**
     * Create an antlr4 parser from input.
     * @param input source string
     * @param errorListener listener for parser errors
     */
    public createParser(input: string, errorListener?: ErrorListener<any>) {
        const lexer = this.createLexer(input, errorListener);
        const tokenStream = new CommonTokenStream(lexer);
        const parser = this.createParserFromTokenStream(tokenStream);
        if (errorListener) {
            parser.removeErrorListeners();
            parser.addErrorListener(new ParseErrorListener(errorListener));
        }
        return parser;
    }

    /**
     * Parse the input string and return the parseTree.
     * @param input source string
     * @param errorListener listener for parser and lexer errors
     * @returns parseTree
     */
    public parse(input: string, errorListener?: ErrorListener<any>) {
        const parser = this.createParser(input, errorListener);
        parser.buildParseTree = true;
        parser.errorHandler = new ErrorStrategy();
        return parser.program();
    }

    /**
     * Create an antlr4 parser from input.
     * The instances will be cached.
     * @param input source string
     */
    private createParserWithCache(input: string): P {
        this._parseTree = null;
        this._charStreams = CharStreams.fromString(input.toUpperCase());
        this._lexer = this.createLexerFormCharStream(this._charStreams);
        this._lexer.removeErrorListeners();
        this._lexer.addErrorListener(new ParseErrorListener(this._errorListener));
        this._tokenStream = new CommonTokenStream(this._lexer);
        /**
         * All tokens are generated in advance.
         * This can cause performance degradation, but it seems necessary for now,
         * because the tokens will be used multiple times.
         */
        this._tokenStream.fill();
        this._parser = this.createParserFromTokenStream(this._tokenStream);
        this._parser.buildParseTree = true;
        this._parser.errorHandler = new ErrorStrategy();
        return this._parser;
    }

    /**
     * If invoked repeatedly with the same input, this method skips reparsing
     * and returns the cached parsing result directly,
     * unless the errorListener parameter is passed.
     * @param input source string
     * @param errorListener listener for errors
     * @returns parseTree
     */
    private parseWithCache(input: string, errorListener?: ErrorListener<any>) {
        // Avoid parsing the same input repeatedly.
        if (this._parsedInput === input && !errorListener) {
            return this._parseTree;
        }
        this._parseErrors = [];
        const parser = this.createParserWithCache(input);
        this._parsedInput = input;
        parser.removeErrorListeners();
        parser.addErrorListener(new ParseErrorListener(this._errorListener));
        this._parseTree = parser.program();
        return this._parseTree;
    }

    /**
     * Validate the input string and return syntax errors, if any.
     * @param input source string
     * @returns syntax errors
     */
    public validate(input: string): ParseError[] {
        this.parseWithCache(input);
        return this._parseErrors;
    }

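    /*
     * Usage sketch (illustrative; `FlinkSQL` is an assumed concrete subclass):
     *
     *     const parser = new FlinkSQL();
     *     const errors = parser.validate('SELECT id FROM;'); // non-empty: syntax error
     *     const ok = parser.validate('SELECT id FROM tb1;'); // []
     */
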
    /**
     * Get all tokens of the input string; '<EOF>' is not included.
     * @param input source string
     * @returns Token[]
     */
    public getAllTokens(input: string): Token[] {
        this.parseWithCache(input);
        let allTokens = this._tokenStream.getTokens();
        if (allTokens[allTokens.length - 1].text === '<EOF>') {
            allTokens = allTokens.slice(0, -1);
        }
        return allTokens;
    }

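    /*
     * Usage sketch (illustrative): the input is uppercased before lexing
     * (see createLexer), so token text comes back uppercased.
     *
     *     const tokens = parser.getAllTokens('select id from tb1;');
     *     // e.g. tokens[0].text === 'SELECT'
     */
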
    /**
     * Walk the parseTree with a listener.
     * @param listener listener instance extending ParseTreeListener
     * @param parseTree parse tree
     */
    public listen<PTL extends ParseTreeListener = ParseTreeListener>(
        listener: PTL,
        parseTree: PRC
    ) {
        ParseTreeWalker.DEFAULT.walk(listener, parseTree);
    }

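    /*
     * Usage sketch (illustrative; `enterTableName` stands for an assumed
     * enter-rule callback generated from a concrete grammar):
     *
     *     const tree = parser.parse('SELECT id FROM tb1;');
     *     const listener = { enterTableName: (ctx) => console.log(ctx.text) };
     *     parser.listen(listener as ParseTreeListener, tree);
     */
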
    /**
     * Split the input into statements.
     * If a syntax error exists, it returns null.
     * @param input source string
     */
    public splitSQLByStatement(input: string): TextSlice[] {
        const errors = this.validate(input);
        if (errors.length) {
            return null;
        }
        const splitListener = this.splitListener;
        // TODO: add splitListener to all sqlParser implementations and remove the following if
        if (!splitListener) return null;

        this.listen(splitListener, this._parseTree);

        const res = splitListener.statementsContext.map((context) => {
            const { start, stop } = context;
            return {
                startIndex: start.startIndex,
                endIndex: stop.stopIndex,
                startLine: start.line,
                endLine: stop.line,
                startColumn: start.charPositionInLine + 1,
                endColumn: stop.charPositionInLine + stop.text.length,
                text: this._parsedInput.slice(start.startIndex, stop.stopIndex + 1),
            };
        });
        return res;
    }

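    /*
     * Usage sketch (illustrative):
     *
     *     const slices = parser.splitSQLByStatement('SHOW TABLES; SELECT 1;');
     *     // slices?.length === 2; each slice carries the statement text plus
     *     // its start/end line, column, and character indexes
     */
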
    /**
     * Get suggestions of syntax and tokens at the caret position.
     * @param input source string
     * @param caretPosition caret position, such as the cursor position
     * @returns suggestions
     */
    public getSuggestionAtCaretPosition(
        input: string,
        caretPosition: CaretPosition
    ): Suggestions | null {
        const splitListener = this.splitListener;
        // TODO: add splitListener to all sqlParser implementations and remove the following if
        if (!splitListener) return null;

        this.parseWithCache(input);
        let sqlParserIns = this._parser;
        const allTokens = this.getAllTokens(input);
        let caretTokenIndex = findCaretTokenIndex(caretPosition, allTokens);
        let c3Context: ParserRuleContext = this._parseTree;
        let tokenIndexOffset: number = 0;

        if (!caretTokenIndex && caretTokenIndex !== 0) return null;

        /**
         * Split sql by statement.
         * Try to collect candidates in as small a range as possible.
         */
        this.listen(splitListener, this._parseTree);
        const statementCount = splitListener.statementsContext?.length;
        const statementsContext = splitListener.statementsContext;

        // If there are multiple statements.
        if (statementCount > 1) {
            /**
             * Find a minimum valid range, reparse the fragment, and provide a new parse tree to C3.
             * The boundaries of this range must be statements with no syntax errors.
             * This can ensure the stable performance of the C3.
             */
            let startStatement: ParserRuleContext;
            let stopStatement: ParserRuleContext;

            for (let index = 0; index < statementCount; index++) {
                const ctx = statementsContext[index];
                const isCurrentCtxValid = !ctx.exception;
                if (!isCurrentCtxValid) continue;

                /**
                 * Ensure that the statementContext before the left boundary
                 * and the last statementContext on the right boundary are qualified SQL statements.
                 */
                const isPrevCtxValid = index === 0 || !statementsContext[index - 1]?.exception;
                const isNextCtxValid =
                    index === statementCount - 1 || !statementsContext[index + 1]?.exception;

                if (ctx.stop.tokenIndex < caretTokenIndex && isPrevCtxValid) {
                    startStatement = ctx;
                }
                if (!stopStatement && ctx.start.tokenIndex > caretTokenIndex && isNextCtxValid) {
                    stopStatement = ctx;
                    break;
                }
            }

            // A boundary consisting of the index of the input.
            const startIndex = startStatement?.start?.startIndex ?? 0;
            const stopIndex = stopStatement?.stop?.stopIndex ?? input.length - 1;

            /**
             * Save the offset of the tokenIndex in the range of input
             * compared to the tokenIndex in the whole input.
             */
            tokenIndexOffset = startStatement?.start?.tokenIndex ?? 0;
            caretTokenIndex = caretTokenIndex - tokenIndexOffset;
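            // For example (hypothetical numbers): if the fragment starts at token
            // index 12 of the full input and the caret sits at token index 15,
            // c3 sees the caret at index 3 within the reparsed fragment.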

            /**
             * Reparse the input fragment,
             * and c3 will collect candidates in the newly generated parseTree.
             */
            const inputSlice = input.slice(startIndex, stopIndex);

            const lexer = this.createLexer(inputSlice);
            lexer.removeErrorListeners();
            const tokenStream = new CommonTokenStream(lexer);
            tokenStream.fill();

            const parser = this.createParserFromTokenStream(tokenStream);
            parser.removeErrorListeners();
            parser.buildParseTree = true;
            parser.errorHandler = new ErrorStrategy();

            sqlParserIns = parser;
            c3Context = parser.program();
        }

        const core = new CodeCompletionCore(sqlParserIns);
        core.preferredRules = this.preferredRules;

        const candidates = core.collectCandidates(caretTokenIndex, c3Context);
        const originalSuggestions = this.processCandidates(
            candidates,
            allTokens,
            caretTokenIndex,
            tokenIndexOffset
        );

        const syntaxSuggestions: SyntaxSuggestion<WordRange>[] = originalSuggestions.syntax.map(
            (syntaxCtx) => {
                const wordRanges: WordRange[] = syntaxCtx.wordRanges.map((token) => {
                    return {
                        text: this._parsedInput.slice(token.startIndex, token.stopIndex + 1),
                        startIndex: token.startIndex,
                        stopIndex: token.stopIndex,
                        line: token.line,
                        startColumn: token.charPositionInLine + 1,
                        stopColumn: token.charPositionInLine + token.text.length,
                    };
                });
                return {
                    syntaxContextType: syntaxCtx.syntaxContextType,
                    wordRanges,
                };
            }
        );

        return {
            syntax: syntaxSuggestions,
            keywords: originalSuggestions.keywords,
        };
    }
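
    /*
     * Usage sketch (illustrative; assumes CaretPosition carries a 1-based
     * lineNumber and column, and that the concrete subclass implements
     * processCandidates):
     *
     *     const suggestions = parser.getSuggestionAtCaretPosition(
     *         'SELECT id FROM ',
     *         { lineNumber: 1, column: 16 }
     *     );
     *     // suggestions?.keywords: candidate keywords at the caret
     *     // suggestions?.syntax: syntax contexts, e.g. a table name is expected
     */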
}