1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

Updated tokenizer

This commit is contained in:
Patrick Demian 2020-10-27 14:26:54 -04:00
parent 44838b8a43
commit c5db6fa986

View File

@ -1,6 +1,6 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
import { Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain"; import { Lexer, IToken, createTokenInstance, ILexingResult, ILexingError } from "chevrotain";
import { last, findLastIndex } from "./utilities"; import { last, findLastIndex } from "./utilities";
import { Indent, Outdent, EndOfLine, AllTokens } from "./tokens"; import { Indent, Outdent, EndOfLine, AllTokens } from "./tokens";
@ -48,10 +48,20 @@ export class Human2RegexLexer {
this.lexer = new Lexer(AllTokens, { ensureOptimizations: true }); this.lexer = new Lexer(AllTokens, { ensureOptimizations: true });
} }
/**
 * Builds the lexing error reported when unexpected indentation is found.
 *
 * @param token - token the error is anchored to; its start offset, start line
 *                and start column are copied into the error as-is.
 * @returns a lexing error with the message "Unexpected indentation found".
 *          `line`, `column` and `length` are NaN when the corresponding
 *          token field is undefined.
 */
private lex_error(token: IToken) : ILexingError {
    // The parentheses matter: `??` binds more loosely than `-`, so without them
    // the expression is `token.endOffset ?? (NaN - token.startOffset)` and the
    // length would be the raw end offset whenever `endOffset` is defined.
    const length = (token.endOffset ?? NaN) - token.startOffset;

    return {
        offset: token.startOffset,
        line: token.startLine ?? NaN,
        column: token.startColumn ?? NaN,
        length: length,
        message: "Unexpected indentation found"
    };
}
public tokenize(text: string) : ILexingResult { public tokenize(text: string) : ILexingResult {
const lexResult = this.lexer.tokenize(text); const lexResult = this.lexer.tokenize(text);
if (lexResult.tokens.length == 0) { if (lexResult.tokens.length === 0) {
return lexResult; return lexResult;
} }
@ -66,6 +76,8 @@ export class Human2RegexLexer {
let hadIndents = false; let hadIndents = false;
for (let i = 0; i < lexResult.tokens.length; i++) { for (let i = 0; i < lexResult.tokens.length; i++) {
// EoL? check for indents next (by setting startOfLine = true)
if (lexResult.tokens[i].tokenType === EndOfLine) { if (lexResult.tokens[i].tokenType === EndOfLine) {
startOfLine = true; startOfLine = true;
tokens.push(lexResult.tokens[i]); tokens.push(lexResult.tokens[i]);
@ -77,20 +89,22 @@ export class Human2RegexLexer {
const start_token = lexResult.tokens[i]; const start_token = lexResult.tokens[i];
let length = lexResult.tokens[i].image.length; let length = lexResult.tokens[i].image.length;
while (lexResult.tokens[i+1].tokenType === Indent) { // grab all the indents (and their length)
while (lexResult.tokens.length > i && lexResult.tokens[i+1].tokenType === Indent) {
currIndentLevel++; currIndentLevel++;
i++; i++;
length += lexResult.tokens[i].image.length; length += lexResult.tokens[i].image.length;
} }
if (!startOfLine || (currIndentLevel > last(indentStack) + 1)) { start_token.endOffset = start_token.startOffset + length;
lexResult.errors.push({
offset: start_token.startOffset, // are we an empty line?
line: start_token.startLine ?? NaN, if (lexResult.tokens.length > i && lexResult.tokens[i+1].tokenType === EndOfLine) {
column: start_token.startColumn ?? NaN, // Ignore all indents AND newline
length: length, // continue;
message: "Unexpected indentation found" }
}); else if (!startOfLine || (currIndentLevel > last(indentStack) + 1)) {
lexResult.errors.push(this.lex_error(start_token));
} }
else if (currIndentLevel > last(indentStack)) { else if (currIndentLevel > last(indentStack)) {
indentStack.push(currIndentLevel); indentStack.push(currIndentLevel);
@ -100,13 +114,7 @@ export class Human2RegexLexer {
const index = findLastIndex(indentStack, currIndentLevel); const index = findLastIndex(indentStack, currIndentLevel);
if (index < 0) { if (index < 0) {
lexResult.errors.push({ lexResult.errors.push(this.lex_error(start_token));
offset: start_token.startOffset,
line: start_token.startLine ?? NaN,
column: start_token.startColumn ?? NaN,
length: length,
message: "Unexpected indentation found"
});
} }
else { else {
const numberOfDedents = indentStack.length - index - 1; const numberOfDedents = indentStack.length - index - 1;
@ -119,12 +127,14 @@ export class Human2RegexLexer {
} }
else { else {
// same indent level: don't care // same indent level: don't care
// continue;
} }
} }
else { else {
if(startOfLine && !hadIndents) { if(startOfLine && !hadIndents) {
const tok = lexResult.tokens[i]; const tok = lexResult.tokens[i];
//add remaining Outdents
while (indentStack.length > 1) { while (indentStack.length > 1) {
indentStack.pop(); indentStack.pop();
tokens.push(createTokenInstance(Outdent, "", tok.startOffset, tok.startOffset, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN)); tokens.push(createTokenInstance(Outdent, "", tok.startOffset, tok.startOffset, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN));