mirror of
https://github.com/pdemian/human2regex.git
synced 2025-05-16 12:30:09 -07:00
Updated tokenizer
This commit is contained in:
parent
44838b8a43
commit
c5db6fa986
@ -1,6 +1,6 @@
|
|||||||
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
||||||
|
|
||||||
import { Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain";
|
import { Lexer, IToken, createTokenInstance, ILexingResult, ILexingError } from "chevrotain";
|
||||||
import { last, findLastIndex } from "./utilities";
|
import { last, findLastIndex } from "./utilities";
|
||||||
import { Indent, Outdent, EndOfLine, AllTokens } from "./tokens";
|
import { Indent, Outdent, EndOfLine, AllTokens } from "./tokens";
|
||||||
|
|
||||||
@ -48,10 +48,20 @@ export class Human2RegexLexer {
|
|||||||
this.lexer = new Lexer(AllTokens, { ensureOptimizations: true });
|
this.lexer = new Lexer(AllTokens, { ensureOptimizations: true });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds the lexing error record used to report unexpected indentation.
 *
 * @param token - token at which the offending indentation starts; its
 *                start offset/line/column position the error.
 * @returns a lexing error with the fixed message "Unexpected indentation found".
 *          `line`, `column` and `length` are `NaN` when the token lacks the
 *          corresponding position information.
 */
private lex_error(token: IToken) : ILexingError {
    return {
        offset: token.startOffset,
        line: token.startLine ?? NaN,
        column: token.startColumn ?? NaN,
        // `-` binds tighter than `??`, so the fallback has to be parenthesised.
        // Without the parentheses this evaluated to `endOffset ?? (NaN - startOffset)`,
        // i.e. the raw end offset instead of the span length.
        // NOTE(review): assumes the caller stores an exclusive end offset
        // (startOffset + length) on the token before calling - confirm.
        length: (token.endOffset ?? NaN) - token.startOffset,
        message: "Unexpected indentation found"
    };
}
|
||||||
|
|
||||||
public tokenize(text: string) : ILexingResult {
|
public tokenize(text: string) : ILexingResult {
|
||||||
const lexResult = this.lexer.tokenize(text);
|
const lexResult = this.lexer.tokenize(text);
|
||||||
|
|
||||||
if (lexResult.tokens.length == 0) {
|
if (lexResult.tokens.length === 0) {
|
||||||
return lexResult;
|
return lexResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -66,6 +76,8 @@ export class Human2RegexLexer {
|
|||||||
let hadIndents = false;
|
let hadIndents = false;
|
||||||
|
|
||||||
for (let i = 0; i < lexResult.tokens.length; i++) {
|
for (let i = 0; i < lexResult.tokens.length; i++) {
|
||||||
|
|
||||||
|
// EoL? check for indents next (by setting startOfLine = true)
|
||||||
if (lexResult.tokens[i].tokenType === EndOfLine) {
|
if (lexResult.tokens[i].tokenType === EndOfLine) {
|
||||||
startOfLine = true;
|
startOfLine = true;
|
||||||
tokens.push(lexResult.tokens[i]);
|
tokens.push(lexResult.tokens[i]);
|
||||||
@ -77,20 +89,22 @@ export class Human2RegexLexer {
|
|||||||
const start_token = lexResult.tokens[i];
|
const start_token = lexResult.tokens[i];
|
||||||
let length = lexResult.tokens[i].image.length;
|
let length = lexResult.tokens[i].image.length;
|
||||||
|
|
||||||
while (lexResult.tokens[i+1].tokenType === Indent) {
|
// grab all the indents (and their length)
|
||||||
|
while (lexResult.tokens.length > i && lexResult.tokens[i+1].tokenType === Indent) {
|
||||||
currIndentLevel++;
|
currIndentLevel++;
|
||||||
i++;
|
i++;
|
||||||
length += lexResult.tokens[i].image.length;
|
length += lexResult.tokens[i].image.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!startOfLine || (currIndentLevel > last(indentStack) + 1)) {
|
start_token.endOffset = start_token.startOffset + length;
|
||||||
lexResult.errors.push({
|
|
||||||
offset: start_token.startOffset,
|
// are we an empty line?
|
||||||
line: start_token.startLine ?? NaN,
|
if (lexResult.tokens.length > i && lexResult.tokens[i+1].tokenType === EndOfLine) {
|
||||||
column: start_token.startColumn ?? NaN,
|
// Ignore all indents AND newline
|
||||||
length: length,
|
// continue;
|
||||||
message: "Unexpected indentation found"
|
}
|
||||||
});
|
else if (!startOfLine || (currIndentLevel > last(indentStack) + 1)) {
|
||||||
|
lexResult.errors.push(this.lex_error(start_token));
|
||||||
}
|
}
|
||||||
else if (currIndentLevel > last(indentStack)) {
|
else if (currIndentLevel > last(indentStack)) {
|
||||||
indentStack.push(currIndentLevel);
|
indentStack.push(currIndentLevel);
|
||||||
@ -100,13 +114,7 @@ export class Human2RegexLexer {
|
|||||||
const index = findLastIndex(indentStack, currIndentLevel);
|
const index = findLastIndex(indentStack, currIndentLevel);
|
||||||
|
|
||||||
if (index < 0) {
|
if (index < 0) {
|
||||||
lexResult.errors.push({
|
lexResult.errors.push(this.lex_error(start_token));
|
||||||
offset: start_token.startOffset,
|
|
||||||
line: start_token.startLine ?? NaN,
|
|
||||||
column: start_token.startColumn ?? NaN,
|
|
||||||
length: length,
|
|
||||||
message: "Unexpected indentation found"
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
const numberOfDedents = indentStack.length - index - 1;
|
const numberOfDedents = indentStack.length - index - 1;
|
||||||
@ -119,12 +127,14 @@ export class Human2RegexLexer {
|
|||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// same indent level: don't care
|
// same indent level: don't care
|
||||||
|
// continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if(startOfLine && !hadIndents) {
|
if(startOfLine && !hadIndents) {
|
||||||
const tok = lexResult.tokens[i];
|
const tok = lexResult.tokens[i];
|
||||||
|
|
||||||
|
//add remaining Outdents
|
||||||
while (indentStack.length > 1) {
|
while (indentStack.length > 1) {
|
||||||
indentStack.pop();
|
indentStack.pop();
|
||||||
tokens.push(createTokenInstance(Outdent, "", tok.startOffset, tok.startOffset, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN));
|
tokens.push(createTokenInstance(Outdent, "", tok.startOffset, tok.startOffset, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user