mirror of
https://github.com/pdemian/human2regex.git
synced 2025-05-16 12:30:09 -07:00
Tokenizer now correctly recognizes Outdents
This commit is contained in:
parent
5e9c185923
commit
44838b8a43
9155
docs/bundle.min.js
vendored
9155
docs/bundle.min.js
vendored
File diff suppressed because one or more lines are too long
@ -29,7 +29,7 @@
|
|||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"build": "webpack --config webpack.full.config.js",
|
"build": "webpack --config webpack.full.config.js",
|
||||||
"partial": "webpack --config webpack.partial.config.js",
|
"partial": "webpack --config webpack.partial.config.js",
|
||||||
"test": "echo \"Error: no test specified\" && exit 1"
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
},
|
},
|
||||||
"keywords": [
|
"keywords": [
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
||||||
|
|
||||||
import { CstParser } from "chevrotain";
|
import { CstParser } from "chevrotain";
|
||||||
import * as T from "./tokenizer";
|
import * as T from "./tokens";
|
||||||
|
|
||||||
export class Human2RegexParser extends CstParser {
|
export class Human2RegexParser extends CstParser {
|
||||||
constructor() {
|
constructor() {
|
||||||
|
@ -2,8 +2,7 @@
|
|||||||
|
|
||||||
import "./style.css";
|
import "./style.css";
|
||||||
|
|
||||||
import { Human2RegexLexer } from './tokenizer';
|
import { Human2RegexLexer } from "./tokenizer";
|
||||||
import { Human2RegexParser } from './parser';
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
$(function() {
|
$(function() {
|
||||||
@ -11,7 +10,9 @@ $(function() {
|
|||||||
});
|
});
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const result = Human2RegexLexer.tokenize(`
|
const lexer = new Human2RegexLexer();
|
||||||
|
|
||||||
|
const result = lexer.tokenize(`
|
||||||
// H2R supports // # and /**/ as comments
|
// H2R supports // # and /**/ as comments
|
||||||
// A group is only captured if given a name.
|
// A group is only captured if given a name.
|
||||||
// You can use "and", "or", "not" to specify "[]" regex
|
// You can use "and", "or", "not" to specify "[]" regex
|
||||||
@ -52,8 +53,16 @@ create an optional group
|
|||||||
match 0+ any thing
|
match 0+ any thing
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
//let str = "";
|
||||||
|
|
||||||
|
//for(const r of result.tokens) {
|
||||||
|
// str += r.tokenType === Newline ? "\n" : r.image + " ";
|
||||||
|
//}
|
||||||
|
|
||||||
|
//console.log(str);
|
||||||
|
|
||||||
for(const r of result.tokens) {
|
for(const r of result.tokens) {
|
||||||
console.log(r);
|
console.log(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(result.errors);
|
console.log(result.errors);
|
413
src/tokenizer.ts
413
src/tokenizer.ts
@ -1,179 +1,8 @@
|
|||||||
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
||||||
|
|
||||||
import { createToken, Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain";
|
import { Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain";
|
||||||
|
import { last, findLastIndex } from "./utilities";
|
||||||
export const Zero = createToken({name: "Zero", pattern: /zero/i });
|
import { Indent, Outdent, EndOfLine, AllTokens } from "./tokens";
|
||||||
export const One = createToken({name: "One", pattern: /one/i });
|
|
||||||
export const Two = createToken({name: "Two", pattern: /two/i });
|
|
||||||
export const Three = createToken({name: "Three", pattern: /three/i });
|
|
||||||
export const Four = createToken({name: "Four", pattern: /four/i });
|
|
||||||
export const Five = createToken({name: "Five", pattern: /five/i });
|
|
||||||
export const Six = createToken({name: "Six", pattern: /six/i });
|
|
||||||
export const Seven = createToken({name: "Seven", pattern: /seven/i });
|
|
||||||
export const Eight = createToken({name: "Eight", pattern: /eight/i });
|
|
||||||
export const Nine = createToken({name: "Nine", pattern: /nine/i });
|
|
||||||
export const Ten = createToken({name: "Ten", pattern: /ten/i });
|
|
||||||
|
|
||||||
export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i });
|
|
||||||
export const Match = createToken({name: "Match", pattern: /match(es)?/i });
|
|
||||||
export const Then = createToken({name: "Then", pattern: /then/i });
|
|
||||||
export const Anything = createToken({name: "Anything", pattern: /(any thing|any|anything)(s)?/i});
|
|
||||||
export const Of = createToken({name: "Of", pattern: /of/i});
|
|
||||||
export const Or = createToken({name: "Or", pattern: /or/i});
|
|
||||||
export const And = createToken({name: "And", pattern: /and|,/i});
|
|
||||||
export const Word = createToken({name: "Word Specifier", pattern: /word(s)?/i});
|
|
||||||
export const Digit = createToken({name: "Digit Specifier", pattern: /digit(s)?/i});
|
|
||||||
export const Character = createToken({name: "Character Specifier", pattern: /character(s)?/i});
|
|
||||||
export const Whitespace = createToken({name: "Whitespace Specifier", pattern: /(white space|whitespace)(s)?/i});
|
|
||||||
export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
|
|
||||||
export const As = createToken({name: "As", pattern: /as/i});
|
|
||||||
export const If = createToken({name: "If", pattern: /if/i});
|
|
||||||
export const Start = createToken({name: "Start", pattern: /start(s)?/i});
|
|
||||||
export const With = createToken({name: "With", pattern: /with/i});
|
|
||||||
export const Ends = createToken({name: "Ends", pattern: /end(s)?/i});
|
|
||||||
export const Otherwise = createToken({name: "Otherwise", pattern: /(other wise|otherwise)/i});
|
|
||||||
export const Else = createToken({name: "Else", pattern: /else/i});
|
|
||||||
export const Unless = createToken({name: "Unless", pattern: /unless/i});
|
|
||||||
export const While = createToken({name: "While", pattern: /while/i});
|
|
||||||
export const More = createToken({name: "More", pattern: /more/i});
|
|
||||||
export const Using = createToken({name: "Using", pattern: /using/i});
|
|
||||||
export const Global = createToken({name: "Global", pattern: /global/i});
|
|
||||||
export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
|
|
||||||
export const Exact = createToken({name: "Exact", pattern: /exact/i});
|
|
||||||
export const Matching = createToken({name: "Matching", pattern: /matching/i});
|
|
||||||
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
|
|
||||||
export const Not = createToken({name: "Not", pattern: /not/i }); //, longer_alt: Nothing});
|
|
||||||
export const Between = createToken({name: "Between", pattern: /between/i});
|
|
||||||
export const Tab = createToken({name: "Tab", pattern: /tab/i});
|
|
||||||
export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
|
|
||||||
export const Group = createToken({name: "Group", pattern: /group/i});
|
|
||||||
export const By = createToken({name: "By", pattern: /by/i});
|
|
||||||
export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
|
|
||||||
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
|
|
||||||
export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
|
|
||||||
export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
|
|
||||||
export const Exclusive = createToken({name: "Exclusive", pattern: /exclusive(ly)?/i});
|
|
||||||
export const From = createToken({name: "From", pattern: /from/i});
|
|
||||||
export const To = createToken({name: "To", pattern: /(to|\-|\.\.|\.\.\.)/i});
|
|
||||||
export const Create = createToken({name: "Create", pattern: /create(s)?/i});
|
|
||||||
export const Called = createToken({name: "Called", pattern: /called/i});
|
|
||||||
export const Repeat = createToken({name: "Repeat", pattern: /repeat(s|ing)?/i});
|
|
||||||
export const Newline = createToken({name: "Newline", pattern: /(new line|newline)/i});
|
|
||||||
export const None = createToken({name: "None", pattern: /none/i});
|
|
||||||
export const Neither = createToken({name: "Neither", pattern: /neither/i});
|
|
||||||
export const CarriageReturn = createToken({name: "Carriage Return", pattern: /carriage return/i});
|
|
||||||
export const CaseInsensitive = createToken({name: "Case Insensitive", pattern: /case insensitive/i});
|
|
||||||
export const CaseSensitive = createToken({name: "Case Sensitive", pattern: /case sensitive/i});
|
|
||||||
export const OrMore = createToken({name: "Or More", pattern: /\+/ });
|
|
||||||
|
|
||||||
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
|
|
||||||
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
|
|
||||||
|
|
||||||
export const EndOfLine = createToken({name: "EOL", pattern: /\n/, group: "nl" });
|
|
||||||
export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
|
|
||||||
export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
|
|
||||||
export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED });
|
|
||||||
|
|
||||||
export const Identifier = createToken({name: "Identifier", pattern: /[a-z]\w*/i });
|
|
||||||
export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ });
|
|
||||||
export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });
|
|
||||||
|
|
||||||
enum IndentBaseType {
|
|
||||||
Indent,
|
|
||||||
Outdent
|
|
||||||
}
|
|
||||||
|
|
||||||
export const Indent = createToken({
|
|
||||||
name: "Indent",
|
|
||||||
start_chars_hint: [ "\t", " " ],
|
|
||||||
pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Indent),
|
|
||||||
// custom token patterns should explicitly specify the line_breaks option
|
|
||||||
line_breaks: false
|
|
||||||
});
|
|
||||||
|
|
||||||
export const Outdent = createToken({
|
|
||||||
name: "Outdent",
|
|
||||||
start_chars_hint: [ "\t", " " ],
|
|
||||||
pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Outdent),
|
|
||||||
// custom token patterns should explicitly specify the line_breaks option
|
|
||||||
line_breaks: false
|
|
||||||
});
|
|
||||||
|
|
||||||
export const AllTokens = [
|
|
||||||
Zero,
|
|
||||||
One,
|
|
||||||
Two,
|
|
||||||
Three,
|
|
||||||
Four,
|
|
||||||
Five,
|
|
||||||
Six,
|
|
||||||
Seven,
|
|
||||||
Eight,
|
|
||||||
Nine,
|
|
||||||
Ten,
|
|
||||||
Optional,
|
|
||||||
Matching,
|
|
||||||
Match,
|
|
||||||
Then,
|
|
||||||
Anything,
|
|
||||||
Of,
|
|
||||||
Or,
|
|
||||||
And,
|
|
||||||
Word,
|
|
||||||
Digit,
|
|
||||||
Character,
|
|
||||||
Whitespace,
|
|
||||||
Number,
|
|
||||||
As,
|
|
||||||
If,
|
|
||||||
Start,
|
|
||||||
With,
|
|
||||||
Ends,
|
|
||||||
Otherwise,
|
|
||||||
Else,
|
|
||||||
Unless,
|
|
||||||
While,
|
|
||||||
More,
|
|
||||||
Using,
|
|
||||||
Global,
|
|
||||||
Multiline,
|
|
||||||
Exact,
|
|
||||||
Nothing,
|
|
||||||
Not,
|
|
||||||
Between,
|
|
||||||
Tab,
|
|
||||||
Linefeed,
|
|
||||||
Group,
|
|
||||||
By,
|
|
||||||
A,
|
|
||||||
The,
|
|
||||||
Exactly,
|
|
||||||
Inclusive,
|
|
||||||
Exclusive,
|
|
||||||
From,
|
|
||||||
Create,
|
|
||||||
Called,
|
|
||||||
Repeat,
|
|
||||||
Newline,
|
|
||||||
None,
|
|
||||||
Neither,
|
|
||||||
CarriageReturn,
|
|
||||||
CaseInsensitive,
|
|
||||||
CaseSensitive,
|
|
||||||
OrMore,
|
|
||||||
To,
|
|
||||||
EndOfLine,
|
|
||||||
Indent,
|
|
||||||
Outdent,
|
|
||||||
WhiteSpace,
|
|
||||||
SingleLineComment,
|
|
||||||
MultilineComment,
|
|
||||||
Identifier,
|
|
||||||
NumberLiteral,
|
|
||||||
StringLiteral,
|
|
||||||
];
|
|
||||||
|
|
||||||
const H2RLexer = new Lexer(AllTokens, { ensureOptimizations: true });
|
|
||||||
|
|
||||||
export enum IndentType {
|
export enum IndentType {
|
||||||
Tabs,
|
Tabs,
|
||||||
@ -188,118 +17,21 @@ export class Human2RegexLexerOptions {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export class Human2RegexLexer {
|
export class Human2RegexLexer {
|
||||||
//Taken and adapted from https://github.com/SAP/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
|
private static already_init = false;
|
||||||
|
|
||||||
// State required for matching the indentations
|
private lexer : Lexer;
|
||||||
private static options = new Human2RegexLexerOptions();
|
|
||||||
private static indentStack = [ 0 ];
|
|
||||||
private static wsRegExp: RegExp;
|
|
||||||
private static spacesPerTab = " ";
|
|
||||||
|
|
||||||
private static findLastIndex<T>(array: T[], predicate: (x: T) => boolean) : number {
|
constructor(private options: Human2RegexLexerOptions = new Human2RegexLexerOptions()) {
|
||||||
for (let index = array.length; index >= 0; index--) {
|
if (Human2RegexLexer.already_init) {
|
||||||
if (predicate(array[index])) {
|
throw new Error("Only 1 instance of Human2RegexLexer allowed");
|
||||||
return index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This custom Token matcher uses Lexer context ("matchedTokens" and "groups" arguments)
|
|
||||||
* combined with state via closure ("indentStack" and "lastTextMatched") to match indentation.
|
|
||||||
*/
|
|
||||||
public static matchIndentBase(text: string, offset: number, matchedTokens: IToken[], groups: {[groupName: string]: IToken[]}, type: IndentBaseType) : RegExpExecArray | null {
|
|
||||||
const noTokensMatchedYet = !matchedTokens.length;
|
|
||||||
const newLines = groups.nl;
|
|
||||||
const noNewLinesMatchedYet = !newLines.length;
|
|
||||||
const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet;
|
|
||||||
const isStartOfLine =
|
|
||||||
// only newlines matched so far
|
|
||||||
(noTokensMatchedYet && !noNewLinesMatchedYet) ||
|
|
||||||
// Both newlines and other Tokens have been matched AND the offset is just after the last newline
|
|
||||||
(!noTokensMatchedYet &&
|
|
||||||
!noNewLinesMatchedYet &&
|
|
||||||
offset === newLines[newLines.length-1].startOffset + 1);
|
|
||||||
|
|
||||||
// indentation can only be matched at the start of a line.
|
|
||||||
if (isFirstLine || isStartOfLine) {
|
|
||||||
let currIndentLevel: number = -1;
|
|
||||||
|
|
||||||
Human2RegexLexer.wsRegExp.lastIndex = offset;
|
|
||||||
const match = Human2RegexLexer.wsRegExp.exec(text);
|
|
||||||
|
|
||||||
// possible non-empty indentation
|
|
||||||
if (match !== null) {
|
|
||||||
currIndentLevel = match[0].length;
|
|
||||||
//if (this.options.type === IndentType.Tabs) {
|
|
||||||
// currIndentLevel = match[0].length;
|
|
||||||
//}
|
|
||||||
//else {
|
|
||||||
// currIndentLevel = match[0].replace(Human2RegexLexer.spacesPerTab, "\t").length;
|
|
||||||
//}
|
|
||||||
}
|
|
||||||
// "empty" indentation means indentLevel of 0.
|
|
||||||
else {
|
|
||||||
currIndentLevel = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
const prevIndentLevel = this.indentStack[this.indentStack.length-1];
|
|
||||||
// deeper indentation
|
|
||||||
if (currIndentLevel > prevIndentLevel && type === IndentBaseType.Indent) {
|
|
||||||
this.indentStack.push(currIndentLevel);
|
|
||||||
return match;
|
|
||||||
}
|
|
||||||
// shallower indentation
|
|
||||||
else if (currIndentLevel < prevIndentLevel && type === IndentBaseType.Outdent) {
|
|
||||||
const matchIndentIndex = this.findLastIndex(this.indentStack, (stackIndentDepth) => stackIndentDepth === currIndentLevel);
|
|
||||||
|
|
||||||
// any outdent must match some previous indentation level.
|
|
||||||
if (matchIndentIndex === -1) {
|
|
||||||
throw Error(`invalid outdent at offset: ${offset}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
const numberOfDedents = this.indentStack.length - matchIndentIndex - 1;
|
|
||||||
|
|
||||||
// This is a little tricky
|
|
||||||
// 1. If there is no match (0 level indent) than this custom token
|
|
||||||
// matcher would return "null" and so we need to add all the required outdents ourselves.
|
|
||||||
// 2. If there was match (> 0 level indent) than we need to add minus one number of outsents
|
|
||||||
// because the lexer would create one due to returning a none null result.
|
|
||||||
const iStart = match !== null ? 1 : 0;
|
|
||||||
for (let i = iStart; i < numberOfDedents; i++) {
|
|
||||||
this.indentStack.pop();
|
|
||||||
matchedTokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
|
|
||||||
}
|
|
||||||
|
|
||||||
// even though we are adding fewer outdents directly we still need to update the indent stack fully.
|
|
||||||
if (iStart === 1) {
|
|
||||||
this.indentStack.pop();
|
|
||||||
}
|
|
||||||
return match;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// same indent, this should be lexed as simple whitespace and ignored
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// indentation cannot be matched under other circumstances
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static tokenize(text: string, options: Human2RegexLexerOptions | null = null) : ILexingResult{
|
|
||||||
// have to reset the indent stack between processing of different text inputs
|
|
||||||
Human2RegexLexer.indentStack = [ 0 ];
|
|
||||||
|
|
||||||
if (options !== null) {
|
|
||||||
Human2RegexLexer.options = this.options;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
Human2RegexLexer.already_init = true;
|
||||||
|
|
||||||
|
let indent_regex: RegExp | null = null;
|
||||||
|
|
||||||
if (this.options.type === IndentType.Tabs) {
|
if (this.options.type === IndentType.Tabs) {
|
||||||
Human2RegexLexer.wsRegExp = /\t/y;
|
indent_regex = /\t/y;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
let reg = ` {${this.options.spaces_per_tab}}`;
|
let reg = ` {${this.options.spaces_per_tab}}`;
|
||||||
@ -308,20 +40,117 @@ export class Human2RegexLexer {
|
|||||||
reg += "|\\t";
|
reg += "|\\t";
|
||||||
}
|
}
|
||||||
|
|
||||||
Human2RegexLexer.wsRegExp = new RegExp(reg, "y");
|
indent_regex = new RegExp(reg, "y");
|
||||||
|
}
|
||||||
|
|
||||||
Human2RegexLexer.spacesPerTab = Array(this.options.spaces_per_tab+1).join(" ");
|
Indent.PATTERN = indent_regex;
|
||||||
}*/
|
|
||||||
Human2RegexLexer.wsRegExp = / +/y;
|
this.lexer = new Lexer(AllTokens, { ensureOptimizations: true });
|
||||||
|
}
|
||||||
const lexResult = H2RLexer.tokenize(text);
|
|
||||||
|
/**
 * Tokenizes `text` and rewrites the raw token stream so that indentation is
 * expressed as Indent / Outdent tokens.
 *
 * The underlying lexer emits one Indent token per indentation unit. This
 * method collapses each run of consecutive Indent tokens into an
 * indentation level, compares it with a stack of open levels, and:
 *   - keeps a single Indent token when the level grows by exactly one,
 *   - synthesizes one Outdent token per level that is closed,
 *   - records an "Unexpected indentation found" lexing error when the
 *     indentation appears mid-line, jumps by more than one level, or does
 *     not match any open level.
 * A trailing EOL token is appended when the input does not end with one,
 * and every level still open at the end of input is closed with an Outdent.
 *
 * @param text the source text to tokenize
 * @returns the lexing result of the underlying lexer with `tokens` replaced
 *          by the post-processed stream; `errors` additionally contains the
 *          indentation errors found here
 */
public tokenize(text: string) : ILexingResult {
    const lexResult = this.lexer.tokenize(text);

    if (lexResult.tokens.length === 0) {
        return lexResult;
    }

    const rawTokens = lexResult.tokens;
    const tokens: IToken[] = [];
    const indentStack = [ 0 ];

    let startOfLine = true;
    let hadIndents = false;

    // Records an indentation error spanning the whole run of Indent tokens.
    const reportUnexpectedIndent = (start: IToken, length: number): void => {
        lexResult.errors.push({
            offset: start.startOffset,
            line: start.startLine ?? NaN,
            column: start.startColumn ?? NaN,
            length: length,
            message: "Unexpected indentation found"
        });
    };

    for (let i = 0; i < rawTokens.length; i++) {
        const current = rawTokens[i];

        if (current.tokenType === EndOfLine) {
            startOfLine = true;
            // Indentation is a per-line property: forget the indents of a
            // (possibly whitespace-only) line so that the next line starting
            // at column 0 still closes the open levels.
            hadIndents = false;
            tokens.push(current);
        }
        else if (current.tokenType === Indent) {
            hadIndents = true;

            const start_token = current;
            let currIndentLevel = 1;
            let length = current.image.length;

            // Collapse the run of Indent tokens. The bounds check matters
            // when the input ends with indentation (no token follows).
            while (i + 1 < rawTokens.length && rawTokens[i + 1].tokenType === Indent) {
                i++;
                currIndentLevel++;
                length += rawTokens[i].image.length;
            }

            if (!startOfLine || (currIndentLevel > last(indentStack) + 1)) {
                reportUnexpectedIndent(start_token, length);
            }
            else if (currIndentLevel > last(indentStack)) {
                indentStack.push(currIndentLevel);
                tokens.push(start_token);
            }
            else if (currIndentLevel < last(indentStack)) {
                const index = findLastIndex(indentStack, currIndentLevel);

                if (index < 0) {
                    reportUnexpectedIndent(start_token, length);
                }
                else {
                    const numberOfDedents = indentStack.length - index - 1;

                    for (let dedent = 0; dedent < numberOfDedents; dedent++) {
                        indentStack.pop();
                        tokens.push(createTokenInstance(Outdent, "", start_token.startOffset, start_token.startOffset + length, start_token.startLine ?? NaN, start_token.endLine ?? NaN, start_token.startColumn ?? NaN, (start_token.startColumn ?? NaN) + length));
                    }
                }
            }
            // same indent level: nothing to emit
        }
        else {
            // A line that starts without any indentation closes every open level.
            if (startOfLine && !hadIndents) {
                while (indentStack.length > 1) {
                    indentStack.pop();
                    tokens.push(createTokenInstance(Outdent, "", current.startOffset, current.startOffset, current.startLine ?? NaN, NaN, current.startColumn ?? NaN, NaN));
                }
            }

            startOfLine = false;
            hadIndents = false;
            tokens.push(current);
        }
    }

    // `tokens` can be empty when every raw token was a rejected Indent; fall
    // back to the last raw token so the position lookups below stay valid.
    const tok = last(tokens) ?? last(rawTokens);

    // Do we have an EOL marker at the end?
    if (tok.tokenType !== EndOfLine) {
        tokens.push(createTokenInstance(EndOfLine, "\n", tok.endOffset ?? NaN, tok.endOffset ?? NaN, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN));
    }

    // add remaining Outdents
    while (indentStack.length > 1) {
        indentStack.pop();
        tokens.push(createTokenInstance(Outdent, "", tok.endOffset ?? NaN, tok.endOffset ?? NaN, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN));
    }

    lexResult.tokens = tokens;
    return lexResult;
}
|
||||||
}
|
}
|
154
src/tokens.ts
Normal file
154
src/tokens.ts
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
import { createToken, Lexer } from "chevrotain";
|
||||||
|
|
||||||
|
// Token vocabulary of the Human2Regex language.
//
// All keyword patterns are case-insensitive. When a regex alternation lists
// several spellings, a spelling that is a prefix of another must come AFTER
// the longer one, because regex alternation takes the first branch that
// matches.
//
// NOTE(review): keyword patterns have no word boundary and no
// `longer_alt: Identifier`, so an identifier that merely starts with a
// keyword (e.g. "oneself") presumably lexes as keyword + identifier — confirm
// whether that is intended.

// Number words
export const Zero = createToken({name: "Zero", pattern: /zero/i });
export const One = createToken({name: "One", pattern: /one/i });
export const Two = createToken({name: "Two", pattern: /two/i });
export const Three = createToken({name: "Three", pattern: /three/i });
export const Four = createToken({name: "Four", pattern: /four/i });
export const Five = createToken({name: "Five", pattern: /five/i });
export const Six = createToken({name: "Six", pattern: /six/i });
export const Seven = createToken({name: "Seven", pattern: /seven/i });
export const Eight = createToken({name: "Eight", pattern: /eight/i });
export const Nine = createToken({name: "Nine", pattern: /nine/i });
export const Ten = createToken({name: "Ten", pattern: /ten/i });

// Keywords
export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i });
export const Match = createToken({name: "Match", pattern: /match(es)?/i });
export const Then = createToken({name: "Then", pattern: /then/i });
// "anything" is listed before "any": otherwise "anything" would be split into
// "any" followed by an identifier "thing".
export const Anything = createToken({name: "Anything", pattern: /(any thing|anything|any)(s)?/i});
export const Of = createToken({name: "Of", pattern: /of/i});
export const Or = createToken({name: "Or", pattern: /or/i});
export const And = createToken({name: "And", pattern: /and|,/i});
export const Word = createToken({name: "Word Specifier", pattern: /word(s)?/i});
export const Digit = createToken({name: "Digit Specifier", pattern: /digit(s)?/i});
export const Character = createToken({name: "Character Specifier", pattern: /character(s)?/i});
export const Whitespace = createToken({name: "Whitespace Specifier", pattern: /(white space|whitespace)(s)?/i});
export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s)?/i});
export const With = createToken({name: "With", pattern: /with/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)?/i});
export const Otherwise = createToken({name: "Otherwise", pattern: /(other wise|otherwise)/i});
export const Else = createToken({name: "Else", pattern: /else/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const Using = createToken({name: "Using", pattern: /using/i});
export const Global = createToken({name: "Global", pattern: /global/i});
export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
// Exactly is declared before Exact so that Exact can name it as its longer
// alternative: Exact is tried first by the lexer and would otherwise swallow
// the "exact" prefix of "exactly", leaving Exactly unreachable.
export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
export const Exact = createToken({name: "Exact", pattern: /exact/i, longer_alt: Exactly});
export const Matching = createToken({name: "Matching", pattern: /matching/i});
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const Not = createToken({name: "Not", pattern: /not/i }); //, longer_alt: Nothing});
export const Between = createToken({name: "Between", pattern: /between/i});
export const Tab = createToken({name: "Tab", pattern: /tab/i});
export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
export const Group = createToken({name: "Group", pattern: /group/i});
export const By = createToken({name: "By", pattern: /by/i});
export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
export const Exclusive = createToken({name: "Exclusive", pattern: /exclusive(ly)?/i});
export const From = createToken({name: "From", pattern: /from/i});
// "..." is listed before "..": otherwise "..." would match as ".." and leave
// a stray "." behind.
export const To = createToken({name: "To", pattern: /(to|\-|\.\.\.|\.\.)/i});
export const Create = createToken({name: "Create", pattern: /create(s)?/i});
export const Called = createToken({name: "Called", pattern: /called/i});
export const Repeat = createToken({name: "Repeat", pattern: /repeat(s|ing)?/i});
export const Newline = createToken({name: "Newline", pattern: /(new line|newline)/i});
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const CarriageReturn = createToken({name: "Carriage Return", pattern: /carriage return/i});
export const CaseInsensitive = createToken({name: "Case Insensitive", pattern: /case insensitive/i});
export const CaseSensitive = createToken({name: "Case Sensitive", pattern: /case sensitive/i});
export const OrMore = createToken({name: "Or More", pattern: /\+/ });

// Punctuation
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });

// Line structure, whitespace and comments
export const EndOfLine = createToken({name: "EOL", pattern: /\n/ });
export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
// [\s\S] instead of "." so the comment may span several lines (as
// line_breaks: true announces); lazy so that two comments on one line are
// not merged into a single comment that swallows the code between them.
export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*[\s\S]*?\*\//, line_breaks: true, group: Lexer.SKIPPED });

// Literals and identifiers
export const Identifier = createToken({name: "Identifier", pattern: /[a-z]\w*/i });
export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ });
export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });

// Indentation tokens are declared without a pattern here.
// NOTE(review): Indent's pattern appears to be assigned by the lexer
// constructor before the Lexer is built, and Outdent tokens appear to be
// synthesized after lexing — confirm against tokenizer.ts.
export const Indent = createToken({name: "Indent"});

export const Outdent = createToken({name: "Outdent"});
|
||||||
|
|
||||||
|
/**
 * Token types handed to the lexer, in matching priority order: when several
 * patterns match at the same position, the entry listed first wins. Longer
 * keywords are therefore listed before keywords they start with (Matching
 * before Match, Nothing before Not, Then before The), and Identifier and the
 * literals come last.
 *
 * NOTE(review): LBracket, RBracket and Outdent are not part of this list —
 * confirm that this is intended.
 */
export const AllTokens = [
    // number words
    Zero, One, Two, Three, Four, Five, Six, Seven, Eight, Nine, Ten,

    // keywords
    Optional, Matching, Match, Then, Anything,
    Of, Or, And,
    Word, Digit, Character, Whitespace, Number,
    As, If, Start, With, Ends,
    Otherwise, Else, Unless, While, More,
    Using, Global, Multiline, Exact,
    Nothing, Not, Between,
    Tab, Linefeed, Group, By, A, The,
    Exactly, Inclusive, Exclusive, From,
    Create, Called, Repeat,
    Newline, None, Neither, CarriageReturn,
    CaseInsensitive, CaseSensitive,
    OrMore, To,

    // line structure, whitespace and comments
    EndOfLine, Indent, WhiteSpace, SingleLineComment, MultilineComment,

    // identifiers and literals
    Identifier, NumberLiteral, StringLiteral,
];
|
21
src/utilities.ts
Normal file
21
src/utilities.ts
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
/**
 * Returns the last element of `array`.
 *
 * Accepts read-only arrays as well as mutable ones; the array is never
 * modified. For an empty array there is no last element and the runtime
 * result is `undefined`, so callers should only pass non-empty arrays.
 *
 * @param array the array to read from
 * @returns the element at index `array.length - 1`
 */
export function last<T>(array: readonly T[]) : T {
    return array[array.length - 1];
}
|
||||||
|
|
||||||
|
/**
 * Returns the index of the last element of `array` that is strictly equal
 * (`===`) to `value`, or -1 when there is none.
 *
 * Accepts read-only arrays as well as mutable ones; the array is never
 * modified.
 *
 * @param array the array to search
 * @param value the value to look for
 * @returns the highest matching index, or -1 if `value` does not occur
 */
export function findLastIndex<T>(array: readonly T[], value: T) : number {
    // lastIndexOf scans from the end using strict equality, which is exactly
    // the contract of this helper.
    return array.lastIndexOf(value);
}
|
||||||
|
|
||||||
|
/**
 * Returns the index of the last element of `array` for which `predicate`
 * returns true, or -1 when no element qualifies.
 *
 * Elements are tested from the end of the array towards the start and the
 * search stops at the first match. Accepts read-only arrays as well as
 * mutable ones; the array is never modified.
 *
 * @param array the array to search
 * @param predicate test applied to each element; it also receives the
 *                  element's index (callbacks that take only the element
 *                  keep working)
 * @returns the highest index whose element satisfies `predicate`, or -1
 */
export function findLastIndexPredicate<T>(array: readonly T[], predicate: (x: T, index: number) => boolean) : number {
    for (let index = array.length - 1; index >= 0; index--) {
        if (predicate(array[index], index)) {
            return index;
        }
    }
    return -1;
}
|
Loading…
x
Reference in New Issue
Block a user