diff --git a/.eslintrc.json b/.eslintrc.json
index 4398a6c..979e328 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -21,7 +21,7 @@
         "@typescript-eslint/explicit-function-return-type": "off",
         "no-magic-numbers": [
             "warn",
-            { "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]}
+            { "ignoreArrayIndexes": true, "ignore": [-1,0,1,2,3,4,5,6,7,8,9]}
         ],
         "curly": "warn",
         "no-loss-of-precision": "error",
@@ -45,7 +45,7 @@
             "error",
             "always"
         ],
-        "no-shadow": "error",
+        "no-shadow": "off",
         "no-undefined": "error",
         "brace-style": [
             "error",
diff --git a/src/tokenizer.ts b/src/tokenizer.ts
index 6536442..615c25b 100644
--- a/src/tokenizer.ts
+++ b/src/tokenizer.ts
@@ -1,6 +1,6 @@
 /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
 
-import { createToken, Lexer } from "chevrotain";
+import { createToken, Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain";
 
 export const Zero = createToken({name: "Zero", pattern: /zero/i });
 export const One = createToken({name: "One", pattern: /one/i });
@@ -17,7 +17,7 @@ export const Ten = createToken({name: "Ten", pattern: /ten/i });
 export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i });
 export const Match = createToken({name: "Match", pattern: /match(es)?/i });
 export const Then = createToken({name: "Then", pattern: /then/i });
-export const Anything = createToken({name: "Anything", pattern: /(any|anything|any thing)(s)?/i});
+export const Anything = createToken({name: "Anything", pattern: /(any thing|any|anything)(s)?/i});
 export const Of = createToken({name: "Of", pattern: /of/i});
 export const Or = createToken({name: "Or", pattern: /or/i});
 export const And = createToken({name: "And", pattern: /and|,/i});
@@ -69,8 +69,7 @@ export const OrMore = createToken({name: "Or More", pattern: /\+/ });
 export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
 export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
 
-export const Indent = createToken({name: "Indent", pattern: /(( ){4}\t)/ });
-export const EndOfLine = createToken({name: "EOL", pattern: /\n/ });
+export const EndOfLine = createToken({name: "EOL", pattern: /\n/, group: "nl" });
 export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
 export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
 export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED });
@@ -79,78 +78,250 @@ export const Identifier = createToken({name: "Identifier", pattern: /[a-z]\w*/i });
 export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ });
 export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });
 
+enum IndentBaseType {
+    Indent,
+    Outdent
+}
+
+export const Indent = createToken({
+    name: "Indent",
+    start_chars_hint: [ "\t", " " ],
+    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Indent),
+    // custom token patterns should explicitly specify the line_breaks option
+    line_breaks: false
+});
+
+export const Outdent = createToken({
+    name: "Outdent",
+    start_chars_hint: [ "\t", " " ],
+    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Outdent),
+    // custom token patterns should explicitly specify the line_breaks option
+    line_breaks: false
+});
+
 export const AllTokens = [
-	Zero,
-	One,
-	Two,
-	Three,
-	Four,
-	Five,
-	Six,
-	Seven,
-	Eight,
-	Nine,
-	Ten,
-	Optional,
-	Matching,
-	Match,
-	Then,
-	Anything,
-	Of,
-	Or,
-	And,
-	Word,
-	Digit,
-	Character,
-	Whitespace,
-	Number,
-	As,
-	If,
-	Start,
-	With,
-	Ends,
-	Otherwise,
-	Else,
-	Unless,
-	While,
-	More,
-	Using,
-	Global,
-	Multiline,
-	Exact,
-	Nothing,
-	Not,
-	Between,
-	Tab,
-	Linefeed,
-	Group,
-	By,
-	A,
-	The,
-	Exactly,
-	Inclusive,
-	Exclusive,
-	From,
-	Create,
-	Called,
-	Repeat,
-	Newline,
-	None,
-	Neither,
-	CarriageReturn,
-	CaseInsensitive,
-	CaseSensitive,
-	OrMore,
-	To,
-	Indent,
-	EndOfLine,
-	WhiteSpace,
-	SingleLineComment,
-	MultilineComment,
-	Identifier,
-	NumberLiteral,
-	StringLiteral,
+    Zero,
+    One,
+    Two,
+    Three,
+    Four,
+    Five,
+    Six,
+    Seven,
+    Eight,
+    Nine,
+    Ten,
+    Optional,
+    Matching,
+    Match,
+    Then,
+    Anything,
+    Of,
+    Or,
+    And,
+    Word,
+    Digit,
+    Character,
+    Whitespace,
+    Number,
+    As,
+    If,
+    Start,
+    With,
+    Ends,
+    Otherwise,
+    Else,
+    Unless,
+    While,
+    More,
+    Using,
+    Global,
+    Multiline,
+    Exact,
+    Nothing,
+    Not,
+    Between,
+    Tab,
+    Linefeed,
+    Group,
+    By,
+    A,
+    The,
+    Exactly,
+    Inclusive,
+    Exclusive,
+    From,
+    Create,
+    Called,
+    Repeat,
+    Newline,
+    None,
+    Neither,
+    CarriageReturn,
+    CaseInsensitive,
+    CaseSensitive,
+    OrMore,
+    To,
+    EndOfLine,
+    Indent,
+    Outdent,
+    WhiteSpace,
+    SingleLineComment,
+    MultilineComment,
+    Identifier,
+    NumberLiteral,
+    StringLiteral,
 ];
 
-export const Human2RegexLexer = new Lexer(AllTokens, { ensureOptimizations: true });
\ No newline at end of file
+const H2RLexer = new Lexer(AllTokens, { ensureOptimizations: true });
+
+export enum IndentType {
+    Tabs,
+    Spaces,
+    Both
+}
+
+export class Human2RegexLexerOptions {
+    constructor(public type: IndentType = IndentType.Both, public spaces_per_tab: number = 4) {
+        /* empty */
+    }
+}
+
+export class Human2RegexLexer {
+    // Taken and adapted from https://github.com/SAP/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
+
+    // State required for matching the indentations
+    private static options = new Human2RegexLexerOptions();
+    private static indentStack = [ 0 ];
+    private static wsRegExp: RegExp;
+    private static spacesPerTab = "    ";
+
+    private static findLastIndex<T>(array: T[], predicate: (x: T) => boolean): number {
+        for (let index = array.length - 1; index >= 0; index--) {
+            if (predicate(array[index])) {
+                return index;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * This custom token matcher uses Lexer context ("matchedTokens" and "groups" arguments)
+     * combined with static state ("indentStack") to match indentation.
+     */
+    public static matchIndentBase(text: string, offset: number, matchedTokens: IToken[], groups: {[groupName: string]: IToken[]}, type: IndentBaseType): RegExpExecArray | null {
+        const noTokensMatchedYet = !matchedTokens.length;
+        const newLines = groups.nl;
+        const noNewLinesMatchedYet = !newLines.length;
+        const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet;
+        const isStartOfLine =
+            // only newlines matched so far
+            (noTokensMatchedYet && !noNewLinesMatchedYet) ||
+            // both newlines and other tokens have been matched AND the offset is just after the last newline
+            (!noTokensMatchedYet &&
+             !noNewLinesMatchedYet &&
+             offset === newLines[newLines.length-1].startOffset + 1);
+
+        // indentation can only be matched at the start of a line
+        if (isFirstLine || isStartOfLine) {
+            let currIndentLevel: number = -1;
+
+            Human2RegexLexer.wsRegExp.lastIndex = offset;
+            const match = Human2RegexLexer.wsRegExp.exec(text);
+
+            // possible non-empty indentation
+            if (match !== null) {
+                currIndentLevel = match[0].length;
+                //if (this.options.type === IndentType.Tabs) {
+                //    currIndentLevel = match[0].length;
+                //}
+                //else {
+                //    currIndentLevel = match[0].replace(Human2RegexLexer.spacesPerTab, "\t").length;
+                //}
+            }
+            // "empty" indentation means an indent level of 0
+            else {
+                currIndentLevel = 0;
+            }
+
+            const prevIndentLevel = this.indentStack[this.indentStack.length-1];
+            // deeper indentation
+            if (currIndentLevel > prevIndentLevel && type === IndentBaseType.Indent) {
+                this.indentStack.push(currIndentLevel);
+                return match;
+            }
+            // shallower indentation
+            else if (currIndentLevel < prevIndentLevel && type === IndentBaseType.Outdent) {
+                const matchIndentIndex = this.findLastIndex(this.indentStack, (stackIndentDepth) => stackIndentDepth === currIndentLevel);
+
+                // any outdent must match some previous indentation level
+                if (matchIndentIndex === -1) {
+                    throw Error(`invalid outdent at offset: ${offset}`);
+                }
+
+                const numberOfDedents = this.indentStack.length - matchIndentIndex - 1;
+
+                // This is a little tricky:
+                // 1. If there is no match (0 level indent), this custom token matcher returns "null",
+                //    so we need to add all the required outdents ourselves.
+                // 2. If there was a match (> 0 level indent), we need to add one outdent fewer,
+                //    because the lexer will create one due to the non-null result.
+                const iStart = match !== null ? 1 : 0;
+                for (let i = iStart; i < numberOfDedents; i++) {
+                    this.indentStack.pop();
+                    matchedTokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
+                }
+
+                // even though we are adding fewer outdents directly, we still need to update the indent stack fully
+                if (iStart === 1) {
+                    this.indentStack.pop();
+                }
+                return match;
+            }
+            else {
+                // same indent, this should be lexed as simple whitespace and ignored
+                return null;
+            }
+        }
+        else {
+            // indentation cannot be matched under other circumstances
+            return null;
+        }
+    }
+
+    public static tokenize(text: string, options: Human2RegexLexerOptions | null = null): ILexingResult {
+        // have to reset the indent stack between processing of different text inputs
+        Human2RegexLexer.indentStack = [ 0 ];
+
+        if (options !== null) {
+            Human2RegexLexer.options = options;
+        }
+
+        /*
+        if (this.options.type === IndentType.Tabs) {
+            Human2RegexLexer.wsRegExp = /\t/y;
+        }
+        else {
+            let reg = ` {${this.options.spaces_per_tab}}`;
+
+            if (this.options.type === IndentType.Both) {
+                reg += "|\\t";
+            }
+
+            Human2RegexLexer.wsRegExp = new RegExp(reg, "y");
+
+            Human2RegexLexer.spacesPerTab = Array(this.options.spaces_per_tab+1).join(" ");
+        }*/
+        Human2RegexLexer.wsRegExp = / +/y;
+
+        const lexResult = H2RLexer.tokenize(text);
+
+        // add remaining Outdents
+        while (Human2RegexLexer.indentStack.length > 1) {
+            lexResult.tokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
+            Human2RegexLexer.indentStack.pop();
+        }
+
+        return lexResult;
+    }
+}
\ No newline at end of file
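
For reference, a minimal usage sketch of the static Human2RegexLexer.tokenize entry point added above. The import path "./tokenizer" and the sample input are illustrative assumptions only, and the explicit options simply restate the defaults:

import { Human2RegexLexer, Human2RegexLexerOptions, IndentType } from "./tokenizer";

// Hypothetical Human2Regex source; the indented second line exercises the new Indent/Outdent handling.
const input = 'match "hello"\n    optionally match "world"\n';

// Tokenize with explicit indentation options (tabs or spaces, 4 spaces per tab).
const result = Human2RegexLexer.tokenize(input, new Human2RegexLexerOptions(IndentType.Both, 4));

// Inspect the produced token stream and any lexing errors.
console.log(result.tokens.map((token) => token.tokenType.name));
console.log(result.errors);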