1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

And I broke it

This commit is contained in:
Patrick Demian 2020-10-26 05:08:17 -04:00
parent 0601dc20ca
commit 5e9c185923
2 changed files with 248 additions and 77 deletions

View File

@ -21,7 +21,7 @@
"@typescript-eslint/explicit-function-return-type": "off", "@typescript-eslint/explicit-function-return-type": "off",
"no-magic-numbers": [ "no-magic-numbers": [
"warn", "warn",
{ "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]} { "ignoreArrayIndexes": true, "ignore": [-1,0,1,2,3,4,5,6,7,8,9]}
], ],
"curly": "warn", "curly": "warn",
"no-loss-of-precision": "error", "no-loss-of-precision": "error",
@ -45,7 +45,7 @@
"error", "error",
"always" "always"
], ],
"no-shadow": "error", "no-shadow": "off",
"no-undefined": "error", "no-undefined": "error",
"brace-style": [ "brace-style": [
"error", "error",

View File

@ -1,6 +1,6 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
import { createToken, Lexer } from "chevrotain"; import { createToken, Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain";
export const Zero = createToken({name: "Zero", pattern: /zero/i }); export const Zero = createToken({name: "Zero", pattern: /zero/i });
export const One = createToken({name: "One", pattern: /one/i }); export const One = createToken({name: "One", pattern: /one/i });
@ -17,7 +17,7 @@ export const Ten = createToken({name: "Ten", pattern: /ten/i });
export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i }); export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i });
export const Match = createToken({name: "Match", pattern: /match(es)?/i }); export const Match = createToken({name: "Match", pattern: /match(es)?/i });
export const Then = createToken({name: "Then", pattern: /then/i }); export const Then = createToken({name: "Then", pattern: /then/i });
export const Anything = createToken({name: "Anything", pattern: /(any|anything|any thing)(s)?/i}); export const Anything = createToken({name: "Anything", pattern: /(any thing|any|anything)(s)?/i});
export const Of = createToken({name: "Of", pattern: /of/i}); export const Of = createToken({name: "Of", pattern: /of/i});
export const Or = createToken({name: "Or", pattern: /or/i}); export const Or = createToken({name: "Or", pattern: /or/i});
export const And = createToken({name: "And", pattern: /and|,/i}); export const And = createToken({name: "And", pattern: /and|,/i});
@ -69,8 +69,7 @@ export const OrMore = createToken({name: "Or More", pattern: /\+/ });
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ }); export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ }); export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const Indent = createToken({name: "Indent", pattern: /(( ){4}\t)/ }); export const EndOfLine = createToken({name: "EOL", pattern: /\n/, group: "nl" });
export const EndOfLine = createToken({name: "EOL", pattern: /\n/ });
export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED }); export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED }); export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED }); export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED });
@ -79,78 +78,250 @@ export const Identifier = createToken({name: "Identifier", pattern: /[a-z]\w*/i
export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ }); export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ });
export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i }); export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });
// Discriminator passed to Human2RegexLexer.matchIndentBase so one shared
// matcher can serve both the Indent and the Outdent token definitions.
enum IndentBaseType {
    Indent,
    Outdent
}
// Custom-pattern token emitted when a line's leading whitespace is deeper than
// the previous indentation level (Python-style block structure). The actual
// matching is delegated to Human2RegexLexer.matchIndentBase.
export const Indent = createToken({
    name: "Indent",
    start_chars_hint: [ "\t", " " ],
    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Indent),
    // custom token patterns should explicitly specify the line_breaks option
    line_breaks: false
});
// Custom-pattern token emitted when a line's leading whitespace returns to a
// shallower, previously seen indentation level. Shares its matcher with
// Indent via Human2RegexLexer.matchIndentBase.
export const Outdent = createToken({
    name: "Outdent",
    start_chars_hint: [ "\t", " " ],
    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Outdent),
    // custom token patterns should explicitly specify the line_breaks option
    line_breaks: false
});
export const AllTokens = [ export const AllTokens = [
Zero, Zero,
One, One,
Two, Two,
Three, Three,
Four, Four,
Five, Five,
Six, Six,
Seven, Seven,
Eight, Eight,
Nine, Nine,
Ten, Ten,
Optional, Optional,
Matching, Matching,
Match, Match,
Then, Then,
Anything, Anything,
Of, Of,
Or, Or,
And, And,
Word, Word,
Digit, Digit,
Character, Character,
Whitespace, Whitespace,
Number, Number,
As, As,
If, If,
Start, Start,
With, With,
Ends, Ends,
Otherwise, Otherwise,
Else, Else,
Unless, Unless,
While, While,
More, More,
Using, Using,
Global, Global,
Multiline, Multiline,
Exact, Exact,
Nothing, Nothing,
Not, Not,
Between, Between,
Tab, Tab,
Linefeed, Linefeed,
Group, Group,
By, By,
A, A,
The, The,
Exactly, Exactly,
Inclusive, Inclusive,
Exclusive, Exclusive,
From, From,
Create, Create,
Called, Called,
Repeat, Repeat,
Newline, Newline,
None, None,
Neither, Neither,
CarriageReturn, CarriageReturn,
CaseInsensitive, CaseInsensitive,
CaseSensitive, CaseSensitive,
OrMore, OrMore,
To, To,
Indent, EndOfLine,
EndOfLine, Indent,
WhiteSpace, Outdent,
SingleLineComment, WhiteSpace,
MultilineComment, SingleLineComment,
Identifier, MultilineComment,
NumberLiteral, Identifier,
StringLiteral, NumberLiteral,
StringLiteral,
]; ];
export const Human2RegexLexer = new Lexer(AllTokens, { ensureOptimizations: true }); const H2RLexer = new Lexer(AllTokens, { ensureOptimizations: true });
// Which characters count as indentation for the lexer: tabs only, spaces
// only, or either. Currently only consumed by commented-out code in
// Human2RegexLexer.tokenize — TODO confirm intended use.
export enum IndentType {
    Tabs,
    Spaces,
    Both
}
/**
 * Configuration for Human2RegexLexer.tokenize.
 *
 * @param type which characters count as indentation (defaults to Both)
 * @param spaces_per_tab how many spaces equal one tab stop (defaults to 4)
 */
export class Human2RegexLexerOptions {
    public type: IndentType;
    public spaces_per_tab: number;

    constructor(type: IndentType = IndentType.Both, spaces_per_tab: number = 4) {
        this.type = type;
        this.spaces_per_tab = spaces_per_tab;
    }
}
/**
 * Wrapper around the plain chevrotain lexer (H2RLexer) that synthesizes
 * Indent/Outdent tokens from changes in leading whitespace.
 *
 * Taken and adapted from https://github.com/SAP/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
 */
export class Human2RegexLexer {
    // State required for matching the indentations.
    // NOTE(review): all state is static, so two interleaved tokenize() calls
    // would corrupt each other's indent stack — confirm single-use/single-thread.
    private static options = new Human2RegexLexerOptions();
    private static indentStack = [ 0 ];
    private static wsRegExp: RegExp;
    private static spacesPerTab = " ";

    /**
     * Returns the index of the LAST element of `array` satisfying `predicate`,
     * or -1 if no element does.
     */
    private static findLastIndex<T>(array: T[], predicate: (x: T) => boolean) : number {
        // BUGFIX: start at the last valid index (length - 1). The original
        // started at array.length, invoking the predicate on undefined.
        for (let index = array.length - 1; index >= 0; index--) {
            if (predicate(array[index])) {
                return index;
            }
        }
        return -1;
    }

    /**
     * This custom Token matcher uses Lexer context ("matchedTokens" and "groups" arguments)
     * combined with state via closure ("indentStack" and "lastTextMatched") to match indentation.
     *
     * @param text          full input being lexed
     * @param offset        current lexing position within `text`
     * @param matchedTokens tokens emitted so far (chevrotain-provided)
     * @param groups        token groups emitted so far; `groups.nl` holds newlines
     * @param type          whether this call is matching an Indent or an Outdent
     * @returns a RegExpExecArray when an indent/outdent is matched, else null
     * @throws Error when an outdent does not line up with any prior indent level
     */
    public static matchIndentBase(text: string, offset: number, matchedTokens: IToken[], groups: {[groupName: string]: IToken[]}, type: IndentBaseType) : RegExpExecArray | null {
        const noTokensMatchedYet = !matchedTokens.length;
        const newLines = groups.nl;
        const noNewLinesMatchedYet = !newLines.length;
        const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet;
        const isStartOfLine =
            // only newlines matched so far
            (noTokensMatchedYet && !noNewLinesMatchedYet) ||
            // Both newlines and other Tokens have been matched AND the offset is just after the last newline
            (!noTokensMatchedYet &&
             !noNewLinesMatchedYet &&
             offset === newLines[newLines.length-1].startOffset + 1);

        // indentation can only be matched at the start of a line.
        if (isFirstLine || isStartOfLine) {
            let currIndentLevel: number = -1;

            // Sticky regex: anchor the whitespace match at the current offset.
            Human2RegexLexer.wsRegExp.lastIndex = offset;
            const match = Human2RegexLexer.wsRegExp.exec(text);

            // possible non-empty indentation
            if (match !== null) {
                currIndentLevel = match[0].length;

                //if (this.options.type === IndentType.Tabs) {
                //    currIndentLevel = match[0].length;
                //}
                //else {
                //    currIndentLevel = match[0].replace(Human2RegexLexer.spacesPerTab, "\t").length;
                //}
            }
            // "empty" indentation means indentLevel of 0.
            else {
                currIndentLevel = 0;
            }

            const prevIndentLevel = this.indentStack[this.indentStack.length-1];

            // deeper indentation
            if (currIndentLevel > prevIndentLevel && type === IndentBaseType.Indent) {
                this.indentStack.push(currIndentLevel);
                return match;
            }
            // shallower indentation
            else if (currIndentLevel < prevIndentLevel && type === IndentBaseType.Outdent) {
                const matchIndentIndex = this.findLastIndex(this.indentStack, (stackIndentDepth) => stackIndentDepth === currIndentLevel);

                // any outdent must match some previous indentation level.
                if (matchIndentIndex === -1) {
                    throw Error(`invalid outdent at offset: ${offset}`);
                }

                const numberOfDedents = this.indentStack.length - matchIndentIndex - 1;

                // This is a little tricky
                // 1. If there is no match (0 level indent) then this custom token
                //    matcher would return "null" and so we need to add all the required outdents ourselves.
                // 2. If there was a match (> 0 level indent) then we need to add one fewer outdent
                //    because the lexer will create one due to this matcher returning a non-null result.
                const iStart = match !== null ? 1 : 0;
                for (let i = iStart; i < numberOfDedents; i++) {
                    this.indentStack.pop();
                    matchedTokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
                }

                // even though we are adding fewer outdents directly we still need to update the indent stack fully.
                if (iStart === 1) {
                    this.indentStack.pop();
                }
                return match;
            }
            else {
                // same indent, this should be lexed as simple whitespace and ignored
                return null;
            }
        }
        else {
            // indentation cannot be matched under other circumstances
            return null;
        }
    }

    /**
     * Tokenizes `text` with H2RLexer, synthesizing Indent/Outdent tokens and
     * appending any Outdents still open at end of input.
     *
     * @param text    the input to tokenize
     * @param options optional lexer configuration; null keeps the current one
     * @returns the chevrotain lexing result with balanced Outdents appended
     */
    public static tokenize(text: string, options: Human2RegexLexerOptions | null = null) : ILexingResult {
        // have to reset the indent stack between processing of different text inputs
        Human2RegexLexer.indentStack = [ 0 ];

        if (options !== null) {
            // BUGFIX: store the caller-supplied options. The original assigned
            // this.options to itself, silently discarding the argument.
            Human2RegexLexer.options = options;
        }

        /*
        if (this.options.type === IndentType.Tabs) {
            Human2RegexLexer.wsRegExp = /\t/y;
        }
        else {
            let reg = ` {${this.options.spaces_per_tab}}`;

            if (this.options.type === IndentType.Both) {
                reg += "|\\t";
            }

            Human2RegexLexer.wsRegExp = new RegExp(reg, "y");
            Human2RegexLexer.spacesPerTab = Array(this.options.spaces_per_tab+1).join(" ");
        }*/
        Human2RegexLexer.wsRegExp = / +/y;

        const lexResult = H2RLexer.tokenize(text);

        // add remaining Outdents
        while (Human2RegexLexer.indentStack.length > 1) {
            lexResult.tokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
            Human2RegexLexer.indentStack.pop();
        }

        return lexResult;
    }
}