1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

Parser got started

but it's broken
This commit is contained in:
Patrick Demian 2020-10-28 00:00:29 -04:00
parent c5db6fa986
commit 79c9b9edd4
7 changed files with 984 additions and 849 deletions

5
docs/bundle.min.js vendored

File diff suppressed because one or more lines are too long

1568
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -2,15 +2,12 @@
"name": "human2regex",
"version": "0.0.1",
"description": "Humanized Regular Expressions",
"main": "build.js",
"main": "bundle.min.js",
"devDependencies": {
"@types/clean-css": "^4.2.1",
"@types/fs-extra": "^9.0.1",
"@types/glob": "^7.1.3",
"@types/html-minifier": "^3.5.3",
"@types/jquery": "^3.5.2",
"@types/jquery": "^3.5.3",
"@types/mustache": "^4.0.1",
"@types/uglify-es": "^3.0.0",
"@typescript-eslint/eslint-plugin": "^4.4.0",
"@typescript-eslint/parser": "^4.4.0",
"before-build-webpack": "^0.2.9",
@@ -38,7 +35,8 @@
"author": "Patrick Demian",
"license": "MIT",
"dependencies": {
"chevrotain": "^7.0.2"
"chevrotain": "^7.0.2",
"jquery": "^3.5.1"
},
"repository": {
"type": "git",

View File

@@ -1,15 +1,31 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
import { CstParser } from "chevrotain";
import { CstParser, CstNode } from "chevrotain";
import * as T from "./tokens";
export class Human2RegexParser extends CstParser {
export class Human2RegexParserOptions {
constructor() {
super(T.AllTokens, { recoveryEnabled: true, maxLookahead: 2});
/* empty */
}
}
export class Human2RegexParser extends CstParser {
private static already_init = false;
public nodes: { [key: string]: (idxInCallingRule?: number, ...args: unknown[]) => CstNode } = {};
constructor(private options: Human2RegexParserOptions = new Human2RegexParserOptions()) {
super(T.AllTokens, { recoveryEnabled: true, maxLookahead: 4});
if (Human2RegexParser.already_init) {
throw new Error("Only 1 instance of Human2RegexParser allowed");
}
Human2RegexParser.already_init = true;
const $ = this;
const Number = $.RULE("Number", () => {
this.nodes.NumberSubStatement = $.RULE("Number Sub-Statement", () => {
$.OR([
{ ALT: () => $.CONSUME(T.One) },
{ ALT: () => $.CONSUME(T.Two) },
@@ -27,32 +43,35 @@ export class Human2RegexParser extends CstParser {
});
// 1, 1..2, between 1 and/to 2 inclusively/exclusively
const Count = $.RULE("Count", () => {
this.nodes.CountSubStatement = $.RULE("Count Sub-Statement", () => {
$.OR([
{ ALT: () => {
$.OPTION(() => $.CONSUME(T.Exactly));
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OPTION(() => $.CONSUME(T.Times));
}},
{ ALT: () => {
$.OPTION(() => $.CONSUME(T.From));
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OR([
{ ALT: () => $.CONSUME(T.OrMore) },
{ ALT: () => {
$.CONSUME(T.To);
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
}}
]);
$.OPTION(() => $.CONSUME(T.Times));
}},
{ ALT: () => {
$.CONSUME(T.Between);
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OR([
{ ALT: () => $.CONSUME(T.To) },
{ ALT: () => $.CONSUME(T.And) }
]);
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OPTION(() => $.CONSUME(T.Times));
$.OPTION(() => {
$.OR([
{ ALT: () => $.CONSUME(T.Inclusive) },
@@ -63,21 +82,50 @@
]);
});
const MatchStatement = $.RULE("Match Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Match);
$.OPTION(() => {
$.SUBRULE(Count);
});
this.nodes.MatchSubStatement = $.RULE("Match Sub-Statement", () => {
$.OPTION(() => $.SUBRULE(this.nodes.CountSubStatement) );
$.OPTION(() => $.CONSUME(T.Not));
$.AT_LEAST_ONE_SEP({
SEP: T.Or,
DEF: () => {
$.CONSUME(T.StringLiteral);
$.OR([
{ ALT: () => $.CONSUME(T.Anything) },
{ ALT: () => $.CONSUME(T.StringLiteral) },
{ ALT: () => $.CONSUME(T.Word) },
{ ALT: () => $.CONSUME(T.Digit) },
{ ALT: () => $.CONSUME(T.Character) },
{ ALT: () => $.CONSUME(T.Whitespace) },
{ ALT: () => $.CONSUME(T.Number) },
{ ALT: () => $.CONSUME(T.Tab) },
{ ALT: () => $.CONSUME(T.Linefeed) },
{ ALT: () => $.CONSUME(T.Newline) },
{ ALT: () => $.CONSUME(T.CarriageReturn) },
]);
}
});
});
const UsingStatement = $.RULE("Using Statement", () => {
// optionally match "+" then 1+ words
this.nodes.MatchStatement = $.RULE("Match Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Match);
$.SUBRULE(this.nodes.MatchSubStatement);
$.MANY(() => {
$.OR([
{ ALT: () => $.CONSUME(T.And) },
{ ALT: () => {
$.OPTION(() => $.CONSUME(T.And));
$.CONSUME(T.Then);
}}
]);
$.OPTION(() => $.CONSUME(T.Optional));
$.SUBRULE(this.nodes.MatchSubStatement);
});
});
// using global matching
this.nodes.UsingStatement = $.RULE("Using Statement", () => {
$.CONSUME(T.Using);
$.AT_LEAST_ONE_SEP({
SEP: T.And,
@@ -87,24 +135,55 @@
{ ALT: () => $.CONSUME(T.Global) },
{ ALT: () => $.CONSUME(T.CaseInsensitive) },
{ ALT: () => $.CONSUME(T.CaseSensitive) },
{ ALT: () => {
$.CONSUME(T.Exact); $.CONSUME(T.Matching);
}},
{ ALT: () => $.CONSUME(T.Exact) }
]);
$.OPTION(() => $.CONSUME(T.Matching));
}
});
});
const Statement = $.RULE("Statement", () => {
$.OR([
{ ALT: () => $.SUBRULE(MatchStatement) },
{ ALT: () => $.SUBRULE(UsingStatement) }
]);
$.OPTION(() => $.CONSUME(T.EndOfLine));
this.nodes.GroupStatement = $.RULE("Group Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Create);
$.CONSUME(T.A);
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Group);
$.OPTION(() => {
$.CONSUME(T.Called);
$.CONSUME(T.StringLiteral);
});
$.CONSUME(T.Indent);
$.AT_LEAST_ONE(() => this.nodes.Statement);
$.CONSUME(T.Outdent);
});
this.nodes.RepeatStatement = $.RULE("Repeat Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Repeat);
$.OPTION(() => $.SUBRULE(this.nodes.CountSubStatement));
$.CONSUME(T.Indent);
$.AT_LEAST_ONE(() => this.nodes.Statement);
$.CONSUME(T.Outdent);
});
this.nodes.Statement = $.RULE("Statement", () => {
$.OR([
{ ALT: () => $.SUBRULE(this.nodes.MatchStatement) },
{ ALT: () => $.SUBRULE(this.nodes.GroupStatement) },
{ ALT: () => $.SUBRULE(this.nodes.RepeatStatement) }
]);
$.CONSUME(T.EndOfLine);
});
this.nodes.Regex = $.RULE("Regex", () => {
$.OPTION(() => $.SUBRULE(this.nodes.UsingStatement));
$.MANY(() => $.SUBRULE(this.nodes.Statement) );
});
this.performSelfAnalysis();
}
//public set_options(options: Human2RegexParserOptions) : void {
// // empty so far
//}
}

View File

@@ -3,6 +3,8 @@
import "./style.css";
import { Human2RegexLexer } from "./tokenizer";
import { Human2RegexParser } from "./parser";
/*
$(function() {
@@ -11,6 +13,7 @@ $(function() {
*/
const lexer = new Human2RegexLexer();
const parser = new Human2RegexParser();
const result = lexer.tokenize(`
// H2R supports // # and /**/ as comments
@@ -53,16 +56,22 @@ create an optional group
match 0+ any thing
`);
//let str = "";
//for(const r of result.tokens) {
// str += r.tokenType === Newline ? "\n" : r.image + " ";
//}
//console.log(str);
for(const r of result.tokens) {
console.log(r);
console.log(`[${r.tokenType.name}]: ${r.image}`);
}
console.log(result.errors);
console.log(result.errors);
parser.input = result.tokens;
const regex = parser.nodes.regex;
console.log(regex);
console.log(parser.errors);
//interpreter.visit(regex);
//parser.getBaseCstVisitorConstructor();

View File

@@ -19,15 +19,22 @@ export class Human2RegexLexerOptions {
export class Human2RegexLexer {
private static already_init = false;
private lexer : Lexer;
private lexer!: Lexer;
private options!: Human2RegexLexerOptions;
constructor(private options: Human2RegexLexerOptions = new Human2RegexLexerOptions()) {
constructor(options: Human2RegexLexerOptions = new Human2RegexLexerOptions()) {
if (Human2RegexLexer.already_init) {
throw new Error("Only 1 instance of Human2RegexLexer allowed");
}
Human2RegexLexer.already_init = true;
this.set_options(options);
}
public set_options(options: Human2RegexLexerOptions) : void {
this.options = options;
let indent_regex: RegExp | null = null;
if (this.options.type === IndentType.Tabs) {
@@ -66,9 +73,7 @@
}
// create Outdents
const tokens: IToken[] = [];
const indentStack = [ 0 ];
let currIndentLevel = 0;
@@ -79,9 +84,15 @@
// EoL? check for indents next (by setting startOfLine = true)
if (lexResult.tokens[i].tokenType === EndOfLine) {
startOfLine = true;
tokens.push(lexResult.tokens[i]);
if(tokens.length === 0 || tokens[tokens.length-1].tokenType === EndOfLine) {
// Ignore multiple EOLs and ignore first EOL
}
else {
startOfLine = true;
tokens.push(lexResult.tokens[i]);
}
}
// start with 1 indent. Append all other indents
else if (lexResult.tokens[i].tokenType === Indent) {
hadIndents = true;
currIndentLevel = 1;
@@ -97,6 +108,9 @@
}
start_token.endOffset = start_token.startOffset + length;
start_token.endColumn = lexResult.tokens[i].endColumn;
// must be the same line
//start_token.endLine = lexResult.tokens[i].endLine;
// are we an empty line?
if (lexResult.tokens.length > i && lexResult.tokens[i+1].tokenType === EndOfLine) {
@@ -158,7 +172,6 @@
indentStack.pop();
tokens.push(createTokenInstance(Outdent, "", tok.endOffset ?? NaN, tok.endOffset ?? NaN, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN));
}
lexResult.tokens = tokens;
return lexResult;

View File

@@ -16,7 +16,6 @@ export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i
export const Match = createToken({name: "Match", pattern: /match(es)?/i });
export const Then = createToken({name: "Then", pattern: /then/i });
export const Anything = createToken({name: "Anything", pattern: /(any thing|any|anything)(s)?/i});
export const Of = createToken({name: "Of", pattern: /of/i});
export const Or = createToken({name: "Or", pattern: /or/i});
export const And = createToken({name: "And", pattern: /and|,/i});
export const Word = createToken({name: "Word Specifier", pattern: /word(s)?/i});
@@ -24,51 +23,55 @@ export const Digit = createToken({name: "Digit Specifier", pattern: /digit(s)?/i
export const Character = createToken({name: "Character Specifier", pattern: /character(s)?/i});
export const Whitespace = createToken({name: "Whitespace Specifier", pattern: /(white space|whitespace)(s)?/i});
export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s)?/i});
export const With = createToken({name: "With", pattern: /with/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)?/i});
export const Otherwise = createToken({name: "Otherwise", pattern: /(other wise|otherwise)/i});
export const Else = createToken({name: "Else", pattern: /else/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const Using = createToken({name: "Using", pattern: /using/i});
export const Global = createToken({name: "Global", pattern: /global/i});
export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
export const Exact = createToken({name: "Exact", pattern: /exact/i});
export const Matching = createToken({name: "Matching", pattern: /matching/i});
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const Not = createToken({name: "Not", pattern: /not/i }); //, longer_alt: Nothing});
export const Between = createToken({name: "Between", pattern: /between/i});
export const Tab = createToken({name: "Tab", pattern: /tab/i});
export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
export const Group = createToken({name: "Group", pattern: /group/i});
export const By = createToken({name: "By", pattern: /by/i});
export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const Times = createToken({name: "Times", pattern: /times/i });
export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
export const Exclusive = createToken({name: "Exclusive", pattern: /exclusive(ly)?/i});
export const From = createToken({name: "From", pattern: /from/i});
export const To = createToken({name: "To", pattern: /(to|\-|\.\.|\.\.\.)/i});
export const Create = createToken({name: "Create", pattern: /create(s)?/i});
export const Called = createToken({name: "Called", pattern: /called/i});
export const Called = createToken({name: "Called", pattern: /name(d)?|call(ed)?/i});
export const Repeat = createToken({name: "Repeat", pattern: /repeat(s|ing)?/i});
export const Newline = createToken({name: "Newline", pattern: /(new line|newline)/i});
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const CarriageReturn = createToken({name: "Carriage Return", pattern: /carriage return/i});
export const CaseInsensitive = createToken({name: "Case Insensitive", pattern: /case insensitive/i});
export const CaseSensitive = createToken({name: "Case Sensitive", pattern: /case sensitive/i});
export const OrMore = createToken({name: "Or More", pattern: /\+/ });
/*
//Not being used currently
export const Of = createToken({name: "Of", pattern: /of/i});
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s) with?/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)? with/i});
export const Else = createToken({name: "Else", pattern: /(other wise|otherwise|else)/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const By = createToken({name: "By", pattern: /by/i});
*/
export const EndOfLine = createToken({name: "EOL", pattern: /\n/ });
export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
export const WS = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED });
@@ -77,7 +80,6 @@ export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0
export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });
export const Indent = createToken({name: "Indent"});
export const Outdent = createToken({name: "Outdent"});
export const AllTokens = [
@@ -97,7 +99,6 @@ export const AllTokens = [
Match,
Then,
Anything,
Of,
Or,
And,
Word,
@@ -105,29 +106,33 @@ export const AllTokens = [
Character,
Whitespace,
Number,
/*
Of,
As,
If,
Start,
With,
Ends,
Otherwise,
Else,
Unless,
While,
More,
Nothing,
By,
The,
None,
Neither,
*/
Using,
Global,
Multiline,
Exact,
Nothing,
Not,
Between,
Tab,
Linefeed,
Group,
By,
A,
The,
Times,
Exactly,
Inclusive,
Exclusive,
@@ -136,8 +141,6 @@ export const AllTokens = [
Called,
Repeat,
Newline,
None,
Neither,
CarriageReturn,
CaseInsensitive,
CaseSensitive,
@@ -145,7 +148,7 @@ export const AllTokens = [
To,
EndOfLine,
Indent,
WhiteSpace,
WS,
SingleLineComment,
MultilineComment,
Identifier,