1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

Parser got started

but it's broken
This commit is contained in:
Patrick Demian 2020-10-28 00:00:29 -04:00
parent c5db6fa986
commit 79c9b9edd4
7 changed files with 984 additions and 849 deletions

5
docs/bundle.min.js vendored

File diff suppressed because one or more lines are too long

1568
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -2,15 +2,12 @@
"name": "human2regex",
"version": "0.0.1",
"description": "Humanized Regular Expressions",
"main": "build.js",
"main": "bundle.min.js",
"devDependencies": {
"@types/clean-css": "^4.2.1",
"@types/fs-extra": "^9.0.1",
"@types/glob": "^7.1.3",
"@types/html-minifier": "^3.5.3",
"@types/jquery": "^3.5.2",
"@types/jquery": "^3.5.3",
"@types/mustache": "^4.0.1",
"@types/uglify-es": "^3.0.0",
"@typescript-eslint/eslint-plugin": "^4.4.0",
"@typescript-eslint/parser": "^4.4.0",
"before-build-webpack": "^0.2.9",
@@ -38,7 +35,8 @@
"author": "Patrick Demian",
"license": "MIT",
"dependencies": {
"chevrotain": "^7.0.2"
"chevrotain": "^7.0.2",
"jquery": "^3.5.1"
},
"repository": {
"type": "git",

View File

@@ -1,15 +1,31 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
import { CstParser } from "chevrotain";
import { CstParser, CstNode } from "chevrotain";
import * as T from "./tokens";
export class Human2RegexParser extends CstParser {
export class Human2RegexParserOptions {
constructor() {
super(T.AllTokens, { recoveryEnabled: true, maxLookahead: 2});
/* empty */
}
}
export class Human2RegexParser extends CstParser {
private static already_init = false;
public nodes: { [key: string]: (idxInCallingRule?: number, ...args: unknown[]) => CstNode } = {};
constructor(private options: Human2RegexParserOptions = new Human2RegexParserOptions()) {
super(T.AllTokens, { recoveryEnabled: true, maxLookahead: 4});
if (Human2RegexParser.already_init) {
throw new Error("Only 1 instance of Human2RegexParser allowed");
}
Human2RegexParser.already_init = true;
const $ = this;
const Number = $.RULE("Number", () => {
this.nodes.NumberSubStatement = $.RULE("Number Sub-Statement", () => {
$.OR([
{ ALT: () => $.CONSUME(T.One) },
{ ALT: () => $.CONSUME(T.Two) },
@@ -27,32 +43,35 @@ export class Human2RegexParser extends CstParser {
});
// 1, 1..2, between 1 and/to 2 inclusively/exclusively
const Count = $.RULE("Count", () => {
this.nodes.CountSubStatement = $.RULE("Count Sub-Statement", () => {
$.OR([
{ ALT: () => {
$.OPTION(() => $.CONSUME(T.Exactly));
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OPTION(() => $.CONSUME(T.Times));
}},
{ ALT: () => {
$.OPTION(() => $.CONSUME(T.From));
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OR([
{ ALT: () => $.CONSUME(T.OrMore) },
{ ALT: () => {
$.CONSUME(T.To);
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
}}
]);
$.OPTION(() => $.CONSUME(T.Times));
}},
{ ALT: () => {
$.CONSUME(T.Between);
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OR([
{ ALT: () => $.CONSUME(T.To) },
{ ALT: () => $.CONSUME(T.And) }
]);
$.SUBRULE(Number);
$.SUBRULE(this.nodes.NumberSubStatement);
$.OPTION(() => $.CONSUME(T.Times));
$.OPTION(() => {
$.OR([
{ ALT: () => $.CONSUME(T.Inclusive) },
@@ -63,21 +82,50 @@
]);
});
const MatchStatement = $.RULE("Match Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Match);
$.OPTION(() => {
$.SUBRULE(Count);
});
this.nodes.MatchSubStatement = $.RULE("Match Sub-Statement", () => {
$.OPTION(() => $.SUBRULE(this.nodes.CountSubStatement) );
$.OPTION(() => $.CONSUME(T.Not));
$.AT_LEAST_ONE_SEP({
SEP: T.Or,
DEF: () => {
$.CONSUME(T.StringLiteral);
$.OR([
{ ALT: () => $.CONSUME(T.Anything) },
{ ALT: () => $.CONSUME(T.StringLiteral) },
{ ALT: () => $.CONSUME(T.Word) },
{ ALT: () => $.CONSUME(T.Digit) },
{ ALT: () => $.CONSUME(T.Character) },
{ ALT: () => $.CONSUME(T.Whitespace) },
{ ALT: () => $.CONSUME(T.Number) },
{ ALT: () => $.CONSUME(T.Tab) },
{ ALT: () => $.CONSUME(T.Linefeed) },
{ ALT: () => $.CONSUME(T.Newline) },
{ ALT: () => $.CONSUME(T.CarriageReturn) },
]);
}
});
});
const UsingStatement = $.RULE("Using Statement", () => {
// optionally match "+" then 1+ words
this.nodes.MatchStatement = $.RULE("Match Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Match);
$.SUBRULE(this.nodes.MatchSubStatement);
$.MANY(() => {
$.OR([
{ ALT: () => $.CONSUME(T.And) },
{ ALT: () => {
$.OPTION(() => $.CONSUME(T.And));
$.CONSUME(T.Then);
}}
]);
$.OPTION(() => $.CONSUME(T.Optional));
$.SUBRULE(this.nodes.MatchSubStatement);
});
});
// using global matching
this.nodes.UsingStatement = $.RULE("Using Statement", () => {
$.CONSUME(T.Using);
$.AT_LEAST_ONE_SEP({
SEP: T.And,
@@ -87,24 +135,55 @@
{ ALT: () => $.CONSUME(T.Global) },
{ ALT: () => $.CONSUME(T.CaseInsensitive) },
{ ALT: () => $.CONSUME(T.CaseSensitive) },
{ ALT: () => {
$.CONSUME(T.Exact); $.CONSUME(T.Matching);
}},
{ ALT: () => $.CONSUME(T.Exact) }
]);
$.OPTION(() => $.CONSUME(T.Matching));
}
});
});
const Statement = $.RULE("Statement", () => {
$.OR([
{ ALT: () => $.SUBRULE(MatchStatement) },
{ ALT: () => $.SUBRULE(UsingStatement) }
]);
$.OPTION(() => $.CONSUME(T.EndOfLine));
this.nodes.GroupStatement = $.RULE("Group Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Create);
$.CONSUME(T.A);
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Group);
$.OPTION(() => {
$.CONSUME(T.Called);
$.CONSUME(T.StringLiteral);
});
$.CONSUME(T.Indent);
$.AT_LEAST_ONE(() => this.nodes.Statement);
$.CONSUME(T.Outdent);
});
this.nodes.RepeatStatement = $.RULE("Repeat Statement", () => {
$.OPTION(() => $.CONSUME(T.Optional));
$.CONSUME(T.Repeat);
$.OPTION(() => $.SUBRULE(this.nodes.CountSubStatement));
$.CONSUME(T.Indent);
$.AT_LEAST_ONE(() => this.nodes.Statement);
$.CONSUME(T.Outdent);
});
this.nodes.Statement = $.RULE("Statement", () => {
$.OR([
{ ALT: () => $.SUBRULE(this.nodes.MatchStatement) },
{ ALT: () => $.SUBRULE(this.nodes.GroupStatement) },
{ ALT: () => $.SUBRULE(this.nodes.RepeatStatement) }
]);
$.CONSUME(T.EndOfLine);
});
this.nodes.Regex = $.RULE("Regex", () => {
$.OPTION(() => $.SUBRULE(this.nodes.UsingStatement));
$.MANY(() => $.SUBRULE(this.nodes.Statement) );
});
this.performSelfAnalysis();
}
//public set_options(options: Human2RegexParserOptions) : void {
// // empty so far
//}
}

View File

@@ -3,6 +3,8 @@
import "./style.css";
import { Human2RegexLexer } from "./tokenizer";
import { Human2RegexParser } from "./parser";
/*
$(function() {
@@ -11,6 +13,7 @@ $(function() {
*/
const lexer = new Human2RegexLexer();
const parser = new Human2RegexParser();
const result = lexer.tokenize(`
// H2R supports // # and /**/ as comments
@@ -53,16 +56,22 @@ create an optional group
match 0+ any thing
`);
//let str = "";
//for(const r of result.tokens) {
// str += r.tokenType === Newline ? "\n" : r.image + " ";
//}
//console.log(str);
for(const r of result.tokens) {
console.log(r);
console.log(`[${r.tokenType.name}]: ${r.image}`);
}
console.log(result.errors);
console.log(result.errors);
parser.input = result.tokens;
const regex = parser.nodes.regex;
console.log(regex);
console.log(parser.errors);
//interpreter.visit(regex);
//parser.getBaseCstVisitorConstructor();

View File

@@ -19,15 +19,22 @@ export class Human2RegexLexerOptions {
export class Human2RegexLexer {
private static already_init = false;
private lexer : Lexer;
private lexer!: Lexer;
private options!: Human2RegexLexerOptions;
constructor(private options: Human2RegexLexerOptions = new Human2RegexLexerOptions()) {
constructor(options: Human2RegexLexerOptions = new Human2RegexLexerOptions()) {
if (Human2RegexLexer.already_init) {
throw new Error("Only 1 instance of Human2RegexLexer allowed");
}
Human2RegexLexer.already_init = true;
this.set_options(options);
}
public set_options(options: Human2RegexLexerOptions) : void {
this.options = options;
let indent_regex: RegExp | null = null;
if (this.options.type === IndentType.Tabs) {
@@ -66,9 +73,7 @@
}
// create Outdents
const tokens: IToken[] = [];
const indentStack = [ 0 ];
let currIndentLevel = 0;
@@ -79,9 +84,15 @@
// EoL? check for indents next (by setting startOfLine = true)
if (lexResult.tokens[i].tokenType === EndOfLine) {
startOfLine = true;
tokens.push(lexResult.tokens[i]);
if(tokens.length === 0 || tokens[tokens.length-1].tokenType === EndOfLine) {
// Ignore multiple EOLs and ignore first EOL
}
else {
startOfLine = true;
tokens.push(lexResult.tokens[i]);
}
}
// start with 1 indent. Append all other indents
else if (lexResult.tokens[i].tokenType === Indent) {
hadIndents = true;
currIndentLevel = 1;
@@ -97,6 +108,9 @@
}
start_token.endOffset = start_token.startOffset + length;
start_token.endColumn = lexResult.tokens[i].endColumn;
// must be the same line
//start_token.endLine = lexResult.tokens[i].endLine;
// are we an empty line?
if (lexResult.tokens.length > i && lexResult.tokens[i+1].tokenType === EndOfLine) {
@@ -158,7 +172,6 @@
indentStack.pop();
tokens.push(createTokenInstance(Outdent, "", tok.endOffset ?? NaN, tok.endOffset ?? NaN, tok.startLine ?? NaN, NaN, tok.startColumn ?? NaN, NaN));
}
lexResult.tokens = tokens;
return lexResult;

View File

@@ -16,7 +16,6 @@ export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i
export const Match = createToken({name: "Match", pattern: /match(es)?/i });
export const Then = createToken({name: "Then", pattern: /then/i });
export const Anything = createToken({name: "Anything", pattern: /(any thing|any|anything)(s)?/i});
export const Of = createToken({name: "Of", pattern: /of/i});
export const Or = createToken({name: "Or", pattern: /or/i});
export const And = createToken({name: "And", pattern: /and|,/i});
export const Word = createToken({name: "Word Specifier", pattern: /word(s)?/i});
@@ -24,51 +23,55 @@ export const Digit = createToken({name: "Digit Specifier", pattern: /digit(s)?/i
export const Character = createToken({name: "Character Specifier", pattern: /character(s)?/i});
export const Whitespace = createToken({name: "Whitespace Specifier", pattern: /(white space|whitespace)(s)?/i});
export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s)?/i});
export const With = createToken({name: "With", pattern: /with/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)?/i});
export const Otherwise = createToken({name: "Otherwise", pattern: /(other wise|otherwise)/i});
export const Else = createToken({name: "Else", pattern: /else/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const Using = createToken({name: "Using", pattern: /using/i});
export const Global = createToken({name: "Global", pattern: /global/i});
export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
export const Exact = createToken({name: "Exact", pattern: /exact/i});
export const Matching = createToken({name: "Matching", pattern: /matching/i});
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const Not = createToken({name: "Not", pattern: /not/i }); //, longer_alt: Nothing});
export const Between = createToken({name: "Between", pattern: /between/i});
export const Tab = createToken({name: "Tab", pattern: /tab/i});
export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
export const Group = createToken({name: "Group", pattern: /group/i});
export const By = createToken({name: "By", pattern: /by/i});
export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const Times = createToken({name: "Times", pattern: /times/i });
export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
export const Exclusive = createToken({name: "Exclusive", pattern: /exclusive(ly)?/i});
export const From = createToken({name: "From", pattern: /from/i});
export const To = createToken({name: "To", pattern: /(to|\-|\.\.|\.\.\.)/i});
export const Create = createToken({name: "Create", pattern: /create(s)?/i});
export const Called = createToken({name: "Called", pattern: /called/i});
export const Called = createToken({name: "Called", pattern: /name(d)?|call(ed)?/i});
export const Repeat = createToken({name: "Repeat", pattern: /repeat(s|ing)?/i});
export const Newline = createToken({name: "Newline", pattern: /(new line|newline)/i});
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const CarriageReturn = createToken({name: "Carriage Return", pattern: /carriage return/i});
export const CaseInsensitive = createToken({name: "Case Insensitive", pattern: /case insensitive/i});
export const CaseSensitive = createToken({name: "Case Sensitive", pattern: /case sensitive/i});
export const OrMore = createToken({name: "Or More", pattern: /\+/ });
/*
//Not being used currently
export const Of = createToken({name: "Of", pattern: /of/i});
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s) with?/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)? with/i});
export const Else = createToken({name: "Else", pattern: /(other wise|otherwise|else)/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const By = createToken({name: "By", pattern: /by/i});
*/
export const EndOfLine = createToken({name: "EOL", pattern: /\n/ });
export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
export const WS = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED });
@@ -77,7 +80,6 @@ export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0
export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });
export const Indent = createToken({name: "Indent"});
export const Outdent = createToken({name: "Outdent"});
export const AllTokens = [
@@ -97,7 +99,6 @@ export const AllTokens = [
Match,
Then,
Anything,
Of,
Or,
And,
Word,
@@ -105,29 +106,33 @@ export const AllTokens = [
Character,
Whitespace,
Number,
/*
Of,
As,
If,
Start,
With,
Ends,
Otherwise,
Else,
Unless,
While,
More,
Nothing,
By,
The,
None,
Neither,
*/
Using,
Global,
Multiline,
Exact,
Nothing,
Not,
Between,
Tab,
Linefeed,
Group,
By,
A,
The,
Times,
Exactly,
Inclusive,
Exclusive,
@@ -136,8 +141,6 @@ export const AllTokens = [
Called,
Repeat,
Newline,
None,
Neither,
CarriageReturn,
CaseInsensitive,
CaseSensitive,
@@ -145,7 +148,7 @@ export const AllTokens = [
To,
EndOfLine,
Indent,
WhiteSpace,
WS,
SingleLineComment,
MultilineComment,
Identifier,