1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

Merge branch 'new-features' into dependabot/npm_and_yarn/node-notifier-8.0.1

This commit is contained in:
Patrick Demian 2021-01-03 04:04:47 -05:00 committed by GitHub
commit ed89a2995a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 1517 additions and 566 deletions

View File

@ -81,6 +81,7 @@ The API reference is available [here](API.md)
## Todo ## Todo
- Add more regex options such as back references, subroutines, lookahead/behind, and more character classes (eg, `[:alpha:]`) - Add more regex options such as subroutines, conditions, and lookahead/behind
- Fix error messages (They sometimes point to the wrong location, off by 1 errors, etc) - Fix error messages (They sometimes point to the wrong location, off by 1 errors, etc)
- Add more useful lex/parse errors (What even is an EarlyExitException?)
- Use a different/better static site generation method - Use a different/better static site generation method

15
docs/bundle.min.js vendored

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

140
lib/generator.d.ts vendored
View File

@ -21,29 +21,45 @@ export interface ISemanticError {
message: string; message: string;
} }
/** /**
* The base concrete syntax tree class * Context for validation
* *
* @remarks Currently only used to validate groups
* @internal * @internal
*/ */
export declare abstract class H2RCST { export declare class GeneratorContext {
tokens: IToken[]; groups: {
[key: string]: {
startLine: number;
startColumn: number;
length: number;
};
};
/** /**
* Constructor for H2RCST * Checks to see if we already have a group defined
* *
* @param tokens Tokens used to calculate where an error occured * @param identifier the group name
* @internal * @returns true if the group name already exists
*/ */
constructor(tokens: IToken[]); hasGroup(identifier: string): boolean;
/**
* Adds the identifier to the group list
*
* @param identifier the group name
*/
addGroup(identifier: string, tokens: IToken[]): void;
}
interface Generates {
/** /**
* Validate that this is both valid and can be generated in the specified language * Validate that this is both valid and can be generated in the specified language
* *
* @remarks There is no guarantee toRegex will work unless validate returns no errors * @remarks There is no guarantee toRegex will work unless validate returns no errors
* *
* @param language the regex dialect we're validating * @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors * @returns A list of errors
* @public * @public
*/ */
abstract validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/** /**
* Generate a regular expression fragment based on this syntax tree * Generate a regular expression fragment based on this syntax tree
* *
@ -53,6 +69,23 @@ export declare abstract class H2RCST {
* @returns a regular expression fragment * @returns a regular expression fragment
* @public * @public
*/ */
toRegex(language: RegexDialect): string;
}
/**
* The base concrete syntax tree class
*
* @internal
*/
export declare abstract class H2RCST implements Generates {
tokens: IToken[];
/**
* Constructor for H2RCST
*
* @param tokens Tokens used to calculate where an error occured
* @internal
*/
constructor(tokens: IToken[]);
abstract validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
abstract toRegex(language: RegexDialect): string; abstract toRegex(language: RegexDialect): string;
/** /**
* Creates an ISemanticError with a given message and the tokens provided from the constructor * Creates an ISemanticError with a given message and the tokens provided from the constructor
@ -126,7 +159,7 @@ export declare class MatchSubStatementValue {
* *
* @internal * @internal
*/ */
export declare class MatchStatementValue { export declare class MatchStatementValue implements Generates {
optional: boolean; optional: boolean;
statement: MatchSubStatementCST; statement: MatchSubStatementCST;
/** /**
@ -137,6 +170,8 @@ export declare class MatchStatementValue {
* @internal * @internal
*/ */
constructor(optional: boolean, statement: MatchSubStatementCST); constructor(optional: boolean, statement: MatchSubStatementCST);
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
} }
/** /**
* The base class for all statement concrete syntax trees * The base class for all statement concrete syntax trees
@ -163,7 +198,7 @@ export declare class MatchSubStatementCST extends H2RCST {
* @param values sub statements to match * @param values sub statements to match
*/ */
constructor(tokens: IToken[], count: CountSubStatementCST | null, invert: boolean, values: MatchSubStatementValue[]); constructor(tokens: IToken[], count: CountSubStatementCST | null, invert: boolean, values: MatchSubStatementValue[]);
validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string; toRegex(language: RegexDialect): string;
} }
/** /**
@ -180,7 +215,7 @@ export declare class UsingStatementCST extends H2RCST {
* @param flags using flags * @param flags using flags
*/ */
constructor(tokens: IToken[], flags: UsingFlags[]); constructor(tokens: IToken[], flags: UsingFlags[]);
validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string; toRegex(language: RegexDialect): string;
} }
/** /**
@ -201,7 +236,7 @@ export declare class CountSubStatementCST extends H2RCST {
* @param opt option modifier * @param opt option modifier
*/ */
constructor(tokens: IToken[], from: number, to?: number | null, opt?: "inclusive" | "exclusive" | "+" | null); constructor(tokens: IToken[], from: number, to?: number | null, opt?: "inclusive" | "exclusive" | "+" | null);
validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string; toRegex(language: RegexDialect): string;
} }
/** /**
@ -216,10 +251,10 @@ export declare class MatchStatementCST extends StatementCST {
* Constructor for MatchStatementCST * Constructor for MatchStatementCST
* *
* @param tokens Tokens used to calculate where an error occured * @param tokens Tokens used to calculate where an error occured
* @param matches * @param matches the list of matches
*/ */
constructor(tokens: IToken[], completely_optional: boolean, matches: MatchStatementValue[]); constructor(tokens: IToken[], completely_optional: boolean, matches: MatchStatementValue[]);
validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string; toRegex(language: RegexDialect): string;
} }
/** /**
@ -240,7 +275,7 @@ export declare class RepeatStatementCST extends StatementCST {
* @param statements the statements to repeat * @param statements the statements to repeat
*/ */
constructor(tokens: IToken[], optional: boolean, count: CountSubStatementCST | null, statements: StatementCST[]); constructor(tokens: IToken[], optional: boolean, count: CountSubStatementCST | null, statements: StatementCST[]);
validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string; toRegex(language: RegexDialect): string;
} }
/** /**
@ -262,7 +297,70 @@ export declare class GroupStatementCST extends StatementCST {
* @internal * @internal
*/ */
constructor(tokens: IToken[], optional: boolean, name: string | null, statements: StatementCST[]); constructor(tokens: IToken[], optional: boolean, name: string | null, statements: StatementCST[]);
validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
* Concrete Syntax Tree for a Backreference statement
*
* @remarks Refers to a named group; validation reports an error when that group was not defined earlier
* @internal
*/
export declare class BackrefStatementCST extends StatementCST {
private optional;
private count;
private name;
/**
* Constructor for BackrefStatementCST
*
* @param tokens Tokens used to calculate where an error occurred
* @param optional is this backref optional
* @param count optional number of times to repeat
* @param name the group name to call
*/
constructor(tokens: IToken[], optional: boolean, count: CountSubStatementCST | null, name: string);
/**
* Checks that the referenced group is known to the context and validates the count, if any
*
* @param language the regex dialect we're validating
* @param context the generator context holding the groups defined so far
* @returns A list of errors
*/
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/**
* Generates the dialect specific reference to the named group
*
* @param language the regex dialect to generate for
* @returns a regular expression fragment
*/
toRegex(language: RegexDialect): string;
}
/**
* Concrete Syntax Tree for an If Pattern statement
*
* @remarks A conditional whose test is a list of matches rather than a group name
* @internal
*/
export declare class IfPatternStatementCST extends StatementCST {
private matches;
private true_statements;
private false_statements;
/**
* Constructor for IfPatternStatementCST
*
* @param tokens Tokens used to calculate where an error occurred
* @param matches list of matches to test against
* @param true_statements true path
* @param false_statements false path
*/
constructor(tokens: IToken[], matches: MatchStatementValue[], true_statements: StatementCST[], false_statements: StatementCST[]);
/**
* Reports dialects that cannot express this conditional, then validates the matches and both paths
*
* @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors
*/
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/**
* Generates the conditional; the false path is only emitted when it has statements
*
* @param language the regex dialect to generate for
* @returns a regular expression fragment
*/
toRegex(language: RegexDialect): string;
}
/**
* Concrete Syntax Tree for an If group Ident statement
*
* @internal
*/
export declare class IfIdentStatementCST extends StatementCST {
private identifier;
private true_statements;
private false_statements;
/**
* Constructor for IfIdentStatementCST
*
* @param tokens Tokens used to calculate where an error occured
* @param identifier the group identifier to check
* @param true_statements true path
* @param false_statements false path
*/
constructor(tokens: IToken[], identifier: string, true_statements: StatementCST[], false_statements: StatementCST[]);
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string; toRegex(language: RegexDialect): string;
} }
/** /**
@ -282,13 +380,7 @@ export declare class RegularExpressionCST extends H2RCST {
* @internal * @internal
*/ */
constructor(tokens: IToken[], usings: UsingStatementCST, statements: StatementCST[]); constructor(tokens: IToken[], usings: UsingStatementCST, statements: StatementCST[]);
validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string; toRegex(language: RegexDialect): string;
} }
/** export {};
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @internal
*/
export declare function minimizeMatchString(arr: string[]): string;

View File

@ -1,12 +1,13 @@
"use strict"; "use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true }); Object.defineProperty(exports, "__esModule", { value: true });
exports.minimizeMatchString = exports.RegularExpressionCST = exports.GroupStatementCST = exports.RepeatStatementCST = exports.MatchStatementCST = exports.CountSubStatementCST = exports.UsingStatementCST = exports.MatchSubStatementCST = exports.StatementCST = exports.MatchStatementValue = exports.MatchSubStatementValue = exports.MatchSubStatementType = exports.UsingFlags = exports.H2RCST = exports.RegexDialect = void 0; exports.RegularExpressionCST = exports.IfIdentStatementCST = exports.IfPatternStatementCST = exports.BackrefStatementCST = exports.GroupStatementCST = exports.RepeatStatementCST = exports.MatchStatementCST = exports.CountSubStatementCST = exports.UsingStatementCST = exports.MatchSubStatementCST = exports.StatementCST = exports.MatchStatementValue = exports.MatchSubStatementValue = exports.MatchSubStatementType = exports.UsingFlags = exports.H2RCST = exports.GeneratorContext = exports.RegexDialect = void 0;
/** /**
* Includes all Concrete Syntax Trees for Human2Regex * Includes all Concrete Syntax Trees for Human2Regex
* @packageDocumentation * @packageDocumentation
*/ */
const utilities_1 = require("./utilities"); const utilities_1 = require("./utilities");
const generator_helper_1 = require("./generator_helper");
/** /**
* List of regular expression dialects we support * List of regular expression dialects we support
*/ */
@ -49,6 +50,42 @@ const unicode_script_codes = [
"Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Telugu", "Thaana", "Thai", "Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Telugu", "Thaana", "Thai",
"Tibetan", "Tifinagh", "Ugaritic", "Vai", "Yi" "Tibetan", "Tifinagh", "Ugaritic", "Vai", "Yi"
]; ];
/**
* Context for validation
*
* @remarks Currently only used to validate groups
* @internal
*/
class GeneratorContext {
constructor() {
this.groups = {};
}
/**
* Checks to see if we already have a group defined
*
* @param identifier the group name
* @returns true if the group name already exists
*/
hasGroup(identifier) {
return Object.prototype.hasOwnProperty.call(this.groups, identifier);
}
/**
* Adds the identifier to the group list
*
* @param identifier the group name
*/
addGroup(identifier, tokens) {
var _a, _b, _c;
const f = utilities_1.first(tokens);
const l = utilities_1.last(tokens);
this.groups[identifier] = {
startLine: (_a = f.startLine) !== null && _a !== void 0 ? _a : NaN,
startColumn: (_b = f.startColumn) !== null && _b !== void 0 ? _b : NaN,
length: ((_c = l.endOffset) !== null && _c !== void 0 ? _c : l.startOffset) - f.startOffset,
};
}
}
exports.GeneratorContext = GeneratorContext;
/** /**
* The base concrete syntax tree class * The base concrete syntax tree class
* *
@ -166,6 +203,17 @@ class MatchStatementValue {
this.statement = statement; this.statement = statement;
/* empty */ /* empty */
} }
validate(language, context) {
return this.statement.validate(language, context);
}
toRegex(language) {
let match_stmt = this.statement.toRegex(language);
// need to group if optional and ungrouped
if (this.optional) {
match_stmt = generator_helper_1.groupIfRequired(match_stmt) + "?";
}
return match_stmt;
}
} }
exports.MatchStatementValue = MatchStatementValue; exports.MatchStatementValue = MatchStatementValue;
/** /**
@ -196,10 +244,10 @@ class MatchSubStatementCST extends H2RCST {
this.invert = invert; this.invert = invert;
this.values = values; this.values = values;
} }
validate(language) { validate(language, context) {
const errors = []; const errors = [];
if (this.count) { if (this.count) {
utilities_1.append(errors, this.count.validate(language)); utilities_1.append(errors, this.count.validate(language, context));
} }
for (const value of this.values) { for (const value of this.values) {
if (value.type === MatchSubStatementType.Between) { if (value.type === MatchSubStatementType.Between) {
@ -311,50 +359,15 @@ class MatchSubStatementCST extends H2RCST {
break; break;
} }
} }
let ret = ""; let ret = generator_helper_1.minimizeMatchString(matches);
let require_grouping = false;
let dont_clobber_plus = false;
if (matches.length === 1) {
ret = utilities_1.first(matches);
if (ret.endsWith("+")) {
dont_clobber_plus = true;
}
}
else {
ret = minimizeMatchString(matches);
if (ret.length > 1 &&
(!ret.startsWith("(") || !ret.endsWith("["))) {
require_grouping = true;
}
}
if (this.count) { if (this.count) {
if (dont_clobber_plus) { if (matches.length === 1) {
const clobber = this.count.toRegex(language); // we don't group if there's only 1 element
// + can be ignored as well as a count as long as that count is > 0 // but we need to make sure we don't add an additional + or *
switch (clobber) { ret = generator_helper_1.dontClobberRepetition(ret, this.count.toRegex(language));
case "*":
case "?":
ret = "(?:" + ret + ")" + clobber;
break;
case "+":
// ignore
break;
default:
if (clobber.startsWith("{0")) {
ret = "(?:" + ret + ")" + clobber;
}
else {
// remove + and replace with count
ret.substring(0, ret.length - 1) + clobber;
}
break;
}
} }
else { else {
if (require_grouping) { ret = generator_helper_1.groupIfRequired(ret) + this.count.toRegex(language);
ret = "(?:" + ret + ")";
}
ret += this.count.toRegex(language);
} }
} }
return ret; return ret;
@ -377,8 +390,9 @@ class UsingStatementCST extends H2RCST {
super(tokens); super(tokens);
this.flags = flags; this.flags = flags;
} }
validate(language) { validate(language, context) {
utilities_1.unusedParameter(language, "Using Statement does not change based on language"); utilities_1.unusedParameter(language, "Count does not need checking");
utilities_1.unusedParameter(context, "Context is not needed");
const errors = []; const errors = [];
let flag = this.flags[0]; let flag = this.flags[0];
for (let i = 1; i < this.flags.length; i++) { for (let i = 1; i < this.flags.length; i++) {
@ -434,13 +448,11 @@ class CountSubStatementCST extends H2RCST {
this.to = to; this.to = to;
this.opt = opt; this.opt = opt;
} }
validate(language) { validate(language, context) {
utilities_1.unusedParameter(language, "Count does not need checking"); utilities_1.unusedParameter(language, "Count does not need checking");
utilities_1.unusedParameter(context, "Context is not needed");
const errors = []; const errors = [];
if (this.from < 0) { if (this.to !== null && ((this.opt === "exclusive" && (this.to - 1) <= this.from) || this.to <= this.from)) {
errors.push(this.error("Value cannot be negative"));
}
else if (this.to !== null && ((this.opt === "exclusive" && (this.to - 1) <= this.from) || this.to <= this.from)) {
errors.push(this.error("Values must be in range of eachother")); errors.push(this.error("Values must be in range of eachother"));
} }
return errors; return errors;
@ -483,43 +495,24 @@ class MatchStatementCST extends StatementCST {
* Constructor for MatchStatementCST * Constructor for MatchStatementCST
* *
* @param tokens Tokens used to calculate where an error occured * @param tokens Tokens used to calculate where an error occured
* @param matches * @param matches the list of matches
*/ */
constructor(tokens, completely_optional, matches) { constructor(tokens, completely_optional, matches) {
super(tokens); super(tokens);
this.completely_optional = completely_optional; this.completely_optional = completely_optional;
this.matches = matches; this.matches = matches;
} }
validate(language) { validate(language, context) {
const errors = []; const errors = [];
for (const match of this.matches) { for (const match of this.matches) {
utilities_1.append(errors, match.statement.validate(language)); utilities_1.append(errors, match.statement.validate(language, context));
} }
return errors; return errors;
} }
toRegex(language) { toRegex(language) {
let final_matches = this.matches.map((x) => { let final_matches = this.matches.map((x) => x.toRegex(language)).join("");
let match_stmt = x.statement.toRegex(language);
// need to group if optional and ungrouped
if (x.optional) {
if (!utilities_1.isSingleRegexCharacter(match_stmt)) {
// don't re-group a group
if (match_stmt[0] !== "(" && match_stmt[match_stmt.length - 1] !== ")") {
match_stmt = "(?:" + match_stmt + ")";
}
}
match_stmt += "?";
}
return match_stmt;
}).join("");
if (this.completely_optional) { if (this.completely_optional) {
if (!utilities_1.isSingleRegexCharacter(final_matches)) { final_matches = generator_helper_1.groupIfRequired(final_matches) + "?";
// don't re-group a group
if (final_matches[0] !== "(" && final_matches[final_matches.length - 1] !== ")") {
final_matches = "(?:" + final_matches + ")";
}
}
final_matches += "?";
} }
return final_matches; return final_matches;
} }
@ -545,18 +538,18 @@ class RepeatStatementCST extends StatementCST {
this.count = count; this.count = count;
this.statements = statements; this.statements = statements;
} }
validate(language) { validate(language, context) {
const errors = []; const errors = [];
if (this.count !== null) { if (this.count !== null) {
utilities_1.append(errors, this.count.validate(language)); utilities_1.append(errors, this.count.validate(language, context));
} }
for (const statement of this.statements) { for (const statement of this.statements) {
utilities_1.append(errors, statement.validate(language)); utilities_1.append(errors, statement.validate(language, context));
} }
return errors; return errors;
} }
toRegex(language) { toRegex(language) {
let str = "(?:" + this.statements.map((x) => x.toRegex(language)).join("") + ")"; let str = generator_helper_1.groupIfRequired(this.statements.map((x) => x.toRegex(language)).join(""));
if (this.count) { if (this.count) {
str += this.count.toRegex(language); str += this.count.toRegex(language);
// group for optionality because count would be incorrect otherwise // group for optionality because count would be incorrect otherwise
@ -595,14 +588,19 @@ class GroupStatementCST extends StatementCST {
this.name = name; this.name = name;
this.statements = statements; this.statements = statements;
} }
validate(language) { validate(language, context) {
const errors = []; const errors = [];
// All languages currently support named groups if (this.name !== null) {
//if (false) { if (context.hasGroup(this.name)) {
// errors.push(this.error("This language does not support named groups")); const past_group = context.groups[this.name];
//} errors.push(this.error(`Group with name "${this.name}" was already defined here: ${past_group.startLine}:${past_group.startLine}-${past_group.startLine}:${past_group.startLine + past_group.length}`));
}
else {
context.addGroup(this.name, this.tokens);
}
}
for (const statement of this.statements) { for (const statement of this.statements) {
utilities_1.append(errors, statement.validate(language)); utilities_1.append(errors, statement.validate(language, context));
} }
return errors; return errors;
} }
@ -623,6 +621,169 @@ class GroupStatementCST extends StatementCST {
} }
} }
exports.GroupStatementCST = GroupStatementCST; exports.GroupStatementCST = GroupStatementCST;
/**
 * Concrete Syntax Tree for a Backreference statement
 *
 * @internal
 */
class BackrefStatementCST extends StatementCST {
    /**
     * Constructor for BackrefStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param optional is this backref optional
     * @param count optional number of times to repeat
     * @param name the group name to call
     */
    constructor(tokens, optional, count, name) {
        super(tokens);
        this.optional = optional;
        this.count = count;
        this.name = name;
    }
    /**
     * Checks that the referenced group is already known and that the count (if any) is valid
     *
     * @param language the regex dialect we're validating
     * @param context the generator context holding the groups defined so far
     * @returns A list of errors
     */
    validate(language, context) {
        const errors = [];
        if (!context.hasGroup(this.name)) {
            errors.push(this.error(`Cannot call group with name "${this.name}" as it was never previously defined`));
        }
        if (this.count !== null) {
            utilities_1.append(errors, this.count.validate(language, context));
        }
        return errors;
    }
    /**
     * Generates the dialect specific reference to the named group
     *
     * @param language the regex dialect to generate for
     * @returns a regular expression fragment
     */
    toRegex(language) {
        let str = "";
        switch (language) {
            case RegexDialect.Python:
                str = `(?P=${this.name})`;
                break;
            case RegexDialect.DotNet:
            case RegexDialect.Java:
                str = `\\k<${this.name}>`;
                break;
            default:
                // NOTE(review): PCRE documents \g<name> as a subroutine call and JavaScript
                // spells named backreferences \k<name> - confirm this default is intended
                str = `\\g<${this.name}>`;
                break;
        }
        if (this.count) {
            str += this.count.toRegex(language);
            // group for optionality because count would be incorrect otherwise
            if (this.optional) {
                str = "(?:" + str + ")?";
            }
        }
        else if (this.optional) {
            // append the quantifier; assigning "?" here would drop the reference itself
            str += "?";
        }
        return str;
    }
}
exports.BackrefStatementCST = BackrefStatementCST;
/**
 * Concrete Syntax Tree for an If Pattern statement
 *
 * @remarks A conditional whose test is a list of matches rather than a group name
 * @internal
 */
class IfPatternStatementCST extends StatementCST {
    /**
     * Constructor for IfPatternStatementCST
     *
     * @param tokens Tokens used to calculate where an error occured
     * @param matches list of matches to test against
     * @param true_statements true path
     * @param false_statements false path
     */
    constructor(tokens, matches, true_statements, false_statements) {
        super(tokens);
        this.matches = matches;
        this.true_statements = true_statements;
        this.false_statements = false_statements;
    }
    /**
     * Reports dialects that cannot express this conditional, then validates every child
     *
     * @param language the regex dialect we're validating
     * @param context the generator context
     * @returns A list of errors
     */
    validate(language, context) {
        const errors = [];
        if (language === RegexDialect.Java || language === RegexDialect.JS) {
            errors.push(this.error("This language does not support conditionals"));
        }
        else if (language === RegexDialect.Python) {
            errors.push(this.error("This language does not support pattern conditionals"));
        }
        // children are visited in source order: the test, the true path, then the false path
        const children = [...this.matches, ...this.true_statements, ...this.false_statements];
        for (const child of children) {
            utilities_1.append(errors, child.validate(language, context));
        }
        return errors;
    }
    /**
     * Generates the conditional; the false path is only emitted when it has statements
     *
     * @param language the regex dialect to generate for
     * @returns a regular expression fragment
     */
    toRegex(language) {
        const render = (nodes) => nodes.map((node) => node.toRegex(language)).join("");
        const condition = render(this.matches);
        const when_true = generator_helper_1.groupIfRequired(render(this.true_statements));
        if (this.false_statements.length === 0) {
            return `(?(${condition})${when_true})`;
        }
        const when_false = generator_helper_1.groupIfRequired(render(this.false_statements));
        return `(?(${condition})${when_true}|${when_false})`;
    }
}
exports.IfPatternStatementCST = IfPatternStatementCST;
/**
 * Concrete Syntax Tree for an If group Ident statement
 *
 * @remarks A conditional whose test is whether a named group exists
 * @internal
 */
class IfIdentStatementCST extends StatementCST {
    /**
     * Constructor for IfIdentStatementCST
     *
     * @param tokens Tokens used to calculate where an error occured
     * @param identifier the group identifier to check
     * @param true_statements true path
     * @param false_statements false path
     */
    constructor(tokens, identifier, true_statements, false_statements) {
        super(tokens);
        this.identifier = identifier;
        this.true_statements = true_statements;
        this.false_statements = false_statements;
    }
    /**
     * Reports unsupported dialects and unknown groups, then validates both paths
     *
     * @param language the regex dialect we're validating
     * @param context the generator context holding the groups defined so far
     * @returns A list of errors
     */
    validate(language, context) {
        const errors = [];
        const unsupported = language === RegexDialect.Java || language === RegexDialect.JS;
        if (unsupported) {
            errors.push(this.error("This language does not support conditionals"));
        }
        if (!context.hasGroup(this.identifier)) {
            errors.push(this.error(`Group with name "${this.identifier}" does not exist`));
        }
        // the true path is validated before the false path
        const branches = [...this.true_statements, ...this.false_statements];
        for (const statement of branches) {
            utilities_1.append(errors, statement.validate(language, context));
        }
        return errors;
    }
    /**
     * Generates the conditional; the false path is only emitted when it has statements
     *
     * @param language the regex dialect to generate for
     * @returns a regular expression fragment
     */
    toRegex(language) {
        const render = (nodes) => nodes.map((node) => node.toRegex(language)).join("");
        // be more clear with languages that support it
        const condition = language === RegexDialect.Boost
            ? "<" + this.identifier + ">"
            : this.identifier;
        const when_true = generator_helper_1.groupIfRequired(render(this.true_statements));
        if (this.false_statements.length === 0) {
            return `(?(${condition})${when_true})`;
        }
        const when_false = generator_helper_1.groupIfRequired(render(this.false_statements));
        return `(?(${condition})${when_true}|${when_false})`;
    }
}
exports.IfIdentStatementCST = IfIdentStatementCST;
/** /**
* Concrete Syntax Tree for a regular expression * Concrete Syntax Tree for a regular expression
* *
@ -642,10 +803,10 @@ class RegularExpressionCST extends H2RCST {
this.usings = usings; this.usings = usings;
this.statements = statements; this.statements = statements;
} }
validate(language) { validate(language, context) {
const errors = this.usings.validate(language); const errors = this.usings.validate(language, context);
for (const statement of this.statements) { for (const statement of this.statements) {
utilities_1.append(errors, statement.validate(language)); utilities_1.append(errors, statement.validate(language, context));
} }
return errors; return errors;
} }
@ -656,87 +817,3 @@ class RegularExpressionCST extends H2RCST {
} }
} }
exports.RegularExpressionCST = RegularExpressionCST; exports.RegularExpressionCST = RegularExpressionCST;
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @internal
*/
function minimizeMatchString(arr) {
return minMatchString(arr, 0);
}
exports.minimizeMatchString = minimizeMatchString;
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* Strips the prefix and suffix shared by every element, recurses on what is left,
* and glues the shared parts back around the result.
*
* @param arr the array
* @param depth must be 0 for initial call; nested calls wrap a plain alternation in a non-capturing group
* @returns the minimized string
* @internal
*/
function minMatchString(arr, depth = 0) {
// base case: arr is empty
if (arr.length === 0) {
return "";
}
// base case: arr has 1 element (must have at least 2, so this means this value is optional)
if (arr.length === 1) {
return utilities_1.first(arr) + "?";
}
// remove duplicates
arr = [...new Set(arr)];
// base case: arr has 1 element (after duplicate removal means this is required)
if (arr.length === 1) {
return utilities_1.first(arr);
}
// base case: arr is all single letters, so a character class is enough
if (arr.every(utilities_1.isSingleRegexCharacter)) {
return "[" + arr.join("") + "]";
}
// Both candidates start out as the whole first element and are shrunk
// against every other element until only the shared prefix/suffix remains
let longest_begin_substring = utilities_1.first(arr);
let longest_end_substring = utilities_1.first(arr);
for (let i = 1; i < arr.length; i++) {
// shrink the prefix to the first position where arr[i] differs
for (let j = 0; j < longest_begin_substring.length; j++) {
if (arr[i].length < j || longest_begin_substring[j] !== arr[i][j]) {
longest_begin_substring = longest_begin_substring.substr(0, j);
break;
}
}
// same walk from the right hand side: keep only the last j characters that matched
for (let j = 0; j < longest_end_substring.length; j++) {
if (arr[i].length - j < 0 || longest_end_substring[longest_end_substring.length - j - 1] !== arr[i][arr[i].length - j - 1]) {
longest_end_substring = longest_end_substring.substr(longest_end_substring.length - j, longest_end_substring.length);
break;
}
}
// nothing in common any more, no point in checking the remaining elements
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
break;
}
}
// No matches whatsoever
// *technically* we can optimize further, but that is a VERY non-trivial problem
// For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
// a nested alternation must be grouped so "|" does not swallow the surrounding prefix/suffix
if (depth > 0) {
return "(?:" + arr.join("|") + ")";
}
else {
return arr.join("|");
}
}
// we have some matches
else {
// remove begin (if exists) and end (if exists) from each element and remove empty strings
const begin_pos = longest_begin_substring.length;
const end_pos = longest_end_substring.length;
const similar_matches = [];
for (const ele of arr) {
// NOTE(review): the prefix and suffix are computed independently, so begin_pos + end_pos
// can exceed ele.length (e.g. "aa" vs "aaa"); substring() swaps its arguments in that case - confirm
const match = ele.substring(begin_pos, ele.length - end_pos);
// NOTE(review): an element equal to prefix + suffix is dropped here; only the single-leftover
// base case adds "?", so with two or more leftovers the result looks no longer optional - confirm
if (match.length !== 0) {
similar_matches.push(match);
}
}
return longest_begin_substring + minMatchString(similar_matches, depth + 1) + longest_end_substring;
}
}

23
lib/generator_helper.d.ts vendored Normal file
View File

@ -0,0 +1,23 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @returns the minimized string; an array with exactly one element is returned as that element unchanged
* @internal
*/
export declare function minimizeMatchString(arr: string[]): string;
/**
* Groups a regex fragment if it needs to be grouped
*
* @param fragment fragment of regular expression to potentially group
* @returns a non-capturing group if there needs to be one
* @internal
*/
export declare function groupIfRequired(fragment: string): string;
/**
* Checks to see if fragment has a + or * at the end and has a repetition statement
*
* @param fragment fragment of regular expression
* @param repetition repetition that may clobber the fragment
* @returns the resulting fragment; presumably the two are combined without stacking a second + or * (implementation not visible here, confirm)
* @internal
*/
export declare function dontClobberRepetition(fragment: string, repetition: string): string;

203
lib/generator_helper.js Normal file
View File

@ -0,0 +1,203 @@
"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true });
exports.dontClobberRepetition = exports.groupIfRequired = exports.minimizeMatchString = void 0;
/**
* Includes helper functions for the Generator
* @packageDocumentation
*/
const utilities_1 = require("./utilities");
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @internal
*/
function minimizeMatchString(arr) {
// don't process an array of length 1, otherwise you'll get the wrong result
if (arr.length === 1) {
return utilities_1.first(arr);
}
return minMatchString(arr, 0);
}
exports.minimizeMatchString = minimizeMatchString;
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array
* @param depth must be 0 for initial call
* @returns an optimized string
* @internal
*/
function minMatchString(arr, depth = 0) {
// base case: arr is empty
if (arr.length === 0) {
return "";
}
// base case: arr has 1 element (must have at least 2, so this means this value is optional)
if (arr.length === 1) {
return utilities_1.first(arr) + "?";
}
// remove duplicates
arr = [...new Set(arr)];
// base case: arr has 1 element (after duplicate removal means this is required)
if (arr.length === 1) {
return utilities_1.first(arr);
}
// base case: arr is all single letters
if (arr.every(utilities_1.isSingleRegexCharacter)) {
return "[" + arr.join("") + "]";
}
// now the real magic begins
// You are not expected to understand this
let longest_begin_substring = utilities_1.first(arr);
let longest_end_substring = utilities_1.first(arr);
for (let i = 1; i < arr.length; i++) {
// reduce longest_substring to match everything
for (let j = 0; j < longest_begin_substring.length; j++) {
if (arr[i].length < j || longest_begin_substring[j] !== arr[i][j]) {
longest_begin_substring = longest_begin_substring.substr(0, j);
break;
}
}
for (let j = 0; j < longest_end_substring.length; j++) {
if (arr[i].length - j < 0 || longest_end_substring[longest_end_substring.length - j - 1] !== arr[i][arr[i].length - j - 1]) {
longest_end_substring = longest_end_substring.substr(longest_end_substring.length - j, longest_end_substring.length);
break;
}
}
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
break;
}
}
// No matches whatsoever
// *technically* we can optimize further, but that is a VERY non-trivial problem
// For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
if (depth > 0) {
return "(?:" + arr.join("|") + ")";
}
else {
return arr.join("|");
}
}
// we have some matches
else {
// remove begin (if exists) and end (if exists) from each element and remove empty strings
const begin_pos = longest_begin_substring.length;
const end_pos = longest_end_substring.length;
const similar_matches = [];
for (const ele of arr) {
const match = ele.substring(begin_pos, ele.length - end_pos);
if (match.length !== 0) {
similar_matches.push(match);
}
}
return longest_begin_substring + minMatchString(similar_matches, depth + 1) + longest_end_substring;
}
}
/**
* Groups a regex fragment if it needs to be grouped
*
* @param fragment fragment of regular expression to potentially group
* @returns a non-capturing group if there needs to be one
* @internal
*/
function groupIfRequired(fragment) {
if (utilities_1.isSingleRegexCharacter(fragment)) {
return fragment;
}
if (fragment[0] === "(" && fragment[fragment.length - 1] === ")") {
let bracket_count = 0;
for (let i = 1; i < fragment.length - 2; i++) {
if (fragment[i] === "\\") {
i++;
}
else if (fragment[i] === "(") {
bracket_count++;
}
else if (fragment[i] === ")") {
bracket_count--;
if (bracket_count === -1) {
break;
}
}
}
return bracket_count === 0 ? fragment : "(?:" + fragment + ")";
}
else if (fragment[0] === "[" && fragment[fragment.length - 1] === "]") {
let bracket_count = 0;
for (let i = 1; i < fragment.length - 2; i++) {
if (fragment[i] === "\\") {
i++;
}
//you'll never have a raw [ inside a []
//else if (fragment[i] === "[") {
// bracket_count++;
//}
else if (fragment[i] === "]") {
bracket_count--;
if (bracket_count === -1) {
break;
}
}
}
return bracket_count === 0 ? fragment : "(?:" + fragment + ")";
}
else {
return "(?:" + fragment + ")";
}
}
exports.groupIfRequired = groupIfRequired;
/**
 * Checks to see if fragment has a + or * at the end and has a repetition statement
 *
 * A trailing "+" or "*" only counts as a quantifier when it is not escaped
 * (an even number of backslashes in front of it). An escaped "\+" or "\*" is a
 * literal character and simply gets the repetition appended.
 *
 * @param fragment fragment of regular expression
 * @param repetition repetition that may clobber the fragment
 * @returns the fragment with the repetition merged in
 */
function dontClobberRepetition(fragment, repetition) {
    // nothing to apply: never strip an existing quantifier for an empty repetition
    if (repetition === "") {
        return fragment;
    }
    const last_index = fragment.length - 1;
    const last_char = fragment[last_index];
    // count the backslashes directly in front of the last character
    let backslashes = 0;
    while (backslashes < last_index && fragment[last_index - 1 - backslashes] === "\\") {
        backslashes++;
    }
    const has_quantifier = (last_char === "+" || last_char === "*") && backslashes % 2 === 0;
    if (!has_quantifier) {
        return fragment + repetition;
    }
    // non-greedy qualifier
    if (repetition === "?") {
        return fragment + repetition;
    }
    // ignore: "*" never changes an existing + or *, and "+" on "+" is already +
    if (repetition === "*" || (repetition === "+" && last_char === "+")) {
        return fragment;
    }
    // + can be replaced by a count as long as that count is > 0;
    // a count starting at 0 has to wrap the fragment instead
    if (last_char === "+" && repetition.startsWith("{0")) {
        return "(?:" + fragment + ")" + repetition;
    }
    // remove the + or * and replace with the repetition
    return fragment.substring(0, last_index) + repetition;
}
exports.dontClobberRepetition = dontClobberRepetition;

View File

@ -83,7 +83,7 @@ class ParseResult {
* @public * @public
*/ */
validate(language) { validate(language) {
return this.regexp_cst.validate(language).map(utilities_1.CommonError.fromSemanticError); return this.regexp_cst.validate(language, new generator_1.GeneratorContext()).map(utilities_1.CommonError.fromSemanticError);
} }
/** /**
* Generate a regular expression string based on the parse result * Generate a regular expression string based on the parse result
@ -499,12 +499,91 @@ class Human2RegexParser extends chevrotain_1.EmbeddedActionsParser {
tokens.push($.CONSUME(T.Outdent)); tokens.push($.CONSUME(T.Outdent));
return new generator_1.RepeatStatementCST(tokens, optional, count, statements); return new generator_1.RepeatStatementCST(tokens, optional, count, statements);
}); });
// Grammar rule for a backreference statement. In order it accepts:
//   an optional Optional token, a Call token, an optional CountSubStatement,
//   an optional phrase made of [The] Group [Called], an Identifier and an EndOfLine.
// Produces a BackrefStatementCST from the collected tokens, the optional flag,
// the count (null when absent) and the identifier's text.
// The numeric suffixes (OPTION5, CONSUME4, ...) keep each DSL call distinct within the rule.
const BackrefStatement = $.RULE("BackrefStatement", () => {
// only the Optional, Call and EndOfLine tokens are recorded here
const tokens = [];
let optional = false;
let count = null;
$.OPTION5(() => {
tokens.push($.CONSUME(T.Optional));
optional = true;
});
tokens.push($.CONSUME(T.Call));
$.OPTION6(() => count = $.SUBRULE(CountSubStatement));
// filler words: "the" and "called" may be omitted, but "group" is required once the phrase starts
$.OPTION7(() => {
$.OPTION(() => $.CONSUME(T.The));
$.CONSUME(T.Group);
$.OPTION2(() => $.CONSUME(T.Called));
});
// the name of the group being referenced
const name = $.CONSUME(T.Identifier).image;
tokens.push($.CONSUME4(T.EndOfLine));
return new generator_1.BackrefStatementCST(tokens, optional, count, name);
});
// Grammar rule for an if statement. After the If token the condition is either
//   - a bare Identifier, or
//   - Match followed by one or more match sub statements joined by "and", "then" or "and then",
//     each of which may be preceded by Optional.
// The condition line is followed by an indented block of at least one statement and an
// optional Else line with its own indented block of at least one statement.
// Produces an IfIdentStatementCST when an identifier was given, otherwise an IfPatternStatementCST.
const IfStatement = $.RULE("IfStatement", () => {
// only the If token and the EndOfLine of the condition line are recorded here
const tokens = [];
// match statement values collected for the pattern form
const msv = [];
let optional = false;
const true_statements = [];
const false_statements = [];
// stays "" for the pattern form; used below to pick the CST type
let name = "";
tokens.push($.CONSUME(T.If));
$.OR2([
{ ALT: () => {
name = $.CONSUME(T.Identifier).image;
} },
{ ALT: () => {
$.CONSUME(T.Match);
$.OPTION4(() => {
$.CONSUME3(T.Optional);
optional = true;
});
msv.push(new generator_1.MatchStatementValue(optional, $.SUBRULE(MatchSubStatement)));
$.MANY(() => {
// separator between sub statements: "[and] then" or "and"
$.OR([
{ ALT: () => {
$.OPTION2(() => $.CONSUME2(T.And));
$.CONSUME(T.Then);
} },
{ ALT: () => $.CONSUME(T.And) },
]);
// the optional flag applies per sub statement, so reset it before each one
optional = false;
$.OPTION3(() => {
$.CONSUME2(T.Optional);
optional = true;
});
msv.push(new generator_1.MatchStatementValue(optional, $.SUBRULE2(MatchSubStatement)));
});
} }
]);
tokens.push($.CONSUME3(T.EndOfLine));
// statements executed when the condition holds
$.CONSUME2(T.Indent);
$.AT_LEAST_ONE2(() => {
true_statements.push($.SUBRULE(Statement));
});
$.CONSUME2(T.Outdent);
// optional else branch; false_statements stays empty when it is absent
$.OPTION(() => {
$.CONSUME(T.Else);
$.CONSUME4(T.EndOfLine);
$.CONSUME3(T.Indent);
$.AT_LEAST_ONE3(() => {
false_statements.push($.SUBRULE2(Statement));
});
$.CONSUME3(T.Outdent);
});
if (name === "") {
return new generator_1.IfPatternStatementCST(tokens, msv, true_statements, false_statements);
}
else {
return new generator_1.IfIdentStatementCST(tokens, name, true_statements, false_statements);
}
});
// statement super class // statement super class
const Statement = $.RULE("Statement", () => { const Statement = $.RULE("Statement", () => {
return $.OR([ return $.OR([
{ ALT: () => $.SUBRULE(MatchStatement) }, { ALT: () => $.SUBRULE(MatchStatement) },
{ ALT: () => $.SUBRULE(GroupStatement) }, { ALT: () => $.SUBRULE(GroupStatement) },
{ ALT: () => $.SUBRULE(RepeatStatement) } { ALT: () => $.SUBRULE(RepeatStatement) },
{ ALT: () => $.SUBRULE(BackrefStatement) },
{ ALT: () => $.SUBRULE(IfStatement) }
]); ]);
}); });
// full regex // full regex

4
lib/tokens.d.ts vendored
View File

@ -51,6 +51,10 @@
/** @internal */ export declare const CaseInsensitive: import("chevrotain").TokenType; /** @internal */ export declare const CaseInsensitive: import("chevrotain").TokenType;
/** @internal */ export declare const CaseSensitive: import("chevrotain").TokenType; /** @internal */ export declare const CaseSensitive: import("chevrotain").TokenType;
/** @internal */ export declare const OrMore: import("chevrotain").TokenType; /** @internal */ export declare const OrMore: import("chevrotain").TokenType;
/** @internal */ export declare const Call: import("chevrotain").TokenType;
/** @internal */ export declare const The: import("chevrotain").TokenType;
/** @internal */ export declare const If: import("chevrotain").TokenType;
/** @internal */ export declare const Else: import("chevrotain").TokenType;
/** @internal */ export declare const EndOfLine: import("chevrotain").TokenType; /** @internal */ export declare const EndOfLine: import("chevrotain").TokenType;
/** @internal */ export declare const WS: import("chevrotain").TokenType; /** @internal */ export declare const WS: import("chevrotain").TokenType;
/** @internal */ export declare const SingleLineComment: import("chevrotain").TokenType; /** @internal */ export declare const SingleLineComment: import("chevrotain").TokenType;

View File

@ -1,7 +1,8 @@
"use strict"; "use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true }); Object.defineProperty(exports, "__esModule", { value: true });
exports.AllTokens = exports.Outdent = exports.Indent = exports.StringLiteral = exports.NumberLiteral = exports.Identifier = exports.MultilineComment = exports.SingleLineComment = exports.WS = exports.EndOfLine = exports.OrMore = exports.CaseSensitive = exports.CaseInsensitive = exports.CarriageReturn = exports.Newline = exports.Repeat = exports.Called = exports.Create = exports.To = exports.From = exports.Exclusive = exports.Inclusive = exports.Exactly = exports.Times = exports.A = exports.Group = exports.Linefeed = exports.Tab = exports.Between = exports.Not = exports.Matching = exports.Exact = exports.Multiline = exports.Global = exports.Using = exports.Unicode = exports.Number = exports.Boundary = exports.Whitespace = exports.Integer = exports.Decimal = exports.Letter = exports.Character = exports.Digit = exports.Word = exports.And = exports.Or = exports.Anything = exports.Then = exports.Match = exports.Optional = exports.Ten = exports.Nine = exports.Eight = exports.Seven = exports.Six = exports.Five = exports.Four = exports.Three = exports.Two = exports.One = exports.Zero = void 0; exports.CaseInsensitive = exports.CarriageReturn = exports.Newline = exports.Repeat = exports.Called = exports.Create = exports.To = exports.From = exports.Exclusive = exports.Inclusive = exports.Exactly = exports.Times = exports.A = exports.Group = exports.Linefeed = exports.Tab = exports.Between = exports.Not = exports.Matching = exports.Exact = exports.Multiline = exports.Global = exports.Using = exports.Unicode = exports.Number = exports.Boundary = exports.Whitespace = exports.Integer = exports.Decimal = exports.Letter = exports.Character = exports.Digit = exports.Word = exports.And = exports.Or = exports.Anything = exports.Then = exports.Match = exports.Optional = exports.Ten = exports.Nine = exports.Eight = exports.Seven = exports.Six = exports.Five = exports.Four = exports.Three = exports.Two = exports.One = exports.Zero = void 0;
exports.AllTokens = exports.Outdent = exports.Indent = exports.StringLiteral = exports.NumberLiteral = exports.Identifier = exports.MultilineComment = exports.SingleLineComment = exports.WS = exports.EndOfLine = exports.Else = exports.If = exports.The = exports.Call = exports.OrMore = exports.CaseSensitive = void 0;
/** /**
* The tokens required for Human2Regex * The tokens required for Human2Regex
* @packageDocumentation * @packageDocumentation
@ -52,32 +53,17 @@ const chevrotain_1 = require("chevrotain");
/** @internal */ exports.From = chevrotain_1.createToken({ name: "From", pattern: /from/i }); /** @internal */ exports.From = chevrotain_1.createToken({ name: "From", pattern: /from/i });
/** @internal */ exports.To = chevrotain_1.createToken({ name: "To", pattern: /(to|through|thru|\-|\.\.\.?)/i }); /** @internal */ exports.To = chevrotain_1.createToken({ name: "To", pattern: /(to|through|thru|\-|\.\.\.?)/i });
/** @internal */ exports.Create = chevrotain_1.createToken({ name: "Create", pattern: /create(s)?/i }); /** @internal */ exports.Create = chevrotain_1.createToken({ name: "Create", pattern: /create(s)?/i });
/** @internal */ exports.Called = chevrotain_1.createToken({ name: "Called", pattern: /name(d)?|call(ed)?/i }); /** @internal */ exports.Called = chevrotain_1.createToken({ name: "Called", pattern: /named|called/i });
/** @internal */ exports.Repeat = chevrotain_1.createToken({ name: "Repeat", pattern: /repeat(s|ing)?/i }); /** @internal */ exports.Repeat = chevrotain_1.createToken({ name: "Repeat", pattern: /repeat(s|ing)?/i });
/** @internal */ exports.Newline = chevrotain_1.createToken({ name: "Newline", pattern: /(new line|newline)/i }); /** @internal */ exports.Newline = chevrotain_1.createToken({ name: "Newline", pattern: /(new line|newline)/i });
/** @internal */ exports.CarriageReturn = chevrotain_1.createToken({ name: "CarriageReturn", pattern: /carriage return/i }); /** @internal */ exports.CarriageReturn = chevrotain_1.createToken({ name: "CarriageReturn", pattern: /carriage return/i });
/** @internal */ exports.CaseInsensitive = chevrotain_1.createToken({ name: "CaseInsensitive", pattern: /case insensitive/i }); /** @internal */ exports.CaseInsensitive = chevrotain_1.createToken({ name: "CaseInsensitive", pattern: /case insensitive/i });
/** @internal */ exports.CaseSensitive = chevrotain_1.createToken({ name: "CaseSensitive", pattern: /case sensitive/i }); /** @internal */ exports.CaseSensitive = chevrotain_1.createToken({ name: "CaseSensitive", pattern: /case sensitive/i });
/** @internal */ exports.OrMore = chevrotain_1.createToken({ name: "OrMore", pattern: /\+|or more/i }); /** @internal */ exports.OrMore = chevrotain_1.createToken({ name: "OrMore", pattern: /\+|or more/i });
/* /** @internal */ exports.Call = chevrotain_1.createToken({ name: "Call", pattern: /call|invoke|execute|(re ?)?run/i });
//Not being used currently /** @internal */ exports.The = chevrotain_1.createToken({ name: "The", pattern: /the/i });
export const Of = createToken({name: "Of", pattern: /of/i}); /** @internal */ exports.If = chevrotain_1.createToken({ name: "If", pattern: /if/i });
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i}); /** @internal */ exports.Else = chevrotain_1.createToken({ name: "Else", pattern: /else|otherwise/i });
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s) with?/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)? with/i});
export const Else = createToken({name: "Else", pattern: /(other wise|otherwise|else)/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const By = createToken({name: "By", pattern: /by/i});
*/
/** @internal */ exports.EndOfLine = chevrotain_1.createToken({ name: "EOL", pattern: /\n/ }); /** @internal */ exports.EndOfLine = chevrotain_1.createToken({ name: "EOL", pattern: /\n/ });
/** @internal */ exports.WS = chevrotain_1.createToken({ name: "Whitespace", pattern: /[^\S\n]+/, start_chars_hint: [" ", "\r"], group: chevrotain_1.Lexer.SKIPPED }); /** @internal */ exports.WS = chevrotain_1.createToken({ name: "Whitespace", pattern: /[^\S\n]+/, start_chars_hint: [" ", "\r"], group: chevrotain_1.Lexer.SKIPPED });
/** @internal */ exports.SingleLineComment = chevrotain_1.createToken({ name: "SingleLineComment", pattern: /(#|\/\/).*/, group: chevrotain_1.Lexer.SKIPPED }); /** @internal */ exports.SingleLineComment = chevrotain_1.createToken({ name: "SingleLineComment", pattern: /(#|\/\/).*/, group: chevrotain_1.Lexer.SKIPPED });
@ -120,22 +106,11 @@ exports.AllTokens = [
exports.Whitespace, exports.Whitespace,
exports.Number, exports.Number,
exports.Unicode, exports.Unicode,
/* exports.Called,
Of, exports.Call,
As, exports.If,
If, exports.Else,
Start, exports.The,
Ends,
Else,
Unless,
While,
More,
Nothing,
By,
The,
None,
Neither,
*/
exports.Using, exports.Using,
exports.Global, exports.Global,
exports.Multiline, exports.Multiline,
@ -151,7 +126,6 @@ exports.AllTokens = [
exports.Exclusive, exports.Exclusive,
exports.From, exports.From,
exports.Create, exports.Create,
exports.Called,
exports.Repeat, exports.Repeat,
exports.Newline, exports.Newline,
exports.CarriageReturn, exports.CarriageReturn,

3
lib/utilities.d.ts vendored
View File

@ -130,6 +130,7 @@ export declare class CommonError {
* *
* @param error The lexing error * @param error The lexing error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
static fromLexError(error: ILexingError): CommonError; static fromLexError(error: ILexingError): CommonError;
/** /**
@ -137,6 +138,7 @@ export declare class CommonError {
* *
* @param error The parsing error * @param error The parsing error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
static fromParseError(error: IRecognitionException): CommonError; static fromParseError(error: IRecognitionException): CommonError;
/** /**
@ -144,6 +146,7 @@ export declare class CommonError {
* *
* @param error The semantic error * @param error The semantic error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
static fromSemanticError(error: ISemanticError): CommonError; static fromSemanticError(error: ISemanticError): CommonError;
/** /**

View File

@ -181,6 +181,7 @@ class CommonError {
* *
* @param error The lexing error * @param error The lexing error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
static fromLexError(error) { static fromLexError(error) {
// not really fond of --> and <-- // not really fond of --> and <--
@ -192,6 +193,7 @@ class CommonError {
* *
* @param error The parsing error * @param error The parsing error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
static fromParseError(error) { static fromParseError(error) {
var _a, _b, _c; var _a, _b, _c;
@ -204,6 +206,7 @@ class CommonError {
* *
* @param error The semantic error * @param error The semantic error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
static fromSemanticError(error) { static fromSemanticError(error) {
return new CommonError("Semantic Error", error.startLine, error.startColumn, error.length, error.message); return new CommonError("Semantic Error", error.startLine, error.startColumn, error.length, error.message);

100
package-lock.json generated
View File

@ -1,6 +1,6 @@
{ {
"name": "human2regex", "name": "human2regex",
"version": "1.0.2", "version": "1.1.0",
"lockfileVersion": 1, "lockfileVersion": 1,
"requires": true, "requires": true,
"dependencies": { "dependencies": {
@ -1607,13 +1607,13 @@
"dev": true "dev": true
}, },
"@typescript-eslint/eslint-plugin": { "@typescript-eslint/eslint-plugin": {
"version": "4.7.0", "version": "4.8.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-4.7.0.tgz", "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-4.8.1.tgz",
"integrity": "sha512-li9aiSVBBd7kU5VlQlT1AqP0uWGDK6JYKUQ9cVDnOg34VNnd9t4jr0Yqc/bKxJr/tDCPDaB4KzoSFN9fgVxe/Q==", "integrity": "sha512-d7LeQ7dbUrIv5YVFNzGgaW3IQKMmnmKFneRWagRlGYOSfLJVaRbj/FrBNOBC1a3tVO+TgNq1GbHvRtg1kwL0FQ==",
"dev": true, "dev": true,
"requires": { "requires": {
"@typescript-eslint/experimental-utils": "4.7.0", "@typescript-eslint/experimental-utils": "4.8.1",
"@typescript-eslint/scope-manager": "4.7.0", "@typescript-eslint/scope-manager": "4.8.1",
"debug": "^4.1.1", "debug": "^4.1.1",
"functional-red-black-tree": "^1.0.1", "functional-red-black-tree": "^1.0.1",
"regexpp": "^3.0.0", "regexpp": "^3.0.0",
@ -1622,55 +1622,55 @@
} }
}, },
"@typescript-eslint/experimental-utils": { "@typescript-eslint/experimental-utils": {
"version": "4.7.0", "version": "4.8.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/experimental-utils/-/experimental-utils-4.7.0.tgz", "resolved": "https://registry.npmjs.org/@typescript-eslint/experimental-utils/-/experimental-utils-4.8.1.tgz",
"integrity": "sha512-cymzovXAiD4EF+YoHAB5Oh02MpnXjvyaOb+v+BdpY7lsJXZQN34oIETeUwVT2XfV9rSNpXaIcknDLfupO/tUoA==", "integrity": "sha512-WigyLn144R3+lGATXW4nNcDJ9JlTkG8YdBWHkDlN0lC3gUGtDi7Pe3h5GPvFKMcRz8KbZpm9FJV9NTW8CpRHpg==",
"dev": true, "dev": true,
"requires": { "requires": {
"@types/json-schema": "^7.0.3", "@types/json-schema": "^7.0.3",
"@typescript-eslint/scope-manager": "4.7.0", "@typescript-eslint/scope-manager": "4.8.1",
"@typescript-eslint/types": "4.7.0", "@typescript-eslint/types": "4.8.1",
"@typescript-eslint/typescript-estree": "4.7.0", "@typescript-eslint/typescript-estree": "4.8.1",
"eslint-scope": "^5.0.0", "eslint-scope": "^5.0.0",
"eslint-utils": "^2.0.0" "eslint-utils": "^2.0.0"
} }
}, },
"@typescript-eslint/parser": { "@typescript-eslint/parser": {
"version": "4.7.0", "version": "4.8.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-4.7.0.tgz", "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-4.8.1.tgz",
"integrity": "sha512-+meGV8bMP1sJHBI2AFq1GeTwofcGiur8LoIr6v+rEmD9knyCqDlrQcFHR0KDDfldHIFDU/enZ53fla6ReF4wRw==", "integrity": "sha512-QND8XSVetATHK9y2Ltc/XBl5Ro7Y62YuZKnPEwnNPB8E379fDsvzJ1dMJ46fg/VOmk0hXhatc+GXs5MaXuL5Uw==",
"dev": true, "dev": true,
"requires": { "requires": {
"@typescript-eslint/scope-manager": "4.7.0", "@typescript-eslint/scope-manager": "4.8.1",
"@typescript-eslint/types": "4.7.0", "@typescript-eslint/types": "4.8.1",
"@typescript-eslint/typescript-estree": "4.7.0", "@typescript-eslint/typescript-estree": "4.8.1",
"debug": "^4.1.1" "debug": "^4.1.1"
} }
}, },
"@typescript-eslint/scope-manager": { "@typescript-eslint/scope-manager": {
"version": "4.7.0", "version": "4.8.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-4.7.0.tgz", "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-4.8.1.tgz",
"integrity": "sha512-ILITvqwDJYbcDCROj6+Ob0oCKNg3SH46iWcNcTIT9B5aiVssoTYkhKjxOMNzR1F7WSJkik4zmuqve5MdnA0DyA==", "integrity": "sha512-r0iUOc41KFFbZdPAdCS4K1mXivnSZqXS5D9oW+iykQsRlTbQRfuFRSW20xKDdYiaCoH+SkSLeIF484g3kWzwOQ==",
"dev": true, "dev": true,
"requires": { "requires": {
"@typescript-eslint/types": "4.7.0", "@typescript-eslint/types": "4.8.1",
"@typescript-eslint/visitor-keys": "4.7.0" "@typescript-eslint/visitor-keys": "4.8.1"
} }
}, },
"@typescript-eslint/types": { "@typescript-eslint/types": {
"version": "4.7.0", "version": "4.8.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-4.7.0.tgz", "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-4.8.1.tgz",
"integrity": "sha512-uLszFe0wExJc+I7q0Z/+BnP7wao/kzX0hB5vJn4LIgrfrMLgnB2UXoReV19lkJQS1a1mHWGGODSxnBx6JQC3Sg==", "integrity": "sha512-ave2a18x2Y25q5K05K/U3JQIe2Av4+TNi/2YuzyaXLAsDx6UZkz1boZ7nR/N6Wwae2PpudTZmHFXqu7faXfHmA==",
"dev": true "dev": true
}, },
"@typescript-eslint/typescript-estree": { "@typescript-eslint/typescript-estree": {
"version": "4.7.0", "version": "4.8.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-4.7.0.tgz", "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-4.8.1.tgz",
"integrity": "sha512-5XZRQznD1MfUmxu1t8/j2Af4OxbA7EFU2rbo0No7meb46eHgGkSieFdfV6omiC/DGIBhH9H9gXn7okBbVOm8jw==", "integrity": "sha512-bJ6Fn/6tW2g7WIkCWh3QRlaSU7CdUUK52shx36/J7T5oTQzANvi6raoTsbwGM11+7eBbeem8hCCKbyvAc0X3sQ==",
"dev": true, "dev": true,
"requires": { "requires": {
"@typescript-eslint/types": "4.7.0", "@typescript-eslint/types": "4.8.1",
"@typescript-eslint/visitor-keys": "4.7.0", "@typescript-eslint/visitor-keys": "4.8.1",
"debug": "^4.1.1", "debug": "^4.1.1",
"globby": "^11.0.1", "globby": "^11.0.1",
"is-glob": "^4.0.1", "is-glob": "^4.0.1",
@ -1680,12 +1680,12 @@
} }
}, },
"@typescript-eslint/visitor-keys": { "@typescript-eslint/visitor-keys": {
"version": "4.7.0", "version": "4.8.1",
"resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-4.7.0.tgz", "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-4.8.1.tgz",
"integrity": "sha512-aDJDWuCRsf1lXOtignlfiPODkzSxxop7D0rZ91L6ZuMlcMCSh0YyK+gAfo5zN/ih6WxMwhoXgJWC3cWQdaKC+A==", "integrity": "sha512-3nrwXFdEYALQh/zW8rFwP4QltqsanCDz4CwWMPiIZmwlk9GlvBeueEIbq05SEq4ganqM0g9nh02xXgv5XI3PeQ==",
"dev": true, "dev": true,
"requires": { "requires": {
"@typescript-eslint/types": "4.7.0", "@typescript-eslint/types": "4.8.1",
"eslint-visitor-keys": "^2.0.0" "eslint-visitor-keys": "^2.0.0"
} }
}, },
@ -2917,9 +2917,9 @@
} }
}, },
"codemirror": { "codemirror": {
"version": "5.58.2", "version": "5.58.3",
"resolved": "https://registry.npmjs.org/codemirror/-/codemirror-5.58.2.tgz", "resolved": "https://registry.npmjs.org/codemirror/-/codemirror-5.58.3.tgz",
"integrity": "sha512-K/hOh24cCwRutd1Mk3uLtjWzNISOkm4fvXiMO7LucCrqbh6aJDdtqUziim3MZUI6wOY0rvY1SlL1Ork01uMy6w==" "integrity": "sha512-KBhB+juiyOOgn0AqtRmWyAT3yoElkuvWTI6hsHa9E6GQrl6bk/fdAYcvuqW1/upO9T9rtEtapWdw4XYcNiVDEA=="
}, },
"collect-v8-coverage": { "collect-v8-coverage": {
"version": "1.0.1", "version": "1.0.1",
@ -3087,9 +3087,9 @@
"dev": true "dev": true
}, },
"copy-webpack-plugin": { "copy-webpack-plugin": {
"version": "6.3.0", "version": "6.3.2",
"resolved": "https://registry.npmjs.org/copy-webpack-plugin/-/copy-webpack-plugin-6.3.0.tgz", "resolved": "https://registry.npmjs.org/copy-webpack-plugin/-/copy-webpack-plugin-6.3.2.tgz",
"integrity": "sha512-kQ2cGGQLO6Ov2fe7rEGVxObI17dPeFkv8bRGnUAGZehOcrrObyAR9yWYlFGlJsyWM4EeuC/ytQNQkXxjYotMzg==", "integrity": "sha512-MgJ1uouLIbDg4ST1GzqrGQyKoXY5iPqi6fghFqarijam7FQcBa/r6Rg0VkoIuzx75Xq8iAMghyOueMkWUQ5OaA==",
"dev": true, "dev": true,
"requires": { "requires": {
"cacache": "^15.0.5", "cacache": "^15.0.5",
@ -3977,9 +3977,9 @@
} }
}, },
"eslint": { "eslint": {
"version": "7.13.0", "version": "7.14.0",
"resolved": "https://registry.npmjs.org/eslint/-/eslint-7.13.0.tgz", "resolved": "https://registry.npmjs.org/eslint/-/eslint-7.14.0.tgz",
"integrity": "sha512-uCORMuOO8tUzJmsdRtrvcGq5qposf7Rw0LwkTJkoDbOycVQtQjmnhZSuLQnozLE4TmAzlMVV45eCHmQ1OpDKUQ==", "integrity": "sha512-5YubdnPXrlrYAFCKybPuHIAH++PINe1pmKNc5wQRB9HSbqIK1ywAnntE3Wwua4giKu0bjligf1gLF6qxMGOYRA==",
"dev": true, "dev": true,
"requires": { "requires": {
"@babel/code-frame": "^7.0.0", "@babel/code-frame": "^7.0.0",
@ -5425,9 +5425,9 @@
"dev": true "dev": true
}, },
"ini": { "ini": {
"version": "1.3.5", "version": "1.3.8",
"resolved": "https://registry.npmjs.org/ini/-/ini-1.3.5.tgz", "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
"integrity": "sha512-RZY5huIKCMRWDUqZlEi72f/lmXKMvuszcMBduliQ3nnWbx9X/ZBQO7DijMEYS9EhHBb2qacRUMtC7svLwe0lcw==", "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==",
"dev": true "dev": true
}, },
"interpret": { "interpret": {
@ -11946,9 +11946,9 @@
} }
}, },
"typescript": { "typescript": {
"version": "4.0.5", "version": "4.1.2",
"resolved": "https://registry.npmjs.org/typescript/-/typescript-4.0.5.tgz", "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.1.2.tgz",
"integrity": "sha512-ywmr/VrTVCmNTJ6iV2LwIrfG1P+lv6luD8sUJs+2eI9NLGigaN+nUQc13iHqisq7bra9lnmUSYqbJvegraBOPQ==", "integrity": "sha512-thGloWsGH3SOxv1SoY7QojKi0tc+8FnOmiarEGMbd/lar7QOEd3hvlx3Fp5y6FlDUGl9L+pd4n2e+oToGMmhRQ==",
"dev": true "dev": true
}, },
"uglify-js": { "uglify-js": {

View File

@ -1,6 +1,6 @@
{ {
"name": "human2regex", "name": "human2regex",
"version": "1.0.2", "version": "1.1.0",
"description": "Humanized Regular Expressions", "description": "Humanized Regular Expressions",
"main": "./lib/index.js", "main": "./lib/index.js",
"typings": "./lib/index.d.ts", "typings": "./lib/index.d.ts",
@ -9,13 +9,13 @@
"@types/html-minifier": "^3.5.3", "@types/html-minifier": "^3.5.3",
"@types/jest": "^26.0.15", "@types/jest": "^26.0.15",
"@types/mustache": "^4.0.1", "@types/mustache": "^4.0.1",
"@typescript-eslint/eslint-plugin": "^4.7.0", "@typescript-eslint/eslint-plugin": "^4.8.1",
"@typescript-eslint/parser": "^4.7.0", "@typescript-eslint/parser": "^4.8.1",
"before-build-webpack": "^0.2.9", "before-build-webpack": "^0.2.9",
"codecov": "^3.8.1", "codecov": "^3.8.1",
"copy-webpack-plugin": "^6.3.0", "copy-webpack-plugin": "^6.3.2",
"css-loader": "^4.3.0", "css-loader": "^4.3.0",
"eslint": "^7.13.0", "eslint": "^7.14.0",
"glob": "^7.1.6", "glob": "^7.1.6",
"html-minifier": "^4.0.0", "html-minifier": "^4.0.0",
"jest": "^26.6.3", "jest": "^26.6.3",
@ -26,7 +26,7 @@
"ts-jest": "^26.4.4", "ts-jest": "^26.4.4",
"ts-loader": "^8.0.11", "ts-loader": "^8.0.11",
"ts-node": "^9.0.0", "ts-node": "^9.0.0",
"typescript": "^4.0.5", "typescript": "^4.1.2",
"webpack": "^4.44.2", "webpack": "^4.44.2",
"webpack-cli": "^3.3.12" "webpack-cli": "^3.3.12"
}, },
@ -46,7 +46,7 @@
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"chevrotain": "^7.0.3", "chevrotain": "^7.0.3",
"codemirror": "^5.58.2" "codemirror": "^5.58.3"
}, },
"repository": { "repository": {
"type": "git", "type": "git",

View File

@ -336,7 +336,13 @@ match "World"
<h3 id="tut-final">Putting it all together</h3> <h3 id="tut-final">Putting it all together</h3>
<p>Grouping, repetition, and matching are the 3 primary elements that make up H2R. They can be combined in any way to generate a regular expression. See the <a href="index.html">main page</a> for an example that combines all above to parse a URL.</p> <p>Grouping, repetition, and matching are the 3 primary elements that make up H2R. They can be combined in any way to generate a regular expression. See the <a href="index.html">main page</a> for an example that combines all above to parse a URL.</p>
<h3>Miscellaneous features</h3> <h3>Advanced features</h3>
<p class="font-weight-bold" id="tut-back">Backreferences</p>
<p>TODO</p>
<p class="font-weight-bold" id="tut-if">If statements</p>
<p>TODO</p>
<p class="font-weight-bold" id="tut-unicode">Unicode character properties</p> <p class="font-weight-bold" id="tut-unicode">Unicode character properties</p>
<p>You can match specific unicode sequences using <code class="cm-s-idea">"\uXXXX" <p>You can match specific unicode sequences using <code class="cm-s-idea">"\uXXXX"

View File

@ -7,6 +7,7 @@
import { regexEscape, removeQuotes, hasFlag, combineFlags, isSingleRegexCharacter, first, last, unusedParameter, makeFlag, append } from "./utilities"; import { regexEscape, removeQuotes, hasFlag, combineFlags, isSingleRegexCharacter, first, last, unusedParameter, makeFlag, append } from "./utilities";
import { IToken } from "chevrotain"; import { IToken } from "chevrotain";
import { minimizeMatchString, groupIfRequired, dontClobberRepetition } from "./generator_helper";
/** /**
* List of regular expression dialects we support * List of regular expression dialects we support
@ -63,31 +64,54 @@ const unicode_script_codes = [
]; ];
/** /**
* The base concrete syntax tree class * Context for validation
* *
* @remarks Currently only used to validate groups
* @internal * @internal
*/ */
export abstract class H2RCST { export class GeneratorContext {
public groups: { [ key: string ]: { startLine: number, startColumn: number, length: number } } = {};
/** /**
* Constructor for H2RCST * Checks to see if we already have a group defined
* *
* @param tokens Tokens used to calculate where an error occured * @param identifier the group name
* @internal * @returns true if the group name already exists
*/ */
constructor(public tokens: IToken[]) { public hasGroup(identifier: string): boolean {
/* empty */ return Object.prototype.hasOwnProperty.call(this.groups, identifier);
} }
/**
 * Registers a group name together with the source span of its declaration
 *
 * @param identifier the group name
 * @param tokens the tokens of the declaring statement; the first and last one delimit the recorded span
 */
public addGroup(identifier: string, tokens: IToken[]): void {
    const head = first(tokens);
    const tail = last(tokens);

    // when the last token carries no end offset, fall back to where it starts
    const end_offset = tail.endOffset ?? tail.startOffset;

    this.groups[identifier] = {
        startLine: head.startLine ?? NaN,
        startColumn: head.startColumn ?? NaN,
        length: end_offset - head.startOffset,
    };
}
}
interface Generates {
/** /**
* Validate that this is both valid and can be generated in the specified language * Validate that this is both valid and can be generated in the specified language
* *
* @remarks There is no guarantee toRegex will work unless validate returns no errors * @remarks There is no guarantee toRegex will work unless validate returns no errors
* *
* @param language the regex dialect we're validating * @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors * @returns A list of errors
* @public * @public
*/ */
public abstract validate(language: RegexDialect): ISemanticError[]; validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/** /**
* Generate a regular expression fragment based on this syntax tree * Generate a regular expression fragment based on this syntax tree
@ -98,6 +122,26 @@ export abstract class H2RCST {
* @returns a regular expression fragment * @returns a regular expression fragment
* @public * @public
*/ */
toRegex(language: RegexDialect): string;
}
/**
* The base concrete syntax tree class
*
* @internal
*/
export abstract class H2RCST implements Generates {
/**
* Constructor for H2RCST
*
* @param tokens Tokens used to calculate where an error occured
* @internal
*/
constructor(public tokens: IToken[]) {
/* empty */
}
public abstract validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
public abstract toRegex(language: RegexDialect): string; public abstract toRegex(language: RegexDialect): string;
/** /**
@ -186,7 +230,7 @@ export class MatchSubStatementValue {
* *
* @internal * @internal
*/ */
export class MatchStatementValue { export class MatchStatementValue implements Generates {
/** /**
* Constructor for MatchStatementValue * Constructor for MatchStatementValue
@ -198,6 +242,21 @@ export class MatchStatementValue {
constructor(public optional: boolean, public statement: MatchSubStatementCST) { constructor(public optional: boolean, public statement: MatchSubStatementCST) {
/* empty */ /* empty */
} }
/**
 * Validates the wrapped match sub statement
 *
 * @param language the regex dialect we're validating
 * @param context the generator context
 * @returns A list of errors reported by the wrapped statement
 */
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
    const errors = this.statement.validate(language, context);

    return errors;
}
/**
 * Generates the regular expression fragment for this match value
 *
 * @param language the regex dialect we're generating
 * @returns the fragment of the wrapped statement, followed by "?" when this value is optional
 */
public toRegex(language: RegexDialect): string {
    const fragment = this.statement.toRegex(language);

    if (!this.optional) {
        return fragment;
    }

    // the "?" must apply to the whole fragment, so group it first if it is not already a unit
    return groupIfRequired(fragment) + "?";
}
} }
/** /**
@ -227,11 +286,11 @@ export class MatchSubStatementCST extends H2RCST {
super(tokens); super(tokens);
} }
public validate(language: RegexDialect): ISemanticError[] { public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = []; const errors: ISemanticError[] = [];
if (this.count) { if (this.count) {
append(errors, this.count.validate(language)); append(errors, this.count.validate(language, context));
} }
for (const value of this.values) { for (const value of this.values) {
@ -353,56 +412,16 @@ export class MatchSubStatementCST extends H2RCST {
} }
} }
let ret = ""; let ret = minimizeMatchString(matches);
let require_grouping = false;
let dont_clobber_plus = false;
if (matches.length === 1) {
ret = first(matches);
if (ret.endsWith("+")) {
dont_clobber_plus = true;
}
}
else {
ret = minimizeMatchString(matches);
if (ret.length > 1 &&
(!ret.startsWith("(") || !ret.endsWith("["))) {
require_grouping = true;
}
}
if (this.count) { if (this.count) {
if (dont_clobber_plus) { if (matches.length === 1) {
const clobber = this.count.toRegex(language); // we don't group if there's only 1 element
// but we need to make sure we don't add an additional + or *
// + can be ignored as well as a count as long as that count is > 0 ret = dontClobberRepetition(ret, this.count.toRegex(language));
switch (clobber) {
case "*":
case "?":
ret = "(?:" + ret + ")" + clobber;
break;
case "+":
// ignore
break;
default:
if (clobber.startsWith("{0")) {
ret = "(?:" + ret + ")" + clobber;
}
else {
// remove + and replace with count
ret.substring(0, ret.length - 1) + clobber;
}
break;
}
} }
else { else {
if (require_grouping) { ret = groupIfRequired(ret) + this.count.toRegex(language);
ret = "(?:" + ret + ")";
}
ret += this.count.toRegex(language);
} }
} }
@ -427,8 +446,9 @@ export class UsingStatementCST extends H2RCST {
super(tokens); super(tokens);
} }
public validate(language: RegexDialect): ISemanticError[] { public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
unusedParameter(language, "Using Statement does not change based on language"); unusedParameter(language, "Count does not need checking");
unusedParameter(context, "Context is not needed");
const errors: ISemanticError[] = []; const errors: ISemanticError[] = [];
let flag = this.flags[0]; let flag = this.flags[0];
@ -490,15 +510,13 @@ export class CountSubStatementCST extends H2RCST {
super(tokens); super(tokens);
} }
public validate(language: RegexDialect): ISemanticError[] { public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
unusedParameter(language, "Count does not need checking"); unusedParameter(language, "Count does not need checking");
unusedParameter(context, "Context is not needed");
const errors: ISemanticError[] = []; const errors: ISemanticError[] = [];
if (this.from < 0) { if (this.to !== null && ((this.opt === "exclusive" && (this.to-1) <= this.from) || this.to <= this.from)) {
errors.push(this.error("Value cannot be negative"));
}
else if (this.to !== null && ((this.opt === "exclusive" && (this.to-1) <= this.from) || this.to <= this.from)) {
errors.push(this.error("Values must be in range of eachother")); errors.push(this.error("Values must be in range of eachother"));
} }
@ -548,49 +566,27 @@ export class MatchStatementCST extends StatementCST {
* Constructor for MatchStatementCST * Constructor for MatchStatementCST
* *
* @param tokens Tokens used to calculate where an error occured * @param tokens Tokens used to calculate where an error occured
* @param matches * @param matches the list of matches
*/ */
constructor(tokens: IToken[], private completely_optional: boolean, private matches: MatchStatementValue[]) { constructor(tokens: IToken[], private completely_optional: boolean, private matches: MatchStatementValue[]) {
super(tokens); super(tokens);
} }
public validate(language: RegexDialect): ISemanticError[] { public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = []; const errors: ISemanticError[] = [];
for (const match of this.matches) { for (const match of this.matches) {
append(errors, match.statement.validate(language)); append(errors, match.statement.validate(language, context));
} }
return errors; return errors;
} }
public toRegex(language: RegexDialect): string { public toRegex(language: RegexDialect): string {
let final_matches = this.matches.map((x) => { let final_matches = this.matches.map((x) => x.toRegex(language)).join("");
let match_stmt = x.statement.toRegex(language);
// need to group if optional and ungrouped
if (x.optional) {
if (!isSingleRegexCharacter(match_stmt)) {
// don't re-group a group
if (match_stmt[0] !== "(" && match_stmt[match_stmt.length-1] !== ")") {
match_stmt = "(?:" + match_stmt + ")";
}
}
match_stmt += "?";
}
return match_stmt;
}).join("");
if (this.completely_optional) { if (this.completely_optional) {
if (!isSingleRegexCharacter(final_matches)) { final_matches = groupIfRequired(final_matches) + "?";
// don't re-group a group
if (final_matches[0] !== "(" && final_matches[final_matches.length-1] !== ")") {
final_matches = "(?:" + final_matches + ")";
}
}
final_matches += "?";
} }
return final_matches; return final_matches;
@ -616,22 +612,22 @@ export class RepeatStatementCST extends StatementCST {
super(tokens); super(tokens);
} }
public validate(language: RegexDialect): ISemanticError[] { public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = []; const errors: ISemanticError[] = [];
if (this.count !== null) { if (this.count !== null) {
append(errors, this.count.validate(language)); append(errors, this.count.validate(language, context));
} }
for (const statement of this.statements) { for (const statement of this.statements) {
append(errors, statement.validate(language)); append(errors, statement.validate(language, context));
} }
return errors; return errors;
} }
public toRegex(language: RegexDialect): string { public toRegex(language: RegexDialect): string {
let str = "(?:" + this.statements.map((x) => x.toRegex(language)).join("") + ")"; let str = groupIfRequired(this.statements.map((x) => x.toRegex(language)).join(""));
if (this.count) { if (this.count) {
str += this.count.toRegex(language); str += this.count.toRegex(language);
@ -659,7 +655,7 @@ export class RepeatStatementCST extends StatementCST {
* @internal * @internal
*/ */
export class GroupStatementCST extends StatementCST { export class GroupStatementCST extends StatementCST {
/** /**
* Constructor for GroupStatementCST * Constructor for GroupStatementCST
* *
@ -673,16 +669,21 @@ export class GroupStatementCST extends StatementCST {
super(tokens); super(tokens);
} }
/**
 * Validates this group and every statement nested inside it
 *
 * @remarks A named group is registered in the context the first time it is seen.
 * Declaring the same name again is reported together with the location of the first declaration
 *
 * @param language the regex dialect we're validating
 * @param context the generator context that tracks the group names declared so far
 * @returns A list of errors
 */
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
    const errors: ISemanticError[] = [];

    if (this.name !== null) {
        if (context.hasGroup(this.name)) {
            const past_group = context.groups[this.name];
            const line = past_group.startLine;
            const column = past_group.startColumn;

            // report the earlier declaration as line:column-line:column (columns, not the line number repeated)
            errors.push(this.error(`Group with name "${this.name}" was already defined here: ${line}:${column}-${line}:${column+past_group.length}`));
        }
        else {
            context.addGroup(this.name, this.tokens);
        }
    }

    for (const statement of this.statements) {
        append(errors, statement.validate(language, context));
    }

    return errors;
}
} }
} }
/**
 * Concrete Syntax Tree for a Backreference statement
 *
 * @internal
 */
export class BackrefStatementCST extends StatementCST {
    /**
     * Constructor for BackrefStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param optional is this backref optional
     * @param count optional number of times to repeat
     * @param name the group name to call
     */
    constructor(tokens: IToken[], private optional: boolean, private count: CountSubStatementCST | null, private name: string) {
        super(tokens);
    }

    /**
     * Checks that the referenced group was declared earlier and that the count (if any) is valid
     *
     * @param language the regex dialect we're validating
     * @param context the generator context holding the groups declared so far
     * @returns A list of errors
     */
    public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
        const errors: ISemanticError[] = [];

        if (!context.hasGroup(this.name)) {
            errors.push(this.error(`Cannot call group with name "${this.name}" as it was never previously defined`));
        }

        if (this.count !== null) {
            append(errors, this.count.validate(language, context));
        }

        return errors;
    }

    /**
     * Generates the named backreference, followed by its count and optionality
     *
     * @param language the regex dialect we're generating
     * @returns a regular expression fragment
     */
    public toRegex(language: RegexDialect): string {
        let str = "";

        switch (language) {
            case RegexDialect.Python:
                str = `(?P=${this.name})`;
                break;

            case RegexDialect.DotNet:
            case RegexDialect.Java:
            case RegexDialect.JS:
                // JavaScript spells a named backreference \k<name>, same as .NET and Java
                str = `\\k<${this.name}>`;
                break;

            default:
                // NOTE(review): in PCRE \g<name> is documented as subroutine-call syntax
                // rather than a backreference - confirm this is the intended output
                str = `\\g<${this.name}>`;
                break;
        }

        if (this.count) {
            str += this.count.toRegex(language);

            // group for optionality because count would be incorrect otherwise
            if (this.optional) {
                str = "(?:" + str + ")?";
            }
        }
        else if (this.optional) {
            // append the "?" to the backreference instead of replacing the backreference with it
            str += "?";
        }

        return str;
    }
}
/**
 * Concrete Syntax Tree for an If Pattern statement
 *
 * @internal
 */
export class IfPatternStatementCST extends StatementCST {
    /**
     * Constructor for IfPatternStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param matches list of matches to test against
     * @param true_statements true path
     * @param false_statements false path
     */
    constructor(tokens: IToken[], private matches: MatchStatementValue[], private true_statements: StatementCST[], private false_statements: StatementCST[]) {
        super(tokens);
    }

    /**
     * Rejects dialects without (pattern) conditionals, then validates the condition and both paths
     *
     * @param language the regex dialect we're validating
     * @param context the generator context, shared by the condition and both paths
     * @returns A list of errors
     */
    public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
        const errors: ISemanticError[] = [];

        if (language === RegexDialect.Java || language === RegexDialect.JS) {
            errors.push(this.error("This language does not support conditionals"));
        }

        if (language === RegexDialect.Python) {
            errors.push(this.error("This language does not support pattern conditionals"));
        }

        for (const match of this.matches) {
            append(errors, match.validate(language, context));
        }

        for (const statement of this.true_statements) {
            append(errors, statement.validate(language, context));
        }

        for (const statement of this.false_statements) {
            append(errors, statement.validate(language, context));
        }

        return errors;
    }

    /**
     * Generates a conditional whose condition is a lookahead on the match pattern
     *
     * @remarks The condition is written as an explicit lookahead, (?(?=pattern)yes|no).
     * A bare (?(pattern)...) is read as a group name/number test by PCRE-style engines,
     * while the explicit assertion form is unambiguous and also accepted by .NET
     *
     * @param language the regex dialect we're generating
     * @returns a regular expression fragment
     */
    public toRegex(language: RegexDialect): string {
        const if_stmt = this.matches.map((x) => x.toRegex(language)).join("");
        const true_stmt = groupIfRequired(this.true_statements.map((x) => x.toRegex(language)).join(""));

        if (this.false_statements.length > 0) {
            const false_stmt = groupIfRequired(this.false_statements.map((x) => x.toRegex(language)).join(""));

            return `(?(?=${if_stmt})${true_stmt}|${false_stmt})`;
        }
        else {
            return `(?(?=${if_stmt})${true_stmt})`;
        }
    }
}
/**
 * Concrete Syntax Tree for an If statement that tests whether a named group took part in the match
 *
 * @internal
 */
export class IfIdentStatementCST extends StatementCST {
    /**
     * Constructor for IfIdentStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param identifier the group identifier to check
     * @param true_statements true path
     * @param false_statements false path
     */
    constructor(tokens: IToken[], private identifier: string, private true_statements: StatementCST[], private false_statements: StatementCST[]) {
        super(tokens);
    }

    /**
     * Rejects dialects without conditionals, checks that the group exists and validates both paths
     *
     * @param language the regex dialect we're validating
     * @param context the generator context holding the groups declared so far
     * @returns A list of errors
     */
    public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
        const errors: ISemanticError[] = [];

        const no_conditionals = language === RegexDialect.Java || language === RegexDialect.JS;
        if (no_conditionals) {
            errors.push(this.error("This language does not support conditionals"));
        }

        const group_known = context.hasGroup(this.identifier);
        if (!group_known) {
            errors.push(this.error(`Group with name "${this.identifier}" does not exist`));
        }

        // the true path is validated before the false path, both against the same context
        for (const branch of [ this.true_statements, this.false_statements ]) {
            for (const statement of branch) {
                append(errors, statement.validate(language, context));
            }
        }

        return errors;
    }

    /**
     * Generates the conditional: (?(name)yes) or (?(name)yes|no)
     *
     * @param language the regex dialect we're generating
     * @returns a regular expression fragment
     */
    public toRegex(language: RegexDialect): string {
        // be more clear with languages that support it
        const condition = language === RegexDialect.Boost ? "<" + this.identifier + ">" : this.identifier;

        const render = (branch: StatementCST[]): string => groupIfRequired(branch.map((x) => x.toRegex(language)).join(""));

        const when_true = render(this.true_statements);

        if (this.false_statements.length === 0) {
            return `(?(${condition})${when_true})`;
        }

        const when_false = render(this.false_statements);

        return `(?(${condition})${when_true}|${when_false})`;
    }
}
/** /**
* Concrete Syntax Tree for a regular expression * Concrete Syntax Tree for a regular expression
* *
@ -730,115 +920,20 @@ export class RegularExpressionCST extends H2RCST {
super(tokens); super(tokens);
} }
public validate(language: RegexDialect): ISemanticError[] { public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = this.usings.validate(language); const errors: ISemanticError[] = this.usings.validate(language, context);
for (const statement of this.statements) { for (const statement of this.statements) {
append(errors, statement.validate(language)); append(errors, statement.validate(language, context));
} }
return errors; return errors;
} }
public toRegex(language: RegexDialect): string { public toRegex(language: RegexDialect): string {
const modifiers = this.usings.toRegex(language); const modifiers = this.usings.toRegex(language);
const regex = this.statements.map((x) => x.toRegex(language)).join(""); const regex = this.statements.map((x) => x.toRegex(language)).join("");
return modifiers.replace("{regex}", regex); return modifiers.replace("{regex}", regex);
} }
} }
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @internal
*/
export function minimizeMatchString(arr: string[]): string {
return minMatchString(arr, 0);
}
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array
* @param depth must be 0 for initial call
* @internal
*/
function minMatchString(arr: string[], depth: number = 0): string {
// base case: arr is empty
if (arr.length === 0) {
return "";
}
// base case: arr has 1 element (must have at least 2, so this means this value is optional)
if (arr.length === 1) {
return first(arr) + "?";
}
// remove duplicates
arr = [ ...new Set(arr) ];
// base case: arr has 1 element (after duplicate removal means this is required)
if (arr.length === 1) {
return first(arr);
}
// base case: arr is all single letters
if (arr.every(isSingleRegexCharacter)) {
return "[" + arr.join("") + "]";
}
// now the real magic begins
// You are not expected to understand this
let longest_begin_substring = first(arr);
let longest_end_substring = first(arr);
for (let i = 1; i < arr.length; i++) {
// reduce longest_substring to match everything
for (let j = 0; j < longest_begin_substring.length; j++) {
if (arr[i].length < j || longest_begin_substring[j] !== arr[i][j]) {
longest_begin_substring = longest_begin_substring.substr(0, j);
break;
}
}
for (let j = 0; j < longest_end_substring.length; j++) {
if (arr[i].length-j < 0 || longest_end_substring[longest_end_substring.length-j-1] !== arr[i][arr[i].length-j-1]) {
longest_end_substring = longest_end_substring.substr(longest_end_substring.length-j, longest_end_substring.length);
break;
}
}
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
break;
}
}
// No matches whatsoever
// *technically* we can optimize further, but that is a VERY non-trivial problem
// For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
if (depth > 0) {
return "(?:" + arr.join("|") + ")";
}
else {
return arr.join("|");
}
}
// we have some matches
else {
// remove begin (if exists) and end (if exists) from each element and remove empty strings
const begin_pos = longest_begin_substring.length;
const end_pos = longest_end_substring.length;
const similar_matches: string[] = [];
for (const ele of arr) {
const match = ele.substring(begin_pos, ele.length-end_pos);
if (match.length !== 0) {
similar_matches.push(match);
}
}
return longest_begin_substring + minMatchString(similar_matches, depth + 1) + longest_end_substring;
}
}

224
src/generator_helper.ts Normal file
View File

@ -0,0 +1,224 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/**
* Includes helper functions for the Generator
* @packageDocumentation
*/
import { first, isSingleRegexCharacter } from "./utilities";
/**
 * Collapses a list of alternative match fragments into one compact fragment
 *
 * @param arr the array of matches
 * @returns the only entry unchanged when there is exactly one, otherwise the minimized combination
 * @internal
 */
export function minimizeMatchString(arr: string[]): string {
    // a lone entry has to bypass the minimizer: it reads a one-element array as
    // "optional" and would tack a "?" onto it
    return arr.length === 1 ? first(arr) : minMatchString(arr, 0);
}
/**
 * Minimizes the match string by finding duplicates or substrings in the array
 *
 * @remarks Factors out the prefix and suffix shared by every entry and recurses on what is left
 * in the middle. The comparison is done character by character.
 * NOTE(review): because of that, a shared prefix can end in the middle of an escape sequence
 * (e.g. "a\\d" and "a\\w" share "a\\") - confirm callers never pass such input
 *
 * @param arr the array
 * @param depth must be 0 for initial call
 * @returns an optimized string
 * @internal
 */
function minMatchString(arr: string[], depth: number = 0): string {
    // base case: arr is empty
    if (arr.length === 0) {
        return "";
    }

    // base case: arr has 1 element (must have at least 2, so this means this value is optional)
    // the remainder can span several atoms, so it is grouped before "?" is applied to it
    if (arr.length === 1) {
        return groupIfRequired(first(arr)) + "?";
    }

    // remove duplicates
    arr = [ ...new Set(arr) ];

    // base case: arr has 1 element (after duplicate removal means this is required)
    if (arr.length === 1) {
        return first(arr);
    }

    // base case: arr is all single letters
    if (arr.every(isSingleRegexCharacter)) {
        return "[" + arr.join("") + "]";
    }

    // find the longest prefix and the longest suffix shared by every entry
    let longest_begin_substring = first(arr);
    let longest_end_substring = first(arr);
    let shortest_length = first(arr).length;

    for (let i = 1; i < arr.length; i++) {
        shortest_length = Math.min(shortest_length, arr[i].length);

        // cut the prefix at the first position where this entry differs (or ends)
        for (let j = 0; j < longest_begin_substring.length; j++) {
            if (arr[i].length < j || longest_begin_substring[j] !== arr[i][j]) {
                longest_begin_substring = longest_begin_substring.substring(0, j);
                break;
            }
        }

        // same for the suffix, comparing from the back
        for (let j = 0; j < longest_end_substring.length; j++) {
            if (arr[i].length-j < 0 || longest_end_substring[longest_end_substring.length-j-1] !== arr[i][arr[i].length-j-1]) {
                longest_end_substring = longest_end_substring.substring(longest_end_substring.length-j);
                break;
            }
        }

        if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
            break;
        }
    }

    // prefix and suffix can overlap inside the shortest entries (e.g. "aa" vs "aaa");
    // shrink the suffix so that prefix + suffix never covers more than the shortest entry
    const overlap = longest_begin_substring.length + longest_end_substring.length - shortest_length;
    if (overlap > 0) {
        longest_end_substring = longest_end_substring.substring(overlap);
    }

    // No matches whatsoever
    // *technically* we can optimize further, but that is a VERY non-trivial problem
    // For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
    if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
        if (depth > 0) {
            return "(?:" + arr.join("|") + ")";
        }
        else {
            return arr.join("|");
        }
    }
    // we have some matches
    else {
        // remove begin (if exists) and end (if exists) from each element and remove empty strings
        const begin_pos = longest_begin_substring.length;
        const end_pos = longest_end_substring.length;

        const similar_matches: string[] = [];
        for (const ele of arr) {
            const match = ele.substring(begin_pos, ele.length-end_pos);

            if (match.length !== 0) {
                similar_matches.push(match);
            }
        }

        return longest_begin_substring + minMatchString(similar_matches, depth + 1) + longest_end_substring;
    }
}
/**
 * Groups a regex fragment if it needs to be grouped
 *
 * @remarks A fragment is left alone when it is a single regex character, or when it is exactly
 * one parenthesized group or one character class from its first to its last character
 *
 * @param fragment fragment of regular expression to potentially group
 * @returns the fragment itself if it already is a single unit, otherwise the fragment wrapped in a non-capturing group
 * @internal
 */
export function groupIfRequired(fragment: string): string {
    if (isSingleRegexCharacter(fragment)) {
        return fragment;
    }

    const last_char = fragment[fragment.length-1];
    let is_single_unit = false;

    if (fragment[0] === "(" && last_char === ")") {
        is_single_unit = interiorIsBalanced(fragment, "(", ")");
    }
    else if (fragment[0] === "[" && last_char === "]") {
        // you'll never have a raw [ inside a [], so only closing brackets are tracked
        is_single_unit = interiorIsBalanced(fragment, null, "]");
    }

    return is_single_unit ? fragment : "(?:" + fragment + ")";
}

/**
 * Checks that the outer bracket pair of a fragment encloses the whole fragment
 *
 * @remarks Scans every character strictly between the first and the last one, skipping the
 * character after a backslash. The outer pair encloses everything only if no closer
 * appears before a matching opener and every opener seen is closed again
 *
 * @param fragment fragment that starts with an opener and ends with a closer
 * @param open the opening character to count, or null if nested openers cannot occur
 * @param close the closing character
 * @returns true if the interior is balanced
 */
function interiorIsBalanced(fragment: string, open: string | null, close: string): boolean {
    let depth = 0;

    // the interior runs from index 1 up to and including index length-2
    for (let i = 1; i < fragment.length-1; i++) {
        const c = fragment[i];

        if (c === "\\") {
            // skip the escaped character
            i++;
        }
        else if (c === open) {
            depth++;
        }
        else if (c === close) {
            depth--;

            // the outer opener was closed before the end: e.g. "(a)(b)"
            if (depth < 0) {
                return false;
            }
        }
    }

    return depth === 0;
}
/**
 * Applies a repetition to a fragment without stacking it on a trailing + or * quantifier
 *
 * @remarks Only an unescaped trailing + or * counts as a quantifier. An escaped one (a literal
 * plus or asterisk, e.g. "\\+") is handled like any other fragment: the repetition is appended
 *
 * @param fragment fragment of regular expression
 * @param repetition repetition that may clobber the fragment
 * @returns the fragment with the repetition merged in
 */
export function dontClobberRepetition(fragment: string, repetition: string): string {
    // the last character is escaped when it is preceded by an odd number of backslashes
    let backslashes = 0;
    for (let i = fragment.length - 2; i >= 0 && fragment[i] === "\\"; i--) {
        backslashes++;
    }
    const ends_in_quantifier = backslashes % 2 === 0;

    // + can be ignored as well as a count as long as that count is > 0
    if (ends_in_quantifier && fragment.endsWith("+")) {
        switch (repetition) {
            case "*":
                // ignore: + is greater than *
                break;
            case "?":
                // non-greedy qualifier
                fragment += repetition;
                break;
            case "+":
                // ignore: already +
                break;
            default:
                if (repetition.startsWith("{0")) {
                    // a count that allows zero cannot replace the +, so wrap instead
                    fragment = "(?:" + fragment + ")" + repetition;
                }
                else {
                    // remove + and replace with count
                    fragment = fragment.substring(0, fragment.length - 1) + repetition;
                }
                break;
        }
    }
    else if (ends_in_quantifier && fragment.endsWith("*")) {
        switch (repetition) {
            case "*":
                // ignore: already *
                break;
            case "?":
                // non-greedy qualifier
                fragment += repetition;
                break;
            default:
                // remove * and replace with count
                fragment = fragment.substring(0, fragment.length - 1) + repetition;
                break;
        }
    }
    else {
        fragment += repetition;
    }

    return fragment;
}

View File

@ -7,7 +7,7 @@
import { EmbeddedActionsParser, IOrAlt, IToken } from "chevrotain"; import { EmbeddedActionsParser, IOrAlt, IToken } from "chevrotain";
import * as T from "./tokens"; import * as T from "./tokens";
import { CountSubStatementCST, UsingFlags, MatchSubStatementType, MatchSubStatementValue, MatchSubStatementCST, UsingStatementCST, RegularExpressionCST, StatementCST, RepeatStatementCST, MatchStatementValue, MatchStatementCST, GroupStatementCST, RegexDialect } from "./generator"; import { CountSubStatementCST, UsingFlags, MatchSubStatementType, MatchSubStatementValue, MatchSubStatementCST, UsingStatementCST, RegularExpressionCST, StatementCST, RepeatStatementCST, MatchStatementValue, MatchStatementCST, GroupStatementCST, RegexDialect, BackrefStatementCST, GeneratorContext, IfPatternStatementCST, IfIdentStatementCST } from "./generator";
import { first, usefulConditional, unusedParameter, CommonError } from "./utilities"; import { first, usefulConditional, unusedParameter, CommonError } from "./utilities";
/** /**
@ -60,7 +60,7 @@ export class ParseResult {
* @public * @public
*/ */
public validate(language: RegexDialect): CommonError[] { public validate(language: RegexDialect): CommonError[] {
return this.regexp_cst.validate(language).map(CommonError.fromSemanticError); return this.regexp_cst.validate(language, new GeneratorContext()).map(CommonError.fromSemanticError);
} }
/** /**
@ -558,12 +558,107 @@ export class Human2RegexParser extends EmbeddedActionsParser {
return new RepeatStatementCST(tokens, optional, count, statements); return new RepeatStatementCST(tokens, optional, count, statements);
}); });
// Grammar rule for a backreference statement. Accepted token sequence:
//   [Optional] Call [CountSubStatement] [ [The] Group [Called] ] Identifier EndOfLine
// Produces a BackrefStatementCST.
// NOTE(review): the numeric suffixes (OPTION5, CONSUME4, ...) presumably follow chevrotain's
// per-rule occurrence-index convention - keep them unique when editing
const BackrefStatement = $.RULE("BackrefStatement", () => {
// tokens handed to the CST node (Optional, Call and EndOfLine are collected here)
const tokens: IToken[] = [];
let optional = false;
let count: CountSubStatementCST | null = null;
// leading "optional" keyword marks the whole backreference as optional
$.OPTION5(() => {
tokens.push($.CONSUME(T.Optional));
optional = true;
});
tokens.push($.CONSUME(T.Call));
// optional repetition count
$.OPTION6(() => count = $.SUBRULE(CountSubStatement));
// optional filler words in front of the group name: [The] Group [Called]
$.OPTION7(() => {
$.OPTION(() => $.CONSUME(T.The));
$.CONSUME(T.Group);
$.OPTION2(() => $.CONSUME(T.Called));
});
// the text of the identifier token is the referenced group name
const name = $.CONSUME(T.Identifier).image;
tokens.push($.CONSUME4(T.EndOfLine));
return new BackrefStatementCST(tokens, optional, count, name);
});
// Grammar rule for an if statement. Two forms share this rule:
//   If Identifier EndOfLine ...                      -> IfIdentStatementCST
//   If Match [Optional] MatchSubStatement
//      { ([And] Then | And) [Optional] MatchSubStatement } EndOfLine ...  -> IfPatternStatementCST
// Both are followed by an indented block of one or more statements and an optional
// Else EndOfLine plus a second indented block.
// NOTE(review): the numeric suffixes (OR2, CONSUME3, ...) presumably follow chevrotain's
// per-rule occurrence-index convention - keep them unique when editing
const IfStatement = $.RULE("IfStatement", () => {
// tokens handed to the CST node (If and the first EndOfLine are collected here)
const tokens: IToken[] = [];
// match values of the pattern form; stays empty for the identifier form
const msv: MatchStatementValue[] = [];
let optional = false;
const true_statements: StatementCST[] = [];
const false_statements: StatementCST[] = [];
// group name of the identifier form; stays "" for the pattern form
let name: string = "";
tokens.push($.CONSUME(T.If));
$.OR2([
// identifier form: remember the group name
{ALT: () => {
name = $.CONSUME(T.Identifier).image;
}},
// pattern form: one match value, then any number of further ones
{ALT: () => {
$.CONSUME(T.Match);
$.OPTION4(() => {
$.CONSUME3(T.Optional);
optional = true;
});
msv.push(new MatchStatementValue(optional, $.SUBRULE(MatchSubStatement)));
$.MANY(() => {
// separator between match values: "then", "and then" or "and"
$.OR([
{ ALT: () => {
$.OPTION2(() => $.CONSUME2(T.And));
$.CONSUME(T.Then);
}},
{ ALT: () => $.CONSUME(T.And) },
]);
// optionality is tracked per match value, so reset it before each one
optional = false;
$.OPTION3(() => {
$.CONSUME2(T.Optional);
optional = true;
});
msv.push(new MatchStatementValue(optional, $.SUBRULE2(MatchSubStatement)));
});
}}
]);
tokens.push($.CONSUME3(T.EndOfLine));
// true path: an indented block with at least one statement
$.CONSUME2(T.Indent);
$.AT_LEAST_ONE2(() => {
true_statements.push($.SUBRULE(Statement));
});
$.CONSUME2(T.Outdent);
// optional false path: Else on its own line followed by an indented block
$.OPTION(() => {
$.CONSUME(T.Else);
$.CONSUME4(T.EndOfLine);
$.CONSUME3(T.Indent);
$.AT_LEAST_ONE3(() => {
false_statements.push($.SUBRULE2(Statement));
});
$.CONSUME3(T.Outdent);
});
// an empty name means the pattern alternative was taken
if (name === "") {
return new IfPatternStatementCST(tokens, msv, true_statements, false_statements);
}
else {
return new IfIdentStatementCST(tokens, name, true_statements, false_statements);
}
});
// statement super class // statement super class
const Statement = $.RULE("Statement", () => { const Statement = $.RULE("Statement", () => {
return $.OR([ return $.OR([
{ ALT: () => $.SUBRULE(MatchStatement) }, { ALT: () => $.SUBRULE(MatchStatement) },
{ ALT: () => $.SUBRULE(GroupStatement) }, { ALT: () => $.SUBRULE(GroupStatement) },
{ ALT: () => $.SUBRULE(RepeatStatement) } { ALT: () => $.SUBRULE(RepeatStatement) },
{ ALT: () => $.SUBRULE(BackrefStatement) },
{ ALT: () => $.SUBRULE(IfStatement) }
]); ]);
}); });

View File

@ -53,34 +53,17 @@ import { createToken, Lexer } from "chevrotain";
/** @internal */ export const From = createToken({name: "From", pattern: /from/i}); /** @internal */ export const From = createToken({name: "From", pattern: /from/i});
/** @internal */ export const To = createToken({name: "To", pattern: /(to|through|thru|\-|\.\.\.?)/i}); /** @internal */ export const To = createToken({name: "To", pattern: /(to|through|thru|\-|\.\.\.?)/i});
/** @internal */ export const Create = createToken({name: "Create", pattern: /create(s)?/i}); /** @internal */ export const Create = createToken({name: "Create", pattern: /create(s)?/i});
/** @internal */ export const Called = createToken({name: "Called", pattern: /name(d)?|call(ed)?/i}); /** @internal */ export const Called = createToken({name: "Called", pattern: /named|called/i});
/** @internal */ export const Repeat = createToken({name: "Repeat", pattern: /repeat(s|ing)?/i}); /** @internal */ export const Repeat = createToken({name: "Repeat", pattern: /repeat(s|ing)?/i});
/** @internal */ export const Newline = createToken({name: "Newline", pattern: /(new line|newline)/i}); /** @internal */ export const Newline = createToken({name: "Newline", pattern: /(new line|newline)/i});
/** @internal */ export const CarriageReturn = createToken({name: "CarriageReturn", pattern: /carriage return/i}); /** @internal */ export const CarriageReturn = createToken({name: "CarriageReturn", pattern: /carriage return/i});
/** @internal */ export const CaseInsensitive = createToken({name: "CaseInsensitive", pattern: /case insensitive/i}); /** @internal */ export const CaseInsensitive = createToken({name: "CaseInsensitive", pattern: /case insensitive/i});
/** @internal */ export const CaseSensitive = createToken({name: "CaseSensitive", pattern: /case sensitive/i}); /** @internal */ export const CaseSensitive = createToken({name: "CaseSensitive", pattern: /case sensitive/i});
/** @internal */ export const OrMore = createToken({name: "OrMore", pattern: /\+|or more/i}); /** @internal */ export const OrMore = createToken({name: "OrMore", pattern: /\+|or more/i});
/** @internal */ export const Call = createToken({name: "Call", pattern: /call|invoke|execute|(re ?)?run/i });
/* /** @internal */ export const The = createToken({name: "The", pattern: /the/i });
//Not being used currently /** @internal */ export const If = createToken({name: "If", pattern: /if/i });
export const Of = createToken({name: "Of", pattern: /of/i}); /** @internal */ export const Else = createToken({name: "Else", pattern: /else|otherwise/i });
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s) with?/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)? with/i});
export const Else = createToken({name: "Else", pattern: /(other wise|otherwise|else)/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const By = createToken({name: "By", pattern: /by/i});
*/
/** @internal */ export const EndOfLine = createToken({name: "EOL", pattern: /\n/}); /** @internal */ export const EndOfLine = createToken({name: "EOL", pattern: /\n/});
/** @internal */ export const WS = createToken({name: "Whitespace", pattern: /[^\S\n]+/, start_chars_hint: [ " ", "\r" ], group: Lexer.SKIPPED}); /** @internal */ export const WS = createToken({name: "Whitespace", pattern: /[^\S\n]+/, start_chars_hint: [ " ", "\r" ], group: Lexer.SKIPPED});
@ -127,22 +110,11 @@ export const AllTokens = [
Whitespace, Whitespace,
Number, Number,
Unicode, Unicode,
/* Called,
Of, Call,
As,
If, If,
Start,
Ends,
Else, Else,
Unless,
While,
More,
Nothing,
By,
The, The,
None,
Neither,
*/
Using, Using,
Global, Global,
Multiline, Multiline,
@ -158,7 +130,6 @@ export const AllTokens = [
Exclusive, Exclusive,
From, From,
Create, Create,
Called,
Repeat, Repeat,
Newline, Newline,
CarriageReturn, CarriageReturn,

View File

@ -186,6 +186,7 @@ export class CommonError {
* *
* @param error The lexing error * @param error The lexing error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
public static fromLexError(error: ILexingError): CommonError { public static fromLexError(error: ILexingError): CommonError {
// not really fond of --> and <-- // not really fond of --> and <--
@ -199,6 +200,7 @@ export class CommonError {
* *
* @param error The parsing error * @param error The parsing error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
public static fromParseError(error: IRecognitionException): CommonError { public static fromParseError(error: IRecognitionException): CommonError {
// not really fond of --> and <-- // not really fond of --> and <--
@ -212,6 +214,7 @@ export class CommonError {
* *
* @param error The semantic error * @param error The semantic error
* @returns a new CommonError * @returns a new CommonError
* @internal
*/ */
public static fromSemanticError(error: ISemanticError): CommonError { public static fromSemanticError(error: ISemanticError): CommonError {
return new CommonError("Semantic Error", error.startLine, error.startColumn, error.length, error.message); return new CommonError("Semantic Error", error.startLine, error.startColumn, error.length, error.message);

View File

@ -2,7 +2,7 @@
import { Human2RegexParser, Human2RegexParserOptions } from "../src/parser"; import { Human2RegexParser, Human2RegexParserOptions } from "../src/parser";
import { Human2RegexLexer, Human2RegexLexerOptions } from "../src/lexer"; import { Human2RegexLexer, Human2RegexLexerOptions } from "../src/lexer";
import { RegexDialect, minimizeMatchString } from "../src/generator"; import { RegexDialect } from "../src/generator";
describe("Generator functionality", function() { describe("Generator functionality", function() {
@ -67,6 +67,14 @@ describe("Generator functionality", function() {
const toks5 = lexer.tokenize('match between 2 and 2 exclusive "hello"').tokens; const toks5 = lexer.tokenize('match between 2 and 2 exclusive "hello"').tokens;
const reg5 = parser.parse(toks5); const reg5 = parser.parse(toks5);
expect(reg5.validate(RegexDialect.JS).length).toBeGreaterThan(0); expect(reg5.validate(RegexDialect.JS).length).toBeGreaterThan(0);
const toks6 = lexer.tokenize('create a group called thing\n\tmatch "hi"\ncreate a group called thing\n\tmatch "hi"\n').tokens;
const reg6 = parser.parse(toks6);
expect(reg6.validate(RegexDialect.JS).length).toBeGreaterThan(0);
const toks7 = lexer.tokenize("invoke thing").tokens;
const reg7 = parser.parse(toks7);
expect(reg7.validate(RegexDialect.JS).length).toBeGreaterThan(0);
}); });
it("handles ranges", function() { it("handles ranges", function() {
@ -97,6 +105,12 @@ describe("Generator functionality", function() {
expect(reg2.validate(RegexDialect.JS).length).toBe(0); expect(reg2.validate(RegexDialect.JS).length).toBe(0);
expect(reg2.toRegex(RegexDialect.JS)).toBe("/[a-zA-Z][+-]?\\d+[+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+))/"); expect(reg2.toRegex(RegexDialect.JS)).toBe("/[a-zA-Z][+-]?\\d+[+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+))/");
expect(reg2.toRegex(RegexDialect.PCRE)).toBe("/[[:alpha:]][+-]?\\d+[+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+))/"); expect(reg2.toRegex(RegexDialect.PCRE)).toBe("/[[:alpha:]][+-]?\\d+[+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+))/");
const toks3 = lexer.tokenize("match not letter, not integer, not decimal").tokens;
const reg3 = parser.parse(toks3);
expect(reg3.validate(RegexDialect.JS).length).toBe(0);
expect(reg3.toRegex(RegexDialect.JS)).toBe("/[^a-zA-Z](?![+-]?\\d+)(?![+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+)))/");
expect(reg3.toRegex(RegexDialect.PCRE)).toBe("/[^[:alpha:]](?![+-]?\\d+)(?![+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+)))/");
}); });
it("doesn't clobber repetition", function() { it("doesn't clobber repetition", function() {
@ -115,23 +129,6 @@ describe("Generator functionality", function() {
expect(reg1.toRegex(RegexDialect.JS)).toBe("/(?!hello){1,6}/"); expect(reg1.toRegex(RegexDialect.JS)).toBe("/(?!hello){1,6}/");
}); });
it("can minimize matches", function() {
const test_cases = [
{ from: [ "abc", "abc" ], to: "abc" },
{ from: [ "a", "ab" ], to: "ab?" },
{ from: [ "a1x1z", "a2y2z", "a3z3z" ], to: "a(?:1x1|2y2|3z3)z" },
{ from: [ "ab", "cd" ], to: "ab|cd" },
{ from: [ "abc", "bc" ], to: "a?bc" },
{ from: [ "abc", "xb" ], to: "abc|xb" }
];
for (const c of test_cases) {
const got = minimizeMatchString(c.from);
expect(got).toBe(c.to);
}
});
it("optimizes correctly", function() { it("optimizes correctly", function() {
const toks0 = lexer.tokenize('match "a" or "b" or "b"').tokens; const toks0 = lexer.tokenize('match "a" or "b" or "b"').tokens;
const reg0 = parser.parse(toks0); const reg0 = parser.parse(toks0);
@ -157,6 +154,44 @@ describe("Generator functionality", function() {
const reg4 = parser.parse(toks4); const reg4 = parser.parse(toks4);
expect(reg4.validate(RegexDialect.JS).length).toBe(0); expect(reg4.validate(RegexDialect.JS).length).toBe(0);
expect(reg4.toRegex(RegexDialect.JS)).toBe("/a(?:1x1|2x2|3x3)z/"); expect(reg4.toRegex(RegexDialect.JS)).toBe("/a(?:1x1|2x2|3x3)z/");
const toks5 = lexer.tokenize('match "a", maybe "b" or "c"').tokens;
const reg5 = parser.parse(toks5);
expect(reg5.validate(RegexDialect.JS).length).toBe(0);
expect(reg5.toRegex(RegexDialect.JS)).toBe("/a[bc]?/");
});
it("can generate backreferences", function() {
const toks0 = lexer.tokenize('create a group called thing\n\tmatch "Hello World"\ninvoke thing\noptionally call 3 times the group called thing').tokens;
const reg0 = parser.parse(toks0);
expect(reg0.validate(RegexDialect.JS).length).toBe(0);
expect(reg0.toRegex(RegexDialect.JS)).toBe("/(?<thing>Hello World)\\g<thing>(?:\\g<thing>{3})?/");
expect(reg0.toRegex(RegexDialect.PCRE)).toBe("/(?P<thing>Hello World)\\g<thing>(?:\\g<thing>{3})?/");
expect(reg0.toRegex(RegexDialect.Python)).toBe("/(?P<thing>Hello World)(?P=thing)(?:(?P=thing){3})?/");
expect(reg0.toRegex(RegexDialect.DotNet)).toBe("/(?<thing>Hello World)\\k<thing>(?:\\k<thing>{3})?/");
});
it("can generate if statements", function() {
const toks0 = lexer.tokenize('if matches "a"\n\tmatch "b"\n').tokens;
const reg0 = parser.parse(toks0);
expect(reg0.validate(RegexDialect.JS).length).toBeGreaterThan(0);
expect(reg0.validate(RegexDialect.PCRE).length).toBe(0);
expect(reg0.toRegex(RegexDialect.PCRE)).toBe("/(?(a)b)/");
const toks1 = lexer.tokenize('if matches "alpha", maybe "b" or "f"\n\tmatch "c"\nelse\n\tif matches "d"\n\t\tmatch "e"\n\telse\n\t\tmatch "f"').tokens;
const reg1 = parser.parse(toks1);
expect(reg1.validate(RegexDialect.JS).length).toBeGreaterThan(0);
expect(reg1.validate(RegexDialect.Python).length).toBeGreaterThan(0);
expect(reg1.validate(RegexDialect.PCRE).length).toBe(0);
expect(reg1.toRegex(RegexDialect.PCRE)).toBe("/(?(alpha[bf]?)c|(?(d)e|f))/");
const toks2 = lexer.tokenize('create a group called thing\n\tmatch "a"\nif thing\n\tmatch "b"\nelse\n\tmatch "c"\n').tokens;
const reg2 = parser.parse(toks2);
expect(reg2.validate(RegexDialect.JS).length).toBeGreaterThan(0);
expect(reg2.validate(RegexDialect.PCRE).length).toBe(0);
expect(reg2.toRegex(RegexDialect.PCRE)).toBe("/(?P<thing>a)(?(thing)b|c)/");
expect(reg2.toRegex(RegexDialect.Boost)).toBe("/(?<thing>a)(?(<thing>)b|c)/");
}); });
it("generate dialect specific regex", function() { it("generate dialect specific regex", function() {
@ -187,7 +222,7 @@ describe("Generator functionality", function() {
it("runs complex scripts", function() { it("runs complex scripts", function() {
const str = ` const str = `
using global and multiline and exact matching using global and multiline and exact matching and case insensitive matching
create an optional group called protocol create an optional group called protocol
match "http" match "http"
optionally match "s" optionally match "s"
@ -222,6 +257,6 @@ create an optional group
const toks = lexer.tokenize(str).tokens; const toks = lexer.tokenize(str).tokens;
const reg = parser.parse(toks); const reg = parser.parse(toks);
expect(reg.validate(RegexDialect.JS).length).toBe(0); expect(reg.validate(RegexDialect.JS).length).toBe(0);
expect(reg.toRegex(RegexDialect.JS)).toBe("/^(?<protocol>https?\\:\\/\\/)?(?<subdomain>(?:\\w+\\.)*)?(?<domain>(?:\\w+|_|\\-)+\\.\\w+)(?:\\:\\d*)?(?<path>(?:\\/(?:\\w+|_|\\-)*)*)?(\\?(?<query>(?:(?:\\w+|_|\\-)+=(?:\\w+|_|\\-)+)*))?(#.*)?$/gm"); expect(reg.toRegex(RegexDialect.JS)).toBe("/^(?<protocol>https?\\:\\/\\/)?(?<subdomain>(?:\\w+\\.)*)?(?<domain>(?:\\w+|_|\\-)+\\.\\w+)(?:\\:\\d*)?(?<path>(?:\\/(?:\\w+|_|\\-)*)*)?(\\?(?<query>(?:(?:\\w+|_|\\-)+=(?:\\w+|_|\\-)+)*))?(#.*)?$/gmi");
}); });
}); });

View File

@ -0,0 +1,63 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
import { minimizeMatchString, groupIfRequired, dontClobberRepetition } from "../src/generator_helper";
describe("Generator helper functionality", function() {
it("can minimize matches", function() {
    // Each scenario pairs the list handed to minimizeMatchString with the
    // exact string it is expected to return.
    const scenarios = [
        { from: [], to: "" },
        { from: [ "abc" ], to: "abc" },
        { from: [ "abc", "abc" ], to: "abc" },
        { from: [ "a", "ab" ], to: "ab?" },
        { from: [ "a1x1z", "a2y2z", "a3z3z" ], to: "a(?:1x1|2y2|3z3)z" },
        { from: [ "ab", "cd" ], to: "ab|cd" },
        { from: [ "abc", "bc" ], to: "a?bc" },
        { from: [ "abc", "xb" ], to: "abc|xb" }
    ];

    // Scenarios run in declaration order; the first failed expectation aborts the test.
    scenarios.forEach(({ from: alternatives, to: minimized }) => {
        expect(minimizeMatchString(alternatives)).toBe(minimized);
    });
});
it("groups correctly", function() {
    // Each scenario pairs a regex fragment given to groupIfRequired with the
    // exact string it is expected to return (either unchanged or wrapped in "(?:...)").
    const scenarios = [
        { from: "(?P=test)", to: "(?P=test)" },
        { from: "[abc\\]]", to: "[abc\\]]" },
        { from: "abc", to: "(?:abc)" },
        { from: "(abc)|d", to: "(?:(abc)|d)" },
        { from: "[abc\\]][abc]", to: "(?:[abc\\]][abc])" },
        { from: "(abc(abc)\\))(abc)", to: "(?:(abc(abc)\\))(abc))" }
    ];

    // Scenarios run in declaration order; the first failed expectation aborts the test.
    scenarios.forEach(({ from: fragment, to: grouped }) => {
        expect(groupIfRequired(fragment)).toBe(grouped);
    });
});
it("doesn't clobber the repetition", function() {
    // Each scenario gives the fragment and repetition passed to
    // dontClobberRepetition, plus the exact string it is expected to return.
    const scenarios = [
        { fragment: "1+", repetition: "+", expected: "1+" },
        { fragment: "1*", repetition: "+", expected: "1+" },
        { fragment: "1+", repetition: "*", expected: "1+" },
        { fragment: "1*", repetition: "*", expected: "1*" },
        { fragment: "1+", repetition: "?", expected: "1+?" },
        { fragment: "1*", repetition: "?", expected: "1*?" },
        { fragment: "1+", repetition: "{0,}", expected: "(?:1+){0,}" },
        { fragment: "1*", repetition: "{0,}", expected: "1{0,}" },
        { fragment: "1+", repetition: "{1,2}", expected: "1{1,2}" },
        { fragment: "1*", repetition: "{1,2}", expected: "1{1,2}" }
    ];

    // Scenarios run in declaration order; the first failed expectation aborts the test.
    scenarios.forEach(({ fragment, repetition, expected }) => {
        const combined = dontClobberRepetition(fragment, repetition);
        expect(combined).toBe(expected);
    });
});
});

View File

@ -95,8 +95,7 @@ module.exports = {
after: { after: {
root: "./lib", root: "./lib",
include: [ include: [
"script.d.ts", "script.d.ts"
"script.d.ts.map"
] ]
} }
}) })