mirror of
https://github.com/pdemian/human2regex.git
synced 2025-05-16 04:20:35 -07:00
846 lines
31 KiB
JavaScript
846 lines
31 KiB
JavaScript
"use strict";
|
|
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.minimizeMatchString = exports.RegularExpressionCST = exports.BackrefStatementCST = exports.GroupStatementCST = exports.RepeatStatementCST = exports.MatchStatementCST = exports.CountSubStatementCST = exports.UsingStatementCST = exports.MatchSubStatementCST = exports.StatementCST = exports.MatchStatementValue = exports.MatchSubStatementValue = exports.MatchSubStatementType = exports.UsingFlags = exports.H2RCST = exports.GeneratorContext = exports.RegexDialect = void 0;
|
|
/**
|
|
* Includes all Concrete Syntax Trees for Human2Regex
|
|
* @packageDocumentation
|
|
*/
|
|
const utilities_1 = require("./utilities");
|
|
/**
 * List of regular expression dialects we support
 *
 * Every member is registered in both directions: name -> ordinal and
 * ordinal -> name.
 */
var RegexDialect;
(function (dialect) {
    const names = ["JS", "PCRE", "DotNet", "Java", "Python", "Boost"];
    names.forEach((name, ordinal) => {
        dialect[name] = ordinal;
        dialect[ordinal] = name;
    });
})(RegexDialect = exports.RegexDialect || (exports.RegexDialect = {}));
|
|
/**
 * Unicode general category codes that the unicode match validation accepts
 * before it falls back to checking script names.
 */
const unicode_property_codes = [
    // C*
    "C", "Cc", "Cf", "Cn", "Co", "Cs",
    // L*
    "L", "Ll", "Lm", "Lo", "Lt", "Lu",
    // M*
    "M", "Mc", "Me", "Mn",
    // N*
    "N", "Nd", "Nl", "No",
    // P*
    "P", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps",
    // S*
    "S", "Sc", "Sk", "Sm", "So",
    // Z*
    "Z", "Zl", "Zp", "Zs"
];
|
|
/**
 * Unicode script names accepted by the unicode match validation
 * (listed alphabetically, one initial letter per row).
 */
const unicode_script_codes = [
    "Arabic", "Armenian", "Avestan",
    "Balinese", "Bamum", "Batak", "Bengali", "Bopomofo", "Brahmi", "Braille", "Buginese", "Buhid",
    "Canadian_Aboriginal", "Carian", "Chakma", "Cham", "Cherokee", "Common", "Coptic", "Cuneiform", "Cypriot", "Cyrillic",
    "Deseret", "Devanagari",
    "Egyptian_Hieroglyphs", "Ethiopic",
    "Georgian", "Glagolitic", "Gothic", "Greek", "Gujarati", "Gurmukhi",
    "Han", "Hangul", "Hanunoo", "Hebrew", "Hiragana",
    "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi", "Inscriptional_Parthian",
    "Javanese",
    "Kaithi", "Kannada", "Katakana", "Kayah_Li", "Kharoshthi", "Khmer",
    "Lao", "Latin", "Lepcha", "Limbu", "Linear_B", "Lisu", "Lycian", "Lydian",
    "Malayalam", "Mandaic", "Meetei_Mayek", "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao", "Mongolian", "Myanmar",
    "New_Tai_Lue", "Nko",
    "Ogham", "Old_Italic", "Old_Persian", "Old_South_Arabian", "Old_Turkic", "Ol_Chiki", "Oriya", "Osmanya",
    "Phags_Pa", "Phoenician",
    "Rejang", "Runic",
    "Samaritan", "Saurashtra", "Sharada", "Shavian", "Sinhala", "Sora_Sompeng", "Sundanese", "Syloti_Nagri", "Syriac",
    "Tagalog", "Tagbanwa", "Tai_Le", "Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh",
    "Ugaritic",
    "Vai",
    "Yi"
];
|
|
/**
 * Context for validation
 *
 * Keeps a record of every named group seen so far, keyed by group name.
 *
 * @internal
 */
class GeneratorContext {
    constructor() {
        // group name -> { startLine, startColumn, length } of its definition
        this.groups = {};
    }
    /**
     * Checks to see if we already have a group defined
     *
     * @param identifier the group name
     * @returns true if the group name already exists
     */
    hasGroup(identifier) {
        const { hasOwnProperty } = Object.prototype;
        return hasOwnProperty.call(this.groups, identifier);
    }
    /**
     * Adds the identifier to the group list
     *
     * @param identifier the group name
     * @param tokens the tokens of the group; the first and last one give the recorded position
     */
    addGroup(identifier, tokens) {
        const head = utilities_1.first(tokens);
        const tail = utilities_1.last(tokens);
        const line = head.startLine;
        const column = head.startColumn;
        const end_offset = tail.endOffset;
        // missing positions become NaN; a missing end offset falls back to the last token's start
        const end = end_offset == null ? tail.startOffset : end_offset;
        this.groups[identifier] = {
            startLine: line == null ? NaN : line,
            startColumn: column == null ? NaN : column,
            length: end - head.startOffset,
        };
    }
}
|
|
exports.GeneratorContext = GeneratorContext;
|
|
/**
 * The base concrete syntax tree class
 *
 * @internal
 */
class H2RCST {
    /**
     * Constructor for H2RCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @internal
     */
    constructor(tokens) {
        this.tokens = tokens;
    }
    /**
     * Creates an ISemanticError with a given message and the tokens provided from the constructor
     *
     * @param message the message
     * @returns an object with startLine, startColumn, length and message
     * @internal
     */
    error(message) {
        const head = utilities_1.first(this.tokens);
        const tail = utilities_1.last(this.tokens);
        const line = head.startLine;
        const column = head.startColumn;
        const end_offset = tail.endOffset;
        // missing positions become NaN; a missing end offset falls back to the last token's start
        const end = end_offset == null ? tail.startOffset : end_offset;
        return {
            startLine: line == null ? NaN : line,
            startColumn: column == null ? NaN : column,
            length: end - head.startOffset,
            message: message
        };
    }
}
|
|
exports.H2RCST = H2RCST;
|
|
/**
 * Flags for the using statement
 *
 * Each member's value is utilities_1.makeFlag(position), registered both as
 * name -> value and value -> name.
 *
 * @internal
 */
var UsingFlags;
(function (flags) {
    const names = ["Multiline", "Global", "Sensitive", "Insensitive", "Exact"];
    names.forEach((name, position) => {
        const value = utilities_1.makeFlag(position);
        flags[name] = value;
        flags[value] = name;
    });
})(UsingFlags = exports.UsingFlags || (exports.UsingFlags = {}));
|
|
/**
 * Type of match arguments
 *
 * @remarks SingleString means an escaped string
 * @remarks Between means a range (ex. a-z)
 * @remarks Anything means .
 * @remarks Word, Digit, Character, Whitespace, Number, Tab, Linefeed, Newline, and Carriage return are \w+, \d, \w, \s, \d+, \t, \n, \n, \r respectively
 * @internal
 */
var MatchSubStatementType;
(function (types) {
    // the position in this list is the numeric value of the member
    const names = [
        "SingleString",
        "Between",
        "Anything",
        "Word",
        "Digit",
        "Character",
        "Whitespace",
        "Number",
        "Tab",
        "Linefeed",
        "Newline",
        "CarriageReturn",
        "Boundary",
        "Unicode",
        "Letter",
        "Decimal",
        "Integer",
    ];
    names.forEach((name, ordinal) => {
        types[name] = ordinal;
        types[ordinal] = name;
    });
})(MatchSubStatementType = exports.MatchSubStatementType || (exports.MatchSubStatementType = {}));
|
|
/**
 * Container for match statements
 *
 * @internal
 */
class MatchSubStatementValue {
    /**
     * Constructor for MatchSubStatementValue
     *
     * @param type the type of this match
     * @param from optional value or range string
     * @param to optional range string
     * @internal
     */
    constructor(type, from = null, to = null) {
        Object.assign(this, { type, from, to });
    }
}
|
|
exports.MatchSubStatementValue = MatchSubStatementValue;
|
|
/**
 * Container for MatchStatementValue
 *
 * @internal
 */
class MatchStatementValue {
    /**
     * Constructor for MatchStatementValue
     *
     * @param optional is this match optional
     * @param statement the substatement to generate
     * @internal
     */
    constructor(optional, statement) {
        Object.assign(this, { optional, statement });
    }
}
|
|
exports.MatchStatementValue = MatchStatementValue;
|
|
/**
 * The base class for all statement concrete syntax trees
 *
 * Adds nothing to H2RCST; it exists as a common parent for statement nodes.
 *
 * @internal
 */
class StatementCST extends H2RCST {}
exports.StatementCST = StatementCST;
|
|
/**
 * Reports whether a regex fragment ends in a "+" quantifier rather than an
 * escaped, literal "\+".
 *
 * @param regex the fragment to inspect
 * @returns true if the fragment ends with "+" preceded by an even number of backslashes
 * @internal
 */
function endsWithPlusQuantifier(regex) {
    if (!regex.endsWith("+")) {
        return false;
    }
    let backslashes = 0;
    for (let i = regex.length - 2; i >= 0 && regex[i] === "\\"; i--) {
        backslashes++;
    }
    // an odd number of backslashes means the "+" itself is escaped
    return backslashes % 2 === 0;
}
/**
 * Concrete Syntax Tree for Match Sub statements
 *
 * @internal
 */
class MatchSubStatementCST extends H2RCST {
    /**
     * Constructor for MatchSubStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param count optional count statement
     * @param invert is this match inverted (ex, [^a-z] or [a-z])
     * @param values sub statements to match
     */
    constructor(tokens, count, invert = false, values) {
        super(tokens);
        this.count = count;
        this.invert = invert;
        this.values = values;
    }
    /**
     * Validates the count (if any), every "between" range and every unicode specifier.
     *
     * @param language the target dialect (DotNet and Java require an "Is" prefix on script names)
     * @param context validation context, forwarded to the count statement
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        const errors = [];
        if (this.count) {
            utilities_1.append(errors, this.count.validate(language, context));
        }
        for (const value of this.values) {
            if (value.type === MatchSubStatementType.Between) {
                let from = utilities_1.removeQuotes(value.from);
                let to = utilities_1.removeQuotes(value.to);
                // NOTE(review): JSON.parse throws on escapes JSON does not define (e.g. "\U...");
                // confirm isSingleRegexCharacter only lets JSON-compatible escapes through.
                if (!utilities_1.isSingleRegexCharacter(from)) {
                    errors.push(this.error("Between statement must begin with a single character"));
                }
                else if (from.startsWith("\\u") || from.startsWith("\\U") || from.startsWith("\\")) {
                    // decode the escape so the range comparison below sees the real character
                    from = JSON.parse(`"${from}"`);
                }
                if (!utilities_1.isSingleRegexCharacter(to)) {
                    errors.push(this.error("Between statement must end with a single character"));
                }
                else if (to.startsWith("\\u") || to.startsWith("\\U") || to.startsWith("\\")) {
                    to = JSON.parse(`"${to}"`);
                }
                if (from.charCodeAt(0) >= to.charCodeAt(0)) {
                    errors.push(this.error("Between statement range invalid"));
                }
            }
            else if (value.type === MatchSubStatementType.Unicode) {
                let unicode_class = utilities_1.removeQuotes(value.from);
                // check to see if the given code is supported
                if (!unicode_property_codes.includes(unicode_class)) {
                    // check to see if the given script is supported
                    // Java and C# requires "Is*"
                    if (language === RegexDialect.DotNet || language === RegexDialect.Java) {
                        if (!unicode_class.startsWith("Is")) {
                            errors.push(this.error("This dialect requires script names to begin with Is, such as IsCyrillic rather than Cyrillic"));
                            continue;
                        }
                        unicode_class = unicode_class.substr(2);
                    }
                    if (!unicode_script_codes.includes(unicode_class)) {
                        errors.push(this.error(`Unknown unicode specifier ${value.from}`));
                    }
                }
            }
        }
        return errors;
    }
    /**
     * Generates the regular expression fragment for this match, including its count.
     *
     * @param language the target dialect (only affects the Letter class and the count)
     * @returns the regex fragment
     */
    toRegex(language) {
        const matches = [];
        // true when the fragment pushed last is a sequence of several regex tokens,
        // i.e. a quantifier appended to it would only bind to its final token
        let last_is_sequence = false;
        for (const value of this.values) {
            last_is_sequence = false;
            switch (value.type) {
                case MatchSubStatementType.SingleString: {
                    const reg = utilities_1.regexEscape(utilities_1.removeQuotes(value.from));
                    if (this.invert) {
                        matches.push(`(?!${reg})`);
                    }
                    else {
                        matches.push(reg);
                        last_is_sequence = !utilities_1.isSingleRegexCharacter(reg);
                    }
                    break;
                }
                case MatchSubStatementType.Between: {
                    const from = utilities_1.removeQuotes(value.from);
                    const to = utilities_1.removeQuotes(value.to);
                    matches.push(this.invert ? `[^${from}-${to}]` : `[${from}-${to}]`);
                    break;
                }
                case MatchSubStatementType.Unicode: {
                    const unicode = utilities_1.removeQuotes(value.from);
                    matches.push(this.invert ? `\\P{${unicode}}` : `\\p{${unicode}}`);
                    break;
                }
                case MatchSubStatementType.Boundary:
                    matches.push(this.invert ? "\\B" : "\\b");
                    break;
                case MatchSubStatementType.Word:
                    matches.push(this.invert ? "\\W+" : "\\w+");
                    break;
                case MatchSubStatementType.Letter: {
                    if (language === RegexDialect.PCRE) {
                        matches.push(this.invert ? "[^[:alpha:]]" : "[[:alpha:]]");
                    }
                    else {
                        matches.push(this.invert ? "[^a-zA-Z]" : "[a-zA-Z]");
                    }
                    break;
                }
                case MatchSubStatementType.Integer:
                    matches.push(this.invert ? "(?![+-]?\\d+)" : "[+-]?\\d+");
                    break;
                case MatchSubStatementType.Decimal:
                    matches.push(this.invert ? "(?![+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+)))" : "[+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+))");
                    // the non-inverted form is an optional sign followed by a group
                    last_is_sequence = !this.invert;
                    break;
                case MatchSubStatementType.Digit:
                    matches.push(this.invert ? "\\D" : "\\d");
                    break;
                case MatchSubStatementType.Character:
                    matches.push(this.invert ? "\\W" : "\\w");
                    break;
                case MatchSubStatementType.Whitespace:
                    matches.push(this.invert ? "\\S" : "\\s");
                    break;
                case MatchSubStatementType.Number:
                    matches.push(this.invert ? "\\D+" : "\\d+");
                    break;
                case MatchSubStatementType.Tab:
                    matches.push(this.invert ? "[^\\t]" : "\\t");
                    break;
                case MatchSubStatementType.Newline:
                case MatchSubStatementType.Linefeed:
                    matches.push(this.invert ? "[^\\n]" : "\\n");
                    break;
                case MatchSubStatementType.CarriageReturn:
                    matches.push(this.invert ? "[^\\r]" : "\\r");
                    break;
                default:
                    // default: anything
                    matches.push(this.invert ? "[^.]" : ".");
                    break;
            }
        }
        let ret = "";
        let require_grouping = false;
        let dont_clobber_plus = false;
        if (matches.length === 1) {
            ret = utilities_1.first(matches);
            // only a real "+" quantifier may be merged with the count, never a literal "\+"
            dont_clobber_plus = endsWithPlusQuantifier(ret);
            // a multi-token fragment must be grouped or the count binds to its last token only
            require_grouping = !dont_clobber_plus && last_is_sequence;
        }
        else {
            ret = minimizeMatchString(matches);
            if (ret.length > 1 &&
                (!ret.startsWith("(") || !ret.endsWith("["))) {
                require_grouping = true;
            }
        }
        if (!this.count) {
            return ret;
        }
        const count = this.count.toRegex(language);
        if (!dont_clobber_plus) {
            return (require_grouping ? "(?:" + ret + ")" : ret) + count;
        }
        // ret already ends in "+": merge that quantifier with the requested count
        switch (count) {
            case "*":
            case "?":
                return "(?:" + ret + ")" + count;
            case "+":
            case "":
                // nothing to add: the existing "+" already covers it
                return ret;
            default:
                if (count.startsWith("{0")) {
                    // counts that allow zero repetitions must wrap the whole "one or more"
                    return "(?:" + ret + ")" + count;
                }
                // remove + and replace with count (the result used to be discarded)
                return ret.substring(0, ret.length - 1) + count;
        }
    }
}
exports.MatchSubStatementCST = MatchSubStatementCST;
|
|
/**
 * Concrete Syntax Tree for Using statements
 *
 * @internal
 */
class UsingStatementCST extends H2RCST {
    /**
     * Constructor for UsingStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param flags using flags
     */
    constructor(tokens, flags) {
        super(tokens);
        this.flags = flags;
    }
    /**
     * Reports duplicated modifiers and the sensitive/insensitive contradiction.
     *
     * @param language unused
     * @param context unused
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        utilities_1.unusedParameter(language, "Count does not need checking");
        utilities_1.unusedParameter(context, "Context is not needed");
        const errors = [];
        // accumulates every flag seen so far, starting from the first one
        let seen = this.flags[0];
        for (let index = 1; index < this.flags.length; index++) {
            const current = this.flags[index];
            if (utilities_1.hasFlag(seen, current)) {
                errors.push(this.error("Duplicate modifier: " + UsingFlags[current]));
            }
            seen = utilities_1.combineFlags(seen, current);
        }
        if (utilities_1.hasFlag(seen, UsingFlags.Sensitive) && utilities_1.hasFlag(seen, UsingFlags.Insensitive)) {
            errors.push(this.error("Cannot be both case sensitive and insensitive"));
        }
        return errors;
    }
    /**
     * Builds the regex template: delimiters, optional anchors and trailing
     * modifier letters, with a "{regex}" placeholder for the body.
     *
     * @param language unused
     * @returns the template string
     */
    toRegex(language) {
        utilities_1.unusedParameter(language, "Using Statement does not change based on language");
        // checked in this order; the first flag that matches wins
        const letters = [
            [UsingFlags.Multiline, "m"],
            [UsingFlags.Global, "g"],
            [UsingFlags.Insensitive, "i"],
        ];
        let modifiers = "";
        let anchored = false;
        for (const flag of this.flags) {
            const hit = letters.find((entry) => utilities_1.hasFlag(flag, entry[0]));
            if (hit) {
                modifiers += hit[1];
            }
            else if (utilities_1.hasFlag(flag, UsingFlags.Exact)) {
                anchored = true;
            }
        }
        const template = anchored ? "/^{regex}$/" : "/{regex}/";
        return template + modifiers;
    }
}
exports.UsingStatementCST = UsingStatementCST;
|
|
/**
 * Concrete Syntax Tree for Count sub statements
 *
 * @internal
 */
class CountSubStatementCST extends H2RCST {
    /**
     * Constructor for CountSubStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param from number to count from
     * @param to optional number to count to
     * @param opt option modifier ("+" for open ended, "exclusive" to exclude the upper bound)
     */
    constructor(tokens, from, to = null, opt = null) {
        super(tokens);
        this.from = from;
        this.to = to;
        this.opt = opt;
    }
    /**
     * Checks that a given upper bound lies above the lower bound.
     *
     * @param language unused
     * @param context unused
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        utilities_1.unusedParameter(language, "Count does not need checking");
        utilities_1.unusedParameter(context, "Context is not needed");
        const errors = [];
        if (this.to !== null) {
            // an exclusive upper bound is one lower than written
            const upper = this.opt === "exclusive" ? this.to - 1 : this.to;
            if (upper <= this.from) {
                errors.push(this.error("Values must be in range of eachother"));
            }
        }
        return errors;
    }
    /**
     * Generates the quantifier for this count.
     *
     * @param language unused
     * @returns "+", "*", "{n}", "{n,}", "{n,m}", or "" when the count is exactly one
     */
    toRegex(language) {
        utilities_1.unusedParameter(language, "Count does not change from language");
        const from = this.from;
        if (this.to !== null) {
            const to = this.opt === "exclusive" ? this.to - 1 : this.to;
            return `{${from},${to}}`;
        }
        const open_ended = this.opt === "+";
        if (from === 1) {
            // if we only have a count of 1, we can ignore adding any extra text
            // (this used to return "*", which means "zero or more", not "exactly one")
            return open_ended ? "+" : "";
        }
        if (from === 0) {
            return open_ended ? "*" : "{0}";
        }
        return open_ended ? `{${from},}` : `{${from}}`;
    }
}
exports.CountSubStatementCST = CountSubStatementCST;
|
|
/**
 * Reports whether a regex fragment is exactly one parenthesised group, i.e.
 * its first "(" is closed by its last ")".
 *
 * Escaped characters and characters inside a character class are skipped.
 *
 * @param regex the fragment to inspect
 * @returns true only if the whole fragment is a single group
 * @internal
 */
function isWrappedInSingleGroup(regex) {
    if (regex.length < 2 || regex[0] !== "(" || regex[regex.length - 1] !== ")") {
        return false;
    }
    let depth = 0;
    let in_class = false;
    for (let i = 0; i < regex.length; i++) {
        const ch = regex[i];
        if (ch === "\\") {
            // skip the escaped character
            i++;
        }
        else if (in_class) {
            in_class = ch !== "]";
        }
        else if (ch === "[") {
            in_class = true;
        }
        else if (ch === "(") {
            depth++;
        }
        else if (ch === ")") {
            depth--;
            if (depth === 0) {
                // the group opened at index 0 closes here; it must be the very end
                return i === regex.length - 1;
            }
        }
    }
    return false;
}
/**
 * Appends "?" to a fragment, wrapping it in a non-capturing group first
 * unless it is a single character or already a single group.
 *
 * @param regex the fragment to make optional
 * @returns the optional fragment
 * @internal
 */
function makeFragmentOptional(regex) {
    const atomic = utilities_1.isSingleRegexCharacter(regex) || isWrappedInSingleGroup(regex);
    return (atomic ? regex : "(?:" + regex + ")") + "?";
}
/**
 * Concrete Syntax Tree for a Match statement
 *
 * @internal
 */
class MatchStatementCST extends StatementCST {
    /**
     * Constructor for MatchStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param completely_optional is the whole match statement optional
     * @param matches the list of matches, each with an `optional` flag and a `statement`
     */
    constructor(tokens, completely_optional, matches) {
        super(tokens);
        this.completely_optional = completely_optional;
        this.matches = matches;
    }
    /**
     * Validates every sub statement.
     *
     * @param language the target dialect
     * @param context validation context
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        const errors = [];
        for (const match of this.matches) {
            utilities_1.append(errors, match.statement.validate(language, context));
        }
        return errors;
    }
    /**
     * Concatenates the regex of every match, applying optionality per match
     * and then to the statement as a whole.
     *
     * The previous first/last character check treated fragments such as
     * "(?:a|b)c" or "(?:x){3}" as already grouped, so the appended "?" applied
     * to the wrong part or turned a quantifier lazy.
     *
     * @param language the target dialect
     * @returns the regex fragment
     */
    toRegex(language) {
        const joined = this.matches.map((x) => {
            const match_stmt = x.statement.toRegex(language);
            return x.optional ? makeFragmentOptional(match_stmt) : match_stmt;
        }).join("");
        return this.completely_optional ? makeFragmentOptional(joined) : joined;
    }
}
exports.MatchStatementCST = MatchStatementCST;
|
|
/**
 * Concrete Syntax Tree for a Repeat statement
 *
 * @internal
 */
class RepeatStatementCST extends StatementCST {
    /**
     * Constructor for RepeatStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param optional is this repetition optional
     * @param count optional number of times to repeat
     * @param statements the statements to repeat
     */
    constructor(tokens, optional, count, statements) {
        super(tokens);
        this.optional = optional;
        this.count = count;
        this.statements = statements;
    }
    /**
     * Validates the count (when present) followed by every repeated statement.
     *
     * @param language the target dialect
     * @param context validation context
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        const errors = [];
        const children = this.count !== null ? [this.count, ...this.statements] : [...this.statements];
        for (const child of children) {
            utilities_1.append(errors, child.validate(language, context));
        }
        return errors;
    }
    /**
     * Wraps the repeated statements in a non-capturing group and quantifies it.
     *
     * @param language the target dialect
     * @returns the regex fragment
     */
    toRegex(language) {
        const body = this.statements.map((statement) => statement.toRegex(language)).join("");
        const group = "(?:" + body + ")";
        if (!this.count) {
            // no explicit count: repeat any number of times
            return this.optional ? group + "*?" : group + "*";
        }
        const counted = group + this.count.toRegex(language);
        // group for optionality because count would be incorrect otherwise
        return this.optional ? "(?:" + counted + ")?" : counted;
    }
}
exports.RepeatStatementCST = RepeatStatementCST;
|
|
/**
 * Concrete Syntax Tree for a group Statement
 *
 * @internal
 */
class GroupStatementCST extends StatementCST {
    /**
     * Constructor for GroupStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param optional is this group optional
     * @param name optional name for named group
     * @param statements other statements
     * @internal
     */
    constructor(tokens, optional, name, statements) {
        super(tokens);
        this.optional = optional;
        this.name = name;
        this.statements = statements;
    }
    /**
     * Registers a named group in the context (reporting a redefinition) and
     * validates every child statement.
     *
     * @param language the target dialect
     * @param context validation context holding the groups defined so far
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        const errors = [];
        // All languages currently support named groups
        if (this.name !== null) {
            if (context.hasGroup(this.name)) {
                const past_group = context.groups[this.name];
                // location is reported as line:column-line:column (the old message repeated the line number everywhere)
                const start = `${past_group.startLine}:${past_group.startColumn}`;
                const end = `${past_group.startLine}:${past_group.startColumn + past_group.length}`;
                errors.push(this.error(`Group with name "${this.name}" was already defined here: ${start}-${end}`));
            }
            else {
                context.addGroup(this.name, this.tokens);
            }
        }
        for (const statement of this.statements) {
            utilities_1.append(errors, statement.validate(language, context));
        }
        return errors;
    }
    /**
     * Generates a capturing group, named when a name was given.
     *
     * @param language the target dialect
     * @returns the regex fragment
     */
    toRegex(language) {
        let str = "(";
        // named group
        if (this.name !== null) {
            // python and PCRE use "?P" while everything else is just "?"
            const uses_p = language === RegexDialect.Python || language === RegexDialect.PCRE;
            str += (uses_p ? "?P" : "?") + `<${this.name}>`;
        }
        str += this.statements.map((x) => x.toRegex(language)).join("");
        str += (this.optional ? ")?" : ")");
        return str;
    }
}
exports.GroupStatementCST = GroupStatementCST;
|
|
/**
 * Concrete Syntax Tree for a Backreference statement
 *
 * @internal
 */
class BackrefStatementCST extends StatementCST {
    /**
     * Constructor for BackrefStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param optional is this backref optional
     * @param count optional number of times to repeat
     * @param name the group name to call
     */
    constructor(tokens, optional, count, name) {
        super(tokens);
        this.optional = optional;
        this.count = count;
        this.name = name;
    }
    /**
     * Checks that the referenced group was defined earlier and validates the count.
     *
     * @param language the target dialect
     * @param context validation context holding the groups defined so far
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        const errors = [];
        if (!context.hasGroup(this.name)) {
            errors.push(this.error(`Cannot call group with name "${this.name}" as it was never previously defined`));
        }
        if (this.count !== null) {
            utilities_1.append(errors, this.count.validate(language, context));
        }
        return errors;
    }
    /**
     * Generates the dialect specific reference to the named group, followed by
     * its count and optionality.
     *
     * @param language the target dialect
     * @returns the regex fragment
     */
    toRegex(language) {
        let str = "";
        switch (language) {
            case RegexDialect.Python:
                str = `(?P=${this.name})`;
                break;
            case RegexDialect.DotNet:
            case RegexDialect.Java:
                str = `\\k<${this.name}>`;
                break;
            default:
                str = `\\g<${this.name}>`;
                break;
        }
        if (this.count) {
            str += this.count.toRegex(language);
            // group for optionality because count would be incorrect otherwise
            if (this.optional) {
                str = "(?:" + str + ")?";
            }
        }
        else if (this.optional) {
            // append the marker; assigning it used to throw the reference itself away
            str += "?";
        }
        return str;
    }
}
exports.BackrefStatementCST = BackrefStatementCST;
|
|
/**
 * Concrete Syntax Tree for a regular expression
 *
 * @internal
 */
class RegularExpressionCST extends H2RCST {
    /**
     * Constructor for RegularExpressionCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param usings using statements
     * @param statements other statements
     * @internal
     */
    constructor(tokens, usings, statements) {
        super(tokens);
        this.usings = usings;
        this.statements = statements;
    }
    /**
     * Validates the using statement followed by every other statement.
     *
     * @param language the target dialect
     * @param context validation context shared by all statements
     * @returns the list of semantic errors found (empty if none)
     */
    validate(language, context) {
        const errors = this.usings.validate(language, context);
        for (const statement of this.statements) {
            utilities_1.append(errors, statement.validate(language, context));
        }
        return errors;
    }
    /**
     * Generates the complete regular expression by substituting the joined
     * statements into the "{regex}" placeholder of the using template.
     *
     * @param language the target dialect
     * @returns the complete regular expression
     */
    toRegex(language) {
        const modifiers = this.usings.toRegex(language);
        const regex = this.statements.map((x) => x.toRegex(language)).join("");
        // a replacer function inserts the text verbatim; a replacement *string* would
        // expand "$&", "$'", "$`" and "$$" sequences that occur in the generated regex
        return modifiers.replace("{regex}", () => regex);
    }
}
exports.RegularExpressionCST = RegularExpressionCST;
|
|
/**
 * Minimizes the match string by finding duplicates or substrings in the array
 *
 * Public entry point: starts the recursive minimizer at depth 0.
 *
 * @param arr the array of matches
 * @returns the minimized regex fragment
 * @internal
 */
function minimizeMatchString(arr) {
    const initial_depth = 0;
    return minMatchString(arr, initial_depth);
}
exports.minimizeMatchString = minimizeMatchString;
|
|
/**
 * Minimizes the match string by finding duplicates or substrings in the array
 *
 * @param arr the array
 * @param depth must be 0 for initial call
 * @returns a regex fragment that matches any one of the given alternatives
 * @internal
 */
function minMatchString(arr, depth = 0) {
    return minMatchParts(arr, depth).text;
}
/**
 * Worker for minMatchString.
 *
 * Factors the prefix and suffix shared by all alternatives out of the
 * alternation and recurses on what is left in the middle.
 *
 * NOTE(review): the factoring works on raw characters, so it can split an
 * escape sequence or a character class that two alternatives merely start or
 * end alike (e.g. "\d+" and "\w+" share the prefix "\") — confirm whether
 * callers can pass such fragments.
 *
 * @param arr the alternatives
 * @param depth recursion depth; at depth 0 a plain alternation is left ungrouped
 * @returns `text`, the fragment, and `atomic`, whether a quantifier may be appended to it directly
 * @internal
 */
function minMatchParts(arr, depth) {
    const literal = (text) => ({ text: text, atomic: utilities_1.isSingleRegexCharacter(text) });
    // a quantifier only binds to the last token, so anything non-atomic is grouped first
    const optional = (part) => (part.atomic ? part.text : "(?:" + part.text + ")") + "?";
    // base case: arr is empty
    if (arr.length === 0) {
        return { text: "", atomic: false };
    }
    // base case: arr has 1 element (must have at least 2, so this means this value is optional)
    if (arr.length === 1) {
        return { text: optional(literal(arr[0])), atomic: false };
    }
    // remove duplicates
    const unique = [...new Set(arr)];
    // base case: 1 element left (after duplicate removal means this is required)
    if (unique.length === 1) {
        return literal(unique[0]);
    }
    // base case: everything is a single letter
    if (unique.every(utilities_1.isSingleRegexCharacter)) {
        return { text: "[" + unique.join("") + "]", atomic: true };
    }
    // shrink the first element down to the prefix and the suffix common to all elements
    let prefix = unique[0];
    let suffix = unique[0];
    let shortest = unique[0].length;
    for (let i = 1; i < unique.length; i++) {
        const item = unique[i];
        shortest = Math.min(shortest, item.length);
        let p = 0;
        while (p < prefix.length && p < item.length && prefix[p] === item[p]) {
            p++;
        }
        prefix = prefix.substring(0, p);
        let s = 0;
        while (s < suffix.length && s < item.length &&
            suffix[suffix.length - 1 - s] === item[item.length - 1 - s]) {
            s++;
        }
        suffix = suffix.substring(suffix.length - s);
    }
    // prefix and suffix must not overlap inside the shortest element, otherwise the
    // middle slice below gets start > end and substring() silently swaps its arguments
    const room = shortest - prefix.length;
    if (suffix.length > room) {
        suffix = suffix.substring(suffix.length - room);
    }
    // No matches whatsoever
    // *technically* we can optimize further, but that is a VERY non-trivial problem
    // For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
    if (prefix.length === 0 && suffix.length === 0) {
        const alternatives = unique.join("|");
        return depth > 0
            ? { text: "(?:" + alternatives + ")", atomic: true }
            : { text: alternatives, atomic: false };
    }
    // we have some matches: strip prefix and suffix from each element
    const middles = [];
    // an element equal to prefix + suffix means the middle part may be absent
    let has_empty = false;
    for (const ele of unique) {
        const middle = ele.substring(prefix.length, ele.length - suffix.length);
        if (middle.length === 0) {
            has_empty = true;
        }
        else {
            middles.push(middle);
        }
    }
    let inner = "";
    if (middles.length === 1) {
        inner = has_empty ? optional(literal(middles[0])) : middles[0];
    }
    else if (middles.length > 1) {
        const part = minMatchParts(middles, depth + 1);
        inner = has_empty ? optional(part) : part.text;
    }
    return { text: prefix + inner + suffix, atomic: false };
}
|