1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 04:20:35 -07:00
human2regex/lib/parser.js

535 lines
24 KiB
JavaScript

"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Human2RegexParser = exports.ParseResult = exports.Human2RegexParserOptions = void 0;
/**
* The parser for Human2Regex
* @packageDocumentation
*/
const chevrotain_1 = require("chevrotain");
const T = __importStar(require("./tokens"));
const generator_1 = require("./generator");
const utilities_1 = require("./utilities");
/**
* The options for the Parser
*/
class Human2RegexParserOptions {
/**
* Constructor for Human2RegexParserOptions
*
* @param skip_validations If true, the lexer will skip validations (~25% faster)
*/
constructor(skip_validations = false) {
this.skip_validations = skip_validations;
/* empty */
}
}
exports.Human2RegexParserOptions = Human2RegexParserOptions;
class TokenAndValue {
constructor(token, value) {
this.token = token;
this.value = value;
/* empty */
}
}
class TokensAndValue {
constructor(tokens, value) {
this.tokens = tokens;
this.value = value;
/* empty */
}
}
/**
* Tokenization result
*/
class ParseResult {
/**
* Constructor for the TokenizeResult
*
* @param tokens The token stream
* @param errors A list of lexing errors
*/
constructor(regexp_cst, errors) {
this.regexp_cst = regexp_cst;
this.errors = errors;
/* empty */
}
/**
* Validate that this is both valid and can be generated in the specified language
*
* @remarks There is no guarantee toRegex or toRegExp will work unless validate returns no errors
*
* @param language the regex dialect we're validating
* @returns A list of errors
* @public
*/
validate(language) {
return this.regexp_cst.validate(language).map(utilities_1.CommonError.fromSemanticError);
}
/**
* Generate a regular expression string based on the parse result
*
* @remarks There is no guarantee toRegex will work unless validate returns no errors
*
* @param language the regex dialect we're generating
* @returns a regular expression string
* @public
*/
toRegex(language) {
return this.regexp_cst.toRegex(language);
}
/**
* Generate a RegExp object based on the parse result
*
* @remarks There is no guarantee toRegExp will work unless validate returns no errors
*
* @param language the regex dialect we're generating
* @returns a RegExp object
* @public
*/
toRegExp(language) {
return new RegExp(this.regexp_cst.toRegex(language));
}
}
exports.ParseResult = ParseResult;
/**
* The Parser class
*
* @remarks Only 1 parser instance allowed due to performance reasons
*/
class Human2RegexParser extends chevrotain_1.EmbeddedActionsParser {
constructor(options = new Human2RegexParserOptions()) {
super(T.AllTokens, { recoveryEnabled: false, maxLookahead: 2, skipValidations: options.skip_validations });
this.options = options;
if (Human2RegexParser.already_init) {
throw new Error("Only 1 instance of Human2RegexParser allowed");
}
Human2RegexParser.already_init = true;
const $ = this;
/**
* IN REGARDS TO KEEPING TOKENS:
* We don't really need to keep each token, only the first and last tokens
* This is due to the fact we calculate the difference between those tokens
* However, sometimes we have optional starts and ends
* Each optional near the start and end MUST be recorded because they may be the first/last token
* ex) "optional match 3..." the start token is "optional", but "match 3..."'s start token is "match"
* */
// number rules
let nss_rules = null;
const NumberSubStatement = $.RULE("NumberSubStatement", () => {
return $.OR(nss_rules || (nss_rules = [
{ ALT: () => new TokenAndValue($.CONSUME(T.Zero), 0) },
{ ALT: () => new TokenAndValue($.CONSUME(T.One), 1) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Two), 2) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Three), 3) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Four), 4) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Five), 5) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Six), 6) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Seven), 7) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Eight), 8) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Nine), 9) },
{ ALT: () => new TokenAndValue($.CONSUME(T.Ten), 10) },
{ ALT: () => {
const tok = $.CONSUME(T.NumberLiteral);
return new TokenAndValue(tok, parseInt(tok.image));
} }
]));
});
// 1, 1..2, between 1 and/to 2 inclusively/exclusively
const CountSubStatement = $.RULE("CountSubStatement", () => {
return $.OR([
// between 1 to 4
{ ALT: () => {
const tokens = [];
tokens.push($.CONSUME(T.Between));
const from = $.SUBRULE4(NumberSubStatement);
$.OR3([
{ ALT: () => $.CONSUME2(T.To) },
{ ALT: () => $.CONSUME(T.And) }
]);
const to = $.SUBRULE5(NumberSubStatement);
tokens.push(to.token);
$.OPTION4(() => tokens.push($.CONSUME3(T.Times)));
const opt = $.OPTION5(() => {
return $.OR4([
{ ALT: () => {
tokens.push($.CONSUME(T.Inclusive));
return "inclusive";
} },
{ ALT: () => {
tokens.push($.CONSUME(T.Exclusive));
return "exclusive";
} }
]);
});
return new generator_1.CountSubStatementCST(tokens, from.value, to.value, opt);
} },
// from 1 to 4
{ ALT: () => {
const tokens = [];
$.OPTION2(() => tokens.push($.CONSUME(T.From)));
const from = $.SUBRULE2(NumberSubStatement);
const to = $.OR2([
{ ALT: () => new TokenAndValue($.CONSUME(T.OrMore), [null, "+"]) },
{ ALT: () => {
$.CONSUME(T.To);
const val = $.SUBRULE3(NumberSubStatement);
let token = val.token;
const opt = $.OPTION7(() => {
return $.OR5([
{ ALT: () => {
token = $.CONSUME2(T.Inclusive);
return "inclusive";
} },
{ ALT: () => {
token = $.CONSUME2(T.Exclusive);
return "exclusive";
} }
]);
});
return new TokenAndValue(token, [val.value, opt]);
} }
]);
tokens.push(to.token);
$.OPTION3(() => tokens.push($.CONSUME2(T.Times)));
return new generator_1.CountSubStatementCST(tokens, from.value, to.value ? to.value[0] : null, to.value ? to.value[1] : null);
} },
// exactly 2
{ ALT: () => {
const tokens = [];
$.OPTION(() => tokens.push($.CONSUME(T.Exactly)));
const from = $.SUBRULE(NumberSubStatement);
tokens.push(from.token);
$.OPTION6(() => tokens.push($.CONSUME(T.Times)));
return new generator_1.CountSubStatementCST(tokens, from.value);
} }
]);
});
// match sub rules
let mss_rules = null;
const MatchSubStatement = $.RULE("MatchSubStatement", () => {
let count = null;
let invert = false;
const values = [];
let from = null;
let value = null;
let to = null;
let type = generator_1.MatchSubStatementType.Anything;
let tokens = [];
count = $.OPTION(() => {
const css = $.SUBRULE(CountSubStatement);
if (utilities_1.usefulConditional(css.tokens, "due to how chevrotain works, the first run produces a null value")) {
tokens.push(utilities_1.first(css.tokens));
}
return css;
});
invert = $.OPTION2(() => {
tokens.push($.CONSUME(T.Not));
return true;
});
$.AT_LEAST_ONE_SEP({
SEP: T.Or,
DEF: () => {
$.OPTION3(() => $.CONSUME(T.A));
const result = $.OR(mss_rules || (mss_rules = [
// range [a-z]
{ ALT: () => {
const token0 = $.OPTION4(() => $.CONSUME(T.From));
const token1 = $.CONSUME2(T.StringLiteral);
from = token1.image;
$.CONSUME(T.To);
const token2 = $.CONSUME3(T.StringLiteral);
to = token2.image;
type = generator_1.MatchSubStatementType.Between;
if (utilities_1.usefulConditional(token0, "Bug in type definition. Option should return <T|undefined>, but it doesn't")) {
return { tokens: [token0, token2], statement: new generator_1.MatchSubStatementValue(type, from, to) };
}
return { tokens: [token1, token2], statement: new generator_1.MatchSubStatementValue(type, from, to) };
} },
// range [a-z]
{ ALT: () => {
const token1 = $.CONSUME(T.Between);
from = $.CONSUME4(T.StringLiteral).image;
$.CONSUME(T.And);
const token2 = $.CONSUME5(T.StringLiteral);
to = token2.image;
type = generator_1.MatchSubStatementType.Between;
return { tokens: [token1, token2], statement: new generator_1.MatchSubStatementValue(type, from, to) };
} },
// exact string
{ ALT: () => {
const token = $.CONSUME(T.StringLiteral);
value = token.image;
type = generator_1.MatchSubStatementType.SingleString;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type, value) };
} },
//unicode
{ ALT: () => {
const token1 = $.CONSUME(T.Unicode);
const token2 = $.CONSUME6(T.StringLiteral);
value = token2.image;
type = generator_1.MatchSubStatementType.Unicode;
return { tokens: [token1, token2], statement: new generator_1.MatchSubStatementValue(type, value) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Anything);
type = generator_1.MatchSubStatementType.Anything;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Boundary);
type = generator_1.MatchSubStatementType.Boundary;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Word);
type = generator_1.MatchSubStatementType.Word;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Digit);
type = generator_1.MatchSubStatementType.Digit;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Character);
type = generator_1.MatchSubStatementType.Character;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Whitespace);
type = generator_1.MatchSubStatementType.Whitespace;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Number);
type = generator_1.MatchSubStatementType.Number;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Tab);
type = generator_1.MatchSubStatementType.Tab;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Linefeed);
type = generator_1.MatchSubStatementType.Linefeed;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.Newline);
type = generator_1.MatchSubStatementType.Newline;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
{ ALT: () => {
const token = $.CONSUME(T.CarriageReturn);
type = generator_1.MatchSubStatementType.CarriageReturn;
return { tokens: [token], statement: new generator_1.MatchSubStatementValue(type) };
} },
]));
tokens = tokens.concat(result.tokens);
values.push(result.statement);
}
});
return new generator_1.MatchSubStatementCST(tokens, count, invert, values);
});
// optionally match "+" then 1+ words
const MatchStatement = $.RULE("MatchStatement", () => {
let optional = false;
let completely_optional = false;
const msv = [];
const tokens = [];
$.OPTION(() => {
tokens.push($.CONSUME(T.Optional));
completely_optional = true;
});
tokens.push($.CONSUME(T.Match));
$.OPTION4(() => {
$.CONSUME3(T.Optional);
optional = true;
});
msv.push(new generator_1.MatchStatementValue(optional, $.SUBRULE(MatchSubStatement)));
$.MANY(() => {
$.OR([
{ ALT: () => {
$.OPTION2(() => $.CONSUME2(T.And));
$.CONSUME(T.Then);
} },
{ ALT: () => $.CONSUME(T.And) },
]);
optional = false;
$.OPTION3(() => {
$.CONSUME2(T.Optional);
optional = true;
});
msv.push(new generator_1.MatchStatementValue(optional, $.SUBRULE2(MatchSubStatement)));
});
tokens.push($.CONSUME(T.EndOfLine));
return new generator_1.MatchStatementCST(tokens, completely_optional, msv);
});
// using global matching
let us_rules = null;
const UsingStatement = $.RULE("UsingStatement", () => {
const usings = [];
const tokens = [$.CONSUME(T.Using)];
$.AT_LEAST_ONE_SEP({
SEP: T.And,
DEF: () => {
usings.push($.OR(us_rules || (us_rules = [
{ ALT: () => {
$.CONSUME(T.Multiline);
return generator_1.UsingFlags.Multiline;
} },
{ ALT: () => {
$.CONSUME(T.Global);
return generator_1.UsingFlags.Global;
} },
{ ALT: () => {
$.CONSUME(T.CaseInsensitive);
return generator_1.UsingFlags.Insensitive;
} },
{ ALT: () => {
$.CONSUME(T.CaseSensitive);
return generator_1.UsingFlags.Sensitive;
} },
{ ALT: () => {
$.CONSUME(T.Exact);
return generator_1.UsingFlags.Exact;
} }
])));
$.OPTION(() => $.CONSUME(T.Matching));
}
});
tokens.push($.CONSUME(T.EndOfLine));
return new TokensAndValue(tokens, usings);
});
// group rules
const GroupStatement = $.RULE("GroupStatement", () => {
const tokens = [];
let optional = false;
let name = null;
const statement = [];
// position of optional must be OR'd because
// otherwise it could appear twice
// ex) optional? create an optional? group
tokens.push($.OR([
{ ALT: () => {
optional = true;
const first_token = $.CONSUME(T.Optional);
$.CONSUME(T.Create);
$.CONSUME(T.A);
return first_token;
} },
{ ALT: () => {
const first_token = $.CONSUME2(T.Create);
$.CONSUME2(T.A);
$.OPTION2(() => {
$.CONSUME2(T.Optional);
optional = true;
});
return first_token;
} }
]));
$.CONSUME(T.Group);
$.OPTION(() => {
$.CONSUME(T.Called);
name = $.CONSUME(T.Identifier).image;
});
// Note: Technically not the end token,
// BUT this is way more useful than the Outdent for error reporting
tokens.push($.CONSUME2(T.EndOfLine));
$.CONSUME(T.Indent);
$.AT_LEAST_ONE(() => {
statement.push($.SUBRULE(Statement));
});
$.CONSUME(T.Outdent);
return new generator_1.GroupStatementCST(tokens, optional, name, statement);
});
// repeat rules
const RepeatStatement = $.RULE("RepeatStatement", () => {
const tokens = [];
let optional = false;
let count = null;
const statements = [];
$.OPTION3(() => {
tokens.push($.CONSUME(T.Optional));
optional = true;
});
tokens.push($.CONSUME(T.Repeat));
$.OPTION(() => count = $.SUBRULE(CountSubStatement));
$.CONSUME3(T.EndOfLine);
$.CONSUME(T.Indent);
$.AT_LEAST_ONE(() => {
statements.push($.SUBRULE(Statement));
});
tokens.push($.CONSUME(T.Outdent));
return new generator_1.RepeatStatementCST(tokens, optional, count, statements);
});
// statement super class
const Statement = $.RULE("Statement", () => {
return $.OR([
{ ALT: () => $.SUBRULE(MatchStatement) },
{ ALT: () => $.SUBRULE(GroupStatement) },
{ ALT: () => $.SUBRULE(RepeatStatement) }
]);
});
// full regex
const Regex = $.RULE("Regex", () => {
let tokens = [];
let usings = [];
const statements = [];
$.MANY(() => {
const using = $.SUBRULE(UsingStatement);
tokens = tokens.concat(using.tokens);
usings = usings.concat(using.value);
});
$.MANY2(() => statements.push($.SUBRULE(Statement)));
return new generator_1.RegularExpressionCST([], new generator_1.UsingStatementCST(tokens, usings), statements);
});
this.performSelfAnalysis();
this.regexp = Regex;
}
/**
* Parses the token stream
*
* @param tokens Tokens to parse
* @returns a parse result which contains the token stream and error list
* @public
*/
parse(tokens) {
this.input = tokens;
return new ParseResult(this.regexp(), this.errors.map(utilities_1.CommonError.fromParseError));
}
/**
* Sets the options for this parser
*
* @param options options for the parser
* @see Human2RegexParserOptions
* @public
*/
setOptions(options) {
utilities_1.unusedParameter(options, "skip_validations is not valid to change once we've already initialized");
}
}
exports.Human2RegexParser = Human2RegexParser;
Human2RegexParser.already_init = false;