1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-15 20:10:19 -07:00

Merge pull request #4 from pdemian/new-static-site-generator

New static site generator
This commit is contained in:
Patrick Demian 2021-01-17 05:24:33 -05:00 committed by GitHub
commit 32a641c8e4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
59 changed files with 3150 additions and 4467 deletions

10
API.md
View File

@ -132,17 +132,17 @@ export enum RegexDialect {
}
```
After choosing one, you must validate the regular expression. This may be skipped if and only if the input has already been validated before as the generator is not guaranteed to work unless there are no errors.
After choosing one, you should validate the regular expression. This may be skipped if and only if the input has already been validated before. For example, you may keep the `parse_result` around and generate from it multiple times, only validating the first time. The generator is not guaranteed to work unless there are no validation errors: it does no validation itself and, given invalid input, may either return garbage output or crash.
```typescript
const validation_errors = parse_result.validate();
const validation_errors = parse_result.validate(RegexDialect.JS);
```
The result is a list of errors which, again, is a `CommonError`. If there are no errors, you can call the `toRegex()` function to create a string representation of the regular expression. You can also call the `toRegExp()` function to create a `RegExp` expression used in Javascript
The result is a list of errors, each of which is a `CommonError`. If there are no errors, you can call the `toRegex()` function to create a string representation of the regular expression. You can also call the `toRegExp()` function to create a `RegExp` object for use in JavaScript.
```typescript
const my_regex_string = parse_result.toRegex(); // type is string
const my_regex = parse_result.toRegExp(); // type is RegExp
const my_regex_string = parse_result.toRegex(RegexDialect.JS); // type is string
const my_regex = parse_result.toRegExp(RegexDialect.JS); // type is RegExp
```
This will contain your regular expression.

View File

@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (c) 2020 Patrick Demian
Copyright (c) 2021 Patrick Demian
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@ -81,6 +81,7 @@ The API reference is available [here](API.md)
## Todo
- Add more regex options such as back references, subroutines, lookahead/behind, and more character classes (eg, `[:alpha:]`)
- Add more regex options such as subroutines~~, conditions, and lookahead/behind~~
- Fix error messages (They sometimes point to the wrong location, off by 1 errors, etc)
- Use a different/better static site generation method
- Add more useful lex/parse errors (What even is an EarlyExitException?)
- ~~Use a different/better static site generation method~~

15
config.json Normal file
View File

@ -0,0 +1,15 @@
{
"prod": true,
"dst": "./docs/",
"src": "./src/",
"compression_config": {
"html": {
"collapseWhitespace": true,
"minifyCSS": true,
"minifyJS": true,
"removeComments": true,
"removeEmptyAttributes": true,
"removeRedundantAttributes": true
}
}
}

View File

@ -1 +1 @@
<!DOCTYPE html><html lang="en" dir="ltr"><head><meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name="description" content="Not Found"><meta name="keywords" content="Human2Regex, Human, Regex, Natural, Language, Natural Language"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><title>Error 404 - Not Found</title><link href="bundle.min.css" rel="stylesheet" type="text/css"><meta name="theme-color" content="#212529"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="default"><link rel="icon" type="image/x-icon" href="favicon.ico"></head><body><a class="skip skip-top" href="#maincontent">Skip to main content</a><div class="wrapper"><nav class="navbar navbar-expand-lg navbar-light fixed-top" id="mainNav"><div class="container"><a class="navbar-brand" href="index.html"><img src="favicon-small.png" width="30" height="30" class="d-inline-block align-top" alt="logo">&nbsp;Human2Regex</a><div class="float-right heading-links"><a class="heading-link" href="index.html">Index</a> <span>&nbsp;|&nbsp;</span> <a class="heading-link" href="tutorial.html">Tutorial</a></div></div></nav><div class="container contained-container" id="maincontent" role="main"><div class="align_header"><div class="mx-auto"><div class="site-heading"><h1>404</h1><span class="subheading">Not Found</span></div></div></div><br><br><br><div class="row"><div class="col-12 mx-auto"><h3 class="align_header">The resource could not be found.</h3></div></div></div><footer><div class="container"><div class="row"><div class="col-lg-8 col-md-10 mx-auto"><p class="copyright">Copyright &copy; 2020 Patrick Demian. This page's source code is available at <a rel="noopener noreferrer" href="https://github.com/pdemian/human2regex">github.com/pdemian/human2regex</a></p></div></div></div></footer></div><script defer="defer" src="bundle.min.js"></script></body></html>
<!DOCTYPE html><html lang="en" dir="ltr"><head><meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name="description" content="Not Found"><meta name="keywords" content="Human2Regex, Human, Regex, Natural, Language, Natural Language"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><title>Error 404 - Not Found</title><link href="/bundle.min.css" rel="stylesheet" type="text/css"><meta name="theme-color" content="#212529"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="default"><link rel="icon" type="image/x-icon" href="/favicon.ico"></head><body><a class="skip skip-top" href="#maincontent">Skip to main content</a><div class="wrapper"><nav class="navbar navbar-expand-lg navbar-light fixed-top" id="mainNav"><div class="container"><a class="navbar-brand" href="/index.html"><img src="/favicon-small.png" width="30" height="30" class="d-inline-block align-top" alt="logo">&nbsp;Human2Regex</a><div class="float-right heading-links"><a class="heading-link" href="/index.html">Index</a> <span>&nbsp;|&nbsp;</span> <a class="heading-link" href="/tutorial.html">Tutorial</a></div></div></nav><div class="container contained-container" id="maincontent" role="main"><div class="align_header"><div class="mx-auto"><div class="site-heading"><h1>404</h1><span class="subheading">Not Found</span></div></div></div><br><br><br><div class="row"><div class="col-12 mx-auto"><h3 class="align_header">The resource could not be found.</h3></div></div></div><footer><div class="container"><div class="row"><div class="col-lg-8 col-md-10 mx-auto"><p class="copyright">Copyright &copy; 2021 Patrick Demian. This page's source code is available at <a rel="noopener noreferrer" href="https://github.com/pdemian/human2regex">github.com/pdemian/human2regex</a></p></div></div></div></footer></div><script defer="defer" src="/bundle.min.js"></script></body></html>

2
docs/bundle.min.css vendored

File diff suppressed because one or more lines are too long

15
docs/bundle.min.js vendored

File diff suppressed because one or more lines are too long

View File

@ -1,5 +1,4 @@
<!DOCTYPE html><html lang="en" dir="ltr"><head><meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name="description" content="Create regular expressions with natural, human language"><meta name="keywords" content="Human2Regex, Human, Regex, Natural, Language, Natural Language"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><title>Human2Regex</title><link href="bundle.min.css" rel="stylesheet" type="text/css"><meta name="theme-color" content="#212529"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="default"><link rel="icon" type="image/x-icon" href="favicon.ico"></head><body><a class="skip skip-top" href="#maincontent">Skip to main content</a><div class="wrapper"><nav class="navbar navbar-expand-lg navbar-light fixed-top" id="mainNav"><div class="container"><a class="navbar-brand" href="index.html"><img src="favicon-small.png" width="30" height="30" class="d-inline-block align-top" alt="logo">&nbsp;Human2Regex</a><div class="float-right heading-links"><a class="heading-link" href="index.html">Index</a> <span>&nbsp;|&nbsp;</span> <a class="heading-link" href="tutorial.html">Tutorial</a></div></div></nav><div class="container" id="maincontent" role="main"><div class="row"><div class="col-lg-8 tenpx-margin-bottom"><div class="form-group row zero-margin-bottom"><label for="dialect" class="col-sm-4 col-form-label">Regex dialect:</label><div class="col-sm-8"><select class="form-control" id="dialect"><option value="js" selected="selected">Javascript</option><option value="dotnet">.NET</option><option value="python">Python</option><option value="boost">C++ Boost</option><option value="java">Java 7+</option><option value="pcre">PCRE</option></select></div></div><h4>Your Regular Expression:</h4><div class="row"><div class="col-xl-11 tenpx-margin-bottom"><input readonly="readonly" class="form-control" id="regex"></div><div class="col-xl-1"><button 
type="button" class="btn btn-secondary float-right" id="clip">Copy</button></div></div><h4>Human Speak:</h4><textarea class="form-control" id="human" rows="25">
/* Make a regex that matches (basic) URLs */
<!DOCTYPE html><html lang="en" dir="ltr"><head><meta name="viewport" content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name="description" content="Create regular expressions with natural, human language"><meta name="keywords" content="Human2Regex, Human, Regex, Natural, Language, Natural Language"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"><title>Human2Regex</title><link href="/bundle.min.css" rel="stylesheet" type="text/css"><meta name="theme-color" content="#212529"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="default"><link rel="icon" type="image/x-icon" href="/favicon.ico"></head><body><a class="skip skip-top" href="#maincontent">Skip to main content</a><div class="wrapper"><nav class="navbar navbar-expand-lg navbar-light fixed-top" id="mainNav"><div class="container"><a class="navbar-brand" href="/index.html"><img src="/favicon-small.png" width="30" height="30" class="d-inline-block align-top" alt="logo">&nbsp;Human2Regex</a><div class="float-right heading-links"><a class="heading-link" href="/index.html">Index</a> <span>&nbsp;|&nbsp;</span> <a class="heading-link" href="/tutorial.html">Tutorial</a></div></div></nav><div class="container" id="maincontent" role="main"><div class="row"><div class="col-lg-8 tenpx-margin-bottom"><div class="form-group row zero-margin-bottom"><label for="dialect" class="col-sm-4 col-form-label">Regex dialect:</label><div class="col-sm-8"><select class="form-control" id="dialect"><option value="js" selected="selected">Javascript</option><option value="dotnet">.NET</option><option value="python">Python</option><option value="boost">C++ Boost</option><option value="java">Java 7+</option><option value="pcre">PCRE</option></select></div></div><h4>Your Regular Expression:</h4><div class="row"><div class="col-xl-11 tenpx-margin-bottom"><input readonly="readonly" class="form-control" id="regex"></div><div 
class="col-xl-1"><button type="button" class="btn btn-secondary float-right" id="clip">Copy</button></div></div><h4>Human Speak:</h4><textarea class="form-control" id="human" rows="25">/* Make a regex that matches (basic) URLs */
using global and exact matching
create an optional group called protocol
@ -30,5 +29,4 @@ create an optional group
create an optional group
# fragment, again, we don't care, so ignore everything afterwards
match "#"
match 0+ any thing
</textarea><h4>Errors:</h4><textarea readonly="readonly" class="form-control" id="errors" rows="5"></textarea></div><br><div class="col-lg-4 tenpx-margin-bottom"><div class="cheatsheet"><h2>Cheat Sheet:</h2><p>Full documentation available <a href="tutorial.html">here</a></p><p class="font-weight-bold">Matching</p><p><code class="cm-s-idea">match "hello world"</code> matches "hello world" exactly</p><p></p><p><code class="cm-s-idea">match "hello" then optionally " world"</code> matches "hello" or "hello world"</p><p><code class="cm-s-idea">match "hello" or "world"</code> matches "hello" or "world</p><p><code class="cm-s-idea">match a word</code> matches any word</p><p class="font-weight-bold">Repetition</p><p><code class="cm-s-idea">match 0+ "hello"</code> matches 0 or more "hello"s</p><p><code class="cm-s-idea">match 3 "hello"</code> matches exactly "hellohellohello"</p><p><code class="cm-s-idea">match 1 to 5 "hello"</code> matches between 1 to 5 "hello"s</p><p><code class="cm-s-idea">repeat 0 or more</code> repeats the intended text 0 or more times (default)</p><p><code class="cm-s-idea">optionally repeat between 3 to 5</code> optionally repeats the indented text 3 to 5 times</p><p class="font-weight-bold">Grouping</p><p><code class="cm-s-idea">create a group called mygroup</code> creates a group called "mygroup"</p><p><code class="cm-s-idea">create an optional group</code> creates an unnamed optional group</p><p class="font-weight-bold">Using</p><p><code class="cm-s-idea">using global and case insensitive</code> uses the 'g' and 'i' flags</p><p class="font-weight-bold">Misc</p><p><code class="cm-s-idea">// comment</code> is a single line comment</p><p><code class="cm-s-idea">/* comment */</code> is a multi line comment</p></div></div></div></div><footer><div class="container"><div class="row"><div class="col-lg-8 col-md-10 mx-auto"><p class="copyright">Copyright &copy; 2020 Patrick Demian. 
This page's source code is available at <a rel="noopener noreferrer" href="https://github.com/pdemian/human2regex">github.com/pdemian/human2regex</a></p></div></div></div></footer></div><script defer="defer" src="bundle.min.js"></script></body></html>
match 0+ any thing</textarea><h4>Errors:</h4><textarea readonly="readonly" class="form-control" id="errors" rows="5"></textarea></div><br><div class="col-lg-4 tenpx-margin-bottom"><div class="cheatsheet"><h2>Cheat Sheet:</h2><p>Full documentation available <a href="/tutorial.html">here</a></p><p class="font-weight-bold">Matching</p><p><code class="cm-s-idea">match "hello world"</code> matches "hello world" exactly</p><p></p><p><code class="cm-s-idea">match "hello" then optionally " world"</code> matches "hello" or "hello world"</p><p><code class="cm-s-idea">match "hello" or "world"</code> matches "hello" or "world"</p><p><code class="cm-s-idea">match a word</code> matches any word</p><p class="font-weight-bold">Repetition</p><p><code class="cm-s-idea">match 0+ "hello"</code> matches 0 or more "hello"s</p><p><code class="cm-s-idea">match 3 "hello"</code> matches exactly "hellohellohello"</p><p><code class="cm-s-idea">match 1 to 5 "hello"</code> matches between 1 to 5 "hello"s</p><p><code class="cm-s-idea">repeat 0 or more</code> repeats the indented text 0 or more times (default)</p><p><code class="cm-s-idea">optionally repeat between 3 to 5</code> optionally repeats the indented text 3 to 5 times</p><p class="font-weight-bold">Grouping</p><p><code class="cm-s-idea">create a group called mygroup</code> creates a group called "mygroup"</p><p><code class="cm-s-idea">create an optional group</code> creates an unnamed optional group</p><p class="font-weight-bold">Using</p><p><code class="cm-s-idea">using global and case insensitive</code> uses the 'g' and 'i' flags</p><p class="font-weight-bold">Misc</p><p><code class="cm-s-idea">// comment</code> is a single line comment</p><p><code class="cm-s-idea">/* comment */</code> is a multi line comment</p></div></div></div></div><footer><div class="container"><div class="row"><div class="col-lg-8 col-md-10 mx-auto"><p class="copyright">Copyright &copy; 2021 Patrick Demian. 
This page's source code is available at <a rel="noopener noreferrer" href="https://github.com/pdemian/human2regex">github.com/pdemian/human2regex</a></p></div></div></div></footer></div><script defer="defer" src="/bundle.min.js"></script></body></html>

File diff suppressed because one or more lines are too long

142
lib/generator.d.ts vendored
View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import { IToken } from "chevrotain";
/**
* List of regular expression dialects we support
@ -21,29 +21,45 @@ export interface ISemanticError {
message: string;
}
/**
* The base concrete syntax tree class
* Context for validation
*
* @remarks Currently only used to validate groups
* @internal
*/
export declare abstract class H2RCST {
tokens: IToken[];
export declare class GeneratorContext {
groups: {
[key: string]: {
startLine: number;
startColumn: number;
length: number;
};
};
/**
* Constructor for H2RCST
* Checks to see if we already have a group defined
*
* @param tokens Tokens used to calculate where an error occured
* @internal
* @param identifier the group name
* @returns true if the group name already exists
*/
constructor(tokens: IToken[]);
hasGroup(identifier: string): boolean;
/**
* Adds the identifier to the group list
*
* @param identifier the group name
* @param tokens the tokens that make up the group, used to record where it was defined
*/
addGroup(identifier: string, tokens: IToken[]): void;
}
interface Generates {
/**
* Validate that this is both valid and can be generated in the specified language
*
* @remarks There is no guarantee toRegex will work unless validate returns no errors
*
* @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors
* @public
*/
abstract validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/**
* Generate a regular expression fragment based on this syntax tree
*
@ -53,6 +69,23 @@ export declare abstract class H2RCST {
* @returns a regular expression fragment
* @public
*/
toRegex(language: RegexDialect): string;
}
/**
* The base concrete syntax tree class
*
* @internal
*/
export declare abstract class H2RCST implements Generates {
tokens: IToken[];
/**
* Constructor for H2RCST
*
* @param tokens Tokens used to calculate where an error occurred
* @internal
*/
constructor(tokens: IToken[]);
abstract validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
abstract toRegex(language: RegexDialect): string;
/**
* Creates an ISemanticError with a given message and the tokens provided from the constructor
@ -126,7 +159,7 @@ export declare class MatchSubStatementValue {
*
* @internal
*/
export declare class MatchStatementValue {
export declare class MatchStatementValue implements Generates {
optional: boolean;
statement: MatchSubStatementCST;
/**
@ -137,6 +170,8 @@ export declare class MatchStatementValue {
* @internal
*/
constructor(optional: boolean, statement: MatchSubStatementCST);
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
* The base class for all statement concrete syntax trees
@ -163,7 +198,7 @@ export declare class MatchSubStatementCST extends H2RCST {
* @param values sub statements to match
*/
constructor(tokens: IToken[], count: CountSubStatementCST | null, invert: boolean, values: MatchSubStatementValue[]);
validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
@ -180,7 +215,7 @@ export declare class UsingStatementCST extends H2RCST {
* @param flags using flags
*/
constructor(tokens: IToken[], flags: UsingFlags[]);
validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
@ -201,7 +236,7 @@ export declare class CountSubStatementCST extends H2RCST {
* @param opt option modifier
*/
constructor(tokens: IToken[], from: number, to?: number | null, opt?: "inclusive" | "exclusive" | "+" | null);
validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
@ -216,10 +251,10 @@ export declare class MatchStatementCST extends StatementCST {
* Constructor for MatchStatementCST
*
* @param tokens Tokens used to calculate where an error occurred
* @param matches
* @param matches the list of matches
*/
constructor(tokens: IToken[], completely_optional: boolean, matches: MatchStatementValue[]);
validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
@ -240,7 +275,7 @@ export declare class RepeatStatementCST extends StatementCST {
* @param statements the statements to repeat
*/
constructor(tokens: IToken[], optional: boolean, count: CountSubStatementCST | null, statements: StatementCST[]);
validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
@ -262,7 +297,70 @@ export declare class GroupStatementCST extends StatementCST {
* @internal
*/
constructor(tokens: IToken[], optional: boolean, name: string | null, statements: StatementCST[]);
validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
* Concrete Syntax Tree for a Backreference statement
*
* @internal
*/
export declare class BackrefStatementCST extends StatementCST {
private optional;
private count;
private name;
/**
* Constructor for BackrefStatementCST
*
* @param tokens Tokens used to calculate where an error occurred
* @param optional is this backref optional
* @param count optional number of times to repeat
* @param name the group name to call
*/
constructor(tokens: IToken[], optional: boolean, count: CountSubStatementCST | null, name: string);
/**
* Validate this backreference statement for the specified language
*
* @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors
*/
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/**
* Generate a regular expression fragment for this backreference statement
*
* @param language the regex dialect to generate
* @returns a regular expression fragment
*/
toRegex(language: RegexDialect): string;
}
/**
* Concrete Syntax Tree for an If Pattern statement
*
* @internal
*/
export declare class IfPatternStatementCST extends StatementCST {
private matches;
private true_statements;
private false_statements;
/**
* Constructor for IfPatternStatementCST
*
* @param tokens Tokens used to calculate where an error occurred
* @param matches list of matches to test against
* @param true_statements true path
* @param false_statements false path
*/
constructor(tokens: IToken[], matches: MatchStatementValue[], true_statements: StatementCST[], false_statements: StatementCST[]);
/**
* Validate this if-pattern statement for the specified language
*
* @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors
*/
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/**
* Generate a regular expression fragment for this if-pattern statement
*
* @param language the regex dialect to generate
* @returns a regular expression fragment
*/
toRegex(language: RegexDialect): string;
}
/**
* Concrete Syntax Tree for an If group Ident statement
*
* @internal
*/
export declare class IfIdentStatementCST extends StatementCST {
private identifier;
private true_statements;
private false_statements;
/**
* Constructor for IfIdentStatementCST
*
* @param tokens Tokens used to calculate where an error occurred
* @param identifier the group identifier to check
* @param true_statements true path
* @param false_statements false path
*/
constructor(tokens: IToken[], identifier: string, true_statements: StatementCST[], false_statements: StatementCST[]);
/**
* Validate this if-identifier statement for the specified language
*
* @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors
*/
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/**
* Generate a regular expression fragment for this if-identifier statement
*
* @param language the regex dialect to generate
* @returns a regular expression fragment
*/
toRegex(language: RegexDialect): string;
}
/**
@ -282,13 +380,7 @@ export declare class RegularExpressionCST extends H2RCST {
* @internal
*/
constructor(tokens: IToken[], usings: UsingStatementCST, statements: StatementCST[]);
validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
toRegex(language: RegexDialect): string;
}
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @internal
*/
export declare function minimizeMatchString(arr: string[]): string;
export {};

View File

@ -1,12 +1,13 @@
"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true });
exports.minimizeMatchString = exports.RegularExpressionCST = exports.GroupStatementCST = exports.RepeatStatementCST = exports.MatchStatementCST = exports.CountSubStatementCST = exports.UsingStatementCST = exports.MatchSubStatementCST = exports.StatementCST = exports.MatchStatementValue = exports.MatchSubStatementValue = exports.MatchSubStatementType = exports.UsingFlags = exports.H2RCST = exports.RegexDialect = void 0;
exports.RegularExpressionCST = exports.IfIdentStatementCST = exports.IfPatternStatementCST = exports.BackrefStatementCST = exports.GroupStatementCST = exports.RepeatStatementCST = exports.MatchStatementCST = exports.CountSubStatementCST = exports.UsingStatementCST = exports.MatchSubStatementCST = exports.StatementCST = exports.MatchStatementValue = exports.MatchSubStatementValue = exports.MatchSubStatementType = exports.UsingFlags = exports.H2RCST = exports.GeneratorContext = exports.RegexDialect = void 0;
/**
* Includes all Concrete Syntax Trees for Human2Regex
* @packageDocumentation
*/
const utilities_1 = require("./utilities");
const generator_helper_1 = require("./generator_helper");
/**
* List of regular expression dialects we support
*/
@ -49,6 +50,42 @@ const unicode_script_codes = [
"Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Telugu", "Thaana", "Thai",
"Tibetan", "Tifinagh", "Ugaritic", "Vai", "Yi"
];
/**
* Context for validation
*
* @remarks Currently only used to validate groups
* @internal
*/
class GeneratorContext {
    constructor() {
        // group name -> where that group was defined ({ startLine, startColumn, length })
        this.groups = {};
    }
    /**
     * Tells whether a group with the given name has been registered
     *
     * @param identifier the group name
     * @returns true if the group name already exists
     */
    hasGroup(identifier) {
        // borrow hasOwnProperty so inherited keys (e.g. "toString") never count as groups
        const ownsKey = Object.prototype.hasOwnProperty;
        return ownsKey.call(this.groups, identifier);
    }
    /**
     * Registers a group name together with the source span of its tokens
     *
     * @param identifier the group name
     * @param tokens the tokens that make up the group; the first and last are used to record its position
     */
    addGroup(identifier, tokens) {
        // falls back only when the value is null or undefined
        const orElse = (value, fallback) => (value !== null && value !== void 0 ? value : fallback);
        const head = utilities_1.first(tokens);
        const tail = utilities_1.last(tokens);
        const startLine = orElse(head.startLine, NaN);
        const startColumn = orElse(head.startColumn, NaN);
        // prefer the end offset of the last token, otherwise its start offset
        const tailEnd = tail.endOffset;
        const spanEnd = tailEnd !== null && tailEnd !== void 0 ? tailEnd : tail.startOffset;
        const length = spanEnd - head.startOffset;
        this.groups[identifier] = { startLine, startColumn, length };
    }
}
exports.GeneratorContext = GeneratorContext;
/**
* The base concrete syntax tree class
*
@ -166,6 +203,17 @@ class MatchStatementValue {
this.statement = statement;
/* empty */
}
validate(language, context) {
return this.statement.validate(language, context);
}
toRegex(language) {
let match_stmt = this.statement.toRegex(language);
// need to group if optional and ungrouped
if (this.optional) {
match_stmt = generator_helper_1.groupIfRequired(match_stmt) + "?";
}
return match_stmt;
}
}
exports.MatchStatementValue = MatchStatementValue;
/**
@ -196,10 +244,10 @@ class MatchSubStatementCST extends H2RCST {
this.invert = invert;
this.values = values;
}
validate(language) {
validate(language, context) {
const errors = [];
if (this.count) {
utilities_1.append(errors, this.count.validate(language));
utilities_1.append(errors, this.count.validate(language, context));
}
for (const value of this.values) {
if (value.type === MatchSubStatementType.Between) {
@ -311,50 +359,15 @@ class MatchSubStatementCST extends H2RCST {
break;
}
}
let ret = "";
let require_grouping = false;
let dont_clobber_plus = false;
if (matches.length === 1) {
ret = utilities_1.first(matches);
if (ret.endsWith("+")) {
dont_clobber_plus = true;
}
}
else {
ret = minimizeMatchString(matches);
if (ret.length > 1 &&
(!ret.startsWith("(") || !ret.endsWith("["))) {
require_grouping = true;
}
}
let ret = generator_helper_1.minimizeMatchString(matches);
if (this.count) {
if (dont_clobber_plus) {
const clobber = this.count.toRegex(language);
// + can be ignored as well as a count as long as that count is > 0
switch (clobber) {
case "*":
case "?":
ret = "(?:" + ret + ")" + clobber;
break;
case "+":
// ignore
break;
default:
if (clobber.startsWith("{0")) {
ret = "(?:" + ret + ")" + clobber;
}
else {
// remove + and replace with count
ret.substring(0, ret.length - 1) + clobber;
}
break;
}
if (matches.length === 1) {
// we don't group if there's only 1 element
// but we need to make sure we don't add an additional + or *
ret = generator_helper_1.dontClobberRepetition(ret, this.count.toRegex(language));
}
else {
if (require_grouping) {
ret = "(?:" + ret + ")";
}
ret += this.count.toRegex(language);
ret = generator_helper_1.groupIfRequired(ret) + this.count.toRegex(language);
}
}
return ret;
@ -377,8 +390,9 @@ class UsingStatementCST extends H2RCST {
super(tokens);
this.flags = flags;
}
validate(language) {
utilities_1.unusedParameter(language, "Using Statement does not change based on language");
validate(language, context) {
utilities_1.unusedParameter(language, "Count does not need checking");
utilities_1.unusedParameter(context, "Context is not needed");
const errors = [];
let flag = this.flags[0];
for (let i = 1; i < this.flags.length; i++) {
@ -434,13 +448,11 @@ class CountSubStatementCST extends H2RCST {
this.to = to;
this.opt = opt;
}
validate(language) {
validate(language, context) {
utilities_1.unusedParameter(language, "Count does not need checking");
utilities_1.unusedParameter(context, "Context is not needed");
const errors = [];
if (this.from < 0) {
errors.push(this.error("Value cannot be negative"));
}
else if (this.to !== null && ((this.opt === "exclusive" && (this.to - 1) <= this.from) || this.to <= this.from)) {
if (this.to !== null && ((this.opt === "exclusive" && (this.to - 1) <= this.from) || this.to <= this.from)) {
errors.push(this.error("Values must be in range of eachother"));
}
return errors;
@ -483,43 +495,24 @@ class MatchStatementCST extends StatementCST {
* Constructor for MatchStatementCST
*
* @param tokens Tokens used to calculate where an error occurred
* @param matches
* @param matches the list of matches
*/
constructor(tokens, completely_optional, matches) {
super(tokens);
this.completely_optional = completely_optional;
this.matches = matches;
}
validate(language) {
validate(language, context) {
const errors = [];
for (const match of this.matches) {
utilities_1.append(errors, match.statement.validate(language));
utilities_1.append(errors, match.statement.validate(language, context));
}
return errors;
}
toRegex(language) {
let final_matches = this.matches.map((x) => {
let match_stmt = x.statement.toRegex(language);
// need to group if optional and ungrouped
if (x.optional) {
if (!utilities_1.isSingleRegexCharacter(match_stmt)) {
// don't re-group a group
if (match_stmt[0] !== "(" && match_stmt[match_stmt.length - 1] !== ")") {
match_stmt = "(?:" + match_stmt + ")";
}
}
match_stmt += "?";
}
return match_stmt;
}).join("");
let final_matches = this.matches.map((x) => x.toRegex(language)).join("");
if (this.completely_optional) {
if (!utilities_1.isSingleRegexCharacter(final_matches)) {
// don't re-group a group
if (final_matches[0] !== "(" && final_matches[final_matches.length - 1] !== ")") {
final_matches = "(?:" + final_matches + ")";
}
}
final_matches += "?";
final_matches = generator_helper_1.groupIfRequired(final_matches) + "?";
}
return final_matches;
}
@ -545,18 +538,18 @@ class RepeatStatementCST extends StatementCST {
this.count = count;
this.statements = statements;
}
validate(language) {
validate(language, context) {
const errors = [];
if (this.count !== null) {
utilities_1.append(errors, this.count.validate(language));
utilities_1.append(errors, this.count.validate(language, context));
}
for (const statement of this.statements) {
utilities_1.append(errors, statement.validate(language));
utilities_1.append(errors, statement.validate(language, context));
}
return errors;
}
toRegex(language) {
let str = "(?:" + this.statements.map((x) => x.toRegex(language)).join("") + ")";
let str = generator_helper_1.groupIfRequired(this.statements.map((x) => x.toRegex(language)).join(""));
if (this.count) {
str += this.count.toRegex(language);
// group for optionality because count would be incorrect otherwise
@ -595,14 +588,19 @@ class GroupStatementCST extends StatementCST {
this.name = name;
this.statements = statements;
}
validate(language) {
validate(language, context) {
const errors = [];
// All languages currently support named groups
//if (false) {
// errors.push(this.error("This language does not support named groups"));
//}
// Named groups must be unique: report a redefinition, otherwise remember this group in the context
if (this.name !== null) {
    if (context.hasGroup(this.name)) {
        const past_group = context.groups[this.name];
        // Location is reported as line:column-line:column. The column slots previously repeated startLine.
        // NOTE(review): assumes the record stored by context.addGroup carries startColumn next to
        // startLine and length - confirm against GeneratorContext.
        errors.push(this.error(`Group with name "${this.name}" was already defined here: ${past_group.startLine}:${past_group.startColumn}-${past_group.startLine}:${past_group.startColumn + past_group.length}`));
    }
    else {
        context.addGroup(this.name, this.tokens);
    }
}
for (const statement of this.statements) {
utilities_1.append(errors, statement.validate(language));
utilities_1.append(errors, statement.validate(language, context));
}
return errors;
}
@ -623,6 +621,169 @@ class GroupStatementCST extends StatementCST {
}
}
exports.GroupStatementCST = GroupStatementCST;
/**
 * Concrete Syntax Tree for a Backreference statement
 *
 * Refers back to a previously defined named group, optionally repeated
 * and optionally made optional as a whole.
 *
 * @internal
 */
class BackrefStatementCST extends StatementCST {
    /**
     * Constructor for BackrefStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param optional is this backref optional
     * @param count optional number of times to repeat (null when absent)
     * @param name the group name to call
     */
    constructor(tokens, optional, count, name) {
        super(tokens);
        this.optional = optional;
        this.count = count;
        this.name = name;
    }
    /**
     * Checks that the referenced group is already registered in the context
     * and validates the repetition count when one is present.
     *
     * @param language the regex dialect being validated against
     * @param context the validation context holding the groups seen so far
     * @returns the list of semantic errors found (empty when valid)
     */
    validate(language, context) {
        const errors = [];
        if (!context.hasGroup(this.name)) {
            errors.push(this.error(`Cannot call group with name "${this.name}" as it was never previously defined`));
        }
        if (this.count !== null) {
            utilities_1.append(errors, this.count.validate(language, context));
        }
        return errors;
    }
    /**
     * Renders the reference using the named-reference syntax of the dialect.
     *
     * @param language the regex dialect to generate for
     * @returns the regex fragment for this statement
     */
    toRegex(language) {
        let str = "";
        switch (language) {
            case RegexDialect.Python:
                str = `(?P=${this.name})`;
                break;
            case RegexDialect.DotNet:
            case RegexDialect.Java:
                str = `\\k<${this.name}>`;
                break;
            default:
                // NOTE(review): this branch also serves RegexDialect.JS; ECMAScript spells a named
                // backreference \k<name> - confirm that \g<name> is really intended there.
                str = `\\g<${this.name}>`;
                break;
        }
        if (this.count) {
            str += this.count.toRegex(language);
            // group for optionality because count would be incorrect otherwise
            if (this.optional) {
                str = "(?:" + str + ")?";
            }
        }
        else if (this.optional) {
            // append the marker; assigning "?" here used to throw the reference itself away
            str += "?";
        }
        return str;
    }
}
exports.BackrefStatementCST = BackrefStatementCST;
/**
 * Concrete Syntax Tree node for a conditional whose test is a match pattern
 *
 * @internal
 */
class IfPatternStatementCST extends StatementCST {
    /**
     * Builds the node.
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param matches the match values forming the condition
     * @param true_statements statements emitted when the condition holds
     * @param false_statements statements emitted otherwise (may be empty)
     */
    constructor(tokens, matches, true_statements, false_statements) {
        super(tokens);
        Object.assign(this, { matches, true_statements, false_statements });
    }
    /**
     * Reports dialects that reject this construct, then validates the condition and both branches.
     *
     * @param language the regex dialect being validated against
     * @param context the shared validation context
     * @returns every semantic error found, in source order
     */
    validate(language, context) {
        const errors = [];
        const lacks_conditionals = language === RegexDialect.Java || language === RegexDialect.JS;
        if (lacks_conditionals) {
            errors.push(this.error("This language does not support conditionals"));
        }
        const lacks_pattern_conditionals = language === RegexDialect.Python;
        if (lacks_pattern_conditionals) {
            errors.push(this.error("This language does not support pattern conditionals"));
        }
        // condition first, then the true branch, then the false branch
        const children = [...this.matches, ...this.true_statements, ...this.false_statements];
        children.forEach((child) => {
            utilities_1.append(errors, child.validate(language, context));
        });
        return errors;
    }
    /**
     * Renders `(?(condition)yes)` or `(?(condition)yes|no)`.
     *
     * @param language the regex dialect to generate for
     * @returns the conditional as a regex fragment
     */
    toRegex(language) {
        const render = (nodes) => nodes.map((node) => node.toRegex(language)).join("");
        const condition = render(this.matches);
        const when_true = generator_helper_1.groupIfRequired(render(this.true_statements));
        if (this.false_statements.length === 0) {
            return `(?(${condition})${when_true})`;
        }
        const when_false = generator_helper_1.groupIfRequired(render(this.false_statements));
        return `(?(${condition})${when_true}|${when_false})`;
    }
}
exports.IfPatternStatementCST = IfPatternStatementCST;
/**
 * Concrete Syntax Tree node for a conditional that tests a named group
 *
 * @internal
 */
class IfIdentStatementCST extends StatementCST {
    /**
     * Builds the node.
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param identifier name of the group the condition checks
     * @param true_statements statements emitted when the condition holds
     * @param false_statements statements emitted otherwise (may be empty)
     */
    constructor(tokens, identifier, true_statements, false_statements) {
        super(tokens);
        Object.assign(this, { identifier, true_statements, false_statements });
    }
    /**
     * Reports unsupported dialects and unknown group names, then validates both branches.
     *
     * @param language the regex dialect being validated against
     * @param context the shared validation context
     * @returns every semantic error found, in source order
     */
    validate(language, context) {
        const errors = [];
        const lacks_conditionals = language === RegexDialect.Java || language === RegexDialect.JS;
        if (lacks_conditionals) {
            errors.push(this.error("This language does not support conditionals"));
        }
        const group_known = context.hasGroup(this.identifier);
        if (!group_known) {
            errors.push(this.error(`Group with name "${this.identifier}" does not exist`));
        }
        // true branch first, then the false branch
        const branches = [...this.true_statements, ...this.false_statements];
        branches.forEach((statement) => {
            utilities_1.append(errors, statement.validate(language, context));
        });
        return errors;
    }
    /**
     * Renders `(?(name)yes)` or `(?(name)yes|no)`.
     *
     * @param language the regex dialect to generate for
     * @returns the conditional as a regex fragment
     */
    toRegex(language) {
        // be more clear with languages that support it
        const condition = language === RegexDialect.Boost
            ? "<" + this.identifier + ">"
            : this.identifier;
        const render = (nodes) => nodes.map((node) => node.toRegex(language)).join("");
        const when_true = generator_helper_1.groupIfRequired(render(this.true_statements));
        if (this.false_statements.length === 0) {
            return `(?(${condition})${when_true})`;
        }
        const when_false = generator_helper_1.groupIfRequired(render(this.false_statements));
        return `(?(${condition})${when_true}|${when_false})`;
    }
}
exports.IfIdentStatementCST = IfIdentStatementCST;
/**
* Concrete Syntax Tree for a regular expression
*
@ -642,10 +803,10 @@ class RegularExpressionCST extends H2RCST {
this.usings = usings;
this.statements = statements;
}
validate(language) {
const errors = this.usings.validate(language);
validate(language, context) {
const errors = this.usings.validate(language, context);
for (const statement of this.statements) {
utilities_1.append(errors, statement.validate(language));
utilities_1.append(errors, statement.validate(language, context));
}
return errors;
}
@ -656,87 +817,3 @@ class RegularExpressionCST extends H2RCST {
}
}
exports.RegularExpressionCST = RegularExpressionCST;
/**
* Entry point for collapsing a list of alternative match strings into one regex fragment.
* Delegates straight to minMatchString with a starting depth of 0.
*
* @param arr the array of matches
* @returns the combined fragment produced by minMatchString
* @internal
*/
function minimizeMatchString(arr) {
// NOTE(review): a one-element array is not special-cased here, so it reaches the
// "single element means optional" base case of minMatchString and comes back with "?" appended
return minMatchString(arr, 0);
}
exports.minimizeMatchString = minimizeMatchString;
/**
* Recursive worker behind minimizeMatchString: factors the longest shared prefix and
* shared suffix out of the entries and recurses on what is left between them.
*
* @param arr the array
* @param depth must be 0 for initial call; above 0 an alternation is wrapped in a non-capturing group
* @returns the combined regex fragment
* @internal
*/
function minMatchString(arr, depth = 0) {
// base case: arr is empty
if (arr.length === 0) {
return "";
}
// base case: arr has 1 element (must have at least 2, so this means this value is optional)
// NOTE(review): "?" is appended without grouping, so for a multi-character entry
// only its last character becomes optional
if (arr.length === 1) {
return utilities_1.first(arr) + "?";
}
// remove duplicates
arr = [...new Set(arr)];
// base case: arr has 1 element (after duplicate removal means this is required)
if (arr.length === 1) {
return utilities_1.first(arr);
}
// base case: arr is all single letters, which collapse into one character class
if (arr.every(utilities_1.isSingleRegexCharacter)) {
return "[" + arr.join("") + "]";
}
// Start from the first entry as both candidate prefix and candidate suffix,
// then shrink each candidate against every other entry
let longest_begin_substring = utilities_1.first(arr);
let longest_end_substring = utilities_1.first(arr);
for (let i = 1; i < arr.length; i++) {
// cut the prefix candidate at the first position where arr[i] differs
for (let j = 0; j < longest_begin_substring.length; j++) {
if (arr[i].length < j || longest_begin_substring[j] !== arr[i][j]) {
longest_begin_substring = longest_begin_substring.substr(0, j);
break;
}
}
// same comparison walking backwards from the end; keeps the last j characters
for (let j = 0; j < longest_end_substring.length; j++) {
if (arr[i].length - j < 0 || longest_end_substring[longest_end_substring.length - j - 1] !== arr[i][arr[i].length - j - 1]) {
longest_end_substring = longest_end_substring.substr(longest_end_substring.length - j, longest_end_substring.length);
break;
}
}
// nothing shared any more, no point in looking at the remaining entries
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
break;
}
}
// No matches whatsoever
// *technically* we can optimize further, but that is a VERY non-trivial problem
// For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
if (depth > 0) {
return "(?:" + arr.join("|") + ")";
}
else {
return arr.join("|");
}
}
// we have some matches
else {
// remove begin (if exists) and end (if exists) from each element and remove empty strings
const begin_pos = longest_begin_substring.length;
const end_pos = longest_end_substring.length;
const similar_matches = [];
for (const ele of arr) {
// NOTE(review): prefix and suffix are measured independently, so on a short entry
// begin_pos can exceed ele.length - end_pos; substring() then swaps its arguments
const match = ele.substring(begin_pos, ele.length - end_pos);
if (match.length !== 0) {
similar_matches.push(match);
}
}
return longest_begin_substring + minMatchString(similar_matches, depth + 1) + longest_end_substring;
}
}

23
lib/generator_helper.d.ts vendored Normal file
View File

@ -0,0 +1,23 @@
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @internal
*/
export declare function minimizeMatchString(arr: string[]): string;
/**
* Groups a regex fragment if it needs to be grouped
*
* @param fragment fragment of regular expression to potentially group
* @returns a non-capturing group if there needs to be one
* @internal
*/
export declare function groupIfRequired(fragment: string): string;
/**
* Checks to see if fragment has a + or * at the end and has a repetition statement
*
* @param fragment fragment of regular expression
* @param repetition repetition that may clobber the fragment
*/
export declare function dontClobberRepetition(fragment: string, repetition: string): string;

203
lib/generator_helper.js Normal file
View File

@ -0,0 +1,203 @@
"use strict";
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true });
exports.dontClobberRepetition = exports.groupIfRequired = exports.minimizeMatchString = void 0;
/**
* Includes helper functions for the Generator
* @packageDocumentation
*/
const utilities_1 = require("./utilities");
/**
 * Collapses a list of alternative match strings into a single regex fragment.
 *
 * @param arr the array of matches
 * @returns the sole entry when there is exactly one, otherwise the combined fragment
 * @internal
 */
function minimizeMatchString(arr) {
    // A lone entry must bypass minMatchString, which reads a one-element list as "optional"
    const is_single_entry = arr.length === 1;
    return is_single_entry ? utilities_1.first(arr) : minMatchString(arr, 0);
}
exports.minimizeMatchString = minimizeMatchString;
/**
 * Recursive worker behind minimizeMatchString.
 *
 * Factors the longest prefix and the longest suffix shared by every entry out of the
 * list and recurses on the differing middles. An entry that is fully consumed by
 * prefix + suffix makes the middle part optional.
 *
 * @param arr the array
 * @param depth must be 0 for initial call; above 0 a bare alternation is wrapped in a non-capturing group
 * @returns an optimized string
 * @internal
 */
function minMatchString(arr, depth = 0) {
    // base case: arr is empty
    if (arr.length === 0) {
        return "";
    }
    // base case: exactly one entry handed in. minimizeMatchString never forwards a
    // one-element list, so this is the remainder of a pair whose other member had nothing
    // left after stripping: the remainder is optional. It has to be grouped first,
    // otherwise "?" would only apply to its last character ("cd?" instead of "(?:cd)?").
    if (arr.length === 1) {
        return groupIfRequired(utilities_1.first(arr)) + "?";
    }
    // remove duplicates
    const unique = [...new Set(arr)];
    // base case: one entry left after duplicate removal means it is required
    if (unique.length === 1) {
        return utilities_1.first(unique);
    }
    // base case: all single regex characters collapse into one character class
    if (unique.every(utilities_1.isSingleRegexCharacter)) {
        return "[" + unique.join("") + "]";
    }
    // Start with the first entry as candidate prefix and candidate suffix and shrink
    // both against every other entry.
    // NOTE(review): the comparison works on raw characters, so a shared prefix could end
    // in the middle of an escape sequence such as "\d" - confirm inputs cannot trigger that.
    let prefix = utilities_1.first(unique);
    let suffix = prefix;
    for (let i = 1; i < unique.length; i++) {
        const candidate = unique[i];
        let kept = 0;
        while (kept < prefix.length && prefix[kept] === candidate[kept]) {
            kept++;
        }
        prefix = prefix.substring(0, kept);
        kept = 0;
        while (kept < suffix.length && suffix[suffix.length - 1 - kept] === candidate[candidate.length - 1 - kept]) {
            kept++;
        }
        suffix = suffix.substring(suffix.length - kept);
        if (prefix.length === 0 && suffix.length === 0) {
            break;
        }
    }
    // Prefix and suffix are measured independently and may claim the same characters of
    // a short entry (["aa", "a"] yields prefix "a" and suffix "a"). Give the overlap to
    // the prefix so that slicing below never sees begin > end.
    const shortest = Math.min(...unique.map((entry) => entry.length));
    const overlap = prefix.length + suffix.length - shortest;
    if (overlap > 0) {
        suffix = suffix.substring(overlap);
    }
    // No matches whatsoever
    // *technically* we can optimize further, but that is a VERY non-trivial problem
    // For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
    if (prefix.length === 0 && suffix.length === 0) {
        const alternatives = unique.join("|");
        return depth > 0 ? "(?:" + alternatives + ")" : alternatives;
    }
    // strip prefix and suffix from each entry and drop the entry (at most one, since the
    // list is de-duplicated) that has nothing left
    const middles = unique
        .map((entry) => entry.substring(prefix.length, entry.length - suffix.length))
        .filter((middle) => middle.length !== 0);
    let inner = minMatchString(middles, depth + 1);
    // A single surviving middle is already rendered as optional by the base case above.
    // With several survivors the dropped entry still has to be matchable, so the
    // combined middle becomes optional as a whole.
    if (middles.length > 1 && middles.length < unique.length) {
        inner = groupIfRequired(inner) + "?";
    }
    return prefix + inner + suffix;
}
/**
 * Groups a regex fragment if it needs to be grouped
 *
 * A fragment is left alone when it is a single regex character, or when it is
 * already one complete `(...)` group or one complete `[...]` character class.
 * Anything else is wrapped in a non-capturing group.
 *
 * @param fragment fragment of regular expression to potentially group
 * @returns a non-capturing group if there needs to be one
 * @internal
 */
function groupIfRequired(fragment) {
    if (utilities_1.isSingleRegexCharacter(fragment)) {
        return fragment;
    }
    const last = fragment.length - 1;
    const in_parens = fragment[0] === "(" && fragment[last] === ")";
    const in_brackets = fragment[0] === "[" && fragment[last] === "]";
    if (!in_parens && !in_brackets) {
        return "(?:" + fragment + ")";
    }
    const closer = in_parens ? ")" : "]";
    // Walk every character strictly between the outer pair. If the opening bracket is
    // closed before the end (balance drops to -1) the fragment is several items in a
    // row, e.g. "(a)(b)" or "[a][b]", and must be wrapped.
    // The scan used to stop one character early, which re-wrapped "(a(b))" and let
    // "[a]]" through ungrouped.
    let balance = 0;
    for (let i = 1; i < last; i++) {
        const ch = fragment[i];
        if (ch === "\\") {
            // skip the escaped character
            i++;
        }
        else if (in_parens && ch === "(") {
            // you'll never have a raw [ inside a [], so only parentheses can nest
            balance++;
        }
        else if (ch === closer) {
            balance--;
            if (balance === -1) {
                break;
            }
        }
    }
    return balance === 0 ? fragment : "(?:" + fragment + ")";
}
exports.groupIfRequired = groupIfRequired;
/**
 * Tells whether a fragment ends in the given quantifier character, as opposed to an
 * escaped literal such as `\+`.
 *
 * @param fragment fragment of regular expression
 * @param quantifier the single quantifier character to look for
 * @returns true when the trailing character is preceded by an even number of backslashes
 */
function endsWithQuantifier(fragment, quantifier) {
    if (!fragment.endsWith(quantifier)) {
        return false;
    }
    let backslashes = 0;
    for (let i = fragment.length - 2; i >= 0 && fragment[i] === "\\"; i--) {
        backslashes++;
    }
    return backslashes % 2 === 0;
}
/**
 * Checks to see if fragment has a + or * at the end and has a repetition statement
 *
 * When the fragment already ends in a `+` or `*` quantifier, the new repetition is
 * merged with it instead of being stacked on top. An escaped `\+` or `\*` is a
 * literal, not a quantifier, and simply receives the repetition.
 *
 * @param fragment fragment of regular expression
 * @param repetition repetition that may clobber the fragment
 * @returns the fragment with the repetition applied
 */
function dontClobberRepetition(fragment, repetition) {
    // + can be ignored as well as a count as long as that count is > 0
    if (endsWithQuantifier(fragment, "+")) {
        switch (repetition) {
            case "*":
                // ignore: + is greater than *
                break;
            case "?":
                // non-greedy qualifier
                fragment += repetition;
                break;
            case "+":
                // ignore: already +
                break;
            default:
                if (repetition.startsWith("{0")) {
                    // a count that allows zero must apply to the whole "x+"
                    fragment = "(?:" + fragment + ")" + repetition;
                }
                else {
                    // remove + and replace with count
                    fragment = fragment.substring(0, fragment.length - 1) + repetition;
                }
                break;
        }
    }
    else if (endsWithQuantifier(fragment, "*")) {
        switch (repetition) {
            case "*":
                // ignore: already *
                break;
            case "?":
                // non-greedy qualifier
                fragment += repetition;
                break;
            default:
                // remove * and replace with count
                fragment = fragment.substring(0, fragment.length - 1) + repetition;
                break;
        }
    }
    else {
        fragment += repetition;
    }
    return fragment;
}
exports.dontClobberRepetition = dontClobberRepetition;

2
lib/index.d.ts vendored
View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* Includes all packages
* @packageDocumentation

View File

@ -1,5 +1,5 @@
"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true });
exports.RegexDialect = exports.ParseResult = exports.Human2RegexParser = exports.Human2RegexParserOptions = exports.TokenizeResult = exports.IndentType = exports.Human2RegexLexer = exports.Human2RegexLexerOptions = exports.CommonError = void 0;
/**

2
lib/lexer.d.ts vendored
View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* The Lexer for Human2Regex
* @packageDocumentation

View File

@ -1,5 +1,5 @@
"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true });
exports.Human2RegexLexer = exports.TokenizeResult = exports.Human2RegexLexerOptions = exports.IndentType = void 0;
/**

2
lib/parser.d.ts vendored
View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* The parser for Human2Regex
* @packageDocumentation

View File

@ -1,5 +1,5 @@
"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
@ -83,7 +83,7 @@ class ParseResult {
* @public
*/
validate(language) {
return this.regexp_cst.validate(language).map(utilities_1.CommonError.fromSemanticError);
return this.regexp_cst.validate(language, new generator_1.GeneratorContext()).map(utilities_1.CommonError.fromSemanticError);
}
/**
* Generate a regular expression string based on the parse result
@ -499,12 +499,91 @@ class Human2RegexParser extends chevrotain_1.EmbeddedActionsParser {
tokens.push($.CONSUME(T.Outdent));
return new generator_1.RepeatStatementCST(tokens, optional, count, statements);
});
// Grammar rule for a backreference statement. Token order accepted:
//   [Optional] Rerun [count] [[The] Group [Called]] Identifier EndOfLine
// Produces a BackrefStatementCST.
const BackrefStatement = $.RULE("BackrefStatement", () => {
// tokens kept for error reporting: the Optional keyword (if any), Rerun and the EndOfLine
const tokens = [];
let optional = false;
let count = null;
// leading "optional" keyword
$.OPTION5(() => {
tokens.push($.CONSUME(T.Optional));
optional = true;
});
tokens.push($.CONSUME(T.Rerun));
// repetition count; stays null when absent
$.OPTION6(() => count = $.SUBRULE(CountSubStatement));
// filler words before the name; these tokens are consumed but not recorded
$.OPTION7(() => {
$.OPTION(() => $.CONSUME(T.The));
$.CONSUME(T.Group);
$.OPTION2(() => $.CONSUME(T.Called));
});
// the referenced group name is the raw text of the identifier token
const name = $.CONSUME(T.Identifier).image;
tokens.push($.CONSUME4(T.EndOfLine));
return new generator_1.BackrefStatementCST(tokens, optional, count, name);
});
// Grammar rule for an if statement. The condition is either a bare Identifier or a
// "match ..." pattern. It is followed by an indented block of one or more statements and
// an optional Else block of the same shape.
// Produces an IfIdentStatementCST when an identifier was given, otherwise an IfPatternStatementCST.
const IfStatement = $.RULE("IfStatement", () => {
// tokens kept for error reporting: the If keyword and the EndOfLine closing the condition
const tokens = [];
// match values collected for the pattern form of the condition
const msv = [];
let optional = false;
const true_statements = [];
const false_statements = [];
// stays "" unless the identifier form is taken; decides which CST node is built below
let name = "";
tokens.push($.CONSUME(T.If));
$.OR2([
// form 1: if <identifier>
{ ALT: () => {
name = $.CONSUME(T.Identifier).image;
} },
// form 2: if match [optional] <sub statement> followed by any number of further sub statements
{ ALT: () => {
$.CONSUME(T.Match);
$.OPTION4(() => {
$.CONSUME3(T.Optional);
optional = true;
});
msv.push(new generator_1.MatchStatementValue(optional, $.SUBRULE(MatchSubStatement)));
$.MANY(() => {
// separator between sub statements: "[and] then" or a plain "and"
$.OR([
{ ALT: () => {
$.OPTION2(() => $.CONSUME2(T.And));
$.CONSUME(T.Then);
} },
{ ALT: () => $.CONSUME(T.And) },
]);
// the optional flag applies per sub statement, so reset it before each one
optional = false;
$.OPTION3(() => {
$.CONSUME2(T.Optional);
optional = true;
});
msv.push(new generator_1.MatchStatementValue(optional, $.SUBRULE2(MatchSubStatement)));
});
} }
]);
tokens.push($.CONSUME3(T.EndOfLine));
// true branch: indented block with at least one statement
$.CONSUME2(T.Indent);
$.AT_LEAST_ONE2(() => {
true_statements.push($.SUBRULE(Statement));
});
$.CONSUME2(T.Outdent);
// optional false branch: Else on its own line followed by an indented block
$.OPTION(() => {
$.CONSUME(T.Else);
$.CONSUME4(T.EndOfLine);
$.CONSUME3(T.Indent);
$.AT_LEAST_ONE3(() => {
false_statements.push($.SUBRULE2(Statement));
});
$.CONSUME3(T.Outdent);
});
if (name === "") {
return new generator_1.IfPatternStatementCST(tokens, msv, true_statements, false_statements);
}
else {
return new generator_1.IfIdentStatementCST(tokens, name, true_statements, false_statements);
}
});
// statement super class
const Statement = $.RULE("Statement", () => {
return $.OR([
{ ALT: () => $.SUBRULE(MatchStatement) },
{ ALT: () => $.SUBRULE(GroupStatement) },
{ ALT: () => $.SUBRULE(RepeatStatement) }
{ ALT: () => $.SUBRULE(RepeatStatement) },
{ ALT: () => $.SUBRULE(BackrefStatement) },
{ ALT: () => $.SUBRULE(IfStatement) }
]);
});
// full regex

6
lib/tokens.d.ts vendored
View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/** @internal */ export declare const Zero: import("chevrotain").TokenType;
/** @internal */ export declare const One: import("chevrotain").TokenType;
/** @internal */ export declare const Two: import("chevrotain").TokenType;
@ -51,6 +51,10 @@
/** @internal */ export declare const CaseInsensitive: import("chevrotain").TokenType;
/** @internal */ export declare const CaseSensitive: import("chevrotain").TokenType;
/** @internal */ export declare const OrMore: import("chevrotain").TokenType;
/** @internal */ export declare const Rerun: import("chevrotain").TokenType;
/** @internal */ export declare const The: import("chevrotain").TokenType;
/** @internal */ export declare const If: import("chevrotain").TokenType;
/** @internal */ export declare const Else: import("chevrotain").TokenType;
/** @internal */ export declare const EndOfLine: import("chevrotain").TokenType;
/** @internal */ export declare const WS: import("chevrotain").TokenType;
/** @internal */ export declare const SingleLineComment: import("chevrotain").TokenType;

View File

@ -1,7 +1,8 @@
"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true });
exports.AllTokens = exports.Outdent = exports.Indent = exports.StringLiteral = exports.NumberLiteral = exports.Identifier = exports.MultilineComment = exports.SingleLineComment = exports.WS = exports.EndOfLine = exports.OrMore = exports.CaseSensitive = exports.CaseInsensitive = exports.CarriageReturn = exports.Newline = exports.Repeat = exports.Called = exports.Create = exports.To = exports.From = exports.Exclusive = exports.Inclusive = exports.Exactly = exports.Times = exports.A = exports.Group = exports.Linefeed = exports.Tab = exports.Between = exports.Not = exports.Matching = exports.Exact = exports.Multiline = exports.Global = exports.Using = exports.Unicode = exports.Number = exports.Boundary = exports.Whitespace = exports.Integer = exports.Decimal = exports.Letter = exports.Character = exports.Digit = exports.Word = exports.And = exports.Or = exports.Anything = exports.Then = exports.Match = exports.Optional = exports.Ten = exports.Nine = exports.Eight = exports.Seven = exports.Six = exports.Five = exports.Four = exports.Three = exports.Two = exports.One = exports.Zero = void 0;
exports.CaseInsensitive = exports.CarriageReturn = exports.Newline = exports.Repeat = exports.Called = exports.Create = exports.To = exports.From = exports.Exclusive = exports.Inclusive = exports.Exactly = exports.Times = exports.A = exports.Group = exports.Linefeed = exports.Tab = exports.Between = exports.Not = exports.Matching = exports.Exact = exports.Multiline = exports.Global = exports.Using = exports.Unicode = exports.Number = exports.Boundary = exports.Whitespace = exports.Integer = exports.Decimal = exports.Letter = exports.Character = exports.Digit = exports.Word = exports.And = exports.Or = exports.Anything = exports.Then = exports.Match = exports.Optional = exports.Ten = exports.Nine = exports.Eight = exports.Seven = exports.Six = exports.Five = exports.Four = exports.Three = exports.Two = exports.One = exports.Zero = void 0;
exports.AllTokens = exports.Outdent = exports.Indent = exports.StringLiteral = exports.NumberLiteral = exports.Identifier = exports.MultilineComment = exports.SingleLineComment = exports.WS = exports.EndOfLine = exports.Else = exports.If = exports.The = exports.Rerun = exports.OrMore = exports.CaseSensitive = void 0;
/**
* The tokens required for Human2Regex
* @packageDocumentation
@ -52,32 +53,17 @@ const chevrotain_1 = require("chevrotain");
/** @internal */ exports.From = chevrotain_1.createToken({ name: "From", pattern: /from/i });
/** @internal */ exports.To = chevrotain_1.createToken({ name: "To", pattern: /(to|through|thru|\-|\.\.\.?)/i });
/** @internal */ exports.Create = chevrotain_1.createToken({ name: "Create", pattern: /create(s)?/i });
/** @internal */ exports.Called = chevrotain_1.createToken({ name: "Called", pattern: /name(d)?|call(ed)?/i });
/** @internal */ exports.Called = chevrotain_1.createToken({ name: "Called", pattern: /named|called/i });
/** @internal */ exports.Repeat = chevrotain_1.createToken({ name: "Repeat", pattern: /repeat(s|ing)?/i });
/** @internal */ exports.Newline = chevrotain_1.createToken({ name: "Newline", pattern: /(new line|newline)/i });
/** @internal */ exports.CarriageReturn = chevrotain_1.createToken({ name: "CarriageReturn", pattern: /carriage return/i });
/** @internal */ exports.CaseInsensitive = chevrotain_1.createToken({ name: "CaseInsensitive", pattern: /case insensitive/i });
/** @internal */ exports.CaseSensitive = chevrotain_1.createToken({ name: "CaseSensitive", pattern: /case sensitive/i });
/** @internal */ exports.OrMore = chevrotain_1.createToken({ name: "OrMore", pattern: /\+|or more/i });
/*
//Not being used currently
export const Of = createToken({name: "Of", pattern: /of/i});
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s) with?/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)? with/i});
export const Else = createToken({name: "Else", pattern: /(other wise|otherwise|else)/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const By = createToken({name: "By", pattern: /by/i});
*/
/** @internal */ exports.Rerun = chevrotain_1.createToken({ name: "Rerun", pattern: /re( |-)?(run|capture)/i });
/** @internal */ exports.The = chevrotain_1.createToken({ name: "The", pattern: /the/i });
/** @internal */ exports.If = chevrotain_1.createToken({ name: "If", pattern: /if/i });
/** @internal */ exports.Else = chevrotain_1.createToken({ name: "Else", pattern: /else|otherwise/i });
/** @internal */ exports.EndOfLine = chevrotain_1.createToken({ name: "EOL", pattern: /\n/ });
/** @internal */ exports.WS = chevrotain_1.createToken({ name: "Whitespace", pattern: /[^\S\n]+/, start_chars_hint: [" ", "\r"], group: chevrotain_1.Lexer.SKIPPED });
/** @internal */ exports.SingleLineComment = chevrotain_1.createToken({ name: "SingleLineComment", pattern: /(#|\/\/).*/, group: chevrotain_1.Lexer.SKIPPED });
@ -120,22 +106,11 @@ exports.AllTokens = [
exports.Whitespace,
exports.Number,
exports.Unicode,
/*
Of,
As,
If,
Start,
Ends,
Else,
Unless,
While,
More,
Nothing,
By,
The,
None,
Neither,
*/
exports.Called,
exports.Rerun,
exports.If,
exports.Else,
exports.The,
exports.Using,
exports.Global,
exports.Multiline,
@ -151,7 +126,6 @@ exports.AllTokens = [
exports.Exclusive,
exports.From,
exports.Create,
exports.Called,
exports.Repeat,
exports.Newline,
exports.CarriageReturn,

5
lib/utilities.d.ts vendored
View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* Some utility functions for Human2Regex
* @packageDocumentation
@ -130,6 +130,7 @@ export declare class CommonError {
*
* @param error The lexing error
* @returns a new CommonError
* @internal
*/
static fromLexError(error: ILexingError): CommonError;
/**
@ -137,6 +138,7 @@ export declare class CommonError {
*
* @param error The parsing error
* @returns a new CommonError
* @internal
*/
static fromParseError(error: IRecognitionException): CommonError;
/**
@ -144,6 +146,7 @@ export declare class CommonError {
*
* @param error The semantic error
* @returns a new CommonError
* @internal
*/
static fromSemanticError(error: ISemanticError): CommonError;
/**

View File

@ -1,5 +1,5 @@
"use strict";
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
Object.defineProperty(exports, "__esModule", { value: true });
exports.CommonError = exports.append = exports.regexEscape = exports.removeQuotes = exports.findLastIndex = exports.last = exports.first = exports.isSingleRegexCharacter = exports.combineFlags = exports.hasFlag = exports.makeFlag = exports.usefulConditional = exports.unusedParameter = void 0;
/**
@ -181,6 +181,7 @@ class CommonError {
*
* @param error The lexing error
* @returns a new CommonError
* @internal
*/
static fromLexError(error) {
// not really fond of --> and <--
@ -192,6 +193,7 @@ class CommonError {
*
* @param error The parsing error
* @returns a new CommonError
* @internal
*/
static fromParseError(error) {
var _a, _b, _c;
@ -204,6 +206,7 @@ class CommonError {
*
* @param error The semantic error
* @returns a new CommonError
* @internal
*/
static fromSemanticError(error) {
return new CommonError("Semantic Error", error.startLine, error.startColumn, error.length, error.message);

4765
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -1,32 +1,32 @@
{
"name": "human2regex",
"version": "1.0.2",
"version": "1.1.0",
"description": "Humanized Regular Expressions",
"main": "./lib/index.js",
"typings": "./lib/index.d.ts",
"devDependencies": {
"@types/glob": "^7.1.3",
"@types/html-minifier": "^3.5.3",
"@types/jest": "^26.0.15",
"@types/mustache": "^4.0.1",
"@typescript-eslint/eslint-plugin": "^4.7.0",
"@typescript-eslint/parser": "^4.7.0",
"@types/jest": "^26.0.19",
"@typescript-eslint/eslint-plugin": "^4.11.1",
"@typescript-eslint/parser": "^4.11.1",
"before-build-webpack": "^0.2.9",
"clean-webpack-plugin": "^3.0.0",
"codecov": "^3.8.1",
"copy-webpack-plugin": "^6.3.0",
"copy-webpack-plugin": "^6.4.1",
"css-loader": "^4.3.0",
"eslint": "^7.13.0",
"eslint": "^7.17.0",
"glob": "^7.1.6",
"handlebars": "^4.7.6",
"html-minifier": "^4.0.0",
"jest": "^26.6.3",
"mini-css-extract-plugin": "^1.3.1",
"mustache": "^4.0.1",
"mini-css-extract-plugin": "^1.3.3",
"optimize-css-assets-webpack-plugin": "^5.0.4",
"remove-files-webpack-plugin": "^1.4.4",
"ts-jest": "^26.4.4",
"ts-loader": "^8.0.11",
"ts-node": "^9.0.0",
"typescript": "^4.0.5",
"ts-loader": "^8.0.13",
"ts-node": "^9.1.1",
"typescript": "^4.1.3",
"webpack": "^4.44.2",
"webpack-cli": "^3.3.12"
},
@ -45,8 +45,8 @@
"author": "Patrick Demian",
"license": "MIT",
"dependencies": {
"chevrotain": "^7.0.3",
"codemirror": "^5.58.2"
"chevrotain": "^7.1.0",
"codemirror": "^5.59.1"
},
"repository": {
"type": "git",

View File

@ -1,21 +1,21 @@
{{! Copyright (c) 2020 Patrick Demian; Licensed under MIT }}
{{! Copyright (c) 2021 Patrick Demian; Licensed under MIT }}
{{> header}}
{{> header title="Error 404 - Not Found" description="Not Found"}}
<!-- Main Content -->
<div class="container contained-container" id="maincontent" role="main">
<!-- Page Header -->
<div class="align_header">
<div class="mx-auto">
<div class="site-heading">
<h1>{{error-code}}</h1>
<span class="subheading">{{error-subheading}}</span>
<h1>404</h1>
<span class="subheading">Not Found</span>
</div>
</div>
</div>
<br><br><br>
<div class="row">
<div class="col-12 mx-auto">
<h3 class="align_header">{{{error-string}}}</h3>
<h3 class="align_header">The resource could not be found.</h3>
</div>
</div>
</div>

View File

@ -1,9 +0,0 @@
{
"page": {
"title": "Error 404 - Not Found",
"description": "Not Found"
},
"error-code": "404",
"error-subheading": "Not Found",
"error-string": "The resource could not be found."
}

View File

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 2.1 KiB

View File

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 15 KiB

View File

Before

Width:  |  Height:  |  Size: 8.0 KiB

After

Width:  |  Height:  |  Size: 8.0 KiB

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
.align_header {
text-align: center;
@ -182,3 +182,25 @@ pre code {
.heading-link:focus {
color: rgba(255, 255, 255, .8)
}
.tut-contents {
display: table;
padding: 7px;
border: 1px solid #a2a9b1;
background-color: #f8f9fa;
padding: 5px;
font-size: 16px!important;
}
.tut-title {
text-align: center;
font-weight: bold;
}
.tut-contents li {
list-style-type: none;
}
.tut-contents>ul {
margin-top: 0;
padding-left: 0;
margin-left: 0.5em;
margin-right: 0.5em;
}

64
src/docs/index.hbs Normal file
View File

@ -0,0 +1,64 @@
{{! Copyright (c) 2021 Patrick Demian; Licensed under MIT }}
{{> header title="Human2Regex" description="Create regular expressions with natural, human language"}}
<!-- Main Content -->
<div class="container" id="maincontent" role="main">
<div class="row">
<div class="col-lg-8 tenpx-margin-bottom">
<div class="form-group row zero-margin-bottom">
<label for="dialect" class="col-sm-4 col-form-label">Regex dialect:</label>
<div class="col-sm-8">
<select class="form-control" id="dialect">
<option value="js" selected>Javascript</option>
<option value="dotnet">.NET</option>
<option value="python">Python</option>
<option value="boost">C++ Boost</option>
<option value="java">Java 7+</option>
<option value="pcre">PCRE</option>
</select>
</div>
</div>
<h4>Your Regular Expression:</h4>
<div class="row">
<div class="col-xl-11 tenpx-margin-bottom">
<input readonly type="text" class="form-control" id="regex">
</div>
<div class="col-xl-1">
<button type="button" class="btn btn-secondary float-right" id="clip">Copy</button>
</div>
</div>
<h4>Human Speak:</h4>
<textarea class="form-control" id="human" rows="25">{{> example_code}}</textarea>
<h4>Errors:</h4>
<textarea readonly class="form-control " id="errors" rows="5"></textarea>
</div>
<br>
<div class="col-lg-4 tenpx-margin-bottom">
<div class="cheatsheet">
<h2>Cheat Sheet:</h2>
<p>Full documentation available <a href="/tutorial.html">here</a></p>
<p class="font-weight-bold">Matching</p>
<p>{{i-code}}match "hello world"{{end-i-code}} matches "hello world" exactly</p>
<p>{{i-code}}match "hello" then optionally " world"{{end-i-code}} matches "hello" or "hello world"</p>
<p>{{i-code}}match "hello" or "world"{{end-i-code}} matches "hello" or "world"</p>
<p>{{i-code}}match a word{{end-i-code}} matches any word</p>
<p class="font-weight-bold">Repetition</p>
<p>{{i-code}}match 0+ "hello"{{end-i-code}} matches 0 or more "hello"s</p>
<p>{{i-code}}match 3 "hello"{{end-i-code}} matches exactly "hellohellohello"</p>
<p>{{i-code}}match 1 to 5 "hello"{{end-i-code}} matches between 1 and 5 "hello"s</p>
<p>{{i-code}}repeat 0 or more{{end-i-code}} repeats the indented text 0 or more times (default)</p>
<p>{{i-code}}optionally repeat between 3 to 5{{end-i-code}} optionally repeats the indented text 3 to 5 times</p>
<p class="font-weight-bold">Grouping</p>
<p>{{i-code}}create a group called mygroup{{end-i-code}} creates a group called "mygroup"</p>
<p>{{i-code}}create an optional group{{end-i-code}} creates an unnamed optional group</p>
<p class="font-weight-bold">Using</p>
<p>{{i-code}}using global and case insensitive{{end-i-code}} uses the 'g' and 'i' flags</p>
<p class="font-weight-bold">Misc</p>
<p>{{i-code}}// comment{{end-i-code}} is a single line comment</p>
<p>{{i-code}}/* comment */{{end-i-code}} is a multi line comment</p>
</div>
</div>
</div>
</div>
{{> footer}}

View File

@ -1,6 +0,0 @@
{
"page": {
"title": "Human2Regex",
"description": "Create regular expressions with natural, human language"
}
}

View File

@ -1,97 +0,0 @@
{{! Copyright (c) 2020 Patrick Demian; Licensed under MIT }}
{{> header}}
<!-- Main Content -->
<div class="container" id="maincontent" role="main">
<div class="row">
<div class="col-lg-8 tenpx-margin-bottom">
<div class="form-group row zero-margin-bottom">
<label for="dialect" class="col-sm-4 col-form-label">Regex dialect:</label>
<div class="col-sm-8">
<select class="form-control" id="dialect">
<option value="js" selected>Javascript</option>
<option value="dotnet">.NET</option>
<option value="python">Python</option>
<option value="boost">C++ Boost</option>
<option value="java">Java 7+</option>
<option value="pcre">PCRE</option>
</select>
</div>
</div>
<h4>Your Regular Expression:</h4>
<div class="row">
<div class="col-xl-11 tenpx-margin-bottom">
<input readonly type="text" class="form-control" id="regex"></input>
</div>
<div class="col-xl-1">
<button type="button" class="btn btn-secondary float-right" id="clip">Copy</button>
</div>
</div>
<h4>Human Speak:</h4>
<textarea class="form-control" id="human" rows="25">
/* Make a regex that matches (basic) URLs */
using global and exact matching
create an optional group called protocol
match "http"
possibly match "s"
match "://"
create an optional group called subdomain
repeat
match a word, then "."
create a group called domain
match 1+ words or "_" or "-"
match "."
match a word
# port, but we don't care about it, so ignore it
optionally match ":" then 0+ digits
create an optional group called path
repeat
match "/"
match 0+ words or "_" or "-"
create an optional group
# we don't want to capture the '?', so don't name the group until afterwards
match "?"
create a group called query
repeat
match 1+ words or "_" or "-"
match "="
match 1+ words or "_" or "-"
create an optional group
# fragment, again, we don't care, so ignore everything afterwards
match "#"
match 0+ any thing
</textarea>
<h4>Errors:</h4>
<textarea readonly class="form-control " id="errors" rows="5"></textarea>
</div>
<br>
<div class="col-lg-4 tenpx-margin-bottom">
<div class="cheatsheet">
<h2>Cheat Sheet:</h2>
<p>Full documentation available <a href="tutorial.html">here</a></p>
<p class="font-weight-bold">Matching</p>
<p><code class="cm-s-idea">match "hello world"</code> matches "hello world" exactly<p>
<p><code class="cm-s-idea">match "hello" then optionally " world"</code> matches "hello" or "hello world"</p>
<p><code class="cm-s-idea">match "hello" or "world"</code> matches "hello" or "world</p>
<p><code class="cm-s-idea">match a word</code> matches any word
<p class="font-weight-bold">Repetition</p>
<p><code class="cm-s-idea">match 0+ "hello"</code> matches 0 or more "hello"s</p>
<p><code class="cm-s-idea">match 3 "hello"</code> matches exactly "hellohellohello"</p>
<p><code class="cm-s-idea">match 1 to 5 "hello"</code> matches between 1 to 5 "hello"s</p>
<p><code class="cm-s-idea">repeat 0 or more</code> repeats the intended text 0 or more times (default)</p>
<p><code class="cm-s-idea">optionally repeat between 3 to 5</code> optionally repeats the indented text 3 to 5 times</p>
<p class="font-weight-bold">Grouping</p>
<p><code class="cm-s-idea">create a group called mygroup</code> creates a group called "mygroup"</p>
<p><code class="cm-s-idea">create an optional group</code> creates an unnamed optional group</p>
<p class="font-weight-bold">Using</p>
<p><code class="cm-s-idea">using global and case insensitive</code> uses the 'g' and 'i' flags</p>
<p class="font-weight-bold">Misc</p>
<p><code class="cm-s-idea">// comment</code> is a single line comment</p>
<p><code class="cm-s-idea">/* comment */</code> is a multi line comment</p>
</div>
</div>
</div>
</div>
{{> footer}}

View File

@ -0,0 +1,32 @@
/* Make a regex that matches (basic) URLs */
using global and exact matching
create an optional group called protocol
match "http"
possibly match "s"
match "://"
create an optional group called subdomain
repeat
match a word, then "."
create a group called domain
match 1+ words or "_" or "-"
match "."
match a word
# port, but we don't care about it, so ignore it
optionally match ":" then 0+ digits
create an optional group called path
repeat
match "/"
match 0+ words or "_" or "-"
create an optional group
# we don't want to capture the '?', so don't name the group until afterwards
match "?"
create a group called query
repeat
match 1+ words or "_" or "-"
match "="
match 1+ words or "_" or "-"
create an optional group
# fragment, again, we don't care, so ignore everything afterwards
match "#"
match 0+ any thing

View File

@ -1,11 +1,11 @@
{{! Copyright (c) 2020 Patrick Demian; Licensed under MIT }}
{{! Copyright (c) 2021 Patrick Demian; Licensed under MIT }}
<!-- Footer -->
<footer>
<div class="container">
<div class="row">
<div class="col-lg-8 col-md-10 mx-auto">
<p class="copyright">Copyright &copy; 2020 Patrick Demian. This page's source code is available at <a rel="noopener noreferrer" href="https://github.com/pdemian/human2regex">github.com/pdemian/human2regex</a></p>
<p class="copyright">Copyright &copy; 2021 Patrick Demian. This page's source code is available at <a rel="noopener noreferrer" href="https://github.com/pdemian/human2regex">github.com/pdemian/human2regex</a></p>
</div>
</div>
</div>
@ -14,7 +14,7 @@
</div>
<!-- Our script -->
<script defer src="bundle.min.js"></script>
<script defer src="/bundle.min.js"></script>
</body>
</html>

View File

@ -1,22 +1,22 @@
{{! Copyright (c) 2020 Patrick Demian; Licensed under MIT }}
{{! Copyright (c) 2021 Patrick Demian; Licensed under MIT }}
<!DOCTYPE html>
<html lang="en" dir="ltr">
<head>
<!-- Metadata -->
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<meta name="description" content="{{page.description}}">
<meta name="description" content="{{description}}">
<meta name="keywords" content="Human2Regex, Human, Regex, Natural, Language, Natural Language">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>{{page.title}}</title>
<title>{{title}}</title>
<!-- Our own CSS -->
<link href="bundle.min.css" rel="stylesheet" type="text/css">
<link href="/bundle.min.css" rel="stylesheet" type="text/css">
<meta name="theme-color" content="#212529">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="default">
<link rel="icon" type="image/x-icon" href="favicon.ico">
<link rel="icon" type="image/x-icon" href="/favicon.ico">
</head>
<body>
<a class="skip skip-top" href="#maincontent">Skip to main content</a>
@ -25,14 +25,14 @@
<!-- Navigation -->
<nav class="navbar navbar-expand-lg navbar-light fixed-top" id="mainNav">
<div class="container">
<a class="navbar-brand" href="index.html">
<img src="favicon-small.png" width="30" height="30" class="d-inline-block align-top" alt="logo">&nbsp;Human2Regex
<a class="navbar-brand" href="/index.html">
<img src="/favicon-small.png" width="30" height="30" class="d-inline-block align-top" alt="logo">&nbsp;Human2Regex
</a>
<div class="float-right heading-links">
<a class="heading-link" href="index.html">Index</a>
<a class="heading-link" href="/index.html">Index</a>
<span>&nbsp;|&nbsp;</span>
<a class="heading-link" href="tutorial.html">Tutorial</a>
<a class="heading-link" href="/tutorial.html">Tutorial</a>
</div>
</div>
</nav>

View File

@ -1,43 +1,65 @@
{{! Copyright (c) 2020 Patrick Demian; Licensed under MIT }}
{{! Copyright (c) 2021 Patrick Demian; Licensed under MIT }}
{{> header}}
{{> header title="Human2Regex Tutorial" description="Create regular expressions with natural, human language"}}
<!-- Main Content -->
<div class="container contained-container" id="maincontent" role="main">
<div id="tutorial">
<h2>Tutorial</h2>
<h2 id="tut-begin">Tutorial</h2>
<br>
<p class="font-weight-bold" id="tut-preface">0. Preface</p>
<p>Human2Regex (H2R) is a way to spell out a regular expression in an easy to read, easy to modify language. H2R supports multiple languages as well as many (though not all) different regular expression options such as named groups and quantifiers. You may notice multiple keywords specifying the same thing, and that is intended! Just like how in English there are many ways to express yourself, H2R is made to be flexible and easy to understand. With a range, do you prefer "...", "through", or "to"? It's up to you to choose, H2R supports all of those!</p>
<div class="tut-contents" role="navigation">
<div class="tut-title">Contents</div>
<ul>
<li>
<a href="#tut-begin">Tutorial</a>
<ul>
<li><a href="#tut-first-match">1.1 First match</a></li>
<li><a href="#tut-using">1.2 Using Specifiers</a></li>
<li><a href="#tut-multiple-match">1.3 Matching multiple items</a></li>
<li><a href="#tut-optionality">1.4 Optionality</a></li>
<li><a href="#tut-negation">1.5 Negation</a></li>
<li><a href="#tut-other-match">1.6 Other matching specifiers</a></li>
<li><a href="#tut-repeition">1.7 Repetition</a></li>
<li><a href="#tut-grouping">1.8 Grouping</a></li>
</ul>
</li>
<li><a href="#tut-final">Putting it all together</a></li>
<li>
<a href="#tut-backref">Advanced features</a>
<ul>
<li><a href="#tut-backref">Backreferences</a></li>
<li><a href="#tut-if">If statements</a></li>
<li><a href="#tut-unicode">Unicode character properties</a></li>
</ul>
</li>
</ul>
</div>
<br>
<p class="font-weight-bold" id="tut-first-match">1. Your first Match</p>
<p>Every language starts with a "Hello World" program, so let's match the output of those programs. Matching is done using the keyword <code class="cm-s-idea">match</code> followed by what you want to match.
<span class="tutorial-code"><code class="cm-s-idea">
match "Hello World"
</code></span>
The above statement will generate a regular expression that matches "Hello World", like "/Hello World/". Any invalid characters will automatically be escaped, so you don't need to worry about it. H2R also supports block comments with <code class="cm-s-idea">/**/</code>, or line comments with <code class="cm-s-idea">//</code> or <code class="cm-s-idea">#</code> so you can explain why or what you intend to match.</p>
<pre class="tutorial-code"><code class="cm-s-idea">/* This is a block comment */
<p>Every language starts with a "Hello World" program, so let's match the output of those programs. Matching is done using the keyword {{i-code}}match{{end-i-code}} followed by what you want to match.
{{s-code}}match "Hello World"{{end-s-code}}
The above statement will generate a regular expression that matches "Hello World", like "/Hello World/". Any invalid characters will automatically be escaped, so you don't need to worry about it. H2R also supports block comments with {{i-code}}/**/{{end-i-code}}, or line comments with {{i-code}}//{{end-i-code}} or {{i-code}}#{{end-i-code}} so you can explain why or what you intend to match.</p>
{{p-code}}/* This is a block comment */
match "Hello World" // matches the output of "Hello World" programs
</code></pre>
<p>Now what if we want to match every case variation of "Hello World" like "hello world" or "hELLO wORLD"? H2R supports the <code class="cm-s-idea">or</code> operator which allows you to specify many possible combinations.
<span class="tutorial-code"><code class="cm-s-idea">
match "Hello World" or "hello world" or "hELLO wORLD"
</code></span>
Or, you can use a <code class="cm-s-idea">using</code> statement to specify that you want it to be case insensitive.</p>
{{end-p-code}}
<p>Now what if we want to match every case variation of "Hello World" like "hello world" or "hELLO wORLD"? H2R supports the {{i-code}}or{{end-i-code}} operator which allows you to specify many possible combinations.
{{s-code}}match "Hello World" or "hello world" or "hELLO wORLD"{{end-s-code}}
Or, you can use a {{i-code}}using{{end-i-code}} statement to specify that you want it to be case insensitive.</p>
<br>
<p class="font-weight-bold" id="tut-using">2. Using Specifiers</p>
<p>Using statements appear at the beginning. You may have one or more using statements which each can contain one or more specifiers. For example:
<span class="tutorial-code"><code class="cm-s-idea">
using global and case insensitive matching
</code></span>
{{s-code}}using global and case insensitive matching{{end-s-code}}
or</p>
<pre class="tutorial-code">
<code class="cm-s-idea">using global
{{p-code}}using global
using case insensitive
</code></pre>
<p>The <code class="cm-s-idea">matching</code> keyword is optional. The flags which are available are:</p>
{{end-p-code}}
<p>The {{i-code}}matching{{end-i-code}} keyword is optional. The flags which are available are:</p>
<table class="table table-sm table-striped table-bordered">
<thead>
@ -49,27 +71,27 @@ using case insensitive
</thead>
<tbody>
<tr>
<td><code class="cm-s-idea">multiline</code></td>
<td>{{i-code}}multiline{{end-i-code}}</td>
<td>Matches can cross line breaks</td>
<td>/&lt;your regex&gt;/m</td>
</tr>
<tr>
<td><code class="cm-s-idea">global</code></td>
<td>{{i-code}}global{{end-i-code}}</td>
<td>Multiple matches are allowed</td>
<td>/&lt;your regex&gt;/g</td>
</tr>
<tr>
<td><code class="cm-s-idea">case sensitive</code></td>
<td>{{i-code}}case sensitive{{end-i-code}}</td>
<td>Match must be exact case</td>
<td><span class="font-italic">none</span></td>
</tr>
<tr>
<td><code class="cm-s-idea">case insensitive</code></td>
<td>{{i-code}}case insensitive{{end-i-code}}</td>
<td>Match may be any case</td>
<td>/&lt;your regex&gt;/i</td>
</tr>
<tr>
<td><code class="cm-s-idea">exact</code></td>
<td>{{i-code}}exact{{end-i-code}}</td>
<td>An exact statement matches a whole line exactly, nothing before, nothing after</td>
<td>/^&lt;your regex&gt;$/</td>
</tr>
@ -77,70 +99,49 @@ using case insensitive
</table>
<p>To match any variation of hello world, we would then do the following:</p>
<pre class="tutorial-code"><code class="cm-s-idea">using case insensitive matching
{{p-code}}using case insensitive matching
match "hello world"
</code></pre>
{{end-p-code}}
<br>
<p class="font-weight-bold" id="tut-multiple-match">3. Matching multiple items</p>
<p>H2R comes with 2 options to match multiple items in a row. The first is to simply write multiple separate <code class="cm-s-idea">match</code> statements like:</p>
<pre class="tutorial-code">
<code class="cm-s-idea">match "hello"
<p>H2R comes with 2 options to match multiple items in a row. The first is to simply write multiple separate {{i-code}}match{{end-i-code}} statements like:</p>
{{p-code}}match "hello"
match " "
match "world"
</code></pre>
<p>However, you can also use a comma, <code class="cm-s-idea">and</code>, or <code class="cm-s-idea">then</code> for a more concise match.
<span class="tutorial-code"><code class="cm-s-idea">
match "hello", " ", "world"
</code></span>
{{end-p-code}}
<p>However, you can also use a comma, {{i-code}}and{{end-i-code}}, or {{i-code}}then{{end-i-code}} for a more concise match.
{{s-code}}match "hello", " ", "world"{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match "hello" and " " and "world"
</code></span>
{{s-code}}match "hello" and " " and "world"{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match "hello" then " " then "world"
</code></span>
{{s-code}}match "hello" then " " then "world"{{end-s-code}}
or any combination like
<span class="tutorial-code"><code class="cm-s-idea">
match "hello", " " and then "world"
</code></span>
{{s-code}}match "hello", " " and then "world"{{end-s-code}}</p>
<br>
<p class="font-weight-bold" id="tut-optionality">4. Optionality</p>
<p>Sometimes you wish to match something that may or may not exist. In H2R, this is done via the <code class="cm-s-idea">optional</code>, <code class="cm-s-idea">optionally</code>, <code class="cm-s-idea">possibly</code> or <code class="cm-s-idea">maybe</code> keyword.
<span class="tutorial-code"><code class="cm-s-idea">
optionally match "hello world"
</code></span>
will match 0 or 1 "hello world"'s. This can be used alongside matching multiple statements in a single <code class="cm-s-idea">match</code> statement.
<span class="tutorial-code"><code class="cm-s-idea">
match "hello", maybe " ", "world"
</code></span>
will match "hello", an optional space if it exists, and "world". However, the start <code class="cm-s-idea">optional</code> is for the entire match statement. Thus,
<span class="tutorial-code"><code class="cm-s-idea">
possibly match "hello", " ", then "world"
</code></span>
will actually make the whole "hello world" an optional match rather than just the first "hello". If you want to make the first match optional but keep the rest required, place the <code class="cm-s-idea">optional</code> immediately after the <code class="cm-s-idea">match</code>.</p>
<p>Sometimes you wish to match something that may or may not exist. In H2R, this is done via the {{i-code}}optional{{end-i-code}}, {{i-code}}optionally{{end-i-code}}, {{i-code}}possibly{{end-i-code}} or {{i-code}}maybe{{end-i-code}} keyword.
{{s-code}}optionally match "hello world"{{end-s-code}}
will match 0 or 1 "hello world"'s. This can be used alongside matching multiple statements in a single {{i-code}}match{{end-i-code}} statement.
{{s-code}}match "hello", maybe " ", "world"{{end-s-code}}
will match "hello", an optional space if it exists, and "world". However, the start {{i-code}}optional{{end-i-code}} is for the entire match statement. Thus,
{{s-code}}possibly match "hello", " ", then "world"{{end-s-code}}
will actually make the whole "hello world" an optional match rather than just the first "hello". If you want to make the first match optional but keep the rest required, place the {{i-code}}optional{{end-i-code}} immediately after the {{i-code}}match{{end-i-code}}.</p>
<br>
<p class="font-weight-bold" id="tut-negation">5. Negation</p>
<p>You can negate a match with the operator <code class="cm-s-idea">not</code>
<span class="tutorial-code"><code class="cm-s-idea">
match not "hello world"
</code></span>
<p>You can negate a match with the operator {{i-code}}not{{end-i-code}}
{{s-code}}match not "hello world"{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match anything but "hello world"
</code></span>
{{s-code}}match anything but "hello world"{{end-s-code}}
will match everything except for "hello world".</p>
<br>
<p class="font-weight-bold" id="tut-other-match">6. Other matching specifiers</p>
<p>Many times you don't know exactly what you wish to match. H2R comes with many specifiers that you can use for your matching. For example, you may wish to match any word. You can do that with:
<span class="tutorial-code"><code class="cm-s-idea">
match a word
</code></span>
The <code class="cm-s-idea">a</code> or <code class="cm-s-idea">an</code> is optional. The possible specifiers that H2R supports are the following:</p>
{{s-code}}match a word{{end-s-code}}
The {{i-code}}a{{end-i-code}} or {{i-code}}an{{end-i-code}} is optional. The possible specifiers that H2R supports are the following:</p>
<table class="table table-sm table-striped table-bordered">
<thead>
<tr>
@ -152,71 +153,71 @@ match a word
</thead>
<tbody>
<tr>
<td><code class="cm-s-idea">anything</code></td>
<td>{{i-code}}anything{{end-i-code}}</td>
<td>Matches any character</td>
<td>.</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">word(s)</code></td>
<td>{{i-code}}word(s){{end-i-code}}</td>
<td>Matches many a-z, A-Z, _, or digit characters</td>
<td>\w+</td>
<td>For a-z only, use <code class="cm-s-idea">letter(s)</code></td>
<td>For a-z only, use {{i-code}}letter(s){{end-i-code}}</td>
</tr>
<tr>
<td><code class="cm-s-idea">letter(s)</code></td>
<td>{{i-code}}letter(s){{end-i-code}}</td>
<td>Matches any letter character</td>
<td>[a-zA-Z]</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">number(s)</code></td>
<td>{{i-code}}number(s){{end-i-code}}</td>
<td>Matches a string of digit characters</td>
<td>\d+</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">digit(s)</code></td>
<td>{{i-code}}digit(s){{end-i-code}}</td>
<td>Matches any digit character</td>
<td>\d</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">integer(s)</code></td>
<td>{{i-code}}integer(s){{end-i-code}}</td>
<td>Matches an integer</td>
<td>[+-]?\d+</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">decimal(s)</code></td>
<td>{{i-code}}decimal(s){{end-i-code}}</td>
<td>Matches digits, an optional decimal point and more digits</td>
<td>[+-]?((\d+[,.]?\d*)|([,.]\d+))</td>
<td>Supports both "," and "." decimal points</td>
</tr>
<tr>
<td><code class="cm-s-idea">character(s)</code></td>
<td>{{i-code}}character(s){{end-i-code}}</td>
<td>Matches a-z, A-Z, _, or digits</td>
<td>\w</td>
<td>For a-z only, use <code class="cm-s-idea">letter(s)</code></td>
<td>For a-z only, use {{i-code}}letter(s){{end-i-code}}</td>
</tr>
<tr>
<td><code class="cm-s-idea">whitespace(s)</code></td>
<td>{{i-code}}whitespace(s){{end-i-code}}</td>
<td>Matches any whitespace character</td>
<td>\s</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">(word )boundary</code></td>
<td>{{i-code}}(word )boundary{{end-i-code}}</td>
<td>Boundary between a word</td>
<td>\b</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">line feed</code>/<code class="cm-s-idea">newline</code></td>
<td>{{i-code}}line feed{{end-i-code}}/{{i-code}}newline{{end-i-code}}</td>
<td>Matches a newline</td>
<td>\n</td>
<td>&nbsp;</td>
</tr>
<tr>
<td><code class="cm-s-idea">carriage return</code></td>
<td>{{i-code}}carriage return{{end-i-code}}</td>
<td>Matches a carriage return</td>
<td>\r</td>
<td>&nbsp;</td>
@ -225,132 +226,130 @@ match a word
</table>
<p>You can also create ranges of characters to match. Say for example, you wanted to match any characters between a and z, you could write any of the following:
<span class="tutorial-code"><code class="cm-s-idea">
match from "a" to "z" // "from" is optional
</code></span>
{{s-code}}match from "a" to "z" // "from" is optional{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match between "a" and "z" // "between" is optional
</code></span>
{{s-code}}match between "a" and "z" // "between" is optional{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match "a" ... "z" // can use "..." or ".."
</code></span>
{{s-code}}match "a" ... "z" // can use "..." or ".."{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match "a" - "z"
</code></span>
{{s-code}}match "a" - "z"{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match "a" through "z" // can also use thru
</code></span>
{{s-code}}match "a" through "z" // can also use thru{{end-s-code}}</p>
<br>
<p class="font-weight-bold" id="tut-repeition">7. Repetition</p>
<p>H2R supports 2 types of repetition: single match repetition, or grouped repetition. When using <code class="cm-s-idea">match</code> you can specify the number of captures you want just before the text to capture.
<span class="tutorial-code"><code class="cm-s-idea">
match 2 digits
</code></span>
<p>H2R supports 2 types of repetition: single match repetition, or grouped repetition. When using {{i-code}}match{{end-i-code}} you can specify the number of captures you want just before the text to capture.
{{s-code}}match 2 digits{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match exactly 2 digits
</code></span>
{{s-code}}match exactly 2 digits{{end-s-code}}
will match any 2 digits in a row. You can also specify a range you wish to capture
<span class="tutorial-code"><code class="cm-s-idea">
match 2 ... 5 digits
</code></span>
{{s-code}}match 2 ... 5 digits{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match 2 to 5 digits
</code></span>
{{s-code}}match 2 to 5 digits{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match between 2 to 5 digits
</code></span>
will match 2, 3, 4, or 5 digits. You can specify if the final number is exclusive with the <code class="cm-s-idea">exclusive</code> or <code class="cm-s-idea">inclusive</code> keywords.
<span class="tutorial-code"><code class="cm-s-idea">
match 2 to 5 exclusive digits
</code></span>
{{s-code}}match between 2 to 5 digits{{end-s-code}}
will match 2, 3, 4, or 5 digits. You can specify if the final number is exclusive with the {{i-code}}exclusive{{end-i-code}} or {{i-code}}inclusive{{end-i-code}} keywords.
{{s-code}}match 2 to 5 exclusive digits{{end-s-code}}
will only match up to 4 digits. You can also choose to leave the end unspecified.
<span class="tutorial-code"><code class="cm-s-idea">
match 2+ digits
</code></span>
{{s-code}}match 2+ digits{{end-s-code}}
or
<span class="tutorial-code"><code class="cm-s-idea">
match 2 or more digits
</code></span>
will match 2 or more digits. Repeition can be chained with the <code class="cm-s-idea">and then</code> keywords or the <code class="cm-s-idea">optional</code> keyword. For example:
<span class="tutorial-code"><code class="cm-s-idea">
match 1+ digits then optionally "." then optionally 0...8 digits
</code></span>
{{s-code}}match 2 or more digits{{end-s-code}}
will match 2 or more digits. Repetition can be chained with the {{i-code}}and then{{end-i-code}} keywords or the {{i-code}}optional{{end-i-code}} keyword. For example:
{{s-code}}match 1+ digits then optionally "." then optionally 0...8 digits{{end-s-code}}
Suppose you want to repeat a group of these match statements. You can group a repetition using the <code class="cm-s-idea">repeat</code> keyword. Everything underneath that is tabbed (scoped) will be repeated. By default, this will match 0 or more of the following statements.</p>
<pre class="tutorial-code">
<code class="cm-s-idea">repeat
Suppose you want to repeat a group of these match statements. You can group a repetition using the {{i-code}}repeat{{end-i-code}} keyword. Everything underneath that is tabbed (scoped) will be repeated. By default, this will match 0 or more of the following statements.</p>
{{p-code}}repeat
match "Hello "
match "World"
</code></pre>
<p>Will match 0 or more "Hello "s, but only 1 "World". The same qualifiers that exist for <code class="cm-s-idea">match</code> statements also exist for <code class="cm-s-idea">repeat</code> statements.</p>
<pre class="tutorial-code">
<code class="cm-s-idea">optionally repeat 3...7 times
{{end-p-code}}
<p>Will match 0 or more "Hello "s, but only 1 "World". The same qualifiers that exist for {{i-code}}match{{end-i-code}} statements also exist for {{i-code}}repeat{{end-i-code}} statements.</p>
{{p-code}}optionally repeat 3...7 times
match "Hello World"
</code></pre>
<p>Will potentially match "Hello World" between 3 and 7 times. H2R also supports the following for numbers: <code class="cm-s-idea">One, Two, Three, Four, Five, Six, Seven, Eight, Nine, and Ten</code></p>
{{end-p-code}}
<p>Will potentially match "Hello World" between 3 and 7 times. H2R also supports the following for numbers: {{i-code}}One, Two, Three, Four, Five, Six, Seven, Eight, Nine, and Ten{{end-i-code}}</p>
<br>
<p class="font-weight-bold" id="tut-grouping">8. Grouping</p>
<p>Just like regular expressions, capture groups are supported in H2R. Each group is defined using the <code class="cm-s-idea">create a group</code> keyphrase.</p>
<pre class="tutorial-code">
<code class="cm-s-idea">create a group
<p>Just like regular expressions, capture groups are supported in H2R. Each group is defined using the {{i-code}}create a group{{end-i-code}} keyphrase.</p>
{{p-code}}create a group
match "Hello World"
</code></pre>
<p>This will create a non-named captured group, equivalent to the regular expression "/(Hello World)/". A non-named captured group will show up in your chosen language's matches, however will not be given a name. To access this match, you will need to know the index of the group. Most regular expression engines support named capture groups, and H2R highly recommends using this feature. If you wish to do so, simply give it a name:<p>
<pre class="tutorial-code">
<code class="cm-s-idea">create a group called TestGroup
{{end-p-code}}
<p>This will create a non-named captured group, equivalent to the regular expression "/(Hello World)/". A non-named captured group will show up in your chosen language's matches, however will not be given a name. To access this match, you will need to know the index of the group. Most regular expression engines support named capture groups, and H2R highly recommends using this feature. If you wish to do so, simply give it a name:</p>
{{p-code}}create a group called TestGroup
match "Hello World"
</code></pre>
<p>In most languages, a named group can be accessed through the match result's group list. Take for example, in JavaScript,
<pre class="tutorial-code">
<code class="cm-s-idea">"hello".match(/(?&lt;TestGroup&gt;hello)/).groups</code>
</pre>
{{end-p-code}}
<p>In most languages, a named group can be accessed through the match result's group list. Take for example, in JavaScript,</p>
{{p-code}}"hello".match(/(?&lt;TestGroup&gt;hello)/).groups{{end-p-code}}
<p>Will return an object with {TestGroup: "hello"}. For another example, check out <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/match#Using_named_capturing_groups">MDN web docs</a>. Groups can also be optional.</p>
<pre class="tutorial-code">
<code class="cm-s-idea">create an optional group
{{p-code}}create an optional group
match "Hello World"
</code></pre>
{{end-p-code}}
<p>And groups may be nested</p>
<pre class="tutorial-code">
<code class="cm-s-idea">create a group called TestGroup
{{p-code}}create a group called TestGroup
match "Hello"
create a group called InnerGroup
match "World"
</code></pre>
{{end-p-code}}
<p>The regular expression returned by this will be "/(?&lt;TestGroup&gt;Hello(?&lt;InnerGroup&gt;World))/". Again, in JavaScript, the following</p>
<pre class="tutorial-code">
<code class="cm-s-idea">"HelloWorld".match(/(?&lt;TestGroup&gt;Hello(?&lt;InnerGroup&gt;World))/).groups</code>
</pre>
{{p-code}}"HelloWorld".match(/(?&lt;TestGroup&gt;Hello(?&lt;InnerGroup&gt;World))/).groups{{end-p-code}}
<p>Will return an object with {TestGroup: "HelloWorld", InnerGroup: "World"}.</p>
<br>
<h3 id="tut-final">Putting it all together</h3>
<p>Grouping, repetition, and matching are the 3 primary elements that make up H2R. They can be combined in any way to generate a regular expression. See the <a href="index.html">main page</a> for an example that combines all above to parse a URL.</p>
<h3>Miscellaneous features</h3>
<h3>Advanced features</h3>
<p class="font-weight-bold" id="tut-backref">Backreferences</p>
<p>Sometimes you may wish to match the same text that a previous group matched. Take for example matching opening and closing XML tags such as &lt;hello&gt;world&lt;/hello&gt;:</p>
{{p-code}}match "&lt;"
create a group called opening_tag
match a word or digit or "_" or "-"
match "&gt;"
match 0+ not "&lt;"
match "&lt;/"
create a group called closing_tag
match a word or digit or "_" or "-"
match "&gt;"
{{end-p-code}}
<p>To ensure you matched the same opening tag as closing tag, you'll normally need to perform an additional step afterwards by checking the capture groups are equal. However, in most regex engines, this can be performed automatically through backreferences. Backreferences effectively re-capture the same group. Human2Regex allows you to {{i-code}}rerun{{end-i-code}} or {{i-code}}recapture{{end-i-code}} a previous group.</p>
{{p-code}}match "&lt;"
create a group called tag
match a word or digit or "_" or "-"
match "&gt;"
match 0+ not "&lt;"
match "&lt;/"
recapture tag
match "&gt;"
{{end-p-code}}
<p>The regex will only successfully match if both the tags are the same. One thing to note however, the first group must be captured. For a "function"-like capture see regex subroutines (not yet implemented).</p>
<p>To allow for a more natural language, {{i-code}}recapture the group {{end-i-code}} and {{i-code}}recapture the group called{{end-i-code}} are also supported.</p>
<p class="font-weight-bold" id="tut-if">If statements</p>
<p>Certain regex languages support if statements which can be used to simplify statements. Human2Regex supports {{i-code}}if{{end-i-code}}, {{i-code}}else if{{end-i-code}}, and {{i-code}}else{{end-i-code}} statements. Inside each {{i-code}}if{{end-i-code}}, you can recapture a group or run a new match. This is done as the following:</p>
{{p-code}}if match "hello" then optionally "world"
match "!"
else if match "goodbye" then optionally "world"
match "!"
{{end-p-code}}
<p>or</p>
{{p-code}}create a group called tag
match "&lt;" then a word or digit or "_" or "-" then "&gt;"
//do we have another tag? keep matching the same tags
if rerun tag
repeat
recapture tag
//ignore everything else
else
match 0+ any thing
{{end-p-code}}
<p class="font-weight-bold" id="tut-unicode">Unicode character properties</p>
<p>You can match specific unicode sequences using <code class="cm-s-idea">"\uXXXX"
</code> or <code class="cm-s-idea">"\UXXXXXXXX"</code> where X is a hexadecimal character.
<span class="tutorial-code"><code class="cm-s-idea">
match "\u0669" // matches arabic digit 9 "&#x0669;"
</code></span>
Unicode character classes/scripts can be matched using the <code class="cm-s-idea">unicode</code> keyword.
<span class="tutorial-code"><code class="cm-s-idea">
match unicode "Latin" // matches any latin character
</code></span>
<span class="tutorial-code"><code class="cm-s-idea">
match unicode "N" // matches any number character
</code></span>
<p>You can match specific unicode sequences using {{i-code}}"\uXXXX"
{{end-i-code}} or {{i-code}}"\UXXXXXXXX"{{end-i-code}} where X is a hexadecimal character.
{{s-code}}match "\u0669" // matches arabic digit 9 "&#x0669;"{{end-s-code}}
Unicode character classes/scripts can be matched using the {{i-code}}unicode{{end-i-code}} keyword.
{{s-code}}match unicode "Latin" // matches any latin character{{end-s-code}}
{{s-code}}match unicode "N" // matches any number character{{end-s-code}}
The following Unicode class specifiers are available:</p>
<table class="table table-sm table-striped table-bordered">
<thead>

View File

@ -1,6 +0,0 @@
{
"page": {
"title": "Human2Regex Tutorial",
"description": "Create regular expressions with natural, human language"
}
}

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* Includes all Concrete Syntax Trees for Human2Regex
@ -7,6 +7,7 @@
import { regexEscape, removeQuotes, hasFlag, combineFlags, isSingleRegexCharacter, first, last, unusedParameter, makeFlag, append } from "./utilities";
import { IToken } from "chevrotain";
import { minimizeMatchString, groupIfRequired, dontClobberRepetition } from "./generator_helper";
/**
* List of regular expression dialects we support
@ -63,31 +64,54 @@ const unicode_script_codes = [
];
/**
* The base concrete syntax tree class
* Context for validation
*
* @remarks Currently only used to validate groups
* @internal
*/
export abstract class H2RCST {
export class GeneratorContext {
public groups: { [ key: string ]: { startLine: number, startColumn: number, length: number } } = {};
/**
* Constructor for H2RCST
* Checks to see if we already have a group defined
*
* @param tokens Tokens used to calculate where an error occured
* @internal
* @param identifier the group name
* @returns true if the group name already exists
*/
constructor(public tokens: IToken[]) {
/* empty */
public hasGroup(identifier: string): boolean {
return Object.prototype.hasOwnProperty.call(this.groups, identifier);
}
/**
 * Records a group name together with the source span it was defined at
 *
 * @param identifier the group name
 * @param tokens the tokens of the group definition; the first and last token give the recorded position and length
 */
public addGroup(identifier: string, tokens: IToken[]): void {
    const head = first(tokens);
    const tail = last(tokens);

    // positions the lexer did not record are stored as NaN
    const startLine = head.startLine ?? NaN;
    const startColumn = head.startColumn ?? NaN;

    // fall back to the last token's start when it has no end offset
    const length = (tail.endOffset ?? tail.startOffset) - head.startOffset;

    this.groups[identifier] = { startLine, startColumn, length };
}
}
interface Generates {
/**
* Validate that this is both valid and can be generated in the specified language
*
* @remarks There is no guarantee toRegex will work unless validate returns no errors
*
* @param language the regex dialect we're validating
* @param context the generator context
* @returns A list of errors
* @public
*/
public abstract validate(language: RegexDialect): ISemanticError[];
validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
/**
* Generate a regular expression fragment based on this syntax tree
@ -98,6 +122,26 @@ export abstract class H2RCST {
* @returns a regular expression fragment
* @public
*/
toRegex(language: RegexDialect): string;
}
/**
* The base concrete syntax tree class
*
* @internal
*/
export abstract class H2RCST implements Generates {
/**
* Constructor for H2RCST
*
* @param tokens Tokens used to calculate where an error occured
* @internal
*/
constructor(public tokens: IToken[]) {
/* empty */
}
public abstract validate(language: RegexDialect, context: GeneratorContext): ISemanticError[];
public abstract toRegex(language: RegexDialect): string;
/**
@ -186,7 +230,7 @@ export class MatchSubStatementValue {
*
* @internal
*/
export class MatchStatementValue {
export class MatchStatementValue implements Generates {
/**
* Constructor for MatchStatementValue
@ -198,6 +242,21 @@ export class MatchStatementValue {
constructor(public optional: boolean, public statement: MatchSubStatementCST) {
/* empty */
}
/**
 * Validates the wrapped match sub-statement
 *
 * @param language the regex dialect we're validating
 * @param context the generator context
 * @returns the errors reported by the wrapped statement
 */
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
    const { statement } = this;

    return statement.validate(language, context);
}
/**
 * Generates the regular expression fragment for this match value
 *
 * @param language the regex dialect to generate for
 * @returns the fragment of the wrapped statement, made optional when this value is optional
 */
public toRegex(language: RegexDialect): string {
    const fragment = this.statement.toRegex(language);

    // an optional value has to be a single unit before "?" is appended
    return this.optional ? groupIfRequired(fragment) + "?" : fragment;
}
}
/**
@ -227,11 +286,11 @@ export class MatchSubStatementCST extends H2RCST {
super(tokens);
}
public validate(language: RegexDialect): ISemanticError[] {
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = [];
if (this.count) {
append(errors, this.count.validate(language));
append(errors, this.count.validate(language, context));
}
for (const value of this.values) {
@ -353,56 +412,16 @@ export class MatchSubStatementCST extends H2RCST {
}
}
let ret = "";
let require_grouping = false;
let dont_clobber_plus = false;
if (matches.length === 1) {
ret = first(matches);
if (ret.endsWith("+")) {
dont_clobber_plus = true;
}
}
else {
ret = minimizeMatchString(matches);
if (ret.length > 1 &&
(!ret.startsWith("(") || !ret.endsWith("["))) {
require_grouping = true;
}
}
let ret = minimizeMatchString(matches);
if (this.count) {
if (dont_clobber_plus) {
const clobber = this.count.toRegex(language);
// + can be ignored as well as a count as long as that count is > 0
switch (clobber) {
case "*":
case "?":
ret = "(?:" + ret + ")" + clobber;
break;
case "+":
// ignore
break;
default:
if (clobber.startsWith("{0")) {
ret = "(?:" + ret + ")" + clobber;
}
else {
// remove + and replace with count
ret.substring(0, ret.length - 1) + clobber;
}
break;
}
if (matches.length === 1) {
// we don't group if there's only 1 element
// but we need to make sure we don't add an additional + or *
ret = dontClobberRepetition(ret, this.count.toRegex(language));
}
else {
if (require_grouping) {
ret = "(?:" + ret + ")";
}
ret += this.count.toRegex(language);
ret = groupIfRequired(ret) + this.count.toRegex(language);
}
}
@ -427,8 +446,9 @@ export class UsingStatementCST extends H2RCST {
super(tokens);
}
public validate(language: RegexDialect): ISemanticError[] {
unusedParameter(language, "Using Statement does not change based on language");
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
unusedParameter(language, "Count does not need checking");
unusedParameter(context, "Context is not needed");
const errors: ISemanticError[] = [];
let flag = this.flags[0];
@ -490,15 +510,13 @@ export class CountSubStatementCST extends H2RCST {
super(tokens);
}
public validate(language: RegexDialect): ISemanticError[] {
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
unusedParameter(language, "Count does not need checking");
unusedParameter(context, "Context is not needed");
const errors: ISemanticError[] = [];
if (this.from < 0) {
errors.push(this.error("Value cannot be negative"));
}
else if (this.to !== null && ((this.opt === "exclusive" && (this.to-1) <= this.from) || this.to <= this.from)) {
if (this.to !== null && ((this.opt === "exclusive" && (this.to-1) <= this.from) || this.to <= this.from)) {
errors.push(this.error("Values must be in range of eachother"));
}
@ -548,49 +566,27 @@ export class MatchStatementCST extends StatementCST {
* Constructor for MatchStatementCST
*
* @param tokens Tokens used to calculate where an error occured
* @param matches
* @param matches the list of matches
*/
constructor(tokens: IToken[], private completely_optional: boolean, private matches: MatchStatementValue[]) {
super(tokens);
}
public validate(language: RegexDialect): ISemanticError[] {
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = [];
for (const match of this.matches) {
append(errors, match.statement.validate(language));
append(errors, match.statement.validate(language, context));
}
return errors;
}
public toRegex(language: RegexDialect): string {
let final_matches = this.matches.map((x) => {
let match_stmt = x.statement.toRegex(language);
// need to group if optional and ungrouped
if (x.optional) {
if (!isSingleRegexCharacter(match_stmt)) {
// don't re-group a group
if (match_stmt[0] !== "(" && match_stmt[match_stmt.length-1] !== ")") {
match_stmt = "(?:" + match_stmt + ")";
}
}
match_stmt += "?";
}
return match_stmt;
}).join("");
let final_matches = this.matches.map((x) => x.toRegex(language)).join("");
if (this.completely_optional) {
if (!isSingleRegexCharacter(final_matches)) {
// don't re-group a group
if (final_matches[0] !== "(" && final_matches[final_matches.length-1] !== ")") {
final_matches = "(?:" + final_matches + ")";
}
}
final_matches += "?";
final_matches = groupIfRequired(final_matches) + "?";
}
return final_matches;
@ -616,22 +612,22 @@ export class RepeatStatementCST extends StatementCST {
super(tokens);
}
public validate(language: RegexDialect): ISemanticError[] {
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = [];
if (this.count !== null) {
append(errors, this.count.validate(language));
append(errors, this.count.validate(language, context));
}
for (const statement of this.statements) {
append(errors, statement.validate(language));
append(errors, statement.validate(language, context));
}
return errors;
}
public toRegex(language: RegexDialect): string {
let str = "(?:" + this.statements.map((x) => x.toRegex(language)).join("") + ")";
let str = groupIfRequired(this.statements.map((x) => x.toRegex(language)).join(""));
if (this.count) {
str += this.count.toRegex(language);
@ -659,7 +655,7 @@ export class RepeatStatementCST extends StatementCST {
* @internal
*/
export class GroupStatementCST extends StatementCST {
/**
* Constructor for GroupStatementCST
*
@ -673,16 +669,21 @@ export class GroupStatementCST extends StatementCST {
super(tokens);
}
public validate(language: RegexDialect): ISemanticError[] {
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors : ISemanticError[] = [];
// All languages currently support named groups
//if (false) {
// errors.push(this.error("This language does not support named groups"));
//}
// named groups must be unique: report a redefinition, otherwise remember where this one was defined
if (this.name !== null) {
    if (context.hasGroup(this.name)) {
        const past_group = context.groups[this.name];
        // position is reported as line:column-line:column (the stored length is added to the start column)
        errors.push(this.error(`Group with name "${this.name}" was already defined here: ${past_group.startLine}:${past_group.startColumn}-${past_group.startLine}:${past_group.startColumn + past_group.length}`));
    }
    else {
        context.addGroup(this.name, this.tokens);
    }
}
for (const statement of this.statements) {
append(errors, statement.validate(language));
append(errors, statement.validate(language, context));
}
return errors;
@ -711,6 +712,195 @@ export class GroupStatementCST extends StatementCST {
}
}
/**
 * Concrete Syntax Tree for a Backreference statement
 *
 * @internal
 */
export class BackrefStatementCST extends StatementCST {
    /**
     * Constructor for BackrefStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param optional is this backref optional
     * @param count optional number of times to repeat
     * @param name the group name to call
     */
    constructor(tokens: IToken[], private optional: boolean, private count: CountSubStatementCST | null, private name: string) {
        super(tokens);
    }

    /**
     * Checks that the referenced group was defined earlier and that the count (if any) is valid
     *
     * @param language the regex dialect we're validating
     * @param context the generator context holding the groups defined so far
     * @returns A list of errors
     */
    public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
        const errors: ISemanticError[] = [];

        if (!context.hasGroup(this.name)) {
            errors.push(this.error(`Cannot call group with name "${this.name}" as it was never previously defined`));
        }

        if (this.count !== null) {
            append(errors, this.count.validate(language, context));
        }

        return errors;
    }

    /**
     * Generates a named backreference, with its repetition and optionality applied
     *
     * @param language the regex dialect to generate for
     * @returns a regular expression fragment
     */
    public toRegex(language: RegexDialect): string {
        let str = "";

        switch (language) {
            case RegexDialect.Python:
                str = `(?P=${this.name})`;
                break;

            default:
                // \k<name> is the named backreference; \g<name> would be a
                // subroutine call in PCRE and is not understood by JavaScript
                str = `\\k<${this.name}>`;
                break;
        }

        if (this.count) {
            str += this.count.toRegex(language);

            // group for optionality because count would be incorrect otherwise
            if (this.optional) {
                str = "(?:" + str + ")?";
            }
        }
        else if (this.optional) {
            // keep the backreference and only make it optional
            str += "?";
        }

        return str;
    }
}
/**
 * Concrete Syntax Tree for an If statement whose condition is a pattern
 *
 * @internal
 */
export class IfPatternStatementCST extends StatementCST {
    /**
     * Constructor for IfPatternStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param matches list of matches to test against
     * @param true_statements true path
     * @param false_statements false path
     */
    constructor(tokens: IToken[], private matches: MatchStatementValue[], private true_statements: StatementCST[], private false_statements: StatementCST[]) {
        super(tokens);
    }

    /**
     * Rejects dialects without pattern conditionals, then validates the condition and both paths
     *
     * @param language the regex dialect we're validating
     * @param context the generator context
     * @returns A list of errors
     */
    public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
        const errors: ISemanticError[] = [];

        switch (language) {
            case RegexDialect.Java:
            case RegexDialect.JS:
                errors.push(this.error("This language does not support conditionals"));
                break;

            case RegexDialect.Python:
                errors.push(this.error("This language does not support pattern conditionals"));
                break;

            default:
                break;
        }

        // condition first, then the true path, then the false path
        const children = [ ...this.matches, ...this.true_statements, ...this.false_statements ];
        for (const child of children) {
            append(errors, child.validate(language, context));
        }

        return errors;
    }

    /**
     * Generates a conditional of the form (?(condition)true|false)
     *
     * @param language the regex dialect to generate for
     * @returns a regular expression fragment
     */
    public toRegex(language: RegexDialect): string {
        const render = (nodes: { toRegex(language: RegexDialect): string }[]): string => {
            return nodes.map((node) => node.toRegex(language)).join("");
        };

        const condition = render(this.matches);
        const when_true = groupIfRequired(render(this.true_statements));

        // no else branch: emit the conditional without an alternative
        if (this.false_statements.length === 0) {
            return `(?(${condition})${when_true})`;
        }

        const when_false = groupIfRequired(render(this.false_statements));

        return `(?(${condition})${when_true}|${when_false})`;
    }
}
/**
 * Concrete Syntax Tree for an If statement whose condition is a group identifier
 *
 * @internal
 */
export class IfIdentStatementCST extends StatementCST {
    /**
     * Constructor for IfIdentStatementCST
     *
     * @param tokens Tokens used to calculate where an error occurred
     * @param identifier the group identifier to check
     * @param true_statements true path
     * @param false_statements false path
     */
    constructor(tokens: IToken[], private identifier: string, private true_statements: StatementCST[], private false_statements: StatementCST[]) {
        super(tokens);
    }

    /**
     * Rejects dialects without conditionals, checks the group exists, then validates both paths
     *
     * @param language the regex dialect we're validating
     * @param context the generator context holding the groups defined so far
     * @returns A list of errors
     */
    public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
        const errors: ISemanticError[] = [];

        const lacks_conditionals = language === RegexDialect.Java || language === RegexDialect.JS;
        if (lacks_conditionals) {
            errors.push(this.error("This language does not support conditionals"));
        }

        const group_known = context.hasGroup(this.identifier);
        if (!group_known) {
            errors.push(this.error(`Group with name "${this.identifier}" does not exist`));
        }

        // true path first, then the false path
        const branches = [ ...this.true_statements, ...this.false_statements ];
        for (const branch of branches) {
            append(errors, branch.validate(language, context));
        }

        return errors;
    }

    /**
     * Generates a conditional of the form (?(group)true|false)
     *
     * @param language the regex dialect to generate for
     * @returns a regular expression fragment
     */
    public toRegex(language: RegexDialect): string {
        const render = (nodes: StatementCST[]): string => {
            return nodes.map((node) => node.toRegex(language)).join("");
        };

        // be more clear with languages that support it
        const condition = (language === RegexDialect.Boost) ? "<" + this.identifier + ">" : this.identifier;

        const when_true = groupIfRequired(render(this.true_statements));

        // no else branch: emit the conditional without an alternative
        if (this.false_statements.length === 0) {
            return `(?(${condition})${when_true})`;
        }

        const when_false = groupIfRequired(render(this.false_statements));

        return `(?(${condition})${when_true}|${when_false})`;
    }
}
/**
* Concrete Syntax Tree for a regular expression
*
@ -730,115 +920,20 @@ export class RegularExpressionCST extends H2RCST {
super(tokens);
}
public validate(language: RegexDialect): ISemanticError[] {
const errors: ISemanticError[] = this.usings.validate(language);
public validate(language: RegexDialect, context: GeneratorContext): ISemanticError[] {
const errors: ISemanticError[] = this.usings.validate(language, context);
for (const statement of this.statements) {
append(errors, statement.validate(language));
append(errors, statement.validate(language, context));
}
return errors;
}
public toRegex(language: RegexDialect): string {
const modifiers = this.usings.toRegex(language);
const regex = this.statements.map((x) => x.toRegex(language)).join("");
return modifiers.replace("{regex}", regex);
}
}
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array of matches
* @internal
*/
export function minimizeMatchString(arr: string[]): string {
return minMatchString(arr, 0);
}
/**
* Minimizes the match string by finding duplicates or substrings in the array
*
* @param arr the array
* @param depth must be 0 for initial call
* @internal
*/
function minMatchString(arr: string[], depth: number = 0): string {
// base case: arr is empty
if (arr.length === 0) {
return "";
}
// base case: arr has 1 element (must have at least 2, so this means this value is optional)
if (arr.length === 1) {
return first(arr) + "?";
}
// remove duplicates
arr = [ ...new Set(arr) ];
// base case: arr has 1 element (after duplicate removal means this is required)
if (arr.length === 1) {
return first(arr);
}
// base case: arr is all single letters
if (arr.every(isSingleRegexCharacter)) {
return "[" + arr.join("") + "]";
}
// now the real magic begins
// You are not expected to understand this
let longest_begin_substring = first(arr);
let longest_end_substring = first(arr);
for (let i = 1; i < arr.length; i++) {
// reduce longest_substring to match everything
for (let j = 0; j < longest_begin_substring.length; j++) {
if (arr[i].length < j || longest_begin_substring[j] !== arr[i][j]) {
longest_begin_substring = longest_begin_substring.substr(0, j);
break;
}
}
for (let j = 0; j < longest_end_substring.length; j++) {
if (arr[i].length-j < 0 || longest_end_substring[longest_end_substring.length-j-1] !== arr[i][arr[i].length-j-1]) {
longest_end_substring = longest_end_substring.substr(longest_end_substring.length-j, longest_end_substring.length);
break;
}
}
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
break;
}
}
// No matches whatsoever
// *technically* we can optimize further, but that is a VERY non-trivial problem
// For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
if (longest_begin_substring.length === 0 && longest_end_substring.length === 0) {
if (depth > 0) {
return "(?:" + arr.join("|") + ")";
}
else {
return arr.join("|");
}
}
// we have some matches
else {
// remove begin (if exists) and end (if exists) from each element and remove empty strings
const begin_pos = longest_begin_substring.length;
const end_pos = longest_end_substring.length;
const similar_matches: string[] = [];
for (const ele of arr) {
const match = ele.substring(begin_pos, ele.length-end_pos);
if (match.length !== 0) {
similar_matches.push(match);
}
}
return longest_begin_substring + minMatchString(similar_matches, depth + 1) + longest_end_substring;
}
}
}

224
src/generator_helper.ts Normal file
View File

@ -0,0 +1,224 @@
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* Includes helper functions for the Generator
* @packageDocumentation
*/
import { first, isSingleRegexCharacter } from "./utilities";
/**
 * Minimizes the match string by finding duplicates or substrings in the array
 *
 * @param arr the array of matches
 * @returns the only element when there is exactly one, otherwise the minimized combination of all elements
 * @internal
 */
export function minimizeMatchString(arr: string[]): string {
    // a lone match is returned untouched: the recursive minimizer would mark a single element as optional
    return arr.length === 1 ? first(arr) : minMatchString(arr, 0);
}
/**
 * Counts the consecutive backslashes that sit immediately before a position
 *
 * @param text the regex fragment to inspect
 * @param position index whose directly preceding backslashes are counted
 * @returns the number of backslashes directly before position
 * @internal
 */
function countBackslashesBefore(text: string, position: number): number {
    let count = 0;

    while (position - count - 1 >= 0 && text[position - count - 1] === "\\") {
        count++;
    }

    return count;
}

/**
 * Minimizes the match string by finding duplicates or substrings in the array
 *
 * @remarks The shared prefix and suffix are found character by character, not token by token.
 * Escape sequences introduced by a backslash are never split, but fragments that carry
 * quantifiers or multi-character escapes are not analysed any further.
 *
 * @param arr the array
 * @param depth must be 0 for initial call
 * @returns an optimized string
 * @internal
 */
function minMatchString(arr: string[], depth: number = 0): string {
    // base case: arr is empty
    if (arr.length === 0) {
        return "";
    }

    // base case: arr has 1 element (must have at least 2, so this means this value is optional)
    // the element is grouped first so that "?" applies to all of it, not just its last character
    if (arr.length === 1) {
        return groupIfRequired(first(arr)) + "?";
    }

    // remove duplicates
    arr = [ ...new Set(arr) ];

    // base case: arr has 1 element (after duplicate removal means this is required)
    if (arr.length === 1) {
        return first(arr);
    }

    // base case: arr is all single letters
    if (arr.every(isSingleRegexCharacter)) {
        return "[" + arr.join("") + "]";
    }

    // find the longest prefix and suffix shared by every element
    const head = first(arr);
    let prefix_length = head.length;
    let suffix_length = head.length;
    let shortest = head.length;

    for (const ele of arr) {
        shortest = Math.min(shortest, ele.length);

        let p = 0;
        while (p < prefix_length && p < ele.length && ele[p] === head[p]) {
            p++;
        }
        prefix_length = p;

        let s = 0;
        while (s < suffix_length && s < ele.length && ele[ele.length - 1 - s] === head[head.length - 1 - s]) {
            s++;
        }
        suffix_length = s;
    }

    // a prefix ending in an unpaired backslash would cut an escape sequence in half
    if (countBackslashesBefore(head, prefix_length) % 2 === 1) {
        prefix_length--;
    }

    // prefix and suffix must not overlap inside the shortest element,
    // otherwise shared characters would be emitted twice
    suffix_length = Math.min(suffix_length, shortest - prefix_length);

    // the first suffix character must not be the escaped half of an escape sequence in any element
    while (suffix_length > 0 && arr.some((ele) => countBackslashesBefore(ele, ele.length - suffix_length) % 2 === 1)) {
        suffix_length--;
    }

    // No matches whatsoever
    // *technically* we can optimize further, but that is a VERY non-trivial problem
    // For example optimizing: [ "a1x1z", "a2y2z", "a3z3z" ] to: "a[123][xyz][123]z"
    if (prefix_length === 0 && suffix_length === 0) {
        return depth > 0 ? "(?:" + arr.join("|") + ")" : arr.join("|");
    }

    // we have some matches: strip the shared parts and minimize what is left
    const prefix = head.substring(0, prefix_length);
    const suffix = head.substring(head.length - suffix_length);

    const remainders: string[] = [];
    let has_empty = false;

    for (const ele of arr) {
        const middle = ele.substring(prefix_length, ele.length - suffix_length);

        if (middle.length === 0) {
            // this element is exactly prefix + suffix, so the middle part must be optional
            has_empty = true;
        }
        else {
            remainders.push(middle);
        }
    }

    let middle = minMatchString(remainders, depth + 1);

    // a single remainder is already made optional by the recursive call;
    // with several remainders the optionality has to be added here
    if (has_empty && remainders.length > 1) {
        middle = groupIfRequired(middle) + "?";
    }

    return prefix + middle + suffix;
}
/**
 * Checks whether the first and last character of a fragment form one matching bracket pair
 *
 * @param fragment fragment that starts with the opening and ends with the closing bracket
 * @param open the opening bracket to count as nesting, or null when nesting cannot occur
 * @param close the closing bracket
 * @returns true if the outer brackets enclose the whole fragment
 * @internal
 */
function isSingleBracketedUnit(fragment: string, open: string | null, close: string): boolean {
    let depth = 0;
    let i = 1;

    // inspect every interior character (everything between the outer brackets)
    for (; i < fragment.length - 1; i++) {
        const ch = fragment[i];

        if (ch === "\\") {
            // skip the escaped character
            i++;
        }
        else if (ch === open) {
            depth++;
        }
        else if (ch === close) {
            depth--;

            // the opening bracket was closed before the end of the fragment
            if (depth < 0) {
                return false;
            }
        }
    }

    // i overshoots when the final bracket itself was escaped
    return depth === 0 && i === fragment.length - 1;
}

/**
 * Groups a regex fragment if it needs to be grouped
 *
 * @param fragment fragment of regular expression to potentially group
 * @returns the fragment itself if it is a single character, a single group, or a single
 * character class; otherwise the fragment wrapped in a non-capturing group
 * @internal
 */
export function groupIfRequired(fragment: string): string {
    if (isSingleRegexCharacter(fragment)) {
        return fragment;
    }

    const opener = fragment[0];
    const closer = fragment[fragment.length - 1];

    // you'll never have a raw [ inside a [], so character classes do not nest
    const already_one_unit =
        (opener === "(" && closer === ")" && isSingleBracketedUnit(fragment, "(", ")")) ||
        (opener === "[" && closer === "]" && isSingleBracketedUnit(fragment, null, "]"));

    return already_one_unit ? fragment : "(?:" + fragment + ")";
}
/**
 * Checks whether a fragment ends with the given character and that character is not escaped
 *
 * @param fragment fragment of regular expression
 * @param symbol the single character to look for at the end
 * @returns true if the fragment ends with symbol preceded by an even number of backslashes
 * @internal
 */
function endsWithUnescaped(fragment: string, symbol: string): boolean {
    if (!fragment.endsWith(symbol)) {
        return false;
    }

    let backslashes = 0;
    for (let i = fragment.length - symbol.length - 1; i >= 0 && fragment[i] === "\\"; i--) {
        backslashes++;
    }

    // an odd number of backslashes means the symbol is a literal, e.g. "\+"
    return backslashes % 2 === 0;
}

/**
 * Applies a repetition to a fragment without stacking it on top of a trailing + or *
 *
 * @param fragment fragment of regular expression
 * @param repetition repetition that may clobber the fragment
 * @returns the fragment with the repetition merged into, or appended after, its trailing quantifier
 * @internal
 */
export function dontClobberRepetition(fragment: string, repetition: string): string {
    const without_last = fragment.substring(0, fragment.length - 1);

    // + can be ignored as well as a count as long as that count is > 0
    if (endsWithUnescaped(fragment, "+")) {
        switch (repetition) {
            case "*":
                // zero or more of "one or more" is "zero or more"
                return without_last + "*";
            case "?":
                // appended directly after the quantifier, where a regex engine reads it as non-greedy
                return fragment + repetition;
            case "+":
                // ignore: already +
                return fragment;
            default:
                // a count that allows zero must wrap the whole fragment so that zero matches stay possible
                if (repetition.startsWith("{0")) {
                    return "(?:" + fragment + ")" + repetition;
                }
                // remove + and replace with count
                return without_last + repetition;
        }
    }

    if (endsWithUnescaped(fragment, "*")) {
        switch (repetition) {
            case "*":
                // ignore: already *
                return fragment;
            case "?":
                // appended directly after the quantifier, where a regex engine reads it as non-greedy
                return fragment + repetition;
            default:
                // remove * and replace with count
                return without_last + repetition;
        }
    }

    // no trailing quantifier (or it is an escaped literal such as "\+"): simply append
    return fragment + repetition;
}

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* Includes all packages

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* The Lexer for Human2Regex

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* The parser for Human2Regex
@ -7,7 +7,7 @@
import { EmbeddedActionsParser, IOrAlt, IToken } from "chevrotain";
import * as T from "./tokens";
import { CountSubStatementCST, UsingFlags, MatchSubStatementType, MatchSubStatementValue, MatchSubStatementCST, UsingStatementCST, RegularExpressionCST, StatementCST, RepeatStatementCST, MatchStatementValue, MatchStatementCST, GroupStatementCST, RegexDialect } from "./generator";
import { CountSubStatementCST, UsingFlags, MatchSubStatementType, MatchSubStatementValue, MatchSubStatementCST, UsingStatementCST, RegularExpressionCST, StatementCST, RepeatStatementCST, MatchStatementValue, MatchStatementCST, GroupStatementCST, RegexDialect, BackrefStatementCST, GeneratorContext, IfPatternStatementCST, IfIdentStatementCST } from "./generator";
import { first, usefulConditional, unusedParameter, CommonError } from "./utilities";
/**
@ -60,7 +60,7 @@ export class ParseResult {
* @public
*/
public validate(language: RegexDialect): CommonError[] {
return this.regexp_cst.validate(language).map(CommonError.fromSemanticError);
return this.regexp_cst.validate(language, new GeneratorContext()).map(CommonError.fromSemanticError);
}
/**
@ -558,12 +558,107 @@ export class Human2RegexParser extends EmbeddedActionsParser {
return new RepeatStatementCST(tokens, optional, count, statements);
});
/**
 * Grammar rule for a back-reference statement (the `Rerun` keyword).
 *
 * Shape accepted by the calls below, in order:
 *   [Optional] Rerun [CountSubStatement] [ [The] Group [Called] ] Identifier EndOfLine
 *
 * Returns a BackrefStatementCST built from the collected tokens, the
 * optional flag, the (possibly null) count and the identifier's text.
 *
 * The numeric suffixes on OPTION/CONSUME are Chevrotain occurrence indices;
 * each occurrence of the same DSL method (and, for CONSUME, the same token
 * type) must carry a distinct index within this rule.
 */
const BackrefStatement = $.RULE("BackrefStatement", () => {
// tokens handed to the CST node: Optional (when present), Rerun and the EndOfLine
const tokens: IToken[] = [];
let optional = false;
// stays null when no count sub-statement follows the Rerun keyword
let count: CountSubStatementCST | null = null;
// leading "optional" keyword
$.OPTION5(() => {
tokens.push($.CONSUME(T.Optional));
optional = true;
});
tokens.push($.CONSUME(T.Rerun));
// optional repetition count
$.OPTION6(() => count = $.SUBRULE(CountSubStatement));
// optional filler words "[the] group [called]" before the name; they are consumed but not recorded
$.OPTION7(() => {
$.OPTION(() => $.CONSUME(T.The));
$.CONSUME(T.Group);
$.OPTION2(() => $.CONSUME(T.Called));
});
// only the identifier's text (image) is kept, not the token itself
const name = $.CONSUME(T.Identifier).image;
tokens.push($.CONSUME4(T.EndOfLine));
return new BackrefStatementCST(tokens, optional, count, name);
});
/**
 * Grammar rule for an if / else statement.
 *
 * Shape accepted by the calls below, in order:
 *   If ( Identifier
 *      | Match [Optional] MatchSubStatement
 *        ( ( [And] Then | And ) [Optional] MatchSubStatement )* )
 *   EndOfLine Indent Statement+ Outdent
 *   [ Else EndOfLine Indent Statement+ Outdent ]
 *
 * Returns an IfIdentStatementCST when the condition was an identifier,
 * otherwise an IfPatternStatementCST carrying the collected match values.
 *
 * The numeric suffixes on OR/OPTION/CONSUME/SUBRULE/AT_LEAST_ONE are
 * Chevrotain occurrence indices; each occurrence of the same DSL method
 * (and same token or sub-rule) must carry a distinct index within this rule.
 */
const IfStatement = $.RULE("IfStatement", () => {
// tokens handed to the CST node: the If keyword and the EndOfLine that ends the condition
const tokens: IToken[] = [];
// match values making up a pattern condition (left empty for the identifier form)
const msv: MatchStatementValue[] = [];
let optional = false;
const true_statements: StatementCST[] = [];
const false_statements: StatementCST[] = [];
// remains "" unless the identifier alternative is taken; used below to pick the CST type
let name: string = "";
tokens.push($.CONSUME(T.If));
$.OR2([
// condition form 1: a bare identifier
{ALT: () => {
name = $.CONSUME(T.Identifier).image;
}},
// condition form 2: a match pattern, same separators as handled in the MANY below
{ALT: () => {
$.CONSUME(T.Match);
$.OPTION4(() => {
$.CONSUME3(T.Optional);
optional = true;
});
msv.push(new MatchStatementValue(optional, $.SUBRULE(MatchSubStatement)));
$.MANY(() => {
// separator between match values: "[and] then" or a plain "and"
$.OR([
{ ALT: () => {
$.OPTION2(() => $.CONSUME2(T.And));
$.CONSUME(T.Then);
}},
{ ALT: () => $.CONSUME(T.And) },
]);
// reset per value so an earlier "optional" does not leak into this one
optional = false;
$.OPTION3(() => {
$.CONSUME2(T.Optional);
optional = true;
});
msv.push(new MatchStatementValue(optional, $.SUBRULE2(MatchSubStatement)));
});
}}
]);
tokens.push($.CONSUME3(T.EndOfLine));
// "true" branch: an indented block with at least one statement
$.CONSUME2(T.Indent);
$.AT_LEAST_ONE2(() => {
true_statements.push($.SUBRULE(Statement));
});
$.CONSUME2(T.Outdent);
// optional "else" branch: again an indented block with at least one statement
$.OPTION(() => {
$.CONSUME(T.Else);
$.CONSUME4(T.EndOfLine);
$.CONSUME3(T.Indent);
$.AT_LEAST_ONE3(() => {
false_statements.push($.SUBRULE2(Statement));
});
$.CONSUME3(T.Outdent);
});
// an empty name means the Match alternative was parsed
if (name === "") {
return new IfPatternStatementCST(tokens, msv, true_statements, false_statements);
}
else {
return new IfIdentStatementCST(tokens, name, true_statements, false_statements);
}
});
// statement super class
const Statement = $.RULE("Statement", () => {
return $.OR([
{ ALT: () => $.SUBRULE(MatchStatement) },
{ ALT: () => $.SUBRULE(GroupStatement) },
{ ALT: () => $.SUBRULE(RepeatStatement) }
{ ALT: () => $.SUBRULE(RepeatStatement) },
{ ALT: () => $.SUBRULE(BackrefStatement) },
{ ALT: () => $.SUBRULE(IfStatement) }
]);
});

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
"use strict";
import { Human2RegexLexer, Human2RegexLexerOptions } from "./lexer";
@ -10,10 +10,10 @@ import "codemirror/addon/mode/simple";
import "codemirror/addon/runmode/runmode";
import "codemirror/addon/lint/lint";
import "./docs/bootstrap.css";
import "./docs/cleanblog.css";
import "./docs/codemirror.css";
import "./docs/style.css";
import "./docs/assets/bootstrap.css";
import "./docs/assets/cleanblog.css";
import "./docs/assets/codemirror.css";
import "./docs/assets/style.css";
interface CodeMirror {
defineSimpleMode: (name: string, value: Record<string, unknown>) => void;
@ -81,6 +81,10 @@ document.addEventListener("DOMContentLoaded", function() {
{token: "builtin", regex: /case insensitive/i},
{token: "builtin", regex: /case sensitive/i},
{token: "operator", regex: /\+|or more/i},
{token: "keyword", regex: /re( |-)?(run|capture)/i },
{token: "operator", regex: /the/i },
{token: "keyword", regex: /if/i },
{token: "keyword", regex: /else|otherwise/i },
{token: "variable", regex: /[a-z]\w*/i},
{token: "number", regex: /\d+/},
{token: "string", regex: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i},
@ -200,8 +204,6 @@ document.addEventListener("DOMContentLoaded", function() {
}
}
const editor = code_mirror.fromTextArea($human, {
mode: "human2regex",
lineNumbers: false,

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* The tokens required for Human2Regex
@ -53,34 +53,17 @@ import { createToken, Lexer } from "chevrotain";
/** @internal */ export const From = createToken({name: "From", pattern: /from/i});
/** @internal */ export const To = createToken({name: "To", pattern: /(to|through|thru|\-|\.\.\.?)/i});
/** @internal */ export const Create = createToken({name: "Create", pattern: /create(s)?/i});
/** @internal */ export const Called = createToken({name: "Called", pattern: /name(d)?|call(ed)?/i});
/** @internal */ export const Called = createToken({name: "Called", pattern: /named|called/i});
/** @internal */ export const Repeat = createToken({name: "Repeat", pattern: /repeat(s|ing)?/i});
/** @internal */ export const Newline = createToken({name: "Newline", pattern: /(new line|newline)/i});
/** @internal */ export const CarriageReturn = createToken({name: "CarriageReturn", pattern: /carriage return/i});
/** @internal */ export const CaseInsensitive = createToken({name: "CaseInsensitive", pattern: /case insensitive/i});
/** @internal */ export const CaseSensitive = createToken({name: "CaseSensitive", pattern: /case sensitive/i});
/** @internal */ export const OrMore = createToken({name: "OrMore", pattern: /\+|or more/i});
/*
//Not being used currently
export const Of = createToken({name: "Of", pattern: /of/i});
export const Nothing = createToken({name: "Nothing", pattern: /nothing/i});
export const As = createToken({name: "As", pattern: /as/i});
export const If = createToken({name: "If", pattern: /if/i});
export const Start = createToken({name: "Start", pattern: /start(s) with?/i});
export const Ends = createToken({name: "Ends", pattern: /end(s)? with/i});
export const Else = createToken({name: "Else", pattern: /(other wise|otherwise|else)/i});
export const Unless = createToken({name: "Unless", pattern: /unless/i});
export const While = createToken({name: "While", pattern: /while/i});
export const More = createToken({name: "More", pattern: /more/i});
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const None = createToken({name: "None", pattern: /none/i});
export const Neither = createToken({name: "Neither", pattern: /neither/i});
export const The = createToken({name: "The", pattern: /the/i }); //, longer_alt: Then});
export const By = createToken({name: "By", pattern: /by/i});
*/
/** @internal */ export const Rerun = createToken({name: "Rerun", pattern: /re( |-)?(run|capture)/i });
/** @internal */ export const The = createToken({name: "The", pattern: /the/i });
/** @internal */ export const If = createToken({name: "If", pattern: /if/i });
/** @internal */ export const Else = createToken({name: "Else", pattern: /else|otherwise/i });
/** @internal */ export const EndOfLine = createToken({name: "EOL", pattern: /\n/});
/** @internal */ export const WS = createToken({name: "Whitespace", pattern: /[^\S\n]+/, start_chars_hint: [ " ", "\r" ], group: Lexer.SKIPPED});
@ -127,22 +110,11 @@ export const AllTokens = [
Whitespace,
Number,
Unicode,
/*
Of,
As,
Called,
Rerun,
If,
Start,
Ends,
Else,
Unless,
While,
More,
Nothing,
By,
The,
None,
Neither,
*/
Using,
Global,
Multiline,
@ -158,7 +130,6 @@ export const AllTokens = [
Exclusive,
From,
Create,
Called,
Repeat,
Newline,
CarriageReturn,

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
/**
* Some utility functions for Human2Regex
@ -186,6 +186,7 @@ export class CommonError {
*
* @param error The lexing error
* @returns a new CommonError
* @internal
*/
public static fromLexError(error: ILexingError): CommonError {
// not really fond of --> and <--
@ -199,6 +200,7 @@ export class CommonError {
*
* @param error The parsing error
* @returns a new CommonError
* @internal
*/
public static fromParseError(error: IRecognitionException): CommonError {
// not really fond of --> and <--
@ -212,6 +214,7 @@ export class CommonError {
*
* @param error The semantic error
* @returns a new CommonError
* @internal
*/
public static fromSemanticError(error: ISemanticError): CommonError {
return new CommonError("Semantic Error", error.startLine, error.startColumn, error.length, error.message);

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import { Human2RegexParser, Human2RegexParserOptions, ParseResult,
Human2RegexLexer, Human2RegexLexerOptions, TokenizeResult,

View File

@ -1,8 +1,8 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import { Human2RegexParser, Human2RegexParserOptions } from "../src/parser";
import { Human2RegexLexer, Human2RegexLexerOptions } from "../src/lexer";
import { RegexDialect, minimizeMatchString } from "../src/generator";
import { RegexDialect } from "../src/generator";
describe("Generator functionality", function() {
@ -67,6 +67,14 @@ describe("Generator functionality", function() {
const toks5 = lexer.tokenize('match between 2 and 2 exclusive "hello"').tokens;
const reg5 = parser.parse(toks5);
expect(reg5.validate(RegexDialect.JS).length).toBeGreaterThan(0);
const toks6 = lexer.tokenize('create a group called thing\n\tmatch "hi"\ncreate a group called thing\n\tmatch "hi"\n').tokens;
const reg6 = parser.parse(toks6);
expect(reg6.validate(RegexDialect.JS).length).toBeGreaterThan(0);
const toks7 = lexer.tokenize("rerun thing").tokens;
const reg7 = parser.parse(toks7);
expect(reg7.validate(RegexDialect.JS).length).toBeGreaterThan(0);
});
it("handles ranges", function() {
@ -97,6 +105,12 @@ describe("Generator functionality", function() {
expect(reg2.validate(RegexDialect.JS).length).toBe(0);
expect(reg2.toRegex(RegexDialect.JS)).toBe("/[a-zA-Z][+-]?\\d+[+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+))/");
expect(reg2.toRegex(RegexDialect.PCRE)).toBe("/[[:alpha:]][+-]?\\d+[+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+))/");
const toks3 = lexer.tokenize("match not letter, not integer, not decimal").tokens;
const reg3 = parser.parse(toks3);
expect(reg3.validate(RegexDialect.JS).length).toBe(0);
expect(reg3.toRegex(RegexDialect.JS)).toBe("/[^a-zA-Z](?![+-]?\\d+)(?![+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+)))/");
expect(reg3.toRegex(RegexDialect.PCRE)).toBe("/[^[:alpha:]](?![+-]?\\d+)(?![+-]?(?:(?:\\d+[,.]?\\d*)|(?:[,.]\\d+)))/");
});
it("doesn't clobber repetition", function() {
@ -115,23 +129,6 @@ describe("Generator functionality", function() {
expect(reg1.toRegex(RegexDialect.JS)).toBe("/(?!hello){1,6}/");
});
it("can minimize matches", function() {
const test_cases = [
{ from: [ "abc", "abc" ], to: "abc" },
{ from: [ "a", "ab" ], to: "ab?" },
{ from: [ "a1x1z", "a2y2z", "a3z3z" ], to: "a(?:1x1|2y2|3z3)z" },
{ from: [ "ab", "cd" ], to: "ab|cd" },
{ from: [ "abc", "bc" ], to: "a?bc" },
{ from: [ "abc", "xb" ], to: "abc|xb" }
];
for (const c of test_cases) {
const got = minimizeMatchString(c.from);
expect(got).toBe(c.to);
}
});
it("optimizes correctly", function() {
const toks0 = lexer.tokenize('match "a" or "b" or "b"').tokens;
const reg0 = parser.parse(toks0);
@ -157,6 +154,44 @@ describe("Generator functionality", function() {
const reg4 = parser.parse(toks4);
expect(reg4.validate(RegexDialect.JS).length).toBe(0);
expect(reg4.toRegex(RegexDialect.JS)).toBe("/a(?:1x1|2x2|3x3)z/");
const toks5 = lexer.tokenize('match "a", maybe "b" or "c"').tokens;
const reg5 = parser.parse(toks5);
expect(reg5.validate(RegexDialect.JS).length).toBe(0);
expect(reg5.toRegex(RegexDialect.JS)).toBe("/a[bc]?/");
});
// Verifies the back-reference ("rerun" / "recapture") output for each dialect:
// a named group is created once, referenced once plainly and once as an
// optional, 3-times-repeated reference.
it("can generate backreferences", function() {
const toks0 = lexer.tokenize('create a group called thing\n\tmatch "Hello World"\nrerun thing\noptionally recapture 3 times the group called thing').tokens;
const reg0 = parser.parse(toks0);
expect(reg0.validate(RegexDialect.JS).length).toBe(0);
// NOTE(review): JavaScript regexes have no \g<name> construct; the named
// back-reference syntax there is \k<name> (as expected for DotNet below).
// Confirm the JS expectation reflects intended generator output.
expect(reg0.toRegex(RegexDialect.JS)).toBe("/(?<thing>Hello World)\\g<thing>(?:\\g<thing>{3})?/");
expect(reg0.toRegex(RegexDialect.PCRE)).toBe("/(?P<thing>Hello World)\\g<thing>(?:\\g<thing>{3})?/");
expect(reg0.toRegex(RegexDialect.Python)).toBe("/(?P<thing>Hello World)(?P=thing)(?:(?P=thing){3})?/");
expect(reg0.toRegex(RegexDialect.DotNet)).toBe("/(?<thing>Hello World)\\k<thing>(?:\\k<thing>{3})?/");
});
// Verifies conditional ("if") generation. Every case expects validation to
// report errors for the JS dialect and none for PCRE, then checks the
// generated conditional group text.
it("can generate if statements", function() {
// pattern condition, no else branch
const toks0 = lexer.tokenize('if matches "a"\n\tmatch "b"\n').tokens;
const reg0 = parser.parse(toks0);
expect(reg0.validate(RegexDialect.JS).length).toBeGreaterThan(0);
expect(reg0.validate(RegexDialect.PCRE).length).toBe(0);
expect(reg0.toRegex(RegexDialect.PCRE)).toBe("/(?(a)b)/");
// multi-part pattern condition with a nested if/else inside the else branch;
// Python is also expected to reject this input
const toks1 = lexer.tokenize('if matches "alpha", maybe "b" or "f"\n\tmatch "c"\nelse\n\tif matches "d"\n\t\tmatch "e"\n\telse\n\t\tmatch "f"').tokens;
const reg1 = parser.parse(toks1);
expect(reg1.validate(RegexDialect.JS).length).toBeGreaterThan(0);
expect(reg1.validate(RegexDialect.Python).length).toBeGreaterThan(0);
expect(reg1.validate(RegexDialect.PCRE).length).toBe(0);
expect(reg1.toRegex(RegexDialect.PCRE)).toBe("/(?(alpha[bf]?)c|(?(d)e|f))/");
// identifier condition referring to a previously created named group;
// PCRE and Boost are expected to spell the group and the condition differently
const toks2 = lexer.tokenize('create a group called thing\n\tmatch "a"\nif thing\n\tmatch "b"\nelse\n\tmatch "c"\n').tokens;
const reg2 = parser.parse(toks2);
expect(reg2.validate(RegexDialect.JS).length).toBeGreaterThan(0);
expect(reg2.validate(RegexDialect.PCRE).length).toBe(0);
expect(reg2.toRegex(RegexDialect.PCRE)).toBe("/(?P<thing>a)(?(thing)b|c)/");
expect(reg2.toRegex(RegexDialect.Boost)).toBe("/(?<thing>a)(?(<thing>)b|c)/");
});
it("generate dialect specific regex", function() {
@ -187,7 +222,7 @@ describe("Generator functionality", function() {
it("runs complex scripts", function() {
const str = `
using global and multiline and exact matching
using global and multiline and exact matching and case insensitive matching
create an optional group called protocol
match "http"
optionally match "s"
@ -222,6 +257,6 @@ create an optional group
const toks = lexer.tokenize(str).tokens;
const reg = parser.parse(toks);
expect(reg.validate(RegexDialect.JS).length).toBe(0);
expect(reg.toRegex(RegexDialect.JS)).toBe("/^(?<protocol>https?\\:\\/\\/)?(?<subdomain>(?:\\w+\\.)*)?(?<domain>(?:\\w+|_|\\-)+\\.\\w+)(?:\\:\\d*)?(?<path>(?:\\/(?:\\w+|_|\\-)*)*)?(\\?(?<query>(?:(?:\\w+|_|\\-)+=(?:\\w+|_|\\-)+)*))?(#.*)?$/gm");
expect(reg.toRegex(RegexDialect.JS)).toBe("/^(?<protocol>https?\\:\\/\\/)?(?<subdomain>(?:\\w+\\.)*)?(?<domain>(?:\\w+|_|\\-)+\\.\\w+)(?:\\:\\d*)?(?<path>(?:\\/(?:\\w+|_|\\-)*)*)?(\\?(?<query>(?:(?:\\w+|_|\\-)+=(?:\\w+|_|\\-)+)*))?(#.*)?$/gmi");
});
});

View File

@ -0,0 +1,62 @@
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import { minimizeMatchString, groupIfRequired, dontClobberRepetition } from "../src/generator_helper";
// Table-driven tests for the pure string helpers used by the generator.
// Each test feeds every row of its table to one helper and compares the
// result against the row's expected output.
describe("Generator helper functionality", function() {

    it("can minimize matches", function() {
        // from: alternatives to merge; to: expected minimized pattern
        const minimize_cases = [
            { from: [], to: "" },
            { from: [ "abc" ], to: "abc" },
            { from: [ "abc", "abc" ], to: "abc" },
            { from: [ "a", "ab" ], to: "ab?" },
            { from: [ "a1x1z", "a2y2z", "a3z3z" ], to: "a(?:1x1|2y2|3z3)z" },
            { from: [ "ab", "cd" ], to: "ab|cd" },
            { from: [ "abc", "bc" ], to: "a?bc" },
            { from: [ "abc", "xb" ], to: "abc|xb" }
        ];

        minimize_cases.forEach(({ from, to }) => {
            expect(minimizeMatchString(from)).toBe(to);
        });
    });

    it("groups correctly", function() {
        // from: input fragment; to: expected fragment after grouping
        const grouping_cases = [
            { from: "(?P=test)", to: "(?P=test)" },
            { from: "[abc\\]]", to: "[abc\\]]" },
            { from: "abc", to: "(?:abc)" },
            { from: "(abc)|d", to: "(?:(abc)|d)" },
            { from: "[abc\\]][abc]", to: "(?:[abc\\]][abc])" },
            { from: "(abc(abc)\\))(abc)", to: "(?:(abc(abc)\\))(abc))" },
        ];

        grouping_cases.forEach(({ from, to }) => {
            expect(groupIfRequired(from)).toBe(to);
        });
    });

    it("doesn't clobber the repetition", function() {
        // fragment: existing pattern; repetition: quantifier to apply;
        // expected: the combined result
        const repetition_cases = [
            { fragment: "1+", repetition: "+", expected: "1+" },
            { fragment: "1*", repetition: "+", expected: "1+" },
            { fragment: "1+", repetition: "*", expected: "1+" },
            { fragment: "1*", repetition: "*", expected: "1*" },
            { fragment: "1+", repetition: "?", expected: "1+?" },
            { fragment: "1*", repetition: "?", expected: "1*?" },
            { fragment: "1+", repetition: "{0,}", expected: "(?:1+){0,}" },
            { fragment: "1*", repetition: "{0,}", expected: "1{0,}" },
            { fragment: "1+", repetition: "{1,2}", expected: "1{1,2}" },
            { fragment: "1*", repetition: "{1,2}", expected: "1{1,2}" },
        ];

        repetition_cases.forEach(({ fragment, repetition, expected }) => {
            expect(dontClobberRepetition(fragment, repetition)).toBe(expected);
        });
    });
});

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import { Human2RegexLexer, Human2RegexLexerOptions, IndentType } from "../src/lexer";
import { Indent } from "../src/tokens";

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import { Human2RegexParser, Human2RegexParserOptions } from "../src/parser";
import { Human2RegexLexer, Human2RegexLexerOptions } from "../src/lexer";

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import { Human2RegexLexer, Human2RegexLexerOptions } from "../src/lexer";
import { Human2RegexParser, Human2RegexParserOptions, ParseResult } from "../src/parser";

View File

@ -1,4 +1,4 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
/*! Copyright (c) 2021 Patrick Demian; Licensed under MIT */
import "../src/utilities";
import { isSingleRegexCharacter, findLastIndex, removeQuotes, regexEscape, hasFlag, combineFlags, makeFlag, first, last, CommonError, append } from "../src/utilities";

View File

@ -1,62 +1,62 @@
/* eslint-disable func-style */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/naming-convention */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable no-undef */
const path = require("path");
const { glob } = require("glob");
const { render } = require("mustache");
const { readFileSync, writeFileSync, existsSync, mkdirSync } = require("fs");
const { minify } = require("html-minifier");
const CopyPlugin = require("copy-webpack-plugin");
const Handlebars = require("handlebars");
const MiniCssExtractPlugin = require("mini-css-extract-plugin");
const OptimizeCSSAssetsPlugin = require("optimize-css-assets-webpack-plugin");
const WebpackBeforeBuildPlugin = require("before-build-webpack");
const TerserPlugin = require("terser-webpack-plugin");
const RemovePlugin = require('remove-files-webpack-plugin');
const RemovePlugin = require("remove-files-webpack-plugin");
const { CleanWebpackPlugin } = require("clean-webpack-plugin");
const config = {
prod: true,
dst: "./docs/",
src: "./src/",
compression_config: {
html: {
collapseWhitespace: true,
minifyCSS: true,
minifyJS: true,
removeComments: true,
removeEmptyAttributes: true,
removeRedundantAttributes: true
},
}
};
const config = require("./config.json");
function build_mustache() {
if (!existsSync(config.dst)){
// todo: if I'm bored, make this a plugin for webpack so it gets "emitted"
function buildHandlebars() {
if (!existsSync(config.dst)){
mkdirSync(config.dst);
}
}
const read_json_file = (filename) => JSON.parse(readFileSync(filename), "utf8");
const files = glob.sync(path.join(config.src, "docs", "*.hbs"));
const context = {
build: {
prod: config.prod,
year: String(new Date().getFullYear())
}
};
const compress_html = (input) => config.prod ? minify(input, config.compression_config.html) : input;
// helper functions
const compressHtml = (input) => config.prod ? minify(input, config.compression_config.html) : input;
// get views
const files = glob.sync(path.join(config.src, "docs", "*.json"));
Handlebars.registerHelper("i-code", () => new Handlebars.SafeString('<code class="cm-s-idea">'));
Handlebars.registerHelper("s-code", () => new Handlebars.SafeString('<span class="tutorial-code"><code class="cm-s-idea">'));
Handlebars.registerHelper("p-code", () => new Handlebars.SafeString('<pre class="tutorial-code"><code class="cm-s-idea">'));
Handlebars.registerHelper("end-i-code", () => new Handlebars.SafeString("</code>"));
Handlebars.registerHelper("end-s-code", () => new Handlebars.SafeString("</code></span>"));
Handlebars.registerHelper("end-p-code", () => new Handlebars.SafeString("</code></pre>"));
// get partials
const partials = {
header: readFileSync(path.join(config.src, "docs", "header.mustache"), "utf8"),
footer: readFileSync(path.join(config.src, "docs", "footer.mustache"), "utf8")
};
Handlebars.registerPartial("header", readFileSync(path.join(config.src, "docs", "partials", "header.hbs"), "utf8"));
Handlebars.registerPartial("footer", readFileSync(path.join(config.src, "docs", "partials", "footer.hbs"), "utf8"));
Handlebars.registerPartial("example_code", readFileSync(path.join(config.src, "docs", "partials", "example_code.hbs"), "utf8"));
// build handlebar files
for (const file of files) {
const filename = path.basename(file);
const to = path.join(config.dst, path.basename(filename, ".hbs") + ".html");
const template = readFileSync(path.join(config.src, "docs", filename), "utf8");
const html = Handlebars.compile(template)(context);
// build main mustache files
for (const item of files) {
const filename = path.basename(item, ".json");
const view = read_json_file(item);
const to = path.join(config.dst, filename + ".html");
const template = readFileSync(path.join(config.src, "docs", filename + ".mustache"), "utf8");
writeFileSync(to, compress_html(render(template, view, partials)));
writeFileSync(to, compressHtml(html));
}
}
@ -80,23 +80,28 @@ module.exports = {
minimize: config.prod,
minimizer: [ new TerserPlugin({cache: true, parallel: true}), new OptimizeCSSAssetsPlugin({}) ]
},
performance: {
hints: false,
maxEntrypointSize: 512000,
maxAssetSize: 512000
},
plugins: [
new CleanWebpackPlugin({verbose:true, protectWebpackAssets: false}),
new CopyPlugin({
patterns: [
{ from: config.src + "docs/" + "!(*.css|*.mustache|*.json)", to: "", flatten: true}
{ from: config.src + "docs/" + "assets/" + "!(*.css|*.hbs)", to: "", flatten: true}
]
}),
new MiniCssExtractPlugin({ filename: "bundle.min.css" }),
new WebpackBeforeBuildPlugin(function(_, callback) {
build_mustache();
buildHandlebars();
callback();
}),
}, [ "done" ]),
new RemovePlugin({
after: {
root: "./lib",
include: [
"script.d.ts",
"script.d.ts.map"
"script.d.ts"
]
}
})