More bugs fixed, updated readme

2025-05-16 04:20:35 -07:00 · 2020-11-06 16:06:27 -05:00 · 2020-11-06 16:06:27 -05:00 · 298aee7226
commit 298aee7226
parent 7d7d6337e1
8 changed files with 1899 additions and 526 deletions
--- a/Readme.md
+++ b/Readme.md
@ -2,21 +2,22 @@

 ## Purpose

-Generate regular expressions from natural language. Currently WIP, but should look something like this:
+Generate regular expressions from natural language.

-Instead of a convoluted mess of symbols why not
+Instead of a convoluted mess of symbols like `/([\w\.=\-]*\w+)/g`, why not write

    using global matching
-    create a group called "capture_me" 
-        match 0+ words or "." or "=" or "-"
+    create a group called capture_me
+        match 0+ characters or "." or "=" or "-"
        match 1+ words

+Is the latter not much easier to read and debug than the former?
+
 Running the program should result in the following output:

-    Your regex = /\$([\w\.=\-]*[\w]+)/g
-    "capture_me" is group id 1
+    Your regex = /(?<capture_me>[\w\.\=\-]*\w+)/g

-Is the former not much easier to read and bug fix than the latter?
+You can then use your regex in your language of choice, with Human2Regex validating your regex for you.

 Another example

@ -61,16 +62,24 @@ Another example

 Running the program should result in the following output:

-    Your regex = /^(https?:\/\/)?((\w\.)*)(:\d+)?([\w_\-]\.\w)((/[\w_\-]))?(\?([\w_\-]=[\w_\-]))?(#.*)$/g
-    "protocol" is group id 1
-    "subdomain" is group id 2
-    "domain" is group id 4
-    "path" is group id 5
-    "query" is group id 5 or 6 if "path" exists
+    Your regex = /^(?<protocol>https?\:\/\/)?(?<subdomain>(\w+\.)*)?(?<domain>(?:\w+|_|\-)+\.\w+)\:?\d*(?<path>(\/(?:\w+|_|\-)*)*)?(\?(?<query>((?:\w+|_|\-)+\=(?:\w+|_|\-)+)*))?(#.*)?$/g
+
+Which one would you rather debug?

 ## Usage
-Configure config.ts
-Run
+Build

    npm run build

+Run
+    
+    point web browser to: docs/index.html
+
+Test
+
+    npm t
+
+
+## Todo
+- Separate website and source code. Move to yarn/npm
+- Add more regex options such as back references, subroutines, lookahead/behind, and more character classes (e.g., `[:alpha:]`)
--- a/docs/bundle.min.js
+++ b/docs/bundle.min.js
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "human2regex",
-  "version": "0.9.0",
+  "version": "0.9.5",
  "description": "Humanized Regular Expressions",
  "main": "bundle.min.js",
  "devDependencies": {
@ -8,20 +8,20 @@
    "@types/html-minifier": "^3.5.3",
    "@types/jest": "^26.0.15",
    "@types/mustache": "^4.0.1",
-    "@typescript-eslint/eslint-plugin": "^4.4.0",
-    "@typescript-eslint/parser": "^4.4.0",
+    "@typescript-eslint/eslint-plugin": "^4.6.1",
+    "@typescript-eslint/parser": "^4.6.1",
    "before-build-webpack": "^0.2.9",
-    "copy-webpack-plugin": "^6.2.1",
+    "copy-webpack-plugin": "^6.3.0",
    "css-loader": "^4.3.0",
    "eslint": "^7.11.0",
    "glob": "^7.1.6",
    "html-minifier": "^4.0.0",
-    "jest": "^26.6.1",
+    "jest": "^26.6.3",
    "mini-css-extract-plugin": "^1.0.0",
    "mustache": "^4.0.1",
    "optimize-css-assets-webpack-plugin": "^5.0.4",
    "ts-jest": "^26.4.3",
-    "ts-loader": "^8.0.4",
+    "ts-loader": "^8.0.9",
    "ts-node": "^9.0.0",
    "typescript": "^4.0.5",
    "webpack": "^4.44.2",
@ -37,7 +37,7 @@
  "author": "Patrick Demian",
  "license": "MIT",
  "dependencies": {
-    "chevrotain": "^7.0.2",
+    "chevrotain": "^7.0.3",
    "codemirror": "^5.58.2"
  },
  "repository": {
--- a/src/generator.ts
+++ b/src/generator.ts
@ -334,9 +334,13 @@ export class MatchSubStatementCST extends H2RCST {
        let ret = "";

        let require_grouping = false;
+        let dont_clobber_plus = false;

        if (str.length === 1) {
            ret = str[0];
+            if (ret.endsWith("+")) {
+                dont_clobber_plus = true;
+            }
        }
        // we can use regex's [] for single chars, otherwise we need a group
        else if (str.every(isSingleRegexCharacter)) {
@ -349,10 +353,36 @@ export class MatchSubStatementCST extends H2RCST {
        }

        if (this.count) {
-            if (require_grouping) {
-                ret = "(?:" + ret + ")";
+            if (dont_clobber_plus) {
+                const clobber = this.count.toRegex(language);
+
+                // ret already ends in "+"; stacking another quantifier on it would emit e.g. "\w++"
+                switch (clobber) {
+                    case "*":
+                    case "?":
+                        ret = "(?:" + ret + ")" + clobber;
+                        break;
+                    case "+":
+                        // the existing trailing + already expresses "1 or more"; nothing to add
+                        break;
+                    default:
+                        if (clobber.startsWith("{0")) {
+                            ret = "(?:" + ret + ")" + clobber;
+                        }
+                        else {
+                            // strings are immutable: assign the result so the + is really replaced by the count
+                            ret = ret.substring(0, ret.length - 1) + clobber;
+                        }
+                        break;
+                }
+            }
+            else {
+                if (require_grouping) {
+                    ret = "(?:" + ret + ")";
+                }
+
+                ret += this.count.toRegex(language);
+            }
-            ret += this.count.toRegex(language);
        }

        return ret;
--- a/src/tokens.ts
+++ b/src/tokens.ts
@ -27,8 +27,8 @@ import { createToken, Lexer } from "chevrotain";
 /** @internal */ export const And = createToken({name: "And", pattern: /and|,/i});
 /** @internal */ export const Word = createToken({name: "WordSpecifier", pattern: /word(s)?/i});
 /** @internal */ export const Digit = createToken({name: "DigitSpecifier", pattern: /digit(s)?/i});
-/** @internal */ export const Character = createToken({name: "CharacterSpecifier", pattern: /character(s)?/i});
-/** @internal */ export const Whitespace = createToken({name: "WhitespaceSpecifier", pattern: /(white space|whitespace)(s)?/i});
+/** @internal */ export const Character = createToken({name: "CharacterSpecifier", pattern: /(character|letter)s?/i});
+/** @internal */ export const Whitespace = createToken({name: "WhitespaceSpecifier", pattern: /(white space|whitespace)s?/i});
 /** @internal */ export const Boundary = createToken({name: "BoundarySpecifier", pattern: /(word )boundary/i});
 /** @internal */ export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
 /** @internal */ export const Unicode = createToken({name: "UnicodeSpecifier", pattern: /unicode( class)?/i});
--- a/tests/generator.spec.ts
+++ b/tests/generator.spec.ts
@ -32,6 +32,11 @@ describe("Generator functionality", function() {
        const reg1 = parser.parse();
        expect(reg1.validate(RegexDialect.JS).length).toBe(0);
        expect(reg1.toRegex(RegexDialect.JS)).toBe("/https?/");
+
+        parser.input = lexer.tokenize("match 1+ words").tokens;
+        const reg2 = parser.parse();
+        expect(reg2.validate(RegexDialect.JS).length).toBe(0);
+        expect(reg2.toRegex(RegexDialect.JS)).toBe("/\\w+/"); // used to generate w++. make sure not to regress
    });

    it("validates invalid regexes", function() {
--- a/webpack.config.js
+++ b/webpack.config.js
@ -1,3 +1,5 @@
+/* eslint-disable @typescript-eslint/explicit-function-return-type */
+/* eslint-disable @typescript-eslint/naming-convention */
 /* eslint-disable @typescript-eslint/no-var-requires */
 /* eslint-disable no-undef */
 const path = require("path");
@ -29,7 +31,6 @@ const config = {


 function build_mustache() {
-
 	if (!existsSync(config.dst)){
 		mkdirSync(config.dst);
 	}
@ -48,7 +49,7 @@ function build_mustache() {
    };

    // build main mustache files
-    for(const item of files) {
+    for (const item of files) {
        const filename = path.basename(item, ".json");
        const view = read_json_file(item);
        const to = path.join(config.dst, filename + ".html");