Added boundary and unicode class support

2025-05-15 20:10:19 -07:00 · 2020-11-03 21:31:47 -05:00 · 2020-11-03 21:31:47 -05:00 · 32354d8aaf
commit 32354d8aaf
parent ffbd0d72b1
4 changed files with 97 additions and 9 deletions
--- a/src/generator.ts
+++ b/src/generator.ts
@ -28,6 +28,40 @@ export interface ISemanticError {
    message: string
 }

+/**
+ * Unicode general category codes accepted as a unicode class specifier.
+ *
+ * Each row holds one major category followed by its subcategories.
+ */
+const unicode_property_codes = [
+    // Other
+    "C", "Cc", "Cf", "Cn", "Co", "Cs",
+    // Letter
+    "L", "Ll", "Lm", "Lo", "Lt", "Lu",
+    // Mark
+    "M", "Mc", "Me", "Mn",
+    // Number
+    "N", "Nd", "Nl", "No",
+    // Punctuation
+    "P", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps",
+    // Symbol
+    "S", "Sc", "Sk", "Sm", "So",
+    // Separator
+    "Z", "Zl", "Zp", "Zs"
+];
+
+/**
+ * Unicode script names accepted as a unicode class specifier when the
+ * specifier is not one of the general category codes.
+ *
+ * Entries are laid out one initial letter per group.
+ */
+const unicode_script_codes = [
+    // A
+    "Arabic", "Armenian", "Avestan",
+    // B
+    "Balinese", "Bamum", "Batak", "Bengali", "Bopomofo", "Brahmi",
+    "Braille", "Buginese", "Buhid",
+    // C
+    "Canadian_Aboriginal", "Carian", "Chakma", "Cham", "Cherokee",
+    "Common", "Coptic", "Cuneiform", "Cypriot", "Cyrillic",
+    // D
+    "Deseret", "Devanagari",
+    // E
+    "Egyptian_Hieroglyphs", "Ethiopic",
+    // G
+    "Georgian", "Glagolitic", "Gothic", "Greek", "Gujarati", "Gurmukhi",
+    // H
+    "Han", "Hangul", "Hanunoo", "Hebrew", "Hiragana",
+    // I
+    "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi",
+    "Inscriptional_Parthian",
+    // J
+    "Javanese",
+    // K
+    "Kaithi", "Kannada", "Katakana", "Kayah_Li", "Kharoshthi", "Khmer",
+    // L
+    "Lao", "Latin", "Lepcha", "Limbu", "Linear_B", "Lisu", "Lycian",
+    "Lydian",
+    // M
+    "Malayalam", "Mandaic", "Meetei_Mayek", "Meroitic_Cursive",
+    "Meroitic_Hieroglyphs", "Miao", "Mongolian", "Myanmar",
+    // N
+    "New_Tai_Lue", "Nko",
+    // O
+    "Ogham", "Old_Italic", "Old_Persian", "Old_South_Arabian",
+    "Old_Turkic", "Ol_Chiki", "Oriya", "Osmanya",
+    // P
+    "Phags_Pa", "Phoenician",
+    // R
+    "Rejang", "Runic",
+    // S
+    "Samaritan", "Saurashtra", "Sharada", "Shavian", "Sinhala",
+    "Sora_Sompeng", "Sundanese", "Syloti_Nagri", "Syriac",
+    // T
+    "Tagalog", "Tagbanwa", "Tai_Le", "Tai_Tham", "Tai_Viet", "Takri",
+    "Tamil", "Telugu", "Thaana", "Thai", "Tibetan", "Tifinagh",
+    // U
+    "Ugaritic",
+    // V
+    "Vai",
+    // Y
+    "Yi"
+];
+
+
+
 /**
 * The base concrete syntax tree class
 * 
@ -102,7 +136,7 @@ export enum UsingFlags {

 /**
 * Type of match arguments
- * 
+ *
 * @remarks SingleString means an escaped string
 * @remarks Between means a range (ex. a-z)
 * @remarks Anything means .
@ -121,7 +155,9 @@ export enum MatchSubStatementType {
    Tab,
    Linefeed,
    Newline,
-    CarriageReturn
+    CarriageReturn,
+    Boundary,
+    Unicode
 }

 /**
@ -135,7 +171,7 @@ export class MatchSubStatementValue {
     * Constructor for MatchSubStatementValue
     * 
     * @param type the type of this match
-     * @param from optional range string
+     * @param from optional value or range string
     * @param to  optional range string
     * @internal
     */
@ -203,14 +239,14 @@ export class MatchSubStatementCST extends H2RCST {
                let to = value.to as string;

                if (!isSingleRegexCharacter(from)) {
-                        errors.push(this.error("Between statement must begin with a single character"));
+                    errors.push(this.error("Between statement must begin with a single character"));
                }
                else if (from.startsWith("\\u") || from.startsWith("\\U") || from.startsWith("\\")) {
                    from = JSON.parse(`"${regexEscape(from)}"`);
                }

                if (!isSingleRegexCharacter(to)) {
-                        errors.push(this.error("Between statement must end with a single character"));
+                    errors.push(this.error("Between statement must end with a single character"));
                }
                else if (to.startsWith("\\u") || to.startsWith("\\U") || to.startsWith("\\")) {
                    to = JSON.parse(`"${regexEscape(to)}"`);
@ -220,6 +256,27 @@ export class MatchSubStatementCST extends H2RCST {
                    errors.push(this.error("Between statement range invalid"));
                }
            }
+            else if (value.type === MatchSubStatementType.Unicode) {
+                // a unicode specifier is either a general category code (ex. "Lu")
+                // or a script name (ex. "Cyrillic")
+                const specifier = value.from as string;
+
+                // anything that is not a known category code must be a known script
+                if (!unicode_property_codes.includes(specifier)) {
+                    let script_name = specifier;
+
+                    // Java and C# require script names to be written as "Is*"
+                    if (language === RegexDialect.DotNet || language === RegexDialect.Java) {
+                        if (!script_name.startsWith("Is")) {
+                            errors.push(this.error("This dialect requires script names to begin with Is, such as IsCyrillic rather than Cyrillic"));
+                            continue;
+                        }
+                        // drop the "Is" prefix so that only the script name itself is looked up
+                        script_name = script_name.slice(2);
+                    }
+
+                    // compare with every "_" ignored on both sides, so that both
+                    // "Canadian_Aboriginal" and "CanadianAboriginal" are recognised
+                    const without_underscores = (name: string): string => name.replace(/_/g, "");
+                    const wanted = without_underscores(script_name);
+
+                    if (!unicode_script_codes.some((known) => without_underscores(known) === wanted)) {
+                        errors.push(this.error(`Unknown unicode specifier ${value.from}`));
+                    }
+                }
+            }
        }

        return errors;
@ -238,6 +295,12 @@ export class MatchSubStatementCST extends H2RCST {
                case MatchSubStatementType.Between:
                    str.push(this.invert ? `[^${value.from}-${value.to}]` : `[${value.from}-${value.to}]`);
                    break;
+                case MatchSubStatementType.Unicode:
+                    str.push(this.invert ? `\\P{${value.from}}` : `\\p{${value.from}}`);
+                    break;
+                case MatchSubStatementType.Boundary:
+                    str.push(this.invert ? "\\B" : "\\b");
+                    break;
                case MatchSubStatementType.Word:
                    str.push(this.invert ? "\\W+" : "\\w+");
                    break;
--- a/src/parser.ts
+++ b/src/parser.ts
@ -160,6 +160,7 @@ export class Human2RegexParser extends EmbeddedActionsParser {
            let invert: boolean = false;
            const values: MatchSubStatementValue[] = [];
            let from: string | null = null;
+            let value: string | null = null;
            let to: string | null = null;
            let type: MatchSubStatementType = MatchSubStatementType.Anything;

@ -215,17 +216,35 @@ export class Human2RegexParser extends EmbeddedActionsParser {
                        { ALT: () => {
                            const token = $.CONSUME(T.StringLiteral);
                            tokens.push(token);
-                            from = token.image;
+                            value = token.image;
                            type = MatchSubStatementType.SingleString;

-                            return new MatchSubStatementValue(type, from);
+                            return new MatchSubStatementValue(type, value);
                        }},
+
+                        //unicode
+                        { ALT: () => {
+                            $.CONSUME(T.Unicode);
+                            const token = $.CONSUME5(T.StringLiteral);
+                            tokens.push(token);
+                            value = token.image;
+                            type = MatchSubStatementType.Unicode;
+
+                            return new MatchSubStatementValue(type, value);
+                        }},
+
                        { ALT: () => { 
                            tokens.push($.CONSUME(T.Anything)); 
                            type = MatchSubStatementType.Anything;

                            return new MatchSubStatementValue(type);
                        }},
+                        { ALT: () => {
+                            tokens.push($.CONSUME(T.Boundary));
+                            type = MatchSubStatementType.Boundary;
+
+                            return new MatchSubStatementValue(type);
+                        }},
                        { ALT: () => { 
                            tokens.push($.CONSUME(T.Word)); 
                            type = MatchSubStatementType.Word;
--- a/src/script.ts
+++ b/src/script.ts
@ -57,6 +57,8 @@ document.addEventListener("DOMContentLoaded", function() {
 			{token: "builtin", regex: /(any thing|any|anything)(s)?/i},
 			{token: "operator", regex: /or/i},
 			{token: "operator", regex: /and|,/i},
+			{token: "builtin", regex: /unicode( class)?/i},
+			{token: "builtin", regex: /(word )boundary/i},
 			{token: "builtin", regex: /word(s)?/i},
 			{token: "builtin", regex: /digit(s)?/i},
 			{token: "builtin", regex: /character(s)?/i},
--- a/src/tokens.ts
+++ b/src/tokens.ts
@ -29,18 +29,20 @@ import { createToken, Lexer } from "chevrotain";
 /** @internal */ export const Digit = createToken({name: "DigitSpecifier", pattern: /digit(s)?/i});
 /** @internal */ export const Character = createToken({name: "CharacterSpecifier", pattern: /character(s)?/i});
 /** @internal */ export const Whitespace = createToken({name: "WhitespaceSpecifier", pattern: /(white space|whitespace)(s)?/i});
+/** @internal */ export const Boundary = createToken({name: "BoundarySpecifier", pattern: /(word )boundary/i});
 /** @internal */ export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
+/** @internal */ export const Unicode = createToken({name: "UnicodeSpecifier", pattern: /unicode( class)?/i});
 /** @internal */ export const Using = createToken({name: "Using", pattern: /using/i});
 /** @internal */ export const Global = createToken({name: "Global", pattern: /global/i});
 /** @internal */ export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
 /** @internal */ export const Exact = createToken({name: "Exact", pattern: /exact/i});
 /** @internal */ export const Matching = createToken({name: "Matching", pattern: /matching/i});
-/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i}); //, longer_alt: Nothing});
+/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i});
 /** @internal */ export const Between = createToken({name: "Between", pattern: /between/i});
 /** @internal */ export const Tab = createToken({name: "Tab", pattern: /tab/i});
 /** @internal */ export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
 /** @internal */ export const Group = createToken({name: "Group", pattern: /group/i});
-/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
+/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i });
 /** @internal */ export const Times = createToken({name: "Times", pattern: /times/i});
 /** @internal */ export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
 /** @internal */ export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
@ -111,11 +113,13 @@ export const AllTokens = [
    Then,
    Anything,
    And,
+    Boundary,
    Word,
    Digit,
    Character,
    Whitespace,
    Number,
+    Unicode,
    /*
    Of,
    As,