Add "unicode class" and "word boundary" match specifiers.

Review notes (text before the first "diff --git" line is ignored by git apply):
  * generator.ts, Unicode validation: the "Is" prefix required by the .NET and
    Java dialects is now removed with substring(2). The previous substr(0, 2)
    kept only the prefix itself, so every script name was rejected as unknown.
  * generator.ts, Unicode validation: script names are now compared with all
    underscores removed on both sides, so "OldItalic" matches "Old_Italic".
    The previous replace("_", "") stripped a single underscore from the input
    only and could never match an underscored table entry.
  * This copy was rebuilt from a whitespace-collapsed patch. Indentation and
    the three whitespace-only -/+ pairs are reconstructed by convention, so
    apply it with: git apply --ignore-whitespace
  * The "index" lines still carry the blob ids of the unrevised patch.

diff --git a/src/generator.ts b/src/generator.ts
index 7ff0670..f271105 100644
--- a/src/generator.ts
+++ b/src/generator.ts
@@ -28,6 +28,40 @@ export interface ISemanticError {
     message: string
 }
 
+const unicode_property_codes = [
+    "C", "Cc", "Cf", "Cn", "Co", "Cs",
+    "L", "Ll", "Lm", "Lo", "Lt", "Lu",
+    "M", "Mc", "Me", "Mn", "N", "Nd",
+    "Nl", "No", "P", "Pc", "Pd", "Pe",
+    "Pf", "Pi", "Po", "Ps", "S", "Sc",
+    "Sk", "Sm", "So", "Z", "Zl", "Zp",
+    "Zs"
+];
+
+const unicode_script_codes = [
+    "Arabic", "Armenian", "Avestan", "Balinese", "Bamum",
+    "Batak", "Bengali", "Bopomofo", "Brahmi", "Braille",
+    "Buginese", "Buhid", "Canadian_Aboriginal", "Carian", "Chakma",
+    "Cham", "Cherokee", "Common", "Coptic", "Cuneiform",
+    "Cypriot", "Cyrillic", "Deseret", "Devanagari", "Egyptian_Hieroglyphs",
+    "Ethiopic", "Georgian", "Glagolitic", "Gothic", "Greek",
+    "Gujarati", "Gurmukhi", "Han", "Hangul", "Hanunoo", "Hebrew",
+    "Hiragana", "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi",
+    "Inscriptional_Parthian", "Javanese", "Kaithi", "Kannada", "Katakana",
+    "Kayah_Li", "Kharoshthi", "Khmer", "Lao", "Latin", "Lepcha", "Limbu",
+    "Linear_B", "Lisu", "Lycian", "Lydian", "Malayalam", "Mandaic",
+    "Meetei_Mayek", "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao",
+    "Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Ogham", "Old_Italic",
+    "Old_Persian", "Old_South_Arabian", "Old_Turkic", "Ol_Chiki", "Oriya",
+    "Osmanya", "Phags_Pa", "Phoenician", "Rejang", "Runic", "Samaritan",
+    "Saurashtra", "Sharada", "Shavian", "Sinhala", "Sora_Sompeng",
+    "Sundanese", "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le",
+    "Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Telugu", "Thaana", "Thai",
+    "Tibetan", "Tifinagh", "Ugaritic", "Vai", "Yi"
+];
+
+
+
 /**
  * The base concrete syntax tree class
  *
@@ -102,7 +136,7 @@ export enum UsingFlags {
 
 /**
  * Type of match arguments
- * 
+ *
  * @remarks SingleString means an escaped string
  * @remarks Between means a range (ex. a-z)
  * @remarks Anything means .
@@ -121,7 +155,9 @@ export enum MatchSubStatementType {
     Tab,
     Linefeed,
     Newline,
-    CarriageReturn
+    CarriageReturn,
+    Boundary,
+    Unicode
 }
 
 /**
@@ -135,7 +171,7 @@ export class MatchSubStatementValue {
      * Constructor for MatchSubStatementValue
      *
      * @param type the type of this match
-     * @param from optional range string
+     * @param from optional value or range string
      * @param to optional range string
      * @internal
      */
@@ -203,14 +239,14 @@ export class MatchSubStatementCST extends H2RCST {
                 let to = value.to as string;
 
                 if (!isSingleRegexCharacter(from)) {
-                        errors.push(this.error("Between statement must begin with a single character"));
+                    errors.push(this.error("Between statement must begin with a single character"));
                 }
                 else if (from.startsWith("\\u") || from.startsWith("\\U") || from.startsWith("\\")) {
                     from = JSON.parse(`"${regexEscape(from)}"`);
                 }
 
                 if (!isSingleRegexCharacter(to)) {
-                        errors.push(this.error("Between statement must end with a single character"));
+                    errors.push(this.error("Between statement must end with a single character"));
                 }
                 else if (to.startsWith("\\u") || to.startsWith("\\U") || to.startsWith("\\")) {
                     to = JSON.parse(`"${regexEscape(to)}"`);
@@ -220,6 +256,27 @@ export class MatchSubStatementCST extends H2RCST {
                     errors.push(this.error("Between statement range invalid"));
                 }
             }
+            else if (value.type === MatchSubStatementType.Unicode) {
+                let unicode_class = value.from as string;
+                // check to see if the given code is supported
+                if (!unicode_property_codes.includes(unicode_class)) {
+                    // check to see if the given script is supported
+
+                    // Java and C# requires "Is*"
+                    if (language === RegexDialect.DotNet || language === RegexDialect.Java) {
+                        if (!unicode_class.startsWith("Is")) {
+                            errors.push(this.error("This dialect requires script names to begin with Is, such as IsCyrillic rather than Cyrillic"));
+                            continue;
+                        }
+                        unicode_class = unicode_class.substring(2); // drop the "Is" prefix so only the script name is looked up
+                    }
+
+                    // attempt with and without "_" characters
+                    if (!unicode_script_codes.some((script) => script.replace(/_/g, "") === unicode_class.replace(/_/g, ""))) {
+                        errors.push(this.error(`Unknown unicode specifier ${value.from}`));
+                    }
+                }
+            }
         }
 
         return errors;
@@ -238,6 +295,12 @@ export class MatchSubStatementCST extends H2RCST {
                 case MatchSubStatementType.Between:
                     str.push(this.invert ? `[^${value.from}-${value.to}]` : `[${value.from}-${value.to}]`);
                     break;
+                case MatchSubStatementType.Unicode:
+                    str.push(this.invert ? `\\P{${value.from}}` : `\\p{${value.from}}`);
+                    break;
+                case MatchSubStatementType.Boundary:
+                    str.push(this.invert ? "\\B" : "\\b");
+                    break;
                 case MatchSubStatementType.Word:
                     str.push(this.invert ? "\\W+" : "\\w+");
                     break;
diff --git a/src/parser.ts b/src/parser.ts
index 3ef0b1f..5063b88 100644
--- a/src/parser.ts
+++ b/src/parser.ts
@@ -160,6 +160,7 @@ export class Human2RegexParser extends EmbeddedActionsParser {
             let invert: boolean = false;
             const values: MatchSubStatementValue[] = [];
             let from: string | null = null;
+            let value: string | null = null;
             let to: string | null = null;
             let type: MatchSubStatementType = MatchSubStatementType.Anything;
 
@@ -215,17 +216,35 @@ export class Human2RegexParser extends EmbeddedActionsParser {
                 { ALT: () => {
                     const token = $.CONSUME(T.StringLiteral);
                     tokens.push(token);
-                    from = token.image;
+                    value = token.image;
                     type = MatchSubStatementType.SingleString;
 
-                    return new MatchSubStatementValue(type, from);
+                    return new MatchSubStatementValue(type, value);
                 }},
+
+                //unicode
+                { ALT: () => {
+                    $.CONSUME(T.Unicode);
+                    const token = $.CONSUME5(T.StringLiteral);
+                    tokens.push(token);
+                    value = token.image;
+                    type = MatchSubStatementType.Unicode;
+
+                    return new MatchSubStatementValue(type, value);
+                }},
+
                 { ALT: () => {
                     tokens.push($.CONSUME(T.Anything));
                     type = MatchSubStatementType.Anything;
 
                     return new MatchSubStatementValue(type);
                 }},
+                { ALT: () => {
+                    tokens.push($.CONSUME(T.Boundary));
+                    type = MatchSubStatementType.Boundary;
+
+                    return new MatchSubStatementValue(type);
+                }},
                 { ALT: () => {
                     tokens.push($.CONSUME(T.Word));
                     type = MatchSubStatementType.Word;
diff --git a/src/script.ts b/src/script.ts
index 33508c0..36f917f 100644
--- a/src/script.ts
+++ b/src/script.ts
@@ -57,6 +57,8 @@ document.addEventListener("DOMContentLoaded", function() {
             {token: "builtin", regex: /(any thing|any|anything)(s)?/i},
             {token: "operator", regex: /or/i},
             {token: "operator", regex: /and|,/i},
+            {token: "builtin", regex: /unicode( class)?/i},
+            {token: "builtin", regex: /(word )boundary/i},
             {token: "builtin", regex: /word(s)?/i},
             {token: "builtin", regex: /digit(s)?/i},
             {token: "builtin", regex: /character(s)?/i},
diff --git a/src/tokens.ts b/src/tokens.ts
index b6f5928..87f713a 100644
--- a/src/tokens.ts
+++ b/src/tokens.ts
@@ -29,18 +29,20 @@ import { createToken, Lexer } from "chevrotain";
 /** @internal */ export const Digit = createToken({name: "DigitSpecifier", pattern: /digit(s)?/i});
 /** @internal */ export const Character = createToken({name: "CharacterSpecifier", pattern: /character(s)?/i});
 /** @internal */ export const Whitespace = createToken({name: "WhitespaceSpecifier", pattern: /(white space|whitespace)(s)?/i});
+/** @internal */ export const Boundary = createToken({name: "BoundarySpecifier", pattern: /(word )boundary/i});
 /** @internal */ export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
+/** @internal */ export const Unicode = createToken({name: "UnicodeSpecifier", pattern: /unicode( class)?/i});
 /** @internal */ export const Using = createToken({name: "Using", pattern: /using/i});
 /** @internal */ export const Global = createToken({name: "Global", pattern: /global/i});
 /** @internal */ export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
 /** @internal */ export const Exact = createToken({name: "Exact", pattern: /exact/i});
 /** @internal */ export const Matching = createToken({name: "Matching", pattern: /matching/i});
-/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i}); //, longer_alt: Nothing});
+/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i});
 /** @internal */ export const Between = createToken({name: "Between", pattern: /between/i});
 /** @internal */ export const Tab = createToken({name: "Tab", pattern: /tab/i});
 /** @internal */ export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
 /** @internal */ export const Group = createToken({name: "Group", pattern: /group/i});
-/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
+/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i });
 /** @internal */ export const Times = createToken({name: "Times", pattern: /times/i});
 /** @internal */ export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
 /** @internal */ export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
@@ -111,11 +113,13 @@ export const AllTokens = [
     Then,
     Anything,
     And,
+    Boundary,
     Word,
     Digit,
     Character,
     Whitespace,
     Number,
+    Unicode,
     /*
     Of,
     As,