1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-15 20:10:19 -07:00

Added boundary and unicode class support

This commit is contained in:
Patrick Demian 2020-11-03 21:31:47 -05:00
parent ffbd0d72b1
commit 32354d8aaf
4 changed files with 97 additions and 9 deletions

View File

@ -28,6 +28,40 @@ export interface ISemanticError {
message: string
}
/**
 * Unicode general category codes that may be used as a unicode class.
 * Each group lists the one-letter major category followed by its two-letter subcategories.
 */
const unicode_property_codes = [
    // Other
    "C", "Cc", "Cf", "Cn", "Co", "Cs",
    // Letter
    "L", "Ll", "Lm", "Lo", "Lt", "Lu",
    // Mark
    "M", "Mc", "Me", "Mn",
    // Number
    "N", "Nd", "Nl", "No",
    // Punctuation
    "P", "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps",
    // Symbol
    "S", "Sc", "Sk", "Sm", "So",
    // Separator
    "Z", "Zl", "Zp", "Zs"
];
/**
 * Unicode script names that may be used as a unicode class.
 * Multi-word names are spelled with underscores; entries are grouped by initial letter.
 */
const unicode_script_codes = [
    // A
    "Arabic", "Armenian", "Avestan",
    // B
    "Balinese", "Bamum", "Batak", "Bengali", "Bopomofo", "Brahmi", "Braille", "Buginese", "Buhid",
    // C
    "Canadian_Aboriginal", "Carian", "Chakma", "Cham", "Cherokee", "Common", "Coptic", "Cuneiform",
    "Cypriot", "Cyrillic",
    // D
    "Deseret", "Devanagari",
    // E
    "Egyptian_Hieroglyphs", "Ethiopic",
    // G
    "Georgian", "Glagolitic", "Gothic", "Greek", "Gujarati", "Gurmukhi",
    // H
    "Han", "Hangul", "Hanunoo", "Hebrew", "Hiragana",
    // I
    "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi", "Inscriptional_Parthian",
    // J
    "Javanese",
    // K
    "Kaithi", "Kannada", "Katakana", "Kayah_Li", "Kharoshthi", "Khmer",
    // L
    "Lao", "Latin", "Lepcha", "Limbu", "Linear_B", "Lisu", "Lycian", "Lydian",
    // M
    "Malayalam", "Mandaic", "Meetei_Mayek", "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao",
    "Mongolian", "Myanmar",
    // N
    "New_Tai_Lue", "Nko",
    // O
    "Ogham", "Old_Italic", "Old_Persian", "Old_South_Arabian", "Old_Turkic", "Ol_Chiki", "Oriya",
    "Osmanya",
    // P
    "Phags_Pa", "Phoenician",
    // R
    "Rejang", "Runic",
    // S
    "Samaritan", "Saurashtra", "Sharada", "Shavian", "Sinhala", "Sora_Sompeng", "Sundanese",
    "Syloti_Nagri", "Syriac",
    // T
    "Tagalog", "Tagbanwa", "Tai_Le", "Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Telugu", "Thaana",
    "Thai", "Tibetan", "Tifinagh",
    // U
    "Ugaritic",
    // V
    "Vai",
    // Y
    "Yi"
];
/**
* The base concrete syntax tree class
*
@ -102,7 +136,7 @@ export enum UsingFlags {
/**
* Type of match arguments
*
*
* @remarks SingleString means an escaped string
* @remarks Between means a range (ex. a-z)
* @remarks Anything means .
@ -121,7 +155,9 @@ export enum MatchSubStatementType {
Tab,
Linefeed,
Newline,
CarriageReturn
CarriageReturn,
Boundary,
Unicode
}
/**
@ -135,7 +171,7 @@ export class MatchSubStatementValue {
* Constructor for MatchSubStatementValue
*
* @param type the type of this match
* @param from optional range string
* @param from optional value or range string
* @param to optional range string
* @internal
*/
@ -203,14 +239,14 @@ export class MatchSubStatementCST extends H2RCST {
let to = value.to as string;
if (!isSingleRegexCharacter(from)) {
errors.push(this.error("Between statement must begin with a single character"));
errors.push(this.error("Between statement must begin with a single character"));
}
else if (from.startsWith("\\u") || from.startsWith("\\U") || from.startsWith("\\")) {
from = JSON.parse(`"${regexEscape(from)}"`);
}
if (!isSingleRegexCharacter(to)) {
errors.push(this.error("Between statement must end with a single character"));
errors.push(this.error("Between statement must end with a single character"));
}
else if (to.startsWith("\\u") || to.startsWith("\\U") || to.startsWith("\\")) {
to = JSON.parse(`"${regexEscape(to)}"`);
@ -220,6 +256,27 @@ export class MatchSubStatementCST extends H2RCST {
errors.push(this.error("Between statement range invalid"));
}
}
else if (value.type === MatchSubStatementType.Unicode) {
    let unicode_class = value.from as string;

    // a general category code (e.g. "Lu") is accepted as-is;
    // anything else has to be a known script name
    if (!unicode_property_codes.includes(unicode_class)) {
        // Java and C# require script names to be written as "Is*"
        if (language === RegexDialect.DotNet || language === RegexDialect.Java) {
            if (!unicode_class.startsWith("Is")) {
                errors.push(this.error("This dialect requires script names to begin with Is, such as IsCyrillic rather than Cyrillic"));
                continue;
            }

            // strip the "Is" prefix so the bare script name can be looked up.
            // (substr(0, 2) kept only the prefix itself, which rejected every script name)
            unicode_class = unicode_class.slice(2);
        }

        // accept the name with or without "_" characters: compare with all
        // underscores removed on both sides, since String.replace with a string
        // pattern only removes the first occurrence and the known names keep theirs
        const normalized_class = unicode_class.replace(/_/g, "");
        const is_known_script = unicode_script_codes.some((code) => code.replace(/_/g, "") === normalized_class);

        if (!is_known_script) {
            errors.push(this.error(`Unknown unicode specifier ${value.from}`));
        }
    }
}
}
return errors;
@ -238,6 +295,12 @@ export class MatchSubStatementCST extends H2RCST {
case MatchSubStatementType.Between:
str.push(this.invert ? `[^${value.from}-${value.to}]` : `[${value.from}-${value.to}]`);
break;
case MatchSubStatementType.Unicode:
str.push(this.invert ? `\\P{${value.from}}` : `\\p{${value.from}}`);
break;
case MatchSubStatementType.Boundary:
str.push(this.invert ? "\\B" : "\\b");
break;
case MatchSubStatementType.Word:
str.push(this.invert ? "\\W+" : "\\w+");
break;

View File

@ -160,6 +160,7 @@ export class Human2RegexParser extends EmbeddedActionsParser {
let invert: boolean = false;
const values: MatchSubStatementValue[] = [];
let from: string | null = null;
let value: string | null = null;
let to: string | null = null;
let type: MatchSubStatementType = MatchSubStatementType.Anything;
@ -215,17 +216,35 @@ export class Human2RegexParser extends EmbeddedActionsParser {
{ ALT: () => {
const token = $.CONSUME(T.StringLiteral);
tokens.push(token);
from = token.image;
value = token.image;
type = MatchSubStatementType.SingleString;
return new MatchSubStatementValue(type, from);
return new MatchSubStatementValue(type, value);
}},
//unicode
{ ALT: () => {
$.CONSUME(T.Unicode);
const token = $.CONSUME5(T.StringLiteral);
tokens.push(token);
value = token.image;
type = MatchSubStatementType.Unicode;
return new MatchSubStatementValue(type, value);
}},
{ ALT: () => {
tokens.push($.CONSUME(T.Anything));
type = MatchSubStatementType.Anything;
return new MatchSubStatementValue(type);
}},
{ ALT: () => {
tokens.push($.CONSUME(T.Boundary));
type = MatchSubStatementType.Boundary;
return new MatchSubStatementValue(type);
}},
{ ALT: () => {
tokens.push($.CONSUME(T.Word));
type = MatchSubStatementType.Word;

View File

@ -57,6 +57,8 @@ document.addEventListener("DOMContentLoaded", function() {
{token: "builtin", regex: /(any thing|any|anything)(s)?/i},
{token: "operator", regex: /or/i},
{token: "operator", regex: /and|,/i},
{token: "builtin", regex: /unicode( class)?/i},
{token: "builtin", regex: /(word )boundary/i},
{token: "builtin", regex: /word(s)?/i},
{token: "builtin", regex: /digit(s)?/i},
{token: "builtin", regex: /character(s)?/i},

View File

@ -29,18 +29,20 @@ import { createToken, Lexer } from "chevrotain";
/** @internal */ export const Digit = createToken({name: "DigitSpecifier", pattern: /digit(s)?/i});
/** @internal */ export const Character = createToken({name: "CharacterSpecifier", pattern: /character(s)?/i});
/** @internal */ export const Whitespace = createToken({name: "WhitespaceSpecifier", pattern: /(white space|whitespace)(s)?/i});
/** @internal */ export const Boundary = createToken({name: "BoundarySpecifier", pattern: /(word )boundary/i});
/** @internal */ export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
/** @internal */ export const Unicode = createToken({name: "UnicodeSpecifier", pattern: /unicode( class)?/i});
/** @internal */ export const Using = createToken({name: "Using", pattern: /using/i});
/** @internal */ export const Global = createToken({name: "Global", pattern: /global/i});
/** @internal */ export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
/** @internal */ export const Exact = createToken({name: "Exact", pattern: /exact/i});
/** @internal */ export const Matching = createToken({name: "Matching", pattern: /matching/i});
/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i}); //, longer_alt: Nothing});
/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i});
/** @internal */ export const Between = createToken({name: "Between", pattern: /between/i});
/** @internal */ export const Tab = createToken({name: "Tab", pattern: /tab/i});
/** @internal */ export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
/** @internal */ export const Group = createToken({name: "Group", pattern: /group/i});
/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i });
/** @internal */ export const Times = createToken({name: "Times", pattern: /times/i});
/** @internal */ export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
/** @internal */ export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
@ -111,11 +113,13 @@ export const AllTokens = [
Then,
Anything,
And,
Boundary,
Word,
Digit,
Character,
Whitespace,
Number,
Unicode,
/*
Of,
As,