mirror of
https://github.com/pdemian/human2regex.git
synced 2025-05-15 20:10:19 -07:00
Added boundary and unicode class support
This commit is contained in:
parent
ffbd0d72b1
commit
32354d8aaf
@ -28,6 +28,40 @@ export interface ISemanticError {
|
||||
message: string
|
||||
}
|
||||
|
||||
/**
 * Short Unicode property codes (for example "Lu" or "Nd") that a Unicode
 * match specifier may use directly.
 *
 * The Unicode match validation accepts a specifier found in this list without
 * any further checks; only specifiers missing from it go on to the script-name
 * checks.
 *
 * NOTE(review): these look like the Unicode General_Category abbreviations -
 * confirm against the Unicode property value aliases before extending the list.
 */
const unicode_property_codes = [
|
||||
"C", "Cc", "Cf", "Cn", "Co", "Cs",
|
||||
"L", "Ll", "Lm", "Lo", "Lt", "Lu",
|
||||
"M", "Mc", "Me", "Mn", "N", "Nd",
|
||||
"Nl", "No", "P", "Pc", "Pd", "Pe",
|
||||
"Pf", "Pi", "Po", "Ps", "S", "Sc",
|
||||
"Sk", "Sm", "So", "Z", "Zl", "Zp",
|
||||
"Zs"
|
||||
];
|
||||
|
||||
/**
 * Unicode script names that a Unicode match specifier may refer to.
 *
 * The Unicode match validation consults this list only when the specifier is
 * not one of the property codes; a specifier that matches neither list is
 * reported as an unknown unicode specifier.
 *
 * Multi-word names are stored with "_" between the words
 * (for example "Old_South_Arabian").
 */
const unicode_script_codes = [
|
||||
"Arabic", "Armenian", "Avestan", "Balinese", "Bamum",
|
||||
"Batak", "Bengali", "Bopomofo", "Brahmi", "Braille",
|
||||
"Buginese", "Buhid", "Canadian_Aboriginal", "Carian", "Chakma",
|
||||
"Cham", "Cherokee", "Common", "Coptic", "Cuneiform",
|
||||
"Cypriot", "Cyrillic", "Deseret", "Devanagari", "Egyptian_Hieroglyphs",
|
||||
"Ethiopic", "Georgian", "Glagolitic", "Gothic", "Greek",
|
||||
"Gujarati", "Gurmukhi", "Han", "Hangul", "Hanunoo", "Hebrew",
|
||||
"Hiragana", "Imperial_Aramaic", "Inherited", "Inscriptional_Pahlavi",
|
||||
"Inscriptional_Parthian", "Javanese", "Kaithi", "Kannada", "Katakana",
|
||||
"Kayah_Li", "Kharoshthi", "Khmer", "Lao", "Latin", "Lepcha", "Limbu",
|
||||
"Linear_B", "Lisu", "Lycian", "Lydian", "Malayalam", "Mandaic",
|
||||
"Meetei_Mayek", "Meroitic_Cursive", "Meroitic_Hieroglyphs", "Miao",
|
||||
"Mongolian", "Myanmar", "New_Tai_Lue", "Nko", "Ogham", "Old_Italic",
|
||||
"Old_Persian", "Old_South_Arabian", "Old_Turkic", "Ol_Chiki", "Oriya",
|
||||
"Osmanya", "Phags_Pa", "Phoenician", "Rejang", "Runic", "Samaritan",
|
||||
"Saurashtra", "Sharada", "Shavian", "Sinhala", "Sora_Sompeng",
|
||||
"Sundanese", "Syloti_Nagri", "Syriac", "Tagalog", "Tagbanwa", "Tai_Le",
|
||||
"Tai_Tham", "Tai_Viet", "Takri", "Tamil", "Telugu", "Thaana", "Thai",
|
||||
"Tibetan", "Tifinagh", "Ugaritic", "Vai", "Yi"
|
||||
];
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* The base concrete syntax tree class
|
||||
*
|
||||
@ -102,7 +136,7 @@ export enum UsingFlags {
|
||||
|
||||
/**
|
||||
* Type of match arguments
|
||||
*
|
||||
*
|
||||
* @remarks SingleString means an escaped string
|
||||
* @remarks Between means a range (ex. a-z)
|
||||
* @remarks Anything means .
|
||||
@ -121,7 +155,9 @@ export enum MatchSubStatementType {
|
||||
Tab,
|
||||
Linefeed,
|
||||
Newline,
|
||||
CarriageReturn
|
||||
CarriageReturn,
|
||||
Boundary,
|
||||
Unicode
|
||||
}
|
||||
|
||||
/**
|
||||
@ -135,7 +171,7 @@ export class MatchSubStatementValue {
|
||||
* Constructor for MatchSubStatementValue
|
||||
*
|
||||
* @param type the type of this match
|
||||
* @param from optional range string
|
||||
* @param from optional value or range string
|
||||
* @param to optional range string
|
||||
* @internal
|
||||
*/
|
||||
@ -203,14 +239,14 @@ export class MatchSubStatementCST extends H2RCST {
|
||||
let to = value.to as string;
|
||||
|
||||
if (!isSingleRegexCharacter(from)) {
|
||||
errors.push(this.error("Between statement must begin with a single character"));
|
||||
errors.push(this.error("Between statement must begin with a single character"));
|
||||
}
|
||||
else if (from.startsWith("\\u") || from.startsWith("\\U") || from.startsWith("\\")) {
|
||||
from = JSON.parse(`"${regexEscape(from)}"`);
|
||||
}
|
||||
|
||||
if (!isSingleRegexCharacter(to)) {
|
||||
errors.push(this.error("Between statement must end with a single character"));
|
||||
errors.push(this.error("Between statement must end with a single character"));
|
||||
}
|
||||
else if (to.startsWith("\\u") || to.startsWith("\\U") || to.startsWith("\\")) {
|
||||
to = JSON.parse(`"${regexEscape(to)}"`);
|
||||
@ -220,6 +256,27 @@ export class MatchSubStatementCST extends H2RCST {
|
||||
errors.push(this.error("Between statement range invalid"));
|
||||
}
|
||||
}
|
||||
else if (value.type === MatchSubStatementType.Unicode) {
    // Validates a Unicode match specifier: it must be either a known property
    // code or a known script name. `value.from` itself is left untouched; only
    // this local copy is normalised for the lookup.
    let unicode_class = value.from as string;

    // Property codes (e.g. "Lu") are accepted as-is; anything else has to be
    // a supported script name.
    if (!unicode_property_codes.includes(unicode_class)) {
        // Java and C# require script names to be written as "Is*"
        if (language === RegexDialect.DotNet || language === RegexDialect.Java) {
            if (!unicode_class.startsWith("Is")) {
                errors.push(this.error("This dialect requires script names to begin with Is, such as IsCyrillic rather than Cyrillic"));
                // Nothing more to check for this value; move on to the next one
                // (relies on the enclosing loop over the match values).
                continue;
            }

            // Strip the "Is" prefix so the remainder can be looked up in the
            // script list. The previous substr(0, 2) kept only the prefix
            // itself, so every correctly prefixed name was rejected.
            unicode_class = unicode_class.slice(2);
        }

        // Accept the name with or without "_" separators: remove every "_"
        // from both the given name and each known script before comparing
        // (so "Old_South_Arabian" and "OldSouthArabian" are both recognised).
        const normalized_class = unicode_class.replace(/_/g, "");
        const is_known_script = unicode_script_codes.some((code) => code.replace(/_/g, "") === normalized_class);

        if (!is_known_script) {
            errors.push(this.error(`Unknown unicode specifier ${value.from}`));
        }
    }
}
|
||||
}
|
||||
|
||||
return errors;
|
||||
@ -238,6 +295,12 @@ export class MatchSubStatementCST extends H2RCST {
|
||||
case MatchSubStatementType.Between:
|
||||
str.push(this.invert ? `[^${value.from}-${value.to}]` : `[${value.from}-${value.to}]`);
|
||||
break;
|
||||
case MatchSubStatementType.Unicode:
|
||||
str.push(this.invert ? `\\P{${value.from}}` : `\\p{${value.from}}`);
|
||||
break;
|
||||
case MatchSubStatementType.Boundary:
|
||||
str.push(this.invert ? "\\B" : "\\b");
|
||||
break;
|
||||
case MatchSubStatementType.Word:
|
||||
str.push(this.invert ? "\\W+" : "\\w+");
|
||||
break;
|
||||
|
@ -160,6 +160,7 @@ export class Human2RegexParser extends EmbeddedActionsParser {
|
||||
let invert: boolean = false;
|
||||
const values: MatchSubStatementValue[] = [];
|
||||
let from: string | null = null;
|
||||
let value: string | null = null;
|
||||
let to: string | null = null;
|
||||
let type: MatchSubStatementType = MatchSubStatementType.Anything;
|
||||
|
||||
@ -215,17 +216,35 @@ export class Human2RegexParser extends EmbeddedActionsParser {
|
||||
{ ALT: () => {
|
||||
const token = $.CONSUME(T.StringLiteral);
|
||||
tokens.push(token);
|
||||
from = token.image;
|
||||
value = token.image;
|
||||
type = MatchSubStatementType.SingleString;
|
||||
|
||||
return new MatchSubStatementValue(type, from);
|
||||
return new MatchSubStatementValue(type, value);
|
||||
}},
|
||||
|
||||
//unicode
|
||||
{ ALT: () => {
|
||||
$.CONSUME(T.Unicode);
|
||||
const token = $.CONSUME5(T.StringLiteral);
|
||||
tokens.push(token);
|
||||
value = token.image;
|
||||
type = MatchSubStatementType.Unicode;
|
||||
|
||||
return new MatchSubStatementValue(type, value);
|
||||
}},
|
||||
|
||||
{ ALT: () => {
|
||||
tokens.push($.CONSUME(T.Anything));
|
||||
type = MatchSubStatementType.Anything;
|
||||
|
||||
return new MatchSubStatementValue(type);
|
||||
}},
|
||||
{ ALT: () => {
|
||||
tokens.push($.CONSUME(T.Boundary));
|
||||
type = MatchSubStatementType.Boundary;
|
||||
|
||||
return new MatchSubStatementValue(type);
|
||||
}},
|
||||
{ ALT: () => {
|
||||
tokens.push($.CONSUME(T.Word));
|
||||
type = MatchSubStatementType.Word;
|
||||
|
@ -57,6 +57,8 @@ document.addEventListener("DOMContentLoaded", function() {
|
||||
{token: "builtin", regex: /(any thing|any|anything)(s)?/i},
|
||||
{token: "operator", regex: /or/i},
|
||||
{token: "operator", regex: /and|,/i},
|
||||
{token: "builtin", regex: /unicode( class)?/i},
|
||||
{token: "builtin", regex: /(word )boundary/i},
|
||||
{token: "builtin", regex: /word(s)?/i},
|
||||
{token: "builtin", regex: /digit(s)?/i},
|
||||
{token: "builtin", regex: /character(s)?/i},
|
||||
|
@ -29,18 +29,20 @@ import { createToken, Lexer } from "chevrotain";
|
||||
/** @internal */ export const Digit = createToken({name: "DigitSpecifier", pattern: /digit(s)?/i});
|
||||
/** @internal */ export const Character = createToken({name: "CharacterSpecifier", pattern: /character(s)?/i});
|
||||
/** @internal */ export const Whitespace = createToken({name: "WhitespaceSpecifier", pattern: /(white space|whitespace)(s)?/i});
|
||||
/** @internal */ export const Boundary = createToken({name: "BoundarySpecifier", pattern: /(word )boundary/i});
|
||||
/** @internal */ export const Number = createToken({name: "NumberSpecifier", pattern: /number(s)?/i});
|
||||
/** @internal */ export const Unicode = createToken({name: "UnicodeSpecifier", pattern: /unicode( class)?/i});
|
||||
/** @internal */ export const Using = createToken({name: "Using", pattern: /using/i});
|
||||
/** @internal */ export const Global = createToken({name: "Global", pattern: /global/i});
|
||||
/** @internal */ export const Multiline = createToken({name: "Multiline", pattern: /(multi line|multiline)/i});
|
||||
/** @internal */ export const Exact = createToken({name: "Exact", pattern: /exact/i});
|
||||
/** @internal */ export const Matching = createToken({name: "Matching", pattern: /matching/i});
|
||||
/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i}); //, longer_alt: Nothing});
|
||||
/** @internal */ export const Not = createToken({name: "Not", pattern: /not/i});
|
||||
/** @internal */ export const Between = createToken({name: "Between", pattern: /between/i});
|
||||
/** @internal */ export const Tab = createToken({name: "Tab", pattern: /tab/i});
|
||||
/** @internal */ export const Linefeed = createToken({name: "Linefeed", pattern: /(line feed|linefeed)/i});
|
||||
/** @internal */ export const Group = createToken({name: "Group", pattern: /group/i});
|
||||
/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i }); //, longer_alt: Anything});
|
||||
/** @internal */ export const A = createToken({name: "A", pattern: /a(n)?/i });
|
||||
/** @internal */ export const Times = createToken({name: "Times", pattern: /times/i});
|
||||
/** @internal */ export const Exactly = createToken({name: "Exactly", pattern: /exact(ly)?/i});
|
||||
/** @internal */ export const Inclusive = createToken({name: "Inclusive", pattern: /inclusive(ly)?/i});
|
||||
@ -111,11 +113,13 @@ export const AllTokens = [
|
||||
Then,
|
||||
Anything,
|
||||
And,
|
||||
Boundary,
|
||||
Word,
|
||||
Digit,
|
||||
Character,
|
||||
Whitespace,
|
||||
Number,
|
||||
Unicode,
|
||||
/*
|
||||
Of,
|
||||
As,
|
||||
|
Loading…
x
Reference in New Issue
Block a user