1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

And I broke it

This commit is contained in:
Patrick Demian 2020-10-26 05:08:17 -04:00
parent 0601dc20ca
commit 5e9c185923
2 changed files with 248 additions and 77 deletions

View File

@ -21,7 +21,7 @@
"@typescript-eslint/explicit-function-return-type": "off", "@typescript-eslint/explicit-function-return-type": "off",
"no-magic-numbers": [ "no-magic-numbers": [
"warn", "warn",
{ "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]} { "ignoreArrayIndexes": true, "ignore": [-1,0,1,2,3,4,5,6,7,8,9]}
], ],
"curly": "warn", "curly": "warn",
"no-loss-of-precision": "error", "no-loss-of-precision": "error",
@ -45,7 +45,7 @@
"error", "error",
"always" "always"
], ],
"no-shadow": "error", "no-shadow": "off",
"no-undefined": "error", "no-undefined": "error",
"brace-style": [ "brace-style": [
"error", "error",

View File

@ -1,6 +1,6 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
import { createToken, Lexer } from "chevrotain"; import { createToken, Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain";
export const Zero = createToken({name: "Zero", pattern: /zero/i }); export const Zero = createToken({name: "Zero", pattern: /zero/i });
export const One = createToken({name: "One", pattern: /one/i }); export const One = createToken({name: "One", pattern: /one/i });
@ -17,7 +17,7 @@ export const Ten = createToken({name: "Ten", pattern: /ten/i });
export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i }); export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i });
export const Match = createToken({name: "Match", pattern: /match(es)?/i }); export const Match = createToken({name: "Match", pattern: /match(es)?/i });
export const Then = createToken({name: "Then", pattern: /then/i }); export const Then = createToken({name: "Then", pattern: /then/i });
export const Anything = createToken({name: "Anything", pattern: /(any|anything|any thing)(s)?/i}); export const Anything = createToken({name: "Anything", pattern: /(any thing|any|anything)(s)?/i});
export const Of = createToken({name: "Of", pattern: /of/i}); export const Of = createToken({name: "Of", pattern: /of/i});
export const Or = createToken({name: "Or", pattern: /or/i}); export const Or = createToken({name: "Or", pattern: /or/i});
export const And = createToken({name: "And", pattern: /and|,/i}); export const And = createToken({name: "And", pattern: /and|,/i});
@ -69,8 +69,7 @@ export const OrMore = createToken({name: "Or More", pattern: /\+/ });
export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ }); export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ }); export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
export const Indent = createToken({name: "Indent", pattern: /(( ){4}\t)/ }); export const EndOfLine = createToken({name: "EOL", pattern: /\n/, group: "nl" });
export const EndOfLine = createToken({name: "EOL", pattern: /\n/ });
export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED }); export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED }); export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED }); export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED });
@ -79,78 +78,250 @@ export const Identifier = createToken({name: "Identifier", pattern: /[a-z]\w*/i
export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ }); export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ });
export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i }); export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });
// Discriminator passed to Human2RegexLexer.matchIndentBase so one shared
// matcher can serve both the Indent and the Outdent token definitions.
enum IndentBaseType {
    Indent,
    Outdent
}
// Custom-pattern token emitted when a line's leading whitespace is deeper than
// the previous indentation level (Python-style block structure). The actual
// matching is delegated to Human2RegexLexer.matchIndentBase.
export const Indent = createToken({
    name: "Indent",
    start_chars_hint: [ "\t", " " ],
    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Indent),
    // custom token patterns should explicitly specify the line_breaks option
    line_breaks: false
});
// Custom-pattern token emitted when a line's leading whitespace returns to a
// shallower, previously seen indentation level. Shares its matcher with
// Indent via Human2RegexLexer.matchIndentBase.
export const Outdent = createToken({
    name: "Outdent",
    start_chars_hint: [ "\t", " " ],
    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Outdent),
    // custom token patterns should explicitly specify the line_breaks option
    line_breaks: false
});
export const AllTokens = [ export const AllTokens = [
Zero, Zero,
One, One,
Two, Two,
Three, Three,
Four, Four,
Five, Five,
Six, Six,
Seven, Seven,
Eight, Eight,
Nine, Nine,
Ten, Ten,
Optional, Optional,
Matching, Matching,
Match, Match,
Then, Then,
Anything, Anything,
Of, Of,
Or, Or,
And, And,
Word, Word,
Digit, Digit,
Character, Character,
Whitespace, Whitespace,
Number, Number,
As, As,
If, If,
Start, Start,
With, With,
Ends, Ends,
Otherwise, Otherwise,
Else, Else,
Unless, Unless,
While, While,
More, More,
Using, Using,
Global, Global,
Multiline, Multiline,
Exact, Exact,
Nothing, Nothing,
Not, Not,
Between, Between,
Tab, Tab,
Linefeed, Linefeed,
Group, Group,
By, By,
A, A,
The, The,
Exactly, Exactly,
Inclusive, Inclusive,
Exclusive, Exclusive,
From, From,
Create, Create,
Called, Called,
Repeat, Repeat,
Newline, Newline,
None, None,
Neither, Neither,
CarriageReturn, CarriageReturn,
CaseInsensitive, CaseInsensitive,
CaseSensitive, CaseSensitive,
OrMore, OrMore,
To, To,
Indent, EndOfLine,
EndOfLine, Indent,
WhiteSpace, Outdent,
SingleLineComment, WhiteSpace,
MultilineComment, SingleLineComment,
Identifier, MultilineComment,
NumberLiteral, Identifier,
StringLiteral, NumberLiteral,
StringLiteral,
]; ];
export const Human2RegexLexer = new Lexer(AllTokens, { ensureOptimizations: true }); const H2RLexer = new Lexer(AllTokens, { ensureOptimizations: true });
// Which characters count as indentation for the lexer: tabs only, spaces
// only, or either. Currently only consumed by commented-out code in
// Human2RegexLexer.tokenize — TODO confirm intended use.
export enum IndentType {
    Tabs,
    Spaces,
    Both
}
/**
 * Configuration for Human2RegexLexer.tokenize.
 *
 * @param type which characters count as indentation (defaults to Both)
 * @param spaces_per_tab how many spaces equal one tab stop (defaults to 4)
 */
export class Human2RegexLexerOptions {
    public type: IndentType;
    public spaces_per_tab: number;

    constructor(type: IndentType = IndentType.Both, spaces_per_tab: number = 4) {
        this.type = type;
        this.spaces_per_tab = spaces_per_tab;
    }
}
/**
 * Wrapper around the plain chevrotain lexer (H2RLexer) that synthesizes
 * Indent/Outdent tokens from changes in leading whitespace.
 *
 * Taken and adapted from https://github.com/SAP/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
 */
export class Human2RegexLexer {
    // State required for matching the indentations.
    // NOTE(review): all state is static, so two interleaved tokenize() calls
    // would corrupt each other's indent stack — confirm single-use/single-thread.
    private static options = new Human2RegexLexerOptions();
    private static indentStack = [ 0 ];
    private static wsRegExp: RegExp;
    private static spacesPerTab = " ";

    /**
     * Returns the index of the LAST element of `array` satisfying `predicate`,
     * or -1 if no element does.
     */
    private static findLastIndex<T>(array: T[], predicate: (x: T) => boolean) : number {
        // BUGFIX: start at the last valid index (length - 1). The original
        // started at array.length, invoking the predicate on undefined.
        for (let index = array.length - 1; index >= 0; index--) {
            if (predicate(array[index])) {
                return index;
            }
        }
        return -1;
    }

    /**
     * This custom Token matcher uses Lexer context ("matchedTokens" and "groups" arguments)
     * combined with state via closure ("indentStack" and "lastTextMatched") to match indentation.
     *
     * @param text          full input being lexed
     * @param offset        current lexing position within `text`
     * @param matchedTokens tokens emitted so far (chevrotain-provided)
     * @param groups        token groups emitted so far; `groups.nl` holds newlines
     * @param type          whether this call is matching an Indent or an Outdent
     * @returns a RegExpExecArray when an indent/outdent is matched, else null
     * @throws Error when an outdent does not line up with any prior indent level
     */
    public static matchIndentBase(text: string, offset: number, matchedTokens: IToken[], groups: {[groupName: string]: IToken[]}, type: IndentBaseType) : RegExpExecArray | null {
        const noTokensMatchedYet = !matchedTokens.length;
        const newLines = groups.nl;
        const noNewLinesMatchedYet = !newLines.length;
        const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet;
        const isStartOfLine =
            // only newlines matched so far
            (noTokensMatchedYet && !noNewLinesMatchedYet) ||
            // Both newlines and other Tokens have been matched AND the offset is just after the last newline
            (!noTokensMatchedYet &&
             !noNewLinesMatchedYet &&
             offset === newLines[newLines.length-1].startOffset + 1);

        // indentation can only be matched at the start of a line.
        if (isFirstLine || isStartOfLine) {
            let currIndentLevel: number = -1;

            // Sticky regex: anchor the whitespace match at the current offset.
            Human2RegexLexer.wsRegExp.lastIndex = offset;
            const match = Human2RegexLexer.wsRegExp.exec(text);

            // possible non-empty indentation
            if (match !== null) {
                currIndentLevel = match[0].length;

                //if (this.options.type === IndentType.Tabs) {
                //    currIndentLevel = match[0].length;
                //}
                //else {
                //    currIndentLevel = match[0].replace(Human2RegexLexer.spacesPerTab, "\t").length;
                //}
            }
            // "empty" indentation means indentLevel of 0.
            else {
                currIndentLevel = 0;
            }

            const prevIndentLevel = this.indentStack[this.indentStack.length-1];

            // deeper indentation
            if (currIndentLevel > prevIndentLevel && type === IndentBaseType.Indent) {
                this.indentStack.push(currIndentLevel);
                return match;
            }
            // shallower indentation
            else if (currIndentLevel < prevIndentLevel && type === IndentBaseType.Outdent) {
                const matchIndentIndex = this.findLastIndex(this.indentStack, (stackIndentDepth) => stackIndentDepth === currIndentLevel);

                // any outdent must match some previous indentation level.
                if (matchIndentIndex === -1) {
                    throw Error(`invalid outdent at offset: ${offset}`);
                }

                const numberOfDedents = this.indentStack.length - matchIndentIndex - 1;

                // This is a little tricky
                // 1. If there is no match (0 level indent) then this custom token
                //    matcher would return "null" and so we need to add all the required outdents ourselves.
                // 2. If there was a match (> 0 level indent) then we need to add one fewer outdent
                //    because the lexer will create one due to this matcher returning a non-null result.
                const iStart = match !== null ? 1 : 0;
                for (let i = iStart; i < numberOfDedents; i++) {
                    this.indentStack.pop();
                    matchedTokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
                }

                // even though we are adding fewer outdents directly we still need to update the indent stack fully.
                if (iStart === 1) {
                    this.indentStack.pop();
                }
                return match;
            }
            else {
                // same indent, this should be lexed as simple whitespace and ignored
                return null;
            }
        }
        else {
            // indentation cannot be matched under other circumstances
            return null;
        }
    }

    /**
     * Tokenizes `text` with H2RLexer, synthesizing Indent/Outdent tokens and
     * appending any Outdents still open at end of input.
     *
     * @param text    the input to tokenize
     * @param options optional lexer configuration; null keeps the current one
     * @returns the chevrotain lexing result with balanced Outdents appended
     */
    public static tokenize(text: string, options: Human2RegexLexerOptions | null = null) : ILexingResult {
        // have to reset the indent stack between processing of different text inputs
        Human2RegexLexer.indentStack = [ 0 ];

        if (options !== null) {
            // BUGFIX: store the caller-supplied options. The original assigned
            // this.options to itself, silently discarding the argument.
            Human2RegexLexer.options = options;
        }

        /*
        if (this.options.type === IndentType.Tabs) {
            Human2RegexLexer.wsRegExp = /\t/y;
        }
        else {
            let reg = ` {${this.options.spaces_per_tab}}`;

            if (this.options.type === IndentType.Both) {
                reg += "|\\t";
            }

            Human2RegexLexer.wsRegExp = new RegExp(reg, "y");
            Human2RegexLexer.spacesPerTab = Array(this.options.spaces_per_tab+1).join(" ");
        }*/
        Human2RegexLexer.wsRegExp = / +/y;

        const lexResult = H2RLexer.tokenize(text);

        // add remaining Outdents
        while (Human2RegexLexer.indentStack.length > 1) {
            lexResult.tokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
            Human2RegexLexer.indentStack.pop();
        }

        return lexResult;
    }
}