Mirror of https://github.com/pdemian/human2regex.git (synced 2025-05-16 12:30:09 -07:00)

Commit 5e9c185923 ("And I broke it"), parent 0601dc20ca
@@ -21,7 +21,7 @@
         "@typescript-eslint/explicit-function-return-type": "off",
         "no-magic-numbers": [
             "warn",
-            { "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]}
+            { "ignoreArrayIndexes": true, "ignore": [-1,0,1,2,3,4,5,6,7,8,9]}
         ],
         "curly": "warn",
         "no-loss-of-precision": "error",
@@ -45,7 +45,7 @@
             "error",
             "always"
         ],
-        "no-shadow": "error",
+        "no-shadow": "off",
         "no-undefined": "error",
         "brace-style": [
             "error",
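These two relaxations line up with the tokenizer rework later in this commit: the new indentation code uses -1 as a "not found" sentinel, and no-shadow is switched from an error to off. A small TypeScript sketch (hypothetical names, not taken from the repository) of code the old settings would flag but the new ones accept:

// -1 now sits in the no-magic-numbers ignore list, so a sentinel return
// no longer needs a lint suppression:
function lastMatching(levels: number[], wanted: number): number {
    for (let i = levels.length - 1; i >= 0; i--) {
        if (levels[i] === wanted) {
            return i;
        }
    }
    return -1;
}

// With no-shadow off, an inner binding may reuse an outer name without a
// lint error (the parameter below shadows the outer constant):
const level = lastMatching([0, 4, 8], 4);
const describe = (level: number): string => `indent level ${level}`;
console.log(describe(level));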
src/tokenizer.ts (321 changed lines)
@@ -1,6 +1,6 @@
 /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
 
-import { createToken, Lexer } from "chevrotain";
+import { createToken, Lexer, IToken, createTokenInstance, ILexingResult } from "chevrotain";
 
 export const Zero = createToken({name: "Zero", pattern: /zero/i });
 export const One = createToken({name: "One", pattern: /one/i });
@@ -17,7 +17,7 @@ export const Ten = createToken({name: "Ten", pattern: /ten/i });
 export const Optional = createToken({name: "Optional", pattern: /optional(ly)?/i });
 export const Match = createToken({name: "Match", pattern: /match(es)?/i });
 export const Then = createToken({name: "Then", pattern: /then/i });
-export const Anything = createToken({name: "Anything", pattern: /(any|anything|any thing)(s)?/i});
+export const Anything = createToken({name: "Anything", pattern: /(any thing|any|anything)(s)?/i});
 export const Of = createToken({name: "Of", pattern: /of/i});
 export const Or = createToken({name: "Or", pattern: /or/i});
 export const And = createToken({name: "And", pattern: /and|,/i});
@@ -69,8 +69,7 @@ export const OrMore = createToken({name: "Or More", pattern: /\+/ });
 export const LBracket = createToken({name: "Left Bracket", pattern: /\(/ });
 export const RBracket = createToken({name: "Right Bracket", pattern: /\)/ });
 
-export const Indent = createToken({name: "Indent", pattern: /(( ){4}\t)/ });
-export const EndOfLine = createToken({name: "EOL", pattern: /\n/ });
+export const EndOfLine = createToken({name: "EOL", pattern: /\n/, group: "nl" });
 export const WhiteSpace = createToken({name: "Whitespace", pattern: /\s+/, group: Lexer.SKIPPED });
 export const SingleLineComment = createToken({name: "Single-Line Comment", pattern: /(#|\/\/).*/, group: Lexer.SKIPPED });
 export const MultilineComment = createToken({name: "Multi-Line Comment", pattern: /\/\*(.*)\*\//, line_breaks: true, group: Lexer.SKIPPED });
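Routing EndOfLine into the named "nl" group is what lets the indentation matcher added in the next hunk find line starts: chevrotain collects tokens created with a group name into the lexing result's groups map (and passes that map to custom pattern functions) rather than the main token stream. A minimal sketch of that mechanism, using throwaway tokens rather than the ones defined here:

import { createToken, Lexer } from "chevrotain";

// A newline token routed to the named "nl" group instead of the main token stream.
const Newline = createToken({ name: "Newline", pattern: /\n/, group: "nl" });
const Word = createToken({ name: "Word", pattern: /[a-z]+/i });
const Space = createToken({ name: "Space", pattern: / +/, group: Lexer.SKIPPED });

const lexer = new Lexer([ Newline, Word, Space ]);
const result = lexer.tokenize("one two\nthree");

// Words appear in result.tokens; the newline only appears in result.groups.nl.
console.log(result.tokens.map((t) => t.image));              // [ "one", "two", "three" ]
console.log(result.groups["nl"].map((t) => t.startOffset));  // [ 7 ]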
@@ -79,78 +78,250 @@ export const Identifier = createToken({name: "Identifier", pattern: /[a-z]\w*/i
 export const NumberLiteral = createToken({name: "Number Literal", pattern: /-?(0|[1-9]\d*)(\.\d+)?([eE][+-]?\d+)?/ });
 export const StringLiteral = createToken({name: "String Literal", pattern: /"(?:[^\\"]|\\(?:[bfnrtv"\\/]|u[0-9a-f]{4}|U[0-9a-f]{8}))*"/i });
 
+enum IndentBaseType {
+    Indent,
+    Outdent
+}
+
+export const Indent = createToken({
+    name: "Indent",
+    start_chars_hint: [ "\t", " " ],
+    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Indent),
+    // custom token patterns should explicitly specify the line_breaks option
+    line_breaks: false
+});
+
+export const Outdent = createToken({
+    name: "Outdent",
+    start_chars_hint: [ "\t", " " ],
+    pattern: (text, offset, matchedTokens, groups) => Human2RegexLexer.matchIndentBase(text, offset, matchedTokens, groups, IndentBaseType.Outdent),
+    // custom token patterns should explicitly specify the line_breaks option
+    line_breaks: false
+});
+
 export const AllTokens = [
     Zero,
     One,
     Two,
     Three,
     Four,
     Five,
     Six,
     Seven,
     Eight,
     Nine,
     Ten,
     Optional,
     Matching,
     Match,
     Then,
     Anything,
     Of,
     Or,
     And,
     Word,
     Digit,
     Character,
     Whitespace,
     Number,
     As,
     If,
     Start,
     With,
     Ends,
     Otherwise,
     Else,
     Unless,
     While,
     More,
     Using,
     Global,
     Multiline,
     Exact,
     Nothing,
     Not,
     Between,
     Tab,
     Linefeed,
     Group,
     By,
     A,
     The,
     Exactly,
     Inclusive,
     Exclusive,
     From,
     Create,
     Called,
     Repeat,
     Newline,
     None,
     Neither,
     CarriageReturn,
     CaseInsensitive,
     CaseSensitive,
     OrMore,
     To,
-    Indent,
     EndOfLine,
+    Indent,
+    Outdent,
     WhiteSpace,
     SingleLineComment,
     MultilineComment,
     Identifier,
     NumberLiteral,
     StringLiteral,
 ];
 
-export const Human2RegexLexer = new Lexer(AllTokens, { ensureOptimizations: true });
+const H2RLexer = new Lexer(AllTokens, { ensureOptimizations: true });
+
+export enum IndentType {
+    Tabs,
+    Spaces,
+    Both
+}
+
+export class Human2RegexLexerOptions {
+    constructor(public type: IndentType = IndentType.Both, public spaces_per_tab: number = 4) {
+        /* empty */
+    }
+}
+
+export class Human2RegexLexer {
+    //Taken and adapted from https://github.com/SAP/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
+
+    // State required for matching the indentations
+    private static options = new Human2RegexLexerOptions();
+    private static indentStack = [ 0 ];
+    private static wsRegExp: RegExp;
+    private static spacesPerTab = " ";
+
+    private static findLastIndex<T>(array: T[], predicate: (x: T) => boolean) : number {
+        for (let index = array.length; index >= 0; index--) {
+            if (predicate(array[index])) {
+                return index;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * This custom Token matcher uses Lexer context ("matchedTokens" and "groups" arguments)
+     * combined with state via closure ("indentStack" and "lastTextMatched") to match indentation.
+     */
+    public static matchIndentBase(text: string, offset: number, matchedTokens: IToken[], groups: {[groupName: string]: IToken[]}, type: IndentBaseType) : RegExpExecArray | null {
+        const noTokensMatchedYet = !matchedTokens.length;
+        const newLines = groups.nl;
+        const noNewLinesMatchedYet = !newLines.length;
+        const isFirstLine = noTokensMatchedYet && noNewLinesMatchedYet;
+        const isStartOfLine =
+            // only newlines matched so far
+            (noTokensMatchedYet && !noNewLinesMatchedYet) ||
+            // Both newlines and other Tokens have been matched AND the offset is just after the last newline
+            (!noTokensMatchedYet &&
+             !noNewLinesMatchedYet &&
+             offset === newLines[newLines.length-1].startOffset + 1);
+
+        // indentation can only be matched at the start of a line.
+        if (isFirstLine || isStartOfLine) {
+            let currIndentLevel: number = -1;
+
+            Human2RegexLexer.wsRegExp.lastIndex = offset;
+            const match = Human2RegexLexer.wsRegExp.exec(text);
+
+            // possible non-empty indentation
+            if (match !== null) {
+                currIndentLevel = match[0].length;
+                //if (this.options.type === IndentType.Tabs) {
+                //    currIndentLevel = match[0].length;
+                //}
+                //else {
+                //    currIndentLevel = match[0].replace(Human2RegexLexer.spacesPerTab, "\t").length;
+                //}
+            }
+            // "empty" indentation means indentLevel of 0.
+            else {
+                currIndentLevel = 0;
+            }
+
+            const prevIndentLevel = this.indentStack[this.indentStack.length-1];
+            // deeper indentation
+            if (currIndentLevel > prevIndentLevel && type === IndentBaseType.Indent) {
+                this.indentStack.push(currIndentLevel);
+                return match;
+            }
+            // shallower indentation
+            else if (currIndentLevel < prevIndentLevel && type === IndentBaseType.Outdent) {
+                const matchIndentIndex = this.findLastIndex(this.indentStack, (stackIndentDepth) => stackIndentDepth === currIndentLevel);
+
+                // any outdent must match some previous indentation level.
+                if (matchIndentIndex === -1) {
+                    throw Error(`invalid outdent at offset: ${offset}`);
+                }
+
+                const numberOfDedents = this.indentStack.length - matchIndentIndex - 1;
+
+                // This is a little tricky
+                // 1. If there is no match (0 level indent) than this custom token
+                //    matcher would return "null" and so we need to add all the required outdents ourselves.
+                // 2. If there was match (> 0 level indent) than we need to add minus one number of outsents
+                //    because the lexer would create one due to returning a none null result.
+                const iStart = match !== null ? 1 : 0;
+                for (let i = iStart; i < numberOfDedents; i++) {
+                    this.indentStack.pop();
+                    matchedTokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
+                }
+
+                // even though we are adding fewer outdents directly we still need to update the indent stack fully.
+                if (iStart === 1) {
+                    this.indentStack.pop();
+                }
+                return match;
+            }
+            else {
+                // same indent, this should be lexed as simple whitespace and ignored
+                return null;
+            }
+        }
+        else {
+            // indentation cannot be matched under other circumstances
+            return null;
+        }
+    }
+
+    public static tokenize(text: string, options: Human2RegexLexerOptions | null = null) : ILexingResult{
+        // have to reset the indent stack between processing of different text inputs
+        Human2RegexLexer.indentStack = [ 0 ];
+
+        if (options !== null) {
+            Human2RegexLexer.options = this.options;
+        }
+
+        /*
+        if (this.options.type === IndentType.Tabs) {
+            Human2RegexLexer.wsRegExp = /\t/y;
+        }
+        else {
+            let reg = ` {${this.options.spaces_per_tab}}`;
+
+            if (this.options.type === IndentType.Both) {
+                reg += "|\\t";
+            }
+
+            Human2RegexLexer.wsRegExp = new RegExp(reg, "y");
+
+            Human2RegexLexer.spacesPerTab = Array(this.options.spaces_per_tab+1).join(" ");
+        }*/
+        Human2RegexLexer.wsRegExp = / +/y;
+
+        const lexResult = H2RLexer.tokenize(text);
+
+        //add remaining Outdents
+        while (Human2RegexLexer.indentStack.length > 1) {
+            lexResult.tokens.push(createTokenInstance(Outdent, "", NaN, NaN, NaN, NaN, NaN, NaN));
+            Human2RegexLexer.indentStack.pop();
+        }
+
+        return lexResult;
+    }
+}
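After this change the lexer is driven through the static Human2RegexLexer.tokenize entry point rather than an exported chevrotain Lexer instance. A hedged usage sketch, assuming a caller module next to src/tokenizer.ts; the input string and variable names are illustrative only:

import { Human2RegexLexer, Human2RegexLexerOptions, IndentType } from "./tokenizer";

// Illustrative Human2Regex input; any source text is handled the same way.
const source = "create a group called year\n    match 4 digits\n";

// Options are accepted but, as of this commit, the indentation regex is
// hard-coded to / +/y (the configurable block above is commented out).
const result = Human2RegexLexer.tokenize(source, new Human2RegexLexerOptions(IndentType.Both, 4));

if (result.errors.length > 0) {
    console.error(result.errors);
}
else {
    // Indent/Outdent tokens are synthesized by matchIndentBase and appear in the
    // main token stream; EndOfLine tokens are collected in result.groups.nl.
    for (const token of result.tokens) {
        console.log(token.tokenType.name, JSON.stringify(token.image));
    }
}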