mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

Fixed tokenizer

Just 1 more commit and I think I'll be done with it
Patrick Demian 2020-10-13 05:04:44 -04:00
parent 9f46d1246c
commit 88c5b203fd
4 changed files with 179 additions and 61 deletions

src/ast.ts (new file)
@@ -12,7 +12,46 @@ $(function() {
     */
    const opts = new TokenizerOptions();
-   const result = tokenize("match /* 9+ */ 1+ optionally 1..3 0-zero then //comment match", opts);
+   const result = tokenize(`
+// H2R supports // # and /**/ as comments
+// A group is only captured if given a name.
+// You can use "and", "or", "not" to specify "[]" regex
+// You can use "then" to combine match statements, however I find using multiple "match" statements easier to read
+// exact matching means use a ^ and $ to signify the start and end of the string
+using global and exact matching
+create an optional group called "protocol"
+    match "http"
+    optionally match "s"
+    match "://"
+create a group called "subdomain"
+    repeat
+        match 1+ words
+        match "."
+create a group called "domain"
+    match 1+ words or "_" or "-"
+    match "."
+    match a word
+# port, but we don't care about it, so ignore it
+optionally match ":" then 0+ digits
+create an optional group called "path"
+    repeat
+        match "/"
+        match 0+ words or "_" or "-"
+create an optional group
+    # we don't want to capture the '?', so don't name the group until afterwards
+    match "?"
+    create a group called "query"
+        repeat
+            match 1+ words or "_" or "-"
+            match "="
+            match 1+ words or "_" or "-"
+create an optional group
+    # fragment, again, we don't care, so ignore everything afterwards
+    match "#"
+    match 0+ any thing
+`, opts);
    for(const r of result.tokens) {
        console.log(r.to_string());
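For orientation, here is a minimal sketch (illustration only, not part of the commit) of how a single statement from that sample program should tokenize, given the keyword tables introduced in the tokenizer below. The `demo` variable is hypothetical, and the exact token for "+" is not shown in this diff:

```typescript
// One line of the sample program and the token stream the new tokenizer
// should roughly produce for it, per the keyword tables below.
const demo = tokenize(`optionally match ":" then 0+ digits`, new TokenizerOptions());
// Expected (roughly): KEYWORD_OPTIONAL, KEYWORD_MATCH, QUOTE(":"),
// KEYWORD_THEN, NUMBER("0"), whatever token "+" lexes to (not shown in
// this hunk), and KEYWORD_DIGIT_SPECIFIER.
for (const token of demo.tokens) {
    console.log(token.to_string());
}
```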
@@ -5,12 +5,16 @@
 import { Token, TokenType, TokenError } from "./tokens";

 const keywords = {
+    /* Full Keywords */
     "optional": TokenType.KEYWORD_OPTIONAL,
     "optionally": TokenType.KEYWORD_OPTIONAL,
     "match": TokenType.KEYWORD_MATCH,
+    "matches": TokenType.KEYWORD_MATCH,
     "then": TokenType.KEYWORD_THEN,
     "any": TokenType.KEYWORD_ANY,
     "anything": TokenType.KEYWORD_ANY,
+    "anythings": TokenType.KEYWORD_ANY,
     "of": TokenType.KEYWORD_OF,
     "or": TokenType.KEYWORD_OR,
     "and": TokenType.KEYWORD_AND,
@@ -19,9 +23,15 @@ const keywords = {
     "character": TokenType.KEYWORD_CHAR_SPECIFIER,
     "whitespace": TokenType.KEYWORD_WHITESPACE_SPECIFIER,
     "number": TokenType.KEYWORD_NUMBER_SPECIFIER,
+    "words": TokenType.KEYWODE_WORD_SPECIFIER,
+    "digits": TokenType.KEYWORD_DIGIT_SPECIFIER,
+    "characters": TokenType.KEYWORD_CHAR_SPECIFIER,
+    "whitespaces": TokenType.KEYWORD_WHITESPACE_SPECIFIER,
+    "numbers": TokenType.KEYWORD_NUMBER_SPECIFIER,
     "multiple": TokenType.KEYWORD_MULTIPLE,
     "as": TokenType.KEYWORD_AS,
     "if": TokenType.KEYWORD_IF,
+    "start": TokenType.KEYWORD_STARTS,
     "starts": TokenType.KEYWORD_STARTS,
     "with": TokenType.KEYWORD_WITH,
     "ends": TokenType.KEYWORD_ENDS,
@@ -39,8 +49,6 @@ const keywords = {
     "between": TokenType.KEYWORD_BETWEEN,
     "tab": TokenType.KEYWORD_TAB,
     "linefeed": TokenType.KEYWORD_LINEFEED,
-    "carriage": TokenType.KEYWORD_CARRIAGE,
-    "return": TokenType.KEYWORD_RETURN,
     "group": TokenType.KEYWORD_GROUP,
     "by": TokenType.KEYWORD_BY,
     "an": TokenType.KEYWORD_ARTICLE,
@@ -52,7 +60,58 @@ const keywords = {
     "exclusive": TokenType.KEYWORD_EXCLUSIVE,
     "exclusively": TokenType.KEYWORD_EXCLUSIVE,
     "from": TokenType.KEYWORD_FROM,
-    "to": TokenType.KEYWORD_TO
+    "to": TokenType.KEYWORD_TO,
+    "create": TokenType.KEYWORD_CREATE,
+    "creates": TokenType.KEYWORD_CREATE,
+    "called": TokenType.KEYWORD_CALLED,
+    "repeat": TokenType.KEYWORD_REPEAT,
+    "repeats": TokenType.KEYWORD_REPEAT,
+    "newline": TokenType.KEYWORD_NEWLINE,
+
+    /* Partial keywords */
+    "thing": TokenType.PARTIAL_KEYWORD,
+    "things": TokenType.PARTIAL_KEYWORD,
+    "white": TokenType.PARTIAL_KEYWORD,
+    "space": TokenType.PARTIAL_KEYWORD,
+    "spaces": TokenType.PARTIAL_KEYWORD,
+    "other": TokenType.PARTIAL_KEYWORD,
+    "wise": TokenType.PARTIAL_KEYWORD,
+    "multi": TokenType.PARTIAL_KEYWORD,
+    "new": TokenType.PARTIAL_KEYWORD,
+    "line": TokenType.PARTIAL_KEYWORD,
+    "feed": TokenType.PARTIAL_KEYWORD,
+    "carriage": TokenType.PARTIAL_KEYWORD,
+    "return": TokenType.PARTIAL_KEYWORD,
+};
+
+const numbers = {
+    "zero": "0",
+    "one": "1",
+    "two": "2",
+    "three": "3",
+    "four": "4",
+    "five": "5",
+    "six": "6",
+    "seven": "7",
+    "eight": "8",
+    "nine": "9",
+    "ten": "10"
+}
+
+interface token_transformation {
+    [key: string]: { preceeding_token: string, transforms_to: TokenType }[]
+}
+
+const token_transformations : token_transformation = {
+    "thing": [{ preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY }],
+    "things": [{ preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY }],
+    "space": [{ preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER }],
+    "spaces": [{ preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER }],
+    "wise": [{ preceeding_token: "other", transforms_to: TokenType.KEYWORD_ELSE }],
+    "line": [{ preceeding_token: "multi", transforms_to: TokenType.KEYWORD_MULTILINE },
+             { preceeding_token: "new", transforms_to: TokenType.KEYWORD_NEWLINE }],
+    "feed": [{ preceeding_token: "line", transforms_to: TokenType.KEYWORD_LINEFEED }],
+    "return": [{ preceeding_token: "carriage", transforms_to: TokenType.KEYWORD_CARRIAGE_RETURN }],
 };
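These tables drive a two-pass design: multi-word keywords such as "carriage return" or "white space" are first lexed as individual PARTIAL_KEYWORD tokens, then merged by transform_tokens further down. A minimal sketch of the lookup, where resolve_partial is a hypothetical helper (not in the commit) that mirrors what the merge loop does:

```typescript
// Hypothetical helper: resolve a partial keyword from the text of the
// token immediately before it, using token_transformations above.
function resolve_partial(prev: string, current: string): TokenType | null {
    const candidates = token_transformations[current];
    if (!candidates) {
        return null;
    }
    for (const t of candidates) {
        if (t.preceeding_token === prev) {
            return t.transforms_to;
        }
    }
    return null;
}
// resolve_partial("multi", "line") -> TokenType.KEYWORD_MULTILINE
// resolve_partial("new", "line")   -> TokenType.KEYWORD_NEWLINE
// resolve_partial("match", "line") -> null ("line" stays a PARTIAL_KEYWORD
//                                          and is reported as an error later)
```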
 const escape_sequences = {

@@ -68,10 +127,6 @@ const escape_sequences = {
     "\\": "\\",
 };
-
-export class TokenizerOptions {
-    public convert_spaces_to_tabs: boolean = true;
-}

 const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g);

 function escape_sequence_gather_hex(input: string, i : number, max: number) : string {
@@ -120,31 +175,73 @@ function escape_sequence_mapper(input: string, i : number) : { code: string, rea
     }
 }

-const test_chars = "09azAZ";
+const test_char_0 = "0".charCodeAt(0);
+const test_char_9 = "9".charCodeAt(0);
+const test_char_a = "a".charCodeAt(0);
+const test_char_z = "z".charCodeAt(0);
+const test_char_A = "A".charCodeAt(0);
+const test_char_Z = "Z".charCodeAt(0);

 function is_digit(input: string, i: number) : boolean {
-    //return /[0-9]/g.test(input);
     const value = input.charCodeAt(i);
-    return value >= test_chars.charCodeAt(0) && value <= test_chars.charCodeAt(1);
+    return value >= test_char_0 && value <= test_char_9;
 }

 function is_char(input: string, i: number) : boolean {
-    //return input.toUpperCase() != input.toLowerCase();
-    //return /[a-zA-Z]/g.test(input);
     const value = input.charCodeAt(i);
-    return ((value >= test_chars.charCodeAt(2) && value <= test_chars.charCodeAt(3)) ||
-            (value >= test_chars.charCodeAt(4) && value <= test_chars.charCodeAt(5)));
+    return ((value >= test_char_a && value <= test_char_z) ||
+            (value >= test_char_A && value <= test_char_Z));
 }
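The classification helpers now compare against char codes precomputed once at module load instead of calling charCodeAt on the old test_chars string for every character. For comparison, a sketch of the regex-based equivalents (the commented-out variants the commit removes):

```typescript
// For comparison only: regex equivalents of is_digit/is_char. The charCode
// versions above avoid running a regex for every character of the input.
const is_digit_regex = (input: string, i: number): boolean => /[0-9]/.test(input[i]);
const is_char_regex = (input: string, i: number): boolean => /[a-zA-Z]/.test(input[i]);

// is_digit("a1", 1) === is_digit_regex("a1", 1)  // both true
// is_char("a1", 1)  === is_char_regex("a1", 1)   // both false
```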
+
+function transform_tokens(tokens: Token[], errors: TokenError[]) : void {
+    for(let i = 0; i < tokens.length; i++) {
+        //check past tokens: if it matches the preceeding tokens, we transform it.
+        if(tokens[i].type === TokenType.PARTIAL_KEYWORD && token_transformations[tokens[i].token_string as string]) {
+            const transform = token_transformations[tokens[i].token_string as string];
+            for(let j = 0; j < transform.length; j++) {
+                if(i-1 >= 0 && transform[j].preceeding_token === tokens[i-1].token_string) {
+                    // use the i-1 token because it has the start line and position
+                    tokens[i-1].type = transform[j].transforms_to;
+                    (tokens[i-1].token_string as string) += " " + tokens[i].token_string as string;
+                    tokens.splice(i, 1); // remove this token
+                    i--; // move token counter back because we removed the token
+                    break;
+                }
+            }
+        }
+        /* else ignore */
+    }
+
+    // do we still have partial tokens? those are errors then
+    for(let i = 0; i < tokens.length; i++) {
+        if(tokens[i].type === TokenType.PARTIAL_KEYWORD) {
+            errors.push(new TokenError(`Unknown keyword "${tokens[i].token_string}"`, tokens[i].line, tokens[i].position));
+        }
+    }
+}
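A small usage sketch of transform_tokens (Token fields per the constructor in the tokens module; the positions are made up for illustration):

```typescript
// Sketch: "carriage" and "return" lex as two PARTIAL_KEYWORD tokens, then
// transform_tokens merges them into one KEYWORD_CARRIAGE_RETURN token.
const toks = [
    new Token(TokenType.PARTIAL_KEYWORD, 1, 1, "carriage"),
    new Token(TokenType.PARTIAL_KEYWORD, 1, 10, "return"),
];
const errs: TokenError[] = [];
transform_tokens(toks, errs);
// toks is now a single token: KEYWORD_CARRIAGE_RETURN at line 1, position 1,
// with token_string "carriage return"; errs stays empty.
```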
+
+export class TokenizerOptions {
+    public convert_spaces_to_tabs: boolean = true;
+}
+
+export interface TokenizeResult {
+    tokens: Token[],
+    errors: TokenError[]
+}

 /* Basic Tokenizer */
-export function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: TokenError[] } {
+export function tokenize(input: string, options: TokenizerOptions) : TokenizeResult {
     let line = 1;
     let position = 1;

     const tokens : Token[] = [];
     const errors : TokenError[] = [];

+    // gather tokens
     for(let i = 0; i < input.length; i++, position++) {
         // 4 spaces = 1 tab. That is final. Debate over
         if(options.convert_spaces_to_tabs && input.startsWith("    ", i)) {
@@ -276,6 +373,8 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
             break;
         case "\n":
             tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
+            line++;
+            position = 0;
             break;
         case "\r":
             // ignore
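Resetting position to 0 rather than 1 is deliberate: the loop header `i++, position++` runs after the "\n" iteration, so the first character of the next line lands on column 1. A quick trace sketch, assuming a bare two-statement input:

```typescript
// Sketch: column tracking across a newline with the fix above.
const { tokens } = tokenize("1\n2", new TokenizerOptions());
// Expected:
//   NUMBER "1"        at line 1, position 1
//   END_OF_STATEMENT  at line 1, position 2
//   NUMBER "2"        at line 2, position 1
//   (before this commit, neither line nor position was updated here)
```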
@@ -284,20 +383,25 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
             tokens.push(new Token(TokenType.INDENT, line, position));
             break;
         case " ":
+            // ignore
             break;
         default:
             // is digit? build up a number
             if(is_digit(input, i)) {
+                const digit_begin = position;
                 let digits = input[i];

                 for(; i+1 < input.length && is_digit(input, i+1); i++, position++) {
                     digits += input[i+1];
                 }

-                tokens.push(new Token(TokenType.NUMBER, line, position, digits));
+                tokens.push(new Token(TokenType.NUMBER, line, digit_begin, digits));
             }
             // is char? build up a word
             else if(is_char(input, i)) {
+                const word_begin = position;
                 let text = input[i];

                 for(; i+1 < input.length && is_char(input, i+1); i++, position++) {
@@ -306,49 +410,16 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
                 const keyword_text = text.toLowerCase();

+                // keyword (ex. "match")
                 if(keywords[keyword_text]) {
-                    tokens.push(new Token(keywords[keyword_text], line, position));
+                    tokens.push(new Token(keywords[keyword_text], line, word_begin, keyword_text));
                 }
+                // text number (ex. "one")
+                else if(numbers[keyword_text]) {
+                    tokens.push(new Token(TokenType.NUMBER, line, word_begin, keyword_text));
+                }
                 else {
-                    switch(keyword_text) {
-                        case "none":
-                        case "zero":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "0"));
-                            break;
-                        case "one":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "1"));
-                            break;
-                        case "two":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "2"));
-                            break;
-                        case "three":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "3"));
-                            break;
-                        case "four":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "4"));
-                            break;
-                        case "five":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "5"));
-                            break;
-                        case "six":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "6"));
-                            break;
-                        case "seven":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "7"));
-                            break;
-                        case "eight":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "8"));
-                            break;
-                        case "nine":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "9"));
-                            break;
-                        case "ten":
-                            tokens.push(new Token(TokenType.NUMBER, line, position, "10"));
-                            break;
-                        default:
-                            errors.push(new TokenError(`Unknown keyword ${text}`, line, position));
-                            break;
-                    }
+                    errors.push(new TokenError(`Unknown keyword "${text}"`, line, word_begin));
                 }
             }
             else {
@@ -359,5 +430,8 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
         }
     }

+    // transform tokens
+    transform_tokens(tokens, errors);
+
     return { tokens: tokens, errors: errors };
 }
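End to end, the exported surface is now tokenize plus the named TokenizeResult. A final usage sketch, assuming TokenError exposes its line and position as public fields (per the constructor pattern used throughout):

```typescript
// Sketch: partial keywords flow through the whole pipeline.
const result = tokenize("match 1+ white space", new TokenizerOptions());
// "white" and "space" lex as PARTIAL_KEYWORDs; transform_tokens then merges
// them into a single KEYWORD_WHITESPACE_SPECIFIER ("white space").
for (const err of result.errors) {
    console.log(`${err.line}:${err.position} ${err.message}`);
}
```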
src/tokens.ts
@@ -4,6 +4,7 @@ export enum TokenType {
     BETWEEN,
     QUOTE,
     NUMBER,
+    PARTIAL_KEYWORD,
     KEYWORD_BETWEEN,
     KEYWORD_OPTIONAL,
     KEYWORD_MATCH,
@@ -35,8 +36,7 @@ export enum TokenType {
     KEYWORD_NOT,
     KEYWORD_TAB,
     KEYWORD_LINEFEED,
-    KEYWORD_CARRIAGE,
-    KEYWORD_RETURN,
+    KEYWORD_CARRIAGE_RETURN,
     KEYWORD_GROUP,
     KEYWORD_BY,
     KEYWORD_ARTICLE,
@@ -44,7 +44,11 @@ export enum TokenType {
     KEYWORD_INCLUSIVE,
     KEYWORD_EXCLUSIVE,
     KEYWORD_FROM,
-    KEYWORD_TO
+    KEYWORD_TO,
+    KEYWORD_CREATE,
+    KEYWORD_CALLED,
+    KEYWORD_REPEAT,
+    KEYWORD_NEWLINE
 }

 export class TokenError extends Error {
@@ -58,6 +62,7 @@ export class TokenError extends Error {
 }

 export class Token {
+    /* TODO: end line and position? */
     constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
         /* nothing required */
     }