1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

Refactored code and made tokenizer

TODO: webpack config?
This commit is contained in:
Patrick Demian 2020-10-10 04:09:13 -04:00
parent 0a4f65b1a8
commit 40ca670a2a
6 changed files with 438 additions and 317 deletions

11
src/parser.ts Normal file
View File

@ -0,0 +1,11 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
import { Token, TokenType } from "./tokens";
/**
 * Options accepted by the parser. The class currently declares no settings.
 */
export class ParserOptions {
}
/**
 * Parser entry point. Currently a stub: `tokens` is not inspected and the
 * function always returns `undefined`.
 *
 * @param tokens tokens to parse
 * @returns always `undefined`
 */
export function parse(tokens: Token[]) {
// placeholder result; no parsing is performed here
return undefined;
}

View File

@ -1,152 +0,0 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
"use strict";
// Words treated as keywords of the language. In this file the list is only
// touched by the placeholder `keywords.includes("word")` call in the default
// branch of `tokenize`.
const keywords = [
"optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character",
"multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything",
"whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more",
"using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return",
"group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to"
];
// Token kinds. Each assignment below registers the mapping in both
// directions: name -> number and number -> name.
var TokenType;
(function (TokenType) {
TokenType[TokenType["END_OF_STATEMENT"] = 0] = "END_OF_STATEMENT";
TokenType[TokenType["INDENT"] = 1] = "INDENT";
TokenType[TokenType["BETWEEN"] = 2] = "BETWEEN";
TokenType[TokenType["QUOTE"] = 3] = "QUOTE";
TokenType[TokenType["KEYWORD_BETWEEN"] = 4] = "KEYWORD_BETWEEN";
TokenType[TokenType["KEYWORD_OPTIONAL"] = 5] = "KEYWORD_OPTIONAL";
TokenType[TokenType["KEYWORD_MATCH"] = 6] = "KEYWORD_MATCH";
TokenType[TokenType["KEYWORD_THEN"] = 7] = "KEYWORD_THEN";
TokenType[TokenType["KEYWORD_AND"] = 8] = "KEYWORD_AND";
TokenType[TokenType["KEYWORD_OR"] = 9] = "KEYWORD_OR";
TokenType[TokenType["KEYWORD_ANY"] = 10] = "KEYWORD_ANY";
TokenType[TokenType["KEYWORD_OF"] = 11] = "KEYWORD_OF";
// reuse an existing TokenType object if there is one, otherwise start empty
})(TokenType || (TokenType = {}));
// A single token: its kind plus, optionally, the text it carries.
class Token {
// type: a TokenType value; token_string: associated text (omitted by most
// call sites in this file, only QUOTE tokens pass it)
constructor(type, token_string) {
this.type = type;
this.token_string = token_string;
}
}
// Settings read by `tokenize`.
class TokenizerOptions {
constructor() {
// when true, `tokenize` also emits INDENT tokens for spaces (off by default)
this.convert_spaces_to_tabs = false;
}
}
/**
 * Basic tokenizer: to be replaced with a unicode variant later.
 *
 * Walks `input` one character at a time and emits INDENT, BETWEEN,
 * END_OF_STATEMENT and QUOTE tokens. Words and numbers are not tokenized
 * yet (the default branch is a placeholder) and nothing is ever pushed
 * onto `errors`.
 *
 * @param input text to tokenize
 * @param options tokenizer settings; only `convert_spaces_to_tabs` is read
 * @returns an object holding the collected `tokens` and the `errors` array
 */
function tokenize(input, options) {
let tokens = [];
let errors = [];
for (let i = 0; i < input.length; i++) {
// 4 spaces = 1 tab. That is final. Debate over
// NOTE(review): the literal tested below is a single space as written, yet
// three further characters are skipped; presumably it was meant to be four
// spaces - confirm.
if (options.convert_spaces_to_tabs && input.startsWith(" ", i)) {
tokens.push(new Token(TokenType.INDENT));
i += 3;
}
// between (ex: 0...3 or 0-3)
else if (input.startsWith("...", i)) {
tokens.push(new Token(TokenType.BETWEEN));
// skip the remaining two dots; the loop increment skips the first
i += 2;
}
else if (input.startsWith("..", i)) {
tokens.push(new Token(TokenType.BETWEEN));
i += 1;
}
// comments
else if (input.startsWith("//", i)) {
i += 1;
// consume up to the newline, which ends the statement
while (i < input.length) {
if (input[i] == '\n') {
tokens.push(new Token(TokenType.END_OF_STATEMENT));
break;
}
i++;
}
}
// Windows line ending: one END_OF_STATEMENT for the pair
else if (input.startsWith("\r\n", i)) {
tokens.push(new Token(TokenType.END_OF_STATEMENT));
i += 1;
}
else {
switch (input[i]) {
// comment
case '#':
i++;
while (i < input.length) {
if (input[i] == '\n') {
tokens.push(new Token(TokenType.END_OF_STATEMENT));
break;
}
i++;
}
break;
// quote
// NOTE(review): both labels below denote the same double-quote character.
case '"':
case '\"':
// build up a word between quotes
const quote_char = input[i];
let found_ending = false;
// NOTE(review): `quote` is never appended to, so a QUOTE token always
// carries the empty string.
let quote = "";
do {
i++;
if (input[i] == quote_char) {
found_ending = true;
break;
}
else if (input[i] == '\n') {
// newline inside a quote: no handling yet
}
} while (i < input.length);
if (found_ending) {
tokens.push(new Token(TokenType.QUOTE, quote));
}
else {
// Skip until newline and throw an error
// (not implemented: an unterminated quote is silently dropped)
}
break;
// between (ex: 0...3 or 0-3)
case '-':
tokens.push(new Token(TokenType.BETWEEN));
break;
case '\n':
tokens.push(new Token(TokenType.END_OF_STATEMENT));
break;
case '\r':
// ignore
break;
case '\t':
tokens.push(new Token(TokenType.INDENT));
break;
case ' ':
// a lone space produces no token
break;
default:
// is digit? build up a number
// is char? build up a word
// placeholder: the result of this call is discarded
keywords.includes("word");
// build up a word
break;
}
}
}
return { tokens: tokens, errors: errors };
}
/*
String.prototype.escape = function() {
var tagsToReplace = {
'&': '&amp;',
'<': '&lt;',
'>': '&gt;'
};
return this.replace(/[&<>]/g, function(tag) {
return tagsToReplace[tag] || tag;
});
};
String.prototype.norm = function() {
if(String.prototype.normalize != undefined) {
return this.normalize("NFD").replace(/[\u0300-\u036F]/g,"");
}
return this;
};
*/
// Registers a callback with `$` (presumably jQuery's document-ready
// shorthand - confirm); the callback body is currently empty.
$(function () {
});

View File

@ -1,166 +1,8 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
"use strict";
// Words treated as keywords of the language. In this file the list is only
// touched by the placeholder `keywords.includes("word")` call in the default
// branch of `tokenize`.
const keywords = [
"optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character",
"multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything",
"whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more",
"using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return",
"group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to"
];
/**
 * Kinds of token produced by `tokenize`. Members carry no explicit values,
 * so their numeric values follow declaration order starting at 0.
 */
enum TokenType {
// structural tokens
END_OF_STATEMENT,
INDENT,
BETWEEN,
QUOTE,
// keyword tokens
KEYWORD_BETWEEN,
KEYWORD_OPTIONAL,
KEYWORD_MATCH,
KEYWORD_THEN,
KEYWORD_AND,
KEYWORD_OR,
KEYWORD_ANY,
KEYWORD_OF,
}
/**
 * A single token: its kind plus, optionally, the text it carries.
 */
class Token {
    /** Kind of token. */
    public type: TokenType;
    /** Text associated with the token, when there is any. */
    public token_string?: string;

    /**
     * @param type kind of token
     * @param token_string text associated with the token, if any
     */
    constructor(type: TokenType, token_string?: string) {
        this.type = type;
        this.token_string = token_string;
    }
}
/**
 * Settings read by `tokenize`.
 */
class TokenizerOptions {
    /** When true, `tokenize` also emits INDENT tokens for spaces. */
    public convert_spaces_to_tabs: boolean;

    constructor() {
        // space-to-indent conversion is off unless the caller enables it
        this.convert_spaces_to_tabs = false;
    }
}
/**
 * Basic tokenizer: to be replaced with a unicode variant later.
 *
 * Walks `input` one character at a time and emits INDENT, BETWEEN,
 * END_OF_STATEMENT and QUOTE tokens. Words and numbers are not tokenized
 * yet (the default branch is a placeholder) and nothing is ever pushed
 * onto `errors`.
 *
 * @param input text to tokenize
 * @param options tokenizer settings; only `convert_spaces_to_tabs` is read
 * @returns an object holding the collected `tokens` and the `errors` array
 */
function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: Error[] } {
let tokens : Token[] = [];
let errors : Error[] = [];
for(let i = 0; i < input.length; i++) {
// 4 spaces = 1 tab. That is final. Debate over
// NOTE(review): the literal tested below is a single space as written, yet
// three further characters are skipped; presumably it was meant to be four
// spaces - confirm.
if(options.convert_spaces_to_tabs && input.startsWith(" ", i)) {
tokens.push(new Token(TokenType.INDENT));
i += 3;
}
// between (ex: 0...3 or 0-3)
else if(input.startsWith("...", i)) {
tokens.push(new Token(TokenType.BETWEEN));
// skip the remaining two dots; the loop increment skips the first
i += 2;
} else if(input.startsWith("..", i)) {
tokens.push(new Token(TokenType.BETWEEN));
i += 1;
}
// comments
else if(input.startsWith("//", i)) {
i += 1;
// consume up to the newline, which ends the statement
while(i < input.length) {
if(input[i] == '\n') {
tokens.push(new Token(TokenType.END_OF_STATEMENT));
break;
}
i++;
}
// Windows line ending: one END_OF_STATEMENT for the pair
} else if (input.startsWith("\r\n", i)) {
tokens.push(new Token(TokenType.END_OF_STATEMENT));
i += 1;
} else {
switch(input[i]) {
// comment
case '#':
i++;
while(i < input.length) {
if(input[i] == '\n') {
tokens.push(new Token(TokenType.END_OF_STATEMENT));
break;
}
i++;
}
break;
// quote
// NOTE(review): both labels below denote the same double-quote character.
case '"':
case '\"':
// build up a word between quotes
const quote_char = input[i];
let found_ending = false;
// NOTE(review): `quote` is never appended to, so a QUOTE token always
// carries the empty string.
let quote = "";
do {
i++;
if(input[i] == quote_char) {
found_ending = true;
break;
}
else if(input[i] == '\n') {
// newline inside a quote: no handling yet
}
} while(i < input.length);
if(found_ending) {
tokens.push(new Token(TokenType.QUOTE, quote));
}
else {
// Skip until newline and throw an error
// (not implemented: an unterminated quote is silently dropped)
}
break;
// between (ex: 0...3 or 0-3)
case '-':
tokens.push(new Token(TokenType.BETWEEN));
break;
case '\n':
tokens.push(new Token(TokenType.END_OF_STATEMENT));
break;
case '\r':
// ignore
break;
case '\t':
tokens.push(new Token(TokenType.INDENT));
break;
case ' ':
// a lone space produces no token
break;
default:
// is digit? build up a number
// is char? build up a word
// placeholder: the result of this call is discarded
keywords.includes("word");
// build up a word
break;
}
}
}
return { tokens: tokens, errors: errors };
}
/*
String.prototype.escape = function() {
var tagsToReplace = {
'&': '&amp;',
'<': '&lt;',
'>': '&gt;'
};
return this.replace(/[&<>]/g, function(tag) {
return tagsToReplace[tag] || tag;
});
};
String.prototype.norm = function() {
if(String.prototype.normalize != undefined) {
return this.normalize("NFD").replace(/[\u0300-\u036F]/g,"");
}
return this;
};
*/
import { Token, TokenType } from "./tokens";
import { TokenizerOptions, tokenize } from "./tokenizer";
import { ParserOptions, parse } from "./parser";
$(function() {

View File

@ -263,8 +263,8 @@ footer {
}
/* accessibility */
a {
color: #00497A;
a:hover {
color: #208bff;
}
.navbar-light .navbar-nav .nav-link {

356
src/tokenizer.ts Normal file
View File

@ -0,0 +1,356 @@
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
// TODO: replace every version of switch(<some string>) with switch(<some string>.charCodeAt(0))
import { Token, TokenType, TokenError } from "./tokens";
/**
 * Maps each keyword, in lower case, to the token type `tokenize` emits for
 * it. Several spellings deliberately share one type (for example
 * "optional"/"optionally" and "a"/"an"/"the"). Spelled-out numbers are not
 * listed here; `tokenize` handles them separately and emits NUMBER tokens.
 */
const keywords = {
"optional": TokenType.KEYWORD_OPTIONAL,
"optionally": TokenType.KEYWORD_OPTIONAL,
"match": TokenType.KEYWORD_MATCH,
"then": TokenType.KEYWORD_THEN,
"any": TokenType.KEYWORD_ANY,
"anything": TokenType.KEYWORD_ANY,
"of": TokenType.KEYWORD_OF,
"or": TokenType.KEYWORD_OR,
"and": TokenType.KEYWORD_AND,
// NOTE(review): "KEYWODE" matches the enum member's spelling in tokens.ts;
// it looks like a typo for "KEYWORD" and would have to be renamed in both places.
"word": TokenType.KEYWODE_WORD_SPECIFIER,
"digit": TokenType.KEYWORD_DIGIT_SPECIFIER,
"character": TokenType.KEYWORD_CHAR_SPECIFIER,
"whitespace": TokenType.KEYWORD_WHITESPACE_SPECIFIER,
"number": TokenType.KEYWORD_NUMBER_SPECIFIER,
"multiple": TokenType.KEYWORD_MULTIPLE,
"as": TokenType.KEYWORD_AS,
"if": TokenType.KEYWORD_IF,
"starts": TokenType.KEYWORD_STARTS,
"with": TokenType.KEYWORD_WITH,
"ends": TokenType.KEYWORD_ENDS,
"otherwise": TokenType.KEYWORD_ELSE,
"else": TokenType.KEYWORD_ELSE,
"unless": TokenType.KEYWORD_UNLESS,
"while": TokenType.KEYWORD_WHILE,
"more": TokenType.KEYWORD_MORE,
"using": TokenType.KEYWORD_USING,
"global": TokenType.KEYWORD_GLOBAL,
"multiline": TokenType.KEYWORD_MULTILINE,
"exact": TokenType.KEYWORD_EXACT,
"matching": TokenType.KEYWORD_MATCHING,
"not": TokenType.KEYWORD_NOT,
"between": TokenType.KEYWORD_BETWEEN,
"tab": TokenType.KEYWORD_TAB,
"linefeed": TokenType.KEYWORD_LINEFEED,
"carriage": TokenType.KEYWORD_CARRIAGE,
"return": TokenType.KEYWORD_RETURN,
"group": TokenType.KEYWORD_GROUP,
"by": TokenType.KEYWORD_BY,
// articles all collapse to a single token type
"an": TokenType.KEYWORD_ARTICLE,
"a": TokenType.KEYWORD_ARTICLE,
"the": TokenType.KEYWORD_ARTICLE,
"exactly": TokenType.KEYWORD_EXACTLY,
"inclusive": TokenType.KEYWORD_INCLUSIVE,
"inclusively": TokenType.KEYWORD_INCLUSIVE,
"exclusive": TokenType.KEYWORD_EXCLUSIVE,
"exclusively": TokenType.KEYWORD_EXCLUSIVE,
"from": TokenType.KEYWORD_FROM,
"to": TokenType.KEYWORD_TO
};
/**
 * Single-character escapes accepted after a backslash inside a quoted
 * string, mapped to the character each one stands for.
 */
const escape_sequences: { [key: string]: string } = {
    'a': '\x07', // bell; JavaScript has no '\a' escape, so spell out the code
    'b': '\b',
    'e': '\x1b', // escape; JavaScript has no '\e' escape, so spell out the code
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
    '"': '"',
    '\'': '\'',
    '\\': '\\',
};

/**
 * Settings that control how `tokenize` treats its input.
 */
export class TokenizerOptions {
    /** When true, `tokenize` also emits INDENT tokens for spaces ("4 spaces = 1 tab"). */
    public convert_spaces_to_tabs: boolean = false;
}

// Anchored and deliberately NOT global: a global regex keeps `lastIndex`
// between `test()` calls, which makes it reject every other character.
const escape_sequence_hex_regex = /^[0-9A-Fa-f]$/;

/**
 * Collects the run of hexadecimal digits that follows position `i`.
 *
 * @param input text being scanned
 * @param i index of the character immediately before the first digit
 * @param max maximum number of digits to collect
 * @returns the digits found; shorter than `max` (possibly empty) when a
 *          non-hex character or the end of the input is reached first
 */
function escape_sequence_gather_hex(input: string, i : number, max: number) : string {
    let hex = "";

    for(i++; i < input.length && hex.length < max && escape_sequence_hex_regex.test(input[i]); i++) {
        hex += input[i];
    }

    return hex;
}

/**
 * Decodes the escape sequence whose first character (the one right after
 * the backslash) sits at `input[i]`.
 *
 * Supported forms: the single-character escapes in `escape_sequences`,
 * `xH` to `xHHHH` (one to four hex digits), `uHHHH` and `UHHHHHHHH`.
 * Any other character is passed through unchanged.
 *
 * @param input text being scanned
 * @param i index of the character following the backslash
 * @returns `code`: the decoded text (empty on error); `read`: how many
 *          characters starting at `i` belong to the sequence; `error`: present
 *          only when the sequence is malformed
 */
function escape_sequence_mapper(input: string, i : number) : { code: string, read: number, error?: Error } {
    const escape_char = input[i];

    // backslash was the last character of the input: nothing to decode
    if(escape_char === undefined) {
        return { code: "", read: 0, error: new Error("Bad escape sequence") };
    }

    const simple = escape_sequences[escape_char];
    if(simple !== undefined) {
        return { code: simple, read: 1 };
    }

    switch(escape_char) {
        // variable hex code
        case 'x': {
            const hex = escape_sequence_gather_hex(input, i, 4);

            // without this guard parseInt("") is NaN, which would decode to NUL
            if(hex.length == 0) {
                return { code: "", read: 1, error: new Error("Bad escape sequence") };
            }
            return { code: String.fromCharCode(parseInt(hex, 16)), read: hex.length + 1 };
        }
        // 4 hex unicode
        case 'u': {
            const unicode = escape_sequence_gather_hex(input, i, 4);

            if(unicode.length != 4) {
                return { code: "", read: unicode.length + 1, error: new Error("Bad escape sequence") };
            }
            return { code: String.fromCharCode(parseInt(unicode, 16)), read: 5 };
        }
        // 8 hex unicode
        case 'U': {
            const unicode = escape_sequence_gather_hex(input, i, 8);

            if(unicode.length != 8) {
                return { code: "", read: unicode.length + 1, error: new Error("Bad escape sequence") };
            }

            const code_point = parseInt(unicode, 16);
            // fromCodePoint throws a RangeError beyond the Unicode range
            if(code_point > 0x10FFFF) {
                return { code: "", read: 9, error: new Error("Bad escape sequence") };
            }
            // fromCharCode would truncate anything above 0xFFFF
            return { code: String.fromCodePoint(code_point), read: 9 };
        }
        default:
            // should throw an exception, but gonna just ignore it
            return { code: escape_char, read: 1 };
    }
}
/**
 * Tells whether the first character of `input` is an ASCII digit (0-9).
 *
 * @param input string whose first character is examined
 * @returns true for '0' through '9'; false otherwise, including for ""
 */
function is_digit(input: string) : boolean {
    const lowest = "0".charCodeAt(0);
    const highest = "9".charCodeAt(0);
    const code = input.charCodeAt(0);

    // an empty string yields NaN, which fails both comparisons
    return lowest <= code && code <= highest;
}
/**
 * Tells whether the first character of `input` is an ASCII letter.
 *
 * @param input string whose first character is examined
 * @returns true for 'A'-'Z' and 'a'-'z'; false otherwise, including for ""
 */
function is_char(input: string) : boolean {
    const code = input.charCodeAt(0);

    const is_upper = code >= "A".charCodeAt(0) && code <= "Z".charCodeAt(0);
    if(is_upper) {
        return true;
    }

    // an empty string yields NaN, which fails both comparisons
    return code >= "a".charCodeAt(0) && code <= "z".charCodeAt(0);
}
/** Spelled-out numbers accepted in place of digits, mapped to their numeric text. */
const number_words: { [word: string]: string } = {
    "none": "0",
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10",
};

/**
 * Basic tokenizer.
 *
 * Splits `input` into tokens, tracking a 1-based line and position for each
 * token and error. Problems never throw: they are collected in `errors` and
 * scanning continues.
 *
 * Recognised input:
 *  - tabs (and four-space runs when `options.convert_spaces_to_tabs` is set) -> INDENT
 *  - `...`, `..` and `-` -> BETWEEN
 *  - `//` and `#` line comments (the terminating newline yields END_OF_STATEMENT)
 *  - block comments (closing one yields END_OF_STATEMENT)
 *  - `\n` and `\r\n` -> END_OF_STATEMENT
 *  - double-quoted strings with backslash escapes -> QUOTE
 *  - digit runs and the words none/zero..ten -> NUMBER
 *  - keywords (matched case-insensitively) -> their keyword token
 *
 * Multi-character tokens (QUOTE, NUMBER, keywords) are reported at the
 * position of their last character.
 *
 * @param input text to tokenize
 * @param options tokenizer settings
 * @returns the tokens found and any errors encountered
 */
export function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: TokenError[] } {
    let line = 1;
    let position = 1;

    const tokens : Token[] = [];
    const errors : TokenError[] = [];

    for(let i = 0; i < input.length; i++, position++) {
        // 4 spaces = 1 tab. That is final. Debate over
        if(options.convert_spaces_to_tabs && input.startsWith("    ", i)) {
            tokens.push(new Token(TokenType.INDENT, line, position));
            // the loop increment consumes the fourth space
            i += 3;
            position += 3;
        }
        // between (ex: 0...3 or 0-3)
        else if(input.startsWith("...", i)) {
            tokens.push(new Token(TokenType.BETWEEN, line, position));
            i += 2;
            position += 2;
        }
        else if(input.startsWith("..", i)) {
            tokens.push(new Token(TokenType.BETWEEN, line, position));
            i++;
            position++;
        }
        // line comment: skip to the newline, which ends the statement
        else if(input.startsWith("//", i)) {
            for(i++, position++; i < input.length; i++, position++) {
                if(input[i] == '\n') {
                    tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                    line++;
                    position = 0;
                    break;
                }
            }
        }
        // block comment
        else if(input.startsWith("/*", i)) {
            let closed = false;

            // start after the opener so that "/*/" is not taken as a complete comment
            for(i += 2, position += 2; i < input.length - 1; i++, position++) {
                if(input[i] == '*' && input[i+1] == '/') {
                    tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                    i++;
                    position++;
                    closed = true;
                    break;
                }
                if(input[i] == '\n') {
                    line++;
                    position = 0;
                }
            }

            // an explicit flag, because a comment that closes on the very last
            // character ends at the same index as one that never closes
            if(!closed) {
                errors.push(new TokenError("Unexpected EOF", line, position));
            }
        }
        else if(input.startsWith("\r\n", i)) {
            tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
            i++;
            line++;
            position = 0;
        }
        else {
            switch(input[i]) {
                // comment
                case '#':
                    for(i++, position++; i < input.length; i++, position++) {
                        if(input[i] == '\n') {
                            tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                            line++;
                            position = 0;
                            break;
                        }
                    }
                    break;
                // quote
                case '"': {
                    // build up a word between quotes
                    const quote_begin = { line: line, position: position };
                    const quote_char = input[i];
                    let found_ending = false;
                    let quote = "";

                    for(i++, position++; i < input.length; i++, position++) {
                        if(input[i] == '\\') {
                            i++;
                            position++;
                            // a trailing backslash has nothing to escape
                            if(i >= input.length) {
                                break;
                            }

                            const sequence = escape_sequence_mapper(input, i);
                            if(sequence.error != undefined) {
                                errors.push(new TokenError(sequence.error.message, line, position));
                            }

                            // `read` counts from the current character, which the
                            // loop increment steps over as well
                            i += sequence.read - 1;
                            position += sequence.read - 1;
                            quote += sequence.code;
                        }
                        else if(input[i] == quote_char) {
                            found_ending = true;
                            break;
                        }
                        else if(input[i] == '\n') {
                            // quotes may not span lines
                            break;
                        }
                        else {
                            quote += input[i];
                        }
                    }

                    if(found_ending) {
                        tokens.push(new Token(TokenType.QUOTE, line, position, quote));
                    }
                    else {
                        // we reached the end of the line or the end of the file
                        errors.push(new TokenError(`Unexpected end of quote. Quote began at ${quote_begin.line}:${quote_begin.position}`, line, position));
                        // only a newline (not end of input) starts a new line
                        if(i < input.length) {
                            line++;
                            position = 0;
                        }
                    }
                    break;
                }
                // between (ex: 0...3 or 0-3)
                case '-':
                    tokens.push(new Token(TokenType.BETWEEN, line, position));
                    break;
                case '\n':
                    tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                    line++;
                    position = 0;
                    break;
                case '\r':
                    // ignore
                    break;
                case '\t':
                    tokens.push(new Token(TokenType.INDENT, line, position));
                    break;
                case ' ':
                    break;
                default:
                    // is digit? build up a number
                    if(is_digit(input[i])) {
                        let digits = input[i];

                        // look ahead before consuming, so the character after
                        // the number is left for the next iteration
                        while(i+1 < input.length && is_digit(input[i+1])) {
                            i++; position++;
                            digits += input[i];
                        }

                        tokens.push(new Token(TokenType.NUMBER, line, position, digits));
                    }
                    // is char? build up a word
                    else if(is_char(input[i])) {
                        let text = input[i];

                        while(i+1 < input.length && is_char(input[i+1])) {
                            i++; position++;
                            text += input[i];
                        }

                        const keyword_text = text.toLowerCase();

                        // own-property checks keep words such as "constructor"
                        // from matching members inherited from Object.prototype
                        if(Object.prototype.hasOwnProperty.call(keywords, keyword_text)) {
                            tokens.push(new Token(keywords[keyword_text], line, position));
                        }
                        else if(Object.prototype.hasOwnProperty.call(number_words, keyword_text)) {
                            tokens.push(new Token(TokenType.NUMBER, line, position, number_words[keyword_text]));
                        }
                        else {
                            errors.push(new TokenError(`Unknown keyword ${text}`, line, position));
                        }
                    }
                    else {
                        errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position));
                    }
                    break;
            }
        }
    }

    return { tokens: tokens, errors: errors };
}

64
src/tokens.ts Normal file
View File

@ -0,0 +1,64 @@
/**
 * Kinds of token produced by the tokenizer. Members carry no explicit
 * values, so their numeric values follow declaration order starting at 0.
 */
export enum TokenType {
// structural tokens
END_OF_STATEMENT,
INDENT,
BETWEEN,
QUOTE,
NUMBER,
// keyword tokens
KEYWORD_BETWEEN,
KEYWORD_OPTIONAL,
KEYWORD_MATCH,
KEYWORD_THEN,
KEYWORD_AND,
KEYWORD_OR,
KEYWORD_ANY,
KEYWORD_OF,
// NOTE(review): "KEYWODE" looks like a typo for "KEYWORD"; the keyword
// table in tokenizer.ts refers to this member by the same spelling, so
// both would need to change together.
KEYWODE_WORD_SPECIFIER,
KEYWORD_DIGIT_SPECIFIER,
KEYWORD_CHAR_SPECIFIER,
KEYWORD_WHITESPACE_SPECIFIER,
KEYWORD_NUMBER_SPECIFIER,
KEYWORD_MULTIPLE,
KEYWORD_AS,
KEYWORD_IF,
KEYWORD_STARTS,
KEYWORD_WITH,
KEYWORD_ENDS,
KEYWORD_ELSE,
KEYWORD_UNLESS,
KEYWORD_WHILE,
KEYWORD_MORE,
KEYWORD_USING,
KEYWORD_GLOBAL,
KEYWORD_MULTILINE,
KEYWORD_EXACT,
KEYWORD_MATCHING,
KEYWORD_NOT,
KEYWORD_TAB,
KEYWORD_LINEFEED,
KEYWORD_CARRIAGE,
KEYWORD_RETURN,
KEYWORD_GROUP,
KEYWORD_BY,
KEYWORD_ARTICLE,
KEYWORD_EXACTLY,
KEYWORD_INCLUSIVE,
KEYWORD_EXCLUSIVE,
KEYWORD_FROM,
KEYWORD_TO
}
/**
 * An error raised while tokenizing, tagged with the place in the input it
 * refers to.
 */
export class TokenError extends Error {
    /**
     * @param message description of the problem
     * @param line line the error is reported at
     * @param position position within that line
     */
    constructor(message: string, public line: number, public position: number) {
        super(message);

        // When compiled down to ES5, `super(...)` on a built-in returns a plain
        // Error, which would drop `to_string` and break `instanceof`; restoring
        // the prototype keeps the subclass intact on every target.
        Object.setPrototypeOf(this, new.target.prototype);
        // so stack traces and `toString()` identify the error kind
        this.name = "TokenError";
    }

    /**
     * Formats the error for display.
     *
     * @returns the text "<line>:<position> <message>"
     */
    public to_string() : string {
        return `${this.line}:${this.position} ${this.message}`;
    }
}
/**
 * A single token together with the place in the input it was reported at.
 */
export class Token {
    /** Kind of token. */
    public type: TokenType;
    /** Line the token was reported at. */
    public line: number;
    /** Position within that line. */
    public position: number;
    /** Text carried by the token, when there is any. */
    public token_string?: string;

    /**
     * @param type kind of token
     * @param line line the token was reported at
     * @param position position within that line
     * @param token_string text carried by the token, if any
     */
    constructor(type: TokenType, line: number, position: number, token_string?: string) {
        this.type = type;
        this.line = line;
        this.position = position;
        this.token_string = token_string;
    }
}