mirror of
https://github.com/pdemian/human2regex.git
synced 2025-05-16 12:30:09 -07:00
Refactored code and made tokenizer
TODO: webpack config?
This commit is contained in:
parent
0a4f65b1a8
commit
40ca670a2a
11
src/parser.ts
Normal file
11
src/parser.ts
Normal file
@ -0,0 +1,11 @@
|
||||
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
||||
|
||||
import { Token, TokenType } from "./tokens";
|
||||
|
||||
export class ParserOptions {
|
||||
|
||||
}
|
||||
|
||||
export function parse(tokens: Token[]) {
|
||||
return undefined;
|
||||
}
|
152
src/script.js
152
src/script.js
@ -1,152 +0,0 @@
|
||||
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
||||
"use strict";
|
||||
const keywords = [
|
||||
"optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character",
|
||||
"multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything",
|
||||
"whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more",
|
||||
"using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return",
|
||||
"group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to"
|
||||
];
|
||||
var TokenType;
|
||||
(function (TokenType) {
|
||||
TokenType[TokenType["END_OF_STATEMENT"] = 0] = "END_OF_STATEMENT";
|
||||
TokenType[TokenType["INDENT"] = 1] = "INDENT";
|
||||
TokenType[TokenType["BETWEEN"] = 2] = "BETWEEN";
|
||||
TokenType[TokenType["QUOTE"] = 3] = "QUOTE";
|
||||
TokenType[TokenType["KEYWORD_BETWEEN"] = 4] = "KEYWORD_BETWEEN";
|
||||
TokenType[TokenType["KEYWORD_OPTIONAL"] = 5] = "KEYWORD_OPTIONAL";
|
||||
TokenType[TokenType["KEYWORD_MATCH"] = 6] = "KEYWORD_MATCH";
|
||||
TokenType[TokenType["KEYWORD_THEN"] = 7] = "KEYWORD_THEN";
|
||||
TokenType[TokenType["KEYWORD_AND"] = 8] = "KEYWORD_AND";
|
||||
TokenType[TokenType["KEYWORD_OR"] = 9] = "KEYWORD_OR";
|
||||
TokenType[TokenType["KEYWORD_ANY"] = 10] = "KEYWORD_ANY";
|
||||
TokenType[TokenType["KEYWORD_OF"] = 11] = "KEYWORD_OF";
|
||||
})(TokenType || (TokenType = {}));
|
||||
class Token {
|
||||
constructor(type, token_string) {
|
||||
this.type = type;
|
||||
this.token_string = token_string;
|
||||
}
|
||||
}
|
||||
class TokenizerOptions {
|
||||
constructor() {
|
||||
this.convert_spaces_to_tabs = false;
|
||||
}
|
||||
}
|
||||
/* Basic Tokenizer: To be replaced with a unicode variant later */
|
||||
function tokenize(input, options) {
|
||||
let tokens = [];
|
||||
let errors = [];
|
||||
for (let i = 0; i < input.length; i++) {
|
||||
// 4 spaces = 1 tab. That is final. Debate over
|
||||
if (options.convert_spaces_to_tabs && input.startsWith(" ", i)) {
|
||||
tokens.push(new Token(TokenType.INDENT));
|
||||
i += 3;
|
||||
}
|
||||
// between (ex: 0...3 or 0-3)
|
||||
else if (input.startsWith("...", i)) {
|
||||
tokens.push(new Token(TokenType.BETWEEN));
|
||||
i += 2;
|
||||
}
|
||||
else if (input.startsWith("..", i)) {
|
||||
tokens.push(new Token(TokenType.BETWEEN));
|
||||
i += 1;
|
||||
}
|
||||
// comments
|
||||
else if (input.startsWith("//", i)) {
|
||||
i += 1;
|
||||
while (i < input.length) {
|
||||
if (input[i] == '\n') {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
}
|
||||
else if (input.startsWith("\r\n", i)) {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
i += 1;
|
||||
}
|
||||
else {
|
||||
switch (input[i]) {
|
||||
// comment
|
||||
case '#':
|
||||
i++;
|
||||
while (i < input.length) {
|
||||
if (input[i] == '\n') {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
// quote
|
||||
case '"':
|
||||
case '\"':
|
||||
// build up a word between quotes
|
||||
const quote_char = input[i];
|
||||
let found_ending = false;
|
||||
let quote = "";
|
||||
do {
|
||||
i++;
|
||||
if (input[i] == quote_char) {
|
||||
found_ending = true;
|
||||
break;
|
||||
}
|
||||
else if (input[i] == '\n') {
|
||||
}
|
||||
} while (i < input.length);
|
||||
if (found_ending) {
|
||||
tokens.push(new Token(TokenType.QUOTE, quote));
|
||||
}
|
||||
else {
|
||||
// Skip until newline and throw an error
|
||||
}
|
||||
break;
|
||||
// between (ex: 0...3 or 0-3)
|
||||
case '-':
|
||||
tokens.push(new Token(TokenType.BETWEEN));
|
||||
break;
|
||||
case '\n':
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
break;
|
||||
case '\r':
|
||||
// ignore
|
||||
break;
|
||||
case '\t':
|
||||
tokens.push(new Token(TokenType.INDENT));
|
||||
break;
|
||||
case ' ':
|
||||
break;
|
||||
default:
|
||||
// is digit? build up a number
|
||||
// is char? build up a word
|
||||
keywords.includes("word");
|
||||
// build up a word
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return { tokens: tokens, errors: errors };
|
||||
}
|
||||
/*
|
||||
String.prototype.escape = function() {
|
||||
var tagsToReplace = {
|
||||
'&': '&',
|
||||
'<': '<',
|
||||
'>': '>'
|
||||
};
|
||||
return this.replace(/[&<>]/g, function(tag) {
|
||||
return tagsToReplace[tag] || tag;
|
||||
});
|
||||
};
|
||||
String.prototype.norm = function() {
|
||||
if(String.prototype.normalize != undefined) {
|
||||
return this.normalize("NFD").replace(/[\u0300-\u036F]/g,"");
|
||||
}
|
||||
return this;
|
||||
};
|
||||
|
||||
*/
|
||||
// jQuery DOM-ready entry point — intentionally empty for now; UI wiring
// is still to come (tokenizer/parser are not hooked up to the page yet).
$(function () {
});
|
168
src/script.ts
168
src/script.ts
@ -1,167 +1,9 @@
|
||||
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
||||
|
||||
"use strict";
|
||||
|
||||
const keywords = [
|
||||
"optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character",
|
||||
"multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything",
|
||||
"whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more",
|
||||
"using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return",
|
||||
"group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to"
|
||||
];
|
||||
|
||||
enum TokenType {
|
||||
END_OF_STATEMENT,
|
||||
INDENT,
|
||||
BETWEEN,
|
||||
QUOTE,
|
||||
KEYWORD_BETWEEN,
|
||||
KEYWORD_OPTIONAL,
|
||||
KEYWORD_MATCH,
|
||||
KEYWORD_THEN,
|
||||
KEYWORD_AND,
|
||||
KEYWORD_OR,
|
||||
KEYWORD_ANY,
|
||||
KEYWORD_OF,
|
||||
}
|
||||
|
||||
class Token {
|
||||
constructor(public type: TokenType, public token_string?: string) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
class TokenizerOptions {
|
||||
public convert_spaces_to_tabs: boolean = false;
|
||||
|
||||
}
|
||||
|
||||
/* Basic Tokenizer: To be replaced with a unicode variant later */
|
||||
|
||||
function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: Error[] } {
|
||||
let tokens : Token[] = [];
|
||||
let errors : Error[] = [];
|
||||
|
||||
for(let i = 0; i < input.length; i++) {
|
||||
|
||||
// 4 spaces = 1 tab. That is final. Debate over
|
||||
if(options.convert_spaces_to_tabs && input.startsWith(" ", i)) {
|
||||
tokens.push(new Token(TokenType.INDENT));
|
||||
i += 3;
|
||||
}
|
||||
// between (ex: 0...3 or 0-3)
|
||||
else if(input.startsWith("...", i)) {
|
||||
tokens.push(new Token(TokenType.BETWEEN));
|
||||
i += 2;
|
||||
} else if(input.startsWith("..", i)) {
|
||||
tokens.push(new Token(TokenType.BETWEEN));
|
||||
i += 1;
|
||||
}
|
||||
// comments
|
||||
else if(input.startsWith("//", i)) {
|
||||
i += 1;
|
||||
while(i < input.length) {
|
||||
if(input[i] == '\n') {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
} else if (input.startsWith("\r\n", i)) {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
i += 1;
|
||||
} else {
|
||||
switch(input[i]) {
|
||||
// comment
|
||||
case '#':
|
||||
i++;
|
||||
while(i < input.length) {
|
||||
if(input[i] == '\n') {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
break;
|
||||
// quote
|
||||
case '"':
|
||||
case '\"':
|
||||
// build up a word between quotes
|
||||
const quote_char = input[i];
|
||||
let found_ending = false;
|
||||
|
||||
let quote = "";
|
||||
|
||||
do {
|
||||
i++;
|
||||
if(input[i] == quote_char) {
|
||||
found_ending = true;
|
||||
break;
|
||||
}
|
||||
else if(input[i] == '\n') {
|
||||
|
||||
}
|
||||
} while(i < input.length);
|
||||
|
||||
if(found_ending) {
|
||||
tokens.push(new Token(TokenType.QUOTE, quote));
|
||||
}
|
||||
else {
|
||||
// Skip until newline and throw an error
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
// between (ex: 0...3 or 0-3)
|
||||
case '-':
|
||||
tokens.push(new Token(TokenType.BETWEEN));
|
||||
break;
|
||||
case '\n':
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT));
|
||||
break;
|
||||
case '\r':
|
||||
// ignore
|
||||
break;
|
||||
case '\t':
|
||||
tokens.push(new Token(TokenType.INDENT));
|
||||
break;
|
||||
case ' ':
|
||||
break;
|
||||
default:
|
||||
// is digit? build up a number
|
||||
|
||||
// is char? build up a word
|
||||
|
||||
keywords.includes("word");
|
||||
// build up a word
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { tokens: tokens, errors: errors };
|
||||
}
|
||||
|
||||
/*
|
||||
String.prototype.escape = function() {
|
||||
var tagsToReplace = {
|
||||
'&': '&',
|
||||
'<': '<',
|
||||
'>': '>'
|
||||
};
|
||||
return this.replace(/[&<>]/g, function(tag) {
|
||||
return tagsToReplace[tag] || tag;
|
||||
});
|
||||
};
|
||||
String.prototype.norm = function() {
|
||||
if(String.prototype.normalize != undefined) {
|
||||
return this.normalize("NFD").replace(/[\u0300-\u036F]/g,"");
|
||||
}
|
||||
return this;
|
||||
};
|
||||
|
||||
*/
|
||||
|
||||
$( function() {
|
||||
import { Token, TokenType } from "./tokens";
|
||||
import { TokenizerOptions, tokenize } from "./tokenizer";
|
||||
import { ParserOptions, parse } from "./parser";
|
||||
|
||||
$(function() {
|
||||
|
||||
});
|
@ -263,8 +263,8 @@ footer {
|
||||
}
|
||||
|
||||
/* accessibility */
|
||||
/* FIX: as rendered, the `a` rule was missing its closing brace, leaving
   `a:hover` nested inside it — nesting is invalid in plain CSS, so the
   hover color would never apply. Split into two sibling rules. */
a {
    color: #00497A;
}

a:hover {
    color: #208bff;
}
|
||||
|
||||
.navbar-light .navbar-nav .nav-link {
|
||||
|
356
src/tokenizer.ts
Normal file
356
src/tokenizer.ts
Normal file
@ -0,0 +1,356 @@
|
||||
/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
|
||||
|
||||
// TODO: replace every version of switch(<some string>) with switch(<some string>.charCodeAt(0))
|
||||
|
||||
import { Token, TokenType, TokenError } from "./tokens";
|
||||
|
||||
const keywords = {
|
||||
"optional": TokenType.KEYWORD_OPTIONAL,
|
||||
"optionally": TokenType.KEYWORD_OPTIONAL,
|
||||
"match": TokenType.KEYWORD_MATCH,
|
||||
"then": TokenType.KEYWORD_THEN,
|
||||
"any": TokenType.KEYWORD_ANY,
|
||||
"anything": TokenType.KEYWORD_ANY,
|
||||
"of": TokenType.KEYWORD_OF,
|
||||
"or": TokenType.KEYWORD_OR,
|
||||
"and": TokenType.KEYWORD_AND,
|
||||
"word": TokenType.KEYWODE_WORD_SPECIFIER,
|
||||
"digit": TokenType.KEYWORD_DIGIT_SPECIFIER,
|
||||
"character": TokenType.KEYWORD_CHAR_SPECIFIER,
|
||||
"whitespace": TokenType.KEYWORD_WHITESPACE_SPECIFIER,
|
||||
"number": TokenType.KEYWORD_NUMBER_SPECIFIER,
|
||||
"multiple": TokenType.KEYWORD_MULTIPLE,
|
||||
"as": TokenType.KEYWORD_AS,
|
||||
"if": TokenType.KEYWORD_IF,
|
||||
"starts": TokenType.KEYWORD_STARTS,
|
||||
"with": TokenType.KEYWORD_WITH,
|
||||
"ends": TokenType.KEYWORD_ENDS,
|
||||
"otherwise": TokenType.KEYWORD_ELSE,
|
||||
"else": TokenType.KEYWORD_ELSE,
|
||||
"unless": TokenType.KEYWORD_UNLESS,
|
||||
"while": TokenType.KEYWORD_WHILE,
|
||||
"more": TokenType.KEYWORD_MORE,
|
||||
"using": TokenType.KEYWORD_USING,
|
||||
"global": TokenType.KEYWORD_GLOBAL,
|
||||
"multiline": TokenType.KEYWORD_MULTILINE,
|
||||
"exact": TokenType.KEYWORD_EXACT,
|
||||
"matching": TokenType.KEYWORD_MATCHING,
|
||||
"not": TokenType.KEYWORD_NOT,
|
||||
"between": TokenType.KEYWORD_BETWEEN,
|
||||
"tab": TokenType.KEYWORD_TAB,
|
||||
"linefeed": TokenType.KEYWORD_LINEFEED,
|
||||
"carriage": TokenType.KEYWORD_CARRIAGE,
|
||||
"return": TokenType.KEYWORD_RETURN,
|
||||
"group": TokenType.KEYWORD_GROUP,
|
||||
"by": TokenType.KEYWORD_BY,
|
||||
"an": TokenType.KEYWORD_ARTICLE,
|
||||
"a": TokenType.KEYWORD_ARTICLE,
|
||||
"the": TokenType.KEYWORD_ARTICLE,
|
||||
"exactly": TokenType.KEYWORD_EXACTLY,
|
||||
"inclusive": TokenType.KEYWORD_INCLUSIVE,
|
||||
"inclusively": TokenType.KEYWORD_INCLUSIVE,
|
||||
"exclusive": TokenType.KEYWORD_EXCLUSIVE,
|
||||
"exclusively": TokenType.KEYWORD_EXCLUSIVE,
|
||||
"from": TokenType.KEYWORD_FROM,
|
||||
"to": TokenType.KEYWORD_TO
|
||||
};
|
||||
|
||||
const escape_sequences = {
|
||||
'a': '\a',
|
||||
'b': '\b',
|
||||
'e': '\e',
|
||||
'f': '\f',
|
||||
'n': '\n',
|
||||
'r': '\r',
|
||||
't': '\t',
|
||||
'"': '"',
|
||||
'\'': '\'',
|
||||
'\\': '\\',
|
||||
};
|
||||
|
||||
export class TokenizerOptions {
|
||||
public convert_spaces_to_tabs: boolean = false;
|
||||
}
|
||||
|
||||
const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g);
|
||||
|
||||
function escape_sequence_gather_hex(input: string, i : number, max: number) : string {
|
||||
let hex = "";
|
||||
for(i++; i < input.length && max-- > 0; i++) {
|
||||
if(escape_sequence_hex_regex.test(input[i])) hex += input[i];
|
||||
}
|
||||
return hex;
|
||||
}
|
||||
|
||||
/**
 * Decodes the escape sequence beginning at input[i] (the character AFTER the
 * backslash; the caller consumes the backslash itself).
 *
 * Returns:
 *   code  — the decoded character ("" when the sequence is malformed)
 *   read  — how many characters of `input`, starting at i, belong to the
 *           sequence (includes the 'x'/'u'/'U' specifier, not the backslash)
 *   error — set when a 'u'/'U' sequence has too few hex digits
 */
function escape_sequence_mapper(input: string, i : number) : { code: string, read: number, error?: Error } {
    // simple one-character escapes (\n, \t, \\ ...) from the lookup table
    if(escape_sequences[input[i]] != undefined) {
        return { code: escape_sequences[input[i]], read: 1 };
    }
    //variable hex code
    else if(input[i] == 'x') {
        // \xH..H — up to 4 hex digits; read = digits consumed + the 'x' itself
        const hex = escape_sequence_gather_hex(input, ++i, 4);

        return { code: String.fromCharCode(parseInt(hex, 16)), read: hex.length + 1 };
    }
    //4 hex unicode
    else if(input[i] == 'u') {
        // \uHHHH — exactly 4 hex digits required
        const unicode = escape_sequence_gather_hex(input, ++i, 4);
        if(unicode.length != 4) {
            return { code: "", read: unicode.length + 1, error: new Error("Bad escape sequence")};
        }
        else {
            return { code: String.fromCharCode(parseInt(unicode, 16)), read: 5 };
        }
    }
    else if(input[i] == 'U') {
        // \UHHHHHHHH — exactly 8 hex digits required
        // NOTE(review): String.fromCharCode cannot represent code points above
        // 0xFFFF — String.fromCodePoint is presumably intended here; confirm.
        const unicode = escape_sequence_gather_hex(input, ++i, 8);

        if(unicode.length != 8) {
            return { code: "", read: unicode.length + 1, error: new Error("Bad escape sequence")};
        }
        else {
            return { code: String.fromCharCode(parseInt(unicode, 16)), read: 9 };
        }
    }
    else {
        // unknown escape: pass the character through unchanged
        // should throw an exception, but gonna just ignore it
        return { code: input[i], read: 1 };
    }
}
|
||||
|
||||
function is_digit(input: string) : boolean {
|
||||
//return /[0-9]/g.test(input);
|
||||
const value = input.charCodeAt(0);
|
||||
return value >= 48 && value <= 57;
|
||||
}
|
||||
|
||||
function is_char(input: string) : boolean {
|
||||
//return input.toUpperCase() != input.toLowerCase();
|
||||
//return /[a-zA-Z]/g.test(input);
|
||||
|
||||
const value = input.charCodeAt(0);
|
||||
return ((value >= 65 && value <= 90) || (value >= 97 && value <= 122));
|
||||
}
|
||||
|
||||
/* Basic Tokenizer */
|
||||
export function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: TokenError[] } {
|
||||
let line = 1;
|
||||
let position = 1;
|
||||
|
||||
let tokens : Token[] = [];
|
||||
let errors : TokenError[] = [];
|
||||
|
||||
for(let i = 0; i < input.length; i++, position++) {
|
||||
// 4 spaces = 1 tab. That is final. Debate over
|
||||
if(options.convert_spaces_to_tabs && input.startsWith(" ", i)) {
|
||||
tokens.push(new Token(TokenType.INDENT, line, position));
|
||||
i += 3;
|
||||
position += 3;
|
||||
}
|
||||
// between (ex: 0...3 or 0-3)
|
||||
else if(input.startsWith("...", i)) {
|
||||
tokens.push(new Token(TokenType.BETWEEN, line, position));
|
||||
i += 2;
|
||||
position += 2;
|
||||
}
|
||||
else if(input.startsWith("..", i)) {
|
||||
tokens.push(new Token(TokenType.BETWEEN, line, position));
|
||||
i++;
|
||||
position++;
|
||||
}
|
||||
// comments
|
||||
else if(input.startsWith("//", i)) {
|
||||
for(i++, position++; i < input.length; i++, position++) {
|
||||
if(input[i] == '\n') {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
||||
break;
|
||||
}
|
||||
}
|
||||
line++;
|
||||
position = 0;
|
||||
}
|
||||
else if(input.startsWith("/*", i)) {
|
||||
for(i++, position++; i < input.length-1; i++, position++) {
|
||||
if(input[i] == '*' && input[i+1] == '/') {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
||||
i++;
|
||||
position++;
|
||||
break;
|
||||
}
|
||||
if(input[i] == '\n') {
|
||||
line++;
|
||||
position = 0;
|
||||
}
|
||||
}
|
||||
if(i == input.length-1) {
|
||||
errors.push(new TokenError("Unexpected EOF", line, position));
|
||||
}
|
||||
else {
|
||||
line++;
|
||||
position = 0;
|
||||
}
|
||||
}
|
||||
else if (input.startsWith("\r\n", i)) {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
||||
i++;
|
||||
line++;
|
||||
position = 0;
|
||||
}
|
||||
else {
|
||||
switch(input[i]) {
|
||||
// comment
|
||||
case '#':
|
||||
for(i++, position++; i < input.length; i++, position++) {
|
||||
if(input[i] == '\n') {
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
||||
line++;
|
||||
position = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
// quote
|
||||
case '"':
|
||||
case '\"':
|
||||
// build up a word between quotes
|
||||
const quote_begin = { line: line, position: position };
|
||||
const quote_char = input[i];
|
||||
let found_ending = false;
|
||||
|
||||
let quote = "";
|
||||
|
||||
do {
|
||||
i++;
|
||||
position++;
|
||||
if(input[i] == '\\') {
|
||||
i++;
|
||||
position++;
|
||||
const sequence = escape_sequence_mapper(input, i);
|
||||
|
||||
if(sequence.error != undefined) {
|
||||
errors.push(new TokenError(sequence.error.message, line, position));
|
||||
}
|
||||
|
||||
position += sequence.read;
|
||||
i += sequence.read;
|
||||
quote += sequence.code;
|
||||
|
||||
}
|
||||
else if(input[i] == quote_char) {
|
||||
found_ending = true;
|
||||
break;
|
||||
}
|
||||
else if(input[i] == '\n') {
|
||||
line++;
|
||||
position = 0;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
quote += input[i];
|
||||
}
|
||||
} while(i < input.length);
|
||||
|
||||
if(found_ending) {
|
||||
tokens.push(new Token(TokenType.QUOTE, line, position, quote));
|
||||
}
|
||||
else {
|
||||
//we reached the end of the line or the end of the file
|
||||
errors.push(new TokenError(`Unexpected end of quote. Quote began at ${quote_begin.line}:${quote_begin.position}`, line, position));
|
||||
line++;
|
||||
position = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
// between (ex: 0...3 or 0-3)
|
||||
case '-':
|
||||
tokens.push(new Token(TokenType.BETWEEN, line, position));
|
||||
break;
|
||||
case '\n':
|
||||
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
||||
break;
|
||||
case '\r':
|
||||
// ignore
|
||||
break;
|
||||
case '\t':
|
||||
tokens.push(new Token(TokenType.INDENT, line, position));
|
||||
break;
|
||||
case ' ':
|
||||
break;
|
||||
default:
|
||||
// is digit? build up a number
|
||||
if(is_digit(input[i])) {
|
||||
let digits = input[i];
|
||||
|
||||
do {
|
||||
i++; position++;
|
||||
digits += input[i];
|
||||
} while(i+1 < input.length && is_digit(input[i+1]));
|
||||
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, digits));
|
||||
}
|
||||
// is char? build up a word
|
||||
else if(is_char(input[i])) {
|
||||
let text = input[i];
|
||||
|
||||
do {
|
||||
i++; position++;
|
||||
text += input[i];
|
||||
} while(i+1 < input.length && is_char(input[i+1]));
|
||||
|
||||
const keyword_text = text.toLowerCase();
|
||||
|
||||
if(keywords[keyword_text] != undefined) {
|
||||
tokens.push(new Token(keywords[keyword_text], line, position));
|
||||
}
|
||||
else {
|
||||
switch(keyword_text) {
|
||||
case "none":
|
||||
case "zero":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "0"));
|
||||
break;
|
||||
case "one":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "1"));
|
||||
break;
|
||||
case "two":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "2"));
|
||||
break;
|
||||
case "three":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "3"));
|
||||
break;
|
||||
case "four":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "4"));
|
||||
break;
|
||||
case "five":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "5"));
|
||||
break;
|
||||
case "six":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "6"));
|
||||
break;
|
||||
case "seven":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "7"));
|
||||
break;
|
||||
case "eight":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "8"));
|
||||
break;
|
||||
case "nine":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "9"));
|
||||
break;
|
||||
case "ten":
|
||||
tokens.push(new Token(TokenType.NUMBER, line, position, "10"));
|
||||
break;
|
||||
default:
|
||||
errors.push(new TokenError(`Unknown keyword ${text}`, line, position));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { tokens: tokens, errors: errors };
|
||||
}
|
64
src/tokens.ts
Normal file
64
src/tokens.ts
Normal file
@ -0,0 +1,64 @@
|
||||
/** Every token category the tokenizer (src/tokenizer.ts) can emit. */
export enum TokenType {
    // structural tokens
    END_OF_STATEMENT,
    INDENT,
    BETWEEN,
    // tokens that carry a token_string payload
    QUOTE,
    NUMBER,
    // language keywords
    KEYWORD_BETWEEN,
    KEYWORD_OPTIONAL,
    KEYWORD_MATCH,
    KEYWORD_THEN,
    KEYWORD_AND,
    KEYWORD_OR,
    KEYWORD_ANY,
    KEYWORD_OF,
    // NOTE(review): "KEYWODE" is a typo for "KEYWORD" — the keyword table in
    // src/tokenizer.ts references this exact name, so renaming must be done
    // in both files together.
    KEYWODE_WORD_SPECIFIER,
    KEYWORD_DIGIT_SPECIFIER,
    KEYWORD_CHAR_SPECIFIER,
    KEYWORD_WHITESPACE_SPECIFIER,
    KEYWORD_NUMBER_SPECIFIER,
    KEYWORD_MULTIPLE,
    KEYWORD_AS,
    KEYWORD_IF,
    KEYWORD_STARTS,
    KEYWORD_WITH,
    KEYWORD_ENDS,
    KEYWORD_ELSE,
    KEYWORD_UNLESS,
    KEYWORD_WHILE,
    KEYWORD_MORE,
    KEYWORD_USING,
    KEYWORD_GLOBAL,
    KEYWORD_MULTILINE,
    KEYWORD_EXACT,
    KEYWORD_MATCHING,
    KEYWORD_NOT,
    KEYWORD_TAB,
    KEYWORD_LINEFEED,
    KEYWORD_CARRIAGE,
    KEYWORD_RETURN,
    KEYWORD_GROUP,
    KEYWORD_BY,
    // covers "a", "an", "the"
    KEYWORD_ARTICLE,
    KEYWORD_EXACTLY,
    KEYWORD_INCLUSIVE,
    KEYWORD_EXCLUSIVE,
    KEYWORD_FROM,
    KEYWORD_TO
}
|
||||
|
||||
export class TokenError extends Error {
|
||||
constructor(message: string, public line: number, public position: number) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
public to_string() {
|
||||
return `${this.line}:${this.position} ${this.message}`;
|
||||
}
|
||||
}
|
||||
|
||||
export class Token {
|
||||
constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
|
||||
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user