Refactored the code and added a tokenizer

TODO: webpack config?
2025-05-16 12:30:09 -07:00 · 2020-10-10 04:09:13 -04:00 · 2020-10-10 04:09:13 -04:00 · 40ca670a2a
commit 40ca670a2a
parent 0a4f65b1a8
6 changed files with 438 additions and 317 deletions
--- a/src/parser.ts
+++ b/src/parser.ts
@ -0,0 +1,11 @@
 /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
 import { Token, TokenType } from "./tokens";
 /**
  * Options bag for the parser.
  *
  * Currently an empty placeholder: it declares no fields and `parse` does not
  * accept it yet.
  */
 export class ParserOptions {
 }
 /**
  * Parser entry point (stub).
  *
  * @param tokens Token list to parse; currently ignored.
  * @returns Always `undefined` - no parsing is implemented yet.
  */
 export function parse(tokens: Token[]) {
    return undefined;
 }
--- a/src/script.js
+++ b/src/script.js
@ -1,152 +0,0 @@
 /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
 "use strict";
 // Words reserved by the language. In this version the list is only touched by the
 // placeholder `keywords.includes("word")` call in tokenize()'s default branch.
 const keywords = [
    "optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character",
    "multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything",
    "whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more",
    "using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return",
    "group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to"
 ];
 // Numeric enum of token kinds. Each statement below stores both directions of the
 // mapping: name -> number (e.g. TokenType.INDENT === 1) and number -> name
 // (e.g. TokenType[1] === "INDENT").
 var TokenType;
 (function (TokenType) {
    TokenType[TokenType["END_OF_STATEMENT"] = 0] = "END_OF_STATEMENT";
    TokenType[TokenType["INDENT"] = 1] = "INDENT";
    TokenType[TokenType["BETWEEN"] = 2] = "BETWEEN";
    TokenType[TokenType["QUOTE"] = 3] = "QUOTE";
    TokenType[TokenType["KEYWORD_BETWEEN"] = 4] = "KEYWORD_BETWEEN";
    TokenType[TokenType["KEYWORD_OPTIONAL"] = 5] = "KEYWORD_OPTIONAL";
    TokenType[TokenType["KEYWORD_MATCH"] = 6] = "KEYWORD_MATCH";
    TokenType[TokenType["KEYWORD_THEN"] = 7] = "KEYWORD_THEN";
    TokenType[TokenType["KEYWORD_AND"] = 8] = "KEYWORD_AND";
    TokenType[TokenType["KEYWORD_OR"] = 9] = "KEYWORD_OR";
    TokenType[TokenType["KEYWORD_ANY"] = 10] = "KEYWORD_ANY";
    TokenType[TokenType["KEYWORD_OF"] = 11] = "KEYWORD_OF";
 // Reuse an existing TokenType object if one is already defined, otherwise start from {}.
 })(TokenType || (TokenType = {}));
 // A single lexical token: its TokenType plus, optionally, the text it carries.
 class Token {
    // `token_string` is undefined when the caller omits it (tokenize() only passes it for QUOTE).
    constructor(type, token_string) {
        this.type = type;
        this.token_string = token_string;
    }
 }
 // Options consumed by tokenize().
 class TokenizerOptions {
    constructor() {
        // When true, tokenize() turns every run of four spaces into one INDENT token.
        this.convert_spaces_to_tabs = false;
    }
 }
 /*
  * Basic tokenizer: to be replaced with a Unicode-aware variant later.
  *
  * Walks `input` one character at a time and returns `{ tokens, errors }`.
  * In this version `errors` is never appended to, and words/numbers are not
  * tokenized yet (the `default` branch is a placeholder).
  */
 function tokenize(input, options) {
    let tokens = [];
    let errors = [];
    for (let i = 0; i < input.length; i++) {
        // 4 spaces = 1 tab. That is final. Debate over
        if (options.convert_spaces_to_tabs && input.startsWith("    ", i)) {
            tokens.push(new Token(TokenType.INDENT));
            // skip 3 of the 4 spaces; the loop's i++ skips the last one
            i += 3;
        }
        // between (ex: 0...3 or 0-3)
        else if (input.startsWith("...", i)) {
            tokens.push(new Token(TokenType.BETWEEN));
            i += 2;
        }
        else if (input.startsWith("..", i)) {
            tokens.push(new Token(TokenType.BETWEEN));
            i += 1;
        }
        // comments
        else if (input.startsWith("//", i)) {
            i += 1;
            // skip to the end of the line; the newline itself becomes END_OF_STATEMENT
            while (i < input.length) {
                if (input[i] == '\n') {
                    tokens.push(new Token(TokenType.END_OF_STATEMENT));
                    break;
                }
                i++;
            }
        }
        else if (input.startsWith("\r\n", i)) {
            // a CRLF pair yields a single END_OF_STATEMENT
            tokens.push(new Token(TokenType.END_OF_STATEMENT));
            i += 1;
        }
        else {
            switch (input[i]) {
                // comment
                case '#':
                    i++;
                    while (i < input.length) {
                        if (input[i] == '\n') {
                            tokens.push(new Token(TokenType.END_OF_STATEMENT));
                            break;
                        }
                        i++;
                    }
                    break;
                // quote
                // NOTE(review): '"' and '\"' are the same character, so the second label is
                // redundant; a single-quote label was probably intended - confirm.
                case '"':
                case '\"':
                    // build up a word between quotes
                    const quote_char = input[i];
                    let found_ending = false;
                    // never appended to below, so the QUOTE token always carries ""
                    let quote = "";
                    do {
                        i++;
                        if (input[i] == quote_char) {
                            found_ending = true;
                            break;
                        }
                        else if (input[i] == '\n') {
                            // not handled yet: a newline inside a quote is simply skipped over
                        }
                    } while (i < input.length);
                    if (found_ending) {
                        tokens.push(new Token(TokenType.QUOTE, quote));
                    }
                    else {
                        // Skip until newline and throw an error
                        // (not implemented: an unterminated quote is dropped silently)
                    }
                    break;
                // between (ex: 0...3 or 0-3)
                case '-':
                    tokens.push(new Token(TokenType.BETWEEN));
                    break;
                case '\n':
                    tokens.push(new Token(TokenType.END_OF_STATEMENT));
                    break;
                case '\r':
                    // ignore
                    break;
                case '\t':
                    tokens.push(new Token(TokenType.INDENT));
                    break;
                case ' ':
                    break;
                default:
                    // is digit? build up a number
                    // is char? build up a word
                    // placeholder: the result of this lookup is discarded
                    keywords.includes("word");
                    // build up a word
                    break;
            }
        }
    }
    return { tokens: tokens, errors: errors };
 }
 /*
 String.prototype.escape = function() {
    var tagsToReplace = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;'
    };
    return this.replace(/[&<>]/g, function(tag) {
        return tagsToReplace[tag] || tag;
    });
 };
 String.prototype.norm = function() {
    if(String.prototype.normalize != undefined) {
        return this.normalize("NFD").replace(/[\u0300-\u036F]/g,"");
    }
    return this;
 };
 */
 // Presumably jQuery's document-ready shorthand (`$` is not defined in this file - confirm).
 // The callback is empty, so nothing runs yet.
 $(function () {
 });
--- a/src/script.ts
+++ b/src/script.ts
@ -1,167 +1,9 @@
 /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
 "use strict";
-const keywords = [
+import { Token, TokenType } from "./tokens";
-    "optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character", 
+import { TokenizerOptions, tokenize } from "./tokenizer";
-    "multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything",
+import { ParserOptions, parse } from "./parser";
    "whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more",
    "using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return",
    "group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to"
 ];
 /** Kinds of token produced by `tokenize`. Members are numbered implicitly from 0 in declaration order. */
 enum TokenType {
    // structural tokens
    END_OF_STATEMENT,
    INDENT,
    BETWEEN,
    QUOTE,
    // keyword tokens (declared, but not yet produced by tokenize in this version)
    KEYWORD_BETWEEN,
    KEYWORD_OPTIONAL,
    KEYWORD_MATCH,
    KEYWORD_THEN,
    KEYWORD_AND,
    KEYWORD_OR,
    KEYWORD_ANY,
    KEYWORD_OF,
 }
 /** A single lexical token: its kind plus, optionally, the text it carries. */
 class Token {
    // Parameter properties: `type` and `token_string` become public fields of the instance.
    constructor(public type: TokenType, public token_string?: string) {
    }
 }
 /** Options consumed by `tokenize`. */
 class TokenizerOptions {
    /** When true, `tokenize` turns every run of four spaces into one INDENT token. */
    public convert_spaces_to_tabs: boolean = false;
 }
 /*
  * Basic tokenizer: to be replaced with a Unicode-aware variant later.
  *
  * Walks `input` one character at a time and returns `{ tokens, errors }`.
  * In this version `errors` is never appended to, and words/numbers are not
  * tokenized yet (the `default` branch is a placeholder).
  */
 function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: Error[] } {
    let tokens : Token[] = [];
    let errors : Error[] = [];
    for(let i = 0; i < input.length; i++) {
        // 4 spaces = 1 tab. That is final. Debate over
        if(options.convert_spaces_to_tabs && input.startsWith("    ", i)) {
            tokens.push(new Token(TokenType.INDENT));
            // skip 3 of the 4 spaces; the loop's i++ skips the last one
            i += 3;
        } 
        // between (ex: 0...3 or 0-3)
        else if(input.startsWith("...", i)) {
            tokens.push(new Token(TokenType.BETWEEN));
            i += 2;
        } else if(input.startsWith("..", i)) {
            tokens.push(new Token(TokenType.BETWEEN));
            i += 1; 
        } 
        // comments
        else if(input.startsWith("//", i)) {
            i += 1;
            // skip to the end of the line; the newline itself becomes END_OF_STATEMENT
            while(i < input.length) {
                if(input[i] == '\n') {
                    tokens.push(new Token(TokenType.END_OF_STATEMENT));
                    break;
                }
                i++;
            }
        } else if (input.startsWith("\r\n", i)) {
            // a CRLF pair yields a single END_OF_STATEMENT
            tokens.push(new Token(TokenType.END_OF_STATEMENT));
            i += 1;
        } else {
            switch(input[i]) {
                // comment
                case '#':
                    i++;
                    while(i < input.length) {
                        if(input[i] == '\n') {
                            tokens.push(new Token(TokenType.END_OF_STATEMENT));
                            break;
                        }
                        i++;
                    }
                    break;
                // quote
                // NOTE(review): '"' and '\"' are the same character, so the second label is
                // redundant; a single-quote label was probably intended - confirm.
                case '"':
                case '\"':
                    // build up a word between quotes
                    const quote_char = input[i];
                    let found_ending = false;
                    // never appended to below, so the QUOTE token always carries ""
                    let quote = "";
                    do {
                        i++;
                        if(input[i] == quote_char) {
                            found_ending = true;
                            break;
                        }
                        else if(input[i] == '\n') {
                            // not handled yet: a newline inside a quote is simply skipped over
                        }
                    } while(i < input.length);
                    if(found_ending) {
                        tokens.push(new Token(TokenType.QUOTE, quote));
                    }
                    else {
                        // Skip until newline and throw an error
                        // (not implemented: an unterminated quote is dropped silently)
                    }
                    break;
                // between (ex: 0...3 or 0-3)
                case '-':
                    tokens.push(new Token(TokenType.BETWEEN));
                    break;
                case '\n':
                    tokens.push(new Token(TokenType.END_OF_STATEMENT));
                    break;
                case '\r':
                    // ignore
                    break;
                case '\t':
                    tokens.push(new Token(TokenType.INDENT));
                    break;
                case ' ':
                    break;
                default:
                    // is digit? build up a number
                    // is char? build up a word
                    // placeholder: the result of this lookup is discarded
                    keywords.includes("word");
                    // build up a word
                    break;
            }
        }
    }
    return { tokens: tokens, errors: errors };
 }
 /*
 String.prototype.escape = function() {
    var tagsToReplace = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;'
    };
    return this.replace(/[&<>]/g, function(tag) {
        return tagsToReplace[tag] || tag;
    });
 };
 String.prototype.norm = function() {
 	if(String.prototype.normalize != undefined) {
 		return this.normalize("NFD").replace(/[\u0300-\u036F]/g,"");
 	}
 	return this;
 };
 */
 $( function() {
 $(function() {
 });
--- a/src/style.css
+++ b/src/style.css
@ -263,8 +263,8 @@ footer {
 }
 /* accessibility */
-a {
+a:hover {
-	color: #00497A;
+    color: #208bff;
 }
 .navbar-light .navbar-nav .nav-link {
--- a/src/tokenizer.ts
+++ b/src/tokenizer.ts
@ -0,0 +1,356 @@
 /*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
 // TODO: replace every version of switch(<some string>) with switch(<some string>.charCodeAt(0))
 import { Token, TokenType, TokenError } from "./tokens";
 /**
  * Lookup table from lower-cased keyword text to the token type it produces.
  * Several spellings may share one token type (e.g. "optional"/"optionally").
  *
  * The table is built on a null-prototype object so that ordinary words which
  * happen to name `Object.prototype` members ("constructor", "valueof", ...)
  * are not reported as keywords, and it is typed with a string index so that
  * `keywords[text]` type-checks under `noImplicitAny`.
  */
 const keywords: Readonly<Record<string, TokenType>> = Object.assign(Object.create(null), {
    "optional": TokenType.KEYWORD_OPTIONAL,
    "optionally": TokenType.KEYWORD_OPTIONAL,
    "match": TokenType.KEYWORD_MATCH,
    "then": TokenType.KEYWORD_THEN,
    "any": TokenType.KEYWORD_ANY,
    "anything": TokenType.KEYWORD_ANY,
    "of": TokenType.KEYWORD_OF,
    "or": TokenType.KEYWORD_OR,
    "and": TokenType.KEYWORD_AND,
    // NOTE(review): the enum member is spelled KEYWODE_... in tokens.ts; kept as declared there.
    "word": TokenType.KEYWODE_WORD_SPECIFIER,
    "digit": TokenType.KEYWORD_DIGIT_SPECIFIER,
    "character": TokenType.KEYWORD_CHAR_SPECIFIER,
    "whitespace": TokenType.KEYWORD_WHITESPACE_SPECIFIER,
    "number": TokenType.KEYWORD_NUMBER_SPECIFIER,
    "multiple": TokenType.KEYWORD_MULTIPLE,
    "as": TokenType.KEYWORD_AS,
    "if": TokenType.KEYWORD_IF,
    "starts": TokenType.KEYWORD_STARTS,
    "with": TokenType.KEYWORD_WITH,
    "ends": TokenType.KEYWORD_ENDS,
    "otherwise": TokenType.KEYWORD_ELSE,
    "else": TokenType.KEYWORD_ELSE,
    "unless": TokenType.KEYWORD_UNLESS,
    "while": TokenType.KEYWORD_WHILE,
    "more": TokenType.KEYWORD_MORE,
    "using": TokenType.KEYWORD_USING,
    "global": TokenType.KEYWORD_GLOBAL,
    "multiline": TokenType.KEYWORD_MULTILINE,
    "exact": TokenType.KEYWORD_EXACT,
    "matching": TokenType.KEYWORD_MATCHING,
    "not": TokenType.KEYWORD_NOT,
    "between": TokenType.KEYWORD_BETWEEN,
    "tab": TokenType.KEYWORD_TAB,
    "linefeed": TokenType.KEYWORD_LINEFEED,
    "carriage": TokenType.KEYWORD_CARRIAGE,
    "return": TokenType.KEYWORD_RETURN,
    "group": TokenType.KEYWORD_GROUP,
    "by": TokenType.KEYWORD_BY,
    "an": TokenType.KEYWORD_ARTICLE,
    "a": TokenType.KEYWORD_ARTICLE,
    "the": TokenType.KEYWORD_ARTICLE,
    "exactly": TokenType.KEYWORD_EXACTLY,
    "inclusive": TokenType.KEYWORD_INCLUSIVE,
    "inclusively": TokenType.KEYWORD_INCLUSIVE,
    "exclusive": TokenType.KEYWORD_EXCLUSIVE,
    "exclusively": TokenType.KEYWORD_EXCLUSIVE,
    "from": TokenType.KEYWORD_FROM,
    "to": TokenType.KEYWORD_TO
 });
 /**
  * Single-character escapes: the character that follows a backslash, mapped to
  * the character it denotes.
  *
  * JavaScript has no `\a` or `\e` escapes (they evaluate to the plain letters
  * "a" and "e"), so the bell and escape characters are spelled as hex escapes.
  */
 const escape_sequences: Readonly<Record<string, string>> = {
    'a': '\x07',
    'b': '\b',
    'e': '\x1B',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
    '"': '"',
    '\'': '\'',
    '\\': '\\',
 };
 /** Options consumed by `tokenize`. */
 export class TokenizerOptions {
    /** When true, every run of four spaces is tokenized as one INDENT. */
    public convert_spaces_to_tabs: boolean = false;
 }
 /**
  * Matches exactly one hexadecimal digit.
  *
  * Deliberately NOT a global (`g`) regex: `test()` on a global regex keeps
  * `lastIndex` between calls, which makes every second one-character test fail.
  */
 const escape_sequence_hex_regex = /^[0-9A-Fa-f]$/;
 /**
  * Collects the run of hex digits that follows the escape letter.
  *
  * @param input Text being scanned.
  * @param i     Index of the escape letter (`x`, `u` or `U`); digits start at `i + 1`.
  * @param max   Maximum number of digits to collect.
  * @returns The digits found, stopping at the first non-hex character, at `max`
  *          digits, or at the end of `input` - whichever comes first.
  */
 function escape_sequence_gather_hex(input: string, i : number, max: number) : string {
    let hex = "";
    for(i++; i < input.length && hex.length < max; i++) {
        if(!escape_sequence_hex_regex.test(input[i])) {
            break;
        }
        hex += input[i];
    }
    return hex;
 }
 /**
  * Decodes the escape sequence whose first character (the one right after the
  * backslash) is at `input[i]`.
  *
  * Supported forms: the single-character escapes in `escape_sequences`,
  * `\x` followed by 1-4 hex digits, `\u` followed by exactly 4 and `\U` followed
  * by exactly 8. Any other character stands for itself.
  *
  * @param input Text being scanned.
  * @param i     Index of the character following the backslash.
  * @returns `code`  - the decoded text ("" when the sequence is malformed);
  *          `read`  - how many characters were consumed starting at `i`;
  *          `error` - present only when the sequence is malformed.
  */
 function escape_sequence_mapper(input: string, i : number) : { code: string, read: number, error?: Error } {
    // a backslash at the very end of the input has nothing to escape
    if(i >= input.length) {
        return { code: "", read: 0, error: new Error("Bad escape sequence") };
    }

    const letter = input[i];
    const simple = Object.prototype.hasOwnProperty.call(escape_sequences, letter) ? escape_sequences[letter] : undefined;
    if(simple !== undefined) {
        return { code: simple, read: 1 };
    }

    let max_digits: number;
    let exact: boolean;
    switch(letter) {
        //variable hex code
        case 'x': max_digits = 4; exact = false; break;
        //4 hex unicode
        case 'u': max_digits = 4; exact = true; break;
        //8 hex unicode
        case 'U': max_digits = 8; exact = true; break;
        default:
            // should throw an exception, but gonna just ignore it
            return { code: letter, read: 1 };
    }

    const hex = escape_sequence_gather_hex(input, i, max_digits);
    const read = hex.length + 1; // the digits plus the escape letter itself
    if(hex.length == 0 || (exact && hex.length != max_digits)) {
        return { code: "", read: read, error: new Error("Bad escape sequence") };
    }

    const code_point = parseInt(hex, 16);
    // String.fromCodePoint throws above the last Unicode code point; report it instead
    if(code_point > 0x10FFFF) {
        return { code: "", read: read, error: new Error("Bad escape sequence") };
    }
    // fromCodePoint (not fromCharCode) so that \U values above 0xFFFF are not truncated
    return { code: String.fromCodePoint(code_point), read: read };
 }
 /**
  * Tells whether the first character of `input` is an ASCII decimal digit.
  *
  * @param input Text to inspect; only its first UTF-16 code unit is examined.
  * @returns `true` for "0" through "9", `false` otherwise (including for "").
  */
 function is_digit(input: string) : boolean {
    const ZERO = 0x30;
    const NINE = 0x39;
    // charCodeAt yields NaN for an empty string, and every comparison with NaN is false
    const code = input.charCodeAt(0);
    return ZERO <= code && code <= NINE;
 }
 /**
  * Tells whether the first character of `input` is an ASCII letter.
  *
  * @param input Text to inspect; only its first UTF-16 code unit is examined.
  * @returns `true` for A-Z and a-z, `false` otherwise (including for "" and
  *          for non-ASCII letters).
  */
 function is_char(input: string) : boolean {
    // charCodeAt yields NaN for an empty string, which fails both range checks
    const code = input.charCodeAt(0);
    const is_upper = code >= 0x41 && code <= 0x5A;
    const is_lower = code >= 0x61 && code <= 0x7A;
    return is_upper || is_lower;
 }
 /** Number words accepted in place of digits, mapped to the digit string they stand for. */
 const number_words: ReadonlyMap<string, string> = new Map([
    ["none", "0"],
    ["zero", "0"],
    ["one", "1"],
    ["two", "2"],
    ["three", "3"],
    ["four", "4"],
    ["five", "5"],
    ["six", "6"],
    ["seven", "7"],
    ["eight", "8"],
    ["nine", "9"],
    ["ten", "10"],
 ]);
 /**
  * Basic tokenizer.
  *
  * Walks `input` once and produces tokens for indentation, ranges (`..`, `...`,
  * `-`), quoted strings (single or double quotes, with backslash escapes),
  * numbers, keywords, number words and statement ends. Line (`//`, `#`) and
  * block comments are skipped. Problems are collected in `errors` rather than
  * thrown, so scanning always runs to the end of the input.
  *
  * Positions are 1-based. Number, word and quote tokens are stamped with the
  * column of their last character; fixed-width tokens with the column of their
  * first character.
  *
  * @param input   Source text to tokenize.
  * @param options `convert_spaces_to_tabs` turns every run of four spaces into an INDENT.
  * @returns The tokens in source order and any errors encountered.
  */
 export function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: TokenError[] } {
    let line = 1;
    let position = 1;
    const tokens : Token[] = [];
    const errors : TokenError[] = [];

    for(let i = 0; i < input.length; i++, position++) {
        // 4 spaces = 1 tab. That is final. Debate over
        if(options.convert_spaces_to_tabs && input.startsWith("    ", i)) {
            tokens.push(new Token(TokenType.INDENT, line, position));
            i += 3;
            position += 3;
        }
        // between (ex: 0...3 or 0-3)
        else if(input.startsWith("...", i)) {
            tokens.push(new Token(TokenType.BETWEEN, line, position));
            i += 2;
            position += 2;
        }
        else if(input.startsWith("..", i)) {
            tokens.push(new Token(TokenType.BETWEEN, line, position));
            i++;
            position++;
        }
        // line comment: runs up to the newline, which ends the statement
        else if(input.startsWith("//", i)) {
            i++;
            position++;
            while(i < input.length && input[i] != '\n') {
                i++;
                position++;
            }
            // only count a new line when one was actually found (not at end of input)
            if(i < input.length) {
                tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                line++;
                position = 0;
            }
        }
        // block comment
        else if(input.startsWith("/*", i)) {
            let closed = false;
            // step over the opening "/*" so that "/*/" is not taken for a complete comment
            i += 2;
            position += 2;
            for(; i < input.length - 1; i++, position++) {
                if(input[i] == '*' && input[i+1] == '/') {
                    // NOTE(review): a closed block comment emits END_OF_STATEMENT, as before;
                    // confirm that a comment in the middle of a statement should end it.
                    tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                    i++;
                    position++;
                    closed = true;
                    break;
                }
                if(input[i] == '\n') {
                    line++;
                    position = 0;
                }
            }
            if(!closed) {
                errors.push(new TokenError("Unexpected EOF", line, position));
                // the rest of the input belongs to the unterminated comment
                i = input.length;
            }
        }
        else if (input.startsWith("\r\n", i)) {
            tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
            i++;
            line++;
            position = 0;
        }
        else {
            switch(input[i]) {
                // comment
                case '#':
                    for(i++, position++; i < input.length; i++, position++) {
                        if(input[i] == '\n') {
                            tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                            line++;
                            position = 0;
                            break;
                        }
                    }
                    break;
                // quote
                case '"':
                case '\'': {
                    // build up a word between quotes
                    const quote_begin = { line: line, position: position };
                    const quote_char = input[i];
                    let found_ending = false;
                    let quote = "";
                    for(i++, position++; i < input.length; i++, position++) {
                        if(input[i] == '\\') {
                            i++;
                            position++;
                            // a backslash at the very end of the input: the quote is unterminated
                            if(i >= input.length) {
                                break;
                            }
                            const sequence = escape_sequence_mapper(input, i);
                            if(sequence.error != undefined) {
                                errors.push(new TokenError(sequence.error.message, line, position));
                            }
                            // `read` counts characters starting at input[i]; the loop increment
                            // steps over the last of them, so advance by one less here
                            const extra = Math.max(sequence.read, 1) - 1;
                            i += extra;
                            position += extra;
                            quote += sequence.code;
                        }
                        else if(input[i] == quote_char) {
                            found_ending = true;
                            break;
                        }
                        else if(input[i] == '\n') {
                            break;
                        }
                        else {
                            quote += input[i];
                        }
                    }
                    if(found_ending) {
                        tokens.push(new Token(TokenType.QUOTE, line, position, quote));
                    }
                    else {
                        //we reached the end of the line or the end of the file
                        errors.push(new TokenError(`Unexpected end of quote. Quote began at ${quote_begin.line}:${quote_begin.position}`, line, position));
                        // stopped on a newline: count it exactly once
                        if(i < input.length) {
                            line++;
                            position = 0;
                        }
                    }
                    break;
                }
                // between (ex: 0...3 or 0-3)
                case '-':
                    tokens.push(new Token(TokenType.BETWEEN, line, position));
                    break;
                case '\n':
                    tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                    line++;
                    position = 0;
                    break;
                case '\r':
                    // ignore
                    break;
                case '\t':
                    tokens.push(new Token(TokenType.INDENT, line, position));
                    break;
                case ' ':
                    break;
                default:
                    // is digit? build up a number
                    if(is_digit(input[i])) {
                        let digits = input[i];
                        // look ahead before consuming so a lone digit does not swallow its neighbour
                        while(i+1 < input.length && is_digit(input[i+1])) {
                            i++; position++;
                            digits += input[i];
                        }
                        tokens.push(new Token(TokenType.NUMBER, line, position, digits));
                    }
                    // is char? build up a word
                    else if(is_char(input[i])) {
                        let text = input[i];
                        while(i+1 < input.length && is_char(input[i+1])) {
                            i++; position++;
                            text += input[i];
                        }
                        const keyword_text = text.toLowerCase();
                        const number = number_words.get(keyword_text);
                        // own-property check: words such as "constructor" must not match
                        // members inherited from Object.prototype
                        if(Object.prototype.hasOwnProperty.call(keywords, keyword_text)) {
                            tokens.push(new Token(keywords[keyword_text as keyof typeof keywords], line, position));
                        }
                        else if(number != undefined) {
                            tokens.push(new Token(TokenType.NUMBER, line, position, number));
                        }
                        else {
                            errors.push(new TokenError(`Unknown keyword ${text}`, line, position));
                        }
                    }
                    else {
                        errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position));
                    }
                    break;
            }
        }
    }
    return { tokens: tokens, errors: errors };
 }
--- a/src/tokens.ts
+++ b/src/tokens.ts
@ -0,0 +1,64 @@
 /**
  * Kinds of token produced by the tokenizer.
  *
  * Members are numbered implicitly from 0 in declaration order, so inserting or
  * reordering members changes their numeric values.
  */
 export enum TokenType {
    // structural and literal tokens
    END_OF_STATEMENT,
    INDENT,
    BETWEEN,
    QUOTE,
    NUMBER,
    // keyword tokens
    KEYWORD_BETWEEN,
    KEYWORD_OPTIONAL,
    KEYWORD_MATCH,
    KEYWORD_THEN,
    KEYWORD_AND,
    KEYWORD_OR,
    KEYWORD_ANY,
    KEYWORD_OF,
    // NOTE(review): "KEYWODE" looks like a misspelling of "KEYWORD"; renaming it also
    // requires updating every reference to this member.
    KEYWODE_WORD_SPECIFIER,
    KEYWORD_DIGIT_SPECIFIER,
    KEYWORD_CHAR_SPECIFIER,
    KEYWORD_WHITESPACE_SPECIFIER,
    KEYWORD_NUMBER_SPECIFIER,
    KEYWORD_MULTIPLE,
    KEYWORD_AS,
    KEYWORD_IF,
    KEYWORD_STARTS,
    KEYWORD_WITH,
    KEYWORD_ENDS,
    KEYWORD_ELSE,
    KEYWORD_UNLESS,
    KEYWORD_WHILE,
    KEYWORD_MORE,
    KEYWORD_USING,
    KEYWORD_GLOBAL,
    KEYWORD_MULTILINE,
    KEYWORD_EXACT,
    KEYWORD_MATCHING,
    KEYWORD_NOT,
    KEYWORD_TAB,
    KEYWORD_LINEFEED,
    KEYWORD_CARRIAGE,
    KEYWORD_RETURN,
    KEYWORD_GROUP,
    KEYWORD_BY,
    KEYWORD_ARTICLE,
    KEYWORD_EXACTLY,
    KEYWORD_INCLUSIVE,
    KEYWORD_EXCLUSIVE,
    KEYWORD_FROM,
    KEYWORD_TO
 }
 /**
  * An error tied to a location in the tokenized source.
  */
 export class TokenError extends Error {
    /**
     * @param message  Human-readable description of the problem.
     * @param line     Line on which the problem was found.
     * @param position Column at which the problem was found.
     */
    constructor(message: string, public line: number, public position: number) {
        super(message);
        // Report as "TokenError" rather than the inherited "Error" in logs and stack traces.
        this.name = "TokenError";
        // Restore the prototype chain: when compiled to ES5, calling `super()` on Error
        // returns a plain Error, which would break `instanceof TokenError` and `to_string`.
        Object.setPrototypeOf(this, new.target.prototype);
    }
    /** Formats the error as `line:position message`. */
    public to_string() {
        return `${this.line}:${this.position} ${this.message}`;
    }
 }
 /**
  * A single lexical token together with the place it was found.
  */
 export class Token {
    /** Kind of token. */
    public type: TokenType;
    /** Line the token was found on. */
    public line: number;
    /** Column recorded for the token. */
    public position: number;
    /** Text carried by the token, when it has any; `undefined` otherwise. */
    public token_string?: string;

    constructor(type: TokenType, line: number, position: number, token_string?: string) {
        this.type = type;
        this.line = line;
        this.position = position;
        this.token_string = token_string;
    }
 }