diff --git a/src/parser.ts b/src/parser.ts new file mode 100644 index 0000000..9f14556 --- /dev/null +++ b/src/parser.ts @@ -0,0 +1,11 @@ +/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ + +import { Token, TokenType } from "./tokens"; + +export class ParserOptions { + +} + +export function parse(tokens: Token[]) { + return undefined; +} \ No newline at end of file diff --git a/src/script.js b/src/script.js deleted file mode 100644 index 5e027ff..0000000 --- a/src/script.js +++ /dev/null @@ -1,152 +0,0 @@ -/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ -"use strict"; -const keywords = [ - "optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character", - "multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything", - "whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more", - "using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return", - "group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to" -]; -var TokenType; -(function (TokenType) { - TokenType[TokenType["END_OF_STATEMENT"] = 0] = "END_OF_STATEMENT"; - TokenType[TokenType["INDENT"] = 1] = "INDENT"; - TokenType[TokenType["BETWEEN"] = 2] = "BETWEEN"; - TokenType[TokenType["QUOTE"] = 3] = "QUOTE"; - TokenType[TokenType["KEYWORD_BETWEEN"] = 4] = "KEYWORD_BETWEEN"; - TokenType[TokenType["KEYWORD_OPTIONAL"] = 5] = "KEYWORD_OPTIONAL"; - TokenType[TokenType["KEYWORD_MATCH"] = 6] = "KEYWORD_MATCH"; - TokenType[TokenType["KEYWORD_THEN"] = 7] = "KEYWORD_THEN"; - TokenType[TokenType["KEYWORD_AND"] = 8] = "KEYWORD_AND"; - TokenType[TokenType["KEYWORD_OR"] = 9] = "KEYWORD_OR"; - TokenType[TokenType["KEYWORD_ANY"] = 10] = "KEYWORD_ANY"; - TokenType[TokenType["KEYWORD_OF"] = 11] = "KEYWORD_OF"; -})(TokenType || (TokenType = {})); -class Token { - constructor(type, token_string) { - this.type = type; - this.token_string = token_string; - } -} -class TokenizerOptions { - constructor() { - this.convert_spaces_to_tabs = false; - } -} -/* Basic Tokenizer: To be replaced with a unicode variant later */ -function tokenize(input, options) { - let tokens = []; - let errors = []; - for (let i = 0; i < input.length; i++) { - // 4 spaces = 1 tab. That is final. Debate over - if (options.convert_spaces_to_tabs && input.startsWith(" ", i)) { - tokens.push(new Token(TokenType.INDENT)); - i += 3; - } - // between (ex: 0...3 or 0-3) - else if (input.startsWith("...", i)) { - tokens.push(new Token(TokenType.BETWEEN)); - i += 2; - } - else if (input.startsWith("..", i)) { - tokens.push(new Token(TokenType.BETWEEN)); - i += 1; - } - // comments - else if (input.startsWith("//", i)) { - i += 1; - while (i < input.length) { - if (input[i] == '\n') { - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - break; - } - i++; - } - } - else if (input.startsWith("\r\n", i)) { - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - i += 1; - } - else { - switch (input[i]) { - // comment - case '#': - i++; - while (i < input.length) { - if (input[i] == '\n') { - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - break; - } - i++; - } - break; - // quote - case '"': - case '\"': - // build up a word between quotes - const quote_char = input[i]; - let found_ending = false; - let quote = ""; - do { - i++; - if (input[i] == quote_char) { - found_ending = true; - break; - } - else if (input[i] == '\n') { - } - } while (i < input.length); - if (found_ending) { - tokens.push(new Token(TokenType.QUOTE, quote)); - } - else { - // Skip until newline and throw an error - } - break; - // between (ex: 0...3 or 0-3) - case '-': - tokens.push(new Token(TokenType.BETWEEN)); - break; - case '\n': - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - break; - case '\r': - // ignore - break; - case '\t': - tokens.push(new Token(TokenType.INDENT)); - break; - case ' ': - break; - default: - // is digit? build up a number - // is char? build up a word - keywords.includes("word"); - // build up a word - break; - } - } - } - return { tokens: tokens, errors: errors }; -} -/* -String.prototype.escape = function() { - var tagsToReplace = { - '&': '&', - '<': '<', - '>': '>' - }; - return this.replace(/[&<>]/g, function(tag) { - return tagsToReplace[tag] || tag; - }); -}; -String.prototype.norm = function() { - if(String.prototype.normalize != undefined) { - return this.normalize("NFD").replace(/[\u0300-\u036F]/g,""); - } - return this; -}; - -*/ -$(function () { -}); diff --git a/src/script.ts b/src/script.ts index 831c670..9176787 100644 --- a/src/script.ts +++ b/src/script.ts @@ -1,167 +1,9 @@ -/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ - "use strict"; -const keywords = [ - "optional", "optionally", "match", "then", "any", "of", "or", "word", "digit", "unicode", "character", - "multiple", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "anything", - "whitespace", "as", "number", "if", "starts", "with", "ends", "otherwise", "else", "unless", "while", "more", - "using", "global", "and", "multiline", "exact", "matching", "not", "between", "tab", "linefeed", "carriage", "return", - "group", "by", "exactly", "inclusive", "inclusively", "exclusive", "exclusively", "including", "from", "to" -]; - -enum TokenType { - END_OF_STATEMENT, - INDENT, - BETWEEN, - QUOTE, - KEYWORD_BETWEEN, - KEYWORD_OPTIONAL, - KEYWORD_MATCH, - KEYWORD_THEN, - KEYWORD_AND, - KEYWORD_OR, - KEYWORD_ANY, - KEYWORD_OF, -} - -class Token { - constructor(public type: TokenType, public token_string?: string) { - - } -} - -class TokenizerOptions { - public convert_spaces_to_tabs: boolean = false; - -} - -/* Basic Tokenizer: To be replaced with a unicode variant later */ - -function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: Error[] } { - let tokens : Token[] = []; - let errors : Error[] = []; - - for(let i = 0; i < input.length; i++) { - - // 4 spaces = 1 tab. That is final. Debate over - if(options.convert_spaces_to_tabs && input.startsWith(" ", i)) { - tokens.push(new Token(TokenType.INDENT)); - i += 3; - } - // between (ex: 0...3 or 0-3) - else if(input.startsWith("...", i)) { - tokens.push(new Token(TokenType.BETWEEN)); - i += 2; - } else if(input.startsWith("..", i)) { - tokens.push(new Token(TokenType.BETWEEN)); - i += 1; - } - // comments - else if(input.startsWith("//", i)) { - i += 1; - while(i < input.length) { - if(input[i] == '\n') { - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - break; - } - i++; - } - } else if (input.startsWith("\r\n", i)) { - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - i += 1; - } else { - switch(input[i]) { - // comment - case '#': - i++; - while(i < input.length) { - if(input[i] == '\n') { - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - break; - } - i++; - } - break; - // quote - case '"': - case '\"': - // build up a word between quotes - const quote_char = input[i]; - let found_ending = false; - - let quote = ""; - - do { - i++; - if(input[i] == quote_char) { - found_ending = true; - break; - } - else if(input[i] == '\n') { - - } - } while(i < input.length); - - if(found_ending) { - tokens.push(new Token(TokenType.QUOTE, quote)); - } - else { - // Skip until newline and throw an error - } - - break; - - // between (ex: 0...3 or 0-3) - case '-': - tokens.push(new Token(TokenType.BETWEEN)); - break; - case '\n': - tokens.push(new Token(TokenType.END_OF_STATEMENT)); - break; - case '\r': - // ignore - break; - case '\t': - tokens.push(new Token(TokenType.INDENT)); - break; - case ' ': - break; - default: - // is digit? build up a number - - // is char? build up a word - - keywords.includes("word"); - // build up a word - break; - } - } - } - - return { tokens: tokens, errors: errors }; -} - -/* -String.prototype.escape = function() { - var tagsToReplace = { - '&': '&', - '<': '<', - '>': '>' - }; - return this.replace(/[&<>]/g, function(tag) { - return tagsToReplace[tag] || tag; - }); -}; -String.prototype.norm = function() { - if(String.prototype.normalize != undefined) { - return this.normalize("NFD").replace(/[\u0300-\u036F]/g,""); - } - return this; -}; - -*/ - -$( function() { +import { Token, TokenType } from "./tokens"; +import { TokenizerOptions, tokenize } from "./tokenizer"; +import { ParserOptions, parse } from "./parser"; +$(function() { + }); \ No newline at end of file diff --git a/src/style.css b/src/style.css index 577c6b4..345a416 100644 --- a/src/style.css +++ b/src/style.css @@ -263,8 +263,8 @@ footer { } /* accessibility */ -a { - color: #00497A; +a:hover { + color: #208bff; } .navbar-light .navbar-nav .nav-link { diff --git a/src/tokenizer.ts b/src/tokenizer.ts new file mode 100644 index 0000000..e2f8aef --- /dev/null +++ b/src/tokenizer.ts @@ -0,0 +1,356 @@ +/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */ + +// TODO: replace every version of switch() with switch(.charCodeAt(0)) + +import { Token, TokenType, TokenError } from "./tokens"; + +const keywords = { + "optional": TokenType.KEYWORD_OPTIONAL, + "optionally": TokenType.KEYWORD_OPTIONAL, + "match": TokenType.KEYWORD_MATCH, + "then": TokenType.KEYWORD_THEN, + "any": TokenType.KEYWORD_ANY, + "anything": TokenType.KEYWORD_ANY, + "of": TokenType.KEYWORD_OF, + "or": TokenType.KEYWORD_OR, + "and": TokenType.KEYWORD_AND, + "word": TokenType.KEYWODE_WORD_SPECIFIER, + "digit": TokenType.KEYWORD_DIGIT_SPECIFIER, + "character": TokenType.KEYWORD_CHAR_SPECIFIER, + "whitespace": TokenType.KEYWORD_WHITESPACE_SPECIFIER, + "number": TokenType.KEYWORD_NUMBER_SPECIFIER, + "multiple": TokenType.KEYWORD_MULTIPLE, + "as": TokenType.KEYWORD_AS, + "if": TokenType.KEYWORD_IF, + "starts": TokenType.KEYWORD_STARTS, + "with": TokenType.KEYWORD_WITH, + "ends": TokenType.KEYWORD_ENDS, + "otherwise": TokenType.KEYWORD_ELSE, + "else": TokenType.KEYWORD_ELSE, + "unless": TokenType.KEYWORD_UNLESS, + "while": TokenType.KEYWORD_WHILE, + "more": TokenType.KEYWORD_MORE, + "using": TokenType.KEYWORD_USING, + "global": TokenType.KEYWORD_GLOBAL, + "multiline": TokenType.KEYWORD_MULTILINE, + "exact": TokenType.KEYWORD_EXACT, + "matching": TokenType.KEYWORD_MATCHING, + "not": TokenType.KEYWORD_NOT, + "between": TokenType.KEYWORD_BETWEEN, + "tab": TokenType.KEYWORD_TAB, + "linefeed": TokenType.KEYWORD_LINEFEED, + "carriage": TokenType.KEYWORD_CARRIAGE, + "return": TokenType.KEYWORD_RETURN, + "group": TokenType.KEYWORD_GROUP, + "by": TokenType.KEYWORD_BY, + "an": TokenType.KEYWORD_ARTICLE, + "a": TokenType.KEYWORD_ARTICLE, + "the": TokenType.KEYWORD_ARTICLE, + "exactly": TokenType.KEYWORD_EXACTLY, + "inclusive": TokenType.KEYWORD_INCLUSIVE, + "inclusively": TokenType.KEYWORD_INCLUSIVE, + "exclusive": TokenType.KEYWORD_EXCLUSIVE, + "exclusively": TokenType.KEYWORD_EXCLUSIVE, + "from": TokenType.KEYWORD_FROM, + "to": TokenType.KEYWORD_TO +}; + +const escape_sequences = { + 'a': '\a', + 'b': '\b', + 'e': '\e', + 'f': '\f', + 'n': '\n', + 'r': '\r', + 't': '\t', + '"': '"', + '\'': '\'', + '\\': '\\', +}; + +export class TokenizerOptions { + public convert_spaces_to_tabs: boolean = false; +} + +const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g); + +function escape_sequence_gather_hex(input: string, i : number, max: number) : string { + let hex = ""; + for(i++; i < input.length && max-- > 0; i++) { + if(escape_sequence_hex_regex.test(input[i])) hex += input[i]; + } + return hex; +} + +function escape_sequence_mapper(input: string, i : number) : { code: string, read: number, error?: Error } { + if(escape_sequences[input[i]] != undefined) { + return { code: escape_sequences[input[i]], read: 1 }; + } + //variable hex code + else if(input[i] == 'x') { + const hex = escape_sequence_gather_hex(input, ++i, 4); + + return { code: String.fromCharCode(parseInt(hex, 16)), read: hex.length + 1 }; + } + //4 hex unicode + else if(input[i] == 'u') { + const unicode = escape_sequence_gather_hex(input, ++i, 4); + if(unicode.length != 4) { + return { code: "", read: unicode.length + 1, error: new Error("Bad escape sequence")}; + } + else { + return { code: String.fromCharCode(parseInt(unicode, 16)), read: 5 }; + } + } + else if(input[i] == 'U') { + const unicode = escape_sequence_gather_hex(input, ++i, 8); + + if(unicode.length != 8) { + return { code: "", read: unicode.length + 1, error: new Error("Bad escape sequence")}; + } + else { + return { code: String.fromCharCode(parseInt(unicode, 16)), read: 9 }; + } + } + else { + // should throw an exception, but gonna just ignore it + return { code: input[i], read: 1 }; + } +} + +function is_digit(input: string) : boolean { + //return /[0-9]/g.test(input); + const value = input.charCodeAt(0); + return value >= 48 && value <= 57; +} + +function is_char(input: string) : boolean { + //return input.toUpperCase() != input.toLowerCase(); + //return /[a-zA-Z]/g.test(input); + + const value = input.charCodeAt(0); + return ((value >= 65 && value <= 90) || (value >= 97 && value <= 122)); +} + +/* Basic Tokenizer */ +export function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: TokenError[] } { + let line = 1; + let position = 1; + + let tokens : Token[] = []; + let errors : TokenError[] = []; + + for(let i = 0; i < input.length; i++, position++) { + // 4 spaces = 1 tab. That is final. Debate over + if(options.convert_spaces_to_tabs && input.startsWith(" ", i)) { + tokens.push(new Token(TokenType.INDENT, line, position)); + i += 3; + position += 3; + } + // between (ex: 0...3 or 0-3) + else if(input.startsWith("...", i)) { + tokens.push(new Token(TokenType.BETWEEN, line, position)); + i += 2; + position += 2; + } + else if(input.startsWith("..", i)) { + tokens.push(new Token(TokenType.BETWEEN, line, position)); + i++; + position++; + } + // comments + else if(input.startsWith("//", i)) { + for(i++, position++; i < input.length; i++, position++) { + if(input[i] == '\n') { + tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position)); + break; + } + } + line++; + position = 0; + } + else if(input.startsWith("/*", i)) { + for(i++, position++; i < input.length-1; i++, position++) { + if(input[i] == '*' && input[i+1] == '/') { + tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position)); + i++; + position++; + break; + } + if(input[i] == '\n') { + line++; + position = 0; + } + } + if(i == input.length-1) { + errors.push(new TokenError("Unexpected EOF", line, position)); + } + else { + line++; + position = 0; + } + } + else if (input.startsWith("\r\n", i)) { + tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position)); + i++; + line++; + position = 0; + } + else { + switch(input[i]) { + // comment + case '#': + for(i++, position++; i < input.length; i++, position++) { + if(input[i] == '\n') { + tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position)); + line++; + position = 0; + break; + } + } + break; + // quote + case '"': + case '\"': + // build up a word between quotes + const quote_begin = { line: line, position: position }; + const quote_char = input[i]; + let found_ending = false; + + let quote = ""; + + do { + i++; + position++; + if(input[i] == '\\') { + i++; + position++; + const sequence = escape_sequence_mapper(input, i); + + if(sequence.error != undefined) { + errors.push(new TokenError(sequence.error.message, line, position)); + } + + position += sequence.read; + i += sequence.read; + quote += sequence.code; + + } + else if(input[i] == quote_char) { + found_ending = true; + break; + } + else if(input[i] == '\n') { + line++; + position = 0; + break; + } + else { + quote += input[i]; + } + } while(i < input.length); + + if(found_ending) { + tokens.push(new Token(TokenType.QUOTE, line, position, quote)); + } + else { + //we reached the end of the line or the end of the file + errors.push(new TokenError(`Unexpected end of quote. Quote began at ${quote_begin.line}:${quote_begin.position}`, line, position)); + line++; + position = 0; + } + break; + + // between (ex: 0...3 or 0-3) + case '-': + tokens.push(new Token(TokenType.BETWEEN, line, position)); + break; + case '\n': + tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position)); + break; + case '\r': + // ignore + break; + case '\t': + tokens.push(new Token(TokenType.INDENT, line, position)); + break; + case ' ': + break; + default: + // is digit? build up a number + if(is_digit(input[i])) { + let digits = input[i]; + + do { + i++; position++; + digits += input[i]; + } while(i+1 < input.length && is_digit(input[i+1])); + + tokens.push(new Token(TokenType.NUMBER, line, position, digits)); + } + // is char? build up a word + else if(is_char(input[i])) { + let text = input[i]; + + do { + i++; position++; + text += input[i]; + } while(i+1 < input.length && is_char(input[i+1])); + + const keyword_text = text.toLowerCase(); + + if(keywords[keyword_text] != undefined) { + tokens.push(new Token(keywords[keyword_text], line, position)); + } + else { + switch(keyword_text) { + case "none": + case "zero": + tokens.push(new Token(TokenType.NUMBER, line, position, "0")); + break; + case "one": + tokens.push(new Token(TokenType.NUMBER, line, position, "1")); + break; + case "two": + tokens.push(new Token(TokenType.NUMBER, line, position, "2")); + break; + case "three": + tokens.push(new Token(TokenType.NUMBER, line, position, "3")); + break; + case "four": + tokens.push(new Token(TokenType.NUMBER, line, position, "4")); + break; + case "five": + tokens.push(new Token(TokenType.NUMBER, line, position, "5")); + break; + case "six": + tokens.push(new Token(TokenType.NUMBER, line, position, "6")); + break; + case "seven": + tokens.push(new Token(TokenType.NUMBER, line, position, "7")); + break; + case "eight": + tokens.push(new Token(TokenType.NUMBER, line, position, "8")); + break; + case "nine": + tokens.push(new Token(TokenType.NUMBER, line, position, "9")); + break; + case "ten": + tokens.push(new Token(TokenType.NUMBER, line, position, "10")); + break; + default: + errors.push(new TokenError(`Unknown keyword ${text}`, line, position)); + break; + } + } + } + else { + errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position)); + } + break; + } + } + } + + return { tokens: tokens, errors: errors }; +} \ No newline at end of file diff --git a/src/tokens.ts b/src/tokens.ts new file mode 100644 index 0000000..b68c111 --- /dev/null +++ b/src/tokens.ts @@ -0,0 +1,64 @@ +export enum TokenType { + END_OF_STATEMENT, + INDENT, + BETWEEN, + QUOTE, + NUMBER, + KEYWORD_BETWEEN, + KEYWORD_OPTIONAL, + KEYWORD_MATCH, + KEYWORD_THEN, + KEYWORD_AND, + KEYWORD_OR, + KEYWORD_ANY, + KEYWORD_OF, + KEYWODE_WORD_SPECIFIER, + KEYWORD_DIGIT_SPECIFIER, + KEYWORD_CHAR_SPECIFIER, + KEYWORD_WHITESPACE_SPECIFIER, + KEYWORD_NUMBER_SPECIFIER, + KEYWORD_MULTIPLE, + KEYWORD_AS, + KEYWORD_IF, + KEYWORD_STARTS, + KEYWORD_WITH, + KEYWORD_ENDS, + KEYWORD_ELSE, + KEYWORD_UNLESS, + KEYWORD_WHILE, + KEYWORD_MORE, + KEYWORD_USING, + KEYWORD_GLOBAL, + KEYWORD_MULTILINE, + KEYWORD_EXACT, + KEYWORD_MATCHING, + KEYWORD_NOT, + KEYWORD_TAB, + KEYWORD_LINEFEED, + KEYWORD_CARRIAGE, + KEYWORD_RETURN, + KEYWORD_GROUP, + KEYWORD_BY, + KEYWORD_ARTICLE, + KEYWORD_EXACTLY, + KEYWORD_INCLUSIVE, + KEYWORD_EXCLUSIVE, + KEYWORD_FROM, + KEYWORD_TO +} + +export class TokenError extends Error { + constructor(message: string, public line: number, public position: number) { + super(message); + } + + public to_string() { + return `${this.line}:${this.position} ${this.message}`; + } +} + +export class Token { + constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) { + + } +} \ No newline at end of file