diff --git a/docs/bundle.min.js b/docs/bundle.min.js
index bb9319c..1bfe44a 100644
--- a/docs/bundle.min.js
+++ b/docs/bundle.min.js
@@ -99,8 +99,50 @@ $(function() {
 });
 */
 const opts = new tokenizer_1.TokenizerOptions();
-const res = tokenizer_1.tokenize("match 1+ thing from thingy", opts);
-console.log(res);
+const result = tokenizer_1.tokenize(`
+// H2R supports //, #, and /**/ as comments
+// A group is only captured if given a name.
+// You can use "and", "or", "not" to specify "[]" regex
+// You can use "then" to combine match statements; however, I find using multiple "match" statements easier to read
+
+// exact matching means use a ^ and $ to signify the start and end of the string
+
+using global and exact matching
+create an optional group called "protocol"
+    match "http"
+    optionally match "s"
+    match "://"
+create a group called "subdomain"
+    repeat
+        match 1+ words
+        match "."
+create a group called "domain"
+    match 1+ words or "_" or "-"
+    match "."
+    match a word
+# port, but we don't care about it, so ignore it
+optionally match ":" then 0+ digits
+create an optional group called "path"
+    repeat
+        match "/"
+        match 0+ words or "_" or "-"
+create an optional group
+    # we don't want to capture the '?', so don't name the group until afterwards
+    match "?"
+    create a group called "query"
+        repeat
+            match 1+ words or "_" or "-"
+            match "="
+            match 1+ words or "_" or "-"
+create an optional group
+    # fragment, again, we don't care, so ignore everything afterwards
+    match "#"
+    match 0+ any thing
+`, opts);
+for (const r of result.tokens) {
+    console.log(r.to_string());
+}
+console.log(result.errors);
 
 /***/ }),
@@ -124,12 +166,15 @@ exports.tokenize = exports.TokenizerOptions = void 0;
 // TODO: replace every version of switch() with switch(.charCodeAt(0))
 const tokens_1 = __webpack_require__(3);
 const keywords = {
+    /* Full Keywords */
     "optional": tokens_1.TokenType.KEYWORD_OPTIONAL,
     "optionally": tokens_1.TokenType.KEYWORD_OPTIONAL,
     "match": tokens_1.TokenType.KEYWORD_MATCH,
+    "matches": tokens_1.TokenType.KEYWORD_MATCH,
     "then": tokens_1.TokenType.KEYWORD_THEN,
     "any": tokens_1.TokenType.KEYWORD_ANY,
     "anything": tokens_1.TokenType.KEYWORD_ANY,
+    "anythings": tokens_1.TokenType.KEYWORD_ANY,
     "of": tokens_1.TokenType.KEYWORD_OF,
     "or": tokens_1.TokenType.KEYWORD_OR,
     "and": tokens_1.TokenType.KEYWORD_AND,
@@ -138,9 +183,15 @@ const keywords = {
     "character": tokens_1.TokenType.KEYWORD_CHAR_SPECIFIER,
     "whitespace": tokens_1.TokenType.KEYWORD_WHITESPACE_SPECIFIER,
     "number": tokens_1.TokenType.KEYWORD_NUMBER_SPECIFIER,
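+    // plural aliases of the specifiers above, so phrases like "match 1+ words" read naturally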
+    "words": tokens_1.TokenType.KEYWODE_WORD_SPECIFIER,
+    "digits": tokens_1.TokenType.KEYWORD_DIGIT_SPECIFIER,
+    "characters": tokens_1.TokenType.KEYWORD_CHAR_SPECIFIER,
+    "whitespaces": tokens_1.TokenType.KEYWORD_WHITESPACE_SPECIFIER,
+    "numbers": tokens_1.TokenType.KEYWORD_NUMBER_SPECIFIER,
     "multiple": tokens_1.TokenType.KEYWORD_MULTIPLE,
     "as": tokens_1.TokenType.KEYWORD_AS,
     "if": tokens_1.TokenType.KEYWORD_IF,
+    "start": tokens_1.TokenType.KEYWORD_STARTS,
     "starts": tokens_1.TokenType.KEYWORD_STARTS,
     "with": tokens_1.TokenType.KEYWORD_WITH,
     "ends": tokens_1.TokenType.KEYWORD_ENDS,
@@ -158,8 +209,6 @@ const keywords = {
     "between": tokens_1.TokenType.KEYWORD_BETWEEN,
     "tab": tokens_1.TokenType.KEYWORD_TAB,
     "linefeed": tokens_1.TokenType.KEYWORD_LINEFEED,
-    "carriage": tokens_1.TokenType.KEYWORD_CARRIAGE,
-    "return": tokens_1.TokenType.KEYWORD_RETURN,
     "group": tokens_1.TokenType.KEYWORD_GROUP,
     "by": tokens_1.TokenType.KEYWORD_BY,
     "an": tokens_1.TokenType.KEYWORD_ARTICLE,
@@ -171,7 +220,58 @@ const keywords = {
     "exclusive": tokens_1.TokenType.KEYWORD_EXCLUSIVE,
     "exclusively": tokens_1.TokenType.KEYWORD_EXCLUSIVE,
     "from": tokens_1.TokenType.KEYWORD_FROM,
-    "to": tokens_1.TokenType.KEYWORD_TO
+    "to": tokens_1.TokenType.KEYWORD_TO,
+    "create": tokens_1.TokenType.KEYWORD_CREATE,
+    "creates": tokens_1.TokenType.KEYWORD_CREATE,
+    "called": tokens_1.TokenType.KEYWORD_CALLED,
+    "repeat": tokens_1.TokenType.KEYWORD_REPEAT,
+    "repeats": tokens_1.TokenType.KEYWORD_REPEAT,
+    "newline": tokens_1.TokenType.KEYWORD_NEWLINE,
+    "none": tokens_1.TokenType.KEYWORD_NONE,
+    "neither": tokens_1.TokenType.KEYWORD_NEITHER,
+    /* Partial keywords */
+    "thing": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "things": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "white": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "space": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "spaces": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "other": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "wise": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "multi": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "new": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "line": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "feed": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "carriage": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "return": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "case": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "insensitive": tokens_1.TokenType.PARTIAL_KEYWORD,
+    "sensitive": tokens_1.TokenType.PARTIAL_KEYWORD
+};
+const numbers = {
+    "zero": "0",
+    "one": "1",
+    "two": "2",
+    "three": "3",
+    "four": "4",
+    "five": "5",
+    "six": "6",
+    "seven": "7",
+    "eight": "8",
+    "nine": "9",
+    "ten": "10"
+};
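+// multi-word keywords: a PARTIAL_KEYWORD merges with the word right before it,
+// e.g. "carriage" followed by "return" collapses into one KEYWORD_CARRIAGE_RETURN token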
test_char_Z = "Z".charCodeAt(0); +function is_digit(input, i) { + const value = input.charCodeAt(i); + return value >= test_char_0 && value <= test_char_9; } -function is_char(input) { - //return input.toUpperCase() != input.toLowerCase(); - //return /[a-zA-Z]/g.test(input); - const value = input.charCodeAt(0); - return ((value >= 65 && value <= 90) || (value >= 97 && value <= 122)); +function is_char(input, i) { + const value = input.charCodeAt(i); + return ((value >= test_char_a && value <= test_char_z) || + (value >= test_char_A && value <= test_char_Z)); } +function transform_tokens(tokens, errors) { + for (let i = 0; i < tokens.length; i++) { + //check past tokens: if it matches the preceeding tokens, we transform it. + if (tokens[i].type === tokens_1.TokenType.PARTIAL_KEYWORD && token_transformations[tokens[i].token_string]) { + const transform = token_transformations[tokens[i].token_string]; + for (let j = 0; j < transform.length; j++) { + if (i - 1 >= 0 && transform[j].preceeding_token === tokens[i - 1].token_string) { + // use the i-1 token because it has the start line and position + tokens[i - 1].type = transform[j].transforms_to; + tokens[i - 1].token_string += " " + tokens[i].token_string; + tokens.splice(i, 1); // remove this token + i--; // move token counter back because we removed the token + break; + } + } + } + /* else ignore */ + } + // do we still have partial tokens? those are errors then + for (let i = 0; i < tokens.length; i++) { + if (tokens[i].type === tokens_1.TokenType.PARTIAL_KEYWORD) { + errors.push(new tokens_1.TokenError(`Unknown keyword "${tokens[i].token_string}"`, tokens[i].line, tokens[i].position)); + } + } +} +class TokenizerOptions { + constructor() { + this.convert_spaces_to_tabs = true; + } +} +exports.TokenizerOptions = TokenizerOptions; /* Basic Tokenizer */ function tokenize(input, options) { let line = 1; let position = 1; const tokens = []; const errors = []; + // gather tokens for (let i = 0; i < input.length; i++, position++) { // 4 spaces = 1 tab. That is final. 
+function transform_tokens(tokens, errors) {
+    for (let i = 0; i < tokens.length; i++) {
+        // check past tokens: if it matches the preceding token, we transform it.
+        if (tokens[i].type === tokens_1.TokenType.PARTIAL_KEYWORD && token_transformations[tokens[i].token_string]) {
+            const transform = token_transformations[tokens[i].token_string];
+            for (let j = 0; j < transform.length; j++) {
+                if (i - 1 >= 0 && transform[j].preceeding_token === tokens[i - 1].token_string) {
+                    // use the i-1 token because it has the start line and position
+                    tokens[i - 1].type = transform[j].transforms_to;
+                    tokens[i - 1].token_string += " " + tokens[i].token_string;
+                    tokens.splice(i, 1); // remove this token
+                    i--; // move token counter back because we removed the token
+                    break;
+                }
+            }
+        }
+        /* else ignore */
+    }
+    // do we still have partial tokens? those are errors then
+    for (let i = 0; i < tokens.length; i++) {
+        if (tokens[i].type === tokens_1.TokenType.PARTIAL_KEYWORD) {
+            errors.push(new tokens_1.TokenError(`Unknown keyword "${tokens[i].token_string}"`, tokens[i].line, tokens[i].position));
+        }
+    }
+}
+class TokenizerOptions {
+    constructor() {
+        this.convert_spaces_to_tabs = true;
+    }
+}
+exports.TokenizerOptions = TokenizerOptions;
 /* Basic Tokenizer */
 function tokenize(input, options) {
     let line = 1;
     let position = 1;
     const tokens = [];
     const errors = [];
+    // gather tokens
     for (let i = 0; i < input.length; i++, position++) {
         // 4 spaces = 1 tab. That is final. Debate over
         if (options.convert_spaces_to_tabs && input.startsWith("    ", i)) {
-            tokens.push(new tokens_1.Token(tokens_1.TokenType.INDENT, line, position));
+            tokens.push(new tokens_1.Token(tokens_1.TokenType.INDENT, line, position, 4));
             i += 3;
             position += 3;
         }
         // between (ex: 0...3 or 0-3)
         else if (input.startsWith("...", i)) {
-            tokens.push(new tokens_1.Token(tokens_1.TokenType.BETWEEN, line, position));
+            tokens.push(new tokens_1.Token(tokens_1.TokenType.BETWEEN, line, position, 3));
            i += 2;
             position += 2;
         }
         else if (input.startsWith("..", i)) {
-            tokens.push(new tokens_1.Token(tokens_1.TokenType.BETWEEN, line, position));
+            tokens.push(new tokens_1.Token(tokens_1.TokenType.BETWEEN, line, position, 3));
             i++;
             position++;
         }
@@ -273,7 +403,7 @@
         else if (input.startsWith("//", i)) {
             for (i++, position++; i < input.length; i++, position++) {
                 if (input[i] === "\n") {
-                    tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position));
+                    tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position, -1));
                     break;
                 }
             }
@@ -283,7 +413,6 @@
         else if (input.startsWith("/*", i)) {
            for (i++, position++; i < input.length - 1; i++, position++) {
                 if (input[i] === "*" && input[i + 1] === "/") {
-                    tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position));
                     i++;
                     position++;
                     break;
@@ -302,7 +431,7 @@ function tokenize(input, options) {
             }
         }
         else if (input.startsWith("\r\n", i)) {
-            tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position));
+            tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position, -1));
             i++;
             line++;
             position = 0;
@@ -313,7 +442,7 @@
                 case "#":
                     for (i++, position++; i < input.length; i++, position++) {
                         if (input[i] === "\n") {
-                            tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position));
+                            tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position, -1));
                             line++;
                             position = 0;
                             break;
@@ -357,7 +486,7 @@
                         }
                     } while (i < input.length);
                     if (found_ending) {
-                        tokens.push(new tokens_1.Token(tokens_1.TokenType.QUOTE, line, position, quote));
+                        tokens.push(new tokens_1.Token(tokens_1.TokenType.QUOTE, line, position, quote.length + 2, quote));
                    }
                     else {
                         //we reached the end of the line or the end of the file
@@ -369,91 +498,65 @@
                    }
                 // between (ex: 0...3 or 0-3)
                 case "-":
-                    tokens.push(new tokens_1.Token(tokens_1.TokenType.BETWEEN, line, position));
+                    tokens.push(new tokens_1.Token(tokens_1.TokenType.BETWEEN, line, position, 1));
+                    break;
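+                // "+" means "or more": emit OR then MORE, so "1+" reads as "1 or more"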
+                case "+":
+                    tokens.push(new tokens_1.Token(tokens_1.TokenType.KEYWORD_OR, line, position, 1));
+                    tokens.push(new tokens_1.Token(tokens_1.TokenType.KEYWORD_MORE, line, position, 0));
                     break;
                 case "\n":
-                    tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position));
+                    tokens.push(new tokens_1.Token(tokens_1.TokenType.END_OF_STATEMENT, line, position, -1));
+                    line++;
+                    position = 0;
                     break;
                 case "\r":
                     // ignore
                     break;
                 case "\t":
-                    tokens.push(new tokens_1.Token(tokens_1.TokenType.INDENT, line, position));
+                    tokens.push(new tokens_1.Token(tokens_1.TokenType.INDENT, line, position, 1));
                     break;
                 case " ":
+                    // ignore
                     break;
                 default:
                     // is digit? build up a number
-                    if (is_digit(input[i])) {
+                    if (is_digit(input, i)) {
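+                        // remember where the literal starts; tokens now record position and length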
7] = "KEYWORD_MATCH"; - TokenType[TokenType["KEYWORD_THEN"] = 8] = "KEYWORD_THEN"; - TokenType[TokenType["KEYWORD_AND"] = 9] = "KEYWORD_AND"; - TokenType[TokenType["KEYWORD_OR"] = 10] = "KEYWORD_OR"; - TokenType[TokenType["KEYWORD_ANY"] = 11] = "KEYWORD_ANY"; - TokenType[TokenType["KEYWORD_OF"] = 12] = "KEYWORD_OF"; - TokenType[TokenType["KEYWODE_WORD_SPECIFIER"] = 13] = "KEYWODE_WORD_SPECIFIER"; - TokenType[TokenType["KEYWORD_DIGIT_SPECIFIER"] = 14] = "KEYWORD_DIGIT_SPECIFIER"; - TokenType[TokenType["KEYWORD_CHAR_SPECIFIER"] = 15] = "KEYWORD_CHAR_SPECIFIER"; - TokenType[TokenType["KEYWORD_WHITESPACE_SPECIFIER"] = 16] = "KEYWORD_WHITESPACE_SPECIFIER"; - TokenType[TokenType["KEYWORD_NUMBER_SPECIFIER"] = 17] = "KEYWORD_NUMBER_SPECIFIER"; - TokenType[TokenType["KEYWORD_MULTIPLE"] = 18] = "KEYWORD_MULTIPLE"; - TokenType[TokenType["KEYWORD_AS"] = 19] = "KEYWORD_AS"; - TokenType[TokenType["KEYWORD_IF"] = 20] = "KEYWORD_IF"; - TokenType[TokenType["KEYWORD_STARTS"] = 21] = "KEYWORD_STARTS"; - TokenType[TokenType["KEYWORD_WITH"] = 22] = "KEYWORD_WITH"; - TokenType[TokenType["KEYWORD_ENDS"] = 23] = "KEYWORD_ENDS"; - TokenType[TokenType["KEYWORD_ELSE"] = 24] = "KEYWORD_ELSE"; - TokenType[TokenType["KEYWORD_UNLESS"] = 25] = "KEYWORD_UNLESS"; - TokenType[TokenType["KEYWORD_WHILE"] = 26] = "KEYWORD_WHILE"; - TokenType[TokenType["KEYWORD_MORE"] = 27] = "KEYWORD_MORE"; - TokenType[TokenType["KEYWORD_USING"] = 28] = "KEYWORD_USING"; - TokenType[TokenType["KEYWORD_GLOBAL"] = 29] = "KEYWORD_GLOBAL"; - TokenType[TokenType["KEYWORD_MULTILINE"] = 30] = "KEYWORD_MULTILINE"; - TokenType[TokenType["KEYWORD_EXACT"] = 31] = "KEYWORD_EXACT"; - TokenType[TokenType["KEYWORD_MATCHING"] = 32] = "KEYWORD_MATCHING"; - TokenType[TokenType["KEYWORD_NOT"] = 33] = "KEYWORD_NOT"; - TokenType[TokenType["KEYWORD_TAB"] = 34] = "KEYWORD_TAB"; - TokenType[TokenType["KEYWORD_LINEFEED"] = 35] = "KEYWORD_LINEFEED"; - TokenType[TokenType["KEYWORD_CARRIAGE"] = 36] = "KEYWORD_CARRIAGE"; - TokenType[TokenType["KEYWORD_RETURN"] = 37] = "KEYWORD_RETURN"; - TokenType[TokenType["KEYWORD_GROUP"] = 38] = "KEYWORD_GROUP"; - TokenType[TokenType["KEYWORD_BY"] = 39] = "KEYWORD_BY"; - TokenType[TokenType["KEYWORD_ARTICLE"] = 40] = "KEYWORD_ARTICLE"; - TokenType[TokenType["KEYWORD_EXACTLY"] = 41] = "KEYWORD_EXACTLY"; - TokenType[TokenType["KEYWORD_INCLUSIVE"] = 42] = "KEYWORD_INCLUSIVE"; - TokenType[TokenType["KEYWORD_EXCLUSIVE"] = 43] = "KEYWORD_EXCLUSIVE"; - TokenType[TokenType["KEYWORD_FROM"] = 44] = "KEYWORD_FROM"; - TokenType[TokenType["KEYWORD_TO"] = 45] = "KEYWORD_TO"; + TokenType[TokenType["PARTIAL_KEYWORD"] = 5] = "PARTIAL_KEYWORD"; + TokenType[TokenType["KEYWORD_BETWEEN"] = 6] = "KEYWORD_BETWEEN"; + TokenType[TokenType["KEYWORD_OPTIONAL"] = 7] = "KEYWORD_OPTIONAL"; + TokenType[TokenType["KEYWORD_MATCH"] = 8] = "KEYWORD_MATCH"; + TokenType[TokenType["KEYWORD_THEN"] = 9] = "KEYWORD_THEN"; + TokenType[TokenType["KEYWORD_AND"] = 10] = "KEYWORD_AND"; + TokenType[TokenType["KEYWORD_OR"] = 11] = "KEYWORD_OR"; + TokenType[TokenType["KEYWORD_ANY"] = 12] = "KEYWORD_ANY"; + TokenType[TokenType["KEYWORD_OF"] = 13] = "KEYWORD_OF"; + TokenType[TokenType["KEYWORD_NONE"] = 14] = "KEYWORD_NONE"; + TokenType[TokenType["KEYWORD_NEITHER"] = 15] = "KEYWORD_NEITHER"; + TokenType[TokenType["KEYWODE_WORD_SPECIFIER"] = 16] = "KEYWODE_WORD_SPECIFIER"; + TokenType[TokenType["KEYWORD_DIGIT_SPECIFIER"] = 17] = "KEYWORD_DIGIT_SPECIFIER"; + TokenType[TokenType["KEYWORD_CHAR_SPECIFIER"] = 18] = "KEYWORD_CHAR_SPECIFIER"; + TokenType[TokenType["KEYWORD_WHITESPACE_SPECIFIER"] 
+    to_string() {
+        let str = `${this.line}:${this.position} ${TokenType[this.type]}`;
+        if (this.token_string) {
+            str += ` "${this.token_string}"`;
+        }
+        str += ` (size: ${this.length})`;
+        return str;
     }
 }
 exports.Token = Token;
diff --git a/src/ast.ts b/src/ast.ts
index e69de29..148e3ce 100644
--- a/src/ast.ts
+++ b/src/ast.ts
@@ -0,0 +1,236 @@
+/*! Copyright (c) 2020 Patrick Demian; Licensed under MIT */
+
+import { Token } from "./tokens";
+
+export class SyntaxError extends Error {
+    constructor(message: string, public tokens: Token[]) {
+        super(message);
+    }
+
+    public to_string(): string {
+        return `Syntax Error: ${this.message}`;
+    }
+}
+
+/* TODO: line number/position? */
+export interface AbstractSyntaxTree {
+    to_string(): string;
+}
+
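+// a single regex flag: "g" (global) and "m" (multiline) pass through; anything else renders as "i"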
+export class Qualifier implements AbstractSyntaxTree {
+    constructor(public type: string) {
+        /* empty */
+    }
+
+    public to_string(): string {
+
+        if(this.type === "g") {
+            return "g";
+        }
+        else if(this.type === "m") {
+            return "m";
+        }
+        else {
+            return "i";
+        }
+    }
+}
+
+export class Regex implements AbstractSyntaxTree {
+    constructor(public inner_trees: AbstractSyntaxTree[], public qualifiers: Qualifier[]) {
+        /* empty */
+    }
+
+    public to_string(): string {
+        let str = "/";
+
+        for(const tree of this.inner_trees) {
+            str += tree.to_string();
+        }
+
+        str += "/";
+
+        for(const tree of this.qualifiers) {
+            str += tree.to_string();
+        }
+
+        return str;
+    }
+}
+
+export class Group implements AbstractSyntaxTree {
+    constructor(public inner_tree: AbstractSyntaxTree, public name?: string) {
+        /* empty */
+    }
+
+    public to_string(): string {
+        return "(" + (this.name ? `?<${this.name}>` : "") + `${this.inner_tree.to_string()})`;
+    }
+}
+
+export class Any implements AbstractSyntaxTree {
+    constructor() {
+        /* empty */
+    }
+
+    public to_string(): string {
+        return ".";
+    }
+}
+
+export class AnyOf implements AbstractSyntaxTree {
+    constructor(public inner_trees: AbstractSyntaxTree[], public negated: boolean) {
+        /* empty */
+    }
+
+    public to_string(): string {
+        let str = "[";
+
+        if(this.negated) {
+            str += "^";
+        }
+
+        for(const tree of this.inner_trees) {
+            str += tree.to_string();
+        }
+
+        str += "]";
+        return str;
+    }
+}
+
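+// "repeat" lowers to a regex quantifier: "+" when the first match is required, "*" otherwise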
"+" : "*"); + } +} + +export class Optional implements AbstractSyntaxTree { + constructor(public inner_tree: AbstractSyntaxTree) { + /* empty */ + } + + public to_string(): string { + return `${this.inner_tree.to_string()}?`; + } +} + +export class Anchor implements AbstractSyntaxTree { + constructor(public inner_tree: AbstractSyntaxTree) { + /* empty */ + } + + public to_string(): string { + return `^${this.inner_tree.to_string()}$`; + } +} + +export class Range implements AbstractSyntaxTree { + constructor(public from: string, public to: string) { + /* empty */ + } + public to_string(): string { + return `${this.from}-${this.to}`; + } +} + +export class QuantifierExactly implements AbstractSyntaxTree { + constructor(public inner_tree: AbstractSyntaxTree, public count: number) { + /* empty */ + } + public to_string(): string { + return `${this.inner_tree.to_string()}{${this.count}}`; + } +} + +export class QuantifierBetween implements AbstractSyntaxTree { + constructor(public inner_tree: AbstractSyntaxTree, public from: number, public to?: number, public inclusive?: boolean) { + /* empty */ + } + public to_string(): string { + let str = `${this.inner_tree.to_string()}{${this.from},`; + + if(this.to) { + str += (this.to-(this.inclusive?0:1)); + } + + str += "}"; + return str; + } +} + +export class Or implements AbstractSyntaxTree { + constructor(public left_tree: AbstractSyntaxTree, public right_tree: AbstractSyntaxTree) { + /* empty */ + } + public to_string(): string { + return `${this.left_tree.to_string()}|${this.right_tree.to_string()}`; + } +} + +export class And implements AbstractSyntaxTree { + constructor(public left_tree: AbstractSyntaxTree, public right_tree: AbstractSyntaxTree) { + /* empty */ + } + public to_string(): string { + return `${this.left_tree.to_string()}${this.right_tree.to_string()}`; + } +} + +export class Specifier implements AbstractSyntaxTree { + constructor(public type: string, public negated: boolean) { + /* empty */ + } + public to_string(): string { + let str = "\\"; + + if(this.type === "w") { + str += (this.negated ? "W" : "w"); + } + else if(this.type === "d") { + str += (this.negated ? "D" : "d"); + } + else { + str += (this.negated ? "S" : "s"); + } + + return str; + } + + // \w \d \s : word, digit, whitespace +} + +export class Match implements AbstractSyntaxTree { + // remember: transform unicode, escape stuff + + constructor(public match: string) { + /* empty */ + } + public to_string(): string { + /* TODO: ESCAPE/TRANSFORM CHARACTERS! */ + + return this.match; + } +} + +export class SpecialCharacter implements AbstractSyntaxTree { + //type: \t\r\n + + constructor(public type: string) { + /* empty */ + } + public to_string(): string { + if(this.type === "t") { + return "\\t"; + } + else if(this.type === "r") { + return "\\r"; + } + else { + return "\\n"; + } + } +} \ No newline at end of file diff --git a/src/parser.ts b/src/parser.ts index cd91cb7..4214e60 100644 --- a/src/parser.ts +++ b/src/parser.ts @@ -1,6 +1,7 @@ /*! 
+    "case": TokenType.PARTIAL_KEYWORD,
+    "insensitive": TokenType.PARTIAL_KEYWORD,
+    "sensitive": TokenType.PARTIAL_KEYWORD
 };
 
 const numbers = {
@@ -103,15 +108,17 @@ interface token_transformation {
 }
 
 const token_transformations : token_transformation = {
-    "thing": [ { preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY } ],
-    "things": [ { preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY } ],
-    "space": [ { preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER } ],
-    "spaces": [ { preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER } ],
-    "wise": [ { preceeding_token: "other", transforms_to: TokenType.KEYWORD_ELSE } ],
-    "line": [ { preceeding_token: "multi", transforms_to: TokenType.KEYWORD_MULTILINE },
-              { preceeding_token: "new", transforms_to: TokenType.KEYWORD_NEWLINE } ],
-    "feed": [ { preceeding_token: "line", transforms_to: TokenType.KEYWORD_LINEFEED } ],
-    "return": [ { preceeding_token: "carriage", transforms_to: TokenType.KEYWORD_CARRIAGE_RETURN } ],
+    "thing":       [ { preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY } ],
+    "things":      [ { preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY } ],
+    "space":       [ { preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER } ],
+    "spaces":      [ { preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER } ],
+    "wise":        [ { preceeding_token: "other", transforms_to: TokenType.KEYWORD_ELSE } ],
+    "line":        [ { preceeding_token: "multi", transforms_to: TokenType.KEYWORD_MULTILINE },
+                     { preceeding_token: "new", transforms_to: TokenType.KEYWORD_NEWLINE } ],
+    "feed":        [ { preceeding_token: "line", transforms_to: TokenType.KEYWORD_LINEFEED } ],
+    "return":      [ { preceeding_token: "carriage", transforms_to: TokenType.KEYWORD_CARRIAGE_RETURN } ],
+    "sensitive":   [ { preceeding_token: "case", transforms_to: TokenType.KEYWORD_CASE_SENSITIVE } ],
+    "insensitive": [ { preceeding_token: "case", transforms_to: TokenType.KEYWORD_CASE_INSENSITIVE } ],
 };
 
 const escape_sequences = {
diff --git a/src/tokens.ts b/src/tokens.ts
index c69a3fc..8aee1da 100644
--- a/src/tokens.ts
+++ b/src/tokens.ts
@@ -13,6 +13,8 @@ export enum TokenType {
     KEYWORD_OR,
     KEYWORD_ANY,
     KEYWORD_OF,
+    KEYWORD_NONE,
+    KEYWORD_NEITHER,
     KEYWODE_WORD_SPECIFIER,
     KEYWORD_DIGIT_SPECIFIER,
     KEYWORD_CHAR_SPECIFIER,
@@ -48,7 +50,9 @@ export enum TokenType {
     KEYWORD_CREATE,
     KEYWORD_CALLED,
     KEYWORD_REPEAT,
-    KEYWORD_NEWLINE
+    KEYWORD_NEWLINE,
+    KEYWORD_CASE_SENSITIVE,
+    KEYWORD_CASE_INSENSITIVE
 }
 
 export class TokenError extends Error {
@@ -57,7 +61,7 @@ export class TokenError extends Error {
     }
 
     public to_string(): string {
-        return `${this.line}:${this.position} ${this.message}`;
+        return `Token Error: ${this.line}:${this.position} ${this.message}`;
     }
 }