From 88c5b203fdbd0f0fdab756a05610756182a9f2b5 Mon Sep 17 00:00:00 2001 From: Patrick Demian Date: Tue, 13 Oct 2020 05:04:44 -0400 Subject: [PATCH] Fixed tokenizer Just 1 more commit and I think I'll be done with it --- src/ast.ts | 0 src/script.ts | 41 ++++++++++- src/tokenizer.ts | 188 +++++++++++++++++++++++++++++++++-------------- src/tokens.ts | 11 ++- 4 files changed, 179 insertions(+), 61 deletions(-) create mode 100644 src/ast.ts diff --git a/src/ast.ts b/src/ast.ts new file mode 100644 index 0000000..e69de29 diff --git a/src/script.ts b/src/script.ts index 961d6b7..659901a 100644 --- a/src/script.ts +++ b/src/script.ts @@ -12,7 +12,46 @@ $(function() { */ const opts = new TokenizerOptions(); -const result = tokenize("match /* 9+ */ 1+ optionally 1..3 0-zero then //comment match", opts); +const result = tokenize(` +// H2R supports // # and /**/ as comments +// A group is only captured if given a name. +// You can use "and", "or", "not" to specify "[]" regex +// You can use "then" to combine match statements, however I find using multiple "match" statements easier to read + +// exact matching means use a ^ and $ to signify the start and end of the string + +using global and exact matching +create an optional group called "protocol" + match "http" + optionally match "s" + match "://" +create a group called "subdomain" + repeat + match 1+ words + match "." +create a group called "domain" + match 1+ words or "_" or "-" + match "." + match a word +# port, but we don't care about it, so ignore it +optionally match ":" then 0+ digits +create an optional group called "path" + repeat + match "/" + match 0+ words or "_" or "-" +create an optional group + # we don't want to capture the '?', so don't name the group until afterwards + match "?" + create a group called "query" + repeat + match 1+ words or "_" or "-" + match "=" + match 1+ words or "_" or "-" +create an optional group + # fragment, again, we don't care, so ignore everything afterwards + match "#" + match 0+ any thing +`, opts); for(const r of result.tokens) { console.log(r.to_string()); diff --git a/src/tokenizer.ts b/src/tokenizer.ts index 4891a10..4575d51 100644 --- a/src/tokenizer.ts +++ b/src/tokenizer.ts @@ -5,12 +5,16 @@ import { Token, TokenType, TokenError } from "./tokens"; const keywords = { + + /* Full Keywords */ "optional": TokenType.KEYWORD_OPTIONAL, "optionally": TokenType.KEYWORD_OPTIONAL, "match": TokenType.KEYWORD_MATCH, + "matches": TokenType.KEYWORD_MATCH, "then": TokenType.KEYWORD_THEN, "any": TokenType.KEYWORD_ANY, "anything": TokenType.KEYWORD_ANY, + "anythings": TokenType.KEYWORD_ANY, "of": TokenType.KEYWORD_OF, "or": TokenType.KEYWORD_OR, "and": TokenType.KEYWORD_AND, @@ -19,9 +23,15 @@ const keywords = { "character": TokenType.KEYWORD_CHAR_SPECIFIER, "whitespace": TokenType.KEYWORD_WHITESPACE_SPECIFIER, "number": TokenType.KEYWORD_NUMBER_SPECIFIER, + "words": TokenType.KEYWODE_WORD_SPECIFIER, + "digits": TokenType.KEYWORD_DIGIT_SPECIFIER, + "characters": TokenType.KEYWORD_CHAR_SPECIFIER, + "whitespaces": TokenType.KEYWORD_WHITESPACE_SPECIFIER, + "numbers": TokenType.KEYWORD_NUMBER_SPECIFIER, "multiple": TokenType.KEYWORD_MULTIPLE, "as": TokenType.KEYWORD_AS, "if": TokenType.KEYWORD_IF, + "start": TokenType.KEYWORD_STARTS, "starts": TokenType.KEYWORD_STARTS, "with": TokenType.KEYWORD_WITH, "ends": TokenType.KEYWORD_ENDS, @@ -39,8 +49,6 @@ const keywords = { "between": TokenType.KEYWORD_BETWEEN, "tab": TokenType.KEYWORD_TAB, "linefeed": TokenType.KEYWORD_LINEFEED, - "carriage": TokenType.KEYWORD_CARRIAGE, - "return": TokenType.KEYWORD_RETURN, "group": TokenType.KEYWORD_GROUP, "by": TokenType.KEYWORD_BY, "an": TokenType.KEYWORD_ARTICLE, @@ -52,7 +60,58 @@ const keywords = { "exclusive": TokenType.KEYWORD_EXCLUSIVE, "exclusively": TokenType.KEYWORD_EXCLUSIVE, "from": TokenType.KEYWORD_FROM, - "to": TokenType.KEYWORD_TO + "to": TokenType.KEYWORD_TO, + "create": TokenType.KEYWORD_CREATE, + "creates": TokenType.KEYWORD_CREATE, + "called": TokenType.KEYWORD_CALLED, + "repeat": TokenType.KEYWORD_REPEAT, + "repeats": TokenType.KEYWORD_REPEAT, + "newline": TokenType.KEYWORD_NEWLINE, + + /* Partial keywords */ + "thing": TokenType.PARTIAL_KEYWORD, + "things": TokenType.PARTIAL_KEYWORD, + "white": TokenType.PARTIAL_KEYWORD, + "space": TokenType.PARTIAL_KEYWORD, + "spaces": TokenType.PARTIAL_KEYWORD, + "other": TokenType.PARTIAL_KEYWORD, + "wise": TokenType.PARTIAL_KEYWORD, + "multi": TokenType.PARTIAL_KEYWORD, + "new": TokenType.PARTIAL_KEYWORD, + "line": TokenType.PARTIAL_KEYWORD, + "feed": TokenType.PARTIAL_KEYWORD, + "carriage": TokenType.PARTIAL_KEYWORD, + "return": TokenType.PARTIAL_KEYWORD, +}; + +const numbers = { + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10" +} + +interface token_transformation { + [key: string]: { preceeding_token: string, transforms_to: TokenType }[] +} + +const token_transformations : token_transformation = { + "thing": [{ preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY }], + "things": [{ preceeding_token: "any", transforms_to: TokenType.KEYWORD_ANY }], + "space": [{ preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER }], + "spaces": [{ preceeding_token: "white", transforms_to: TokenType.KEYWORD_WHITESPACE_SPECIFIER }], + "wise": [{ preceeding_token: "other", transforms_to: TokenType.KEYWORD_ELSE }], + "line": [{ preceeding_token: "multi", transforms_to: TokenType.KEYWORD_MULTILINE }, + { preceeding_token: "new", transforms_to: TokenType.KEYWORD_NEWLINE }], + "feed": [{ preceeding_token: "line", transforms_to: TokenType.KEYWORD_LINEFEED }], + "return": [{ preceeding_token: "carriage", transforms_to: TokenType.KEYWORD_CARRIAGE_RETURN }], }; const escape_sequences = { @@ -68,10 +127,6 @@ const escape_sequences = { "\\": "\\", }; -export class TokenizerOptions { - public convert_spaces_to_tabs: boolean = true; -} - const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g); function escape_sequence_gather_hex(input: string, i : number, max: number) : string { @@ -120,31 +175,73 @@ function escape_sequence_mapper(input: string, i : number) : { code: string, rea } } -const test_chars = "09azAZ"; +const test_char_0 = "0".charCodeAt(0); +const test_char_9 = "9".charCodeAt(0); +const test_char_a = "a".charCodeAt(0); +const test_char_z = "z".charCodeAt(0); +const test_char_A = "A".charCodeAt(0); +const test_char_Z = "Z".charCodeAt(0); function is_digit(input: string, i: number) : boolean { - //return /[0-9]/g.test(input); const value = input.charCodeAt(i); - return value >= test_chars.charCodeAt(0) && value <= test_chars.charCodeAt(1); + return value >= test_char_0 && value <= test_char_9; } function is_char(input: string, i: number) : boolean { - //return input.toUpperCase() != input.toLowerCase(); - //return /[a-zA-Z]/g.test(input); - const value = input.charCodeAt(i); - return ((value >= test_chars.charCodeAt(2) && value <= test_chars.charCodeAt(3)) || - (value >= test_chars.charCodeAt(4) && value <= test_chars.charCodeAt(5))); + return ((value >= test_char_a && value <= test_char_z) || + (value >= test_char_A && value <= test_char_Z)); +} + +function transform_tokens(tokens: Token[], errors: TokenError[]) : void { + for(let i = 0; i < tokens.length; i++) { + //check past tokens: if it matches the preceeding tokens, we transform it. + + if(tokens[i].type === TokenType.PARTIAL_KEYWORD && token_transformations[tokens[i].token_string as string]) { + + const transform = token_transformations[tokens[i].token_string as string]; + + for(let j = 0; j < transform.length; j++) { + if(i-1 >= 0 && transform[j].preceeding_token === tokens[i-1].token_string) { + // use the i-1 token because it has the start line and position + + tokens[i-1].type = transform[j].transforms_to; + (tokens[i-1].token_string as string) += " " + tokens[i].token_string as string; + tokens.splice(i, 1); // remove this token + i--; // move token counter back because we removed the token + break; + } + } + } + /* else ignore */ + } + + // do we still have partial tokens? those are errors then + for(let i = 0; i < tokens.length; i++) { + if(tokens[i].type === TokenType.PARTIAL_KEYWORD) { + errors.push(new TokenError(`Unknown keyword "${tokens[i].token_string}"`, tokens[i].line, tokens[i].position)); + } + } +} + +export class TokenizerOptions { + public convert_spaces_to_tabs: boolean = true; +} + +export interface TokenizeResult { + tokens: Token[], + errors: TokenError[] } /* Basic Tokenizer */ -export function tokenize(input: string, options: TokenizerOptions) : { tokens: Token[], errors: TokenError[] } { +export function tokenize(input: string, options: TokenizerOptions) : TokenizeResult { let line = 1; let position = 1; const tokens : Token[] = []; const errors : TokenError[] = []; + // gather tokens for(let i = 0; i < input.length; i++, position++) { // 4 spaces = 1 tab. That is final. Debate over if(options.convert_spaces_to_tabs && input.startsWith(" ", i)) { @@ -276,6 +373,8 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T break; case "\n": tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position)); + line++; + position = 0; break; case "\r": // ignore @@ -284,20 +383,25 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T tokens.push(new Token(TokenType.INDENT, line, position)); break; case " ": + // ignore break; default: // is digit? build up a number if(is_digit(input, i)) { + const digit_begin = position; + let digits = input[i]; for(; i+1 < input.length && is_digit(input, i+1); i++, position++) { digits += input[i+1]; } - tokens.push(new Token(TokenType.NUMBER, line, position, digits)); + tokens.push(new Token(TokenType.NUMBER, line, digit_begin, digits)); } // is char? build up a word else if(is_char(input, i)) { + const word_begin = position; + let text = input[i]; for(; i+1 < input.length && is_char(input, i+1); i++, position++) { @@ -306,49 +410,16 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T const keyword_text = text.toLowerCase(); + // keyword (ex. "match") if(keywords[keyword_text]) { - tokens.push(new Token(keywords[keyword_text], line, position)); + tokens.push(new Token(keywords[keyword_text], line, word_begin, keyword_text)); + } + // text number (ex. "one") + else if(numbers[keyword_text]) { + tokens.push(new Token(TokenType.NUMBER, line, word_begin, keyword_text)); } else { - switch(keyword_text) { - case "none": - case "zero": - tokens.push(new Token(TokenType.NUMBER, line, position, "0")); - break; - case "one": - tokens.push(new Token(TokenType.NUMBER, line, position, "1")); - break; - case "two": - tokens.push(new Token(TokenType.NUMBER, line, position, "2")); - break; - case "three": - tokens.push(new Token(TokenType.NUMBER, line, position, "3")); - break; - case "four": - tokens.push(new Token(TokenType.NUMBER, line, position, "4")); - break; - case "five": - tokens.push(new Token(TokenType.NUMBER, line, position, "5")); - break; - case "six": - tokens.push(new Token(TokenType.NUMBER, line, position, "6")); - break; - case "seven": - tokens.push(new Token(TokenType.NUMBER, line, position, "7")); - break; - case "eight": - tokens.push(new Token(TokenType.NUMBER, line, position, "8")); - break; - case "nine": - tokens.push(new Token(TokenType.NUMBER, line, position, "9")); - break; - case "ten": - tokens.push(new Token(TokenType.NUMBER, line, position, "10")); - break; - default: - errors.push(new TokenError(`Unknown keyword ${text}`, line, position)); - break; - } + errors.push(new TokenError(`Unknown keyword "${text}"`, line, word_begin)); } } else { @@ -359,5 +430,8 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T } } + // transform tokens + transform_tokens(tokens, errors); + return { tokens: tokens, errors: errors }; } \ No newline at end of file diff --git a/src/tokens.ts b/src/tokens.ts index c1d633c..6a9fbad 100644 --- a/src/tokens.ts +++ b/src/tokens.ts @@ -4,6 +4,7 @@ export enum TokenType { BETWEEN, QUOTE, NUMBER, + PARTIAL_KEYWORD, KEYWORD_BETWEEN, KEYWORD_OPTIONAL, KEYWORD_MATCH, @@ -35,8 +36,7 @@ export enum TokenType { KEYWORD_NOT, KEYWORD_TAB, KEYWORD_LINEFEED, - KEYWORD_CARRIAGE, - KEYWORD_RETURN, + KEYWORD_CARRIAGE_RETURN, KEYWORD_GROUP, KEYWORD_BY, KEYWORD_ARTICLE, @@ -44,7 +44,11 @@ export enum TokenType { KEYWORD_INCLUSIVE, KEYWORD_EXCLUSIVE, KEYWORD_FROM, - KEYWORD_TO + KEYWORD_TO, + KEYWORD_CREATE, + KEYWORD_CALLED, + KEYWORD_REPEAT, + KEYWORD_NEWLINE } export class TokenError extends Error { @@ -58,6 +62,7 @@ export class TokenError extends Error { } export class Token { + /* TODO: end line and position? */ constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) { /* nothing required */ }