Mirror of https://github.com/pdemian/human2regex.git
Synced 2025-05-16 04:20:35 -07:00
Bug fix in tokenizer
This commit is contained in:
parent ae84a52821
commit 9f46d1246c
@@ -17,6 +17,11 @@
     ],
     "rules": {
         "@typescript-eslint/no-inferrable-types": "off",
+        "@typescript-eslint/explicit-function-return-type": "error",
+        "no-magic-numbers": [
+            "warn",
+            { "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]}
+        ],
         "curly": "warn",
         "no-loss-of-precision": "error",
         "default-case-last": "warn",
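The two added rules can be illustrated with a hypothetical snippet (not from the repository): "@typescript-eslint/explicit-function-return-type" reports functions whose return type is only inferred, and "no-magic-numbers" warns on unnamed numeric literals other than the whitelisted digits 0 through 9.

// Hypothetical snippet, not from the repository, showing what the new rules flag.

// Reported by "@typescript-eslint/explicit-function-return-type":
// the return type is inferred rather than written out.
function charCode(input: string) {
    return input.charCodeAt(0);
}

// Accepted: explicit return type, and the magic number 57 ('9') is hoisted
// into a named constant ("no-magic-numbers" only whitelists 0 through 9).
const CHAR_CODE_NINE = 57;
function isAtMostNine(input: string): boolean {
    return input.charCodeAt(0) <= CHAR_CODE_NINE;
}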
.gitignore (vendored, 3 changes)
@@ -1,4 +1,3 @@
-# Node build artifacts
 node_modules/
-npm-debug.log
 src/*.js
+npm-debug.log
@@ -12,6 +12,10 @@ $(function() {
     */

     const opts = new TokenizerOptions();
-    const res = tokenize("match 1+ thing from thingy", opts);
+    const result = tokenize("match /* 9+ */ 1+ optionally 1..3 0-zero then //comment match", opts);

-    console.log(res);
+    for(const r of result.tokens) {
+        console.log(r.to_string());
+    }
+
+    console.log(result.errors);
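For reference, the longer test string exercises the paths touched elsewhere in this commit: a /* ... */ block comment (which no longer emits END_OF_STATEMENT), a // line comment, the "1+" shorthand handled by the new "+" case, and single-character numbers and words, which the old scanning loops could mishandle; the loop then prints each token via to_string() and finally any tokenizer errors.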
@@ -69,7 +69,7 @@ const escape_sequences = {
 };

 export class TokenizerOptions {
-    public convert_spaces_to_tabs: boolean = false;
+    public convert_spaces_to_tabs: boolean = true;
 }

 const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g);
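Since the default for convert_spaces_to_tabs flips to true here, callers that relied on the old behaviour would need to opt out explicitly. A minimal sketch, where the import path is an assumption and the names come from this diff:

// Minimal sketch: restore the old behaviour by overriding the now-true default.
// The import path "./tokenizer" is an assumption; the names come from the diff above.
import { tokenize, TokenizerOptions } from "./tokenizer";

const options = new TokenizerOptions();
options.convert_spaces_to_tabs = false;

const parsed = tokenize("match 1+ thing from thingy", options);
console.log(parsed.errors);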
@@ -120,18 +120,21 @@ function escape_sequence_mapper(input: string, i : number) : { code: string, rea
     }
 }

-function is_digit(input: string) : boolean {
+const test_chars = "09azAZ";
+
+function is_digit(input: string, i: number) : boolean {
     //return /[0-9]/g.test(input);
-    const value = input.charCodeAt(0);
-    return value >= 48 && value <= 57;
+    const value = input.charCodeAt(i);
+    return value >= test_chars.charCodeAt(0) && value <= test_chars.charCodeAt(1);
 }

-function is_char(input: string) : boolean {
+function is_char(input: string, i: number) : boolean {
     //return input.toUpperCase() != input.toLowerCase();
     //return /[a-zA-Z]/g.test(input);

-    const value = input.charCodeAt(0);
-    return ((value >= 65 && value <= 90) || (value >= 97 && value <= 122));
+    const value = input.charCodeAt(i);
+    return ((value >= test_chars.charCodeAt(2) && value <= test_chars.charCodeAt(3)) ||
+            (value >= test_chars.charCodeAt(4) && value <= test_chars.charCodeAt(5)));
 }

 /* Basic Tokenizer */
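As a side effect of the rework, the helpers now take the whole input plus an index instead of a one-character slice such as input[i+1], so the scanning loops further down no longer build throwaway strings. A standalone restatement of the new checks, for illustration only and not the repo's exports:

// Standalone restatement of the new helpers (assumption: not the repo's exports).
const TEST_CHARS = "09azAZ";

function isDigit(input: string, i: number): boolean {
    const value = input.charCodeAt(i);
    return value >= TEST_CHARS.charCodeAt(0) && value <= TEST_CHARS.charCodeAt(1);
}

function isChar(input: string, i: number): boolean {
    const value = input.charCodeAt(i);
    return (value >= TEST_CHARS.charCodeAt(2) && value <= TEST_CHARS.charCodeAt(3)) ||
           (value >= TEST_CHARS.charCodeAt(4) && value <= TEST_CHARS.charCodeAt(5));
}

console.log(isDigit("a1", 1), isChar("a1", 1)); // true false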
@@ -174,7 +177,6 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
         else if(input.startsWith("/*", i)) {
             for(i++, position++; i < input.length-1; i++, position++) {
                 if(input[i] === "*" && input[i+1] === "/") {
-                    tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                     i++;
                     position++;
                     break;
@@ -268,6 +270,10 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
             case "-":
                 tokens.push(new Token(TokenType.BETWEEN, line, position));
                 break;
+            case "+":
+                tokens.push(new Token(TokenType.KEYWORD_OR, line, position));
+                tokens.push(new Token(TokenType.KEYWORD_MORE, line, position));
+                break;
             case "\n":
                 tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                 break;
@@ -281,24 +287,22 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
                 break;
             default:
                 // is digit? build up a number
-                if(is_digit(input[i])) {
+                if(is_digit(input, i)) {
                     let digits = input[i];

-                    do {
-                        i++; position++;
-                        digits += input[i];
-                    } while(i+1 < input.length && is_digit(input[i+1]));
+                    for(; i+1 < input.length && is_digit(input, i+1); i++, position++) {
+                        digits += input[i+1];
+                    }

                     tokens.push(new Token(TokenType.NUMBER, line, position, digits));
                 }
                 // is char? build up a word
-                else if(is_char(input[i])) {
+                else if(is_char(input, i)) {
                     let text = input[i];

-                    do {
-                        i++; position++;
-                        text += input[i];
-                    } while(i+1 < input.length && is_char(input[i+1]));
+                    for(; i+1 < input.length && is_char(input, i+1); i++, position++) {
+                        text += input[i+1];
+                    }

                     const keyword_text = text.toLowerCase();

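The do/while loops above appear to be the bug named in the commit message: the body ran before the check, so a single-digit number or single-letter word always swallowed the character that followed it (for example, "1+" was collected as the number "1+"). A hypothetical standalone demo, not repository code, using the new-style digit check for both variants:

const sample = "match 1+ thing";
const isDigit = (s: string, i: number): boolean => {
    const c = s.charCodeAt(i);
    return c >= 48 && c <= 57; // '0'..'9'
};

// Old shape: appends sample[i] before checking, so the "+" is pulled into the number.
let i = 6;                      // index of "1"
let digitsOld = sample[i];
do {
    i++;
    digitsOld += sample[i];
} while (i + 1 < sample.length && isDigit(sample, i + 1));
console.log(digitsOld);         // "1+"

// New shape: only consumes the next character after confirming it is a digit.
i = 6;
let digitsNew = sample[i];
for (; i + 1 < sample.length && isDigit(sample, i + 1); i++) {
    digitsNew += sample[i + 1];
}
console.log(digitsNew);         // "1"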
@@ -348,7 +352,7 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
                     }
                 }
                 else {
-                    errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position));
+                    errors.push(new TokenError(`Unknown character in text: "${input[i]}" (${input.charCodeAt(i)})`, line, position));
                 }
                 break;
         }
@@ -52,13 +52,23 @@ export class TokenError extends Error {
         super(message);
     }

-    public to_string() {
+    public to_string(): string {
         return `${this.line}:${this.position} ${this.message}`;
     }
 }

 export class Token {
-    constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
-
+    constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
+        /* nothing required */
     }
+
+    public to_string(): string {
+        let str = `${this.line}:${this.position} ${TokenType[this.type]}`;
+
+        if (this.token_string) {
+            str += ` "${this.token_string}"`;
+        }
+
+        return str;
+    }
 }
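Tying this back to the debug loop in the script change above, to_string() produces one line per token. A minimal standalone sketch of the output shape; the TokenType enum here is a stand-in limited to names that appear in this diff:

// Minimal standalone sketch of the output shape; TokenType here is a stand-in enum,
// not the project's full definition.
enum TokenType { NUMBER, KEYWORD_OR, KEYWORD_MORE }

class Token {
    constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
        /* nothing required */
    }

    public to_string(): string {
        let str = `${this.line}:${this.position} ${TokenType[this.type]}`;
        if (this.token_string) {
            str += ` "${this.token_string}"`;
        }
        return str;
    }
}

console.log(new Token(TokenType.NUMBER, 0, 6, "1").to_string());   // 0:6 NUMBER "1"
console.log(new Token(TokenType.KEYWORD_MORE, 0, 7).to_string());  // 0:7 KEYWORD_MORE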