1
0
mirror of https://github.com/pdemian/human2regex.git synced 2025-05-16 12:30:09 -07:00

Bug fix in tokenizer

This commit is contained in:
Patrick Demian 2020-10-11 00:17:07 -04:00
parent ae84a52821
commit 9f46d1246c
5 changed files with 48 additions and 26 deletions

View File

@ -17,6 +17,11 @@
],
"rules": {
"@typescript-eslint/no-inferrable-types": "off",
"@typescript-eslint/explicit-function-return-type": "error",
"no-magic-numbers": [
"warn",
{ "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]}
],
"curly": "warn",
"no-loss-of-precision": "error",
"default-case-last": "warn",

1
.gitignore vendored
View File

@ -1,4 +1,3 @@
# Node build artifacts
node_modules/
npm-debug.log
src/*.js

View File

@ -12,6 +12,10 @@ $(function() {
*/
const opts = new TokenizerOptions();
const res = tokenize("match 1+ thing from thingy", opts);
const result = tokenize("match /* 9+ */ 1+ optionally 1..3 0-zero then //comment match", opts);
console.log(res);
for(const r of result.tokens) {
console.log(r.to_string());
}
console.log(result.errors);

View File

@ -69,7 +69,7 @@ const escape_sequences = {
};
/**
 * Options accepted by `tokenize`.
 */
export class TokenizerOptions {
    /**
     * Defaults to `true`.
     * NOTE(review): the name suggests leading runs of spaces are treated as
     * tab indentation, but the code that reads this flag is not visible
     * here; confirm against `tokenize`.
     */
    public convert_spaces_to_tabs: boolean = true;
}
const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g);
@ -120,18 +120,21 @@ function escape_sequence_mapper(input: string, i : number) : { code: string, rea
}
}
/**
 * Boundary characters of the ASCII ranges tested below, laid out as
 * [digit low, digit high, lowercase low, lowercase high, uppercase low, uppercase high].
 * Reading the character codes from this string keeps numeric literals
 * out of the comparisons.
 */
const test_chars = "09azAZ";

/**
 * Tests whether the character at index `i` of `input` is an ASCII digit.
 *
 * @param input text to inspect
 * @param i index of the character to test; defaults to 0 so a
 *          single-character string can be passed on its own
 * @returns true for "0" through "9"; false otherwise, including when `i`
 *          is outside the string (charCodeAt yields NaN, which fails both
 *          comparisons)
 */
function is_digit(input: string, i: number = 0) : boolean {
    // Range check on the char code, used in place of /[0-9]/.test(...)
    const value = input.charCodeAt(i);
    return value >= test_chars.charCodeAt(0) && value <= test_chars.charCodeAt(1);
}

/**
 * Tests whether the character at index `i` of `input` is an ASCII letter.
 *
 * @param input text to inspect
 * @param i index of the character to test; defaults to 0 so a
 *          single-character string can be passed on its own
 * @returns true for "a"-"z" and "A"-"Z"; false otherwise, including
 *          non-ASCII letters and an out-of-range `i`
 */
function is_char(input: string, i: number = 0) : boolean {
    // Range check on the char code, used in place of /[a-zA-Z]/.test(...)
    // or comparing toUpperCase() with toLowerCase().
    const value = input.charCodeAt(i);
    return ((value >= test_chars.charCodeAt(2) && value <= test_chars.charCodeAt(3)) ||
            (value >= test_chars.charCodeAt(4) && value <= test_chars.charCodeAt(5)));
}
/* Basic Tokenizer */
@ -174,7 +177,6 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
else if(input.startsWith("/*", i)) {
for(i++, position++; i < input.length-1; i++, position++) {
if(input[i] === "*" && input[i+1] === "/") {
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
i++;
position++;
break;
@ -268,6 +270,10 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
case "-":
tokens.push(new Token(TokenType.BETWEEN, line, position));
break;
case "+":
tokens.push(new Token(TokenType.KEYWORD_OR, line, position));
tokens.push(new Token(TokenType.KEYWORD_MORE, line, position));
break;
case "\n":
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
break;
@ -281,24 +287,22 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
break;
default:
// is digit? build up a number
if(is_digit(input[i])) {
if(is_digit(input, i)) {
let digits = input[i];
do {
i++; position++;
digits += input[i];
} while(i+1 < input.length && is_digit(input[i+1]));
for(; i+1 < input.length && is_digit(input, i+1); i++, position++) {
digits += input[i+1];
}
tokens.push(new Token(TokenType.NUMBER, line, position, digits));
}
// is char? build up a word
else if(is_char(input[i])) {
else if(is_char(input, i)) {
let text = input[i];
do {
i++; position++;
text += input[i];
} while(i+1 < input.length && is_char(input[i+1]));
for(; i+1 < input.length && is_char(input, i+1); i++, position++) {
text += input[i+1];
}
const keyword_text = text.toLowerCase();
@ -348,7 +352,7 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
}
}
else {
errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position));
errors.push(new TokenError(`Unknown character in text: "${input[i]}" (${input.charCodeAt(i)})`, line, position));
}
break;
}

View File

@ -52,13 +52,23 @@ export class TokenError extends Error {
super(message);
}
public to_string() {
public to_string(): string {
return `${this.line}:${this.position} ${this.message}`;
}
}
/**
 * A single token: its type, the line/position values it was created with
 * and, optionally, the text associated with it.
 */
export class Token {
    /**
     * @param type kind of token
     * @param line line value recorded for the token
     * @param position position value recorded for the token
     * @param token_string optional text carried by the token
     */
    constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
        /* nothing required */
    }

    /**
     * Formats the token as `line:position NAME`, where NAME is looked up via
     * `TokenType[this.type]`, followed by the token text in double quotes
     * when the token carries text.
     *
     * @returns the formatted description
     */
    public to_string(): string {
        const str = `${this.line}:${this.position} ${TokenType[this.type]}`;

        // Test for a string explicitly rather than by truthiness, so a
        // token whose text is the empty string still prints as "".
        if (typeof this.token_string === "string") {
            return `${str} "${this.token_string}"`;
        }

        return str;
    }
}