Mirror of https://github.com/pdemian/human2regex.git
Synced 2025-05-16 04:20:35 -07:00
Bug fix in tokenizer
This commit is contained in:
parent ae84a52821
commit 9f46d1246c
@@ -17,6 +17,11 @@
     ],
     "rules": {
         "@typescript-eslint/no-inferrable-types": "off",
+        "@typescript-eslint/explicit-function-return-type": "error",
+        "no-magic-numbers": [
+            "warn",
+            { "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]}
+        ],
         "curly": "warn",
         "no-loss-of-precision": "error",
         "default-case-last": "warn",
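The two added rules can be illustrated with a hypothetical snippet (not from the repository): "@typescript-eslint/explicit-function-return-type" reports functions whose return type is only inferred, and "no-magic-numbers" warns on unnamed numeric literals other than the whitelisted digits 0 through 9.

// Hypothetical snippet, not from the repository, showing what the new rules flag.

// Reported by "@typescript-eslint/explicit-function-return-type":
// the return type is inferred rather than written out.
function charCode(input: string) {
    return input.charCodeAt(0);
}

// Accepted: explicit return type, and the magic number 57 ('9') is hoisted
// into a named constant ("no-magic-numbers" only whitelists 0 through 9).
const CHAR_CODE_NINE = 57;
function isAtMostNine(input: string): boolean {
    return input.charCodeAt(0) <= CHAR_CODE_NINE;
}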
.gitignore (vendored, 3 changes)
@@ -1,4 +1,3 @@
-# Node build artifacts
 node_modules/
-npm-debug.log
 src/*.js
+npm-debug.log
@@ -12,6 +12,10 @@ $(function() {
     */

     const opts = new TokenizerOptions();
-    const res = tokenize("match 1+ thing from thingy", opts);
+    const result = tokenize("match /* 9+ */ 1+ optionally 1..3 0-zero then //comment match", opts);

-    console.log(res);
+    for(const r of result.tokens) {
+        console.log(r.to_string());
+    }
+
+    console.log(result.errors);
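For reference, the longer test string exercises the paths touched elsewhere in this commit: a /* ... */ block comment (which no longer emits END_OF_STATEMENT), a // line comment, the "1+" shorthand handled by the new "+" case, and single-character numbers and words, which the old scanning loops could mishandle; the loop then prints each token via to_string() and finally any tokenizer errors.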
@@ -69,7 +69,7 @@ const escape_sequences = {
 };

 export class TokenizerOptions {
-    public convert_spaces_to_tabs: boolean = false;
+    public convert_spaces_to_tabs: boolean = true;
 }

 const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g);
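Since the default for convert_spaces_to_tabs flips to true here, callers that relied on the old behaviour would need to opt out explicitly. A minimal sketch, where the import path is an assumption and the names come from this diff:

// Minimal sketch: restore the old behaviour by overriding the now-true default.
// The import path "./tokenizer" is an assumption; the names come from the diff above.
import { tokenize, TokenizerOptions } from "./tokenizer";

const options = new TokenizerOptions();
options.convert_spaces_to_tabs = false;

const parsed = tokenize("match 1+ thing from thingy", options);
console.log(parsed.errors);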
@@ -120,18 +120,21 @@ function escape_sequence_mapper(input: string, i : number) : { code: string, rea
     }
 }

-function is_digit(input: string) : boolean {
+const test_chars = "09azAZ";
+
+function is_digit(input: string, i: number) : boolean {
     //return /[0-9]/g.test(input);
-    const value = input.charCodeAt(0);
-    return value >= 48 && value <= 57;
+    const value = input.charCodeAt(i);
+    return value >= test_chars.charCodeAt(0) && value <= test_chars.charCodeAt(1);
 }

-function is_char(input: string) : boolean {
+function is_char(input: string, i: number) : boolean {
     //return input.toUpperCase() != input.toLowerCase();
     //return /[a-zA-Z]/g.test(input);

-    const value = input.charCodeAt(0);
-    return ((value >= 65 && value <= 90) || (value >= 97 && value <= 122));
+    const value = input.charCodeAt(i);
+    return ((value >= test_chars.charCodeAt(2) && value <= test_chars.charCodeAt(3)) ||
+            (value >= test_chars.charCodeAt(4) && value <= test_chars.charCodeAt(5)));
 }

 /* Basic Tokenizer */
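As a side effect of the rework, the helpers now take the whole input plus an index instead of a one-character slice such as input[i+1], so the scanning loops further down no longer build throwaway strings. A standalone restatement of the new checks, for illustration only and not the repo's exports:

// Standalone restatement of the new helpers (assumption: not the repo's exports).
const TEST_CHARS = "09azAZ";

function isDigit(input: string, i: number): boolean {
    const value = input.charCodeAt(i);
    return value >= TEST_CHARS.charCodeAt(0) && value <= TEST_CHARS.charCodeAt(1);
}

function isChar(input: string, i: number): boolean {
    const value = input.charCodeAt(i);
    return (value >= TEST_CHARS.charCodeAt(2) && value <= TEST_CHARS.charCodeAt(3)) ||
           (value >= TEST_CHARS.charCodeAt(4) && value <= TEST_CHARS.charCodeAt(5));
}

console.log(isDigit("a1", 1), isChar("a1", 1)); // true false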
@@ -174,7 +177,6 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
         else if(input.startsWith("/*", i)) {
             for(i++, position++; i < input.length-1; i++, position++) {
                 if(input[i] === "*" && input[i+1] === "/") {
-                    tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                     i++;
                     position++;
                     break;
@@ -268,6 +270,10 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
             case "-":
                 tokens.push(new Token(TokenType.BETWEEN, line, position));
                 break;
+            case "+":
+                tokens.push(new Token(TokenType.KEYWORD_OR, line, position));
+                tokens.push(new Token(TokenType.KEYWORD_MORE, line, position));
+                break;
             case "\n":
                 tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
                 break;
@@ -281,24 +287,22 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
                 break;
             default:
                 // is digit? build up a number
-                if(is_digit(input[i])) {
+                if(is_digit(input, i)) {
                     let digits = input[i];

-                    do {
-                        i++; position++;
-                        digits += input[i];
-                    } while(i+1 < input.length && is_digit(input[i+1]));
+                    for(; i+1 < input.length && is_digit(input, i+1); i++, position++) {
+                        digits += input[i+1];
+                    }

                     tokens.push(new Token(TokenType.NUMBER, line, position, digits));
                 }
                 // is char? build up a word
-                else if(is_char(input[i])) {
+                else if(is_char(input, i)) {
                     let text = input[i];

-                    do {
-                        i++; position++;
-                        text += input[i];
-                    } while(i+1 < input.length && is_char(input[i+1]));
+                    for(; i+1 < input.length && is_char(input, i+1); i++, position++) {
+                        text += input[i+1];
+                    }

                     const keyword_text = text.toLowerCase();

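The do/while loops above appear to be the bug named in the commit message: the body ran before the check, so a single-digit number or single-letter word always swallowed the character that followed it (for example, "1+" was collected as the number "1+"). A hypothetical standalone demo, not repository code, using the new-style digit check for both variants:

const sample = "match 1+ thing";
const isDigit = (s: string, i: number): boolean => {
    const c = s.charCodeAt(i);
    return c >= 48 && c <= 57; // '0'..'9'
};

// Old shape: appends sample[i] before checking, so the "+" is pulled into the number.
let i = 6;                      // index of "1"
let digitsOld = sample[i];
do {
    i++;
    digitsOld += sample[i];
} while (i + 1 < sample.length && isDigit(sample, i + 1));
console.log(digitsOld);         // "1+"

// New shape: only consumes the next character after confirming it is a digit.
i = 6;
let digitsNew = sample[i];
for (; i + 1 < sample.length && isDigit(sample, i + 1); i++) {
    digitsNew += sample[i + 1];
}
console.log(digitsNew);         // "1"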
@@ -348,7 +352,7 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
                     }
                 }
                 else {
-                    errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position));
+                    errors.push(new TokenError(`Unknown character in text: "${input[i]}" (${input.charCodeAt(i)})`, line, position));
                 }
                 break;
         }
@@ -52,13 +52,23 @@ export class TokenError extends Error {
         super(message);
     }

-    public to_string() {
+    public to_string(): string {
         return `${this.line}:${this.position} ${this.message}`;
     }
 }

 export class Token {
-    constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
-
+    constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
+        /* nothing required */
     }
+
+    public to_string(): string {
+        let str = `${this.line}:${this.position} ${TokenType[this.type]}`;
+
+        if (this.token_string) {
+            str += ` "${this.token_string}"`;
+        }
+
+        return str;
+    }
 }
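Tying this back to the debug loop in the script change above, to_string() produces one line per token. A minimal standalone sketch of the output shape; the TokenType enum here is a stand-in limited to names that appear in this diff:

// Minimal standalone sketch of the output shape; TokenType here is a stand-in enum,
// not the project's full definition.
enum TokenType { NUMBER, KEYWORD_OR, KEYWORD_MORE }

class Token {
    constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
        /* nothing required */
    }

    public to_string(): string {
        let str = `${this.line}:${this.position} ${TokenType[this.type]}`;
        if (this.token_string) {
            str += ` "${this.token_string}"`;
        }
        return str;
    }
}

console.log(new Token(TokenType.NUMBER, 0, 6, "1").to_string());   // 0:6 NUMBER "1"
console.log(new Token(TokenType.KEYWORD_MORE, 0, 7).to_string());  // 0:7 KEYWORD_MORE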