mirror of
https://github.com/pdemian/human2regex.git
synced 2025-05-16 12:30:09 -07:00
Bug fix in tokenizer
This commit is contained in:
parent
ae84a52821
commit
9f46d1246c
@ -17,6 +17,11 @@
|
|||||||
],
|
],
|
||||||
"rules": {
|
"rules": {
|
||||||
"@typescript-eslint/no-inferrable-types": "off",
|
"@typescript-eslint/no-inferrable-types": "off",
|
||||||
|
"@typescript-eslint/explicit-function-return-type": "error",
|
||||||
|
"no-magic-numbers": [
|
||||||
|
"warn",
|
||||||
|
{ "ignoreArrayIndexes": true, "ignore": [0,1,2,3,4,5,6,7,8,9]}
|
||||||
|
],
|
||||||
"curly": "warn",
|
"curly": "warn",
|
||||||
"no-loss-of-precision": "error",
|
"no-loss-of-precision": "error",
|
||||||
"default-case-last": "warn",
|
"default-case-last": "warn",
|
||||||
|
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,4 +1,3 @@
|
|||||||
# Node build artifacts
|
# Node build artifacts
|
||||||
node_modules/
|
node_modules/
|
||||||
npm-debug.log
|
npm-debug.log
|
||||||
src/*.js
|
|
@ -12,6 +12,10 @@ $(function() {
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
const opts = new TokenizerOptions();
|
const opts = new TokenizerOptions();
|
||||||
const res = tokenize("match 1+ thing from thingy", opts);
|
const result = tokenize("match /* 9+ */ 1+ optionally 1..3 0-zero then //comment match", opts);
|
||||||
|
|
||||||
console.log(res);
|
for(const r of result.tokens) {
|
||||||
|
console.log(r.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(result.errors);
|
@ -69,7 +69,7 @@ const escape_sequences = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export class TokenizerOptions {
|
export class TokenizerOptions {
|
||||||
public convert_spaces_to_tabs: boolean = false;
|
public convert_spaces_to_tabs: boolean = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g);
|
const escape_sequence_hex_regex = new RegExp(/[0-9A-Fa-f]/g);
|
||||||
@ -120,18 +120,21 @@ function escape_sequence_mapper(input: string, i : number) : { code: string, rea
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function is_digit(input: string) : boolean {
|
const test_chars = "09azAZ";
|
||||||
|
|
||||||
|
function is_digit(input: string, i: number) : boolean {
|
||||||
//return /[0-9]/g.test(input);
|
//return /[0-9]/g.test(input);
|
||||||
const value = input.charCodeAt(0);
|
const value = input.charCodeAt(i);
|
||||||
return value >= 48 && value <= 57;
|
return value >= test_chars.charCodeAt(0) && value <= test_chars.charCodeAt(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
function is_char(input: string) : boolean {
|
function is_char(input: string, i: number) : boolean {
|
||||||
//return input.toUpperCase() != input.toLowerCase();
|
//return input.toUpperCase() != input.toLowerCase();
|
||||||
//return /[a-zA-Z]/g.test(input);
|
//return /[a-zA-Z]/g.test(input);
|
||||||
|
|
||||||
const value = input.charCodeAt(0);
|
const value = input.charCodeAt(i);
|
||||||
return ((value >= 65 && value <= 90) || (value >= 97 && value <= 122));
|
return ((value >= test_chars.charCodeAt(2) && value <= test_chars.charCodeAt(3)) ||
|
||||||
|
(value >= test_chars.charCodeAt(4) && value <= test_chars.charCodeAt(5)));
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Basic Tokenizer */
|
/* Basic Tokenizer */
|
||||||
@ -174,7 +177,6 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
|
|||||||
else if(input.startsWith("/*", i)) {
|
else if(input.startsWith("/*", i)) {
|
||||||
for(i++, position++; i < input.length-1; i++, position++) {
|
for(i++, position++; i < input.length-1; i++, position++) {
|
||||||
if(input[i] === "*" && input[i+1] === "/") {
|
if(input[i] === "*" && input[i+1] === "/") {
|
||||||
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
|
||||||
i++;
|
i++;
|
||||||
position++;
|
position++;
|
||||||
break;
|
break;
|
||||||
@ -268,6 +270,10 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
|
|||||||
case "-":
|
case "-":
|
||||||
tokens.push(new Token(TokenType.BETWEEN, line, position));
|
tokens.push(new Token(TokenType.BETWEEN, line, position));
|
||||||
break;
|
break;
|
||||||
|
case "+":
|
||||||
|
tokens.push(new Token(TokenType.KEYWORD_OR, line, position));
|
||||||
|
tokens.push(new Token(TokenType.KEYWORD_MORE, line, position));
|
||||||
|
break;
|
||||||
case "\n":
|
case "\n":
|
||||||
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
tokens.push(new Token(TokenType.END_OF_STATEMENT, line, position));
|
||||||
break;
|
break;
|
||||||
@ -281,24 +287,22 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
|
|||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
// is digit? build up a number
|
// is digit? build up a number
|
||||||
if(is_digit(input[i])) {
|
if(is_digit(input, i)) {
|
||||||
let digits = input[i];
|
let digits = input[i];
|
||||||
|
|
||||||
do {
|
for(; i+1 < input.length && is_digit(input, i+1); i++, position++) {
|
||||||
i++; position++;
|
digits += input[i+1];
|
||||||
digits += input[i];
|
}
|
||||||
} while(i+1 < input.length && is_digit(input[i+1]));
|
|
||||||
|
|
||||||
tokens.push(new Token(TokenType.NUMBER, line, position, digits));
|
tokens.push(new Token(TokenType.NUMBER, line, position, digits));
|
||||||
}
|
}
|
||||||
// is char? build up a word
|
// is char? build up a word
|
||||||
else if(is_char(input[i])) {
|
else if(is_char(input, i)) {
|
||||||
let text = input[i];
|
let text = input[i];
|
||||||
|
|
||||||
do {
|
for(; i+1 < input.length && is_char(input, i+1); i++, position++) {
|
||||||
i++; position++;
|
text += input[i+1];
|
||||||
text += input[i];
|
}
|
||||||
} while(i+1 < input.length && is_char(input[i+1]));
|
|
||||||
|
|
||||||
const keyword_text = text.toLowerCase();
|
const keyword_text = text.toLowerCase();
|
||||||
|
|
||||||
@ -348,7 +352,7 @@ export function tokenize(input: string, options: TokenizerOptions) : { tokens: T
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
errors.push(new TokenError(`Unknown character in text: ${input.charCodeAt(i)}`, line, position));
|
errors.push(new TokenError(`Unknown character in text: "${input[i]}" (${input.charCodeAt(i)})`, line, position));
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -52,13 +52,23 @@ export class TokenError extends Error {
|
|||||||
super(message);
|
super(message);
|
||||||
}
|
}
|
||||||
|
|
||||||
public to_string() {
|
public to_string(): string {
|
||||||
return `${this.line}:${this.position} ${this.message}`;
|
return `${this.line}:${this.position} ${this.message}`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export class Token {
|
export class Token {
|
||||||
constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
|
constructor(public type: TokenType, public line: number, public position: number, public token_string?: string) {
|
||||||
|
/* nothing required */
|
||||||
|
}
|
||||||
|
|
||||||
|
public to_string(): string {
|
||||||
|
let str = `${this.line}:${this.position} ${TokenType[this.type]}`;
|
||||||
|
|
||||||
|
if (this.token_string) {
|
||||||
|
str += ` "${this.token_string}"`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return str;
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
x
Reference in New Issue
Block a user