import { scan_identifier } from "./identifier_lexer.ts";
import { scan_number } from "./number_lexer.ts";
import { scan_string } from "./string_lexer.ts";
import { is_digit, is_lowercase, is_uppercase } from "./utils.ts";

export type Token = {
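    /** The raw text (lexeme) covered by this token */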
    v: string,
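    /** The token category; an empty string marks a plain/default token */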
    token_type: string,
};

/**
 * Lexes a string of THP code and returns an array of tokens. Unlike a regular
 * lexer, whitespace and other characters are not ignored; they are instead
 * collected into a default token.
 *
 * This lexer implements a subset of the grammar defined in the THP language
 * specification, recognizing only the following tokens:
 * - Identifier
 * - Datatype
 * - String
 * - Number
 * - Single-line comment
 * - Multi-line comment
 * - Keywords
 *
 * @param code Code to lex
 * @returns An array of all the tokens found
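 *
 * @example
 * // A sketch of the expected output shape. The exact token_type values
 * // ("identifier", "number", ...) come from the scan_* helpers and are
 * // assumptions here; default tokens always carry an empty token_type.
 * lex("x1 = 10");
 * // => [ { v: "x1", token_type: "identifier" },
 * //      { v: " = ", token_type: "" },
 * //      { v: "10", token_type: "number" } ]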
 */
export function lex(code: string): Array<Token> {
    const code_len = code.length;
    const tokens: Array<Token> = [];

    let current_pos = 0;
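    // characters that don't begin any recognized token accumulate here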
    let current_default_token = "";

    while (current_pos < code_len) {
        const c = code[current_pos];

        // try to scan a number
        if (is_digit(c)) {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            // lex a number
            const [token, next] = scan_number(code, current_pos);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan an identifier/keyword
        else if (is_lowercase(c) || c === "_") {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            const [token, next] = scan_identifier(code, current_pos);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan a datatype
        else if (is_uppercase(c)) {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }
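
            // the `true` argument presumably switches scan_identifier into
            // datatype mode (capitalized identifiers); inferred from this branch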
            const [token, next] = scan_identifier(code, current_pos, true);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan a string
        else if (c === "\"") {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            const [token, next] = scan_string(code, current_pos);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan a single-line comment
        else if (c === "/" && code[current_pos + 1] === "/") {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            let comment = "";
            let pos = current_pos;

            while (pos < code_len) {
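                // stop at the newline; it is not consumed here and will be
                // collected into the next default token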
                const char = code[pos];

                if (char === "\n") {
                    break;
                }

                comment += char;
                pos++;
            }

            tokens.push({ v: comment, token_type: "comment" });
            current_pos = pos;
            continue;
        }
        // replace < with &lt; so the output can be embedded in HTML
        else if (c === "<") {
            current_default_token += "&lt;";
            current_pos++;
            continue;
        }

        current_default_token += c;
        current_pos++;
    }

    // if there was a default token, push it to the tokens array
    if (current_default_token !== "") {
        tokens.push({ v: current_default_token, token_type: "" });
        current_default_token = "";
    }

    return tokens;
}