// Source: thp-lang.org/lexer/lexer.ts
import { scan_identifier } from "./identifier_lexer.ts";
import { scan_number } from "./number_lexer.ts";
import { scan_string } from "./string_lexer.ts";
import { is_digit, is_lowercase, is_uppercase } from "./utils.ts";
/**
 * A single token produced by the lexer.
 * Note that whitespace/unrecognized runs are also emitted as tokens,
 * with an empty `token_type` (see `lex` below).
 */
export type Token = {
// The raw text of the token (may contain `&lt;` in place of `<` for default tokens).
v: string,
// The token category, e.g. "comment"; empty string for default (unclassified) text.
token_type: string,
};
2024-03-26 23:05:58 +00:00
/**
* Lexes a string of THP code, and returns an array of tokens. Unlike a regular
* lexer, whitespace and other characters are not ignored, and are instead treated
* as a default token.
*
* This lexer implements a subset of the grammar defined in the THP language specification,
* only recognizing the following tokens:
* - Identifier
* - Datatype
* - String
* - Number
* - Single line comment
* - Multi line comment
* - Keywords
*
* @param code Code to lex
* @returns An array of all the tokens found
*/
export function lex(code: string): Array<Token> {
const code_len = code.length;
const tokens: Array<Token> = [];
let current_pos = 0;
let current_default_token = "";
while (current_pos < code_len) {
const c = code[current_pos];
2024-03-27 13:12:32 +00:00
// try to scan a number
2024-03-26 23:05:58 +00:00
if (is_digit(c)) {
// if the current default token is not empty, push it to the tokens array
if (current_default_token !== "") {
tokens.push({ v: current_default_token, token_type: "" });
2024-03-26 23:05:58 +00:00
current_default_token = "";
}
// lex a number
2024-03-27 13:12:32 +00:00
const [token, next] = scan_number(code, current_pos);
current_pos = next;
tokens.push(token);
continue;
}
// try to scan an identifier/keyword
else if (is_lowercase(c) || c === "_") {
// if the current default token is not empty, push it to the tokens array
if (current_default_token !== "") {
tokens.push({ v: current_default_token, token_type: "" });
current_default_token = "";
}
const [token, next] = scan_identifier(code, current_pos);
2024-03-26 23:05:58 +00:00
current_pos = next;
tokens.push(token);
continue;
}
2024-03-27 13:18:31 +00:00
// try to scan a datatype
else if (is_uppercase(c)) {
// if the current default token is not empty, push it to the tokens array
if (current_default_token !== "") {
tokens.push({ v: current_default_token, token_type: "" });
current_default_token = "";
}
const [token, next] = scan_identifier(code, current_pos, true);
current_pos = next;
tokens.push(token);
continue;
}
2024-03-27 13:36:34 +00:00
// try to scan a string
else if (c === "\"") {
// if the current default token is not empty, push it to the tokens array
if (current_default_token !== "") {
tokens.push({ v: current_default_token, token_type: "" });
current_default_token = "";
}
const [token, next] = scan_string(code, current_pos);
current_pos = next;
tokens.push(token);
continue;
}
// try to scan a comment
else if (c === "/" && code[current_pos + 1] === "/") {
// if the current default token is not empty, push it to the tokens array
if (current_default_token !== "") {
tokens.push({ v: current_default_token, token_type: "" });
current_default_token = "";
}
let comment = "";
let pos = current_pos;
while (pos < code_len) {
const char = code[pos];
if (char === "\n") {
break;
}
comment += char;
pos++;
}
tokens.push({ v: comment, token_type: "comment" });
current_pos = pos;
continue;
}
// replace < with &lt;
else if (c === "<") {
current_default_token += "&lt;";
2024-03-26 23:05:58 +00:00
current_pos++;
continue;
2024-03-26 23:05:58 +00:00
}
current_default_token += c;
current_pos++;
2024-03-26 23:05:58 +00:00
}
// if there was a default token, push it to the tokens array
if (current_default_token !== "") {
tokens.push({ v: current_default_token, token_type: "" });
2024-03-26 23:05:58 +00:00
current_default_token = "";
}
return tokens;
}