import { scan_identifier } from "./identifier_lexer.ts";
import { scan_number } from "./number_lexer.ts";
import { scan_string } from "./string_lexer.ts";
import { is_digit, is_lowercase, is_uppercase } from "./utils.ts";

export type Token = {
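    /** The raw text (lexeme) covered by this token */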
    v: string,
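    /** The token category; an empty string marks a plain/default token */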
    token_type: string,
};

/**
 * Lexes a string of THP code and returns an array of tokens. Unlike a regular
 * lexer, whitespace and other characters are not ignored; they are instead
 * collected into a default token.
 *
 * This lexer implements a subset of the grammar defined in the THP language
 * specification, recognizing only the following tokens:
 * - Identifier
 * - Datatype
 * - String
 * - Number
 * - Single-line comment
 * - Multi-line comment
 * - Keywords
 *
 * @param code Code to lex
 * @returns An array of all the tokens found
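 *
 * @example
 * // A sketch of the expected output shape. The exact token_type values
 * // ("identifier", "number", ...) come from the scan_* helpers and are
 * // assumptions here; default tokens always carry an empty token_type.
 * lex("x1 = 10");
 * // => [ { v: "x1", token_type: "identifier" },
 * //      { v: " = ", token_type: "" },
 * //      { v: "10", token_type: "number" } ]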
 */
export function lex(code: string): Array<Token> {
    const code_len = code.length;
    const tokens: Array<Token> = [];

    let current_pos = 0;
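    // characters that don't begin any recognized token accumulate here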
    let current_default_token = "";

    while (current_pos < code_len) {
        const c = code[current_pos];

        // try to scan a number
        if (is_digit(c)) {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            // lex a number
            const [token, next] = scan_number(code, current_pos);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan an identifier/keyword
        else if (is_lowercase(c) || c === "_") {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            const [token, next] = scan_identifier(code, current_pos);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan a datatype
        else if (is_uppercase(c)) {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }
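
            // the `true` argument presumably switches scan_identifier into
            // datatype mode (capitalized identifiers); inferred from this branch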
            const [token, next] = scan_identifier(code, current_pos, true);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan a string
        else if (c === "\"") {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            const [token, next] = scan_string(code, current_pos);
            current_pos = next;
            tokens.push(token);
            continue;
        }
        // try to scan a single-line comment
        else if (c === "/" && code[current_pos + 1] === "/") {
            // if the current default token is not empty, push it to the tokens array
            if (current_default_token !== "") {
                tokens.push({ v: current_default_token, token_type: "" });
                current_default_token = "";
            }

            let comment = "";
            let pos = current_pos;

            while (pos < code_len) {
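                // stop at the newline; it is not consumed here and will be
                // collected into the next default token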
                const char = code[pos];

                if (char === "\n") {
                    break;
                }

                comment += char;
                pos++;
            }

            tokens.push({ v: comment, token_type: "comment" });
            current_pos = pos;
            continue;
        }
        // replace < with &lt; so the output can be embedded in HTML
        else if (c === "<") {
            current_default_token += "&lt;";
            current_pos++;
            continue;
        }

        current_default_token += c;
        current_pos++;
    }

    // if there was a default token, push it to the tokens array
    if (current_default_token !== "") {
        tokens.push({ v: current_default_token, token_type: "" });
        current_default_token = "";
    }

    return tokens;
}