thp-lang.org/src/lexer/highlighter.ts

import { spawn } from "node:child_process";
import { leftTrimDedent } from "../components/utils";
import { HighlightLevel } from "./types";
import type { LexError, SyntaxError, Token, TokenizeResult, TokenType } from "./types";

const error_classes = "underline underline-offset-4 decoration-wavy decoration-red-500";

/**
 * Highlights a code snippet using the native compiler's tokenizer.
 *
 * The snippet is first normalized with `leftTrimDedent`, then tokenized and
 * turned into HTML. Any exception raised while tokenizing or highlighting
 * is reported as a "Fatal Compiler" error instead of being rethrown.
 *
 * @param code The raw snippet
 * @param level Which kinds of compiler errors should be displayed
 * @returns A tuple `[html, error kind, error message]`
 */
export async function native_highlighter(code: string, level = HighlightLevel.Lexic): Promise<[string, string, string | null]> {
    const source = leftTrimDedent(code).join("\n");

    try {
        const tokenized = await native_lex(source);
        const rendered = highlight_syntax(source, tokenized, level);
        return rendered;
    } catch (failure) {
        // Both the lexer invocation and the highlighting step end up here
        return compiler_error(source, failure as Error);
    }
}

/**
 * Chooses how to render the compiler result according to the highlight level.
 *
 * - A semantic error is displayed only with `HighlightLevel.Semantic`.
 * - A syntax error is displayed with `HighlightLevel.Semantic` or
 *   `HighlightLevel.Syntactic`.
 * - A lex error is always displayed.
 *
 * When an error is not displayed, the tokens that come with it are
 * highlighted as if the code had no errors.
 *
 * @param code The source code that was tokenized
 * @param result The result returned by the compiler
 * @param level Which kinds of errors should be displayed
 * @returns A tuple `[html, error kind, error message]`. Without a displayed
 * error the kind is `""` and the message is `null`.
 * @throws Error if the result doesn't contain any of the known cases
 */
function highlight_syntax(code: string, result: TokenizeResult, level: HighlightLevel): [string, string, string | null] {
    // Renders tokens without reporting any error
    const without_error = (tokens: Array<Token>): [string, string, string | null] => {
        return [highlight_tokens(code, tokens), "", null];
    };

    if (result.SemanticError) {
        const [tokens, semanticError] = result.SemanticError;

        return level === HighlightLevel.Semantic
            ? semantic_error_highlighter(code, tokens, semanticError.Semantic!)
            : without_error(tokens);
    }

    if (result.SyntaxError) {
        const [tokens, syntaxError] = result.SyntaxError;
        const shows_syntax_errors = level === HighlightLevel.Semantic || level === HighlightLevel.Syntactic;

        return shows_syntax_errors
            ? syntax_error_highlighter(code, tokens, syntaxError.Syntax!)
            : without_error(tokens);
    }

    if (result.LexError) {
        // There is no error level that bypasses a lex error
        return lex_error_highlighter(code, result.LexError!.Lex!);
    }

    if (result.Ok) {
        return without_error(result.Ok);
    }

    console.error(result);
    throw new Error("Web page error: The compiler returned a case that wasn't handled.");
}


/**
 * Highlights code that has a lexic error.
 *
 * No tokens are available in this case, so the source code is emitted as is
 * (HTML-escaped) and only the character at the error position is wrapped
 * in an underlined `<span>`.
 *
 * @param code The source code that failed to tokenize
 * @param error The lex error reported by the compiler
 * @returns A tuple `[html, "Lexical", error message]`
 */
function lex_error_highlighter(code: string, error: LexError): [string, string, string] {
    // The result is used as HTML, so the raw source must not be able to open tags or entities.
    // `&` is replaced first so that the entities created afterwards are not escaped again.
    const escape_html = (text: string): string => text
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;");

    // Create a single error token
    const err_pos = error.position;
    const before_err = escape_html(code.substring(0, err_pos));
    // `substring` yields "" when the error sits at the end of the input,
    // whereas indexing would yield `undefined` and render it literally
    const err_str = escape_html(code.substring(err_pos, err_pos + 1));
    const after_err = escape_html(code.substring(err_pos + 1));

    const token = `<span class="token ${error_classes}">${err_str}</span>`;

    const all = `${before_err}${token}${after_err}`;
    const [error_line, error_column] = absolute_to_line_column(code, error.position);

    return [all, "Lexical", error.reason + ` at line ${error_line}:${error_column} `]
}

/**
 * Highlights code that has a syntax error, underlining the tokens
 * inside the range reported by the error.
 *
 * @param code The source code
 * @param tokens The tokens of the source code
 * @param error The syntax error reported by the compiler
 * @returns A tuple `[html, "Syntax", error message]`
 */
function syntax_error_highlighter(code: string, tokens: Array<Token>, error: SyntaxError): [string, string, string] {
    const { reason, error_start, error_end } = error;

    const [line, column] = absolute_to_line_column(code, error_start);
    const html = highlight_tokens(code, tokens, error_start, error_end);

    return [html, "Syntax", `${reason} at line ${line}:${column}`];
}

/**
 * Highlights code that has a semantic error, underlining the tokens
 * inside the range reported by the error.
 *
 * @param code The source code
 * @param tokens The tokens of the source code
 * @param error The semantic error reported by the compiler
 * @returns A tuple `[html, "Semantic", error message]`
 */
function semantic_error_highlighter(code: string, tokens: Array<Token>, error: SyntaxError): [string, string, string] {
    const { reason, error_start, error_end } = error;

    const [line, column] = absolute_to_line_column(code, error_start);
    const html = highlight_tokens(code, tokens, error_start, error_end);

    return [html, "Semantic", `${reason} at line ${line}:${column}`];
}

/**
 * Builds the result used when the compiler could not be run or crashed.
 * The code is returned without any highlighting.
 *
 * @param code The source code
 * @param error The error that was thrown
 * @returns A tuple `[code, "Fatal Compiler", error message]`
 */
function compiler_error(code: string, error: Error): [string, string, string] {
    const { message } = error;
    return [code, "Fatal Compiler", message];
}

/**
 * Renders the source code as HTML, wrapping every token in a
 * `<span class="token ...">`. Tokens that lie completely inside the range
 * `error_start..error_end` also receive the error underline classes.
 *
 * Text found between two tokens is copied from the input unchanged.
 *
 * @param input The original source code
 * @param tokens The list of tokens
 * @param error_start Absolute position from where the error starts.
 * @param error_end Absolute position to where the error ends.
 * @returns The HTML for the code up to the end of the last non-empty token
 */
function highlight_tokens(input: string, tokens: Array<Token>, error_start = -1, error_end = -1): string {
    const pieces: Array<string> = [];

    // Position in the input up to which everything was already emitted
    let cursor = 0;

    for (const token of tokens) {
        // There are some tokens that are empty, ignore them
        if (token.value === "") {
            continue;
        }

        const start = token.position;
        const raw_end = start + token.value.length;
        const extra_classes = (start >= error_start && raw_end <= error_end) ? error_classes : "";

        // Everything between the previous token and this one
        pieces.push(input.slice(cursor, start));

        // Some tokens require processing (like multiline comments),
        // which may also move the position where the token ends
        const [html_value, real_end] = process_token_value_and_end(token.value, token.token_type, raw_end);
        const css_class = translate_token_type(token.token_type, html_value);
        pieces.push(`<span class="token ${css_class} ${extra_classes}">${html_value}</span>`);

        cursor = real_end;
    }

    return pieces.join("");
}

/**
 * Transform an absolute position in source code to a line:column combination.
 *
 * Both line and column are 1-based: position 0 is reported as `[1, 1]`, and
 * the first character after a newline is column 1 of the next line.
 * A position that points to a newline character belongs to the line
 * that the newline terminates.
 *
 * @param input the source code
 * @param absolute the absolute position (0-based), expected to be >= 0
 * @returns a tuple `[line, column]`
 */
function absolute_to_line_column(input: string, absolute: number): [number, number] {
    let line_count = 1;
    // Index of the newline that precedes the current line. It starts at -1
    // (a virtual newline before the input) so that columns on the first
    // line are 1-based, just like on every other line.
    let last_newline_pos = -1;

    // Only newlines located before the position affect the result
    const scan_end = Math.min(absolute, input.length);
    for (let i = 0; i < scan_end; i += 1) {
        if (input[i] === "\n") {
            line_count += 1;
            last_newline_pos = i;
        }
    }

    return [line_count, absolute - last_newline_pos];
}

/**
 * Rebuilds the source representation of a token and escapes it for HTML.
 *
 * Certain tokens store values that differ from the source code representation.
 * For example, the multiline comment token stores the content of the comment
 * without `/*` and `* /`. This function adds the delimiters back (also the
 * double quotes of strings) and moves the end position accordingly.
 *
 * @param value The value of the token
 * @param token_type The type of the token, used to know if it needs preprocessing
 * @param first_end The position where the token ends according to the token value
 * @returns A tuple with the HTML-escaped text of the token, and the position
 * where the token ends once the restored delimiters are taken into account
 */
function process_token_value_and_end(value: string, token_type: TokenType, first_end: number): [string, number] {
    let token_value = value;
    let new_end = first_end;
    if (token_type === "MultilineComment") {
        token_value = `/*${token_value}*/`;
        new_end += 4;
    } else if (token_type === "String") {
        token_value = `"${token_value}"`;
        new_end += 2;
    }

    // Escape html and return.
    // `&` has to be replaced first: otherwise source text such as `&lt;`
    // would be displayed as `<`, and replacing it last would escape the
    // entities created for `<` and `>` a second time.
    const escaped = token_value
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;");

    return [escaped, new_end];
}

/**
 * Maps a token type of the compiler to the CSS class used to color it.
 *
 * Identifiers whose text is one of the language keywords are
 * classified as `keyword`. Token types without a mapping are
 * returned unchanged.
 *
 * @param tt The type of the token
 * @param value The text of the token, only used for identifiers
 * @returns The CSS class for the token
 */
function translate_token_type(tt: TokenType, value: string): string {
    if (tt === "Identifier") {
        const reserved_words = new Set([
            "throws", "extends", "constructor", "case", "static", "const",
            "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do",
            "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw",
            "try", "while", "type", "match", "with", "of", "abstract", "class", "interface",
            "private", "protected", "pub", "override", "open", "init", "val", "var", "mut", "clone",
        ]);

        return reserved_words.has(value) ? "keyword" : "identifier";
    }

    // A Map is used instead of a plain object so that a token type can
    // never match a property inherited from Object.prototype
    const css_classes = new Map<string, string>([
        ["Datatype", "class-name"],
        ["Int", "number"],
        ["Float", "number"],
        ["String", "string"],
        ["Comment", "comment"],
        ["MultilineComment", "comment"],
        // keywords:
        ["VAL", "keyword"],
        ["VAR", "keyword"],
        ["FUN", "keyword"],
    ]);

    return css_classes.get(tt) ?? tt;
}

/**
 * Tokenizes source code with the native compiler.
 *
 * Spawns the binary configured in `THP_BINARY` with the `tokenize` argument,
 * writes the code to its stdin and parses its stdout as JSON once
 * the process has finished.
 *
 * @param code The source code to tokenize
 * @returns A promise with the parsed output of the compiler. It rejects if
 * `THP_BINARY` is not set, if the process cannot be spawned, if it exits
 * with a non-zero code (the error message is its stderr output), or if
 * its stdout is not valid JSON.
 */
const native_lex = (code: string) => new Promise<TokenizeResult>((resolve, reject) => {
    // Get binary path from .env
    const binary = import.meta.env.THP_BINARY;
    if (!binary) {
        // Throwing inside the executor rejects the promise
        throw new Error("THP_BINARY not set in .env");
    }

    const subprocess = spawn(binary, ["tokenize"]);
    let response = "";
    let error = "";

    // Emitted when the process cannot be spawned (e.g. the binary doesn't exist).
    // Without a listener this event would be thrown as an uncaught exception.
    subprocess.on("error", reject);

    // Decode at the stream level, so that a multi-byte character that is
    // split across two chunks is not corrupted
    subprocess.stdout.setEncoding("utf8");
    subprocess.stderr.setEncoding("utf8");

    subprocess.stdout.on("data", (chunk: string) => {
        response += chunk;
    });

    subprocess.stderr.on("data", (chunk: string) => {
        error += chunk;
    });

    subprocess.on("close", (exit_code) => {
        if (exit_code !== 0) {
            reject(new Error(error));
            return;
        }

        // An exception thrown inside this callback would not reject the promise,
        // so invalid output has to be turned into a rejection explicitly
        try {
            resolve(JSON.parse(response));
        } catch (parse_error) {
            const reason = parse_error instanceof Error ? parse_error.message : String(parse_error);
            reject(new Error(`The compiler returned invalid JSON: ${reason}`));
        }
    });

    subprocess.stdin.write(code);
    subprocess.stdin.end();
})
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`import { spawn } from "node:child_process";`
			`import { leftTrimDedent } from "../components/utils";`
refactor: use a highlight level to emit errors 2024-08-26 13:43:36 +00:00			`import { HighlightLevel } from "./types";`
feat: small ui improvements 2024-08-26 14:00:43 +00:00			`import type { LexError, SyntaxError, Token, TokenizeResult, TokenType } from "./types";`
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00
feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			`const error_classes = "underline underline-offset-4 decoration-wavy decoration-red-500";`
Complete minimal flow for editor highlighting 2024-03-26 23:29:52 +00:00
refactor: use a highlight level to emit errors 2024-08-26 13:43:36 +00:00			`export async function native_highlighter(code: string, level = HighlightLevel.Lexic): Promise<[string, string, string \| null]> {`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`let formatted_code = leftTrimDedent(code).join("\n");`

Add rendering of compiler panics in code snippets 2024-07-29 00:54:15 +00:00			`try {`
refactor: use a highlight level to emit errors 2024-08-26 13:43:36 +00:00			`let result = await native_lex(formatted_code);`
			`return highlight_syntax(formatted_code, result, level);`
Add rendering of compiler panics in code snippets 2024-07-29 00:54:15 +00:00			`} catch (error) {`
			`return compiler_error(formatted_code, error as Error);`
			`}`
refactor: use a highlight level to emit errors 2024-08-26 13:43:36 +00:00			`}`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00
refactor: use a highlight level to emit errors 2024-08-26 13:43:36 +00:00			`function highlight_syntax(code: string, result: TokenizeResult, level: HighlightLevel): [string, string, string \| null] {`
			`let tokens_final: Array<Token>;`

			`if (result.SemanticError) {`
			`const [tokens, semanticError] = result.SemanticError;`

			`if (level === HighlightLevel.Semantic) {`
			`return semantic_error_highlighter(code, tokens, semanticError.Semantic!);`
			`} else {`
			`tokens_final = tokens;`
			`}`
			`} else if (result.SyntaxError) {`
			`const [tokens, syntaxError] = result.SyntaxError;`

			`if (level === HighlightLevel.Semantic \|\| level === HighlightLevel.Syntactic) {`
			`return syntax_error_highlighter(code, tokens, syntaxError.Syntax!);`
			`} else {`
			`tokens_final = tokens;`
			`}`
			`} else if (result.LexError) {`
			`// There is no error level that bypasses a lex error`
			`return lex_error_highlighter(code, result.LexError!.Lex!);`
			`} else if (result.Ok) {`
			`tokens_final = result.Ok;`
			`} else {`
			`console.error(result);`
			`throw new Error("Web page error: The compiler returned a case that wasn't handled.");`
Complete minimal flow for editor highlighting 2024-03-26 23:29:52 +00:00			`}`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00
refactor: use a highlight level to emit errors 2024-08-26 13:43:36 +00:00			`// At this point all error cases have been handled`
			`// and tokens_final contains valid tokens.`
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00
refactor: use a highlight level to emit errors 2024-08-26 13:43:36 +00:00			`const output = highlight_tokens(code, tokens_final);`
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00			`return [output, "", null];`
			`}`


			`/**`
			`* Highlights code that has a lexic error`
			`*/`
			`function lex_error_highlighter(code: string, error: LexError): [string, string, string] {`
			`// Create a single error token`

			`const err_pos = error.position;`
			`const before_err = code.substring(0, err_pos);`
			`const err_str = code[err_pos];`
			`const after_err = code.substring(err_pos + 1);`

Highlight syntax errors in code snippets 2024-07-29 01:12:53 +00:00			const token = `<span class="token ${error_classes}">${err_str}</span>`;
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00
			const all = `${before_err}${token}${after_err}`;
feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			`const [error_line, error_column] = absolute_to_line_column(code, error.position);`
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00
			`// TODO: Transform absolute posijion (error.position) into line:column`
feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			return [all, "Lexical", error.reason + ` at line ${error_line}:${error_column} `]
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00			`}`

			`function syntax_error_highlighter(code: string, tokens: Array<Token>, error: SyntaxError): [string, string, string] {`
Highlight syntax errors in code snippets 2024-07-29 01:12:53 +00:00			`const highlighted = highlight_tokens(code, tokens, error.error_start, error.error_end);`
feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			`const [error_line, error_column] = absolute_to_line_column(code, error.error_start);`
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00
feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			const error_message = `${error.reason} at line ${error_line}:${error_column}`;
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00			`return [highlighted, "Syntax", error_message];`
			`}`

Add semantic errors display from the compiler 2024-08-02 01:33:44 +00:00			`function semantic_error_highlighter(code: string, tokens: Array<Token>, error: SyntaxError): [string, string, string] {`
			`const highlighted = highlight_tokens(code, tokens, error.error_start, error.error_end);`
feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			`const [error_line, error_column] = absolute_to_line_column(code, error.error_start);`
Add semantic errors display from the compiler 2024-08-02 01:33:44 +00:00
feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			const error_message = `${error.reason} at line ${error_line}:${error_column}`;
Add semantic errors display from the compiler 2024-08-02 01:33:44 +00:00			`return [highlighted, "Semantic", error_message];`
			`}`

Add rendering of compiler panics in code snippets 2024-07-29 00:54:15 +00:00			`function compiler_error(code: string, error: Error): [string, string, string] {`
			`return [code, "Fatal Compiler", error.message];`
			`}`

Highlight syntax errors in code snippets 2024-07-29 01:12:53 +00:00			`/**`
			`* Transforms a list of tokens into colored HTML, and underlines errors`
			`* if present`
			`* @param input The original source code`
			`* @param tokens The list of tokens`
			`* @param error_start Absolute position from where the error starts.`
			`* @param error_end Absolute position to where the error ends.`
			`* @returns`
			`*/`
			`function highlight_tokens(input: string, tokens: Array<Token>, error_start = -1, error_end = -1): string {`
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00			`const input_chars = input.split("");`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`let output = "";`

			`let current_pos = 0;`
			`for (let i = 0; i < tokens.length; i += 1) {`
			`const t = tokens[i]!;`
			`const token_start = t.position;`
			`const token_end = t.position + t.value.length;`

feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			`let is_errored = (token_start >= error_start && token_end <= error_end);`
Highlight syntax errors in code snippets 2024-07-29 01:12:53 +00:00
Changes for multiline comments 2024-07-29 22:03:09 +00:00			`// Some tokens require processing (like multiline comments)`
Highlight syntax errors in code snippets 2024-07-29 01:12:53 +00:00
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`// There are some tokens that are empty, ignore them`
			`if (t.value == "") {`
			`continue;`
			`}`

			`// Append all characters before the token`
			`output += input_chars.slice(current_pos, token_start).join("");`

			`// Append the token`
Changes for multiline comments 2024-07-29 22:03:09 +00:00			`const [token_value, new_token_end] = process_token_value_and_end(t.value, t.token_type, token_end);`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`const token_type = translate_token_type(t.token_type, token_value);`
Highlight syntax errors in code snippets 2024-07-29 01:12:53 +00:00			output += `<span class="token ${token_type} ${is_errored ? error_classes : ""}">${token_value}</span>`;
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00
Changes for multiline comments 2024-07-29 22:03:09 +00:00			`current_pos = new_token_end;`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`}`

Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00			`return output;`
Add visual indicator to lexical errors in code snippets 2024-07-28 21:51:53 +00:00			`}`

feat: improve error rendering on code snippets 2024-08-13 01:31:19 +00:00			`/**`
			`* Transform an absolute position in source code to a line:column combination.`
			`*`
			`* Both line and column are 1-based`
			`*`
			`* @param input the source code`
			`* @param absolute the absolute position`
			`*/`
			`function absolute_to_line_column(input: string, absolute: number): [number, number] {`
			`let line_count = 1;`
			`let last_newline_pos = 0;`

			`// Count lines`
			`for (let i = 0; i < input.length; i += 1) {`
			`if (i === absolute) {`
			`break;`
			`}`

			`if (input[i] === "\n") {`
			`line_count += 1;`
			`last_newline_pos = i;`
			`}`
			`}`

			`return [line_count, absolute - last_newline_pos];`
			`}`

Changes for multiline comments 2024-07-29 22:03:09 +00:00			`/**`
			`* Certain tokens store values that differ from the source code representation.`
			`* For example, the multiline comment token stores the content of the comment`
			* without `/` and ` /`, this function handles those cases.
			`*`
			`* @param value The value of the token`
			`* @param token_type The type of the token, used to know if it needs preprocessing`
			`* @param first_end The position where the token ends according to the token value`
			`* @returns`
			`*/`
			`function process_token_value_and_end(value: string, token_type: TokenType, first_end: number): [string, number] {`
			`let token_value = value;`
			`let new_end = first_end;`
			`if (token_type === "MultilineComment") {`
			token_value = `/${token_value}/`;
			`new_end += 4;`
Change highlighting to match the compiler 2024-07-31 20:25:49 +00:00			`} else if (token_type === "String") {`
			token_value = `"${token_value}"`;
			`new_end += 2;`
Changes for multiline comments 2024-07-29 22:03:09 +00:00			`}`

			`// Escape html and return`
			`return [`
			`token_value.replaceAll(/</g, "<").replaceAll(/>/g, ">"),`
			`new_end`
			`];`
			`}`
Add visual indicator to lexical errors in code snippets 2024-07-28 21:51:53 +00:00
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`function translate_token_type(tt: TokenType, value: string): string {`
Add docs on constructors 2024-07-21 23:06:01 +00:00			`const keywords = ["throws", "extends", "constructor", "case", "static", "const",`
			`"enum", "union", "loop", "use", "break", "catch", "continue", "as", "do",`
			`"else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw",`
			`"try", "while", "type", "match", "with", "of", "abstract", "class", "interface",`
			`"private", "protected", "pub", "override", "open", "init", "val", "var", "mut", "clone"];`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00
			`switch (tt) {`
			`case "Datatype":`
			`return "class-name";`
			`case "Identifier": {`
			`if (keywords.includes(value)) {`
			`return "keyword";`
			`}`

			`return "identifier";`
			`}`
			`case "Int":`
			`return "number";`
			`case "Float":`
			`return "number";`
			`case "String":`
			`return "string";`
			`case "Comment":`
Changes for multiline comments 2024-07-29 22:03:09 +00:00			`case "MultilineComment":`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`return "comment";`
			`// keywords:`
			`case "VAL":`
			`case "VAR":`
			`case "FUN":`
			`return "keyword";`
			`default:`
			`return tt;`
			`}`
Complete minimal flow for editor highlighting 2024-03-26 23:29:52 +00:00			`}`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00
Refactor syntax highlighter to new spec from the compiler 2024-07-28 23:23:51 +00:00			`const native_lex = (code: string) => new Promise<TokenizeResult>((resolve, reject) => {`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`// Get binary path from .env`
			`const binary = import.meta.env.THP_BINARY;`
			`if (!binary) {`
			`throw new Error("THP_BINARY not set in .env");`
			`}`

			`const subprocess = spawn(binary, ["tokenize"]);`
			`let response = "";`
			`let error = "";`

			`subprocess.stdin.write(code);`
			`subprocess.stdin.end();`

			`subprocess.stdout.on("data", (data) => {`
			`response += data.toString();`
			`});`

			`subprocess.stderr.on("data", (data) => {`
			`error += data.toString();`
			`});`

			`subprocess.on("close", (code) => {`
			`if (code === 0) {`
			`resolve(JSON.parse(response));`
			`} else {`
Add rendering of compiler panics in code snippets 2024-07-29 00:54:15 +00:00			`reject(new Error(error));`
Use the native lexer instead of a TS reimplementation 2024-07-06 00:12:30 +00:00			`}`
			`});`
			`})`