Use the native lexer instead of a TS reimplementation

2024-07-05 19:12:30 -05:00 · 2024-07-05 19:12:30 -05:00 · a542071af9
parent 49faed4fcb
commit a542071af9
12 changed files with 148 additions and 469 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1 @@
 THP_BINARY=/path/to/rust/thp/binary
--- a/src/components/Code.astro
+++ b/src/components/Code.astro
@ -1,10 +1,10 @@
 ---
-import { leftTrimDedent } from "./utils";
+import { native_highlighter } from "../lexer/highlighter";
 import { thp_highlighter } from "../lexer/highlighter";
 const { thpcode } = Astro.props;
-const html_code = thp_highlighter(leftTrimDedent(thpcode).join("\n"));
+
 const native_html = await native_highlighter(thpcode);
 ---
 <pre
-    class="language-thp"><code class="language-thp" set:html={html_code} /><span class="absolute top-2 right-2 inline-block text-sm select-none opacity-75">thp</span></pre>
+    class="language-thp"><code class="language-thp" set:html={native_html} /><span class="absolute top-2 right-2 inline-block text-sm select-none opacity-75">thp</span></pre>
--- a/src/lexer/highlighter.ts
+++ b/src/lexer/highlighter.ts
@ -1,13 +1,148 @@
-import { lex } from "./lexer";
+import { spawn } from "node:child_process";
 import { leftTrimDedent } from "../components/utils";
-export function thp_highlighter(code: string) {
+export interface LexResult {
-    let tokens = lex(code);
+    Ok?: Token[]
-
+    Err?: Err
    let highlighted_code = "";
    for (let token of tokens) {
        highlighted_code += `<span class="token ${token.token_type}">${token.v}</span>`;
 }
-    return highlighted_code;
+export interface Token {
    // Token kind; translate_token_type maps it to a CSS class
    token_type: TokenType
    // Text of the token; can be empty, in which case native_highlighter skips it
    value: string
    // Start offset of the token in the lexed input; native_highlighter slices the input with it
    position: number
 }
 /**
 * Token kinds expected in the output of the native lexer.
 * translate_token_type maps some of them to CSS class names and
 * passes the rest through unchanged.
 */
 type TokenType =
    "Identifier" |
    "Datatype" |
    "Int" |
    "Float" |
    "String" |
    "Operator" |
    "LeftParen" |
    "RightParen" |
    "LeftBracket" |
    "RightBracket" |
    "LeftBrace" |
    "RightBrace" |
    "NewLine" |
    "Comment" |
    "Comma" |
    "INDENT" |
    "DEDENT" |
    "VAL" |
    "VAR" |
    "EOF" |
    "FUN";
 /**
 * Error variant of a LexResult. native_highlighter serializes `Lex`
 * into the message of the Error it throws.
 */
 export interface Err {
    Lex: LexError
 }
 /**
 * Details of a lexing failure, as carried by `Err.Lex`.
 */
 export interface LexError {
    // NOTE(review): presumably an offset into the lexed input, like Token.position — confirm
    position: number
    // Description of the failure
    reason: string
 }
 /**
 * Renders THP source code as highlighted HTML using the native lexer.
 *
 * The code is dedented with `leftTrimDedent`, tokenized by `native_lex`, and
 * every non-empty token is wrapped in `<span class="token ...">`. Text that is
 * not covered by a token (before, between and after tokens) is copied through
 * unwrapped. All emitted source text is HTML-escaped, because the result is
 * meant to be injected as raw HTML.
 *
 * @param code raw THP source code
 * @returns HTML markup for the highlighted code
 * @throws Error if the lexer reports an error, or returns neither `Ok` nor `Err`
 */
 export async function native_highlighter(code: string): Promise<string> {
    const formatted_code = leftTrimDedent(code).join("\n");
    const result = await native_lex(formatted_code);
    if (result.Err) {
        throw new Error(JSON.stringify(result.Err.Lex) + "\n" + code);
    }
    const tokens = result.Ok;
    if (!tokens) {
        // Guard instead of a non-null assertion: fail with a readable message
        throw new Error("native lexer returned neither Ok nor Err\n" + code);
    }
    let output = "";
    let current_pos = 0;
    for (const t of tokens) {
        // There are some tokens that are empty, ignore them
        if (t.value === "") {
            continue;
        }
        // NOTE(review): assumes `position` counts UTF-16 code units, like JS
        // string indices — confirm against the native lexer.
        const token_start = t.position;
        const token_end = token_start + t.value.length;
        // Append all characters before the token
        output += escape_html(formatted_code.slice(current_pos, token_start));
        // Classify with the raw value; escape only the text that is emitted
        const token_type = translate_token_type(t.token_type, t.value);
        output += `<span class="token ${token_type}">${escape_html(t.value)}</span>`;
        current_pos = token_end;
    }
    // Append whatever follows the last token, so the tail of the input is not lost
    output += escape_html(formatted_code.slice(current_pos));
    return output;
 }
 /**
 * Escapes the characters that are significant in HTML text content.
 * `&` is replaced first so the entities produced for `<` and `>` stay intact.
 */
 function escape_html(text: string): string {
    return text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
 }
 /**
 * Maps a token type from the native lexer to the CSS class used for highlighting.
 *
 * @param tt token type reported by the lexer
 * @param value text of the token, used to tell keywords apart from plain identifiers
 * @returns the CSS class; token types without a dedicated class are returned as-is
 */
 function translate_token_type(tt: TokenType, value: string): string {
    // Identifiers double as keywords: the token text decides which class applies
    if (tt === "Identifier") {
        const reserved_words = ["throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class", "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone"];
        return reserved_words.includes(value) ? "keyword" : "identifier";
    }
    // Token types that always map to the same class
    const css_class_by_type = new Map<string, string>([
        ["Datatype", "class-name"],
        ["Int", "number"],
        ["Float", "number"],
        ["String", "string"],
        ["Comment", "comment"],
        ["VAL", "keyword"],
        ["VAR", "keyword"],
        ["FUN", "keyword"],
    ]);
    // Everything else uses its own token type name as the class
    return css_class_by_type.get(tt) ?? tt;
 }
 /**
 * Tokenizes THP code by running the native THP binary with the `tokenize`
 * argument, sending the code through stdin and parsing stdout as JSON.
 *
 * The binary path is read from the `THP_BINARY` environment entry.
 *
 * @param code THP source code to tokenize
 * @returns a promise with the parsed lexer output. It rejects with an Error if
 *          `THP_BINARY` is not set, the process cannot be started, it exits
 *          with a non-zero code, or its output is not valid JSON.
 */
 const native_lex = (code: string) => new Promise<LexResult>((resolve, reject) => {
    // Get binary path from .env
    const binary = import.meta.env.THP_BINARY;
    if (!binary) {
        reject(new Error("THP_BINARY not set in .env"));
        return;
    }
    const subprocess = spawn(binary, ["tokenize"]);
    let response = "";
    let error = "";
    // A failure to start the process (e.g. wrong path) is reported through the
    // "error" event; without a listener it would be thrown as an uncaught exception
    subprocess.on("error", (err) => {
        reject(err);
    });
    // Writing to a process that already exited emits an error on stdin.
    // Keep it next to stderr so it shows up in the rejection made on "close"
    subprocess.stdin.on("error", (err) => {
        error += `stdin: ${err.message}\n`;
    });
    // Decode at the stream level, so multi-byte characters split across chunks stay intact
    subprocess.stdout.setEncoding("utf8");
    subprocess.stderr.setEncoding("utf8");
    subprocess.stdout.on("data", (data) => {
        response += data;
    });
    subprocess.stderr.on("data", (data) => {
        error += data;
    });
    subprocess.on("close", (exit_code) => {
        if (exit_code !== 0) {
            reject(new Error(`THP lexer exited with code ${exit_code}: ${error}`));
            return;
        }
        // JSON.parse throws inside this callback; turn that into a rejection
        try {
            resolve(JSON.parse(response));
        } catch (e: unknown) {
            reject(new Error(`THP lexer returned invalid JSON (${String(e)}): ${response}`));
        }
    });
    subprocess.stdin.write(code);
    subprocess.stdin.end();
 })
--- a/src/lexer/identifier_lexer.test.ts
+++ b/src/lexer/identifier_lexer.test.ts
@ -1,55 +0,0 @@
 import { expect, test, describe } from "vitest";
 import { scan_identifier } from "./identifier_lexer";
 // Unit tests for scan_identifier. Every case scans from position 0 and checks
 // the returned pair: the token and the position right after it.
 describe("Identifier Lexer", () => {
    test("should return an identifier token", () => {
        const code = "a";
        const token = scan_identifier(code, 0);
        expect(token).toEqual([{ v: "a", token_type: "identifier" }, 1]);
    });
    test("should scan an underscore", () => {
        const code = "_";
        const token = scan_identifier(code, 0);
        expect(token).toEqual([{ v: "_", token_type: "identifier" }, 1]);
    });
    test("should scan an identifier with an underscore", () => {
        const code = "a_";
        const token = scan_identifier(code, 0);
        expect(token).toEqual([{ v: "a_", token_type: "identifier" }, 2]);
    });
    test("should scan an identifier that starts with an underscore", () => {
        const code = "_a";
        const token = scan_identifier(code, 0);
        expect(token).toEqual([{ v: "_a", token_type: "identifier" }, 2]);
    });
    test("should scan an identifier with numbers and uppercase letters", () => {
        const code = "aA1";
        const token = scan_identifier(code, 0);
        expect(token).toEqual([{ v: "aA1", token_type: "identifier" }, 3]);
    });
    test("should scan a keyword", () => {
        const code = "val";
        const token = scan_identifier(code, 0);
        expect(token).toEqual([{ v: "val", token_type: "keyword" }, 3]);
    });
    // The third argument marks the identifier as a datatype
    test("should scan a datatype", () => {
        const code = "Int";
        const token = scan_identifier(code, 0, true);
        expect(token).toEqual([{ v: "Int", token_type: "class-name" }, 3]);
    });
 });
--- a/src/lexer/identifier_lexer.ts
+++ b/src/lexer/identifier_lexer.ts
@ -1,44 +0,0 @@
 import type { Token } from "./lexer";
 import { is_identifier_char } from "./utils";
 /**
 * Reads an identifier that begins at `starting_position`.
 * The first character is taken as-is: the caller must have checked that it
 * can start an identifier.
 *
 * @param input the input string
 * @param starting_position index of the first character of the identifier
 * @param is_datatype if true the token is typed as "class-name", otherwise
 *                    as "keyword" or "identifier" depending on its text
 * @returns the token, and the position of the first character after it
 */
 export function scan_identifier(input: string, starting_position: number, is_datatype = false): [Token, number] {
    let text = input[starting_position]!;
    let cursor = starting_position + 1;
    // Consume characters for as long as they can be part of an identifier
    for (; cursor < input.length; cursor += 1) {
        const ch = input[cursor]!;
        if (!is_identifier_char(ch)) {
            break;
        }
        text += ch;
    }
    const token_type = is_datatype ? "class-name" : check_keyword(text);
    return [{ v: text, token_type: token_type }, cursor];
 }
 /**
 * Classifies the text of an identifier.
 *
 * @param value text of the identifier
 * @returns "keyword" if the text is a reserved word, "identifier" otherwise
 */
 function check_keyword(value: string): string {
    const reserved_words = ["throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class", "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone"];
    const is_reserved = reserved_words.some((word) => word === value);
    return is_reserved ? "keyword" : "identifier";
 }
--- a/src/lexer/lexer.test.ts
+++ b/src/lexer/lexer.test.ts
@ -1,45 +0,0 @@
 import { expect, test, describe } from "vitest";
 import { lex } from "./lexer";
 // Unit tests for lex. Text that is not a recognized token (whitespace,
 // newlines, other characters) is expected back as tokens with an empty type.
 describe("Lexer", () => {
    test("empty program should return no tokens", () => {
        const code = "";
        const tokens = lex(code);
        expect(tokens).toEqual([]);
    });
    test("program with whitespace should return a single token", () => {
        const code = " ";
        const tokens = lex(code);
        expect(tokens).toEqual([{v: " ", token_type: ""}]);
    })
    test("program with newlines should return a single token", () => {
        const code = "\n";
        const tokens = lex(code);
        expect(tokens).toEqual([{v: "\n", token_type: ""}]);
    });
    test("program with random unicode should return the same unicode", () => {
        const code = "🍕";
        const tokens = lex(code);
        expect(tokens).toEqual([{v: "🍕", token_type: ""}]);
    });
    test("should scan integers", () => {
        const code = "12345";
        const tokens = lex(code);
        expect(tokens).toEqual([{v: "12345", token_type: "number"}]);
    });
    // Whitespace on both sides of the number is grouped into one default token per side
    test("should scan integers and whitespace around", () => {
        const code = "   12345  \n  ";
        const tokens = lex(code);
        expect(tokens).toEqual([
            {v: "   ", token_type: ""},
            {v: "12345", token_type: "number"},
            {v: "  \n  ", token_type: ""},
        ]);
    });
 });
--- a/src/lexer/lexer.ts
+++ b/src/lexer/lexer.ts
@ -1,166 +0,0 @@
 import { scan_identifier } from "./identifier_lexer";
 import { scan_number } from "./number_lexer";
 import { scan_string } from "./string_lexer";
 import { is_digit, is_lowercase, is_uppercase } from "./utils";
 /**
 * A piece of lexed code, ready to be rendered by the highlighter.
 */
 export type Token = {
    // Text of the token
    v: string,
    // Class used for highlighting ("number", "string", "comment", ...).
    // It is empty for default tokens, i.e. text that is not a recognized token
    token_type: string,
 };
 /**
 * Splits THP code into tokens for syntax highlighting.
 *
 * Nothing is discarded: whitespace and any other text that is not a recognized
 * token is accumulated and emitted as a "default" token with an empty type,
 * so joining all token values gives back the code (with `<` in default text
 * replaced by `&lt;`).
 *
 * Only a subset of the THP grammar is recognized:
 * - Identifier
 * - Datatype
 * - String
 * - Number
 * - Single line comment
 * - Multi line comment
 * - Keywords
 *
 * @param code Code to lex
 * @param start Position to start lexing from
 * @returns An array of all the tokens found
 */
 export function lex(code: string, start = 0): Array<Token> {
    const total = code.length;
    const out: Array<Token> = [];
    // Text that belongs to no recognized token, kept until the next real token
    let pending = "";
    let cursor = start;

    // Emits the accumulated default text, if any, as a token with an empty type
    const flush_pending = () => {
        if (pending === "") {
            return;
        }
        out.push({ v: pending, token_type: "" });
        pending = "";
    };
    // Flushes the default text, then stores the token produced by `scan`
    // and moves the cursor to the position it reports
    const emit_with = (scan: () => [Token, number]) => {
        flush_pending();
        const [token, next] = scan();
        out.push(token);
        cursor = next;
    };

    while (cursor < total) {
        const c = code[cursor]!;
        if (is_digit(c)) {
            // number
            emit_with(() => scan_number(code, cursor));
        } else if (is_lowercase(c) || c === "_") {
            // identifier or keyword
            emit_with(() => scan_identifier(code, cursor));
        } else if (is_uppercase(c)) {
            // datatype
            emit_with(() => scan_identifier(code, cursor, true));
        } else if (c === "\"") {
            // string
            emit_with(() => scan_string(code, cursor));
        } else if (c === "/" && code[cursor + 1] === "/") {
            // single line comment: everything up to, but not including, the newline
            const newline_at = code.indexOf("\n", cursor);
            const end = newline_at === -1 ? total : newline_at;
            emit_with(() => [{ v: code.slice(cursor, end), token_type: "comment" }, end]);
        } else if (c === "/" && code[cursor + 1] === "*") {
            // multi line comment: up to and including the closing */,
            // or to the end of the code if it is never closed
            const closer_at = code.indexOf("*/", cursor + 1);
            const end = closer_at === -1 ? total : closer_at + 2;
            emit_with(() => [{ v: code.slice(cursor, end), token_type: "comment" }, end]);
        } else if (c === "<") {
            // replace < with &lt;
            pending += "&lt;";
            cursor++;
        } else {
            pending += c;
            cursor++;
        }
    }
    // if there was a default token left, emit it
    flush_pending();
    return out;
 }
--- a/src/lexer/number_lexer.test.ts
+++ b/src/lexer/number_lexer.test.ts
@ -1,19 +0,0 @@
 import { expect, test, describe } from "vitest";
 import { scan_number } from "./number_lexer";
 // Unit tests for scan_number. Both cases scan a run of digits from position 0
 // and check the token and the position right after it.
 describe("Number Lexer", () => {
    test("should return a whole number token", () => {
        const code = "1";
        const token = scan_number(code, 0);
        expect(token).toEqual([{ v: "1", token_type: "number" }, 1]);
    });
    test("should return a whole number token pt 2", () => {
        const code = "12345";
        const token = scan_number(code, 0);
        expect(token).toEqual([{ v: "12345", token_type: "number" }, 5]);
    });
 });
--- a/src/lexer/number_lexer.ts
+++ b/src/lexer/number_lexer.ts
@ -1,47 +0,0 @@
 import type { Token } from "./lexer";
 import { is_digit } from "./utils";
 /**
 * Reads a number that begins at `pos`. The caller must have checked that
 * the character at that position is a digit.
 * The scanning itself is done by `scan_decimal`.
 *
 * @param input the input string
 * @param pos the position to start scanning from
 * @returns a token of type "number", and the position of the first character after it
 */
 export function scan_number(input: string, pos: number): [Token, number] {
    const scanned = scan_decimal(input, pos);
    const number_token = { v: scanned[0], token_type: "number" };
    return [number_token, scanned[1]];
 }
 /**
 * Collects consecutive digits starting at `starting_position`.
 *
 * @param input the input string
 * @param starting_position the position to start scanning from
 * @returns the digits found, and the position of the first character that was not consumed
 */
 function scan_decimal(input: string, starting_position: number): [string, number] {
    let digits = "";
    let cursor = starting_position;
    for (; cursor < input.length; cursor += 1) {
        const ch = input[cursor]!;
        // todo: fractional parts (".") and exponents ("e", "E") are not scanned yet,
        // the number simply ends before them
        const ends_number = ch === "." || ch == "e" || ch == "E";
        if (ends_number || !is_digit(ch)) {
            break;
        }
        digits += ch;
    }
    return [digits, cursor];
 }
--- a/src/lexer/string_lexer.test.ts
+++ b/src/lexer/string_lexer.test.ts
@ -1,32 +0,0 @@
 import { expect, test, describe } from "vitest";
 import { scan_string } from "./string_lexer";
 // Unit tests for scan_string. Every case scans from position 0; the expected
 // token value includes the surrounding double quotes.
 describe("String Lexer", () => {
    test("should scan an empty string", () => {
        const code = "\"\"";
        const token = scan_string(code, 0);
        expect(token).toEqual([{ v: "\"\"", token_type: "string" }, 2]);
    });
    test("should scan a string with a single character", () => {
        const code = "\"a\"";
        const token = scan_string(code, 0);
        expect(token).toEqual([{ v: "\"a\"", token_type: "string" }, 3]);
    });
    test("should scan a string with multiple characters", () => {
        const code = "\"hello\"";
        const token = scan_string(code, 0);
        expect(token).toEqual([{ v: "\"hello\"", token_type: "string" }, 7]);
    });
    // The escape sequence is expected back exactly as written (backslash + n)
    test("should scan a string with an escape character", () => {
        const code = "\"\\n\"";
        const token = scan_string(code, 0);
        expect(token).toEqual([{ v: "\"\\n\"", token_type: "string" }, 4]);
    });
 });
--- a/src/lexer/string_lexer.ts
+++ b/src/lexer/string_lexer.ts
@ -1,49 +0,0 @@
 import type { Token } from "./lexer";
 /**
 * Reads a string that begins at `starting_position`. The character at that
 * position is assumed to be the opening double quote.
 *
 * The string ends at the closing double quote (included in the token), or
 * early, without a closing quote, at a newline or at the end of the input.
 * Escape sequences are kept exactly as written.
 *
 * @param input the input string
 * @param starting_position position of the opening double quote
 * @returns a token of type "string", and the position of the first character after it
 */
 export function scan_string(input: string, starting_position: number): [Token, number] {
    let value = "\"";
    let pos = starting_position + 1;
    while (pos < input.length) {
        const c = input[pos];
        if (c === "\"") {
            value += c;
            pos += 1;
            break;
        }
        if (c === "\n") {
            // todo: error handling, return an error indicator and the caller should render a red wavy underline
            break;
        }
        if (c === "\\") {
            const next_char = input[pos + 1];
            if (next_char === undefined) {
                // A backslash is the last character of the input: keep it and stop,
                // instead of emitting the text "undefined" and moving past the end
                value += "\\";
                pos += 1;
                break;
            }
            value += handle_escape_char(next_char);
            pos += 2;
            continue;
        }
        value += c;
        pos += 1;
    }
    return [{ v: value, token_type: "string" }, pos];
 }
 /**
 * Returns the source text of an escape sequence, given the character that
 * follows the backslash. Known and unknown escapes are rendered the same way:
 * the backslash followed by that character.
 */
 function handle_escape_char(next_char: string): string {
    return "\\" + next_char;
 }
--- a/src/pages/learn/index.mdx
+++ b/src/pages/learn/index.mdx
@ -196,7 +196,7 @@ $cat->meow();
 <Code thpcode={`
 // THP
 val cat = Cat("Michifu", 7)
-cat.meow();
+cat.meow()
 `} />
 - Instantiate classes without `new`