From a542071af97e2daedf5a36be4131accd61911bb2 Mon Sep 17 00:00:00 2001 From: Araozu Date: Fri, 5 Jul 2024 19:12:30 -0500 Subject: [PATCH] Use the native lexer instead of a TS reimplementation --- .env.example | 1 + src/components/Code.astro | 8 +- src/lexer/highlighter.ts | 149 ++++++++++++++++++++++++-- src/lexer/identifier_lexer.test.ts | 55 ---------- src/lexer/identifier_lexer.ts | 44 -------- src/lexer/lexer.test.ts | 45 -------- src/lexer/lexer.ts | 166 ----------------------------- src/lexer/number_lexer.test.ts | 19 ---- src/lexer/number_lexer.ts | 47 -------- src/lexer/string_lexer.test.ts | 32 ------ src/lexer/string_lexer.ts | 49 --------- src/pages/learn/index.mdx | 2 +- 12 files changed, 148 insertions(+), 469 deletions(-) create mode 100644 .env.example delete mode 100644 src/lexer/identifier_lexer.test.ts delete mode 100644 src/lexer/identifier_lexer.ts delete mode 100644 src/lexer/lexer.test.ts delete mode 100644 src/lexer/lexer.ts delete mode 100644 src/lexer/number_lexer.test.ts delete mode 100644 src/lexer/number_lexer.ts delete mode 100644 src/lexer/string_lexer.test.ts delete mode 100644 src/lexer/string_lexer.ts diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ed57710 --- /dev/null +++ b/.env.example @@ -0,0 +1 @@ +THP_BINARY=/path/to/rust/thp/binary diff --git a/src/components/Code.astro b/src/components/Code.astro index 0be5525..de68a4d 100644 --- a/src/components/Code.astro +++ b/src/components/Code.astro @@ -1,10 +1,10 @@ --- -import { leftTrimDedent } from "./utils"; -import { thp_highlighter } from "../lexer/highlighter"; +import { native_highlighter } from "../lexer/highlighter"; const { thpcode } = Astro.props; -const html_code = thp_highlighter(leftTrimDedent(thpcode).join("\n")); + +const native_html = await native_highlighter(thpcode); ---
thp
+ class="language-thp">thp diff --git a/src/lexer/highlighter.ts b/src/lexer/highlighter.ts index 8c7d1b2..5071b22 100644 --- a/src/lexer/highlighter.ts +++ b/src/lexer/highlighter.ts @@ -1,13 +1,148 @@ -import { lex } from "./lexer"; +import { spawn } from "node:child_process"; +import { leftTrimDedent } from "../components/utils"; -export function thp_highlighter(code: string) { - let tokens = lex(code); +export interface LexResult { + Ok?: Token[] + Err?: Err +} - let highlighted_code = ""; +export interface Token { + token_type: TokenType + value: string + position: number +} - for (let token of tokens) { - highlighted_code += `${token.v}`; +type TokenType = + "Identifier" | + "Datatype" | + "Int" | + "Float" | + "String" | + "Operator" | + "LeftParen" | + "RightParen" | + "LeftBracket" | + "RightBracket" | + "LeftBrace" | + "RightBrace" | + "NewLine" | + "Comment" | + "Comma" | + "INDENT" | + "DEDENT" | + "VAL" | + "VAR" | + "EOF" | + "FUN"; + +export interface Err { + Lex: LexError +} + +export interface LexError { + position: number + reason: string +} + + +export async function native_highlighter(code: string): Promise { + let formatted_code = leftTrimDedent(code).join("\n"); + + const result = await native_lex(formatted_code); + + if (result.Err) { + throw new Error(JSON.stringify(result.Err.Lex) + "\n" + code); } - return highlighted_code; + const tokens = result.Ok!; + + const input_chars = formatted_code.split(""); + let output = ""; + + let current_pos = 0; + + for (let i = 0; i < tokens.length; i += 1) { + const t = tokens[i]!; + const token_start = t.position; + const token_end = t.position + t.value.length; + + // There are some tokens that are empty, ignore them + if (t.value == "") { + continue; + } + + // Append all characters before the token + output += input_chars.slice(current_pos, token_start).join(""); + + // Append the token + const token_value = t.value.replaceAll(//g, ">"); + const token_type = translate_token_type(t.token_type, token_value); + output += `${token_value}`; + + current_pos = token_end; + } + + return output; } + +function translate_token_type(tt: TokenType, value: string): string { + const keywords = ["throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class", "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone"]; + + switch (tt) { + case "Datatype": + return "class-name"; + case "Identifier": { + if (keywords.includes(value)) { + return "keyword"; + } + + return "identifier"; + } + case "Int": + return "number"; + case "Float": + return "number"; + case "String": + return "string"; + case "Comment": + return "comment"; + // keywords: + case "VAL": + case "VAR": + case "FUN": + return "keyword"; + default: + return tt; + } +} + +const native_lex = (code: string) => new Promise((resolve, reject) => { + // Get binary path from .env + const binary = import.meta.env.THP_BINARY; + if (!binary) { + throw new Error("THP_BINARY not set in .env"); + } + + const subprocess = spawn(binary, ["tokenize"]); + let response = ""; + let error = ""; + + subprocess.stdin.write(code); + subprocess.stdin.end(); + + subprocess.stdout.on("data", (data) => { + response += data.toString(); + }); + + subprocess.stderr.on("data", (data) => { + error += data.toString(); + }); + + subprocess.on("close", (code) => { + if (code === 0) { + resolve(JSON.parse(response)); + } else { + reject(error); + } + }); +}) diff --git a/src/lexer/identifier_lexer.test.ts b/src/lexer/identifier_lexer.test.ts deleted file mode 100644 index 44ecd37..0000000 --- a/src/lexer/identifier_lexer.test.ts +++ /dev/null @@ -1,55 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { scan_identifier } from "./identifier_lexer"; - - -describe("Identifier Lexer", () => { - test("should return an identifier token", () => { - const code = "a"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "a", token_type: "identifier" }, 1]); - }); - - test("should scan an underscore", () => { - const code = "_"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "_", token_type: "identifier" }, 1]); - }); - - test("should scan an identifier with an underscore", () => { - const code = "a_"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "a_", token_type: "identifier" }, 2]); - }); - - test("should scan an identifier that starts with an underscore", () => { - const code = "_a"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "_a", token_type: "identifier" }, 2]); - }); - - test("should scan an identifier with numbers and uppercase letters", () => { - const code = "aA1"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "aA1", token_type: "identifier" }, 3]); - }); - - test("should scan a keyword", () => { - const code = "val"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "val", token_type: "keyword" }, 3]); - }); - - test("should scan a datatype", () => { - const code = "Int"; - const token = scan_identifier(code, 0, true); - - expect(token).toEqual([{ v: "Int", token_type: "class-name" }, 3]); - }); -}); - diff --git a/src/lexer/identifier_lexer.ts b/src/lexer/identifier_lexer.ts deleted file mode 100644 index a531c52..0000000 --- a/src/lexer/identifier_lexer.ts +++ /dev/null @@ -1,44 +0,0 @@ -import type { Token } from "./lexer"; -import { is_identifier_char } from "./utils"; - -/** - * Scans an identifier, at the given position in the input string. - * This function assumes that the character at the given position is a letter. - * - * @param input the input string - * @param starting_position the position to start scanning from - * @param is_datatype whether the identifier is a datatype - */ -export function scan_identifier(input: string, starting_position: number, is_datatype = false): [Token, number] { - let value = input[starting_position]!; - let pos = starting_position + 1; - - while (pos < input.length) { - const c = input[pos]!; - - if (is_identifier_char(c)) { - pos += 1; - value += c; - } - else { - break; - } - } - - if (is_datatype) { - return [{ v: value, token_type: "class-name" }, pos]; - } - else { - return [{ v: value, token_type: check_keyword(value) }, pos]; - } -} - -function check_keyword(value: string): string { - const keywords = ["throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class", "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone"]; - - if (keywords.includes(value)) { - return "keyword"; - } - return "identifier"; -} - diff --git a/src/lexer/lexer.test.ts b/src/lexer/lexer.test.ts deleted file mode 100644 index a4c9461..0000000 --- a/src/lexer/lexer.test.ts +++ /dev/null @@ -1,45 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { lex } from "./lexer"; - -describe("Lexer", () => { - test("empty program should return no tokens", () => { - const code = ""; - const tokens = lex(code); - expect(tokens).toEqual([]); - }); - - test("program with whitespace should return a single token", () => { - const code = " "; - const tokens = lex(code); - expect(tokens).toEqual([{v: " ", token_type: ""}]); - }) - - test("program with newlines should return a single token", () => { - const code = "\n"; - const tokens = lex(code); - expect(tokens).toEqual([{v: "\n", token_type: ""}]); - }); - - test("program with random unicode should return the same unicode", () => { - const code = "🍕"; - const tokens = lex(code); - expect(tokens).toEqual([{v: "🍕", token_type: ""}]); - }); - - test("should scan integers", () => { - const code = "12345"; - const tokens = lex(code); - expect(tokens).toEqual([{v: "12345", token_type: "number"}]); - }); - - test("should scan integers and whitespace around", () => { - const code = " 12345 \n "; - const tokens = lex(code); - expect(tokens).toEqual([ - {v: " ", token_type: ""}, - {v: "12345", token_type: "number"}, - {v: " \n ", token_type: ""}, - ]); - }); -}); - diff --git a/src/lexer/lexer.ts b/src/lexer/lexer.ts deleted file mode 100644 index 97a13dd..0000000 --- a/src/lexer/lexer.ts +++ /dev/null @@ -1,166 +0,0 @@ -import { scan_identifier } from "./identifier_lexer"; -import { scan_number } from "./number_lexer"; -import { scan_string } from "./string_lexer"; -import { is_digit, is_lowercase, is_uppercase } from "./utils"; - -export type Token = { - v: string, - token_type: string, -}; - -/** - * Lexes a string of THP code, and returns an array of tokens. Unlike a regular - * lexer, whitespace and other characters are not ignored, and are instead treated - * as a default token. - * - * This lexer implements a subset of the grammar defined in the THP language specification, - * only recognizing the following tokens: - * - Identifier - * - Datatype - * - String - * - Number - * - Single line comment - * - Multi line comment - * - Keywords - * - * @param code Code to lex - * @returns An array of all the tokens found - */ -export function lex(code: string, start = 0): Array { - const code_len = code.length; - const tokens: Array = []; - - let current_pos = start; - let current_default_token = ""; - - while (current_pos < code_len) { - const c = code[current_pos]!; - - // try to scan a number - if (is_digit(c)) { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - // lex a number - const [token, next] = scan_number(code, current_pos); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan an identifier/keyword - else if (is_lowercase(c) || c === "_") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - const [token, next] = scan_identifier(code, current_pos); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan a datatype - else if (is_uppercase(c)) { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - const [token, next] = scan_identifier(code, current_pos, true); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan a string - else if (c === "\"") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - const [token, next] = scan_string(code, current_pos); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan a comment - else if (c === "/" && code[current_pos + 1] === "/") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - let comment = ""; - let pos = current_pos; - - while (pos < code_len) { - const char = code[pos]; - - if (char === "\n") { - break; - } - - comment += char; - pos++; - } - - tokens.push({ v: comment, token_type: "comment" }); - current_pos = pos; - continue; - } - // try to scan a multiline comment - else if (c === "/" && code[current_pos + 1] === "*") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - let comment = ""; - let pos = current_pos; - - while (pos < code_len) { - const char = code[pos]; - - if (char === "*" && code[pos + 1] === "/") { - pos += 2; - comment += "*/"; - break; - } - - comment += char; - pos++; - } - - tokens.push({ v: comment, token_type: "comment" }); - current_pos = pos; - continue; - } - // replace < with < - else if (c === "<") { - current_default_token += "<"; - current_pos++; - continue; - } - - current_default_token += c; - current_pos++; - } - - // if there was a default token, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - return tokens; -} - - diff --git a/src/lexer/number_lexer.test.ts b/src/lexer/number_lexer.test.ts deleted file mode 100644 index d8fa634..0000000 --- a/src/lexer/number_lexer.test.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { scan_number } from "./number_lexer"; - -describe("Number Lexer", () => { - test("should return a whole number token", () => { - const code = "1"; - const token = scan_number(code, 0); - - expect(token).toEqual([{ v: "1", token_type: "number" }, 1]); - }); - - test("should return a whole number token pt 2", () => { - const code = "12345"; - const token = scan_number(code, 0); - - expect(token).toEqual([{ v: "12345", token_type: "number" }, 5]); - }); -}); - diff --git a/src/lexer/number_lexer.ts b/src/lexer/number_lexer.ts deleted file mode 100644 index 01d4880..0000000 --- a/src/lexer/number_lexer.ts +++ /dev/null @@ -1,47 +0,0 @@ -import type { Token } from "./lexer"; -import { is_digit } from "./utils"; - -/** - * Scans a number, at the given position in the input string. - * This function assumes that the character at the given position is a digit. - * It follows this grammar: - * - * @param input the input string - * @param pos the position to start scanning from - * @returns - */ -export function scan_number(input: string, pos: number): [Token, number] { - const [token_value, next] = scan_decimal(input, pos); - - return [{ v: token_value, token_type: "number" }, next]; -} - -function scan_decimal(input: string, starting_position: number): [string, number] { - let current_value = ""; - let pos = starting_position; - - while (pos < input.length) { - const c = input[pos]!; - - if (c === ".") { - // todo - return [current_value, pos]; - } - else if (c == "e" || c == "E") { - // todo - return [current_value, pos]; - } - else if (is_digit(c)) { - current_value += c; - pos += 1; - } - else { - break; - } - - } - - return [current_value, pos]; -} - - diff --git a/src/lexer/string_lexer.test.ts b/src/lexer/string_lexer.test.ts deleted file mode 100644 index 7e42890..0000000 --- a/src/lexer/string_lexer.test.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { scan_string } from "./string_lexer"; - -describe("String Lexer", () => { - test("should scan an empty string", () => { - const code = "\"\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"\"", token_type: "string" }, 2]); - }); - - test("should scan a string with a single character", () => { - const code = "\"a\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"a\"", token_type: "string" }, 3]); - }); - - test("should scan a string with multiple characters", () => { - const code = "\"hello\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"hello\"", token_type: "string" }, 7]); - }); - - test("should scan a string with an escape character", () => { - const code = "\"\\n\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"\\n\"", token_type: "string" }, 4]); - }); -}); diff --git a/src/lexer/string_lexer.ts b/src/lexer/string_lexer.ts deleted file mode 100644 index b5e75bd..0000000 --- a/src/lexer/string_lexer.ts +++ /dev/null @@ -1,49 +0,0 @@ -import type { Token } from "./lexer"; - -export function scan_string(input: string, starting_position: number): [Token, number] { - let value = "\""; - let pos = starting_position + 1; - - while (pos < input.length) { - const c = input[pos]; - - if (c === "\"") { - value += c; - pos += 1; - break; - } - if (c === "\n") { - // todo: error handling, return an error indicator and the caller should render a red wavy underline - break; - } - if (c === "\\") { - const next_char = input[pos + 1]; - value += handle_escape_char(next_char); - pos += 2; - continue; - } - - value += c; - pos += 1; - } - - return [{ v: value, token_type: "string" }, pos]; -} - -function handle_escape_char(next_char: string): string { - switch (next_char) { - case "n": - return "\\n" - case "t": - return "\\t" - case "r": - return "\\r" - case "\"": - return "\\\"" - case "\\": - return "\\\\" - default: - return "\\" + next_char - } -} - diff --git a/src/pages/learn/index.mdx b/src/pages/learn/index.mdx index 066945d..e558562 100644 --- a/src/pages/learn/index.mdx +++ b/src/pages/learn/index.mdx @@ -196,7 +196,7 @@ $cat->meow(); - Instantiate classes without `new`