diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..ed57710
--- /dev/null
+++ b/.env.example
@@ -0,0 +1 @@
+THP_BINARY=/path/to/rust/thp/binary
diff --git a/src/components/Code.astro b/src/components/Code.astro
index 0be5525..de68a4d 100644
--- a/src/components/Code.astro
+++ b/src/components/Code.astro
@@ -1,10 +1,10 @@
---
-import { leftTrimDedent } from "./utils";
-import { thp_highlighter } from "../lexer/highlighter";
+import { native_highlighter } from "../lexer/highlighter"; // highlighter that tokenizes through the external THP binary
const { thpcode } = Astro.props;
-const html_code = thp_highlighter(leftTrimDedent(thpcode).join("\n"));
+
+const native_html = await native_highlighter(thpcode); // native_highlighter dedents the snippet itself, so the raw prop is passed straight through
---
thp
+ class="language-thp">
thp
diff --git a/src/lexer/highlighter.ts b/src/lexer/highlighter.ts
index 8c7d1b2..5071b22 100644
--- a/src/lexer/highlighter.ts
+++ b/src/lexer/highlighter.ts
@@ -1,13 +1,148 @@
-import { lex } from "./lexer";
+import { spawn } from "node:child_process";
+import { leftTrimDedent } from "../components/utils";
-export function thp_highlighter(code: string) {
- let tokens = lex(code);
+export interface LexResult { // shape of the JSON document that native_lex parses from the lexer's stdout
+ Ok?: Token[] // token list; native_highlighter reads it whenever Err is absent
+ Err?: Err // when present, native_highlighter throws instead of rendering
+}
- let highlighted_code = "";
+export interface Token { // one entry of LexResult.Ok
+ token_type: TokenType // lexer category, mapped to a CSS class by translate_token_type
+ value: string // token text; may be empty, in which case native_highlighter skips the token
+ position: number // start index of the token in the lexed text; native_highlighter slices the input with it
+}
- for (let token of tokens) {
- highlighted_code += `${token.v}`;
+type TokenType = // every value that Token.token_type may hold
+ "Identifier" | // reported as "keyword" or "identifier" by translate_token_type, depending on the token text
+ "Datatype" |
+ "Int" |
+ "Float" |
+ "String" |
+ "Operator" |
+ "LeftParen" |
+ "RightParen" |
+ "LeftBracket" |
+ "RightBracket" |
+ "LeftBrace" |
+ "RightBrace" |
+ "NewLine" |
+ "Comment" |
+ "Comma" |
+ "INDENT" |
+ "DEDENT" |
+ "VAL" | // VAL, VAR and FUN are all reported as "keyword" by translate_token_type
+ "VAR" |
+ "EOF" |
+ "FUN";
+
+export interface Err { // error payload carried by LexResult.Err
+ Lex: LexError // native_highlighter JSON-encodes this into the message of the Error it throws
+}
+
+export interface LexError { // details of a failed lex, nested under Err.Lex
+ position: number // NOTE(review): presumably an offset into the lexed source, like Token.position — confirm against the lexer
+ reason: string // NOTE(review): presumably a human-readable description of the failure — confirm against the lexer
+}
+
+
+/**
+ * Highlights a THP code snippet with the native lexer and returns it as an HTML string.
+ *
+ * The snippet is dedented with `leftTrimDedent`, tokenized by `native_lex`, and every
+ * non-empty token is wrapped in a `span` whose class comes from `translate_token_type`.
+ * Whatever text sits between two tokens is copied through, HTML-escaped.
+ *
+ * @param code raw THP source as written in the page
+ * @returns the highlighted HTML
+ * @throws Error when the lexer reports an error; the message holds the JSON-encoded
+ *         lex error followed by the original code. Rejections of `native_lex` propagate unchanged.
+ */
+export async function native_highlighter(code: string): Promise<string> {
+    const formatted_code = leftTrimDedent(code).join("\n");
+
+    const result = await native_lex(formatted_code);
+
+    if (result.Err) {
+        throw new Error(JSON.stringify(result.Err.Lex) + "\n" + code);
}
- return highlighted_code;
+    // A response with neither `Ok` nor `Err` renders as an empty snippet instead of crashing
+    const tokens = result.Ok ?? [];
+
+    // `&` has to be escaped first, otherwise the entities produced for `<` and `>` would be escaped again
+    const escape_html = (text: string): string =>
+        text.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;");
+
+    let output = "";
+    let current_pos = 0;
+
+    for (const t of tokens) {
+        // There are some tokens that are empty, ignore them
+        if (t.value === "") {
+            continue;
+        }
+
+        // Append all characters before the token
+        // NOTE(review): positions are applied as UTF-16 indexes into the string; confirm the
+        // lexer counts the same way for input outside the Basic Multilingual Plane
+        output += escape_html(formatted_code.slice(current_pos, t.position));
+
+        // Append the token
+        const token_value = escape_html(t.value);
+        const token_type = translate_token_type(t.token_type, token_value);
+        // NOTE(review): the wrapper markup was missing from the reviewed copy of this line;
+        // `token <type>` follows the Prism class convention — confirm against the stylesheet
+        output += `<span class="token ${token_type}">${token_value}</span>`;
+
+        current_pos = t.position + t.value.length;
+    }
+
+    return output;
}
+
+// Identifier tokens whose text is one of these words are styled as keywords
+const THP_KEYWORDS: ReadonlySet<string> = new Set(["throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class", "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone"]);
+
+/**
+ * Maps a lexer token type to the CSS class name used for highlighting.
+ *
+ * @param tt token type reported by the lexer
+ * @param value text of the token, only consulted for `Identifier` tokens
+ * @returns the class name; token types without a dedicated class are returned unchanged
+ */
+function translate_token_type(tt: TokenType, value: string): string {
+    switch (tt) {
+        case "Datatype":
+            return "class-name";
+        case "Identifier":
+            return THP_KEYWORDS.has(value) ? "keyword" : "identifier";
+        case "Int":
+        case "Float":
+            return "number";
+        case "String":
+            return "string";
+        case "Comment":
+            return "comment";
+        // token types that are keywords on their own
+        case "VAL":
+        case "VAR":
+        case "FUN":
+            return "keyword";
+        default:
+            return tt;
+    }
+}
+
+/**
+ * Tokenizes THP code with the external lexer binary.
+ *
+ * Spawns the program named by the `THP_BINARY` environment entry with the `tokenize`
+ * argument, writes `code` to its stdin and parses what it prints on stdout as JSON.
+ *
+ * @param code source text to tokenize
+ * @returns the parsed lexer response
+ * @throws Error (as a rejection) when `THP_BINARY` is unset, the binary cannot be started,
+ *         it exits with a non-zero code, or its output is not valid JSON
+ */
+const native_lex = (code: string) => new Promise<LexResult>((resolve, reject) => {
+    // Get binary path from .env
+    const binary = import.meta.env.THP_BINARY;
+    if (!binary) {
+        // Throwing inside the executor rejects the promise
+        throw new Error("THP_BINARY not set in .env");
+    }
+
+    const subprocess = spawn(binary, ["tokenize"]);
+    let response = "";
+    let error = "";
+
+    // A binary that cannot be started reports through "error"; without a listener
+    // Node would raise it as an uncaught exception and this promise would never settle
+    subprocess.on("error", reject);
+
+    // A child that exits before reading its input makes the write fail (EPIPE).
+    // Keep that from becoming an uncaught stream error; "close" still reports the outcome.
+    subprocess.stdin.on("error", () => {});
+
+    subprocess.stdout.on("data", (data) => {
+        response += data.toString();
+    });
+
+    subprocess.stderr.on("data", (data) => {
+        error += data.toString();
+    });
+
+    subprocess.on("close", (exit_code) => {
+        if (exit_code !== 0) {
+            reject(new Error(error || `THP lexer exited with code ${exit_code}`));
+            return;
+        }
+
+        // JSON.parse throws on malformed output; inside an event callback that would
+        // escape the promise, so turn it into a rejection
+        try {
+            resolve(JSON.parse(response));
+        } catch (e) {
+            const detail = e instanceof Error ? e.message : String(e);
+            reject(new Error(`THP lexer produced invalid JSON: ${detail}`));
+        }
+    });
+
+    subprocess.stdin.write(code);
+    subprocess.stdin.end();
+})
diff --git a/src/lexer/identifier_lexer.test.ts b/src/lexer/identifier_lexer.test.ts
deleted file mode 100644
index 44ecd37..0000000
--- a/src/lexer/identifier_lexer.test.ts
+++ /dev/null
@@ -1,55 +0,0 @@
-import { expect, test, describe } from "vitest";
-import { scan_identifier } from "./identifier_lexer";
-
-
-describe("Identifier Lexer", () => {
- test("should return an identifier token", () => {
- const code = "a";
- const token = scan_identifier(code, 0);
-
- expect(token).toEqual([{ v: "a", token_type: "identifier" }, 1]);
- });
-
- test("should scan an underscore", () => {
- const code = "_";
- const token = scan_identifier(code, 0);
-
- expect(token).toEqual([{ v: "_", token_type: "identifier" }, 1]);
- });
-
- test("should scan an identifier with an underscore", () => {
- const code = "a_";
- const token = scan_identifier(code, 0);
-
- expect(token).toEqual([{ v: "a_", token_type: "identifier" }, 2]);
- });
-
- test("should scan an identifier that starts with an underscore", () => {
- const code = "_a";
- const token = scan_identifier(code, 0);
-
- expect(token).toEqual([{ v: "_a", token_type: "identifier" }, 2]);
- });
-
- test("should scan an identifier with numbers and uppercase letters", () => {
- const code = "aA1";
- const token = scan_identifier(code, 0);
-
- expect(token).toEqual([{ v: "aA1", token_type: "identifier" }, 3]);
- });
-
- test("should scan a keyword", () => {
- const code = "val";
- const token = scan_identifier(code, 0);
-
- expect(token).toEqual([{ v: "val", token_type: "keyword" }, 3]);
- });
-
- test("should scan a datatype", () => {
- const code = "Int";
- const token = scan_identifier(code, 0, true);
-
- expect(token).toEqual([{ v: "Int", token_type: "class-name" }, 3]);
- });
-});
-
diff --git a/src/lexer/identifier_lexer.ts b/src/lexer/identifier_lexer.ts
deleted file mode 100644
index a531c52..0000000
--- a/src/lexer/identifier_lexer.ts
+++ /dev/null
@@ -1,44 +0,0 @@
-import type { Token } from "./lexer";
-import { is_identifier_char } from "./utils";
-
-/**
- * Scans an identifier, at the given position in the input string.
- * This function assumes that the character at the given position is a letter.
- *
- * @param input the input string
- * @param starting_position the position to start scanning from
- * @param is_datatype whether the identifier is a datatype
- */
-export function scan_identifier(input: string, starting_position: number, is_datatype = false): [Token, number] {
- let value = input[starting_position]!;
- let pos = starting_position + 1;
-
- while (pos < input.length) {
- const c = input[pos]!;
-
- if (is_identifier_char(c)) {
- pos += 1;
- value += c;
- }
- else {
- break;
- }
- }
-
- if (is_datatype) {
- return [{ v: value, token_type: "class-name" }, pos];
- }
- else {
- return [{ v: value, token_type: check_keyword(value) }, pos];
- }
-}
-
-function check_keyword(value: string): string {
- const keywords = ["throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class", "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone"];
-
- if (keywords.includes(value)) {
- return "keyword";
- }
- return "identifier";
-}
-
diff --git a/src/lexer/lexer.test.ts b/src/lexer/lexer.test.ts
deleted file mode 100644
index a4c9461..0000000
--- a/src/lexer/lexer.test.ts
+++ /dev/null
@@ -1,45 +0,0 @@
-import { expect, test, describe } from "vitest";
-import { lex } from "./lexer";
-
-describe("Lexer", () => {
- test("empty program should return no tokens", () => {
- const code = "";
- const tokens = lex(code);
- expect(tokens).toEqual([]);
- });
-
- test("program with whitespace should return a single token", () => {
- const code = " ";
- const tokens = lex(code);
- expect(tokens).toEqual([{v: " ", token_type: ""}]);
- })
-
- test("program with newlines should return a single token", () => {
- const code = "\n";
- const tokens = lex(code);
- expect(tokens).toEqual([{v: "\n", token_type: ""}]);
- });
-
- test("program with random unicode should return the same unicode", () => {
- const code = "🍕";
- const tokens = lex(code);
- expect(tokens).toEqual([{v: "🍕", token_type: ""}]);
- });
-
- test("should scan integers", () => {
- const code = "12345";
- const tokens = lex(code);
- expect(tokens).toEqual([{v: "12345", token_type: "number"}]);
- });
-
- test("should scan integers and whitespace around", () => {
- const code = " 12345 \n ";
- const tokens = lex(code);
- expect(tokens).toEqual([
- {v: " ", token_type: ""},
- {v: "12345", token_type: "number"},
- {v: " \n ", token_type: ""},
- ]);
- });
-});
-
diff --git a/src/lexer/lexer.ts b/src/lexer/lexer.ts
deleted file mode 100644
index 97a13dd..0000000
--- a/src/lexer/lexer.ts
+++ /dev/null
@@ -1,166 +0,0 @@
-import { scan_identifier } from "./identifier_lexer";
-import { scan_number } from "./number_lexer";
-import { scan_string } from "./string_lexer";
-import { is_digit, is_lowercase, is_uppercase } from "./utils";
-
-export type Token = {
- v: string,
- token_type: string,
-};
-
-/**
- * Lexes a string of THP code, and returns an array of tokens. Unlike a regular
- * lexer, whitespace and other characters are not ignored, and are instead treated
- * as a default token.
- *
- * This lexer implements a subset of the grammar defined in the THP language specification,
- * only recognizing the following tokens:
- * - Identifier
- * - Datatype
- * - String
- * - Number
- * - Single line comment
- * - Multi line comment
- * - Keywords
- *
- * @param code Code to lex
- * @returns An array of all the tokens found
- */
-export function lex(code: string, start = 0): Array {
- const code_len = code.length;
- const tokens: Array = [];
-
- let current_pos = start;
- let current_default_token = "";
-
- while (current_pos < code_len) {
- const c = code[current_pos]!;
-
- // try to scan a number
- if (is_digit(c)) {
- // if the current default token is not empty, push it to the tokens array
- if (current_default_token !== "") {
- tokens.push({ v: current_default_token, token_type: "" });
- current_default_token = "";
- }
-
- // lex a number
- const [token, next] = scan_number(code, current_pos);
- current_pos = next;
- tokens.push(token);
- continue;
- }
- // try to scan an identifier/keyword
- else if (is_lowercase(c) || c === "_") {
- // if the current default token is not empty, push it to the tokens array
- if (current_default_token !== "") {
- tokens.push({ v: current_default_token, token_type: "" });
- current_default_token = "";
- }
-
- const [token, next] = scan_identifier(code, current_pos);
- current_pos = next;
- tokens.push(token);
- continue;
- }
- // try to scan a datatype
- else if (is_uppercase(c)) {
- // if the current default token is not empty, push it to the tokens array
- if (current_default_token !== "") {
- tokens.push({ v: current_default_token, token_type: "" });
- current_default_token = "";
- }
-
- const [token, next] = scan_identifier(code, current_pos, true);
- current_pos = next;
- tokens.push(token);
- continue;
- }
- // try to scan a string
- else if (c === "\"") {
- // if the current default token is not empty, push it to the tokens array
- if (current_default_token !== "") {
- tokens.push({ v: current_default_token, token_type: "" });
- current_default_token = "";
- }
-
- const [token, next] = scan_string(code, current_pos);
- current_pos = next;
- tokens.push(token);
- continue;
- }
- // try to scan a comment
- else if (c === "/" && code[current_pos + 1] === "/") {
- // if the current default token is not empty, push it to the tokens array
- if (current_default_token !== "") {
- tokens.push({ v: current_default_token, token_type: "" });
- current_default_token = "";
- }
-
- let comment = "";
- let pos = current_pos;
-
- while (pos < code_len) {
- const char = code[pos];
-
- if (char === "\n") {
- break;
- }
-
- comment += char;
- pos++;
- }
-
- tokens.push({ v: comment, token_type: "comment" });
- current_pos = pos;
- continue;
- }
- // try to scan a multiline comment
- else if (c === "/" && code[current_pos + 1] === "*") {
- // if the current default token is not empty, push it to the tokens array
- if (current_default_token !== "") {
- tokens.push({ v: current_default_token, token_type: "" });
- current_default_token = "";
- }
-
- let comment = "";
- let pos = current_pos;
-
- while (pos < code_len) {
- const char = code[pos];
-
- if (char === "*" && code[pos + 1] === "/") {
- pos += 2;
- comment += "*/";
- break;
- }
-
- comment += char;
- pos++;
- }
-
- tokens.push({ v: comment, token_type: "comment" });
- current_pos = pos;
- continue;
- }
- // replace < with <
- else if (c === "<") {
- current_default_token += "<";
- current_pos++;
- continue;
- }
-
- current_default_token += c;
- current_pos++;
- }
-
- // if there was a default token, push it to the tokens array
- if (current_default_token !== "") {
- tokens.push({ v: current_default_token, token_type: "" });
- current_default_token = "";
- }
-
- return tokens;
-}
-
-
diff --git a/src/lexer/number_lexer.test.ts b/src/lexer/number_lexer.test.ts
deleted file mode 100644
index d8fa634..0000000
--- a/src/lexer/number_lexer.test.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-import { expect, test, describe } from "vitest";
-import { scan_number } from "./number_lexer";
-
-describe("Number Lexer", () => {
- test("should return a whole number token", () => {
- const code = "1";
- const token = scan_number(code, 0);
-
- expect(token).toEqual([{ v: "1", token_type: "number" }, 1]);
- });
-
- test("should return a whole number token pt 2", () => {
- const code = "12345";
- const token = scan_number(code, 0);
-
- expect(token).toEqual([{ v: "12345", token_type: "number" }, 5]);
- });
-});
-
diff --git a/src/lexer/number_lexer.ts b/src/lexer/number_lexer.ts
deleted file mode 100644
index 01d4880..0000000
--- a/src/lexer/number_lexer.ts
+++ /dev/null
@@ -1,47 +0,0 @@
-import type { Token } from "./lexer";
-import { is_digit } from "./utils";
-
-/**
- * Scans a number, at the given position in the input string.
- * This function assumes that the character at the given position is a digit.
- * It follows this grammar:
- *
- * @param input the input string
- * @param pos the position to start scanning from
- * @returns
- */
-export function scan_number(input: string, pos: number): [Token, number] {
- const [token_value, next] = scan_decimal(input, pos);
-
- return [{ v: token_value, token_type: "number" }, next];
-}
-
-function scan_decimal(input: string, starting_position: number): [string, number] {
- let current_value = "";
- let pos = starting_position;
-
- while (pos < input.length) {
- const c = input[pos]!;
-
- if (c === ".") {
- // todo
- return [current_value, pos];
- }
- else if (c == "e" || c == "E") {
- // todo
- return [current_value, pos];
- }
- else if (is_digit(c)) {
- current_value += c;
- pos += 1;
- }
- else {
- break;
- }
-
- }
-
- return [current_value, pos];
-}
-
-
diff --git a/src/lexer/string_lexer.test.ts b/src/lexer/string_lexer.test.ts
deleted file mode 100644
index 7e42890..0000000
--- a/src/lexer/string_lexer.test.ts
+++ /dev/null
@@ -1,32 +0,0 @@
-import { expect, test, describe } from "vitest";
-import { scan_string } from "./string_lexer";
-
-describe("String Lexer", () => {
- test("should scan an empty string", () => {
- const code = "\"\"";
- const token = scan_string(code, 0);
-
- expect(token).toEqual([{ v: "\"\"", token_type: "string" }, 2]);
- });
-
- test("should scan a string with a single character", () => {
- const code = "\"a\"";
- const token = scan_string(code, 0);
-
- expect(token).toEqual([{ v: "\"a\"", token_type: "string" }, 3]);
- });
-
- test("should scan a string with multiple characters", () => {
- const code = "\"hello\"";
- const token = scan_string(code, 0);
-
- expect(token).toEqual([{ v: "\"hello\"", token_type: "string" }, 7]);
- });
-
- test("should scan a string with an escape character", () => {
- const code = "\"\\n\"";
- const token = scan_string(code, 0);
-
- expect(token).toEqual([{ v: "\"\\n\"", token_type: "string" }, 4]);
- });
-});
diff --git a/src/lexer/string_lexer.ts b/src/lexer/string_lexer.ts
deleted file mode 100644
index b5e75bd..0000000
--- a/src/lexer/string_lexer.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-import type { Token } from "./lexer";
-
-export function scan_string(input: string, starting_position: number): [Token, number] {
- let value = "\"";
- let pos = starting_position + 1;
-
- while (pos < input.length) {
- const c = input[pos];
-
- if (c === "\"") {
- value += c;
- pos += 1;
- break;
- }
- if (c === "\n") {
- // todo: error handling, return an error indicator and the caller should render a red wavy underline
- break;
- }
- if (c === "\\") {
- const next_char = input[pos + 1];
- value += handle_escape_char(next_char);
- pos += 2;
- continue;
- }
-
- value += c;
- pos += 1;
- }
-
- return [{ v: value, token_type: "string" }, pos];
-}
-
-function handle_escape_char(next_char: string): string {
- switch (next_char) {
- case "n":
- return "\\n"
- case "t":
- return "\\t"
- case "r":
- return "\\r"
- case "\"":
- return "\\\""
- case "\\":
- return "\\\\"
- default:
- return "\\" + next_char
- }
-}
-
diff --git a/src/pages/learn/index.mdx b/src/pages/learn/index.mdx
index 066945d..e558562 100644
--- a/src/pages/learn/index.mdx
+++ b/src/pages/learn/index.mdx
@@ -196,7 +196,7 @@ $cat->meow();
- Instantiate classes without `new`