restore old lexer

2024-07-05 19:51:50 -05:00 · 2024-07-05 19:51:50 -05:00 · 07ff7ede1e
commit 07ff7ede1e
parent a542071af9
8 changed files with 457 additions and 0 deletions
--- a/src/lexer/identifier_lexer.test.ts
+++ b/src/lexer/identifier_lexer.test.ts
@ -0,0 +1,55 @@
+import { expect, test, describe } from "vitest";
+import { scan_identifier } from "./identifier_lexer";
+
+
+describe("Identifier Lexer", () => {
+    // One row per test: its name, the source text, whether the text is scanned
+    // as a datatype, and the [token, next position] pair expected back.
+    const cases: Array<{
+        name: string,
+        code: string,
+        is_datatype?: boolean,
+        expected: [{ v: string, token_type: string }, number],
+    }> = [
+        {
+            name: "should return an identifier token",
+            code: "a",
+            expected: [{ v: "a", token_type: "identifier" }, 1],
+        },
+        {
+            name: "should scan an underscore",
+            code: "_",
+            expected: [{ v: "_", token_type: "identifier" }, 1],
+        },
+        {
+            name: "should scan an identifier with an underscore",
+            code: "a_",
+            expected: [{ v: "a_", token_type: "identifier" }, 2],
+        },
+        {
+            name: "should scan an identifier that starts with an underscore",
+            code: "_a",
+            expected: [{ v: "_a", token_type: "identifier" }, 2],
+        },
+        {
+            name: "should scan an identifier with numbers and uppercase letters",
+            code: "aA1",
+            expected: [{ v: "aA1", token_type: "identifier" }, 3],
+        },
+        {
+            name: "should scan a keyword",
+            code: "val",
+            expected: [{ v: "val", token_type: "keyword" }, 3],
+        },
+        {
+            name: "should scan a datatype",
+            code: "Int",
+            is_datatype: true,
+            expected: [{ v: "Int", token_type: "class-name" }, 3],
+        },
+    ];
+
+    for (const { name, code, is_datatype, expected } of cases) {
+        test(name, () => {
+            // Only pass the third argument when the row sets it, so the
+            // default of scan_identifier is exercised by the other rows.
+            const token = is_datatype === undefined
+                ? scan_identifier(code, 0)
+                : scan_identifier(code, 0, is_datatype);
+
+            expect(token).toEqual(expected);
+        });
+    }
+});
+
--- a/src/lexer/identifier_lexer.ts
+++ b/src/lexer/identifier_lexer.ts
@ -0,0 +1,44 @@
+import type { Token } from "./lexer";
+import { is_identifier_char } from "./utils";
+
+/**
+ * Reads one identifier-like token out of `input`, beginning at `starting_position`.
+ * The character at `starting_position` is taken as the first character of the
+ * token without being checked; every following character is consumed for as
+ * long as `is_identifier_char` accepts it.
+ *
+ * @param input the input string
+ * @param starting_position index of the first character of the token
+ * @param is_datatype when true the token is tagged "class-name"; otherwise it is
+ *                    tagged "keyword" or "identifier" by `check_keyword`
+ * @returns the token, and the index of the first character after it
+ */
+export function scan_identifier(input: string, starting_position: number, is_datatype = false): [Token, number] {
+    let text = input[starting_position]!;
+    let end = starting_position + 1;
+
+    for (; end < input.length; end += 1) {
+        const ch = input[end]!;
+        if (!is_identifier_char(ch)) {
+            break;
+        }
+        text += ch;
+    }
+
+    // Datatypes are never looked up in the keyword list.
+    const token_type = is_datatype ? "class-name" : check_keyword(text);
+    return [{ v: text, token_type }, end];
+}
+
+/**
+ * Words that are highlighted as keywords. Built once when the module loads,
+ * instead of on every call, and stored in a Set for constant-time lookup.
+ */
+const KEYWORDS: ReadonlySet<string> = new Set([
+    "throws", "extends", "constructor", "case", "static", "const", "enum", "union",
+    "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally",
+    "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while",
+    "type", "match", "with", "of", "abstract", "class", "interface", "private",
+    "pub", "override", "open", "init", "val", "var", "mut", "clone",
+]);
+
+/**
+ * Classifies an already scanned word.
+ *
+ * @param value the exact text of the word (the comparison is case sensitive)
+ * @returns "keyword" if the word is in the keyword list, "identifier" otherwise
+ */
+function check_keyword(value: string): string {
+    return KEYWORDS.has(value) ? "keyword" : "identifier";
+}
+
--- a/src/lexer/lexer.test.ts
+++ b/src/lexer/lexer.test.ts
@ -0,0 +1,45 @@
+import { expect, test, describe } from "vitest";
+import { lex } from "./lexer";
+
+describe("Lexer", () => {
+    // One row per test: its name, the program text, and the full token list
+    // that lex must return for it.
+    const cases: Array<{
+        name: string,
+        code: string,
+        expected: Array<{ v: string, token_type: string }>,
+    }> = [
+        {
+            name: "empty program should return no tokens",
+            code: "",
+            expected: [],
+        },
+        {
+            name: "program with whitespace should return a single token",
+            code: " ",
+            expected: [{ v: " ", token_type: "" }],
+        },
+        {
+            name: "program with newlines should return a single token",
+            code: "\n",
+            expected: [{ v: "\n", token_type: "" }],
+        },
+        {
+            name: "program with random unicode should return the same unicode",
+            code: "🍕",
+            expected: [{ v: "🍕", token_type: "" }],
+        },
+        {
+            name: "should scan integers",
+            code: "12345",
+            expected: [{ v: "12345", token_type: "number" }],
+        },
+        {
+            name: "should scan integers and whitespace around",
+            code: "   12345  \n  ",
+            expected: [
+                { v: "   ", token_type: "" },
+                { v: "12345", token_type: "number" },
+                { v: "  \n  ", token_type: "" },
+            ],
+        },
+    ];
+
+    for (const { name, code, expected } of cases) {
+        test(name, () => {
+            expect(lex(code)).toEqual(expected);
+        });
+    }
+});
+
--- a/src/lexer/lexer.ts
+++ b/src/lexer/lexer.ts
@ -0,0 +1,166 @@
+import { scan_identifier } from "./identifier_lexer";
+import { scan_number } from "./number_lexer";
+import { scan_string } from "./string_lexer";
+import { is_digit, is_lowercase, is_uppercase } from "./utils";
+
+/**
+ * A single piece of lexed source text together with its category.
+ */
+export type Token = {
+    /**
+     * The text of the token. For default tokens built by `lex`, every `<` of
+     * the source appears here as `&lt;`.
+     */
+    v: string,
+    /**
+     * The category of the token. The scanners in this directory produce
+     * "number", "identifier", "keyword", "class-name", "string" and "comment";
+     * `lex` uses the empty string for text that belongs to none of them.
+     */
+    token_type: string,
+};
+
+/**
+ * Lexes a string of THP code, and returns an array of tokens. Unlike a regular
+ * lexer, whitespace and other characters are not ignored, and are instead
+ * accumulated into a default token (a token whose `token_type` is empty).
+ *
+ * This lexer implements a subset of the grammar defined in the THP language specification,
+ * only recognizing the following tokens:
+ * - Identifier
+ * - Datatype
+ * - String
+ * - Number
+ * - Single line comment
+ * - Multi line comment
+ * - Keywords
+ *
+ * Inside default tokens every `<` is emitted as `&lt;`. The text of every
+ * other token is whatever its scanner returns.
+ *
+ * @param code Code to lex
+ * @param start Index in `code` at which lexing begins (defaults to 0)
+ * @returns An array of all the tokens found
+ */
+export function lex(code: string, start = 0): Array<Token> {
+    const code_len = code.length;
+    const tokens: Array<Token> = [];
+
+    let current_pos = start;
+    // Text that belongs to no recognized token, collected until a real token starts.
+    let current_default_token = "";
+
+    // Emits the pending default text (if any) as a token with an empty type.
+    const flush_default_token = (): void => {
+        if (current_default_token !== "") {
+            tokens.push({ v: current_default_token, token_type: "" });
+            current_default_token = "";
+        }
+    };
+
+    while (current_pos < code_len) {
+        const c = code[current_pos]!;
+        const following = code[current_pos + 1];
+
+        // Choose a scanner from the first character(s). It stays undefined
+        // when no recognized token starts at the current position.
+        let scanned: [Token, number] | undefined;
+
+        if (is_digit(c)) {
+            scanned = scan_number(code, current_pos);
+        }
+        else if (is_lowercase(c) || c === "_") {
+            // identifier or keyword
+            scanned = scan_identifier(code, current_pos);
+        }
+        else if (is_uppercase(c)) {
+            // datatype
+            scanned = scan_identifier(code, current_pos, true);
+        }
+        else if (c === "\"") {
+            scanned = scan_string(code, current_pos);
+        }
+        else if (c === "/" && following === "/") {
+            scanned = scan_line_comment(code, current_pos);
+        }
+        else if (c === "/" && following === "*") {
+            scanned = scan_block_comment(code, current_pos);
+        }
+
+        if (scanned !== undefined) {
+            // The default text seen so far comes before the new token.
+            flush_default_token();
+
+            const [token, next] = scanned;
+            tokens.push(token);
+            current_pos = next;
+            continue;
+        }
+
+        // Not the start of any token: keep the character as default text,
+        // replacing < with &lt;
+        current_default_token += c === "<" ? "&lt;" : c;
+        current_pos++;
+    }
+
+    // if there was a default token left over, push it to the tokens array
+    flush_default_token();
+
+    return tokens;
+}
+
+/**
+ * Scans a `//` comment that starts at `start`. The comment runs up to, but does
+ * not include, the next newline, or to the end of the input if there is none.
+ *
+ * @returns the "comment" token, and the position right after it
+ */
+function scan_line_comment(code: string, start: number): [Token, number] {
+    const newline = code.indexOf("\n", start);
+    const end = newline === -1 ? code.length : newline;
+
+    return [{ v: code.slice(start, end), token_type: "comment" }, end];
+}
+
+/**
+ * Scans a multi-line comment whose opening slash-star starts at `start`. The
+ * comment includes its closing star-slash; an unterminated comment runs to the
+ * end of the input.
+ *
+ * @returns the "comment" token, and the position right after it
+ */
+function scan_block_comment(code: string, start: number): [Token, number] {
+    // Look for the terminator only after the two-character opener. Otherwise
+    // the `*` of the opener could pair with a `/` right behind it, and the
+    // three characters `/*/` would be taken for a complete comment.
+    const close = code.indexOf("*/", start + 2);
+    const end = close === -1 ? code.length : close + 2;
+
+    return [{ v: code.slice(start, end), token_type: "comment" }, end];
+}
+
+
+
--- a/src/lexer/number_lexer.test.ts
+++ b/src/lexer/number_lexer.test.ts
@ -0,0 +1,19 @@
+import { expect, test, describe } from "vitest";
+import { scan_number } from "./number_lexer";
+
+describe("Number Lexer", () => {
+    // One row per test: its name, the source text, and the
+    // [token, next position] pair scan_number must return.
+    const cases: Array<{
+        name: string,
+        code: string,
+        expected: [{ v: string, token_type: string }, number],
+    }> = [
+        {
+            name: "should return a whole number token",
+            code: "1",
+            expected: [{ v: "1", token_type: "number" }, 1],
+        },
+        {
+            name: "should return a whole number token pt 2",
+            code: "12345",
+            expected: [{ v: "12345", token_type: "number" }, 5],
+        },
+    ];
+
+    for (const { name, code, expected } of cases) {
+        test(name, () => {
+            expect(scan_number(code, 0)).toEqual(expected);
+        });
+    }
+});
+
--- a/src/lexer/number_lexer.ts
+++ b/src/lexer/number_lexer.ts
@ -0,0 +1,47 @@
+import type { Token } from "./lexer";
+import { is_digit } from "./utils";
+
+/**
+ * Scans a number token from `input`, beginning at `pos`.
+ * The caller is expected to have checked that the character at `pos` is a
+ * digit. Reading the digits is delegated to `scan_decimal`.
+ *
+ * @param input the input string
+ * @param pos the position to start scanning from
+ * @returns the "number" token, and the position right after it
+ */
+export function scan_number(input: string, pos: number): [Token, number] {
+    const scanned = scan_decimal(input, pos);
+    const token: Token = { v: scanned[0], token_type: "number" };
+
+    return [token, scanned[1]];
+}
+
+/**
+ * Collects the run of digits that starts at `starting_position`.
+ * Scanning stops at the end of the input or at the first character that
+ * `is_digit` rejects. Fractional parts and exponents are not supported yet, so
+ * a `.`, `e` or `E` also ends the number and is left for the caller.
+ *
+ * @param input the input string
+ * @param starting_position the position of the first digit
+ * @returns the digits read, and the position of the first character not consumed
+ */
+function scan_decimal(input: string, starting_position: number): [string, number] {
+    let pos = starting_position;
+
+    while (pos < input.length) {
+        const c = input[pos]!;
+
+        // todo: fractional part (".") and exponent ("e" / "E").
+        // For now they end the number, exactly like any other non-digit.
+        if (c === "." || c === "e" || c === "E") {
+            break;
+        }
+        if (!is_digit(c)) {
+            break;
+        }
+
+        pos += 1;
+    }
+
+    // Everything between the start and the stop position is the number.
+    return [input.slice(starting_position, pos), pos];
+}
+
+
--- a/src/lexer/string_lexer.test.ts
+++ b/src/lexer/string_lexer.test.ts
@ -0,0 +1,32 @@
+import { expect, test, describe } from "vitest";
+import { scan_string } from "./string_lexer";
+
+describe("String Lexer", () => {
+    // One row per test: its name, the source text, and the
+    // [token, next position] pair scan_string must return.
+    const cases: Array<{
+        name: string,
+        code: string,
+        expected: [{ v: string, token_type: string }, number],
+    }> = [
+        {
+            name: "should scan an empty string",
+            code: "\"\"",
+            expected: [{ v: "\"\"", token_type: "string" }, 2],
+        },
+        {
+            name: "should scan a string with a single character",
+            code: "\"a\"",
+            expected: [{ v: "\"a\"", token_type: "string" }, 3],
+        },
+        {
+            name: "should scan a string with multiple characters",
+            code: "\"hello\"",
+            expected: [{ v: "\"hello\"", token_type: "string" }, 7],
+        },
+        {
+            name: "should scan a string with an escape character",
+            code: "\"\\n\"",
+            expected: [{ v: "\"\\n\"", token_type: "string" }, 4],
+        },
+    ];
+
+    for (const { name, code, expected } of cases) {
+        test(name, () => {
+            expect(scan_string(code, 0)).toEqual(expected);
+        });
+    }
+});
--- a/src/lexer/string_lexer.ts
+++ b/src/lexer/string_lexer.ts
@ -0,0 +1,49 @@
+import type { Token } from "./lexer";
+
+/**
+ * Scans a double-quoted string that starts at `starting_position`.
+ * The character at that position is assumed to be the opening quote and is not
+ * checked. The token text keeps both quotes and every escape sequence exactly
+ * as written in the source.
+ *
+ * Scanning ends after the closing quote. It also ends, without consuming
+ * anything further, at a newline or at the end of the input; in those cases
+ * the token holds the unterminated text read so far.
+ *
+ * @param input the input string
+ * @param starting_position the position of the opening quote
+ * @returns the "string" token, and the position right after it (never past the
+ *          end of the input for a valid `starting_position`)
+ */
+export function scan_string(input: string, starting_position: number): [Token, number] {
+    let value = "\"";
+    let pos = starting_position + 1;
+
+    while (pos < input.length) {
+        const c = input[pos]!;
+
+        if (c === "\"") {
+            value += c;
+            pos += 1;
+            break;
+        }
+        if (c === "\n") {
+            // todo: error handling, return an error indicator and the caller should render a red wavy underline
+            break;
+        }
+        if (c === "\\") {
+            const next_char = input[pos + 1];
+
+            if (next_char === undefined) {
+                // A backslash that is the very last character of the input has
+                // nothing to escape. Keep just the backslash, so that the token
+                // text does not pick up the word "undefined" and the returned
+                // position does not run past the end of the input.
+                value += c;
+                pos += 1;
+                break;
+            }
+
+            value += handle_escape_char(next_char);
+            pos += 2;
+            continue;
+        }
+
+        value += c;
+        pos += 1;
+    }
+
+    return [{ v: value, token_type: "string" }, pos];
+}
+
+/**
+ * Returns the text to store for an escape sequence: the backslash followed by
+ * the escaped character, unchanged. Known escapes (n, t, r, quote, backslash)
+ * and unknown ones are all kept verbatim.
+ *
+ * @param next_char the character that follows the backslash
+ */
+function handle_escape_char(next_char: string): string {
+    return "\\" + next_char;
+}
+
+