/**
 * Table-driven checks for `scan_identifier`.
 *
 * Every case scans `code` from position 0 and expects the returned tuple to
 * hold a token whose value is the whole input, and a next position equal to
 * the input length.
 */
describe("Identifier Lexer", () => {
    const cases = [
        { name: "should return an identifier token", code: "a", token_type: "identifier", is_datatype: false },
        { name: "should scan an underscore", code: "_", token_type: "identifier", is_datatype: false },
        { name: "should scan an identifier with an underscore", code: "a_", token_type: "identifier", is_datatype: false },
        { name: "should scan an identifier that starts with an underscore", code: "_a", token_type: "identifier", is_datatype: false },
        { name: "should scan an identifier with numbers and uppercase letters", code: "aA1", token_type: "identifier", is_datatype: false },
        { name: "should scan a keyword", code: "val", token_type: "keyword", is_datatype: false },
        { name: "should scan a datatype", code: "Int", token_type: "class-name", is_datatype: true },
    ];

    test.each(cases)("$name", ({ code, token_type, is_datatype }) => {
        // Only the datatype case passes the third argument, the rest rely on its default.
        const result = is_datatype
            ? scan_identifier(code, 0, true)
            : scan_identifier(code, 0);

        expect(result).toEqual([{ v: code, token_type }, code.length]);
    });
});
/**
 * Reserved words. A scanned word found here is tagged "keyword",
 * any other word is tagged "identifier".
 * Built once at module load instead of on every `check_keyword` call.
 */
const KEYWORDS: ReadonlySet<string> = new Set([
    "throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use",
    "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn",
    "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class",
    "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone",
]);

/**
 * Scans an identifier, at the given position in the input string.
 *
 * The character at `starting_position` is consumed without being checked:
 * the caller must already have decided that it can start an identifier
 * (a letter or an underscore). Scanning then continues for as long as
 * `is_identifier_char` accepts the next character.
 *
 * @param input the input string
 * @param starting_position the position to start scanning from
 * @param is_datatype whether the identifier is a datatype; when true the
 *        token is tagged "class-name" and no keyword lookup is done
 * @returns the scanned token, and the position right after its last character
 */
export function scan_identifier(input: string, starting_position: number, is_datatype = false): [Token, number] {
    // Skip the first character unconditionally (see the precondition above).
    let pos = starting_position + 1;

    while (pos < input.length && is_identifier_char(input[pos]!)) {
        pos += 1;
    }

    // Take the lexeme in a single slice instead of concatenating char by char.
    const value = input.slice(starting_position, pos);
    const token_type = is_datatype ? "class-name" : check_keyword(value);

    return [{ v: value, token_type }, pos];
}

/**
 * Classifies a scanned word.
 *
 * @param value the scanned word
 * @returns "keyword" if the word is reserved, "identifier" otherwise
 */
function check_keyword(value: string): string {
    return KEYWORDS.has(value) ? "keyword" : "identifier";
}
/**
 * A lexed piece of source code.
 * `v` is the exact source text; `token_type` is its category, with the empty
 * string used for "default" text that no scanner claimed.
 */
export type Token = {
    v: string,
    token_type: string,
};

/**
 * Lexes a string of THP code, and returns an array of tokens. Unlike a regular
 * lexer, whitespace and other characters are not ignored, and are instead
 * accumulated into a default token (`token_type: ""`).
 *
 * This lexer implements a subset of the grammar defined in the THP language
 * specification, only recognizing the following tokens:
 * - Identifier
 * - Datatype
 * - String
 * - Number
 * - Single line comment
 * - Multi line comment
 * - Keywords
 *
 * @param code Code to lex
 * @param start Position in `code` to start lexing from
 * @returns An array of all the tokens found, in source order
 */
export function lex(code: string, start = 0): Array<Token> {
    const code_len = code.length;
    const tokens: Array<Token> = [];

    let current_pos = start;
    let current_default_token = "";

    // Emits the pending run of unclaimed characters, if any, as a default token.
    const flush_default_token = (): void => {
        if (current_default_token !== "") {
            tokens.push({ v: current_default_token, token_type: "" });
            current_default_token = "";
        }
    };

    while (current_pos < code_len) {
        const c = code[current_pos]!;
        let scanned: [Token, number] | undefined;

        // Exact-character checks go first. They cannot overlap with the
        // character-class checks below, assuming the `utils` predicates only
        // accept digits and letters.
        if (c === "\"") {
            scanned = scan_string(code, current_pos);
        }
        else if (c === "/" && code[current_pos + 1] === "/") {
            scanned = scan_line_comment(code, current_pos);
        }
        else if (c === "/" && code[current_pos + 1] === "*") {
            scanned = scan_block_comment(code, current_pos);
        }
        else if (is_digit(c)) {
            scanned = scan_number(code, current_pos);
        }
        // identifier/keyword
        else if (is_lowercase(c) || c === "_") {
            scanned = scan_identifier(code, current_pos);
        }
        // datatype
        else if (is_uppercase(c)) {
            scanned = scan_identifier(code, current_pos, true);
        }

        if (scanned === undefined) {
            // NOTE(review): there used to be a dedicated branch for "<" here that
            // appended "<" unchanged, i.e. exactly what this fall-through does.
            // If HTML escaping ("&lt;") was intended, it has to be added here.
            current_default_token += c;
            current_pos++;
            continue;
        }

        // A real token starts here: the default text before it comes first.
        flush_default_token();

        const [token, next] = scanned;
        tokens.push(token);
        current_pos = next;
    }

    // Trailing default text, if any.
    flush_default_token();

    return tokens;
}

/**
 * Scans a `//` comment starting at `start`. The comment runs up to, but not
 * including, the next line feed (or to the end of the input).
 */
function scan_line_comment(code: string, start: number): [Token, number] {
    const newline = code.indexOf("\n", start);
    const end = newline === -1 ? code.length : newline;

    return [{ v: code.slice(start, end), token_type: "comment" }, end];
}

/**
 * Scans a multi line comment whose opening slash-star is at `start`. The
 * closing star-slash is part of the token; an unterminated comment takes the
 * rest of the input.
 */
function scan_block_comment(code: string, start: number): [Token, number] {
    // Search after the two opening characters, otherwise the "*" of the opener
    // followed by a "/" would be taken as the terminator.
    const terminator = code.indexOf("*/", start + 2);
    const end = terminator === -1 ? code.length : terminator + 2;

    return [{ v: code.slice(start, end), token_type: "comment" }, end];
}
/**
 * Scans a number, at the given position in the input string.
 * This function assumes that the character at the given position is a digit.
 *
 * Only a run of digits (as decided by `is_digit`) is recognized for now;
 * fractional parts and exponents are not implemented yet.
 *
 * @param input the input string
 * @param pos the position to start scanning from
 * @returns a "number" token, and the position right after its last digit
 */
export function scan_number(input: string, pos: number): [Token, number] {
    const [token_value, next] = scan_decimal(input, pos);

    return [{ v: token_value, token_type: "number" }, next];
}

/**
 * Consumes consecutive digits starting at `starting_position`.
 *
 * @returns the digits consumed (possibly empty), and the position of the
 *          first character that was not consumed
 */
function scan_decimal(input: string, starting_position: number): [string, number] {
    let pos = starting_position;

    // TODO: fractional part (".") and exponent ("e" / "E"). Until then those
    // characters end the number like any other non-digit.
    while (pos < input.length && is_digit(input[pos]!)) {
        pos += 1;
    }

    return [input.slice(starting_position, pos), pos];
}
/**
 * Scans a double quoted string, at the given position in the input string.
 * This function assumes that the character at the given position is the
 * opening quote; it is not checked.
 *
 * The token value is the source text of the string, quotes and escape
 * sequences included. Scanning stops:
 * - after the closing quote,
 * - before a line feed (unterminated string; the line feed is not consumed),
 * - at the end of the input.
 *
 * @param input the input string
 * @param starting_position the position of the opening quote
 * @returns a "string" token, and the position right after the consumed text
 */
export function scan_string(input: string, starting_position: number): [Token, number] {
    let value = "\"";
    let pos = starting_position + 1;

    while (pos < input.length) {
        const c = input[pos]!;

        if (c === "\"") {
            value += c;
            pos += 1;
            break;
        }
        if (c === "\n") {
            // todo: error handling, return an error indicator and the caller should render a red wavy underline
            break;
        }
        if (c === "\\") {
            const next_char = input[pos + 1];

            if (next_char === undefined) {
                // A backslash at the very end of the input: keep it as is.
                // Without this guard the text "undefined" would leak into the
                // token and the returned position would be past the input.
                value += c;
                pos += 1;
                break;
            }

            value += handle_escape_char(next_char);
            pos += 2;
            continue;
        }

        value += c;
        pos += 1;
    }

    return [{ v: value, token_type: "string" }, pos];
}

/**
 * Returns the source text of an escape sequence: a backslash followed by the
 * escaped character. Known (`n`, `t`, `r`, `"`, `\`) and unknown escapes are
 * rendered the same way, nothing is decoded.
 *
 * @param next_char the character that follows the backslash
 */
function handle_escape_char(next_char: string): string {
    return "\\" + next_char;
}