diff --git a/src/components/Code.astro b/src/components/Code.astro index 4e7ed07..709f5e9 100644 --- a/src/components/Code.astro +++ b/src/components/Code.astro @@ -1,10 +1,11 @@ --- import { native_highlighter } from "../lexer/highlighter"; +import type { HighlightLevel } from "../lexer/types"; import CodeError from "./docs/CodeError.astro"; -const { thpcode, no_warnings } = Astro.props; +const { thpcode, no_warnings, level } = Astro.props; -const [native_html, error_type, error_message] = await native_highlighter(thpcode); +const [native_html, error_type, error_message] = await native_highlighter(thpcode, level as HighlightLevel); ---
): string { - let outLines: Array= []; - - for (const [idx, line] of lines.entries()) { - const tokens = lex(line); - const lineArray = [ - ` `, - ]; - - for (const token of tokens) { - if (token.token_type !== "") { - lineArray.push( - `${token.v}`, - ); - } else { - lineArray.push(token.v); - } - } - lineArray.push(""); - - outLines.push(lineArray.join("")); - } - - return outLines.join("\n"); -} - -const codeHtml = highlightCode(leftTrimDedent(code)); -let instructionSet: Array>; -try { - instructionSet = parse_str(steps); -} catch (e) { - console.error(Astro.url); - throw e; -} - -const serialized_inst = JSON.stringify(instructionSet); +import Code from "./Code.astro"; +const { code } = Astro.props; +// TODO: Delete this component, replace with Code --- - - thp code -- - +-----stdout------state-- ---- -- - --diff --git a/src/components/Navbar.astro b/src/components/Navbar.astro index 9e51092..8b9c34a 100644 --- a/src/components/Navbar.astro +++ b/src/components/Navbar.astro @@ -30,12 +30,6 @@ const { showSidebarButton = true } = Astro.props; > Learn - - How to guides - , Array
], - SyntaxOnly?: [Token[], Err], - TokensOnly?: [Token[], Err], - Err?: Err, -} +import { HighlightLevel } from "./types"; +import type { LexError, SyntaxError, SemanticError, Token, TokenizeResult, TokenType } from "./types"; const error_classes = "underline underline-offset-4 decoration-wavy decoration-red-500"; -export async function native_highlighter(code: string): Promise<[string, string, string | null]> { +export async function native_highlighter(code: string, level = HighlightLevel.Lexic): Promise<[string, string, string | null]> { let formatted_code = leftTrimDedent(code).join("\n"); - let result: TokenizeResult; try { - result = await native_lex(formatted_code); + let result = await native_lex(formatted_code); + return highlight_syntax(formatted_code, result, level); } catch (error) { return compiler_error(formatted_code, error as Error); } +} - if (result.Err) { - return lex_error_highlighter(formatted_code, result.Err!.Lex!); - } - else if (result.TokensOnly) { - const [tokens, error] = result.TokensOnly!; - return syntax_error_highlighter(formatted_code, tokens, error.Syntax!); - } - else if (result.SyntaxOnly) { - const [tokens, error] = result.SyntaxOnly!; - return semantic_error_highlighter(formatted_code, tokens, error.Semantic!); +function highlight_syntax(code: string, result: TokenizeResult, level: HighlightLevel): [string, string, string | null] { + let tokens_final: Array ; + + if (result.SemanticError) { + const [tokens, semanticError] = result.SemanticError; + + if (level === HighlightLevel.Semantic) { + return semantic_error_highlighter(code, tokens, semanticError.Semantic!); + } else { + tokens_final = tokens; + } + } else if (result.SyntaxError) { + const [tokens, syntaxError] = result.SyntaxError; + + if (level === HighlightLevel.Semantic || level === HighlightLevel.Syntactic) { + return syntax_error_highlighter(code, tokens, syntaxError.Syntax!); + } else { + tokens_final = tokens; + } + } else if (result.LexError) { + // There is no error level that bypasses a lex error + return lex_error_highlighter(code, result.LexError!.Lex!); + } else if (result.Ok) { + tokens_final = result.Ok; + } else { + console.error(result); + throw new Error("Web page error: The compiler returned a case that wasn't handled."); } - const tokens = result.Ok! as unknown as Array ; - // TODO: this is disable because the compiler has not - // implemented this feature yet - // const [tokens, references] = result.Ok!; - // console.log("refs:"); - // console.log(references); - - const output = highlight_tokens(formatted_code, tokens); + // At this point all error cases have been handled + // and tokens_final contains valid tokens. + const output = highlight_tokens(code, tokens_final); return [output, "", null]; } diff --git a/src/lexer/identifier_lexer.test.ts b/src/lexer/identifier_lexer.test.ts deleted file mode 100644 index 44ecd37..0000000 --- a/src/lexer/identifier_lexer.test.ts +++ /dev/null @@ -1,55 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { scan_identifier } from "./identifier_lexer"; - - -describe("Identifier Lexer", () => { - test("should return an identifier token", () => { - const code = "a"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "a", token_type: "identifier" }, 1]); - }); - - test("should scan an underscore", () => { - const code = "_"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "_", token_type: "identifier" }, 1]); - }); - - test("should scan an identifier with an underscore", () => { - const code = "a_"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "a_", token_type: "identifier" }, 2]); - }); - - test("should scan an identifier that starts with an underscore", () => { - const code = "_a"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "_a", token_type: "identifier" }, 2]); - }); - - test("should scan an identifier with numbers and uppercase letters", () => { - const code = "aA1"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "aA1", token_type: "identifier" }, 3]); - }); - - test("should scan a keyword", () => { - const code = "val"; - const token = scan_identifier(code, 0); - - expect(token).toEqual([{ v: "val", token_type: "keyword" }, 3]); - }); - - test("should scan a datatype", () => { - const code = "Int"; - const token = scan_identifier(code, 0, true); - - expect(token).toEqual([{ v: "Int", token_type: "class-name" }, 3]); - }); -}); - diff --git a/src/lexer/identifier_lexer.ts b/src/lexer/identifier_lexer.ts deleted file mode 100644 index a531c52..0000000 --- a/src/lexer/identifier_lexer.ts +++ /dev/null @@ -1,44 +0,0 @@ -import type { Token } from "./lexer"; -import { is_identifier_char } from "./utils"; - -/** - * Scans an identifier, at the given position in the input string. - * This function assumes that the character at the given position is a letter. - * - * @param input the input string - * @param starting_position the position to start scanning from - * @param is_datatype whether the identifier is a datatype - */ -export function scan_identifier(input: string, starting_position: number, is_datatype = false): [Token, number] { - let value = input[starting_position]!; - let pos = starting_position + 1; - - while (pos < input.length) { - const c = input[pos]!; - - if (is_identifier_char(c)) { - pos += 1; - value += c; - } - else { - break; - } - } - - if (is_datatype) { - return [{ v: value, token_type: "class-name" }, pos]; - } - else { - return [{ v: value, token_type: check_keyword(value) }, pos]; - } -} - -function check_keyword(value: string): string { - const keywords = ["throws", "extends", "constructor", "case", "static", "const", "enum", "union", "loop", "use", "break", "catch", "continue", "as", "do", "else", "finally", "for", "fun", "if", "in", "fn", "nil", "return", "throw", "try", "while", "type", "match", "with", "of", "abstract", "class", "interface", "private", "pub", "override", "open", "init", "val", "var", "mut", "clone"]; - - if (keywords.includes(value)) { - return "keyword"; - } - return "identifier"; -} - diff --git a/src/lexer/lexer.test.ts b/src/lexer/lexer.test.ts deleted file mode 100644 index a4c9461..0000000 --- a/src/lexer/lexer.test.ts +++ /dev/null @@ -1,45 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { lex } from "./lexer"; - -describe("Lexer", () => { - test("empty program should return no tokens", () => { - const code = ""; - const tokens = lex(code); - expect(tokens).toEqual([]); - }); - - test("program with whitespace should return a single token", () => { - const code = " "; - const tokens = lex(code); - expect(tokens).toEqual([{v: " ", token_type: ""}]); - }) - - test("program with newlines should return a single token", () => { - const code = "\n"; - const tokens = lex(code); - expect(tokens).toEqual([{v: "\n", token_type: ""}]); - }); - - test("program with random unicode should return the same unicode", () => { - const code = "🍕"; - const tokens = lex(code); - expect(tokens).toEqual([{v: "🍕", token_type: ""}]); - }); - - test("should scan integers", () => { - const code = "12345"; - const tokens = lex(code); - expect(tokens).toEqual([{v: "12345", token_type: "number"}]); - }); - - test("should scan integers and whitespace around", () => { - const code = " 12345 \n "; - const tokens = lex(code); - expect(tokens).toEqual([ - {v: " ", token_type: ""}, - {v: "12345", token_type: "number"}, - {v: " \n ", token_type: ""}, - ]); - }); -}); - diff --git a/src/lexer/lexer.ts b/src/lexer/lexer.ts deleted file mode 100644 index 97a13dd..0000000 --- a/src/lexer/lexer.ts +++ /dev/null @@ -1,166 +0,0 @@ -import { scan_identifier } from "./identifier_lexer"; -import { scan_number } from "./number_lexer"; -import { scan_string } from "./string_lexer"; -import { is_digit, is_lowercase, is_uppercase } from "./utils"; - -export type Token = { - v: string, - token_type: string, -}; - -/** - * Lexes a string of THP code, and returns an array of tokens. Unlike a regular - * lexer, whitespace and other characters are not ignored, and are instead treated - * as a default token. - * - * This lexer implements a subset of the grammar defined in the THP language specification, - * only recognizing the following tokens: - * - Identifier - * - Datatype - * - String - * - Number - * - Single line comment - * - Multi line comment - * - Keywords - * - * @param code Code to lex - * @returns An array of all the tokens found - */ -export function lex(code: string, start = 0): Array { - const code_len = code.length; - const tokens: Array = []; - - let current_pos = start; - let current_default_token = ""; - - while (current_pos < code_len) { - const c = code[current_pos]!; - - // try to scan a number - if (is_digit(c)) { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - // lex a number - const [token, next] = scan_number(code, current_pos); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan an identifier/keyword - else if (is_lowercase(c) || c === "_") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - const [token, next] = scan_identifier(code, current_pos); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan a datatype - else if (is_uppercase(c)) { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - const [token, next] = scan_identifier(code, current_pos, true); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan a string - else if (c === "\"") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - const [token, next] = scan_string(code, current_pos); - current_pos = next; - tokens.push(token); - continue; - } - // try to scan a comment - else if (c === "/" && code[current_pos + 1] === "/") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - let comment = ""; - let pos = current_pos; - - while (pos < code_len) { - const char = code[pos]; - - if (char === "\n") { - break; - } - - comment += char; - pos++; - } - - tokens.push({ v: comment, token_type: "comment" }); - current_pos = pos; - continue; - } - // try to scan a multiline comment - else if (c === "/" && code[current_pos + 1] === "*") { - // if the current default token is not empty, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - let comment = ""; - let pos = current_pos; - - while (pos < code_len) { - const char = code[pos]; - - if (char === "*" && code[pos + 1] === "/") { - pos += 2; - comment += "*/"; - break; - } - - comment += char; - pos++; - } - - tokens.push({ v: comment, token_type: "comment" }); - current_pos = pos; - continue; - } - // replace < with < - else if (c === "<") { - current_default_token += "<"; - current_pos++; - continue; - } - - current_default_token += c; - current_pos++; - } - - // if there was a default token, push it to the tokens array - if (current_default_token !== "") { - tokens.push({ v: current_default_token, token_type: "" }); - current_default_token = ""; - } - - return tokens; -} - - diff --git a/src/lexer/number_lexer.test.ts b/src/lexer/number_lexer.test.ts deleted file mode 100644 index d8fa634..0000000 --- a/src/lexer/number_lexer.test.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { scan_number } from "./number_lexer"; - -describe("Number Lexer", () => { - test("should return a whole number token", () => { - const code = "1"; - const token = scan_number(code, 0); - - expect(token).toEqual([{ v: "1", token_type: "number" }, 1]); - }); - - test("should return a whole number token pt 2", () => { - const code = "12345"; - const token = scan_number(code, 0); - - expect(token).toEqual([{ v: "12345", token_type: "number" }, 5]); - }); -}); - diff --git a/src/lexer/number_lexer.ts b/src/lexer/number_lexer.ts deleted file mode 100644 index 01d4880..0000000 --- a/src/lexer/number_lexer.ts +++ /dev/null @@ -1,47 +0,0 @@ -import type { Token } from "./lexer"; -import { is_digit } from "./utils"; - -/** - * Scans a number, at the given position in the input string. - * This function assumes that the character at the given position is a digit. - * It follows this grammar: - * - * @param input the input string - * @param pos the position to start scanning from - * @returns - */ -export function scan_number(input: string, pos: number): [Token, number] { - const [token_value, next] = scan_decimal(input, pos); - - return [{ v: token_value, token_type: "number" }, next]; -} - -function scan_decimal(input: string, starting_position: number): [string, number] { - let current_value = ""; - let pos = starting_position; - - while (pos < input.length) { - const c = input[pos]!; - - if (c === ".") { - // todo - return [current_value, pos]; - } - else if (c == "e" || c == "E") { - // todo - return [current_value, pos]; - } - else if (is_digit(c)) { - current_value += c; - pos += 1; - } - else { - break; - } - - } - - return [current_value, pos]; -} - - diff --git a/src/lexer/string_lexer.test.ts b/src/lexer/string_lexer.test.ts deleted file mode 100644 index 7e42890..0000000 --- a/src/lexer/string_lexer.test.ts +++ /dev/null @@ -1,32 +0,0 @@ -import { expect, test, describe } from "vitest"; -import { scan_string } from "./string_lexer"; - -describe("String Lexer", () => { - test("should scan an empty string", () => { - const code = "\"\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"\"", token_type: "string" }, 2]); - }); - - test("should scan a string with a single character", () => { - const code = "\"a\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"a\"", token_type: "string" }, 3]); - }); - - test("should scan a string with multiple characters", () => { - const code = "\"hello\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"hello\"", token_type: "string" }, 7]); - }); - - test("should scan a string with an escape character", () => { - const code = "\"\\n\""; - const token = scan_string(code, 0); - - expect(token).toEqual([{ v: "\"\\n\"", token_type: "string" }, 4]); - }); -}); diff --git a/src/lexer/string_lexer.ts b/src/lexer/string_lexer.ts deleted file mode 100644 index b5e75bd..0000000 --- a/src/lexer/string_lexer.ts +++ /dev/null @@ -1,49 +0,0 @@ -import type { Token } from "./lexer"; - -export function scan_string(input: string, starting_position: number): [Token, number] { - let value = "\""; - let pos = starting_position + 1; - - while (pos < input.length) { - const c = input[pos]; - - if (c === "\"") { - value += c; - pos += 1; - break; - } - if (c === "\n") { - // todo: error handling, return an error indicator and the caller should render a red wavy underline - break; - } - if (c === "\\") { - const next_char = input[pos + 1]; - value += handle_escape_char(next_char); - pos += 2; - continue; - } - - value += c; - pos += 1; - } - - return [{ v: value, token_type: "string" }, pos]; -} - -function handle_escape_char(next_char: string): string { - switch (next_char) { - case "n": - return "\\n" - case "t": - return "\\t" - case "r": - return "\\r" - case "\"": - return "\\\"" - case "\\": - return "\\\\" - default: - return "\\" + next_char - } -} - diff --git a/src/lexer/types.ts b/src/lexer/types.ts new file mode 100644 index 0000000..d4cb53f --- /dev/null +++ b/src/lexer/types.ts @@ -0,0 +1,75 @@ +export type ReferenceItem = { + symbol_start: number + symbol_end: number + reference: string +} + +export interface Token { + token_type: TokenType + value: string + position: number +} + +export type TokenType = + "Identifier" | + "Datatype" | + "Int" | + "Float" | + "String" | + "Operator" | + "LeftParen" | + "RightParen" | + "LeftBracket" | + "RightBracket" | + "LeftBrace" | + "RightBrace" | + "NewLine" | + "Comment" | + "MultilineComment" | + "Comma" | + "INDENT" | + "DEDENT" | + "VAL" | + "VAR" | + "EOF" | + "FUN"; + +export interface Err { + Lex?: LexError + Syntax?: SyntaxError + Semantic?: SemanticError +} + +export interface LexError { + position: number + reason: string +} + +export interface SyntaxError { + error_start: number + error_end: number + reason: string +} + +export interface SemanticError { + error_start: number + error_end: number + reason: string +} + +export interface TokenizeResult { + /** All checks passed */ + Ok?: Array , + /** There were semantic errors */ + SemanticError?: [Array , Err], + /** There were syntax errors */ + SyntaxError?: [Array , Err], + /** No checks passed */ + LexError?: Err, +} + +export enum HighlightLevel { + Lexic = 0, + Syntactic = 1, + Semantic = 2, +} \ No newline at end of file diff --git a/src/pages/learn/basics/variables.mdx b/src/pages/learn/basics/variables.mdx index f41ac91..7f222b0 100644 --- a/src/pages/learn/basics/variables.mdx +++ b/src/pages/learn/basics/variables.mdx @@ -24,7 +24,7 @@ As a regex: `[a-z_][a-zA-Z0-9_]*` Defined with `val`, followed by a variable name and a value. - @@ -33,14 +33,14 @@ val year_of_birth = 1984 Written after the `val` keyword but before the variable name. -
When annotating an immutable variable the `val` keyword is optional -
@@ -61,7 +61,7 @@ String capital = 123.456 Defined with `var`, followed by a variable name and a value. -
@@ -70,14 +70,14 @@ var age = 32 Written after the `var` keywords but before the variable name. -
When annotating a mutable variable the keyword `var` is still **required**. -
{ - const characters = input.split(""); - const characters_len = characters.length; - let next_p = 0; - - const tokens: Array
= []; - - while (next_p < characters_len) - { - const c = characters[next_p]!; - - // word - if (is_lowercase(c) || is_uppercase(c)) - { - const [token, next] = lex_word(characters, next_p); - tokens.push(token); - next_p = next; - } - // number - else if (is_digit(c)) - { - const [token, next] = scan_number(input, next_p); - tokens.push([TokenType.Number, token.v]); - next_p = next; - } - // string - else if (c === "\"") - { - const [token, next] = scan_string(input, next_p); - tokens.push([TokenType.String, token.v]); - next_p = next; - } - else if (c === "{") - { - tokens.push([TokenType.BraceOpen, undefined]); - next_p += 1; - } - else if (c === "}") - { - tokens.push([TokenType.BraceClose, undefined]); - next_p += 1; - } - else if (c === " " || c === "\n") - { - next_p += 1; - } - else - { - throw new Error(`Invalid character: \`${c}\``); - } - } - - return tokens; -} - -function lex_word(input: Array , pos: number): [Token, number] { - let next_p = pos; - let value = ""; - - let c = input[next_p]; - while (c !== undefined && (is_lowercase(c) || is_uppercase(c) || is_digit(c) || c === "_")) - { - value += c; - next_p += 1; - c = input[next_p]; - } - - let token_type; - if (value === "step") { token_type = TokenType.Step; } - else if (value === "line") { token_type = TokenType.Line; } - else if (value === "set") { token_type = TokenType.Set; } - else if (value === "unset"){ token_type = TokenType.Unset; } - else if (value === "out") { token_type = TokenType.Out; } - else - { - throw new Error(`Invalid word: ${value}`); - } - - return [[token_type, value], next_p] -} - -export enum InstructionType { - Line, - Set, - Unset, - Out, -} - -export type Instruction = { - t: InstructionType, - v0: string, - v1?: string, -} - -export function parse_str(input: string): Array > { - return parse(lex(input)); -} - -// Parses the tokens into a instruction set -function parse(tokens: Array ): Array > { - let pos = 0; - let max = tokens.length; - - const ret = []; - - while (pos < max) { - const [steps, next_pos] = parse_step(tokens, pos); - pos = next_pos; - ret.push(steps); - } - - return ret; -} - -function parse_step(tokens: Array , _pos: number): [Array , number] { - let pos = _pos; - - expect(tokens, pos, TokenType.Step, "expected step"); - pos += 1; - expect(tokens, pos, TokenType.BraceOpen, "expected opening brace"); - pos += 1; - - const instructions = []; - - while (true) { - const [inst, next] = parse_instruction(tokens, pos); - if (inst === null) { - break; - } - instructions.push(inst); - pos = next; - } - - expect(tokens, pos, TokenType.BraceClose, "expected closing brace"); - pos += 1 - - return [instructions, pos]; -} - -function parse_instruction(tokens: Array , _pos: number): [Instruction|null, number] { - let pos = _pos; - - let instruction_type = tokens[pos]![0]; - if (instruction_type === TokenType.Line) { - pos += 1; - expect(tokens, pos, TokenType.Number, "expected a number after the `line` instruction"); - return [{ - t: InstructionType.Line, - v0: tokens[pos]![1]!, - }, pos + 1] - } - else if (instruction_type === TokenType.Set) { - pos += 1; - expect(tokens, pos, TokenType.String, "expected a string after the `set` instruction"); - pos += 1; - expect(tokens, pos, TokenType.String, "expected a second string after the `set` instruction"); - - return [{ - t: InstructionType.Set, - v0: tokens[pos - 1]![1]!, - v1: tokens[pos]![1]!, - }, pos + 1] - } - else if (instruction_type === TokenType.Unset) { - expect(tokens, pos + 1, TokenType.String, "expected a a string after the `unset` instruction"); - - return [{ - t: InstructionType.Unset, - v0: tokens[pos + 1]![1]!, - }, pos + 2] - } - else if (instruction_type === TokenType.Out) { - expect(tokens, pos + 1, TokenType.String, "expected a a string after the `unset` instruction"); - - return [{ - t: InstructionType.Out, - v0: tokens[pos + 1]![1]!, - }, pos + 2] - } - - return [null, pos]; -} - -function expect(t: Array , pos: number, type: TokenType, err: string) { - const [t_type] = t[pos]!; - if (t_type !== type) { - console.error("`" + String(t[pos]) + "`"); - throw new Error(err + " , got " + t[pos]); - } -} -