diff --git a/src/pages/learn/flow-control/ b/src/pages/learn/flow-control/ index 21afdee..1aa9737 100644 --- a/src/pages/learn/flow-control/ +++ b/src/pages/learn/flow-control/ @@ -1,5 +1,5 @@ --- -layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Blocks --- diff --git a/src/pages/learn/flow-control/ b/src/pages/learn/flow-control/ index 369ff8d..3fabea4 100644 --- a/src/pages/learn/flow-control/ +++ b/src/pages/learn/flow-control/ @@ -1,5 +1,5 @@ --- -layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Conditionals --- diff --git a/src/pages/learn/flow-control/ b/src/pages/learn/flow-control/ index 138404a..34d6eb3 100644 --- a/src/pages/learn/flow-control/ +++ b/src/pages/learn/flow-control/ @@ -1,5 +1,5 @@ --- -layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Loops --- diff --git a/src/pages/learn/flow-control/ b/src/pages/learn/flow-control/ index 962c3a9..6419f81 100644 --- a/src/pages/learn/flow-control/ +++ b/src/pages/learn/flow-control/ @@ -1,5 +1,5 @@ --- -layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Match --- diff --git a/src/pages/learn/functions/ b/src/pages/learn/functions/ index f08b659..ba2c1ae 100644 --- a/src/pages/learn/functions/ +++ b/src/pages/learn/functions/ @@ -1,5 +1,5 @@ --- -layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Declaration --- diff --git a/src/pages/learn/functions/ b/src/pages/learn/functions/ index b6112ed..4b24183 100644 --- a/src/pages/learn/functions/ +++ b/src/pages/learn/functions/ @@ -1,5 +1,5 @@ --- -layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Higher Order Functions --- diff --git a/src/pages/learn/functions/ b/src/pages/learn/functions/ index 02a31bd..03d1665 100644 --- a/src/pages/learn/functions/ +++ b/src/pages/learn/functions/ @@ -1,5 +1,5 @@ --- 
-layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Lambdas --- diff --git a/src/pages/learn/functions/ b/src/pages/learn/functions/ index 8467517..f47bd6c 100644 --- a/src/pages/learn/functions/ +++ b/src/pages/learn/functions/ @@ -1,5 +1,5 @@ --- -layout: ../../../layouts/PagesLayout.astro +layout: ../../../layouts/DocsLayout.astro title: Function parameters --- diff --git a/src/pages/learn/index.mdx b/src/pages/learn/index.mdx index 8b0ea48..35a43d1 100644 --- a/src/pages/learn/index.mdx +++ b/src/pages/learn/index.mdx @@ -1,5 +1,5 @@ --- -layout: ../../layouts/PagesLayout.astro +layout: ../../layouts/DocsLayout.astro title: Welcome pagesLayout: - path: index diff --git a/src/pages/learn/ b/src/pages/learn/ index 8d54dfd..8505a34 100644 --- a/src/pages/learn/ +++ b/src/pages/learn/ @@ -1,5 +1,5 @@ --- -layout: ../../layouts/PagesLayout.astro +layout: ../../layouts/DocsLayout.astro title: Install --- diff --git a/src/pages/spec/ast/ b/src/pages/spec/ast/ new file mode 100644 index 0000000..223eb20 --- /dev/null +++ b/src/pages/spec/ast/ @@ -0,0 +1,36 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: AST +--- + +# THP AST + +Created during the syntax analysis phase, from the stream of +tokens produced by the lexical analysis phase. + +## File and modules + +Every file has its own AST, and every file is a module. 
+ +```ebnf +AST = Module + +Module = (Statement | Expression)* +``` + +## Statement + +(At the moment) a statement is either a variable binding or a function declaration + +```ebnf +Statement = VariableBinding + | FunctionDeclaration +``` + +## Expression + +See the Expression section + + + + diff --git a/src/pages/spec/ b/src/pages/spec/ new file mode 100644 index 0000000..a6f40b9 --- /dev/null +++ b/src/pages/spec/ @@ -0,0 +1,129 @@ +--- +layout: ../../layouts/SpecLayout.astro +title: Welcome +pagesLayout: +- path: index +- path: tokens + title: Tokens + children: + - path: tokens + - path: numbers + - path: identifier + - path: string + - path: comments + - path: operator + - path: grouping + - path: newline +- path: ast + title: THP AST + children: + - path: ast +--- + + +# The THP Language Specification + +This series of pages defines the THP Programming Language. + +THP's grammar is context-dependent. + +The syntax is specified using a weird mix of Extended Backus Naur Form +and RegExp: + +```abnf +; comments + +syntax = concatenation +concatenation = alternation grouping + +alternation = "a" | "b" + | "c" +grouping = ("a", "b") + +optional = "a"? +one_or_more = "a"+ +zero_or_more = "a"* + +range = "1".."9" +literal = "a" +``` + +## Compiler architecture + +The compiler consists of 5 common phases: + +- **Lexical Analysis**: Transforms the source code into tokens +- **Syntactic Analysis**: Parses the tokens and generates an AST +- **Semantic Analysis**: Checks the AST structure and performs type checking +- **IR**: Transforms the THP AST into a PHP AST +- **Codegen**: Generates PHP source code from the PHP AST + + + +## Source Code representation + +Source code is encoded in UTF-8, and a single UTF-8 codepoint is +a single character. As THP is implemented using the Rust programming +language, rules around Rust's UTF-8 usage are followed. 
+ + +## Basic characters + +Although the source code must be encoded in UTF-8, most of the actual +source code will use only the basic 128 ASCII characters. String contents may +contain any Unicode code point. + +```abnf +underscore = "_" + +decimal_digit = "0".."9" +binary_digit = "0" | "1" +octal_digit = "0".."7" +hex_digit = decimal_digit | "a".."f" | "A".."F" + +lowercase_letter = "a".."z" +uppercase_letter = "A".."Z" +``` + + + + +## Whitespace + +THP is partially whitespace sensitive. It uses the following tokens: Indent, Dedent & NewLine +to determine when an expression spans multiple lines. + +The lexer stores the indentation level of every line, and when scanning the next line, +compares the previous indentation to the new one. If the amount of whitespace is +greater than before, it emits an Indent token. If it's lower, it emits a Dedent token, and +if it's the same it does nothing. + + +```thp +1 + 2 + + 3 + + 4 +``` + +The previous code would emit the following tokens: `1` `+` `2` `NewLine` `Indent` `+` `3` `NewLine` +`+` `4` `Dedent` + + +Additionally, it is a lexical error to have wrong indentation. The lexer stores all +previous indentation levels in a stack, and reports an error if a decrease in indentation +doesn't match a previous level. + +```thp +if true { // 0 indentation + print() // 4 indentation + print() // 2 indentation. Error. There is no 2-indentation level +} +``` + +These tokens are used to detect when an expression is done, instead of relying on +semicolons. This is performed by the parser. + +Every other production of the grammar doesn't care about whitespace/indentation, so +those ignore whitespace. 
+ + diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..56fc41c --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,16 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: Comment +--- + +# Comment + +```ebnf +Comment = "//", any_except_new_line +``` + +```thp +// This is a comment +// +// Another // comment +``` diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..db54ffa --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,17 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: Grouping signs +--- + +# Grouping signs + +Each grouping sign has its own token: + +```ebnf +LeftParen = "(" +RightParen = ")" +LeftBracket = "[" +RightBracket = "]" +LeftBrace = "{" +RightBrace = "}" +``` diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..0788dc3 --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,54 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: Identifiers & Datatypes +--- + +# Identifiers & Datatypes + +Upper and lowercase letters carry different meaning when at the start of a word. + +A Datatype must always start with an uppercase letter, and an identifier must start +with either a lowercase letter or an underscore. + +## Identifier + +```ebnf +Identifier = (underscore | lowercase_letter), identifier_letter* + +identifier_letter = underscore | lowercase_letter | uppercase_letter | decimal_digit +``` + +```thp +identifier +_identifier +_123 +_many_letters +camelCase +``` + + +## Datatype + +```ebnf +Datatype = uppercase_letter, identifier_letter* +``` + +```thp +Datatype +PDO +WEIRD_DATATYPE +``` + + +## Keywords + +The following are (currently) THP keywords: + +```thp +val var fun +``` + +Keywords are scanned first as identifiers, then transformed +to their respective tokens. 
+ + diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..cd35b5c --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,15 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: New line +--- + +# New line + +When there are multiple empty lines, only a single NewLine token +is emitted. + +```ebnf +NewLine = "\n", empty_line* + +empty_line = " "*, "\n" +``` diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..a10df0a --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,56 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: Numbers +--- + +# Numbers + +## Int + +```ebnf +Int = hexadecimal_number + | decimal_number + +hexadecimal_number = "0", ("x" | "X"), hex_digit+ +decimal_number = decimal_digit+ +``` + +```thp +12345 +01234 // This is a decimal number, not an octal number +0xff25 +0XFfaA +``` + +`TODO`: Implement octal `0o777` and binary `0b0110`. + +`TODO`: Allow underscores `_` between any number: `1_000_000`. + + +## Float + +```ebnf +Float = decimal_number, ".", decimal_number+, scientific_notation? + | decimal_number, scientific_notation + +scientific_notation = "e", ("+" | "-"), decimal_number +``` + +```thp +123.456 +123.456e+4 +123.456e-2 + +123e+10 +123e-3 +``` + + +All floating point numbers must start with at least 1 digit. + `.5` is not a valid floating point number. + + +`TODO`: Allow scientific notation to omit the `+`/`-`: `10e4`. + + + diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..becfa6b --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,30 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: Operator +--- + +# Operator + + +```ebnf +Operator = operator_char+ + +operator_char = "+" | "-" | "=" | "*" | "!" | "/" | "|" + | "@" | "#" | "$" | "~" | "%" | "&" | "?" + | "<" | ">" | "^" | "." 
| ":" +``` + +```thp ++ - / * % < > <= >= -> => +``` + +These are all the characters that can make an operator. + +The lexer doesn't know about any operator in particular. +In other languages something like `+-1` would be interpreted +as `+` `-` `1`. In THP, this is always `+-` `1`, and that +would throw an error because the operator `+-` doesn't exist. + +## Comma + +Comma is its own token: `,`. diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..e44bd10 --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,29 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: String +--- + +# String + +A string is single line, delimited by double quotes `"` only. + +```ebnf +String = double_quote, (escape_seq | string_char)*, double_quote + +double_quote = '"' +escape_seq = "\n" + | '\"' + | "\r" + | "\\" + | "\t" +string_char = any_unicode_except_newline_and_double_quote +``` + +```thp +"hello" +"" +"it's me" +"\"Mario\"" +``` + +`TODO`: String interpolation diff --git a/src/pages/spec/tokens/ b/src/pages/spec/tokens/ new file mode 100644 index 0000000..dc41312 --- /dev/null +++ b/src/pages/spec/tokens/ @@ -0,0 +1,38 @@ +--- +layout: ../../../layouts/SpecLayout.astro +title: Index +--- + +# Tokens index + +These are all the THP tokens: + +```rust +pub enum TokenType { + Identifier, + Datatype, + Int, + Float, + String, + Operator, + LeftParen, + RightParen, + LeftBracket, + RightBracket, + LeftBrace, + RightBrace, + NewLine, + Comment, + Comma, + INDENT, + DEDENT, + VAL, + VAR, + EOF, + FUN, +} +``` + +Every keyword has its own token. + +