diff --git a/CHANGELOG.md b/CHANGELOG.md
index d77793d..807ca9e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,16 +4,9 @@
 
 - Test correct operator precedence
 - Implement functions as first class citizens
-- Implement AST transformation before codegen:
-  Create a new AST to represent PHP source code
-  and a THP ast -> PHP ast process, so that the
-  codegen section can focus only in codegen, not in
-  translation of thp->php.
-- Ignore indentation where it doesn't matter
 - Parse __more__ binary operators
 - Store tokens for the semantic analysis phase, to have actual error reporting
 - Parse more complex bindings
-- Watch mode
 - Rework error messages
 - Parse other language constructions
 - Type checking
@@ -28,7 +21,16 @@
 - Not ignore comments & whitespace, for code formatting
 - Abstract the parsing of datatypes, such that in the future
   generics can be implemented in a single place
 - Include the original tokens in the AST
-- Finish the workflow for a hello world
+
+
+## v0.0.14
+
+- [ ] Define a minimal PHP AST
+- [ ] Transform THP AST into PHP AST
+- [ ] Implement minimal codegen for the PHP AST
+- [ ] Remove old codegen
+- [ ] Finish the workflow for a hello world
+
 
 ## v0.0.13
diff --git a/Cargo.lock b/Cargo.lock
index edca375..c90a38f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -12,19 +12,99 @@ dependencies = [
  "windows-sys",
 ]
 
+[[package]]
+name = "itoa"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
+
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
 
+[[package]]
+name = "proc-macro2"
+version = "1.0.86"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "ryu"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
+
+[[package]]
+name = "serde"
+version = "1.0.203"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.203"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.120"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
+dependencies = [
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.68"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
 [[package]]
 name = "thp"
 version = "0.0.13"
 dependencies = [
  "colored",
+ "serde",
+ "serde_json",
 ]
 
+[[package]]
+name = "unicode-ident"
+version = "1.0.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+
 [[package]]
 name = "windows-sys"
 version = "0.48.0"
diff --git a/Cargo.toml b/Cargo.toml
index cbbd2e8..7141c86 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,3 +8,5 @@ edition = "2021"
 
 [dependencies]
 colored = "2.1.0"
+serde = { version = "1.0.203", features = ["derive"] }
+serde_json = "1.0.120"
diff --git a/src/cli/CLI.md b/src/cli/CLI.md
index 1125c6e..8ed018a 100644
--- a/src/cli/CLI.md
+++ b/src/cli/CLI.md
@@ -12,6 +12,7 @@
 init        Initializes a new project in the current directory
 build, b    Builds the project
 fmt         Formats all files in the project
 watch, w    Starts compilation of the project in watch mode
+tokenize    Tokenize code from STDIN and output tokens as JSON to STDOUT
 
 help, h     Print this message & exit
diff --git a/src/cli/mod.rs b/src/cli/mod.rs
index f0271a4..eeb8baf 100644
--- a/src/cli/mod.rs
+++ b/src/cli/mod.rs
@@ -2,6 +2,7 @@ mod compile;
 mod empty;
 mod help;
 mod repl;
+mod tokenize;
 mod types;
 
 use types::CommandType;
@@ -23,6 +24,7 @@ Commands
     build       Builds the project
     fmt         Formats all files in the project
     watch, w    Starts compilation of the project in watch mode
+    tokenize    Tokenize code from STDIN and output tokens as JSON to STDOUT
 
     help, h     Print this message & exit
 
@@ -67,6 +69,7 @@ fn parse_args() -> Result<(CommandType, Vec<String>), String> {
         "init" => CommandType::Init,
         "build" => CommandType::Build,
         "fmt" => CommandType::Fmt,
+        "tokenize" => CommandType::Tokenize,
         "watch" | "w" => CommandType::Watch,
         "help" | "h" => CommandType::Help,
         _ => return Err(format!("Unknown command `{}`", command)),
diff --git a/src/cli/tokenize.rs b/src/cli/tokenize.rs
new file mode 100644
index 0000000..f5ad719
--- /dev/null
+++ b/src/cli/tokenize.rs
@@ -0,0 +1,28 @@
+use std::io::{self, BufRead};
+use crate::lexic::get_tokens;
+
+pub fn tokenize_command(_options: Vec<String>) -> Result<(), ()> {
+    // Get the input from stdin
+    let stdin = io::stdin();
+
+    let mut lines = Vec::new();
+    for line in stdin.lock().lines() {
+        match line {
+            Ok(line) => {
+                lines.push(line)
+            }
+            Err(reason) => {
+                eprintln!("Error reading input: {}", reason);
+                return Err(())
+            }
+        }
+    }
+
+    let input_code = lines.join("\n");
+    let tokens = get_tokens(&input_code);
+
+    let json = serde_json::to_string(&tokens).unwrap();
+    println!("{}", json);
+
+    Ok(())
+}
diff --git a/src/cli/types.rs b/src/cli/types.rs
index be8bd9b..31be60a 100644
--- a/src/cli/types.rs
+++ b/src/cli/types.rs
@@ -8,6 +8,7 @@ pub enum CommandType {
     Fmt,
     Watch,
     Help,
+    Tokenize,
     None,
 }
 
@@ -18,6 +19,7 @@ impl CommandType {
             CommandType::Compile => super::compile::compile_command(options),
             CommandType::Repl => super::repl::repl_command(options),
             CommandType::None => super::empty::empty_command(options),
+            CommandType::Tokenize => super::tokenize::tokenize_command(options),
             _ => {
                 eprintln!("Not implemented yet! {:?} {:?}", self, options);
                 Err(())
diff --git a/src/error_handling/mod.rs b/src/error_handling/mod.rs
index 5a8a86a..928a5b6 100644
--- a/src/error_handling/mod.rs
+++ b/src/error_handling/mod.rs
@@ -1,3 +1,5 @@
+use serde::Serialize;
+
 use self::semantic_error::SemanticError;
 
 mod lex_error;
@@ -9,20 +11,20 @@ pub trait PrintableError {
     fn get_error_str(&self, chars: &Vec<char>) -> String;
 }
 
-#[derive(Debug)]
+#[derive(Serialize, Debug)]
 pub enum MistiError {
     Lex(LexError),
     Syntax(SyntaxError),
     Semantic(SemanticError),
 }
 
-#[derive(Debug)]
+#[derive(Serialize, Debug)]
 pub struct LexError {
     pub position: usize,
     pub reason: String,
 }
 
-#[derive(Debug)]
+#[derive(Serialize, Debug)]
 pub struct SyntaxError {
     pub error_start: usize,
     pub error_end: usize,
diff --git a/src/error_handling/semantic_error.rs b/src/error_handling/semantic_error.rs
index 1084f75..7f4f0fc 100644
--- a/src/error_handling/semantic_error.rs
+++ b/src/error_handling/semantic_error.rs
@@ -1,7 +1,9 @@
+use serde::Serialize;
+
 use super::utils::{get_line, get_line_number};
 use super::PrintableError;
 
-#[derive(Debug)]
+#[derive(Serialize, Debug)]
 pub struct SemanticError {
     pub error_start: usize,
     pub error_end: usize,
diff --git a/src/lexic/token.rs b/src/lexic/token.rs
index a8975e0..57601ca 100755
--- a/src/lexic/token.rs
+++ b/src/lexic/token.rs
@@ -1,4 +1,6 @@
-#[derive(PartialEq, Debug, Clone)]
+use serde::Serialize;
+
+#[derive(Serialize, PartialEq, Debug, Clone)]
 pub enum TokenType {
     Identifier,
     Datatype,
@@ -23,7 +25,7 @@ pub enum TokenType {
     FUN,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Serialize, Debug, Clone, PartialEq)]
 pub struct Token {
     pub token_type: TokenType,
     // The token as a raw string
diff --git a/src/php_ast/mod.rs b/src/php_ast/mod.rs
index a9c8e90..2f9a696 100644
--- a/src/php_ast/mod.rs
+++ b/src/php_ast/mod.rs
@@ -1,3 +1,3 @@
-// Follows https://phplang.org/spec/09-lexical-structure.html
+// Follows https://phplang.org/spec/19-grammar.html#syntactic-grammar
 
 struct PhpAst {}
diff --git a/src/syntax/parsers/expression/mod.rs b/src/syntax/parsers/expression/mod.rs
index d3b1923..e961fb1 100644
--- a/src/syntax/parsers/expression/mod.rs
+++ b/src/syntax/parsers/expression/mod.rs
@@ -14,7 +14,6 @@ impl<'a> Parseable<'a> for Expression<'a> {
     type Item = Expression<'a>;
 
     fn try_parse(tokens: &'a Vec<Token>, current_pos: usize) -> ParsingResult<'a, Self::Item> {
-        // TODO: This must be newline/indentation aware
         equality::try_parse(tokens, current_pos)
     }
 }
diff --git a/src/syntax/parsers/module.rs b/src/syntax/parsers/module.rs
index 6c7d291..574b43c 100644
--- a/src/syntax/parsers/module.rs
+++ b/src/syntax/parsers/module.rs
@@ -42,6 +42,7 @@ impl<'a> Parseable<'a> for ModuleAST<'a> {
             Ok((prod, next_pos)) => {
                 productions.push(ModuleMembers::Expr(prod));
                 current_pos = next_pos;
+                continue;
             }
             Err(ParsingError::Err(error)) => {
                 // TODO: Better error handling, write a better error message
@@ -92,4 +93,13 @@ mod test {
             }
         }
     }
 }
+
+#[test]
+fn should_parse_expression() {
+    let tokens = get_tokens(&String::from("1")).unwrap();
+
+    let (module, next) = ModuleAST::try_parse(&tokens, 0).unwrap();
+
+    assert_eq!(next, 1);
+}