feat: Add interface to tokenize code from STDIN and output JSON

Branch: master
Author: Araozu
Date: 2024-07-05 17:49:19 -05:00
Parent: 3f892e91c2
Commit: 5102d25676

13 changed files with 149 additions and 16 deletions

View File

@@ -4,16 +4,9 @@
- Test correct operator precedence
- Implement functions as first class citizens
- Implement AST transformation before codegen:
  Create a new AST to represent PHP source code
  and a THP ast -> PHP ast process, so that the
  codegen section can focus only in codegen, not in
  translation of thp->php.
- Ignore indentation where it doesn't matter
- Parse __more__ binary operators
- Store tokens for the semantic analysis phase, to have actual error reporting
- Parse more complex bindings
- Watch mode
- Rework error messages
- Parse other language constructions
- Type checking
@@ -28,7 +21,16 @@
- Not ignore comments & whitespace, for code formatting
- Abstract the parsing of datatypes, such that in the future generics can be implemented in a single place
- Include the original tokens in the AST
- Finish the workflow for a hello world
## v0.0.14
- [ ] Define a minimal PHP AST
- [ ] Transform THP AST into PHP AST
- [ ] Implement minimal codegen for the PHP AST
- [ ] Remove old codegen
- [ ] Finish the workflow for a hello world
## v0.0.13

Cargo.lock generated
View File

@@ -12,19 +12,99 @@ dependencies = [
 "windows-sys",
]

[[package]]
name = "itoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"

[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"

[[package]]
name = "proc-macro2"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
dependencies = [
 "unicode-ident",
]

[[package]]
name = "quote"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "ryu"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"

[[package]]
name = "serde"
version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094"
dependencies = [
 "serde_derive",
]

[[package]]
name = "serde_derive"
version = "1.0.203"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "serde_json"
version = "1.0.120"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5"
dependencies = [
 "itoa",
 "ryu",
 "serde",
]

[[package]]
name = "syn"
version = "2.0.68"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
]

[[package]]
name = "thp"
version = "0.0.13"
dependencies = [
 "colored",
 "serde",
 "serde_json",
]

[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"

[[package]]
name = "windows-sys"
version = "0.48.0"

View File

@@ -8,3 +8,5 @@ edition = "2021"
[dependencies]
colored = "2.1.0"
serde = { version = "1.0.203", features = ["derive"] }
serde_json = "1.0.120"
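
The `derive` feature is what enables the `#[derive(Serialize)]` attributes added across this commit; without it, serde ships only the traits and every impl would be hand-written. A minimal sketch of what the feature buys (the struct here is hypothetical, not part of this commit):

```rust
use serde::Serialize;

// serde's `derive` feature generates the Serialize impl from an attribute.
#[derive(Serialize)]
struct Example {
    name: String,
}

fn main() {
    let e = Example { name: String::from("thp") };
    println!("{}", serde_json::to_string(&e).unwrap()); // {"name":"thp"}
}
```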

View File

@@ -12,6 +12,7 @@ init Initializes a new project in the current directory
build, b Builds the project
fmt Formats all files in the project
watch, w Starts compilation of the project in watch mode
tokenize Tokenize code from STDIN and output tokens as JSON to STDOUT
help, h Print this message & exit

View File

@@ -2,6 +2,7 @@ mod compile;
mod empty;
mod help;
mod repl;
mod tokenize;
mod types;

use types::CommandType;

@@ -23,6 +24,7 @@ Commands
build Builds the project
fmt Formats all files in the project
watch, w Starts compilation of the project in watch mode
tokenize Tokenize code from STDIN and output tokens as JSON to STDOUT
help, h Print this message & exit

@@ -67,6 +69,7 @@ fn parse_args() -> Result<(CommandType, Vec<String>), String> {
        "init" => CommandType::Init,
        "build" => CommandType::Build,
        "fmt" => CommandType::Fmt,
        "tokenize" => CommandType::Tokenize,
        "watch" | "w" => CommandType::Watch,
        "help" | "h" => CommandType::Help,
        _ => return Err(format!("Unknown command `{}`", command)),

src/cli/tokenize.rs Normal file
View File

@@ -0,0 +1,28 @@
use std::io::{self, BufRead};

use crate::lexic::get_tokens;

pub fn tokenize_command(_options: Vec<String>) -> Result<(), ()> {
    // Read the input from STDIN, line by line, until EOF
    let stdin = io::stdin();
    let mut lines = Vec::new();
    for line in stdin.lock().lines() {
        match line {
            Ok(line) => lines.push(line),
            Err(reason) => {
                eprintln!("Error reading input: {}", reason);
                return Err(());
            }
        }
    }

    // Tokenize the whole input and print the result as JSON
    let input_code = lines.join("\n");
    let tokens = get_tokens(&input_code);
    let json = serde_json::to_string(&tokens).unwrap();
    println!("{}", json);

    Ok(())
}
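
Since `get_tokens` returns a `Result` (see the `.unwrap()` on it in the parser test at the end of this diff) and both `Token` and `MistiError` now derive `Serialize`, the printed JSON carries either the token list or the error. As a design note, the manual line loop could be replaced with `std::io::read_to_string` (stable since Rust 1.65); a sketch of that alternative follows, with illustrative error mapping — unlike joining lines with `\n`, it preserves carriage returns and the trailing newline:

```rust
use std::io;

use crate::lexic::get_tokens;

// Alternative sketch, not the committed code: read all of STDIN at once.
pub fn tokenize_command(_options: Vec<String>) -> Result<(), ()> {
    let input_code = io::read_to_string(io::stdin()).map_err(|reason| {
        eprintln!("Error reading input: {}", reason);
    })?;

    let tokens = get_tokens(&input_code);
    let json = serde_json::to_string(&tokens).map_err(|_| ())?;
    println!("{}", json);
    Ok(())
}
```

Usage would then look like `echo '1 + 2' | thp tokenize` (the input snippet is an arbitrary example), with the JSON written to STDOUT.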

View File

@@ -8,6 +8,7 @@ pub enum CommandType {
    Fmt,
    Watch,
    Help,
    Tokenize,
    None,
}

@@ -18,6 +19,7 @@ impl CommandType {
        CommandType::Compile => super::compile::compile_command(options),
        CommandType::Repl => super::repl::repl_command(options),
        CommandType::None => super::empty::empty_command(options),
        CommandType::Tokenize => super::tokenize::tokenize_command(options),
        _ => {
            eprintln!("Not implemented yet! {:?} {:?}", self, options);
            Err(())

View File

@@ -1,3 +1,5 @@
use serde::Serialize;

use self::semantic_error::SemanticError;

mod lex_error;

@@ -9,20 +11,20 @@ pub trait PrintableError {
    fn get_error_str(&self, chars: &Vec<char>) -> String;
}

#[derive(Debug)]
#[derive(Serialize, Debug)]
pub enum MistiError {
    Lex(LexError),
    Syntax(SyntaxError),
    Semantic(SemanticError),
}

#[derive(Debug)]
#[derive(Serialize, Debug)]
pub struct LexError {
    pub position: usize,
    pub reason: String,
}

#[derive(Debug)]
#[derive(Serialize, Debug)]
pub struct SyntaxError {
    pub error_start: usize,
    pub error_end: usize,
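
With `Serialize` derived on the error types, a failed tokenization can be reported over the same JSON channel as a successful one. serde's default, externally tagged enum representation uses the variant name as the JSON key; a sketch with made-up values:

```rust
// Sketch: JSON shape of a serialized lex error under serde's default
// externally tagged enum representation.
fn lex_error_json_shape() {
    let err = MistiError::Lex(LexError {
        position: 0,
        reason: String::from("Illegal character"),
    });
    // Prints: {"Lex":{"position":0,"reason":"Illegal character"}}
    println!("{}", serde_json::to_string(&err).unwrap());
}
```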

View File

@@ -1,7 +1,9 @@
use serde::Serialize;

use super::utils::{get_line, get_line_number};
use super::PrintableError;

#[derive(Debug)]
#[derive(Serialize, Debug)]
pub struct SemanticError {
    pub error_start: usize,
    pub error_end: usize,

View File

@@ -1,4 +1,6 @@
#[derive(PartialEq, Debug, Clone)]
use serde::Serialize;

#[derive(Serialize, PartialEq, Debug, Clone)]
pub enum TokenType {
    Identifier,
    Datatype,

@@ -23,7 +25,7 @@ pub enum TokenType {
    FUN,
}

#[derive(Debug, Clone, PartialEq)]
#[derive(Serialize, Debug, Clone, PartialEq)]
pub struct Token {
    pub token_type: TokenType,
    // The token as a raw string
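
Unit variants of `TokenType` serialize as plain JSON strings under serde's defaults, so consumers of the `tokenize` command can match on the variant name directly. A quick sketch:

```rust
// Sketch: serde encodes unit enum variants as their name.
fn token_type_json_shape() {
    let json = serde_json::to_string(&TokenType::Identifier).unwrap();
    assert_eq!(json, "\"Identifier\"");
}
```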

View File

@@ -1,3 +1,3 @@
// Follows https://phplang.org/spec/09-lexical-structure.html
// Follows https://phplang.org/spec/19-grammar.html#syntactic-grammar
struct PhpAst {}
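
`PhpAst` remains an empty placeholder; the v0.0.14 changelog entries above call for defining a minimal PHP AST and transforming the THP AST into it. One possible shape, purely illustrative (none of these names exist in this commit), sized for the hello-world goal:

```rust
// Hypothetical sketch of a minimal PHP AST: just enough to represent
// `echo "Hello, world!";`. All names here are illustrative.
pub struct PhpAst {
    pub statements: Vec<PhpStatement>,
}

pub enum PhpStatement {
    Echo(PhpExpression),
}

pub enum PhpExpression {
    String(String),
}
```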

View File

@@ -14,7 +14,6 @@ impl<'a> Parseable<'a> for Expression<'a> {
    type Item = Expression<'a>;

    fn try_parse(tokens: &'a Vec<Token>, current_pos: usize) -> ParsingResult<'a, Self::Item> {
        // TODO: This must be newline/indentation aware
        equality::try_parse(tokens, current_pos)
    }
}

View File

@@ -42,6 +42,7 @@ impl<'a> Parseable<'a> for ModuleAST<'a> {
            Ok((prod, next_pos)) => {
                productions.push(ModuleMembers::Expr(prod));
                current_pos = next_pos;
                continue;
            }
            Err(ParsingError::Err(error)) => {
                // TODO: Better error handling, write a better error message

@@ -92,4 +93,13 @@ mod test {
            }
        }
    }

    #[test]
    fn should_parse_expression() {
        let tokens = get_tokens(&String::from("1")).unwrap();
        // The module itself isn't asserted on yet; bind it to `_module`
        // to avoid an unused-variable warning.
        let (_module, next) = ModuleAST::try_parse(&tokens, 0).unwrap();
        assert_eq!(next, 1);
    }
}