diff --git a/CHANGELOG.md b/CHANGELOG.md index 70399d2..ca44cba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## TODO +- Test correct operator precedence - Implement functions as first class citizens - Implement AST transformation before codegen: Create a new AST to represent PHP source code diff --git a/src/syntax/parsers/expression/equality.rs b/src/syntax/parsers/expression/equality.rs index c0672c6..36a0767 100644 --- a/src/syntax/parsers/expression/equality.rs +++ b/src/syntax/parsers/expression/equality.rs @@ -9,7 +9,6 @@ use crate::{ /// equality = comparison, (("==" | "!="), comparison )*; /// ``` pub fn try_parse(tokens: &Vec, pos: usize) -> ParsingResult { - // TODO: This must be newline/indentation aware let (comparison, next_pos) = match super::comparison::try_parse(tokens, pos) { Ok((expr, next_pos)) => (expr, next_pos), _ => return Err(ParsingError::Unmatched), @@ -29,6 +28,12 @@ fn parse_many<'a>( let mut indented = false; let result = match tokens.get(pos) { Some(token) if token.value == "==" || token.value == "!=" => { + // here handle indentation, again, for: + // ``` + // value + // == value + // ``` + match super::comparison::try_parse(tokens, pos + 1) { Ok((expr, next_pos)) => { let expr = Expression::BinaryOperator( @@ -175,7 +180,6 @@ mod tests { let tokens = get_tokens(&String::from("a\n == b\n == c")).unwrap(); let (result, next) = try_parse(&tokens, 0).unwrap(); - assert_eq!(tokens[8].token_type, TokenType::DEDENT); assert_eq!(next, 9); match result { @@ -185,4 +189,21 @@ mod tests { _ => panic!("Expected a binary operator"), } } + + /* + #[test] + fn should_parse_indented_5() { + let tokens = get_tokens(&String::from("a ==\n b")).unwrap(); + let (result, next) = try_parse(&tokens, 0).unwrap(); + + assert_eq!(next, 6); + + match result { + Expression::BinaryOperator(_, _, op) => { + assert_eq!(op, "==") + } + _ => panic!("Expected a binary operator"), + } + } + */ } diff --git a/src/syntax/parsers/expression/factor.rs b/src/syntax/parsers/expression/factor.rs index a82067d..ac0b3b2 100644 --- a/src/syntax/parsers/expression/factor.rs +++ b/src/syntax/parsers/expression/factor.rs @@ -1,6 +1,6 @@ use crate::{ - lexic::token::Token, - syntax::{ast::Expression, ParsingError, ParsingResult}, + lexic::token::{Token, TokenType}, + syntax::{ast::Expression, utils::Tokenizer, ParsingError, ParsingResult}, }; /// Parses a factor expression. @@ -14,19 +14,62 @@ pub fn try_parse(tokens: &Vec, pos: usize) -> ParsingResult { _ => return Err(ParsingError::Unmatched), }; - parse_many(tokens, next_pos, unary) + parse_many(tokens, next_pos, unary, 0) } fn parse_many<'a>( tokens: &'a Vec, pos: usize, prev_expr: Expression<'a>, + indentation_level: u32, ) -> ParsingResult<'a, Expression<'a>> { // (("/" | "*"), unary)* - match tokens.get(pos) { + let mut indent_count: u32 = 0; + + // Handle possible indentation before binary operator + let mut next_pos = pos; + match (tokens.get(next_pos), tokens.get(next_pos + 1)) { + // New indentation level + (Some(t1), Some(t2)) + if t1.token_type == TokenType::NewLine && t2.token_type == TokenType::INDENT => + { + // set indentation + next_pos += 2; + indent_count += 1; + } + // we are indented, ignore newlines + (Some(t), _) if t.token_type == TokenType::NewLine && indentation_level > 0 => { + next_pos += 1; + } + // let other handlers handle this + _ => {} + }; + + let result = match tokens.get(next_pos) { Some(token) if token.value == "/" || token.value == "*" => { - match super::unary::try_parse(tokens, pos + 1) { + next_pos += 1; + + // Handle possible indentation after binary operator + match (tokens.get(next_pos), tokens.get(next_pos + 1)) { + // New indentation level + (Some(t1), Some(t2)) + if t1.token_type == TokenType::NewLine + && t2.token_type == TokenType::INDENT => + { + // set indentation + next_pos += 2; + indent_count += 1; + } + // we are indented, ignore newlines + (Some(t), _) if t.token_type == TokenType::NewLine && indentation_level > 0 => { + next_pos += 1; + } + // let other handlers handle this + _ => {} + }; + + match super::unary::try_parse(tokens, next_pos) { Ok((expr, next_pos)) => { let expr = Expression::BinaryOperator( Box::new(prev_expr), @@ -34,13 +77,32 @@ fn parse_many<'a>( &token.value, ); - parse_many(tokens, next_pos, expr) + parse_many(tokens, next_pos, expr, indentation_level + indent_count) } - _ => Err(ParsingError::Unmatched), + _ => return Err(ParsingError::Unmatched), } } - _ => Ok((prev_expr, pos)), + _ => return Ok((prev_expr, pos)), + }; + + let (new_expr, mut next_pos) = match result { + Ok((e, n)) => (e, n), + _ => return result, + }; + + for _ in 0..indent_count { + // Expect a DEDENT for each indentation matched + match tokens.get(next_pos) { + // continue + Some(t) if t.token_type == TokenType::DEDENT => {} + // This should be unreachable, as the lexer always emits a DEDENT for each INDENT + _ => unreachable!("Illegal parser state: Expected DEDENT (count: {})", indent_count), + }; + + next_pos += 1; } + + Ok((new_expr, next_pos)) } #[cfg(test)] @@ -83,4 +145,98 @@ mod tests { _ => panic!("Expected an Unmatched error"), } } + + #[test] + fn should_parse_indented_1() { + let tokens = get_tokens(&String::from("a\n * b")).unwrap(); + let (result, next) = try_parse(&tokens, 0).unwrap(); + + assert_eq!(tokens[5].token_type, TokenType::DEDENT); + assert_eq!(next, 6); + + match result { + Expression::BinaryOperator(_, _, op) => { + assert_eq!(op, "*") + } + _ => panic!("Expected a binary operator"), + } + } + + #[test] + fn should_parse_indented_2() { + let tokens = get_tokens(&String::from("a\n * b\n * c")).unwrap(); + let (result, next) = try_parse(&tokens, 0).unwrap(); + + assert_eq!(tokens[9].token_type, TokenType::DEDENT); + assert_eq!(tokens[10].token_type, TokenType::DEDENT); + assert_eq!(next, 11); + + match result { + Expression::BinaryOperator(_, _, op) => { + assert_eq!(op, "*") + } + _ => panic!("Expected a binary operator"), + } + } + + #[test] + fn should_parse_indented_3() { + let tokens = get_tokens(&String::from("a\n * b * c")).unwrap(); + let (result, next) = try_parse(&tokens, 0).unwrap(); + + assert_eq!(tokens[7].token_type, TokenType::DEDENT); + assert_eq!(next, 8); + + match result { + Expression::BinaryOperator(_, _, op) => { + assert_eq!(op, "*") + } + _ => panic!("Expected a binary operator"), + } + } + + #[test] + fn should_parse_indented_4() { + let tokens = get_tokens(&String::from("a\n * b\n * c")).unwrap(); + let (result, next) = try_parse(&tokens, 0).unwrap(); + + assert_eq!(next, 9); + + match result { + Expression::BinaryOperator(_, _, op) => { + assert_eq!(op, "*") + } + _ => panic!("Expected a binary operator"), + } + } + + #[test] + fn should_parse_indented_5() { + let tokens = get_tokens(&String::from("a /\n b")).unwrap(); + let (result, next) = try_parse(&tokens, 0).unwrap(); + + assert_eq!(next, 6); + + match result { + Expression::BinaryOperator(_, _, op) => { + assert_eq!(op, "/") + } + _ => panic!("Expected a binary operator"), + } + } + + #[test] + fn should_parse_indented_6() { + let tokens = get_tokens(&String::from("a\n /\n b")).unwrap(); + let (result, next) = try_parse(&tokens, 0).unwrap(); + + assert_eq!(next, 9); + + match result { + Expression::BinaryOperator(_, _, op) => { + assert_eq!(op, "/") + } + _ => panic!("Expected a binary operator"), + } + } } diff --git a/src/syntax/parsers/expression/mod.rs b/src/syntax/parsers/expression/mod.rs index 1f48ad9..fccf8b8 100644 --- a/src/syntax/parsers/expression/mod.rs +++ b/src/syntax/parsers/expression/mod.rs @@ -29,7 +29,7 @@ mod tests { let tokens = get_tokens(&String::from("a\n == b")).unwrap(); let (expr, _) = Expression::try_parse(&tokens, 0).unwrap(); match expr { - Expression::BinaryOperator(_e1, _e2, op) => {} + Expression::BinaryOperator(_e1, _e2, _op) => {} _ => panic!("Expected a binary operation"), } } diff --git a/src/syntax/utils.rs b/src/syntax/utils.rs index b3f627d..9e72027 100644 --- a/src/syntax/utils.rs +++ b/src/syntax/utils.rs @@ -4,6 +4,8 @@ use super::{ParsingError, ParsingResult}; pub trait Tokenizer { fn get_significant<'a>(&'a self, index: usize) -> Option<(&'a Token, usize)>; + + fn get_indented<'a>(&'a self, index: usize, indented: bool) -> (Option<&'a Token>, usize); } impl Tokenizer for Vec { @@ -28,6 +30,31 @@ impl Tokenizer for Vec { } } } + + fn get_indented<'a>(&'a self, index: usize, indented: bool) -> (Option<&'a Token>, usize) { + if !indented { + return (self.get(index), index + 1); + } + + let mut current_pos = index; + + // Ignore all whitespace and newlines + loop { + match self.get(current_pos) { + Some(token) => { + if token.token_type == TokenType::INDENT + || token.token_type == TokenType::DEDENT + || token.token_type == TokenType::NewLine + { + current_pos += 1; + } else { + return (Some(token), current_pos); + } + } + None => return (None, index + 1), + } + } + } } /// Expects the token at `pos` to be an operator of value `operator`. Doesn't ignore whitespace or newlines