From 78d01a8fc8856d4c4719279992a92f2b57d76925 Mon Sep 17 00:00:00 2001 From: Araozu Date: Sat, 15 Jun 2024 21:02:45 -0500 Subject: [PATCH] feat: indentation for bypassing statement termination --- CHANGELOG.md | 3 +- src/syntax/parsers/expression/comparison.rs | 62 ++++-------- src/syntax/parsers/expression/equality.rs | 59 ++++------- src/syntax/parsers/expression/factor.rs | 45 +++------ src/syntax/parsers/expression/term.rs | 7 +- src/syntax/parsers/expression/utils.rs | 102 ++------------------ src/syntax/utils.rs | 28 ------ 7 files changed, 66 insertions(+), 240 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d498258..8e95878 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and a THP ast -> PHP ast process, so that the codegen section can focus only in codegen, not in translation of thp->php. +- Ignore indentation where it doesn't matter - Parse __more__ binary operators - Store tokens for the semantic analysis phase, to have actual error reporting - Parse more complex bindings @@ -31,7 +32,7 @@ - [x] Begin work on a formal grammar - [x] Simplify/rewrite AST -- [ ] Properly parse expression indentation/dedentation +- [x] Properly parse expression indentation/dedentation - [x] Define the top level constructs - [ ] Include the original tokens in the AST - [ ] Finish the workflow for a hello world diff --git a/src/syntax/parsers/expression/comparison.rs b/src/syntax/parsers/expression/comparison.rs index a381782..90a40eb 100644 --- a/src/syntax/parsers/expression/comparison.rs +++ b/src/syntax/parsers/expression/comparison.rs @@ -1,10 +1,10 @@ -use crate::lexic::token::TokenType; use crate::{ - handle_dedentation, handle_indentation, lexic::token::Token, syntax::{ast::Expression, ParsingError, ParsingResult}, }; +use super::utils::try_binary_op; + /// Parses a factor expression. 
/// /// ```ebnf @@ -26,55 +26,31 @@ fn parse_many<'a>( indentation_level: u32, ) -> ParsingResult<'a, Expression<'a>> { // comparison = term, ((">" | ">=" | "<" | "<="), term)*; + try_binary_op( + tokens, + pos, + prev_expr, + vec![">", ">=", "<", "<="], + indentation_level, + |tokens, next_pos, prev_expr, token, indent_count: u32| match super::term::try_parse( + tokens, next_pos, + ) { + Ok((expr, next_pos)) => { + let expr = + Expression::BinaryOperator(Box::new(prev_expr), Box::new(expr), &token.value); - let mut indent_count: u32 = 0; - let mut next_pos = pos; - - // Handle possible indentation before binary operator - handle_indentation!(tokens, next_pos, indent_count, indentation_level); - - let result = match tokens.get(next_pos) { - Some(token) - if token.value == "<" - || token.value == "<=" - || token.value == ">" - || token.value == ">=" => - { - next_pos += 1; - - // Handle possible indentation after binary operator - handle_indentation!(tokens, next_pos, indent_count, indentation_level); - - match super::term::try_parse(tokens, next_pos) { - Ok((expr, next_pos)) => { - let expr = Expression::BinaryOperator( - Box::new(prev_expr), - Box::new(expr), - &token.value, - ); - - parse_many(tokens, next_pos, expr, indentation_level + indent_count) - } - _ => return Err(ParsingError::Unmatched), + parse_many(tokens, next_pos, expr, indentation_level + indent_count) } - } - _ => return Ok((prev_expr, pos)), - }; - - let (new_expr, mut next_pos) = match result { - Ok((e, n)) => (e, n), - _ => return result, - }; - - handle_dedentation!(tokens, next_pos, indent_count); - - Ok((new_expr, next_pos)) + _ => return Err(ParsingError::Unmatched), + }, + ) } #[cfg(test)] mod tests { use super::*; use crate::lexic::get_tokens; + use crate::lexic::token::TokenType; #[test] fn should_parse_comparison() { diff --git a/src/syntax/parsers/expression/equality.rs b/src/syntax/parsers/expression/equality.rs index e16bf48..22622d3 100644 --- 
a/src/syntax/parsers/expression/equality.rs +++ b/src/syntax/parsers/expression/equality.rs @@ -1,9 +1,10 @@ use crate::{ - handle_dedentation, handle_indentation, - lexic::token::{Token, TokenType}, + lexic::token::Token, syntax::{ast::Expression, ParsingError, ParsingResult}, }; +use super::utils::try_binary_op; + /// Parses a factor expression. /// /// ```ebnf @@ -25,50 +26,30 @@ fn parse_many<'a>( indentation_level: u32, ) -> ParsingResult<'a, Expression<'a>> { // equality = comparison, (("==" | "!="), comparison )*; + try_binary_op( + tokens, + pos, + prev_expr, + vec!["==", "!="], + indentation_level, + |tokens, next_pos, prev_expr, token, indent_count: u32| match super::comparison::try_parse( + tokens, next_pos, + ) { + Ok((expr, next_pos)) => { + let expr = + Expression::BinaryOperator(Box::new(prev_expr), Box::new(expr), &token.value); - let mut indent_count: u32 = 0; - let mut next_pos = pos; - - // Handle possible indentation before binary operator - handle_indentation!(tokens, next_pos, indent_count, indentation_level); - - let result = match tokens.get(next_pos) { - Some(token) if token.value == "==" || token.value == "!=" => { - next_pos += 1; - - // Handle possible indentation after binary operator - handle_indentation!(tokens, next_pos, indent_count, indentation_level); - - match super::comparison::try_parse(tokens, next_pos) { - Ok((expr, next_pos)) => { - let expr = Expression::BinaryOperator( - Box::new(prev_expr), - Box::new(expr), - &token.value, - ); - - parse_many(tokens, next_pos, expr, indentation_level + indent_count) - } - _ => return Err(ParsingError::Unmatched), + parse_many(tokens, next_pos, expr, indentation_level + indent_count) } - } - _ => return Ok((prev_expr, pos)), - }; - - let (new_expr, mut next_pos) = match result { - Ok((e, n)) => (e, n), - _ => return result, - }; - - handle_dedentation!(tokens, next_pos, indent_count); - - Ok((new_expr, next_pos)) + _ => return Err(ParsingError::Unmatched), + }, + ) } #[cfg(test)] mod 
tests { use super::*; - use crate::lexic::get_tokens; + use crate::lexic::{get_tokens, token::TokenType}; #[test] fn should_parse_comparison() { diff --git a/src/syntax/parsers/expression/factor.rs b/src/syntax/parsers/expression/factor.rs index 1d452c4..a426d5f 100644 --- a/src/syntax/parsers/expression/factor.rs +++ b/src/syntax/parsers/expression/factor.rs @@ -1,7 +1,8 @@ use crate::{ - handle_dedentation, handle_indentation, - lexic::token::{Token, TokenType}, - syntax::{ast::Expression, ParsingError, ParsingResult}, + lexic::token::Token, + syntax::{ + ast::Expression, parsers::expression::utils::try_binary_op, ParsingError, ParsingResult, + }, }; /// Parses a factor expression. @@ -25,20 +26,14 @@ fn parse_many<'a>( indentation_level: u32, ) -> ParsingResult<'a, Expression<'a>> { // (("/" | "*"), unary)* - - let mut indent_count: u32 = 0; - let mut next_pos = pos; - - // Handle possible indentation before binary operator - handle_indentation!(tokens, next_pos, indent_count, indentation_level); - - let result = match tokens.get(next_pos) { - Some(token) if token.value == "/" || token.value == "*" => { - next_pos += 1; - - // Handle possible indentation after binary operator - handle_indentation!(tokens, next_pos, indent_count, indentation_level); - + try_binary_op( + tokens, + pos, + prev_expr, + vec!["/", "*"], + indentation_level, + |tokens, next_pos, prev_expr, token, indent_count: u32| { + // match next match super::unary::try_parse(tokens, next_pos) { Ok((expr, next_pos)) => { let expr = Expression::BinaryOperator( @@ -51,24 +46,14 @@ fn parse_many<'a>( } _ => return Err(ParsingError::Unmatched), } - } - _ => return Ok((prev_expr, pos)), - }; - - let (new_expr, mut next_pos) = match result { - Ok((e, n)) => (e, n), - _ => return result, - }; - - handle_dedentation!(tokens, next_pos, indent_count); - - Ok((new_expr, next_pos)) + }, + ) } #[cfg(test)] mod tests { use super::*; - use crate::lexic::get_tokens; + use crate::lexic::{get_tokens, 
token::TokenType}; #[test] fn should_parse_comparison() { diff --git a/src/syntax/parsers/expression/term.rs b/src/syntax/parsers/expression/term.rs index 869123f..e9391ac 100644 --- a/src/syntax/parsers/expression/term.rs +++ b/src/syntax/parsers/expression/term.rs @@ -1,7 +1,5 @@ -use crate::lexic::token::TokenType; -use crate::syntax::parsers::expression::utils::{try_binary_op, try_binary_op_2}; +use crate::syntax::parsers::expression::utils::try_binary_op; use crate::{ - handle_dedentation, handle_indentation, lexic::token::Token, syntax::{ast::Expression, ParsingError, ParsingResult}, }; @@ -28,7 +26,7 @@ fn parse_many<'a>( ) -> ParsingResult<'a, Expression<'a>> { // term = factor, (("-" | "+"), factor)*; - try_binary_op_2( + try_binary_op( tokens, pos, prev_expr, @@ -56,6 +54,7 @@ fn parse_many<'a>( mod tests { use super::*; use crate::lexic::get_tokens; + use crate::lexic::token::TokenType; #[test] fn should_parse_comparison() { diff --git a/src/syntax/parsers/expression/utils.rs b/src/syntax/parsers/expression/utils.rs index b25c1b4..3dc9483 100644 --- a/src/syntax/parsers/expression/utils.rs +++ b/src/syntax/parsers/expression/utils.rs @@ -3,59 +3,15 @@ use crate::lexic::token::TokenType::{NewLine, DEDENT, INDENT}; use crate::syntax::ast::Expression; use crate::syntax::parseable::ParsingResult; -/// Attempts to parse a binary operator and handles indentation +/// Parses a binary operator, handles indentation and runs a function on it. /// -/// Binary operators may be in a new line as long as they are indented. -/// The new line may be before or after the operator. +/// First, handles indentation before the binary operator. Then, tries to +/// parse the binary operator. Then, handles indentation after the binary +/// operator. /// -/// Once an operator is indented, all following operators completely disregard newline/indentation -/// until a matching dedent is found. 
-pub fn try_binary_op<'a>( - tokens: &'a Vec<Token>, - pos: usize, - operators: Vec<&str>, - indentation_level: u32, -) -> Option<(&'a Token, usize, u32)> { - let mut indent_count = 0; - - // handle possible opening indentation - let pos = match (tokens.get(pos), tokens.get(pos + 1)) { - // New indentation level - (Some(t1), Some(t2)) if t1.token_type == NewLine && t2.token_type == INDENT => { - indent_count += 1; - pos + 2 - } - // when indented, ignore newlines - (Some(t), _) if t.token_type == NewLine && indentation_level > 0 => pos + 1, - // let other handlers handle this - _ => pos, - }; - - // try to parse binary operator - let (matched_token, pos) = match tokens.get(pos) { - Some(token) if operators.contains(&token.value.as_str()) => (token, pos + 1), - _ => return None, - }; - - // handle possible closing indentation - let pos = match (tokens.get(pos), tokens.get(pos + 1)) { - // New indentation level - (Some(t1), Some(t2)) if t1.token_type == NewLine && t2.token_type == INDENT => { - indent_count += 1; - pos + 2 - } - // when indented, ignore newlines - (Some(t), _) if t.token_type == NewLine && indentation_level > 0 => pos + 1, - // let other handlers handle this - _ => pos, - }; - - // return the matched token, next position and new indentation level - Some((matched_token, pos, indent_count)) -} - -// TODO: document -pub fn try_binary_op_2<'a, F>( +/// After this, runs the function `fun`. Finishes by handling dedentation +/// parsed in the previous phase. +pub fn try_binary_op<'a, F>( tokens: &'a Vec<Token>, original_pos: usize, prev_expr: Expression<'a>, @@ -131,47 +87,3 @@ where Ok((new_expr, next_pos)) } - -/// macro for handling indentation in expressions -#[macro_export] -macro_rules!
handle_indentation { - ($tokens: ident, $next_pos: ident, $indent_count: ident, $indentation_level: ident) => { - match ($tokens.get($next_pos), $tokens.get($next_pos + 1)) { - // New indentation level - (Some(t1), Some(t2)) - if t1.token_type == TokenType::NewLine && t2.token_type == TokenType::INDENT => - { - // set indentation - $next_pos += 2; - $indent_count += 1; - } - // we are indented, ignore newlines - (Some(t), _) if t.token_type == TokenType::NewLine && $indentation_level > 0 => { - $next_pos += 1; - } - // let other handlers handle this - _ => {} - }; - }; -} - -/// macro for handling dedentation in expressions -#[macro_export] -macro_rules! handle_dedentation { - ($tokens: ident, $next_pos: ident, $indent_count: ident) => { - for _ in 0..$indent_count { - // Expect a DEDENT for each indentation matched - match $tokens.get($next_pos) { - // continue - Some(t) if t.token_type == TokenType::DEDENT => {} - // This should be unreachable, as the lexer always emits a DEDENT for each INDENT - _ => unreachable!( - "Illegal parser state: Expected DEDENT (count: {})", - $indent_count - ), - }; - - $next_pos += 1; - } - }; -} diff --git a/src/syntax/utils.rs b/src/syntax/utils.rs index 502f2ca..b3f627d 100644 --- a/src/syntax/utils.rs +++ b/src/syntax/utils.rs @@ -4,8 +4,6 @@ use super::{ParsingError, ParsingResult}; pub trait Tokenizer { fn get_significant<'a>(&'a self, index: usize) -> Option<(&'a Token, usize)>; - - fn get_indented<'a>(&'a self, index: usize, indented: bool) -> (Option<&'a Token>, usize); } impl Tokenizer for Vec<Token> { @@ -30,32 +28,6 @@ impl Tokenizer for Vec<Token> { } } } - - // unused? remove?
- fn get_indented<'a>(&'a self, index: usize, indented: bool) -> (Option<&'a Token>, usize) { - if !indented { - return (self.get(index), index + 1); - } - - let mut current_pos = index; - - // Ignore all whitespace and newlines - loop { - match self.get(current_pos) { - Some(token) => { - if token.token_type == TokenType::INDENT - || token.token_type == TokenType::DEDENT - || token.token_type == TokenType::NewLine - { - current_pos += 1; - } else { - return (Some(token), current_pos); - } - } - None => return (None, index + 1), - } - } - } } /// Expects the token at `pos` to be an operator of value `operator`. Doesn't ignore whitespace or newlines