Compare commits

..

No commits in common. "78d01a8fc8856d4c4719279992a92f2b57d76925" and "be8c16ccf02618f84010f78d827f954bc76ffa3a" have entirely different histories.

7 changed files with 165 additions and 315 deletions

View File

@ -9,7 +9,6 @@
and a THP ast -> PHP ast process, so that the and a THP ast -> PHP ast process, so that the
codegen section can focus only on codegen, not on codegen section can focus only on codegen, not on
translation of thp->php. translation of thp->php.
- Ignore indentation where it doesn't matter
- Parse __more__ binary operators - Parse __more__ binary operators
- Store tokens for the semantic analysis phase, to have actual error reporting - Store tokens for the semantic analysis phase, to have actual error reporting
- Parse more complex bindings - Parse more complex bindings
@ -32,7 +31,6 @@
- [x] Begin work on a formal grammar - [x] Begin work on a formal grammar
- [x] Simplify/rewrite AST - [x] Simplify/rewrite AST
- [x] Properly parse expression indentation/dedentation
- [x] Define the top level constructs - [x] Define the top level constructs
- [ ] Include the original tokens in the AST - [ ] Include the original tokens in the AST
- [ ] Finish the workflow for a hello world - [ ] Finish the workflow for a hello world

View File

@ -3,8 +3,6 @@ use crate::{
syntax::{ast::Expression, ParsingError, ParsingResult}, syntax::{ast::Expression, ParsingError, ParsingResult},
}; };
use super::utils::try_binary_op;
/// Parses a factor expression. /// Parses a factor expression.
/// ///
/// ```ebnf /// ```ebnf
@ -16,41 +14,44 @@ pub fn try_parse(tokens: &Vec<Token>, pos: usize) -> ParsingResult<Expression> {
_ => return Err(ParsingError::Unmatched), _ => return Err(ParsingError::Unmatched),
}; };
parse_many(tokens, next_pos, term, 0) parse_many(tokens, next_pos, term)
} }
fn parse_many<'a>( fn parse_many<'a>(
tokens: &'a Vec<Token>, tokens: &'a Vec<Token>,
pos: usize, pos: usize,
prev_expr: Expression<'a>, prev_expr: Expression<'a>,
indentation_level: u32,
) -> ParsingResult<'a, Expression<'a>> { ) -> ParsingResult<'a, Expression<'a>> {
// comparison = term, ((">" | ">=" | "<" | "<="), term)*; // comparison = term, ((">" | ">=" | "<" | "<="), term)*;
try_binary_op(
tokens,
pos,
prev_expr,
vec![">", ">=", "<", "<="],
indentation_level,
|tokens, next_pos, prev_expr, token, indent_count: u32| match super::term::try_parse(
tokens, next_pos,
) {
Ok((expr, next_pos)) => {
let expr =
Expression::BinaryOperator(Box::new(prev_expr), Box::new(expr), &token.value);
parse_many(tokens, next_pos, expr, indentation_level + indent_count) match tokens.get(pos) {
Some(token)
if token.value == "<"
|| token.value == "<="
|| token.value == ">"
|| token.value == ">=" =>
{
match super::term::try_parse(tokens, pos + 1) {
Ok((expr, next_pos)) => {
let expr = Expression::BinaryOperator(
Box::new(prev_expr),
Box::new(expr),
&token.value,
);
parse_many(tokens, next_pos, expr)
}
_ => Err(ParsingError::Unmatched),
}
}
_ => Ok((prev_expr, pos)),
} }
_ => return Err(ParsingError::Unmatched),
},
)
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::lexic::get_tokens; use crate::lexic::get_tokens;
use crate::lexic::token::TokenType;
#[test] #[test]
fn should_parse_comparison() { fn should_parse_comparison() {
@ -87,80 +88,4 @@ mod tests {
_ => panic!("Expected an Unmatched error"), _ => panic!("Expected an Unmatched error"),
} }
} }
// A comparison whose operator starts on a following line: the token at
// index 5 must be the DEDENT and the parser must stop at position 6.
#[test]
fn should_parse_indented_1() {
    let source = String::from("a\n >= b");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(tokens[5].token_type, TokenType::DEDENT);
    assert_eq!(next_pos, 6);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, ">=")
    } else {
        panic!("Expected a binary operator")
    }
}
// Two chained comparisons, each on its own line: the parser must end at
// position 11 and the outermost node must be the `<=` operator.
#[test]
fn should_parse_indented_2() {
    let source = String::from("a\n <= b\n <= c");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(next_pos, 11);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "<=")
    } else {
        panic!("Expected a binary operator")
    }
}
// Two comparisons sharing one continuation line: the token at index 7 must
// be the DEDENT and the parser must stop at position 8.
#[test]
fn should_parse_indented_3() {
    let source = String::from("a\n <= b <= c");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(tokens[7].token_type, TokenType::DEDENT);
    assert_eq!(next_pos, 8);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "<=")
    } else {
        panic!("Expected a binary operator")
    }
}
// Two chained comparisons on separate lines: the parser must end at
// position 9 and the outermost node must be the `<=` operator.
#[test]
fn should_parse_indented_4() {
    let source = String::from("a\n <= b\n <= c");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(next_pos, 9);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "<=")
    } else {
        panic!("Expected a binary operator")
    }
}
// The line break comes after the operator instead of before it: the
// parser must still build a `>=` node and stop at position 6.
#[test]
fn should_parse_indented_5() {
    let source = String::from("a >=\n b");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(next_pos, 6);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, ">=")
    } else {
        panic!("Expected a binary operator")
    }
}
} }

View File

@ -1,10 +1,7 @@
use crate::{ use crate::{
lexic::token::Token, handle_dedentation, handle_indentation, lexic::token::{Token, TokenType}, syntax::{ast::Expression, ParsingError, ParsingResult}
syntax::{ast::Expression, ParsingError, ParsingResult},
}; };
use super::utils::try_binary_op;
/// Parses a factor expression. /// Parses a factor expression.
/// ///
/// ```ebnf /// ```ebnf
@ -26,30 +23,50 @@ fn parse_many<'a>(
indentation_level: u32, indentation_level: u32,
) -> ParsingResult<'a, Expression<'a>> { ) -> ParsingResult<'a, Expression<'a>> {
// equality = comparison, (("==" | "!="), comparison )*; // equality = comparison, (("==" | "!="), comparison )*;
try_binary_op(
tokens, let mut indent_count: u32 = 0;
pos, let mut next_pos = pos;
prev_expr,
vec!["==", "!="], // Handle possible indentation before binary operator
indentation_level, handle_indentation!(tokens, next_pos, indent_count, indentation_level);
|tokens, next_pos, prev_expr, token, indent_count: u32| match super::comparison::try_parse(
tokens, next_pos, let result = match tokens.get(next_pos) {
) { Some(token) if token.value == "==" || token.value == "!=" => {
next_pos += 1;
// Handle possible indentation after binary operator
handle_indentation!(tokens, next_pos, indent_count, indentation_level);
match super::comparison::try_parse(tokens, next_pos) {
Ok((expr, next_pos)) => { Ok((expr, next_pos)) => {
let expr = let expr = Expression::BinaryOperator(
Expression::BinaryOperator(Box::new(prev_expr), Box::new(expr), &token.value); Box::new(prev_expr),
Box::new(expr),
&token.value,
);
parse_many(tokens, next_pos, expr, indentation_level + indent_count) parse_many(tokens, next_pos, expr, indentation_level + indent_count)
} }
_ => return Err(ParsingError::Unmatched), _ => return Err(ParsingError::Unmatched),
}, }
) }
_ => return Ok((prev_expr, pos)),
};
let (new_expr, mut next_pos) = match result {
Ok((e, n)) => (e, n),
_ => return result,
};
handle_dedentation!(tokens, next_pos, indent_count);
Ok((new_expr, next_pos))
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::lexic::{get_tokens, token::TokenType}; use crate::lexic::get_tokens;
#[test] #[test]
fn should_parse_comparison() { fn should_parse_comparison() {

View File

@ -1,8 +1,5 @@
use crate::{ use crate::{
lexic::token::Token, handle_dedentation, handle_indentation, lexic::token::{Token, TokenType}, syntax::{ast::Expression, ParsingError, ParsingResult}
syntax::{
ast::Expression, parsers::expression::utils::try_binary_op, ParsingError, ParsingResult,
},
}; };
/// Parses a factor expression. /// Parses a factor expression.
@ -26,14 +23,20 @@ fn parse_many<'a>(
indentation_level: u32, indentation_level: u32,
) -> ParsingResult<'a, Expression<'a>> { ) -> ParsingResult<'a, Expression<'a>> {
// (("/" | "*"), unary)* // (("/" | "*"), unary)*
try_binary_op(
tokens, let mut indent_count: u32 = 0;
pos, let mut next_pos = pos;
prev_expr,
vec!["/", "*"], // Handle possible indentation before binary operator
indentation_level, handle_indentation!(tokens, next_pos, indent_count, indentation_level);
|tokens, next_pos, prev_expr, token, indent_count: u32| {
// match next let result = match tokens.get(next_pos) {
Some(token) if token.value == "/" || token.value == "*" => {
next_pos += 1;
// Handle possible indentation after binary operator
handle_indentation!(tokens, next_pos, indent_count, indentation_level);
match super::unary::try_parse(tokens, next_pos) { match super::unary::try_parse(tokens, next_pos) {
Ok((expr, next_pos)) => { Ok((expr, next_pos)) => {
let expr = Expression::BinaryOperator( let expr = Expression::BinaryOperator(
@ -46,14 +49,24 @@ fn parse_many<'a>(
} }
_ => return Err(ParsingError::Unmatched), _ => return Err(ParsingError::Unmatched),
} }
}, }
) _ => return Ok((prev_expr, pos)),
};
let (new_expr, mut next_pos) = match result {
Ok((e, n)) => (e, n),
_ => return result,
};
handle_dedentation!(tokens, next_pos, indent_count);
Ok((new_expr, next_pos))
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::lexic::{get_tokens, token::TokenType}; use crate::lexic::get_tokens;
#[test] #[test]
fn should_parse_comparison() { fn should_parse_comparison() {

View File

@ -1,4 +1,3 @@
use crate::syntax::parsers::expression::utils::try_binary_op;
use crate::{ use crate::{
lexic::token::Token, lexic::token::Token,
syntax::{ast::Expression, ParsingError, ParsingResult}, syntax::{ast::Expression, ParsingError, ParsingResult},
@ -15,26 +14,19 @@ pub fn try_parse(tokens: &Vec<Token>, pos: usize) -> ParsingResult<Expression> {
_ => return Err(ParsingError::Unmatched), _ => return Err(ParsingError::Unmatched),
}; };
parse_many(tokens, next_pos, factor, 0) parse_many(tokens, next_pos, factor)
} }
fn parse_many<'a>( fn parse_many<'a>(
tokens: &'a Vec<Token>, tokens: &'a Vec<Token>,
pos: usize, pos: usize,
prev_expr: Expression<'a>, prev_expr: Expression<'a>,
indentation_level: u32,
) -> ParsingResult<'a, Expression<'a>> { ) -> ParsingResult<'a, Expression<'a>> {
// term = factor, (("-" | "+"), factor)*; // term = factor, (("-" | "+"), factor)*;
try_binary_op( match tokens.get(pos) {
tokens, Some(token) if token.value == "+" || token.value == "-" => {
pos, match super::factor::try_parse(tokens, pos + 1) {
prev_expr,
vec!["+", "-"],
indentation_level,
|tokens, pos, prev_expr, token, indent_count: u32| {
// Parse the next factor
match super::factor::try_parse(tokens, pos) {
Ok((expr, next_pos)) => { Ok((expr, next_pos)) => {
let expr = Expression::BinaryOperator( let expr = Expression::BinaryOperator(
Box::new(prev_expr), Box::new(prev_expr),
@ -42,19 +34,19 @@ fn parse_many<'a>(
&token.value, &token.value,
); );
parse_many(tokens, next_pos, expr, indentation_level + indent_count) parse_many(tokens, next_pos, expr)
} }
_ => return Err(ParsingError::Unmatched), _ => Err(ParsingError::Unmatched),
}
}
_ => Ok((prev_expr, pos)),
} }
},
)
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::lexic::get_tokens; use crate::lexic::get_tokens;
use crate::lexic::token::TokenType;
#[test] #[test]
fn should_parse_comparison() { fn should_parse_comparison() {
@ -91,80 +83,4 @@ mod tests {
_ => panic!("Expected an Unmatched error"), _ => panic!("Expected an Unmatched error"),
} }
} }
// A term whose operator starts on a following line: the token at index 5
// must be the DEDENT and the parser must stop at position 6.
#[test]
fn should_parse_indented_1() {
    let source = String::from("a\n + b");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(tokens[5].token_type, TokenType::DEDENT);
    assert_eq!(next_pos, 6);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "+")
    } else {
        panic!("Expected a binary operator")
    }
}
// Two chained additions, each on its own line: the parser must end at
// position 11 and the outermost node must be the `+` operator.
#[test]
fn should_parse_indented_2() {
    let source = String::from("a\n + b\n + c");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(next_pos, 11);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "+")
    } else {
        panic!("Expected a binary operator")
    }
}
// Two additions sharing one continuation line: the token at index 7 must
// be the DEDENT and the parser must stop at position 8.
#[test]
fn should_parse_indented_3() {
    let source = String::from("a\n + b + c");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(tokens[7].token_type, TokenType::DEDENT);
    assert_eq!(next_pos, 8);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "+")
    } else {
        panic!("Expected a binary operator")
    }
}
// Two chained additions on separate lines: the parser must end at
// position 9 and the outermost node must be the `+` operator.
#[test]
fn should_parse_indented_4() {
    let source = String::from("a\n + b\n + c");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(next_pos, 9);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "+")
    } else {
        panic!("Expected a binary operator")
    }
}
// The line break comes after the operator instead of before it: the
// parser must still build a `+` node and stop at position 6.
#[test]
fn should_parse_indented_5() {
    let source = String::from("a +\n b");
    let tokens = get_tokens(&source).unwrap();
    let (expr, next_pos) = try_parse(&tokens, 0).unwrap();

    assert_eq!(next_pos, 6);

    if let Expression::BinaryOperator(_, _, op) = expr {
        assert_eq!(op, "+")
    } else {
        panic!("Expected a binary operator")
    }
}
} }

View File

@ -1,89 +1,43 @@
use crate::lexic::token::Token; /// macro for handling indentation in expressions
use crate::lexic::token::TokenType::{NewLine, DEDENT, INDENT}; #[macro_export]
use crate::syntax::ast::Expression; macro_rules! handle_indentation {
use crate::syntax::parseable::ParsingResult; ($tokens: ident, $next_pos: ident, $indent_count: ident, $indentation_level: ident) => {
match ($tokens.get($next_pos), $tokens.get($next_pos + 1)) {
/// Parses a binary operator, handles indentation and runs a function on it. // New indentation level
/// (Some(t1), Some(t2))
/// First, handles indentation before the binary operator. Then, tries to if t1.token_type == TokenType::NewLine && t2.token_type == TokenType::INDENT =>
/// parse the binary operator. Then, handles indentation after the binary
/// operator.
///
/// After this runs the function `fun`. Finishes by handling dedentation
/// parsed in the previous phase.
pub fn try_binary_op<'a, F>(
tokens: &'a Vec<Token>,
original_pos: usize,
prev_expr: Expression<'a>,
operators: Vec<&str>,
indentation_level: u32,
fun: F,
) -> ParsingResult<'a, Expression<'a>>
where
F: FnOnce(
&'a Vec<Token>,
usize,
Expression<'a>,
&'a Token,
u32,
) -> ParsingResult<'a, Expression<'a>>,
{ {
let mut indent_count = 0; // set indentation
let pos = original_pos; $next_pos += 2;
$indent_count += 1;
// handle possible opening indentation
let pos = match (tokens.get(pos), tokens.get(pos + 1)) {
// New indentation level
(Some(t1), Some(t2)) if t1.token_type == NewLine && t2.token_type == INDENT => {
indent_count += 1;
pos + 2
} }
// when indented, ignore newlines // we are indented, ignore newlines
(Some(t), _) if t.token_type == NewLine && indentation_level > 0 => pos + 1, (Some(t), _) if t.token_type == TokenType::NewLine && $indentation_level > 0 => {
// let other handlers handle this $next_pos += 1;
_ => pos,
};
// try to parse any of the binary operators
let (matched_token, pos) = match tokens.get(pos) {
Some(token) if operators.contains(&token.value.as_str()) => (token, pos + 1),
// If not matched, return the existing expression
_ => return Ok((prev_expr, original_pos)),
};
// handle possible closing indentation
let pos = match (tokens.get(pos), tokens.get(pos + 1)) {
// New indentation level
(Some(t1), Some(t2)) if t1.token_type == NewLine && t2.token_type == INDENT => {
indent_count += 1;
pos + 2
} }
// when indented, ignore newlines
(Some(t), _) if t.token_type == NewLine && indentation_level > 0 => pos + 1,
// let other handlers handle this // let other handlers handle this
_ => pos, _ => {}
}; };
// run the rest of the logic
let (new_expr, mut next_pos) = match fun(tokens, pos, prev_expr, matched_token, indent_count) {
Ok((e, n)) => (e, n),
x => return x,
}; };
}
// handle the possible dedentation before/after the operator /// macro for handling dedentation in expressions
for _ in 0..indent_count { #[macro_export]
// expect a DEDENT for each INDENT matched macro_rules! handle_dedentation {
match tokens.get(next_pos) { ($tokens: ident, $next_pos: ident, $indent_count: ident) => {
for _ in 0..$indent_count {
// Expect a DEDENT for each indentation matched
match $tokens.get($next_pos) {
// continue // continue
Some(t) if t.token_type == DEDENT => {} Some(t) if t.token_type == TokenType::DEDENT => {}
// This should be unreachable, as the lexer always emits a DEDENT for each INDENT
_ => unreachable!( _ => unreachable!(
"Illegal parser state: Expected DEDENT (count: {})", "Illegal parser state: Expected DEDENT (count: {})",
indent_count $indent_count
), ),
}; };
next_pos += 1; $next_pos += 1;
} }
};
Ok((new_expr, next_pos))
} }

View File

@ -4,6 +4,8 @@ use super::{ParsingError, ParsingResult};
pub trait Tokenizer { pub trait Tokenizer {
fn get_significant<'a>(&'a self, index: usize) -> Option<(&'a Token, usize)>; fn get_significant<'a>(&'a self, index: usize) -> Option<(&'a Token, usize)>;
fn get_indented<'a>(&'a self, index: usize, indented: bool) -> (Option<&'a Token>, usize);
} }
impl Tokenizer for Vec<Token> { impl Tokenizer for Vec<Token> {
@ -28,6 +30,31 @@ impl Tokenizer for Vec<Token> {
} }
} }
} }
/// Looks up a token starting at `index`, optionally skipping layout tokens.
///
/// * When `indented` is `false`, no skipping happens: the result is the
///   token at `index` (if any) paired with `index + 1`.
/// * When `indented` is `true`, every `INDENT`, `DEDENT` and `NewLine`
///   token from `index` onwards is skipped. The first token of any other
///   type is returned together with the position where it was found. If the
///   tokens run out first, the result is `(None, index + 1)`.
///
/// NOTE(review): the position component is `index + 1` in the non-indented
/// and exhausted cases, but the matched token's own position in the
/// indented case — confirm with callers that this difference is intended.
fn get_indented<'a>(&'a self, index: usize, indented: bool) -> (Option<&'a Token>, usize) {
    if !indented {
        return (self.get(index), index + 1);
    }

    // Walk forward past indentation, dedentation and newline tokens
    let mut cursor = index;
    while let Some(candidate) = self.get(cursor) {
        let is_layout = [TokenType::INDENT, TokenType::DEDENT, TokenType::NewLine]
            .contains(&candidate.token_type);

        if !is_layout {
            return (Some(candidate), cursor);
        }
        cursor += 1;
    }

    // Ran past the last token without finding a non-layout token
    (None, index + 1)
}
} }
/// Expects the token at `pos` to be an operator of value `operator`. Doesn't ignore whitespace or newlines /// Expects the token at `pos` to be an operator of value `operator`. Doesn't ignore whitespace or newlines