From e383d300f22c219bccca40b3584c41b0f7cbc01e Mon Sep 17 00:00:00 2001 From: Araozu Date: Wed, 15 Mar 2023 16:33:00 -0500 Subject: [PATCH] Show offending line and token when a syntax error is found --- CHANGELOG.md | 2 + src/error_handling/lex_error.rs | 8 +- src/error_handling/mod.rs | 2 + src/error_handling/syntax_error.rs | 140 ++++++++++++++++++++++++++++- src/lexic/lex_error.rs | 9 -- src/lexic/scanner/identifier.rs | 9 +- src/lexic/scanner/mod.rs | 2 +- src/lexic/scanner/new_line.rs | 4 +- src/lexic/scanner/number.rs | 8 +- src/lexic/scanner/operator.rs | 2 +- src/lexic/scanner/string.rs | 2 +- src/repl/mod.rs | 2 +- src/syntax/binding.rs | 41 +++++++-- src/syntax/mod.rs | 8 +- src/token.rs | 34 +++---- 15 files changed, 220 insertions(+), 53 deletions(-) delete mode 100755 src/lexic/lex_error.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index a8085c3..f4d0ca5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,10 +10,12 @@ - [ ] Namespace identifiers in the symbol table - [ ] Stdlib - [ ] Document code +- [ ] Test that the field `position` of the tokens actually points to the start of the token, and not its length ## v0.0.4 - Explicit datatype of variables +- Improve error messages when a syntax error is found (show offending line and offending token) ## v0.0.3 diff --git a/src/error_handling/lex_error.rs b/src/error_handling/lex_error.rs index b5c8877..833504e 100644 --- a/src/error_handling/lex_error.rs +++ b/src/error_handling/lex_error.rs @@ -6,11 +6,7 @@ impl PrintableError for LexError { fn get_error_str(&self, chars: &Vec) -> String { let (erroneous_code, back_count) = get_line(chars, self.position); - let mut whitespace = Vec::::new(); - for _ in 0..back_count { - whitespace.push(' '); - } - let whitespace = whitespace.iter().collect::(); + let whitespace = vec![' '; back_count].iter().collect::(); format!( "\n{}\n{}^\n\n{}{}\n{}", @@ -41,6 +37,8 @@ fn get_line(chars: &Vec, pos: usize) -> (String, usize) { let current_char = chars[before_pos]; if current_char == '\n' { + // This is important because before_pos will be used to calculate + // the number of chars before pos before_pos += 1; break; } diff --git a/src/error_handling/mod.rs b/src/error_handling/mod.rs index 258b70e..2d08f20 100644 --- a/src/error_handling/mod.rs +++ b/src/error_handling/mod.rs @@ -19,6 +19,8 @@ pub struct LexError { #[derive(Debug)] pub struct SyntaxError { + pub error_start: usize, + pub error_end: usize, pub reason: String, } diff --git a/src/error_handling/syntax_error.rs b/src/error_handling/syntax_error.rs index 7d31d06..e8353cb 100644 --- a/src/error_handling/syntax_error.rs +++ b/src/error_handling/syntax_error.rs @@ -1,7 +1,145 @@ +use std::collections::VecDeque; + use super::{PrintableError, SyntaxError}; impl PrintableError for SyntaxError { fn get_error_str(&self, chars: &Vec) -> String { - String::from("Syntax error: NOT IMPLEMENTED") + let (line, before, length) = get_line(chars, self.error_start, self.error_end); + + let whitespace = vec![' '; before].iter().collect::(); + let indicator = vec!['^'; length].iter().collect::(); + + format!( + "\n{}\n{}{}\n\n{}{}{}", + line, whitespace, indicator, "Syntax error at pos ", self.error_start, ": " + ) + } +} + +/// Extracts a line of code +/// +/// - `chars`: Input where to extract the line from +/// - `start_position`: Position where the erroneous code starts +/// - `end_position`: Position where the erroneous code ends +/// +/// Returns a tuple of: +/// +/// - `String`: The faulty line +/// - `usize`: The amount of chars *before* the 
faulty code +/// - `usize`: The lenght of the faulty code +/// +/// ## Example +/// +/// ``` +/// let input = String::from("\n\nval number == 50\n\n").chars().into_iter().collect(); +/// let start_position = 13; +/// let end_position = 15; +/// +/// let (line, before, length) = get_line(&input, start_position, end_position); +/// +/// assert_eq!("val number == 50", line); +/// assert_eq!(11, before); +/// assert_eq!(2, length); +/// ``` +fn get_line( + chars: &Vec, + start_position: usize, + end_position: usize, +) -> (String, usize, usize) { + let mut result_chars = VecDeque::::new(); + + // Push chars to the front until a new line is found + let mut before_pos = start_position; + loop { + let current_char = chars[before_pos]; + + if current_char == '\n' { + // This is important because before_pos will be used to calculate + // the number of chars before start_position + before_pos += 1; + break; + } + + result_chars.push_front(current_char); + + if before_pos == 0 { + break; + } + + before_pos -= 1; + } + + // Push chars to the end until a new line is found + let mut after_pos = start_position + 1; + let char_count = chars.len(); + while after_pos < char_count { + let current_char = chars[after_pos]; + + if current_char == '\n' { + break; + } + + result_chars.push_back(current_char); + after_pos += 1; + } + + ( + result_chars.iter().collect::(), + start_position - before_pos, + end_position - start_position, + ) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + error_handling::{PrintableError, SyntaxError}, + lexic::get_tokens, + syntax::construct_ast, + }; + + fn get_error_data(input: String) -> (Vec, SyntaxError) { + let tokens = get_tokens(&input).unwrap(); + let error_holder = construct_ast(&tokens); + + match error_holder { + Ok(_) => panic!( + "syntax_error test: Input expected to throw error didn't:\n\n{}", + input + ), + Err(error) => { + let chars: Vec = input.chars().into_iter().collect(); + + (chars, error) + } + } + } + + #[test] + fn should_show_an_error_for_missing_binding_name() { + let (chars, error) = get_error_data(String::from("val")); + let actual_err = error.get_error_str(&chars); + // TODO: Write a better error message (something that explains why it failed) + let expected_str = format!("\n{}\n{}\n\n{}", "val", "^^^", "Syntax error at pos 0: "); + + assert_eq!(expected_str, actual_err); + } + + #[test] + fn should_get_line() { + let input: Vec = String::from("\n\nval number == 50\n\n") + .chars() + .into_iter() + .collect(); + + let start_position = 13; + let end_position = 15; + + let (line, before, length) = get_line(&input, start_position, end_position); + + assert_eq!("val number == 50", line); + assert_eq!(11, before); + assert_eq!(2, length); } } diff --git a/src/lexic/lex_error.rs b/src/lexic/lex_error.rs deleted file mode 100755 index cc4c65a..0000000 --- a/src/lexic/lex_error.rs +++ /dev/null @@ -1,9 +0,0 @@ - -/// Represents an error in the scanning process -#[derive(Debug)] -pub struct LexError { - /// Position where the offending char was found - pub position: usize, - /// Reason of the errror - pub reason: String, -} diff --git a/src/lexic/scanner/identifier.rs b/src/lexic/scanner/identifier.rs index 09bb6f1..be209dd 100755 --- a/src/lexic/scanner/identifier.rs +++ b/src/lexic/scanner/identifier.rs @@ -34,12 +34,15 @@ fn scan_impl(chars: &Vec, start_pos: usize, current: String, is_datatype: is_datatype, ), _ => { + // start_pos is the position where the token ENDS, not where it STARTS, + // so this is used to retrieve the original START 
position of the token + let current_len = current.len(); if let Some(token_type) = str_is_keyword(¤t) { - LexResult::Some(token::new(current, start_pos as i32, token_type), start_pos) + LexResult::Some(token::new(current, start_pos - current_len, token_type), start_pos) } else if is_datatype { - LexResult::Some(token::new_datatype(current, start_pos as i32), start_pos) + LexResult::Some(token::new_datatype(current, start_pos - current_len), start_pos) } else { - LexResult::Some(token::new_identifier(current, start_pos as i32), start_pos) + LexResult::Some(token::new_identifier(current, start_pos - current_len), start_pos) } } } diff --git a/src/lexic/scanner/mod.rs b/src/lexic/scanner/mod.rs index ee2524d..f6d08df 100755 --- a/src/lexic/scanner/mod.rs +++ b/src/lexic/scanner/mod.rs @@ -33,7 +33,7 @@ pub fn grouping_sign(c: char, _: &Vec, start_pos: usize) -> Option return None, }; - let token = token::new(c.to_string(), start_pos as i32, token_type); + let token = token::new(c.to_string(), start_pos, token_type); Some(LexResult::Some(token, start_pos + 1)) } diff --git a/src/lexic/scanner/new_line.rs b/src/lexic/scanner/new_line.rs index 197cf4f..6da4a2d 100644 --- a/src/lexic/scanner/new_line.rs +++ b/src/lexic/scanner/new_line.rs @@ -17,12 +17,12 @@ pub fn scan(chars: &Vec, start_pos: usize) -> LexResult { Some(c) if *c == ' ' => match look_ahead_for_new_line(chars, start_pos + 1) { Some(next_pos) => scan(chars, next_pos), None => { - let token = token::new(String::from(";"), start_pos as i32, TokenType::Semicolon); + let token = token::new(String::from(";"), start_pos, TokenType::Semicolon); LexResult::Some(token, start_pos) } }, Some(_) | None => { - let token = token::new(String::from(";"), start_pos as i32, TokenType::Semicolon); + let token = token::new(String::from(";"), start_pos, TokenType::Semicolon); LexResult::Some(token, start_pos) } } diff --git a/src/lexic/scanner/number.rs b/src/lexic/scanner/number.rs index fde623f..11728fb 100755 --- a/src/lexic/scanner/number.rs +++ b/src/lexic/scanner/number.rs @@ -33,7 +33,7 @@ fn scan_decimal(chars: &Vec, start_pos: usize, current: String) -> LexResu Some(c) if utils::is_digit(*c) => { scan_decimal(chars, start_pos + 1, utils::str_append(current, *c)) } - _ => LexResult::Some(token::new_number(current, start_pos as i32), start_pos), + _ => LexResult::Some(token::new_number(current, start_pos), start_pos), } } @@ -86,7 +86,7 @@ fn scan_double_impl(chars: &Vec, start_pos: usize, current: String) -> Lex Some(c) if *c == 'e' => { scan_scientific(chars, start_pos + 1, utils::str_append(current, *c)) } - _ => LexResult::Some(token::new_number(current, start_pos as i32), start_pos), + _ => LexResult::Some(token::new_number(current, start_pos), start_pos), } } @@ -123,7 +123,7 @@ fn scan_digits(chars: &Vec, start_pos: usize, current: String) -> (Token, Some(c) if utils::is_digit(*c) => { scan_digits(chars, start_pos + 1, utils::str_append(current, *c)) } - _ => (token::new_number(current, start_pos as i32), start_pos), + _ => (token::new_number(current, start_pos), start_pos), } } @@ -133,7 +133,7 @@ fn scan_hex_digits(chars: &Vec, start_pos: usize, current: String) -> (Tok Some(c) if utils::is_hex_digit(*c) => { scan_hex_digits(chars, start_pos + 1, utils::str_append(current, *c)) } - _ => (token::new_number(current, start_pos as i32), start_pos), + _ => (token::new_number(current, start_pos), start_pos), } } diff --git a/src/lexic/scanner/operator.rs b/src/lexic/scanner/operator.rs index 430b39b..9514c42 100755 --- 
a/src/lexic/scanner/operator.rs +++ b/src/lexic/scanner/operator.rs @@ -12,7 +12,7 @@ pub fn scan_impl(chars: &Vec, start_pos: usize, current: String) -> LexRes Some(c) if utils::is_operator(*c) => { scan_impl(chars, start_pos + 1, utils::str_append(current, *c)) } - _ => LexResult::Some(token::new_operator(current, start_pos as i32), start_pos), + _ => LexResult::Some(token::new_operator(current, start_pos), start_pos), } } diff --git a/src/lexic/scanner/string.rs b/src/lexic/scanner/string.rs index 6610243..0979b3c 100755 --- a/src/lexic/scanner/string.rs +++ b/src/lexic/scanner/string.rs @@ -13,7 +13,7 @@ pub fn scan(chars: &Vec, start_pos: usize) -> LexResult { pub fn scan_impl(chars: &Vec, start_pos: usize, current: String) -> LexResult { match chars.get(start_pos) { Some(c) if *c == '"' => { - LexResult::Some(token::new_string(current, start_pos as i32), start_pos + 1) + LexResult::Some(token::new_string(current, start_pos), start_pos + 1) } Some(c) if *c == '\n' => LexResult::Err(LexError { position: start_pos, diff --git a/src/repl/mod.rs b/src/repl/mod.rs index b3f6767..58b0cd7 100755 --- a/src/repl/mod.rs +++ b/src/repl/mod.rs @@ -39,7 +39,7 @@ fn build_ast(input: &String, tokens: Vec) { } Err(reason) => { let chars: Vec = input.chars().into_iter().collect(); - eprintln!("Syntax error.\n{}", reason.get_error_str(&chars)) + eprintln!("{}", reason.get_error_str(&chars)) } } } diff --git a/src/syntax/binding.rs b/src/syntax/binding.rs index 32d364a..e237aca 100644 --- a/src/syntax/binding.rs +++ b/src/syntax/binding.rs @@ -1,5 +1,6 @@ use super::ast_types::{Binding, ValBinding, VarBinding}; use super::{expression, SyntaxResult}; +use crate::error_handling::SyntaxError; use crate::token::{Token, TokenType}; // TODO: Should return a 3 state value: @@ -16,22 +17,27 @@ pub fn try_parse<'a>(tokens: &'a Vec, pos: usize) -> Option pos += 1; Some(String::from(&t.value)) } + // If the first token is anything else, ignore Some(_) => None, - // TODO: return Error - None => return None, + // This should never match, as there should always be at least a + // TokenType::Semicolon or TokenType::EOF + None => panic!( + "Internal compiler error: Illegal token stream at src/syntax/binding.rs#try_parse" + ), } }; // var/val keyword - let is_val = { + let (is_val, binding_token) = { let res1 = try_token_type(tokens, pos, TokenType::VAL); match res1 { - Some(_) => true, + Some(val_token) => (true, val_token), None => { let res2 = try_token_type(tokens, pos, TokenType::VAR); match res2 { - Some(_) => false, - // TODO: return Error + Some(var_token) => (false, var_token), + // Neither VAL nor VAR were matched, the parser should try + // other constructs None => return None, } } @@ -41,7 +47,12 @@ pub fn try_parse<'a>(tokens: &'a Vec, pos: usize) -> Option let identifier = try_token_type(tokens, pos + 1, TokenType::Identifier); if identifier.is_none() { // TODO: return Error - return None; + // The parser didn't find an Identifier after VAL/VAR + return Some(SyntaxResult::Err(SyntaxError { + reason: String::from("D:"), + error_start: binding_token.position, + error_end: binding_token.position + binding_token.value.len(), + })); } let identifier = identifier.unwrap(); @@ -154,4 +165,20 @@ mod tests { _ => panic!(), } } + + #[test] + fn should_return_correct_error() { + let tokens = get_tokens(&String::from("val")).unwrap(); + assert_eq!(TokenType::VAL, tokens[0].token_type); + assert_eq!(0, tokens[0].position); + let binding = try_parse(&tokens, 0).unwrap(); + + match binding { + SyntaxResult::Err(error) => 
{ + assert_eq!(0, error.error_start); + assert_eq!(3, error.error_end); + } + _ => panic!(), + } + } } diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index afc864f..fa0e143 100755 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -31,7 +31,10 @@ pub fn construct_ast<'a>(tokens: &'a Vec) -> Result, Syntax bindings: vec![module], }), SyntaxResult::None => Err(SyntaxError { - reason: String::from("D:"), + reason: String::from("PARSER couldn't parse any construction"), + // FIXME: This should get the position of the _token_ that current_pos points to + error_start: current_pos, + error_end: current_pos, }), SyntaxResult::Err(err) => Err(err), } @@ -42,6 +45,9 @@ fn next_construct<'a>(tokens: &'a Vec, current_pos: usize) -> SyntaxResul .unwrap_or_else(|| { SyntaxResult::Err(SyntaxError { reason: String::from("Unrecognized token"), + // FIXME: This should get the position of the _token_ that current_pos points to + error_start: current_pos, + error_end: current_pos, }) }) } diff --git a/src/token.rs b/src/token.rs index 3ad34d2..50e7a62 100755 --- a/src/token.rs +++ b/src/token.rs @@ -23,69 +23,69 @@ pub struct Token { pub value: String, /// The absolute position of this token, from the /// start of the file - _position: i32, + pub position: usize, } -pub fn new_eof(position: i32) -> Token { +pub fn new_eof(position: usize) -> Token { Token { token_type: TokenType::EOF, value: String::from(""), - _position: position, + position, } } -pub fn new_number(value: String, position: i32) -> Token { +pub fn new_number(value: String, position: usize) -> Token { Token { token_type: TokenType::Number, value, - _position: position, + position, } } -pub fn new_operator(value: String, position: i32) -> Token { +pub fn new_operator(value: String, position: usize) -> Token { Token { token_type: TokenType::Operator, value, - _position: position, + position, } } -pub fn new(value: String, position: i32, token_type: TokenType) -> Token { +pub fn new(value: String, position: usize, token_type: TokenType) -> Token { Token { token_type, value, - _position: position, + position, } } -pub fn new_identifier(value: String, position: i32) -> Token { +pub fn new_identifier(value: String, position: usize) -> Token { Token { token_type: TokenType::Identifier, value, - _position: position, + position, } } -pub fn new_string(value: String, position: i32) -> Token { +pub fn new_string(value: String, position: usize) -> Token { Token { token_type: TokenType::String, value, - _position: position, + position, } } -pub fn new_semicolon(position: i32) -> Token { +pub fn new_semicolon(position: usize) -> Token { Token { token_type: TokenType::Semicolon, value: String::from(";"), - _position: position, + position, } } -pub fn new_datatype(value: String, position: i32) -> Token { +pub fn new_datatype(value: String, position: usize) -> Token { Token { token_type: TokenType::Datatype, value, - _position: position, + position, } }
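
For reference, the caret-underline rendering this patch adds in src/error_handling/syntax_error.rs boils down to the standalone sketch below. It is a simplified restatement of the patched `get_line`/`get_error_str` logic, not the code in the patch itself: `get_line` and `error_message` are illustrative names, and the span [13, 15) for the offending `==` is taken from the doc example in the patch.

```rust
/// Extract the line containing the offending span and report
/// (line, chars before the span on that line, span length).
fn get_line(chars: &[char], start: usize, end: usize) -> (String, usize, usize) {
    // Walk backwards from `start` to the previous '\n' (or the beginning).
    let mut line_start = start;
    while line_start > 0 && chars[line_start - 1] != '\n' {
        line_start -= 1;
    }

    // Walk forwards from `start` to the next '\n' (or the end of input).
    let mut line_end = start;
    while line_end < chars.len() && chars[line_end] != '\n' {
        line_end += 1;
    }

    let line: String = chars[line_start..line_end].iter().collect();
    (line, start - line_start, end - start)
}

/// Render the offending line with carets under the span [start, end).
fn error_message(chars: &[char], start: usize, end: usize) -> String {
    let (line, before, length) = get_line(chars, start, end);
    let whitespace = " ".repeat(before);
    let indicator = "^".repeat(length);
    format!("\n{line}\n{whitespace}{indicator}\n\nSyntax error at pos {start}: ")
}

fn main() {
    let input: Vec<char> = "\n\nval number == 50\n\n".chars().collect();
    // The offending token `==` spans positions [13, 15) of the input.
    println!("{}", error_message(&input, 13, 15));
    // Prints the line "val number == 50" with two carets under the `==`,
    // followed by "Syntax error at pos 13: ".
}
```

The implementation in the patch builds the line through a VecDeque and the padding with vec![' '; before]; the sketch uses str::repeat instead, which yields the same output for the example above.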
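
The other half of the change is that Token::position now stores the index of the token's first character (replacing the private `_position: i32`), which lets the binding parser report a span of position .. position + value.len() for the offending token, as in src/syntax/binding.rs. Below is a trimmed-down sketch of that calculation; the structs mirror src/token.rs and src/error_handling/mod.rs but omit fields (such as token_type) that are not needed here, and `error_from_token` is an illustrative helper, not a function in the patch.

```rust
#[derive(Debug)]
pub struct Token {
    pub value: String,
    /// Absolute position of the token's first character in the input
    pub position: usize,
}

#[derive(Debug)]
pub struct SyntaxError {
    pub error_start: usize,
    pub error_end: usize,
    pub reason: String,
}

/// Build an error span covering the whole offending token.
/// Note: value.len() is a byte length, which matches the char count
/// only for ASCII tokens such as keywords and operators.
fn error_from_token(token: &Token, reason: &str) -> SyntaxError {
    SyntaxError {
        error_start: token.position,
        error_end: token.position + token.value.len(),
        reason: reason.to_string(),
    }
}

fn main() {
    // `val` at the very start of the input: the span should be [0, 3),
    // matching the expectations in binding.rs's should_return_correct_error test.
    let val_token = Token {
        value: String::from("val"),
        position: 0,
    };
    let err = error_from_token(&val_token, "Expected an identifier after `val`");
    assert_eq!(0, err.error_start);
    assert_eq!(3, err.error_end);
    println!("{err:?}");
}
```

One caveat, taken from the CHANGELOG entry added by this patch: there is still a pending task to verify that `position` really points at the start of every token kind rather than somewhere past it, so the span calculation above should be read with that in mind.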