From c0e20ad2833cf20ed1b675f5b90be655e03072f7 Mon Sep 17 00:00:00 2001 From: Araozu Date: Wed, 31 Jul 2024 10:54:17 -0500 Subject: [PATCH] refactor: Dont store quotes inside string tokens --- src/error_handling/mod.rs | 1 + src/lexic/mod.rs | 2 ++ src/lexic/scanner/new_comment.rs | 28 ++++++++++++----------- src/lexic/scanner/number.rs | 4 ++++ src/lexic/scanner/string.rs | 29 +++++++++++++----------- src/lexic/token.rs | 10 +++++++- src/syntax/parsers/expression/primary.rs | 2 +- 7 files changed, 48 insertions(+), 28 deletions(-) diff --git a/src/error_handling/mod.rs b/src/error_handling/mod.rs index 490e986..ec6ca97 100644 --- a/src/error_handling/mod.rs +++ b/src/error_handling/mod.rs @@ -22,6 +22,7 @@ pub enum MistiError { pub struct LexError { pub position: usize, // TODO: Add and end position + pub end_position: usize, pub reason: String, } diff --git a/src/lexic/mod.rs b/src/lexic/mod.rs index bb25e17..7e02080 100755 --- a/src/lexic/mod.rs +++ b/src/lexic/mod.rs @@ -152,6 +152,7 @@ fn next_token( .unwrap_or_else(|| { let error = LexError { position: current_pos, + end_position: current_pos + 1, reason: format!( "Illegal character `{}` (escaped: {})", next_char, @@ -196,6 +197,7 @@ fn handle_indentation( // Illegal state: Indentation error let error = LexError { position: current_pos, + end_position: current_pos + 1, reason: format!( "Indentation error: expected {} spaces, found {}", new_top, spaces diff --git a/src/lexic/scanner/new_comment.rs b/src/lexic/scanner/new_comment.rs index 4a43b63..4f90850 100644 --- a/src/lexic/scanner/new_comment.rs +++ b/src/lexic/scanner/new_comment.rs @@ -34,22 +34,27 @@ fn scan_any_except_new_line( /// and the character at `start_pos + 1` is '*' pub fn scan_multiline(chars: &Vec, start_pos: usize) -> LexResult { match multiline_impl(chars, start_pos + 2) { - Some((value, next_position)) => LexResult::Some( + Ok((value, next_position)) => LexResult::Some( Token::new_multiline_comment(value.iter().collect(), start_pos), next_position, ), - None => { + Err(last_position) => { // Throw an error: Incomplete multiline comment LexResult::Err(LexError { position: start_pos, // TODO: add an end_position + end_position: last_position, reason: "Unfinished multiline commend".into(), }) } } } -fn multiline_impl(chars: &Vec, start_pos: usize) -> Option<(Vec, usize)> { +/// Implementation that scans the multiline comment. +/// +/// May only error if EOF is found before the comment is finished. +/// If Err, returns the last position where a char was available. +fn multiline_impl(chars: &Vec, start_pos: usize) -> Result<(Vec, usize), usize> { let mut current_position = start_pos; let mut result = Vec::::new(); @@ -61,10 +66,10 @@ fn multiline_impl(chars: &Vec, start_pos: usize) -> Option<(Vec, usi // Scan nested comment let (mut nested, next_position) = match multiline_impl(chars, current_position + 2) { - Some(v) => v, - None => { + Ok(v) => v, + Err(pos) => { // The nested comment is not closed. - return None; + return Err(pos); } }; result.push('/'); @@ -79,7 +84,7 @@ fn multiline_impl(chars: &Vec, start_pos: usize) -> Option<(Vec, usi result.push('/'); result.push(*c); } - None => return None, + None => return Err(current_position), } } Some('*') => { @@ -88,7 +93,7 @@ fn multiline_impl(chars: &Vec, start_pos: usize) -> Option<(Vec, usi Some('/') => { // Create and return the token, // ignoring the `*/` - return Some((result, current_position + 2)); + return Ok((result, current_position + 2)); } Some(c) => { // Append both and continue @@ -98,7 +103,7 @@ fn multiline_impl(chars: &Vec, start_pos: usize) -> Option<(Vec, usi } None => { // Throw an error - return None; + return Err(current_position); } } } @@ -108,10 +113,7 @@ fn multiline_impl(chars: &Vec, start_pos: usize) -> Option<(Vec, usi current_position += 1; } None => { - // TODO: Also return the position where this token ends, - // to display better error messages. - // Requires LexError to implement an end_position field - return None; + return Err(current_position); } } } diff --git a/src/lexic/scanner/number.rs b/src/lexic/scanner/number.rs index c35959b..c3605c4 100755 --- a/src/lexic/scanner/number.rs +++ b/src/lexic/scanner/number.rs @@ -53,6 +53,7 @@ fn scan_hex(chars: &Vec, start_pos: usize, current: String) -> LexResult { } _ => LexResult::Err(LexError { position: start_pos, + end_position: start_pos + 1, reason: String::from("Tried to scan an incomplete hex value"), }), } @@ -69,12 +70,14 @@ fn scan_double(chars: &Vec, start_pos: usize, current: String) -> LexResul Some(c) if utils::is_digit(*c) => scan_double_impl(chars, start_pos, current), Some(_) => LexResult::Err(LexError { position: start_pos, + end_position: start_pos + 1, reason: String::from( "The character after the dot when scanning a double is not a number.", ), }), _ => LexResult::Err(LexError { position: start_pos, + end_position: start_pos + 1, reason: String::from("EOF when scanning a double number."), }), } @@ -122,6 +125,7 @@ fn scan_scientific(chars: &Vec, start_pos: usize, current: String) -> LexR } _ => LexResult::Err(LexError { position: start_pos, + end_position: start_pos + 1, reason: String::from( "The characters after 'e' are not + or -, or are not followed by a number", ), diff --git a/src/lexic/scanner/string.rs b/src/lexic/scanner/string.rs index 8eaae0d..06f61e7 100755 --- a/src/lexic/scanner/string.rs +++ b/src/lexic/scanner/string.rs @@ -7,9 +7,11 @@ use crate::lexic::{utils, LexResult}; /// This function assumes that `start_pos` is after the first double quote, /// e.g. if the input is `"hello"`, `start_pos == 1` pub fn scan(chars: &Vec, start_pos: usize) -> LexResult { - scan_impl(chars, start_pos, String::from("\"")) + scan_impl(chars, start_pos, String::from("")) } +// TODO: This can be iterative instead of recursive + /// Recursive function that does the scanning pub fn scan_impl(chars: &Vec, start_pos: usize, current: String) -> LexResult { match chars.get(start_pos) { @@ -17,16 +19,16 @@ pub fn scan_impl(chars: &Vec, start_pos: usize, current: String) -> LexRes // start_pos is the position where the token ENDS, not where it STARTS, // so this is used to retrieve the original START position of the token // 1 is added to account for the opening `"` - let current_len = current.len(); + let current_len = current.len() + 1; - let final_str = format!("{}\"", current); LexResult::Some( - Token::new_string(final_str, start_pos - current_len), + Token::new_string(current, start_pos - current_len), start_pos + 1, ) } Some(c) if *c == '\n' => LexResult::Err(LexError { position: start_pos, + end_position: start_pos + 1, reason: String::from("Unexpected new line inside a string."), }), Some(c) if *c == '\\' => { @@ -40,6 +42,7 @@ pub fn scan_impl(chars: &Vec, start_pos: usize, current: String) -> LexRes Some(c) => scan_impl(chars, start_pos + 1, utils::str_append(current, *c)), None => LexResult::Err(LexError { position: start_pos, + end_position: start_pos + 1, reason: String::from("Incomplete string found"), }), } @@ -79,7 +82,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(2, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"\"", token.value); + assert_eq!("", token.value); assert_eq!(0, token.position); } else { panic!() @@ -93,7 +96,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(15, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Hello, world!\"", token.value); + assert_eq!("Hello, world!", token.value); assert_eq!(0, token.position); } else { panic!() @@ -118,7 +121,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Sample\\ntext\"", token.value); + assert_eq!("Sample\\ntext", token.value); assert_eq!(0, token.position); } else { panic!() @@ -129,7 +132,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Sample\\\"text\"", token.value); + assert_eq!("Sample\\\"text", token.value); assert_eq!(0, token.position); } else { panic!() @@ -140,7 +143,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Sample\\rtext\"", token.value); + assert_eq!("Sample\\rtext", token.value); assert_eq!(0, token.position); } else { panic!() @@ -151,7 +154,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Sample\\\\text\"", token.value); + assert_eq!("Sample\\\\text", token.value); assert_eq!(0, token.position); } else { panic!() @@ -162,7 +165,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Sample\\ttext\"", token.value); + assert_eq!("Sample\\ttext", token.value); assert_eq!(0, token.position); } else { panic!() @@ -173,7 +176,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Sample\\ text\"", token.value); + assert_eq!("Sample\\ text", token.value); assert_eq!(0, token.position); } else { panic!() @@ -187,7 +190,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("\"Sample\\atext\"", token.value); + assert_eq!("Sample\\atext", token.value); assert_eq!(0, token.position); } else { panic!() diff --git a/src/lexic/token.rs b/src/lexic/token.rs index d77b66b..78c8198 100755 --- a/src/lexic/token.rs +++ b/src/lexic/token.rs @@ -38,7 +38,15 @@ pub struct Token { impl Token { pub fn get_end_position(&self) -> usize { - self.position + self.value.len() + match self.token_type { + // 4 extra characters for /* and */ + TokenType::MultilineComment => self.position + self.value.len() + 4, + // 2 extra characters for // + TokenType::Comment => self.position + self.value.len() + 2, + // 2 extra characters for "" + TokenType::String => self.position + self.value.len() + 2, + _ => self.position + self.value.len() + } } } diff --git a/src/syntax/parsers/expression/primary.rs b/src/syntax/parsers/expression/primary.rs index f7bef7a..0063564 100644 --- a/src/syntax/parsers/expression/primary.rs +++ b/src/syntax/parsers/expression/primary.rs @@ -66,7 +66,7 @@ mod tests { match expression { Ok((Expression::String(value), _)) => { - assert_eq!("\"Hello\"", format!("{}", value)) + assert_eq!("Hello", format!("{}", value)) } _ => panic!(), }