refactor: Don't store quotes inside string tokens

This commit is contained in:
Araozu 2024-07-31 10:54:17 -05:00
parent a62d08455b
commit c0e20ad283
7 changed files with 48 additions and 28 deletions

View File

@ -22,6 +22,7 @@ pub enum MistiError {
pub struct LexError { pub struct LexError {
pub position: usize, pub position: usize,
// TODO: Add an end position // TODO: Add an end position
pub end_position: usize,
pub reason: String, pub reason: String,
} }

View File

@ -152,6 +152,7 @@ fn next_token(
.unwrap_or_else(|| { .unwrap_or_else(|| {
let error = LexError { let error = LexError {
position: current_pos, position: current_pos,
end_position: current_pos + 1,
reason: format!( reason: format!(
"Illegal character `{}` (escaped: {})", "Illegal character `{}` (escaped: {})",
next_char, next_char,
@ -196,6 +197,7 @@ fn handle_indentation(
// Illegal state: Indentation error // Illegal state: Indentation error
let error = LexError { let error = LexError {
position: current_pos, position: current_pos,
end_position: current_pos + 1,
reason: format!( reason: format!(
"Indentation error: expected {} spaces, found {}", "Indentation error: expected {} spaces, found {}",
new_top, spaces new_top, spaces

View File

@ -34,22 +34,27 @@ fn scan_any_except_new_line(
/// and the character at `start_pos + 1` is '*' /// and the character at `start_pos + 1` is '*'
pub fn scan_multiline(chars: &Vec<char>, start_pos: usize) -> LexResult { pub fn scan_multiline(chars: &Vec<char>, start_pos: usize) -> LexResult {
match multiline_impl(chars, start_pos + 2) { match multiline_impl(chars, start_pos + 2) {
Some((value, next_position)) => LexResult::Some( Ok((value, next_position)) => LexResult::Some(
Token::new_multiline_comment(value.iter().collect(), start_pos), Token::new_multiline_comment(value.iter().collect(), start_pos),
next_position, next_position,
), ),
None => { Err(last_position) => {
// Throw an error: Incomplete multiline comment // Throw an error: Incomplete multiline comment
LexResult::Err(LexError { LexResult::Err(LexError {
position: start_pos, position: start_pos,
// TODO: add an end_position // TODO: add an end_position
end_position: last_position,
reason: "Unfinished multiline commend".into(), reason: "Unfinished multiline commend".into(),
}) })
} }
} }
} }
fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usize)> { /// Implementation that scans the multiline comment.
///
/// May only error if EOF is found before the comment is finished.
/// If Err, returns the last position where a char was available.
fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Result<(Vec<char>, usize), usize> {
let mut current_position = start_pos; let mut current_position = start_pos;
let mut result = Vec::<char>::new(); let mut result = Vec::<char>::new();
@ -61,10 +66,10 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
// Scan nested comment // Scan nested comment
let (mut nested, next_position) = let (mut nested, next_position) =
match multiline_impl(chars, current_position + 2) { match multiline_impl(chars, current_position + 2) {
Some(v) => v, Ok(v) => v,
None => { Err(pos) => {
// The nested comment is not closed. // The nested comment is not closed.
return None; return Err(pos);
} }
}; };
result.push('/'); result.push('/');
@ -79,7 +84,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
result.push('/'); result.push('/');
result.push(*c); result.push(*c);
} }
None => return None, None => return Err(current_position),
} }
} }
Some('*') => { Some('*') => {
@ -88,7 +93,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
Some('/') => { Some('/') => {
// Create and return the token, // Create and return the token,
// ignoring the `*/` // ignoring the `*/`
return Some((result, current_position + 2)); return Ok((result, current_position + 2));
} }
Some(c) => { Some(c) => {
// Append both and continue // Append both and continue
@ -98,7 +103,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
} }
None => { None => {
// Throw an error // Throw an error
return None; return Err(current_position);
} }
} }
} }
@ -108,10 +113,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
current_position += 1; current_position += 1;
} }
None => { None => {
// TODO: Also return the position where this token ends, return Err(current_position);
// to display better error messages.
// Requires LexError to implement an end_position field
return None;
} }
} }
} }

View File

@ -53,6 +53,7 @@ fn scan_hex(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
} }
_ => LexResult::Err(LexError { _ => LexResult::Err(LexError {
position: start_pos, position: start_pos,
end_position: start_pos + 1,
reason: String::from("Tried to scan an incomplete hex value"), reason: String::from("Tried to scan an incomplete hex value"),
}), }),
} }
@ -69,12 +70,14 @@ fn scan_double(chars: &Vec<char>, start_pos: usize, current: String) -> LexResul
Some(c) if utils::is_digit(*c) => scan_double_impl(chars, start_pos, current), Some(c) if utils::is_digit(*c) => scan_double_impl(chars, start_pos, current),
Some(_) => LexResult::Err(LexError { Some(_) => LexResult::Err(LexError {
position: start_pos, position: start_pos,
end_position: start_pos + 1,
reason: String::from( reason: String::from(
"The character after the dot when scanning a double is not a number.", "The character after the dot when scanning a double is not a number.",
), ),
}), }),
_ => LexResult::Err(LexError { _ => LexResult::Err(LexError {
position: start_pos, position: start_pos,
end_position: start_pos + 1,
reason: String::from("EOF when scanning a double number."), reason: String::from("EOF when scanning a double number."),
}), }),
} }
@ -122,6 +125,7 @@ fn scan_scientific(chars: &Vec<char>, start_pos: usize, current: String) -> LexR
} }
_ => LexResult::Err(LexError { _ => LexResult::Err(LexError {
position: start_pos, position: start_pos,
end_position: start_pos + 1,
reason: String::from( reason: String::from(
"The characters after 'e' are not + or -, or are not followed by a number", "The characters after 'e' are not + or -, or are not followed by a number",
), ),

View File

@ -7,9 +7,11 @@ use crate::lexic::{utils, LexResult};
/// This function assumes that `start_pos` is after the first double quote, /// This function assumes that `start_pos` is after the first double quote,
/// e.g. if the input is `"hello"`, `start_pos == 1` /// e.g. if the input is `"hello"`, `start_pos == 1`
pub fn scan(chars: &Vec<char>, start_pos: usize) -> LexResult { pub fn scan(chars: &Vec<char>, start_pos: usize) -> LexResult {
scan_impl(chars, start_pos, String::from("\"")) scan_impl(chars, start_pos, String::from(""))
} }
// TODO: This can be iterative instead of recursive
/// Recursive function that does the scanning /// Recursive function that does the scanning
pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult { pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
match chars.get(start_pos) { match chars.get(start_pos) {
@ -17,16 +19,16 @@ pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexRes
// start_pos is the position where the token ENDS, not where it STARTS, // start_pos is the position where the token ENDS, not where it STARTS,
// so this is used to retrieve the original START position of the token // so this is used to retrieve the original START position of the token
// 1 is added to account for the opening `"` // 1 is added to account for the opening `"`
let current_len = current.len(); let current_len = current.len() + 1;
let final_str = format!("{}\"", current);
LexResult::Some( LexResult::Some(
Token::new_string(final_str, start_pos - current_len), Token::new_string(current, start_pos - current_len),
start_pos + 1, start_pos + 1,
) )
} }
Some(c) if *c == '\n' => LexResult::Err(LexError { Some(c) if *c == '\n' => LexResult::Err(LexError {
position: start_pos, position: start_pos,
end_position: start_pos + 1,
reason: String::from("Unexpected new line inside a string."), reason: String::from("Unexpected new line inside a string."),
}), }),
Some(c) if *c == '\\' => { Some(c) if *c == '\\' => {
@ -40,6 +42,7 @@ pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexRes
Some(c) => scan_impl(chars, start_pos + 1, utils::str_append(current, *c)), Some(c) => scan_impl(chars, start_pos + 1, utils::str_append(current, *c)),
None => LexResult::Err(LexError { None => LexResult::Err(LexError {
position: start_pos, position: start_pos,
end_position: start_pos + 1,
reason: String::from("Incomplete string found"), reason: String::from("Incomplete string found"),
}), }),
} }
@ -79,7 +82,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(2, next); assert_eq!(2, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"\"", token.value); assert_eq!("", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -93,7 +96,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(15, next); assert_eq!(15, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Hello, world!\"", token.value); assert_eq!("Hello, world!", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -118,7 +121,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Sample\\ntext\"", token.value); assert_eq!("Sample\\ntext", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -129,7 +132,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Sample\\\"text\"", token.value); assert_eq!("Sample\\\"text", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -140,7 +143,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Sample\\rtext\"", token.value); assert_eq!("Sample\\rtext", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -151,7 +154,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Sample\\\\text\"", token.value); assert_eq!("Sample\\\\text", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -162,7 +165,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Sample\\ttext\"", token.value); assert_eq!("Sample\\ttext", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -173,7 +176,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Sample\\ text\"", token.value); assert_eq!("Sample\\ text", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -187,7 +190,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("\"Sample\\atext\"", token.value); assert_eq!("Sample\\atext", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()

View File

@ -38,7 +38,15 @@ pub struct Token {
impl Token { impl Token {
pub fn get_end_position(&self) -> usize { pub fn get_end_position(&self) -> usize {
self.position + self.value.len() match self.token_type {
// 4 extra characters for /* and */
TokenType::MultilineComment => self.position + self.value.len() + 4,
// 2 extra characters for //
TokenType::Comment => self.position + self.value.len() + 2,
// 2 extra characters for ""
TokenType::String => self.position + self.value.len() + 2,
_ => self.position + self.value.len()
}
} }
} }

View File

@ -66,7 +66,7 @@ mod tests {
match expression { match expression {
Ok((Expression::String(value), _)) => { Ok((Expression::String(value), _)) => {
assert_eq!("\"Hello\"", format!("{}", value)) assert_eq!("Hello", format!("{}", value))
} }
_ => panic!(), _ => panic!(),
} }