From c0e20ad2833cf20ed1b675f5b90be655e03072f7 Mon Sep 17 00:00:00 2001
From: Araozu <fernando@araozu.dev>
Date: Wed, 31 Jul 2024 10:54:17 -0500
Subject: [PATCH] refactor: Don't store quotes inside string tokens

---
 src/error_handling/mod.rs                |  1 +
 src/lexic/mod.rs                         |  2 ++
 src/lexic/scanner/new_comment.rs         | 28 ++++++++++++-----------
 src/lexic/scanner/number.rs              |  4 ++++
 src/lexic/scanner/string.rs              | 29 +++++++++++++-----------
 src/lexic/token.rs                       | 10 +++++++-
 src/syntax/parsers/expression/primary.rs |  2 +-
 7 files changed, 48 insertions(+), 28 deletions(-)
diff --git a/src/error_handling/mod.rs b/src/error_handling/mod.rs
index 490e986..ec6ca97 100644
--- a/src/error_handling/mod.rs
+++ b/src/error_handling/mod.rs
@@ -22,6 +22,7 @@ pub enum MistiError {
 pub struct LexError {
     pub position: usize,
     // TODO: Add and end position
+    pub end_position: usize,
     pub reason: String,
 }
 
diff --git a/src/lexic/mod.rs b/src/lexic/mod.rs
index bb25e17..7e02080 100755
--- a/src/lexic/mod.rs
+++ b/src/lexic/mod.rs
@@ -152,6 +152,7 @@ fn next_token(
         .unwrap_or_else(|| {
             let error = LexError {
                 position: current_pos,
+                end_position: current_pos + 1,
                 reason: format!(
                     "Illegal character `{}` (escaped: {})",
                     next_char,
@@ -196,6 +197,7 @@ fn handle_indentation(
                 // Illegal state: Indentation error
                 let error = LexError {
                     position: current_pos,
+                    end_position: current_pos + 1,
                     reason: format!(
                         "Indentation error: expected {} spaces, found {}",
                         new_top, spaces
diff --git a/src/lexic/scanner/new_comment.rs b/src/lexic/scanner/new_comment.rs
index 4a43b63..4f90850 100644
--- a/src/lexic/scanner/new_comment.rs
+++ b/src/lexic/scanner/new_comment.rs
@@ -34,22 +34,27 @@ fn scan_any_except_new_line(
 /// and the character at `start_pos + 1` is '*'
 pub fn scan_multiline(chars: &Vec<char>, start_pos: usize) -> LexResult {
     match multiline_impl(chars, start_pos + 2) {
-        Some((value, next_position)) => LexResult::Some(
+        Ok((value, next_position)) => LexResult::Some(
             Token::new_multiline_comment(value.iter().collect(), start_pos),
             next_position,
         ),
-        None => {
+        Err(last_position) => {
             // Throw an error: Incomplete multiline comment
             LexResult::Err(LexError {
                 position: start_pos,
                 // TODO: add an end_position
+                end_position: last_position,
                 reason: "Unfinished multiline commend".into(),
             })
         }
     }
 }
 
-fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usize)> {
+/// Scans a multiline comment body; `start_pos` is just past the opening `/*`.
+///
+/// Returns `Err` only if EOF is reached before the comment is closed; the
+/// `Err` value is the position at which scanning stopped.
+fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Result<(Vec<char>, usize), usize> {
     let mut current_position = start_pos;
     let mut result = Vec::<char>::new();
 
@@ -61,10 +66,10 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
                         // Scan nested comment
                         let (mut nested, next_position) =
                             match multiline_impl(chars, current_position + 2) {
-                                Some(v) => v,
-                                None => {
+                                Ok(v) => v,
+                                Err(pos) => {
                                     // The nested comment is not closed.
-                                    return None;
+                                    return Err(pos);
                                 }
                             };
                         result.push('/');
@@ -79,7 +84,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
                         result.push('/');
                         result.push(*c);
                     }
-                    None => return None,
+                    None => return Err(current_position),
                 }
             }
             Some('*') => {
@@ -88,7 +93,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
                     Some('/') => {
                         // Create and return the token,
                         // ignoring the `*/`
-                        return Some((result, current_position + 2));
+                        return Ok((result, current_position + 2));
                     }
                     Some(c) => {
                         // Append both and continue
@@ -98,7 +103,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
                     }
                     None => {
                         // Throw an error
-                        return None;
+                        return Err(current_position);
                     }
                 }
             }
@@ -108,10 +113,7 @@ fn multiline_impl(chars: &Vec<char>, start_pos: usize) -> Option<(Vec<char>, usi
                 current_position += 1;
             }
             None => {
-                // TODO: Also return the position where this token ends,
-                // to display better error messages.
-                // Requires LexError to implement an end_position field
-                return None;
+                return Err(current_position);
             }
         }
     }
diff --git a/src/lexic/scanner/number.rs b/src/lexic/scanner/number.rs
index c35959b..c3605c4 100755
--- a/src/lexic/scanner/number.rs
+++ b/src/lexic/scanner/number.rs
@@ -53,6 +53,7 @@ fn scan_hex(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
         }
         _ => LexResult::Err(LexError {
             position: start_pos,
+            end_position: start_pos + 1,
             reason: String::from("Tried to scan an incomplete hex value"),
         }),
     }
@@ -69,12 +70,14 @@ fn scan_double(chars: &Vec<char>, start_pos: usize, current: String) -> LexResul
         Some(c) if utils::is_digit(*c) => scan_double_impl(chars, start_pos, current),
         Some(_) => LexResult::Err(LexError {
             position: start_pos,
+            end_position: start_pos + 1,
             reason: String::from(
                 "The character after the dot when scanning a double is not a number.",
             ),
         }),
         _ => LexResult::Err(LexError {
             position: start_pos,
+            end_position: start_pos + 1,
             reason: String::from("EOF when scanning a double number."),
         }),
     }
@@ -122,6 +125,7 @@ fn scan_scientific(chars: &Vec<char>, start_pos: usize, current: String) -> LexR
         }
         _ => LexResult::Err(LexError {
             position: start_pos,
+            end_position: start_pos + 1,
             reason: String::from(
                 "The characters after 'e' are not + or -, or are not followed by a number",
             ),
diff --git a/src/lexic/scanner/string.rs b/src/lexic/scanner/string.rs
index 8eaae0d..06f61e7 100755
--- a/src/lexic/scanner/string.rs
+++ b/src/lexic/scanner/string.rs
@@ -7,9 +7,11 @@ use crate::lexic::{utils, LexResult};
 /// This function assumes that `start_pos` is after the first double quote,
 /// e.g. if the input is `"hello"`, `start_pos == 1`
 pub fn scan(chars: &Vec<char>, start_pos: usize) -> LexResult {
-    scan_impl(chars, start_pos, String::from("\""))
+    scan_impl(chars, start_pos, String::from(""))
 }
 
+// TODO: This can be iterative instead of recursive
+
 /// Recursive function that does the scanning
 pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
     match chars.get(start_pos) {
@@ -17,16 +19,16 @@ pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexRes
             // start_pos is the position where the token ENDS, not where it STARTS,
             // so this is used to retrieve the original START position of the token
             // 1 is added to account for the opening `"`
-            let current_len = current.len();
+            let current_len = current.len() + 1;
 
-            let final_str = format!("{}\"", current);
             LexResult::Some(
-                Token::new_string(final_str, start_pos - current_len),
+                Token::new_string(current, start_pos - current_len),
                 start_pos + 1,
             )
         }
         Some(c) if *c == '\n' => LexResult::Err(LexError {
             position: start_pos,
+            end_position: start_pos + 1,
             reason: String::from("Unexpected new line inside a string."),
         }),
         Some(c) if *c == '\\' => {
@@ -40,6 +42,7 @@ pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexRes
         Some(c) => scan_impl(chars, start_pos + 1, utils::str_append(current, *c)),
         None => LexResult::Err(LexError {
             position: start_pos,
+            end_position: start_pos + 1,
             reason: String::from("Incomplete string found"),
         }),
     }
@@ -79,7 +82,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(2, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"\"", token.value);
+            assert_eq!("", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -93,7 +96,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(15, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Hello, world!\"", token.value);
+            assert_eq!("Hello, world!", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -118,7 +121,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(14, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Sample\\ntext\"", token.value);
+            assert_eq!("Sample\\ntext", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -129,7 +132,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(14, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Sample\\\"text\"", token.value);
+            assert_eq!("Sample\\\"text", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -140,7 +143,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(14, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Sample\\rtext\"", token.value);
+            assert_eq!("Sample\\rtext", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -151,7 +154,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(14, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Sample\\\\text\"", token.value);
+            assert_eq!("Sample\\\\text", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -162,7 +165,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(14, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Sample\\ttext\"", token.value);
+            assert_eq!("Sample\\ttext", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -173,7 +176,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(14, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Sample\\ text\"", token.value);
+            assert_eq!("Sample\\ text", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
@@ -187,7 +190,7 @@ mod tests {
         if let LexResult::Some(token, next) = scan(&input, start_pos) {
             assert_eq!(14, next);
             assert_eq!(TokenType::String, token.token_type);
-            assert_eq!("\"Sample\\atext\"", token.value);
+            assert_eq!("Sample\\atext", token.value);
             assert_eq!(0, token.position);
         } else {
             panic!()
diff --git a/src/lexic/token.rs b/src/lexic/token.rs
index d77b66b..78c8198 100755
--- a/src/lexic/token.rs
+++ b/src/lexic/token.rs
@@ -38,7 +38,15 @@ pub struct Token {
 
 impl Token {
     pub fn get_end_position(&self) -> usize {
-        self.position + self.value.len()
+        match self.token_type {
+            // 4 extra characters for /* and */
+            TokenType::MultilineComment => self.position + self.value.len() + 4,
+            // 2 extra characters for //
+            TokenType::Comment => self.position + self.value.len() + 2,
+            // 2 extra characters for ""
+            TokenType::String => self.position + self.value.len() + 2,
+            _ => self.position + self.value.len()
+        }
     }
 }
 
diff --git a/src/syntax/parsers/expression/primary.rs b/src/syntax/parsers/expression/primary.rs
index f7bef7a..0063564 100644
--- a/src/syntax/parsers/expression/primary.rs
+++ b/src/syntax/parsers/expression/primary.rs
@@ -66,7 +66,7 @@ mod tests {
 
         match expression {
             Ok((Expression::String(value), _)) => {
-                assert_eq!("\"Hello\"", format!("{}", value))
+                assert_eq!("Hello", format!("{}", value))
             }
             _ => panic!(),
         }