From 14c1b6f8d8655d761efbd6a4773cc3e9085b9dab Mon Sep 17 00:00:00 2001 From: Araozu Date: Mon, 29 Jul 2024 16:18:33 -0500 Subject: [PATCH] feat: Scanning of multiline comments --- CHANGELOG.md | 2 + src/error_handling/mod.rs | 1 + src/lexic/mod.rs | 1 + src/lexic/scanner/mod.rs | 8 ++ src/lexic/scanner/new_comment.rs | 160 ++++++++++++++++++++++++++++++- src/lexic/token.rs | 9 ++ 6 files changed, 180 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 546cdf3..304e8c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,8 @@ ## v0.0.15 +- [x] Multiline comments +- [ ] Nested multiline comments - [ ] Include comments in the AST - [ ] Replace all panics with actual errors - [ ] Remove all old codegen diff --git a/src/error_handling/mod.rs b/src/error_handling/mod.rs index 928a5b6..490e986 100644 --- a/src/error_handling/mod.rs +++ b/src/error_handling/mod.rs @@ -21,6 +21,7 @@ pub enum MistiError { #[derive(Serialize, Debug)] pub struct LexError { pub position: usize, + // TODO: Add and end position pub reason: String, } diff --git a/src/lexic/mod.rs b/src/lexic/mod.rs index 267d849..bb25e17 100755 --- a/src/lexic/mod.rs +++ b/src/lexic/mod.rs @@ -135,6 +135,7 @@ fn next_token( .or_else(|| scanner::datatype(next_char, chars, current_pos)) .or_else(|| scanner::string(next_char, chars, current_pos)) .or_else(|| scanner::new_comment(next_char, chars, current_pos)) + .or_else(|| scanner::new_multiline_comment(next_char, chars, current_pos)) .or_else(|| scanner::operator(next_char, chars, current_pos)) .or_else(|| scanner::grouping_sign(next_char, chars, current_pos)) .or_else(|| scanner::new_line(next_char, chars, current_pos)) diff --git a/src/lexic/scanner/mod.rs b/src/lexic/scanner/mod.rs index c30899e..78c0055 100755 --- a/src/lexic/scanner/mod.rs +++ b/src/lexic/scanner/mod.rs @@ -68,3 +68,11 @@ pub fn new_comment(c: char, chars: &Vec, start_pos: usize) -> Option None, } } + +pub fn new_multiline_comment(c: char, chars: &Vec, start_pos: usize) -> Option { + let next_char = chars.get(start_pos + 1); + match (c, next_char) { + ('/', Some('*')) => Some(new_comment::scan_multiline(chars, start_pos)), + _ => None, + } +} diff --git a/src/lexic/scanner/new_comment.rs b/src/lexic/scanner/new_comment.rs index cb1e327..b373554 100644 --- a/src/lexic/scanner/new_comment.rs +++ b/src/lexic/scanner/new_comment.rs @@ -1,5 +1,8 @@ use super::token::Token; -use crate::lexic::{utils, LexResult}; +use crate::{ + error_handling::LexError, + lexic::{utils, LexResult}, +}; /// Scans a new line. /// @@ -26,6 +29,74 @@ fn scan_any_except_new_line( } } +/// Scans a multiline commend +/// This function assumes that the character at `start_pos` is '/' +/// and the character at `start_pos + 1` is '*' +pub fn scan_multiline(chars: &Vec, start_pos: usize) -> LexResult { + match multiline_impl(chars, start_pos + 2) { + Some((value, next_position)) => LexResult::Some( + Token::new_multiline_comment(value, start_pos), + next_position, + ), + None => { + // Throw an error: Incomplete multiline comment + LexResult::Err(LexError { + position: start_pos, + // TODO: add an end_position + reason: "Unfinished multiline commend".into(), + }) + } + } +} + +fn multiline_impl(chars: &Vec, start_pos: usize) -> Option<(String, usize)> { + let mut current_position = start_pos; + let mut result = Vec::::new(); + + loop { + match chars.get(current_position) { + Some('/') => { + // TODO: Check for a nested comment instead of + // appending + result.push('/'); + current_position += 1; + } + Some('*') => { + // Check for the end of a comment + match chars.get(current_position + 1) { + Some('/') => { + // Create and return the token, + // ignoring the `*/` + return Some((result.iter().collect(), current_position + 2)); + } + Some(c) => { + // Append both and continue + result.push('*'); + result.push(*c); + current_position += 2; + } + None => { + // Throw an error + return None; + } + } + } + Some(c) => { + // Append and continue + result.push(*c); + current_position += 1; + } + None => { + // Throw an error + // TODO: Also return the position where this token ends, + // to display better error messages. + // Requires LexError to implement an end_position field + return None; + } + } + } +} + #[cfg(test)] mod tests { use crate::lexic::scanner::TokenType; @@ -73,4 +144,91 @@ mod tests { } } } + + #[test] + fn should_scan_multiline() { + let input = str_to_vec("/**/"); + + let result = scan_multiline(&input, 0); + match result { + LexResult::Some(t, next) => { + assert_eq!(4, next); + assert_eq!("", t.value); + assert_eq!(0, t.position); + assert_eq!(TokenType::MultilineComment, t.token_type); + } + _ => { + panic!("Expected a multine comment") + } + } + } + + #[test] + fn should_scan_multiline_2() { + let input = str_to_vec("/* my comment */"); + + let result = scan_multiline(&input, 0); + match result { + LexResult::Some(t, next) => { + assert_eq!(16, next); + assert_eq!(" my comment ", t.value); + assert_eq!(0, t.position); + assert_eq!(TokenType::MultilineComment, t.token_type); + } + _ => { + panic!("Expected a multine comment") + } + } + } + + #[test] + fn should_scan_multiline_with_multiple_lines() { + let input = str_to_vec("/* my\ncomment */"); + + let result = scan_multiline(&input, 0); + match result { + LexResult::Some(t, next) => { + assert_eq!(16, next); + assert_eq!(" my\ncomment ", t.value); + assert_eq!(0, t.position); + assert_eq!(TokenType::MultilineComment, t.token_type); + } + _ => { + panic!("Expected a multine comment") + } + } + } + + #[test] + fn should_not_scan_multiline_comment_if_invalid() { + let input = str_to_vec("/* my\ncomment"); + + let result = scan_multiline(&input, 0); + match result { + LexResult::Err(error) => { + assert_eq!(0, error.position) + } + _ => { + panic!("Expected an error scannning an incomplete multiline comment") + } + } + } + + #[test] + fn should_scan_multiline_comments_with_asterisk() { + let input = str_to_vec("/* my * comment */"); + + let result = scan_multiline(&input, 0); + match result { + LexResult::Some(t, next) => { + assert_eq!(18, next); + assert_eq!(" my * comment ", t.value); + assert_eq!(0, t.position); + assert_eq!(TokenType::MultilineComment, t.token_type); + } + _ => { + panic!("Expected a multine comment") + } + } + } } diff --git a/src/lexic/token.rs b/src/lexic/token.rs index 57601ca..d77b66b 100755 --- a/src/lexic/token.rs +++ b/src/lexic/token.rs @@ -16,6 +16,7 @@ pub enum TokenType { RightBrace, NewLine, Comment, + MultilineComment, Comma, INDENT, DEDENT, @@ -114,6 +115,14 @@ impl Token { } } + pub fn new_multiline_comment(value: String, position: usize) -> Token { + Token { + token_type: TokenType::MultilineComment, + value, + position, + } + } + pub fn new_indent(position: usize) -> Token { Token { token_type: TokenType::INDENT,