diff --git a/src/lexic/mod.rs b/src/lexic/mod.rs
index be4bf8d..fef9756 100755
--- a/src/lexic/mod.rs
+++ b/src/lexic/mod.rs
@@ -6,6 +6,8 @@ pub mod token;
 use crate::error_handling::{LexError, MistiError};
 use token::Token;
 
+use self::token::TokenType;
+
 type Chars = Vec<char>;
 
 /// Represents the result of scanning a single token from the input
@@ -39,10 +41,17 @@ pub fn get_tokens(input: &String) -> Result<Vec<Token>, MistiError> {
     let chars: Vec<char> = input.chars().into_iter().collect();
     let mut results = Vec::new();
     let mut current_pos: usize = 0;
+    let mut indentation_stack = Vec::<usize>::new();
+    // Used to emit INDENT & DEDENT tokens
+    let mut at_new_line = false;
 
     while has_input(&chars, current_pos) {
-        match next_token(&chars, current_pos) {
+        match next_token(&chars, current_pos, &mut indentation_stack, at_new_line) {
             LexResult::Some(token, next_pos) => {
+                if token.token_type == TokenType::NewLine {
+                    at_new_line = true;
+                }
+
                 results.push(token);
                 current_pos = next_pos;
             }
@@ -56,12 +65,15 @@ pub fn get_tokens(input: &String) -> Result<Vec<Token>, MistiError> {
     }
 
     results.push(Token::new_semicolon(0));
+
+    // TODO: emit DEDENT tokens for each entry in indentation_stack
+
     results.push(Token::new_eof(0));
 
     Ok(results)
 }
 
 /// Scans a single token from `chars`, starting from `current_pos`
-fn next_token(chars: &Chars, current_pos: usize) -> LexResult {
+fn next_token(chars: &Chars, current_pos: usize, indentation_stack: &mut Vec<usize>, at_new_line: bool) -> LexResult {
     let next_char = peek(chars, current_pos);
 
     // If EOF is reached return nothing but the current position
@@ -70,8 +82,33 @@ fn next_token(chars: &Chars, current_pos: usize) -> LexResult {
     }
 
     // Handle whitespace recursively.
-    if next_char == ' ' {
-        return next_token(chars, current_pos + 1);
+    if next_char == ' ' && !at_new_line {
+        return next_token(chars, current_pos + 1, indentation_stack, false);
+    }
+    // When whitespace is found at the start of the line, emit INDENT/DEDENT
+    else if next_char == ' ' && at_new_line {
+        // Count the number of spaces
+        let mut spaces = 0;
+        let mut sub_pos = current_pos;
+        while peek(chars, sub_pos) == ' ' {
+            spaces += 1;
+            sub_pos += 1;
+        }
+
+        // Compare the number of spaces with the top of the stack
+        let top = indentation_stack.last().unwrap_or(&0);
+        if spaces > *top {
+            // Push the new indentation level
+            indentation_stack.push(spaces);
+            return LexResult::Some(Token::new_indent(current_pos), current_pos + spaces);
+        } else if spaces < *top {
+            // Pop the indentation level
+            indentation_stack.pop();
+            return LexResult::Some(Token::new_dedent(current_pos), current_pos + spaces);
+        } else {
+            // Same indentation level
+            return next_token(chars, current_pos + spaces, indentation_stack, true);
+        }
     }
 
     // Scanners
@@ -141,11 +178,12 @@ mod tests {
     fn t() {
         let input = String::from("126 ");
         let chars: Vec<char> = input.chars().into_iter().collect();
+        let mut indentation_stack = Vec::<usize>::new();
 
         assert_eq!(4, chars.len());
         assert!(has_input(&chars, 0));
 
-        match next_token(&chars, 0) {
+        match next_token(&chars, 0, &mut indentation_stack, true) {
             LexResult::Some(t, _) => {
                 assert_eq!("126", t.value)
             }
@@ -175,7 +213,7 @@ mod tests {
         assert_eq!("1789e+1", tokens.get(3).unwrap().value);
         assert_eq!("239.3298e-103", tokens.get(4).unwrap().value);
 
-        assert_eq!(TokenType::Semicolon, tokens.get(5).unwrap().token_type);
+        assert_eq!(TokenType::NewLine, tokens.get(5).unwrap().token_type);
         assert_eq!(TokenType::EOF, tokens.get(6).unwrap().token_type);
     }
 
@@ -222,7 +260,7 @@ mod tests {
         let input = String::from("3\n22");
         let tokens = get_tokens(&input).unwrap();
 
-        assert_eq!(TokenType::Semicolon, tokens[1].token_type);
+        assert_eq!(TokenType::NewLine, tokens[1].token_type);
     }
 
     #[test]
@@ -230,7 +268,7 @@
         let input = String::from("3\n\n\n22");
         let tokens = get_tokens(&input).unwrap();
 
-        assert_eq!(TokenType::Semicolon, tokens[1].token_type);
+        assert_eq!(TokenType::NewLine, tokens[1].token_type);
         assert_eq!(TokenType::Number, tokens[2].token_type);
     }
 
@@ -239,7 +277,45 @@
         let input = String::from("3\n \n \n22");
         let tokens = get_tokens(&input).unwrap();
 
-        assert_eq!(TokenType::Semicolon, tokens[1].token_type);
+        assert_eq!(TokenType::NewLine, tokens[1].token_type);
         assert_eq!(TokenType::Number, tokens[2].token_type);
     }
+
+    #[test]
+    fn should_emit_indent_token() {
+        let input = String::from("3\n \n 22");
+        let tokens = get_tokens(&input).unwrap();
+
+        assert_eq!(TokenType::Number, tokens[0].token_type);
+        assert_eq!(TokenType::NewLine, tokens[1].token_type);
+        assert_eq!(TokenType::INDENT, tokens[2].token_type);
+        assert_eq!(TokenType::Number, tokens[3].token_type);
+    }
+
+    #[test]
+    fn should_emit_indent_when_indentation_increases() {
+        let input = String::from("3\n \n 22\n 111");
+        let tokens = get_tokens(&input).unwrap();
+
+        assert_eq!(TokenType::Number, tokens[0].token_type);
+        assert_eq!(TokenType::NewLine, tokens[1].token_type);
+        assert_eq!(TokenType::INDENT, tokens[2].token_type);
+        assert_eq!(TokenType::Number, tokens[3].token_type);
+        assert_eq!(TokenType::NewLine, tokens[4].token_type);
+        assert_eq!(TokenType::INDENT, tokens[5].token_type);
+        assert_eq!(TokenType::Number, tokens[6].token_type);
+    }
+
+    #[test]
+    fn shouldnt_emit_indent_when_indentation_stays() {
+        let input = String::from("3\n \n 22\n 111");
+        let tokens = get_tokens(&input).unwrap();
+
+        assert_eq!(TokenType::Number, tokens[0].token_type);
+        assert_eq!(TokenType::NewLine, tokens[1].token_type);
+        assert_eq!(TokenType::INDENT, tokens[2].token_type);
+        assert_eq!(TokenType::Number, tokens[3].token_type);
+        assert_eq!(TokenType::NewLine, tokens[4].token_type);
+        assert_eq!(TokenType::Number, tokens[5].token_type);
+    }
 }
diff --git a/src/lexic/scanner/new_line.rs b/src/lexic/scanner/new_line.rs
index e2cb02e..2646a58 100644
--- a/src/lexic/scanner/new_line.rs
+++ b/src/lexic/scanner/new_line.rs
@@ -3,9 +3,6 @@ use crate::lexic::{token::Token, LexResult};
 
 /// Function to handle new lines
 ///
-/// It performs Automatic Semicolon Insertion, inserting a semicolon after
-/// every new line or group of new lines
-///
 /// Assumes the char at start_pos is a new line
 pub fn scan(chars: &Vec<char>, start_pos: usize) -> LexResult {
     let current = chars.get(start_pos);
@@ -15,12 +12,12 @@
         Some(c) if *c == ' ' => match look_ahead_for_new_line(chars, start_pos + 1) {
             Some(next_pos) => scan(chars, next_pos),
             None => {
-                let token = Token::new(String::from(";"), start_pos, TokenType::Semicolon);
+                let token = Token::new(String::from(";"), start_pos, TokenType::NewLine);
                 LexResult::Some(token, start_pos)
             }
         },
         Some(_) | None => {
-            let token = Token::new(String::from(";"), start_pos, TokenType::Semicolon);
+            let token = Token::new(String::from(";"), start_pos, TokenType::NewLine);
             LexResult::Some(token, start_pos)
         }
     }
@@ -51,7 +48,7 @@ mod tests {
         let start_pos = 0;
 
         if let LexResult::Some(token, next_pos) = scan(&input, start_pos) {
-            assert_eq!(TokenType::Semicolon, token.token_type);
+            assert_eq!(TokenType::NewLine, token.token_type);
             assert_eq!(1, next_pos);
         } else {
             panic!()
@@ -64,7 +61,7 @@
         let start_pos = 0;
 
         if let LexResult::Some(token, next_pos) = scan(&input, start_pos) {
-            assert_eq!(TokenType::Semicolon, token.token_type);
+            assert_eq!(TokenType::NewLine, token.token_type);
             assert_eq!(3, next_pos);
         } else {
             panic!()
@@ -74,7 +71,7 @@
         let start_pos = 0;
 
         if let LexResult::Some(token, next_pos) = scan(&input, start_pos) {
-            assert_eq!(TokenType::Semicolon, token.token_type);
+            assert_eq!(TokenType::NewLine, token.token_type);
             assert_eq!(3, next_pos);
         } else {
             panic!()
@@ -87,7 +84,7 @@
         let start_pos = 0;
 
         if let LexResult::Some(token, next_pos) = scan(&input, start_pos) {
-            assert_eq!(TokenType::Semicolon, token.token_type);
+            assert_eq!(TokenType::NewLine, token.token_type);
             assert_eq!(6, next_pos);
         } else {
             panic!()
@@ -97,7 +94,7 @@
         let start_pos = 0;
 
         if let LexResult::Some(token, next_pos) = scan(&input, start_pos) {
-            assert_eq!(TokenType::Semicolon, token.token_type);
+            assert_eq!(TokenType::NewLine, token.token_type);
             assert_eq!(6, next_pos);
         } else {
             panic!()
@@ -107,7 +104,7 @@
         let start_pos = 0;
 
         if let LexResult::Some(token, next_pos) = scan(&input, start_pos) {
-            assert_eq!(TokenType::Semicolon, token.token_type);
+            assert_eq!(TokenType::NewLine, token.token_type);
             assert_eq!(6, next_pos);
         } else {
             panic!()
diff --git a/src/lexic/token.rs b/src/lexic/token.rs
index 04ff33a..91b8910 100755
--- a/src/lexic/token.rs
+++ b/src/lexic/token.rs
@@ -11,8 +11,10 @@ pub enum TokenType {
     RightBracket,
     LeftBrace,
     RightBrace,
-    Semicolon,
+    NewLine,
     Comment,
+    INDENT,
+    DEDENT,
     VAR,
     VAL,
     EOF,
@@ -86,7 +88,7 @@ impl Token {
 
     pub fn new_semicolon(position: usize) -> Token {
         Token {
-            token_type: TokenType::Semicolon,
+            token_type: TokenType::NewLine,
             value: String::from(";"),
             position,
         }
@@ -107,4 +109,20 @@ impl Token {
             position,
         }
     }
+
+    pub fn new_indent(position: usize) -> Token {
+        Token {
+            token_type: TokenType::INDENT,
+            value: String::from(""),
+            position,
+        }
+    }
+
+    pub fn new_dedent(position: usize) -> Token {
+        Token {
+            token_type: TokenType::DEDENT,
+            value: String::from(""),
+            position,
+        }
+    }
 }
diff --git a/src/lexic/utils.rs b/src/lexic/utils.rs
index 87737ec..de52aa3 100755
--- a/src/lexic/utils.rs
+++ b/src/lexic/utils.rs
@@ -3,7 +3,7 @@ pub fn is_digit(c: char) -> bool {
     '0' <= c && c <= '9'
 }
 
-/// Whether `c` is between `a-fA-F`
+/// Whether `c` is between `0-9a-fA-F`
 pub fn is_hex_digit(c: char) -> bool {
     is_digit(c) || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F'
 }
diff --git a/src/syntax/binding.rs b/src/syntax/binding.rs
index 20264eb..f893393 100644
--- a/src/syntax/binding.rs
+++ b/src/syntax/binding.rs
@@ -129,7 +129,7 @@ pub fn try_parse<'a>(tokens: &'a Vec<Token>, pos: usize) -> Option
 fn try_token_type(tokens: &Vec<Token>, pos: usize, token_type: TokenType) -> Result3<&Token> {
     match tokens.get(pos) {
         Some(t) if t.token_type == token_type => Result3::Ok(t),
-        Some(t) if t.token_type == TokenType::Semicolon || t.token_type == TokenType::EOF => {
+        Some(t) if t.token_type == TokenType::NewLine || t.token_type == TokenType::EOF => {
             Result3::None
         }
         Some(t) => Result3::Err(t),
@@ -140,7 +140,7 @@ fn try_operator(tokens: &Vec<Token>, pos: usize, operator: String) -> Result3<&Token> {
     match tokens.get(pos) {
         Some(t) if t.token_type == TokenType::Operator && t.value == operator => Result3::Ok(t),
-        Some(t) if t.token_type == TokenType::Semicolon || t.token_type == TokenType::EOF => {
+        Some(t) if t.token_type == TokenType::NewLine || t.token_type == TokenType::EOF => {
             Result3::None
         }
         Some(t) => Result3::Err(t),
diff --git a/src/syntax/utils.rs b/src/syntax/utils.rs
index 0cb1f56..ecbe58d 100644
--- a/src/syntax/utils.rs
+++ b/src/syntax/utils.rs
@@ -6,7 +6,7 @@ use crate::{
 
 pub fn try_token_type(tokens: &Vec<Token>, pos: usize, token_type: TokenType) -> Result3<&Token> {
     match tokens.get(pos) {
         Some(t) if t.token_type == token_type => Result3::Ok(t),
-        Some(t) if t.token_type == TokenType::Semicolon || t.token_type == TokenType::EOF => {
+        Some(t) if t.token_type == TokenType::NewLine || t.token_type == TokenType::EOF => {
             Result3::None
         }
         Some(t) => Result3::Err(t),