thp/src/lexic/mod.rs

424 lines
14 KiB
Rust
Raw Normal View History

2022-11-28 23:33:34 +00:00
mod scanner;
mod utils;
2023-09-08 01:46:11 +00:00
pub mod token;
use crate::error_handling::{LexError, MistiError};
2023-09-08 01:46:11 +00:00
use token::Token;
2022-11-28 23:33:34 +00:00
use self::token::TokenType;
2022-11-28 23:33:34 +00:00
/// The source text decoded into individual `char`s, so that the lexer can
/// address it by position.
type Chars = Vec<char>;
/// Represents the result of scanning a single token from the input
2022-11-30 13:38:43 +00:00
pub enum LexResult {
/// A token was found. The first element is the token, and the
/// second element is the position in the input after the token.
///
/// E.g., given an input
///
/// "`identifier 55`"
///
/// scanning from a position `0`, the result would be
///
/// `Some(Token("identifier"), 10)`.
///
/// where:
/// - `Token("identifier")` is the token
/// - `10` is the position where the token ends, and from where the next token
/// should be scanned
2022-11-30 13:38:43 +00:00
Some(Token, usize),
/// Multiple tokens
Multiple(Vec<Token>, usize),
/// No token was found. This indicates that EOF has been reached.
///
/// Contains the last position, which should be the input lenght - 1
2022-11-30 13:38:43 +00:00
None(usize),
/// An error was found while scanning.
2023-01-05 17:48:34 +00:00
Err(LexError),
2022-11-30 13:38:43 +00:00
}
2022-11-28 23:33:34 +00:00
/// Scans and returns all the tokens in the input String
pub fn get_tokens(input: &String) -> Result<Vec<Token>, MistiError> {
2022-11-28 23:33:34 +00:00
let chars: Vec<char> = input.chars().into_iter().collect();
let mut results = Vec::new();
let mut current_pos: usize = 0;
2023-09-10 16:39:52 +00:00
let mut indentation_stack = vec![0];
// Used to emit INDENT & DEDENT tokens
let mut at_new_line = false;
2022-11-28 23:33:34 +00:00
while has_input(&chars, current_pos) {
match next_token(&chars, current_pos, &mut indentation_stack, at_new_line) {
2022-11-30 13:38:43 +00:00
LexResult::Some(token, next_pos) => {
2023-09-10 16:39:52 +00:00
at_new_line = token.token_type == TokenType::NewLine;
2022-11-29 00:16:55 +00:00
results.push(token);
current_pos = next_pos;
}
LexResult::Multiple(tokens, next_pos) => {
at_new_line = tokens.last().unwrap().token_type == TokenType::NewLine;
results.extend(tokens);
current_pos = next_pos;
}
2022-11-30 13:38:43 +00:00
LexResult::None(next_pos) => {
2022-11-29 00:16:55 +00:00
current_pos = next_pos;
}
LexResult::Err(error_info) => {
return Err(MistiError::Lex(error_info));
}
2022-11-28 23:33:34 +00:00
}
}
2023-09-08 01:46:11 +00:00
results.push(Token::new_semicolon(0));
// TODO: emit DEDENT tokens for each entry in indentation_stack
2023-09-08 01:46:11 +00:00
results.push(Token::new_eof(0));
2022-11-29 00:16:55 +00:00
Ok(results)
2022-11-28 23:33:34 +00:00
}
/// Scans a single token from `chars`, starting from `current_pos`
2023-09-10 16:39:52 +00:00
fn next_token(
chars: &Chars,
current_pos: usize,
indentation_stack: &mut Vec<usize>,
at_new_line: bool,
) -> LexResult {
let mut current_pos = current_pos;
if at_new_line {
return handle_indentation(chars, current_pos, indentation_stack);
}
else if !at_new_line && peek(chars, current_pos) == ' ' {
// Consume whitespace
current_pos += 1;
while peek(chars, current_pos) == ' ' {
2023-09-10 16:39:52 +00:00
current_pos += 1;
}
2022-11-29 00:16:55 +00:00
}
2023-09-10 16:39:52 +00:00
// If EOF is reached return only the current position
if peek(chars, current_pos) == '\0' {
return LexResult::None(current_pos);
}
2023-09-10 16:39:52 +00:00
let next_char = peek(chars, current_pos);
2022-11-28 23:33:34 +00:00
2022-12-01 13:33:48 +00:00
// Scanners
None.or_else(|| scanner::number(next_char, chars, current_pos))
.or_else(|| scanner::identifier(next_char, chars, current_pos))
.or_else(|| scanner::datatype(next_char, chars, current_pos))
.or_else(|| scanner::string(next_char, chars, current_pos))
2023-04-05 15:31:12 +00:00
.or_else(|| scanner::new_comment(next_char, chars, current_pos))
.or_else(|| scanner::operator(next_char, chars, current_pos))
.or_else(|| scanner::grouping_sign(next_char, chars, current_pos))
2023-02-14 20:22:29 +00:00
.or_else(|| scanner::new_line(next_char, chars, current_pos))
2022-11-30 13:38:43 +00:00
.unwrap_or_else(|| {
2023-01-05 17:48:34 +00:00
let error = LexError {
position: current_pos,
2023-03-03 14:23:08 +00:00
reason: format!(
"Unrecognized character `{}` (escaped: `{}`)",
next_char,
2023-03-03 14:23:08 +00:00
next_char.escape_default().to_string(),
),
2023-01-05 17:48:34 +00:00
};
LexResult::Err(error)
2022-11-30 13:38:43 +00:00
})
2022-11-28 23:33:34 +00:00
}
2023-09-10 16:39:52 +00:00
fn handle_indentation(
chars: &Chars,
current_pos: usize,
indentation_stack: &mut Vec<usize>,
) -> LexResult {
// Count the number of spaces
let mut spaces = 0;
let mut sub_pos = current_pos;
while peek(chars, sub_pos) == ' ' {
spaces += 1;
sub_pos += 1;
}
// TODO: should emit a DEDENT for every single entry decreased in the stack
2023-09-10 16:39:52 +00:00
// Compare the number of spaces with the top of the stack
let top = indentation_stack.last().unwrap_or(&0);
2023-09-10 16:39:52 +00:00
if spaces > *top {
// Push the new indentation level
indentation_stack.push(spaces);
return LexResult::Some(Token::new_indent(current_pos), current_pos + spaces);
} else if spaces < *top {
// Emit a DEDENT token for each indentation level that is decreased
let mut dedent_tokens = Vec::<Token>::new();
while let Some(new_top) = indentation_stack.last() {
if spaces < *new_top {
indentation_stack.pop();
dedent_tokens.push(Token::new_dedent(current_pos));
}
else if spaces == *new_top {
break;
}
else {
// Illegal state: Indentation error
let error = LexError {
position: current_pos,
reason: format!(
"Indentation error: expected {} spaces, found {}",
new_top,
spaces
),
};
return LexResult::Err(error);
}
}
return LexResult::Multiple(
dedent_tokens,
current_pos + spaces
);
2023-09-10 16:39:52 +00:00
} else {
// Same indentation level
return next_token(chars, current_pos + spaces, indentation_stack, false);
2023-09-10 16:39:52 +00:00
}
}
/// Returns the char at `pos`
2022-11-28 23:33:34 +00:00
fn peek(input: &Chars, pos: usize) -> char {
let result = input.get(pos).unwrap_or(&'\0');
*result
}
/// Whether there is still input based on `current_pos`
2022-11-28 23:33:34 +00:00
fn has_input(input: &Chars, current_pos: usize) -> bool {
2022-11-29 00:16:55 +00:00
current_pos < input.len()
2022-11-28 23:33:34 +00:00
}
#[cfg(test)]
mod tests {
    use super::*;
    use token::TokenType;

    /// Should return an EOF token if the input has no tokens
    #[test]
    fn should_emit_eof() {
        let input = String::from("");
        let tokens = get_tokens(&input).unwrap();
        // 1 semicolon and 1 EOF token
        assert_eq!(2, tokens.len());
        let first = tokens.get(1).unwrap();
        assert_eq!(TokenType::EOF, first.token_type);

        let input = String::from(" ");
        let tokens = get_tokens(&input).unwrap();
        // 1 semicolon and 1 EOF token
        assert_eq!(2, tokens.len());
        let first = tokens.get(1).unwrap();
        assert_eq!(TokenType::EOF, first.token_type);

        // A longer run of spaces behaves the same as a single one
        let input = String::from("    ");
        let tokens = get_tokens(&input).unwrap();
        // 1 semicolon and 1 EOF token
        assert_eq!(2, tokens.len());
        let first = tokens.get(1).unwrap();
        assert_eq!(TokenType::EOF, first.token_type);
    }

    /// Scanning at a new line with an empty indentation stack should still
    /// yield the first token
    #[test]
    fn t() {
        let input = String::from("126 ");
        let chars: Vec<char> = input.chars().into_iter().collect();
        let mut indentation_stack = Vec::<usize>::new();

        assert_eq!(4, chars.len());
        assert!(has_input(&chars, 0));

        match next_token(&chars, 0, &mut indentation_stack, true) {
            LexResult::Some(t, _) => {
                assert_eq!("126", t.value)
            }
            _ => {
                panic!()
            }
        }
    }

    /// Should scan numbers
    #[test]
    fn number_test() {
        let input = String::from("126 278.98 0.282398 1789e+1 239.3298e-103");
        let tokens = get_tokens(&input).unwrap();

        let t1 = tokens.get(0).unwrap();
        assert_eq!(TokenType::Number, t1.token_type);
        assert_eq!("126", t1.value);

        let t2 = tokens.get(1).unwrap();
        assert_eq!(TokenType::Number, t2.token_type);
        assert_eq!("278.98", t2.value);

        let t3 = tokens.get(2).unwrap();
        assert_eq!(TokenType::Number, t3.token_type);
        assert_eq!("0.282398", t3.value);

        assert_eq!("1789e+1", tokens.get(3).unwrap().value);
        assert_eq!("239.3298e-103", tokens.get(4).unwrap().value);
        assert_eq!(TokenType::NewLine, tokens.get(5).unwrap().token_type);
        assert_eq!(TokenType::EOF, tokens.get(6).unwrap().token_type);
    }

    /// Should scan every grouping sign
    #[test]
    fn grouping_sign_test() {
        let input = String::from("( ) { } [ ]");
        let tokens = get_tokens(&input).unwrap();

        let t = tokens.get(0).unwrap();
        assert_eq!(TokenType::LeftParen, t.token_type);
        assert_eq!("(", t.value);

        let t = tokens.get(1).unwrap();
        assert_eq!(TokenType::RightParen, t.token_type);
        assert_eq!(")", t.value);

        let t = tokens.get(2).unwrap();
        assert_eq!(TokenType::LeftBrace, t.token_type);
        assert_eq!("{", t.value);

        let t = tokens.get(3).unwrap();
        assert_eq!(TokenType::RightBrace, t.token_type);
        assert_eq!("}", t.value);

        let t = tokens.get(4).unwrap();
        assert_eq!(TokenType::LeftBracket, t.token_type);
        assert_eq!("[", t.value);

        let t = tokens.get(5).unwrap();
        assert_eq!(TokenType::RightBracket, t.token_type);
        assert_eq!("]", t.value);
    }

    #[test]
    fn should_scan_datatype() {
        let input = String::from("Num");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::Datatype, tokens[0].token_type);
    }

    #[test]
    fn should_scan_new_line() {
        let input = String::from("3\n22");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::NewLine, tokens[1].token_type);
    }

    #[test]
    fn should_scan_multiple_new_lines() {
        let input = String::from("3\n\n\n22");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::Number, tokens[2].token_type);
    }

    #[test]
    fn should_scan_multiple_new_lines_with_whitespace_in_between() {
        let input = String::from("3\n \n \n22");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::Number, tokens[2].token_type);
    }

    #[test]
    fn should_emit_indent_token() {
        let input = String::from("3\n \n 22");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::Number, tokens[0].token_type);
        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::INDENT, tokens[2].token_type);
        assert_eq!(TokenType::Number, tokens[3].token_type);
    }

    #[test]
    fn should_emit_indent_when_indentation_increases() {
        // The last line is indented deeper than the previous one
        let input = String::from("3\n \n 22\n  111");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::Number, tokens[0].token_type);
        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::INDENT, tokens[2].token_type);
        assert_eq!(TokenType::Number, tokens[3].token_type);
        assert_eq!(TokenType::NewLine, tokens[4].token_type);
        assert_eq!(TokenType::INDENT, tokens[5].token_type);
        assert_eq!(TokenType::Number, tokens[6].token_type);
    }

    #[test]
    fn shouldnt_emit_indent_when_indentation_stays() {
        let input = String::from("3\n \n 22\n 111");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::Number, tokens[0].token_type);
        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::INDENT, tokens[2].token_type);
        assert_eq!(TokenType::Number, tokens[3].token_type);
        assert_eq!(TokenType::NewLine, tokens[4].token_type);
        assert_eq!(TokenType::Number, tokens[5].token_type);
    }

    #[test]
    fn should_emit_dedent() {
        let input = String::from("3\n \n 22\n111");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::Number, tokens[0].token_type);
        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::INDENT, tokens[2].token_type);
        assert_eq!(TokenType::Number, tokens[3].token_type);
        assert_eq!(TokenType::NewLine, tokens[4].token_type);
        assert_eq!(TokenType::DEDENT, tokens[5].token_type);
        assert_eq!(TokenType::Number, tokens[6].token_type);
    }

    #[test]
    fn should_emit_multiple_dedents() {
        // Indentation goes 0 -> 2 -> 4 -> 2 -> 0, closing one level at a time
        let input = String::from("1\n  2\n    3\n  4\n5");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::Number, tokens[0].token_type);
        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::INDENT, tokens[2].token_type);
        assert_eq!(TokenType::Number, tokens[3].token_type);
        assert_eq!(TokenType::NewLine, tokens[4].token_type);
        assert_eq!(TokenType::INDENT, tokens[5].token_type);
        assert_eq!(TokenType::Number, tokens[6].token_type);
        assert_eq!(TokenType::NewLine, tokens[7].token_type);
        assert_eq!(TokenType::DEDENT, tokens[8].token_type);
        assert_eq!(TokenType::Number, tokens[9].token_type);
        assert_eq!(TokenType::NewLine, tokens[10].token_type);
        assert_eq!(TokenType::DEDENT, tokens[11].token_type);
    }

    #[test]
    fn should_emit_multiple_dedents_2() {
        // Indentation goes 0 -> 2 -> 4 -> 0, closing two levels at once
        let input = String::from("1\n  2\n    3\n4");
        let tokens = get_tokens(&input).unwrap();

        assert_eq!(TokenType::Number, tokens[0].token_type);
        assert_eq!(TokenType::NewLine, tokens[1].token_type);
        assert_eq!(TokenType::INDENT, tokens[2].token_type);
        assert_eq!(TokenType::Number, tokens[3].token_type);
        assert_eq!(TokenType::NewLine, tokens[4].token_type);
        assert_eq!(TokenType::INDENT, tokens[5].token_type);
        assert_eq!(TokenType::Number, tokens[6].token_type);
        assert_eq!(TokenType::NewLine, tokens[7].token_type);
        assert_eq!(TokenType::DEDENT, tokens[8].token_type);
        assert_eq!(TokenType::DEDENT, tokens[9].token_type);
    }
}