504 lines
16 KiB
Rust
Executable File
504 lines
16 KiB
Rust
Executable File
mod scanner;
|
|
mod utils;
|
|
|
|
pub mod token;
|
|
|
|
use crate::error_handling::{ErrorContainer, ErrorLabel, MistiError};
|
|
use token::Token;
|
|
|
|
use self::token::TokenType;
|
|
|
|
type Chars = Vec<char>;
|
|
|
|
/// Represents the result of scanning a single token from the input
|
|
#[derive(Debug)]
|
|
pub enum LexResult {
|
|
/// A token was found. The first element is the token, and the
|
|
/// second element is the position in the input after the token.
|
|
///
|
|
/// E.g., given an input
|
|
///
|
|
/// "`identifier 55`"
|
|
///
|
|
/// scanning from a position `0`, the result would be
|
|
///
|
|
/// `Some(Token("identifier"), 10)`.
|
|
///
|
|
/// where:
|
|
/// - `Token("identifier")` is the token
|
|
/// - `10` is the position where the token ends, and from where the next token
|
|
/// should be scanned
|
|
Some(Token, usize),
|
|
/// Multiple tokens
|
|
Multiple(Vec<Token>, usize),
|
|
/// No token was found. This indicates that EOF has been reached.
|
|
///
|
|
/// Contains the last position, which should be the input lenght - 1
|
|
None(usize),
|
|
/// An error was found while scanning.
|
|
Err(ErrorContainer),
|
|
}
|
|
|
|
/// Scans and returns all the tokens in the input String
|
|
pub fn get_tokens(input: &String) -> Result<Vec<Token>, MistiError> {
|
|
let chars: Vec<char> = input.chars().into_iter().collect();
|
|
let mut results = Vec::new();
|
|
let mut current_pos: usize = 0;
|
|
let mut indentation_stack = vec![0];
|
|
// Used to emit INDENT & DEDENT tokens
|
|
let mut at_new_line = false;
|
|
|
|
while has_input(&chars, current_pos) {
|
|
match next_token(&chars, current_pos, &mut indentation_stack, at_new_line) {
|
|
LexResult::Some(token, next_pos) => {
|
|
// When a INDENT/DEDENT is returned it is because there is a NewLine.
|
|
// Remove that NewLine token and then insert the corresponding INDENT/DEDENT
|
|
if token.token_type == TokenType::INDENT || token.token_type == TokenType::DEDENT {
|
|
results.pop();
|
|
}
|
|
|
|
at_new_line = token.token_type == TokenType::NewLine;
|
|
|
|
results.push(token);
|
|
current_pos = next_pos;
|
|
}
|
|
LexResult::Multiple(tokens, next_pos) => {
|
|
// When a INDENT/DEDENT is returned it is because there is a NewLine.
|
|
// Remove that NewLine token and then insert the corresponding INDENT/DEDENT
|
|
match tokens.get(0) {
|
|
Some(t)
|
|
if t.token_type == TokenType::INDENT
|
|
|| t.token_type == TokenType::DEDENT =>
|
|
{
|
|
results.pop();
|
|
}
|
|
_ => {}
|
|
}
|
|
|
|
at_new_line = match tokens.last() {
|
|
Some(t) if t.token_type == TokenType::NewLine => true,
|
|
// This may be None if there are newlines followed by EOF.
|
|
_ => false,
|
|
};
|
|
|
|
results.extend(tokens);
|
|
current_pos = next_pos;
|
|
}
|
|
LexResult::None(next_pos) => {
|
|
current_pos = next_pos;
|
|
}
|
|
LexResult::Err(error_info) => {
|
|
return Err(error_info);
|
|
}
|
|
}
|
|
}
|
|
|
|
// emit DEDENT tokens for each entry left in the indentation_stack,
|
|
// except the first one (which is 0)
|
|
for _ in 0..indentation_stack.len() - 1 {
|
|
results.push(Token::new_dedent(current_pos));
|
|
}
|
|
|
|
// Push EOF
|
|
results.push(Token::new_eof(0));
|
|
Ok(results)
|
|
}
|
|
|
|
/// Scans a single token from `chars`, starting from `current_pos`
|
|
fn next_token(
|
|
chars: &Chars,
|
|
current_pos: usize,
|
|
indentation_stack: &mut Vec<usize>,
|
|
at_new_line: bool,
|
|
) -> LexResult {
|
|
let mut current_pos = current_pos;
|
|
|
|
if at_new_line {
|
|
return handle_indentation(chars, current_pos, indentation_stack);
|
|
} else if !at_new_line && peek(chars, current_pos) == ' ' {
|
|
// Consume whitespace
|
|
current_pos += 1;
|
|
while peek(chars, current_pos) == ' ' {
|
|
current_pos += 1;
|
|
}
|
|
}
|
|
|
|
// If EOF is reached return only the current position
|
|
if peek(chars, current_pos) == '\0' {
|
|
return LexResult::None(current_pos);
|
|
}
|
|
|
|
let next_char = peek(chars, current_pos);
|
|
|
|
// Scanners
|
|
None.or_else(|| scanner::number(next_char, chars, current_pos))
|
|
.or_else(|| scanner::identifier(next_char, chars, current_pos))
|
|
.or_else(|| scanner::datatype(next_char, chars, current_pos))
|
|
.or_else(|| scanner::string(next_char, chars, current_pos))
|
|
.or_else(|| scanner::new_comment(next_char, chars, current_pos))
|
|
.or_else(|| scanner::new_multiline_comment(next_char, chars, current_pos))
|
|
.or_else(|| scanner::operator(next_char, chars, current_pos))
|
|
.or_else(|| scanner::grouping_sign(next_char, chars, current_pos))
|
|
.or_else(|| scanner::new_line(next_char, chars, current_pos))
|
|
.or_else(|| {
|
|
if next_char == ',' {
|
|
Some(LexResult::Some(
|
|
Token::new(",".into(), current_pos, TokenType::Comma),
|
|
current_pos + 1,
|
|
))
|
|
} else {
|
|
None
|
|
}
|
|
})
|
|
.unwrap_or_else(|| {
|
|
let label = ErrorLabel {
|
|
message: String::from("This character is not allowed"),
|
|
start: current_pos,
|
|
end: current_pos + 1,
|
|
};
|
|
let error_container = ErrorContainer {
|
|
error_offset: current_pos,
|
|
error_code: 0x010001,
|
|
labels: vec![label],
|
|
note: None,
|
|
help: Some(String::from("Remove this character")),
|
|
};
|
|
|
|
LexResult::Err(error_container)
|
|
})
|
|
}
|
|
|
|
fn handle_indentation(
|
|
chars: &Chars,
|
|
current_pos: usize,
|
|
indentation_stack: &mut Vec<usize>,
|
|
) -> LexResult {
|
|
// Count the number of spaces
|
|
let mut spaces = 0;
|
|
let mut sub_pos = current_pos;
|
|
while peek(chars, sub_pos) == ' ' {
|
|
spaces += 1;
|
|
sub_pos += 1;
|
|
}
|
|
|
|
// Compare the number of spaces with the top of the stack
|
|
let top = indentation_stack.last().unwrap_or(&0);
|
|
|
|
if spaces > *top {
|
|
// Push the new indentation level
|
|
indentation_stack.push(spaces);
|
|
return LexResult::Some(Token::new_indent(current_pos), current_pos + spaces);
|
|
} else if spaces < *top {
|
|
// Emit a DEDENT token for each indentation level that is decreased
|
|
let mut dedent_tokens = Vec::<Token>::new();
|
|
|
|
while let Some(new_top) = indentation_stack.last() {
|
|
if spaces < *new_top {
|
|
indentation_stack.pop();
|
|
dedent_tokens.push(Token::new_dedent(current_pos));
|
|
} else if spaces == *new_top {
|
|
break;
|
|
} else {
|
|
// Illegal state: Indentation error
|
|
let econtaner = ErrorContainer {
|
|
error_code: 0,
|
|
error_offset: current_pos,
|
|
labels: vec![],
|
|
note: None,
|
|
help: None,
|
|
};
|
|
|
|
return LexResult::Err(econtaner);
|
|
}
|
|
}
|
|
|
|
return LexResult::Multiple(dedent_tokens, current_pos + spaces);
|
|
} else {
|
|
// Same indentation level
|
|
return next_token(chars, current_pos + spaces, indentation_stack, false);
|
|
}
|
|
}
|
|
|
|
/// Returns the char at `pos`
|
|
fn peek(input: &Chars, pos: usize) -> char {
|
|
let result = input.get(pos).unwrap_or(&'\0');
|
|
*result
|
|
}
|
|
|
|
/// Whether there is still input based on `current_pos`
|
|
fn has_input(input: &Chars, current_pos: usize) -> bool {
|
|
current_pos < input.len()
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
use token::TokenType;
|
|
|
|
/// Should return an EOF token if the input has no tokens
|
|
#[test]
|
|
fn should_emit_eof() {
|
|
let input = String::from("");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
// a EOF token
|
|
assert_eq!(1, tokens.len());
|
|
let first = tokens.get(0).unwrap();
|
|
assert_eq!(TokenType::EOF, first.token_type);
|
|
|
|
let input = String::from(" ");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
// a EOF token
|
|
assert_eq!(1, tokens.len());
|
|
let first = tokens.get(0).unwrap();
|
|
assert_eq!(TokenType::EOF, first.token_type);
|
|
|
|
let input = String::from(" ");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
// a EOF token
|
|
assert_eq!(1, tokens.len());
|
|
let first = tokens.get(0).unwrap();
|
|
assert_eq!(TokenType::EOF, first.token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn t() {
|
|
let input = String::from("126 ");
|
|
let chars: Vec<char> = input.chars().into_iter().collect();
|
|
let mut indentation_stack = Vec::<usize>::new();
|
|
|
|
assert_eq!(4, chars.len());
|
|
assert!(has_input(&chars, 0));
|
|
|
|
match next_token(&chars, 0, &mut indentation_stack, true) {
|
|
LexResult::Some(t, _) => {
|
|
assert_eq!("126", t.value)
|
|
}
|
|
_ => {
|
|
panic!()
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Should scan numbers
|
|
#[test]
|
|
fn number_test() {
|
|
let input = String::from("126 278.98 0.282398 1789e+1 239.3298e-103");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
let t1 = tokens.get(0).unwrap();
|
|
assert_eq!(TokenType::Int, t1.token_type);
|
|
assert_eq!("126", t1.value);
|
|
|
|
let t2 = tokens.get(1).unwrap();
|
|
assert_eq!(TokenType::Float, t2.token_type);
|
|
assert_eq!("278.98", t2.value);
|
|
|
|
let t3 = tokens.get(2).unwrap();
|
|
assert_eq!(TokenType::Float, t3.token_type);
|
|
assert_eq!("0.282398", t3.value);
|
|
|
|
assert_eq!("1789e+1", tokens.get(3).unwrap().value);
|
|
assert_eq!("239.3298e-103", tokens.get(4).unwrap().value);
|
|
assert_eq!(TokenType::EOF, tokens.get(5).unwrap().token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn grouping_sign_test() {
|
|
let input = String::from("( ) { } [ ]");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
let t = tokens.get(0).unwrap();
|
|
assert_eq!(TokenType::LeftParen, t.token_type);
|
|
assert_eq!("(", t.value);
|
|
|
|
let t = tokens.get(1).unwrap();
|
|
assert_eq!(TokenType::RightParen, t.token_type);
|
|
assert_eq!(")", t.value);
|
|
|
|
let t = tokens.get(2).unwrap();
|
|
assert_eq!(TokenType::LeftBrace, t.token_type);
|
|
assert_eq!("{", t.value);
|
|
|
|
let t = tokens.get(3).unwrap();
|
|
assert_eq!(TokenType::RightBrace, t.token_type);
|
|
assert_eq!("}", t.value);
|
|
|
|
let t = tokens.get(4).unwrap();
|
|
assert_eq!(TokenType::LeftBracket, t.token_type);
|
|
assert_eq!("[", t.value);
|
|
|
|
let t = tokens.get(5).unwrap();
|
|
assert_eq!(TokenType::RightBracket, t.token_type);
|
|
assert_eq!("]", t.value);
|
|
}
|
|
|
|
#[test]
|
|
fn should_scan_datatype() {
|
|
let input = String::from("Num");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Datatype, tokens[0].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_scan_new_line() {
|
|
let input = String::from("3\n22");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::NewLine, tokens[1].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_scan_multiple_new_lines() {
|
|
let input = String::from("3\n\n\n22");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::NewLine, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_scan_multiple_new_lines_with_whitespace_in_between() {
|
|
let input = String::from("3\n \n \n22");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::NewLine, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_emit_indent_token() {
|
|
let input = String::from("3\n \n 22");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_emit_indent_when_indentation_increases() {
|
|
let input = String::from("3\n \n 22\n 111");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[3].token_type);
|
|
assert_eq!(TokenType::Int, tokens[4].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn shouldnt_emit_indent_when_indentation_stays() {
|
|
let input = String::from("3\n \n 22\n 111");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
assert_eq!(TokenType::NewLine, tokens[3].token_type);
|
|
assert_eq!(TokenType::Int, tokens[4].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_emit_dedent() {
|
|
let input = String::from("3\n \n 22\n111");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[3].token_type);
|
|
assert_eq!(TokenType::Int, tokens[4].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_emit_multiple_dedents() {
|
|
let input = String::from("1\n 2\n 3\n 4\n5");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[3].token_type);
|
|
assert_eq!(TokenType::Int, tokens[4].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[5].token_type);
|
|
assert_eq!(TokenType::Int, tokens[6].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[7].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_emit_multiple_dedents_2() {
|
|
let input = String::from("1\n 2\n 3\n4");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[3].token_type);
|
|
assert_eq!(TokenType::Int, tokens[4].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[5].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[6].token_type);
|
|
assert_eq!(TokenType::Int, tokens[7].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn shouldnt_emit_trailing_newlines() {
|
|
let input = String::from("token\n");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(2, tokens.len());
|
|
assert_eq!(TokenType::Identifier, tokens[0].token_type);
|
|
assert_eq!(TokenType::EOF, tokens[1].token_type);
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod indentation_tests {
|
|
use super::*;
|
|
use token::TokenType;
|
|
|
|
#[test]
|
|
fn should_emit_dedents_on_eof() {
|
|
let input = String::from("1\n 2");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[3].token_type);
|
|
assert_eq!(TokenType::EOF, tokens[4].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_emit_dedents_on_eof_2() {
|
|
let input = String::from("1\n 2\n 3");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Int, tokens[0].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[1].token_type);
|
|
assert_eq!(TokenType::Int, tokens[2].token_type);
|
|
assert_eq!(TokenType::INDENT, tokens[3].token_type);
|
|
assert_eq!(TokenType::Int, tokens[4].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[5].token_type);
|
|
assert_eq!(TokenType::DEDENT, tokens[6].token_type);
|
|
assert_eq!(TokenType::EOF, tokens[7].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_lex_comments() {
|
|
let input = String::from("// ??");
|
|
let tokens = get_tokens(&input).unwrap();
|
|
|
|
assert_eq!(TokenType::Comment, tokens[0].token_type);
|
|
}
|
|
|
|
#[test]
|
|
fn should_emit_error_on_incorrect_indentation() {
|
|
let input = String::from("1\n 2\n 3");
|
|
let tokens = get_tokens(&input);
|
|
|
|
assert!(tokens.is_err());
|
|
}
|
|
}
|