thp/compiler/src/lexic/mod.rs

mod scanner;
mod utils;

use super::token::{self, Token};
use crate::error_handling::{LexError, MistiError};

/// The lexer's view of the input: the source text collected into a vector of `char`s
type Chars = Vec<char>;

/// Represents the result of scanning a single token from the input
pub enum LexResult {
    /// A token was found. The first element is the token, and the
    /// second element is the position in the input after the token.
    ///
    /// E.g., given an input
    ///
    /// "`identifier 55`"
    ///
    /// scanning from a position `0`, the result would be
    ///
    /// `Some(Token("identifier"), 10)`.
    ///
    /// where:
    /// - `Token("identifier")` is the token
    /// - `10` is the position where the token ends, and from where the next token
    /// should be scanned
    Some(Token, usize),
    /// No token was found. This indicates that EOF has been reached.
    ///
    /// Contains the position at which the end of the input was detected. This
    /// can be the input length itself, e.g. when trailing spaces were skipped
    /// before reaching the end.
    None(usize),
    /// An error was found while scanning.
    Err(LexError),
}

/// Tokenizes the whole `input` and returns every token found, in order.
///
/// Scanning stops at the first lexical error, which is returned wrapped in
/// `MistiError::Lex`. On success the returned vector always ends with the
/// tokens built by `token::new_semicolon(0)` and `token::new_eof(0)`.
pub fn get_tokens(input: &String) -> Result<Vec<Token>, MistiError> {
    let chars: Vec<char> = input.chars().collect();
    let mut tokens = Vec::new();
    let mut position: usize = 0;

    while has_input(&chars, position) {
        // Every successful outcome yields the position to continue from
        position = match next_token(&chars, position) {
            LexResult::Some(token, after_token) => {
                tokens.push(token);
                after_token
            }
            LexResult::None(end_position) => end_position,
            LexResult::Err(lex_error) => return Err(MistiError::Lex(lex_error)),
        };
    }

    // The token stream is always terminated by a Semicolon followed by EOF
    tokens.push(token::new_semicolon(0));
    tokens.push(token::new_eof(0));
    Ok(tokens)
}

/// Scans a single token from `chars`, starting from `current_pos`.
///
/// Spaces (`' '`) found at `current_pos` are skipped first. The result is:
/// - `LexResult::None(pos)` when the end of the input is reached, where `pos`
///   is the position reached after skipping spaces,
/// - the result of the first scanner that recognizes the character, or
/// - `LexResult::Err` when no scanner recognizes the character.
fn next_token(chars: &Chars, current_pos: usize) -> LexResult {
    // Skip spaces with a loop rather than recursing once per space, so that a
    // long run of spaces cannot overflow the call stack.
    let mut current_pos = current_pos;
    while peek(chars, current_pos) == ' ' {
        current_pos += 1;
    }

    // EOF is decided by position, not by the `'\0'` sentinel that `peek`
    // returns: a literal NUL inside the input is not EOF. Reporting it as EOF
    // would hand the same position back to the caller, which would then never
    // advance past it.
    if current_pos >= chars.len() {
        return LexResult::None(current_pos);
    }

    let next_char = peek(chars, current_pos);

    // Scanners, tried in order. The first one that returns a result wins.
    None.or_else(|| scanner::number(next_char, chars, current_pos))
        .or_else(|| scanner::identifier(next_char, chars, current_pos))
        .or_else(|| scanner::datatype(next_char, chars, current_pos))
        .or_else(|| scanner::string(next_char, chars, current_pos))
        .or_else(|| scanner::operator(next_char, chars, current_pos))
        .or_else(|| scanner::grouping_sign(next_char, chars, current_pos))
        .or_else(|| scanner::new_line(next_char, chars, current_pos))
        .unwrap_or_else(|| {
            // No scanner recognized the character: report it both raw and
            // escaped, so invisible characters are still readable.
            let error = LexError {
                position: current_pos,
                reason: format!(
                    "Unrecognized character `{}` (escaped: `{}`)",
                    next_char,
                    next_char.escape_default(),
                ),
            };
            LexResult::Err(error)
        })
}

/// Returns the char at `pos`, or `'\0'` when `pos` is outside of `input`
fn peek(input: &Chars, pos: usize) -> char {
    match input.get(pos) {
        Some(found) => *found,
        None => '\0',
    }
}

/// Tells whether `current_pos` still points at a char inside `input`
fn has_input(input: &Chars, current_pos: usize) -> bool {
    input.get(current_pos).is_some()
}

#[cfg(test)]
mod tests {
    use super::*;
    use token::TokenType;

    /// An input without tokens (empty, or made only of spaces) should produce
    /// exactly two tokens, the second one being EOF
    #[test]
    fn test1() {
        for source in ["", "  ", "    "] {
            let input = String::from(source);
            let tokens = get_tokens(&input).unwrap();
            // 1 semicolon and 1 EOF token
            assert_eq!(2, tokens.len());
            let last = tokens.get(1).unwrap();
            assert_eq!(TokenType::EOF, last.token_type);
        }
    }

    /// Scanning `"126 "` from position 0 should yield a token whose value is `126`
    #[test]
    fn t() {
        let input = String::from("126 ");
        let chars: Vec<char> = input.chars().collect();

        assert_eq!(4, chars.len());
        assert!(has_input(&chars, 0));

        // Anything other than a found token is a failure
        if let LexResult::Some(token, _) = next_token(&chars, 0) {
            assert_eq!("126", token.value)
        } else {
            panic!()
        }
    }

    /// Should scan integers, decimals and numbers in scientific notation,
    /// followed by the trailing Semicolon and EOF tokens
    #[test]
    fn number_test() {
        let input = String::from("126 278.98 0.282398 1789e+1 239.3298e-103");
        let tokens = get_tokens(&input).unwrap();

        // The first three tokens are checked for both type and value
        let typed_values = ["126", "278.98", "0.282398"];
        for (index, expected) in typed_values.iter().enumerate() {
            let token = tokens.get(index).unwrap();
            assert_eq!(TokenType::Number, token.token_type);
            assert_eq!(*expected, token.value);
        }

        // The scientific notation tokens are only checked for their value
        let fourth = tokens.get(3).unwrap();
        assert_eq!("1789e+1", fourth.value);
        let fifth = tokens.get(4).unwrap();
        assert_eq!("239.3298e-103", fifth.value);

        // The stream is closed by a Semicolon and an EOF
        let semicolon = tokens.get(5).unwrap();
        assert_eq!(TokenType::Semicolon, semicolon.token_type);
        let eof = tokens.get(6).unwrap();
        assert_eq!(TokenType::EOF, eof.token_type);
    }

    /// Should scan parentheses, braces and brackets, each with its own token
    /// type and its literal value
    #[test]
    fn grouping_sign_test() {
        let input = String::from("( ) { } [ ]");
        let tokens = get_tokens(&input).unwrap();

        // Expected token type and value, in the order they appear in the input
        let expected = [
            (TokenType::LeftParen, "("),
            (TokenType::RightParen, ")"),
            (TokenType::LeftBrace, "{"),
            (TokenType::RightBrace, "}"),
            (TokenType::LeftBracket, "["),
            (TokenType::RightBracket, "]"),
        ];

        for (index, (token_type, value)) in expected.iter().enumerate() {
            let token = tokens.get(index).unwrap();
            assert_eq!(*token_type, token.token_type);
            assert_eq!(*value, token.value);
        }
    }

    /// The first token scanned from `Num` should be a Datatype
    #[test]
    fn should_scan_datatype() {
        let tokens = get_tokens(&String::from("Num")).unwrap();

        let first = &tokens[0];
        assert_eq!(TokenType::Datatype, first.token_type);
    }

    /// A new line between two numbers should be scanned as a Semicolon
    #[test]
    fn should_scan_new_line() {
        let tokens = get_tokens(&String::from("3\n22")).unwrap();

        let second = &tokens[1];
        assert_eq!(TokenType::Semicolon, second.token_type);
    }

    /// Several consecutive new lines should produce a single Semicolon,
    /// directly followed by the next token
    #[test]
    fn should_scan_multiple_new_lines() {
        let tokens = get_tokens(&String::from("3\n\n\n22")).unwrap();

        let (second, third) = (&tokens[1], &tokens[2]);
        assert_eq!(TokenType::Semicolon, second.token_type);
        assert_eq!(TokenType::Number, third.token_type);
    }

    /// New lines separated only by spaces should still produce a single
    /// Semicolon, directly followed by the next token
    #[test]
    fn should_scan_multiple_new_lines_with_whitespace_in_between() {
        let tokens = get_tokens(&String::from("3\n \n   \n22")).unwrap();

        let (second, third) = (&tokens[1], &tokens[2]);
        assert_eq!(TokenType::Semicolon, second.token_type);
        assert_eq!(TokenType::Number, third.token_type);
    }
}
Scan operators 2022-11-28 23:33:34 +00:00			`mod scanner;`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`mod utils;`
Add functions for error handling (merge) 2023-01-24 15:01:09 +00:00
Scan operators 2022-11-28 23:33:34 +00:00			`use super::token::{self, Token};`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`use crate::error_handling::{LexError, MistiError};`
Scan operators 2022-11-28 23:33:34 +00:00
			`type Chars = Vec<char>;`

Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			`/// Represents the result of scanning a single token from the input`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`pub enum LexResult {`
Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			`/// A token was found. The first element is the token, and the`
			`/// second element is the position in the input after the token.`
			`///`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`/// E.g., given an input`
Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			`///`
			/// "`identifier 55`"
			`///`
			/// scanning from a position `0`, the result would be
			`///`
			/// `Some(Token("identifier"), 10)`.
			`///`
			`/// where:`
			/// - `Token("identifier")` is the token
			/// - `10` is the position where the token ends, and from where the next token
			`/// should be scanned`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`Some(Token, usize),`
Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			`/// No token was found. This indicates that EOF has been reached.`
			`///`
			`/// Contains the last position, which should be the input lenght - 1`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`None(usize),`
Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			`/// An error was found while scanning.`
Fix bugs and improve error messages 2023-01-05 17:48:34 +00:00			`Err(LexError),`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`}`

Scan operators 2022-11-28 23:33:34 +00:00			`/// Scans and returns all the tokens in the input String`
Add functions for error handling (merge) 2023-01-24 15:01:09 +00:00			`pub fn get_tokens(input: &String) -> Result<Vec<Token>, MistiError> {`
Scan operators 2022-11-28 23:33:34 +00:00			`let chars: Vec<char> = input.chars().into_iter().collect();`
			`let mut results = Vec::new();`
			`let mut current_pos: usize = 0;`

			`while has_input(&chars, current_pos) {`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`match next_token(&chars, current_pos) {`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`LexResult::Some(token, next_pos) => {`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`results.push(token);`
			`current_pos = next_pos;`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`}`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`LexResult::None(next_pos) => {`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`current_pos = next_pos;`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`}`
Add functions for error handling (merge) 2023-01-24 15:01:09 +00:00			`LexResult::Err(error_info) => {`
			`return Err(MistiError::Lex(error_info));`
			`}`
Scan operators 2022-11-28 23:33:34 +00:00			`}`
			`}`

v0.0.3 - token stream always ends with Semicolon & EOF 2023-02-14 20:32:45 +00:00			`results.push(token::new_semicolon(0));`
Scan operators 2022-11-28 23:33:34 +00:00			`results.push(token::new_eof(0));`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`Ok(results)`
Scan operators 2022-11-28 23:33:34 +00:00			`}`

Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			/// Scans a single token from `chars`, starting from `current_pos`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`fn next_token(chars: &Chars, current_pos: usize) -> LexResult {`
Scan operators 2022-11-28 23:33:34 +00:00			`let next_char = peek(chars, current_pos);`

Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`// If EOF is reached return nothing but the current position`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`if next_char == '\0' {`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`return LexResult::None(current_pos);`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`}`

Simple ASI 2023-02-14 20:22:29 +00:00			`// Handle whitespace recursively.`
Scan operators 2022-11-28 23:33:34 +00:00			`if next_char == ' ' {`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`return next_token(chars, current_pos + 1);`
Scan operators 2022-11-28 23:33:34 +00:00			`}`

Scan identifiers 2022-12-01 13:33:48 +00:00			`// Scanners`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`None.or_else(\|\| scanner::number(next_char, chars, current_pos))`
Scan strings and escape characters inside string 2022-12-01 17:53:14 +00:00			`.or_else(\|\| scanner::identifier(next_char, chars, current_pos))`
Scan datatypes. Parse datatype annotations 2023-02-15 21:17:50 +00:00			`.or_else(\|\| scanner::datatype(next_char, chars, current_pos))`
Scan strings and escape characters inside string 2022-12-01 17:53:14 +00:00			`.or_else(\|\| scanner::string(next_char, chars, current_pos))`
			`.or_else(\|\| scanner::operator(next_char, chars, current_pos))`
			`.or_else(\|\| scanner::grouping_sign(next_char, chars, current_pos))`
Simple ASI 2023-02-14 20:22:29 +00:00			`.or_else(\|\| scanner::new_line(next_char, chars, current_pos))`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`.unwrap_or_else(\|\| {`
Fix bugs and improve error messages 2023-01-05 17:48:34 +00:00			`let error = LexError {`
			`position: current_pos,`
Error messages 2023-03-03 14:23:08 +00:00			`reason: format!(`
			"Unrecognized character `{}` (escaped: `{}`)",
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`next_char,`
Error messages 2023-03-03 14:23:08 +00:00			`next_char.escape_default().to_string(),`
			`),`
Fix bugs and improve error messages 2023-01-05 17:48:34 +00:00			`};`
			`LexResult::Err(error)`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`})`
Scan operators 2022-11-28 23:33:34 +00:00			`}`

Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			/// Returns the char at `pos`
Scan operators 2022-11-28 23:33:34 +00:00			`fn peek(input: &Chars, pos: usize) -> char {`
			`let result = input.get(pos).unwrap_or(&'\0');`
			`*result`
			`}`

Get datatype from an identifier in the symbol table. Improve code documentation 2023-02-11 23:13:05 +00:00			/// Whether there is still input based on `current_pos`
Scan operators 2022-11-28 23:33:34 +00:00			`fn has_input(input: &Chars, current_pos: usize) -> bool {`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`current_pos < input.len()`
Scan operators 2022-11-28 23:33:34 +00:00			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`use token::TokenType;`
Scan operators 2022-11-28 23:33:34 +00:00
			`/// Should return an EOF token if the input has no tokens`
			`#[test]`
			`fn test1() {`
			`let input = String::from("");`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`let tokens = get_tokens(&input).unwrap();`
v0.0.3 - token stream always ends with Semicolon & EOF 2023-02-14 20:32:45 +00:00			`// 1 semicolon and 1 EOF token`
			`assert_eq!(2, tokens.len());`
			`let first = tokens.get(1).unwrap();`
Scan operators 2022-11-28 23:33:34 +00:00			`assert_eq!(TokenType::EOF, first.token_type);`

			`let input = String::from(" ");`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`let tokens = get_tokens(&input).unwrap();`
v0.0.3 - token stream always ends with Semicolon & EOF 2023-02-14 20:32:45 +00:00			`// 1 semicolon and 1 EOF token`
			`assert_eq!(2, tokens.len());`
			`let first = tokens.get(1).unwrap();`
Scan operators 2022-11-28 23:33:34 +00:00			`assert_eq!(TokenType::EOF, first.token_type);`

Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`let input = String::from(" ");`
			`let tokens = get_tokens(&input).unwrap();`
v0.0.3 - token stream always ends with Semicolon & EOF 2023-02-14 20:32:45 +00:00			`// 1 semicolon and 1 EOF token`
			`assert_eq!(2, tokens.len());`
			`let first = tokens.get(1).unwrap();`
Scan operators 2022-11-28 23:33:34 +00:00			`assert_eq!(TokenType::EOF, first.token_type);`
			`}`

Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`#[test]`
			`fn t() {`
			`let input = String::from("126 ");`
			`let chars: Vec<char> = input.chars().into_iter().collect();`

			`assert_eq!(4, chars.len());`
			`assert!(has_input(&chars, 0));`

Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`match next_token(&chars, 0) {`
			`LexResult::Some(t, _) => {`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`assert_eq!("126", t.value)`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00			`}`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`_ => {`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`panic!()`
			`}`
			`}`
			`}`

Scan operators 2022-11-28 23:33:34 +00:00			`/// Should scan numbers`
			`#[test]`
			`fn number_test() {`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`let input = String::from("126 278.98 0.282398 1789e+1 239.3298e-103");`
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`let tokens = get_tokens(&input).unwrap();`

			`let t1 = tokens.get(0).unwrap();`
			`assert_eq!(TokenType::Number, t1.token_type);`
			`assert_eq!("126", t1.value);`

			`let t2 = tokens.get(1).unwrap();`
			`assert_eq!(TokenType::Number, t2.token_type);`
			`assert_eq!("278.98", t2.value);`
Scan operators 2022-11-28 23:33:34 +00:00
Fix errors in lexical analyzer 2022-11-29 00:16:55 +00:00			`let t3 = tokens.get(2).unwrap();`
			`assert_eq!(TokenType::Number, t3.token_type);`
			`assert_eq!("0.282398", t3.value);`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`assert_eq!("1789e+1", tokens.get(3).unwrap().value);`
Scan operators 2022-11-28 23:33:34 +00:00			`assert_eq!("239.3298e-103", tokens.get(4).unwrap().value);`
v0.0.3 - token stream always ends with Semicolon & EOF 2023-02-14 20:32:45 +00:00			`assert_eq!(TokenType::Semicolon, tokens.get(5).unwrap().token_type);`
			`assert_eq!(TokenType::EOF, tokens.get(6).unwrap().token_type);`
Refactor and scan grouping signs 2022-11-30 13:38:43 +00:00			`}`

			`#[test]`
			`fn grouping_sign_test() {`
			`let input = String::from("( ) { } [ ]");`
			`let tokens = get_tokens(&input).unwrap();`

			`let t = tokens.get(0).unwrap();`
			`assert_eq!(TokenType::LeftParen, t.token_type);`
			`assert_eq!("(", t.value);`

			`let t = tokens.get(1).unwrap();`
			`assert_eq!(TokenType::RightParen, t.token_type);`
			`assert_eq!(")", t.value);`

			`let t = tokens.get(2).unwrap();`
			`assert_eq!(TokenType::LeftBrace, t.token_type);`
			`assert_eq!("{", t.value);`

			`let t = tokens.get(3).unwrap();`
			`assert_eq!(TokenType::RightBrace, t.token_type);`
			`assert_eq!("}", t.value);`

			`let t = tokens.get(4).unwrap();`
			`assert_eq!(TokenType::LeftBracket, t.token_type);`
			`assert_eq!("[", t.value);`

			`let t = tokens.get(5).unwrap();`
			`assert_eq!(TokenType::RightBracket, t.token_type);`
			`assert_eq!("]", t.value);`
Scan operators 2022-11-28 23:33:34 +00:00			`}`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00
Scan datatypes. Parse datatype annotations 2023-02-15 21:17:50 +00:00			`#[test]`
			`fn should_scan_datatype() {`
			`let input = String::from("Num");`
			`let tokens = get_tokens(&input).unwrap();`

			`assert_eq!(TokenType::Datatype, tokens[0].token_type);`
			`}`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00
Simple ASI 2023-02-14 20:22:29 +00:00			`#[test]`
			`fn should_scan_new_line() {`
			`let input = String::from("3\n22");`
			`let tokens = get_tokens(&input).unwrap();`

			`assert_eq!(TokenType::Semicolon, tokens[1].token_type);`
			`}`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00
Simple ASI 2023-02-14 20:22:29 +00:00			`#[test]`
			`fn should_scan_multiple_new_lines() {`
			`let input = String::from("3\n\n\n22");`
			`let tokens = get_tokens(&input).unwrap();`

			`assert_eq!(TokenType::Semicolon, tokens[1].token_type);`
			`assert_eq!(TokenType::Number, tokens[2].token_type);`
			`}`
Add minimal error reporting for syntax analysis 2023-03-14 21:10:43 +00:00
Simple ASI 2023-02-14 20:22:29 +00:00			`#[test]`
			`fn should_scan_multiple_new_lines_with_whitespace_in_between() {`
			`let input = String::from("3\n \n \n22");`
			`let tokens = get_tokens(&input).unwrap();`

			`assert_eq!(TokenType::Semicolon, tokens[1].token_type);`
			`assert_eq!(TokenType::Number, tokens[2].token_type);`
			`}`
Scan operators 2022-11-28 23:33:34 +00:00			`}`