Scan identifiers

This commit is contained in:
Araozu 2022-12-01 08:33:48 -05:00
parent 11ca7edb44
commit e580167682
6 changed files with 208 additions and 61 deletions

View File

@ -49,7 +49,7 @@ fn next_token(chars: &Chars, current_pos: usize) -> LexResult {
return next_token(chars, current_pos + 1)
}
// Test number
// Scanners
None
.or_else(|| {
scanner::number(next_char, chars, current_pos)
@ -60,6 +60,9 @@ fn next_token(chars: &Chars, current_pos: usize) -> LexResult {
.or_else(|| {
scanner::grouping_sign(next_char, chars, current_pos)
})
.or_else(|| {
scanner::identifier(next_char, chars, current_pos)
})
.unwrap_or_else(|| {
LexResult::Err(format!("Unrecognized character: {}", next_char))
})

View File

@ -0,0 +1,122 @@
use crate::lexic::{token, utils, LexResult};
/// Scans an identifier whose first character is `start_char`.
///
/// `start_char` is taken as already validated by the caller and becomes the
/// first character of the token value; scanning continues at `start_pos + 1`.
///
/// Returns `LexResult::Some(token, next_pos)`, where `next_pos` is the index of
/// the first character that does not belong to the identifier (or the length
/// of `chars` when the identifier runs to the end of the input).
pub fn scan(start_char: char, chars: &[char], start_pos: usize) -> LexResult {
    scan_impl(chars, start_pos + 1, start_char.to_string())
}

/// Consumes identifier characters from `chars`, starting at `start_pos`, and
/// appends them to `current`.
///
/// Stops at the first character rejected by `utils::is_identifier_char`, or at
/// the end of the input, and always produces an identifier token.
///
/// The scan is iterative rather than recursive so that the stack depth does
/// not grow with the identifier length.
pub fn scan_impl(chars: &[char], start_pos: usize, current: String) -> LexResult {
    let mut value = current;
    let mut pos = start_pos;

    while let Some(&c) = chars.get(pos) {
        if !utils::is_identifier_char(c) {
            break;
        }
        value.push(c);
        pos += 1;
    }

    // The token position is the index just past the identifier, i.e. the same
    // value that is returned as the next position to scan from.
    LexResult::Some(token::new_identifier(value, pos as i32), pos)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lexic::token::TokenType;

    /// Converts a string slice into the char vector consumed by the scanner.
    fn str_to_vec(s: &str) -> Vec<char> {
        s.chars().collect()
    }

    /// Scans `source` from position 0 and asserts that it yields a single
    /// identifier token whose value equals `source`, with the scanner
    /// stopping at `expected_next`.
    fn assert_scans_identifier(source: &str, expected_next: usize) {
        let input = str_to_vec(source);
        let first = *input.first().expect("test input must not be empty");

        match scan(first, &input, 0) {
            LexResult::Some(token, next) => {
                assert_eq!(expected_next, next);
                assert_eq!(TokenType::Identifier, token.token_type);
                assert_eq!(source, token.value);
            }
            _ => panic!("expected `{}` to be scanned as an identifier", source),
        }
    }

    // Should scan a length 1 identifier
    #[test]
    fn test_1() {
        assert_scans_identifier("_", 1);
        assert_scans_identifier("i", 1);
    }

    // Should scan a length 2 identifier
    #[test]
    fn test_2() {
        let identifiers = [
            "_a", "_z", "_A", "_Z", "__", "_0", "_9",
            "aa", "az", "aA", "aZ", "a_", "a0", "a9",
            "za", "zz", "zA", "zZ", "z_", "z0", "z9",
        ];

        for identifier in identifiers {
            assert_scans_identifier(identifier, 2);
        }
    }

    // Should scan long identifiers
    #[test]
    fn test_3() {
        let identifiers = [
            "_validIdentifier",
            "iterationCount",
            "buffer",
            "aVeryLongIdentifier2WithSome5Numbers67InBetween1",
        ];

        for identifier in identifiers {
            // Every input is pure ASCII, so the char count equals the length
            // of the vector handed to the scanner.
            assert_scans_identifier(identifier, identifier.chars().count());
        }
    }
}

View File

@ -2,33 +2,17 @@ use super::{token::{TokenType, self}, utils, LexResult};
mod number;
mod operator;
mod identifier;
/// Attempts to scan a number. Returns `None` so that other scanners can be chained
pub fn number(c: char, chars: &Vec<char>, start_pos: usize) -> Option<LexResult> {
if utils::is_digit(c) {
match number::scan(chars, start_pos) {
Ok((token, next_pos)) => {
Some(LexResult::Some(token, next_pos))
},
Err(reason) => {
Some(LexResult::Err(reason))
},
}
}
else {
None
}
utils::is_digit(c).then(|| number::scan(chars, start_pos))
}
/// Attempts to scan an operator. Returns `None` so that other scanners can be chained
pub fn operator(c: char, chars: &Vec<char>, start_pos: usize) -> Option<LexResult> {
if utils::is_operator(c) {
Some(operator::scan(chars, start_pos))
}
else {
None
}
utils::is_operator(c).then(|| operator::scan(chars, start_pos))
}
@ -51,3 +35,9 @@ pub fn grouping_sign(c: char, _: &Vec<char>, start_pos: usize) -> Option<LexResu
);
Some(LexResult::Some(token, start_pos + 1))
}
/// Attempts to scan an identifier. Returns `None` so that other scanners can be chained.
///
/// The identifier scanner is invoked when `c` is accepted by
/// `utils::is_lowercase` or is an underscore. The underscore is included
/// because the identifier scanner itself treats names such as `_` and
/// `_validIdentifier` as valid identifiers.
///
/// NOTE(review): words starting with an uppercase letter are intentionally not
/// dispatched here, matching the previous behaviour — confirm whether another
/// scanner is meant to handle them.
pub fn identifier(c: char, chars: &Vec<char>, start_pos: usize) -> Option<LexResult> {
    (utils::is_lowercase(c) || c == '_').then(|| identifier::scan(c, chars, start_pos))
}

View File

@ -1,12 +1,10 @@
use crate::lexic::{token::{Token, self}, utils};
type ScanResult = Result<(Token, usize), String>;
use crate::lexic::{token::{Token, self}, utils, LexResult};
/// Function to scan a number
///
/// This function assumes that the character at `start_pos` is a number [0-9],
/// if not it will panic
pub fn scan(chars: &Vec<char>, start_pos: usize) -> ScanResult {
pub fn scan(chars: &Vec<char>, start_pos: usize) -> LexResult {
let next_char_1 = chars.get(start_pos);
let next_char_2 = chars.get(start_pos + 1);
@ -23,7 +21,7 @@ pub fn scan(chars: &Vec<char>, start_pos: usize) -> ScanResult {
/// Recursively scans an integer. If a dot `.` is found, scans a double,
/// if a `e` is found, scans a number in scientific notation
fn scan_decimal(chars: &Vec<char>, start_pos: usize, current: String) -> ScanResult {
fn scan_decimal(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
match chars.get(start_pos) {
Some(c) if *c == '.' => {
scan_double(chars, start_pos + 1, utils::str_append(current, *c))
@ -35,7 +33,7 @@ fn scan_decimal(chars: &Vec<char>, start_pos: usize, current: String) -> ScanRes
scan_decimal(chars, start_pos + 1, utils::str_append(current, *c))
},
_ => {
Ok((token::new_number(current, start_pos as i32), start_pos))
LexResult::Some(token::new_number(current, start_pos as i32), start_pos)
}
}
}
@ -46,12 +44,13 @@ fn scan_decimal(chars: &Vec<char>, start_pos: usize, current: String) -> ScanRes
/// This function expects the following on the first call:
/// - The char at `start_pos` is a value between [0-9a-fA-F]. If not, will return an error.
/// - `current == "0x"`. If not will return an incorrect value, or panic.
fn scan_hex(chars: &Vec<char>, start_pos: usize, current: String) -> ScanResult {
fn scan_hex(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
match chars.get(start_pos) {
Some(c) if utils::is_hex_digit(*c) => {
Ok(scan_hex_digits(chars, start_pos + 1, utils::str_append(current, *c)))
let (t, next) = scan_hex_digits(chars, start_pos + 1, utils::str_append(current, *c));
LexResult::Some(t, next)
},
_ => Err(String::from("Tried to scan an incomplete hex value"))
_ => LexResult::Err(String::from("Tried to scan an incomplete hex value"))
}
}
@ -62,21 +61,21 @@ fn scan_hex(chars: &Vec<char>, start_pos: usize, current: String) -> ScanResult
/// - `start_pos` is the position after the dot. E.g., if the input is `3.22` then `start_pos == 2`.
///
/// Returns a syntax error if the char at `start_pos` is not a value between [0-9]
fn scan_double(chars: &Vec<char>, start_pos: usize, current: String) -> ScanResult {
fn scan_double(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
match chars.get(start_pos) {
Some(c) if utils::is_digit(*c) => {
scan_double_impl(chars, start_pos, current)
},
Some(_) => {
Err(String::from("The character after the dot when scanning a double is not a number."))
LexResult::Err(String::from("The character after the dot when scanning a double is not a number."))
},
_ => Err(String::from("EOF when scanning a double number."))
_ => LexResult::Err(String::from("EOF when scanning a double number."))
}
}
// Implementation of scan_double
fn scan_double_impl(chars: &Vec<char>, start_pos: usize, current: String) -> ScanResult {
fn scan_double_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
match chars.get(start_pos) {
Some(c) if utils::is_digit(*c) => {
scan_double_impl(chars, start_pos + 1, utils::str_append(current, *c))
@ -85,7 +84,7 @@ fn scan_double_impl(chars: &Vec<char>, start_pos: usize, current: String) -> Sca
scan_scientific(chars, start_pos + 1, utils::str_append(current, *c))
}
_ => {
Ok((token::new_number(current, start_pos as i32), start_pos))
LexResult::Some(token::new_number(current, start_pos as i32), start_pos)
}
}
}
@ -99,16 +98,17 @@ fn scan_double_impl(chars: &Vec<char>, start_pos: usize, current: String) -> Sca
/// Returns a syntax error if:
/// - The char at `start_pos` is not `+` or `-`
/// - The char at `start_pos + 1` is not between [0-9]
fn scan_scientific(chars: &Vec<char>, start_pos: usize, current: String) -> ScanResult {
fn scan_scientific(chars: &Vec<char>, start_pos: usize, current: String) -> LexResult {
let next_char_1 = chars.get(start_pos);
let next_char_2 = chars.get(start_pos + 1);
match (next_char_1, next_char_2) {
(Some(c1), Some(c2)) if (*c1 == '+' || *c1 == '-') && utils::is_digit(*c2) => {
let new_value = format!("{}{}{}", current, *c1, *c2);
Ok(scan_digits(chars, start_pos + 2, new_value))
let (t, next) = scan_digits(chars, start_pos + 2, new_value);
LexResult::Some(t, next)
},
_ => Err(String::from("The characters after 'e' are not + or -, or are not followed by a number"))
_ => LexResult::Err(String::from("The characters after 'e' are not + or -, or are not followed by a number"))
}
}
@ -152,28 +152,31 @@ mod tests {
let input = str_to_vec("123");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(3, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("123", token.value);
} else {panic!()}
let input = str_to_vec("0123 ");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(4, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("0123", token.value);
} else {panic!()}
let input = str_to_vec(" 123456 789");
let start_pos = 2;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(8, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("123456", token.value);
} else {panic!()}
}
// Should not scan whitespace after the number
@ -182,10 +185,11 @@ mod tests {
let input = str_to_vec("123 ");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(3, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("123", token.value);
} else {panic!()}
}
#[test]
@ -193,19 +197,21 @@ mod tests {
let input = str_to_vec("0x20 ");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(4, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("0x20", token.value);
} else {panic!()}
let input = str_to_vec(" 0Xff23DA ");
let start_pos = 4;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(12, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("0xff23DA", token.value);
} else {panic!()}
}
// Should not scan an incomplete hex value
@ -215,16 +221,17 @@ mod tests {
let start_pos = 0;
match scan(&input, start_pos) {
Ok(_) => panic!(),
Err(reason) => assert_eq!("Tried to scan an incomplete hex value", reason)
LexResult::Err(reason) => assert_eq!("Tried to scan an incomplete hex value", reason),
_ => panic!(),
}
let input = str_to_vec("0 x20 ");
let start_pos = 0;
let (token, _) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, _) = scan(&input, start_pos) {
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("0", token.value);
} else {panic!()}
}
// Should not scan a hex value if it doesn't start with 0x
@ -232,9 +239,10 @@ mod tests {
fn test_hex_3() {
let input = str_to_vec("1x20");
let start_pos = 0;
let (token, _) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, _) = scan(&input, start_pos) {
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("1", token.value);
} else {panic!()}
}
// Should scan a double
@ -242,18 +250,20 @@ mod tests {
fn test_double_1() {
let input = str_to_vec("3.22");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(4, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("3.22", token.value);
} else {panic!()}
let input = str_to_vec("123456.7890 ");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(11, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("123456.7890", token.value);
} else {panic!()}
}
@ -264,8 +274,8 @@ mod tests {
let start_pos = 0;
match scan(&input, start_pos) {
Ok(_) => panic!(),
Err(reason) => assert_eq!("The character after the dot when scanning a double is not a number.", reason)
LexResult::Err(reason) => assert_eq!("The character after the dot when scanning a double is not a number.", reason),
_ => panic!(),
}
@ -273,8 +283,8 @@ mod tests {
let start_pos = 0;
match scan(&input, start_pos) {
Ok(_) => panic!(),
Err(reason) => assert_eq!("EOF when scanning a double number.", reason)
LexResult::Err(reason) => assert_eq!("EOF when scanning a double number.", reason),
_ => panic!(),
}
}
@ -283,32 +293,36 @@ mod tests {
fn test_exp_1() {
let input = str_to_vec("1e+0");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!("1e+0", token.value);
assert_eq!(4, next);
assert_eq!(TokenType::Number, token.token_type);
} else {panic!()}
let input = str_to_vec("1e-0");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(4, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("1e-0", token.value);
} else {panic!()}
let input = str_to_vec("0e+0");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(4, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("0e+0", token.value);
} else {panic!()}
let input = str_to_vec("123498790e+12349870");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(19, next);
assert_eq!(TokenType::Number, token.token_type);
assert_eq!("123498790e+12349870", token.value);
} else {panic!()}
}
// Should scan a double with decimal part and exponent
@ -316,16 +330,18 @@ mod tests {
fn test_exp_2(){
let input = str_to_vec("1.24e+1");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!("1.24e+1", token.value);
assert_eq!(7, next);
assert_eq!(TokenType::Number, token.token_type);
} else {panic!()}
let input = str_to_vec("0.00000000000001e+1");
let start_pos = 0;
let (token, next) = scan(&input, start_pos).unwrap();
if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!("0.00000000000001e+1", token.value);
assert_eq!(19, next);
assert_eq!(TokenType::Number, token.token_type);
} else {panic!()}
}
}

View File

@ -19,6 +19,14 @@ pub fn is_operator(c: char) -> bool {
|| c == '^' || c == '.' || c == ':'
}
pub fn is_grouping_sign(c: char) -> bool {
c == '(' || c == ')' || c == '{' || c == '}' || c == '[' || c == ']'
/// Returns `true` when `c` is an ASCII lowercase letter (`a` through `z`).
///
/// Non-ASCII lowercase letters such as `ñ` are rejected.
pub fn is_lowercase(c: char) -> bool {
    c.is_ascii_lowercase()
}
/// Returns `true` when `c` is an ASCII uppercase letter (`A` through `Z`).
///
/// Non-ASCII uppercase letters such as `Ñ` are rejected.
pub fn is_uppercase(c: char) -> bool {
    c.is_ascii_uppercase()
}
/// Returns `true` when `c` may appear inside an identifier: an underscore, or
/// any character accepted by `is_digit`, `is_lowercase` or `is_uppercase`.
pub fn is_identifier_char(c: char) -> bool {
    if c == '_' {
        return true;
    }

    is_digit(c) || is_uppercase(c) || is_lowercase(c)
}

View File

@ -55,3 +55,11 @@ pub fn new_operator(value: String, position: i32) -> Token {
/// Builds a token for a grouping sign.
///
/// Unlike the other constructors in this file, the token type is supplied by
/// the caller rather than fixed by the function.
pub fn new_grouping_sign(value: String, position: i32, token_type: TokenType) -> Token {
    Token {
        position,
        value,
        token_type,
    }
}
/// Builds a token of type `TokenType::Identifier` holding `value` at `position`.
pub fn new_identifier(value: String, position: i32) -> Token {
    let token_type = TokenType::Identifier;

    Token { token_type, value, position }
}