From f28e7394e9c09cd77bc4a9e9a0e8da9b3dd122fb Mon Sep 17 00:00:00 2001 From: Araozu Date: Wed, 15 Feb 2023 16:17:50 -0500 Subject: [PATCH] Scan datatypes. Parse datatype annotations --- CHANGELOG.md | 5 +++ src/ast_types.rs | 2 + src/codegen/binding.rs | 1 + src/codegen/module_ast.rs | 1 + src/lexic/mod.rs | 9 ++++ src/lexic/scanner/identifier.rs | 19 +++++++-- src/lexic/scanner/mod.rs | 7 ++++ src/syntax/{val_binding.rs => binding.rs} | 50 ++++++++++++++++++++--- src/syntax/grammar.md | 22 +++++++--- src/syntax/mod.rs | 4 +- src/token.rs | 9 ++++ 11 files changed, 114 insertions(+), 15 deletions(-) rename src/syntax/{val_binding.rs => binding.rs} (72%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1b1b205..79c20e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ - [ ] Stdlib - [ ] Document code + +## v0.0.4 + +- Explicit datatype of variables + ## v0.0.3 - Get datatype of an identifier from the symbol table diff --git a/src/ast_types.rs b/src/ast_types.rs index f657079..64eac54 100644 --- a/src/ast_types.rs +++ b/src/ast_types.rs @@ -9,11 +9,13 @@ pub enum Binding<'a> { } pub struct ValBinding<'a> { + pub datatype: Option<String>, pub identifier: &'a String, pub expression: Expression<'a>, } pub struct VarBinding<'a> { + pub datatype: Option<String>, pub identifier: &'a String, pub expression: Expression<'a>, } diff --git a/src/codegen/binding.rs b/src/codegen/binding.rs index 301a643..44207bf 100644 --- a/src/codegen/binding.rs +++ b/src/codegen/binding.rs @@ -31,6 +31,7 @@ mod tests { let id = String::from("identifier"); let value = String::from("322"); let binding = Binding::Val(ValBinding { + datatype: None, identifier: &id, expression: Expression::Number(&value), }); diff --git a/src/codegen/module_ast.rs b/src/codegen/module_ast.rs index e9f17b4..e577f39 100644 --- a/src/codegen/module_ast.rs +++ b/src/codegen/module_ast.rs @@ -22,6 +22,7 @@ mod tests { let id = String::from("identifier"); let value = String::from("322"); let binding =
Binding::Val(ValBinding { + datatype: None, identifier: &id, expression: Expression::Number(&value), }); diff --git a/src/lexic/mod.rs b/src/lexic/mod.rs index 81338f6..828b0dd 100755 --- a/src/lexic/mod.rs +++ b/src/lexic/mod.rs @@ -75,6 +75,7 @@ fn next_token(chars: &Chars, current_pos: usize) -> LexResult { None .or_else(|| scanner::number(next_char, chars, current_pos)) .or_else(|| scanner::identifier(next_char, chars, current_pos)) + .or_else(|| scanner::datatype(next_char, chars, current_pos)) .or_else(|| scanner::string(next_char, chars, current_pos)) .or_else(|| scanner::operator(next_char, chars, current_pos)) .or_else(|| scanner::grouping_sign(next_char, chars, current_pos)) @@ -203,6 +204,14 @@ mod tests { assert_eq!("]", t.value); } + #[test] + fn should_scan_datatype() { + let input = String::from("Num"); + let tokens = get_tokens(&input).unwrap(); + + assert_eq!(TokenType::Datatype, tokens[0].token_type); + } + #[test] fn should_scan_new_line() { let input = String::from("3\n22"); diff --git a/src/lexic/scanner/identifier.rs b/src/lexic/scanner/identifier.rs index d9ecef0..533c450 100755 --- a/src/lexic/scanner/identifier.rs +++ b/src/lexic/scanner/identifier.rs @@ -13,19 +13,32 @@ fn str_is_keyword(s: &String) -> Option { /// a valid identifier pub fn scan(start_char: char, chars: &Vec, start_pos: usize) -> LexResult { // The scanning is done by this recursive function - scan_impl(chars, start_pos + 1, format!("{}", start_char)) + scan_impl( + chars, + start_pos + 1, + format!("{}", start_char), + utils::is_uppercase(start_char), + ) } /// Recursive funtion that scans the identifier -fn scan_impl(chars: &Vec, start_pos: usize, current: String) -> LexResult { +fn scan_impl(chars: &Vec, start_pos: usize, current: String, is_datatype: bool) -> LexResult { match chars.get(start_pos) { Some(c) if utils::is_identifier_char(*c) => { - scan_impl(chars, start_pos + 1, utils::str_append(current, *c)) + scan_impl( + chars, + start_pos + 1, + 
utils::str_append(current, *c), + is_datatype, + ) }, _ => { if let Some(token_type) = str_is_keyword(&current) { LexResult::Some(token::new(current, start_pos as i32, token_type), start_pos) } + else if is_datatype { + LexResult::Some(token::new_datatype(current, start_pos as i32), start_pos) + } else { LexResult::Some(token::new_identifier(current, start_pos as i32), start_pos) } diff --git a/src/lexic/scanner/mod.rs b/src/lexic/scanner/mod.rs index 79b8544..4327164 100755 --- a/src/lexic/scanner/mod.rs +++ b/src/lexic/scanner/mod.rs @@ -48,6 +48,13 @@ pub fn identifier(c: char, chars: &Vec, start_pos: usize) -> Option, start_pos: usize) -> Option { + // Since the only difference with an identifier is that the first character is an + // uppercase letter, reuse the identifier scanner + utils::is_uppercase(c) + .then(|| identifier::scan(c, chars, start_pos)) +} /// Attempts to scan a string. If not found returns None to be able to chain other scanner pub fn string(c: char, chars: &Vec, start_pos: usize) -> Option { diff --git a/src/syntax/val_binding.rs b/src/syntax/binding.rs similarity index 72% rename from src/syntax/val_binding.rs rename to src/syntax/binding.rs index fae7e9e..0fabce1 100644 --- a/src/syntax/val_binding.rs +++ b/src/syntax/binding.rs @@ -2,11 +2,26 @@ use crate::token::{Token, TokenType}; use super::ast_types::{ValBinding, VarBinding, Binding}; use super::expression; -// Should return a 3 state value: +// TODO: Should return a 3 state value: // - Success: binding parsed successfully // - NotFound: the first token (var | val) was not found, so the parser should try other options // - Error: token (var | val) was found, but then other expected tokens were not found pub fn try_parse<'a>(tokens: &'a Vec, pos: usize) -> Option { + let mut pos = pos; + + // Optional datatype annotation + let datatype_annotation = { + match tokens.get(pos) { + Some(t) if t.token_type == TokenType::Datatype => { + pos += 1; + Some(String::from(&t.value)) + } + Some(_) =>
None, + None => return None + } + }; + + // var/val keyword let is_val = { let res1 = try_token_type(tokens, pos, TokenType::VAL); match res1 { @@ -27,7 +42,6 @@ pub fn try_parse<'a>(tokens: &'a Vec, pos: usize) -> Option { let equal_operator = try_operator(tokens, pos + 2, String::from("=")); if equal_operator.is_none() { return None } - let _ = equal_operator.unwrap(); let expression = expression::try_parse(tokens, pos + 3); if expression.is_none() { return None } @@ -35,12 +49,14 @@ pub fn try_parse<'a>(tokens: &'a Vec, pos: usize) -> Option { if is_val { Some(Binding::Val(ValBinding { + datatype: datatype_annotation, identifier: &identifier.value, expression, })) } else { Some(Binding::Var(VarBinding { + datatype: datatype_annotation, identifier: &identifier.value, expression, })) @@ -80,9 +96,7 @@ mod tests { Binding::Val(binding) => { assert_eq!("identifier", binding.identifier); } - Binding::Var(binding) => { - assert_eq!("identifier", binding.identifier); - } + _ => panic!() } } @@ -110,4 +124,30 @@ mod tests { assert_eq!("=", token.value); } + + #[test] + fn should_parse_binding_with_datatype() { + let tokens = get_tokens(&String::from("Num val identifier = 20")).unwrap(); + let binding = try_parse(&tokens, 0).unwrap(); + + match binding { + Binding::Val(binding) => { + assert_eq!(Some(String::from("Num")), binding.datatype); + assert_eq!("identifier", binding.identifier); + } + _ => panic!() + } + + + let tokens = get_tokens(&String::from("Bool var identifier = true")).unwrap(); + let binding = try_parse(&tokens, 0).unwrap(); + + match binding { + Binding::Var(binding) => { + assert_eq!(Some(String::from("Bool")), binding.datatype); + assert_eq!("identifier", binding.identifier); + } + _ => panic!() + } + } } diff --git a/src/syntax/grammar.md b/src/syntax/grammar.md index 3af9772..6f26cb0 100644 --- a/src/syntax/grammar.md +++ b/src/syntax/grammar.md @@ -11,13 +11,25 @@ A module is (commonly) a single source file. A declaration with `var` or `val`. 
-- `var = "var"` -- `val = "val"` -- `variable_binding = (var | val), identifier, "=", expression` +```ebnf +var = "var" +val = "val" +variable_binding = (var | val), identifier, "=", expression +``` + ### `expression` -For now just a number +For now just a number, string or boolean -- `expression = number` +```ebnf +expression = number | string | boolean +``` + + +## Type annotations + +```ebnf +variable_binding = Datatype, (var | val), identifier, "=", expression +``` diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 66e1717..5983094 100755 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1,14 +1,14 @@ use super::token::Token; mod expression; -mod val_binding; +mod binding; use super::ast_types; use ast_types::ModuleAST; /// Constructs the Misti AST from a vector of tokens pub fn construct_ast<'a>(tokens: &'a Vec) -> Result, String> { - let maybe_binding = val_binding::try_parse(tokens, 0); + let maybe_binding = binding::try_parse(tokens, 0); match maybe_binding { Some(binding) => { diff --git a/src/token.rs b/src/token.rs index 0a03426..ffd00ce 100755 --- a/src/token.rs +++ b/src/token.rs @@ -1,6 +1,7 @@ #[derive(PartialEq, Debug, Clone)] pub enum TokenType { Identifier, + Datatype, Number, String, Operator, @@ -76,3 +77,11 @@ pub fn new_semicolon(position: i32) -> Token { _position: position, } } + +pub fn new_datatype(value: String, position: i32) -> Token { + Token { + token_type: TokenType::Datatype, + value, + _position: position, + } +}