From c445f8bb00b7187b200ad6b78c5e7d4a21366069 Mon Sep 17 00:00:00 2001 From: Araozu Date: Sat, 15 Apr 2023 17:17:27 -0500 Subject: [PATCH] [Compiler] Refactor string lexer to include starting and closing double quotes --- compiler/CHANGELOG.md | 1 + compiler/src/error_handling/syntax_error.rs | 2 +- compiler/src/lexic/scanner/string.rs | 23 ++--- compiler/src/repl/mod.rs | 18 ++-- compiler/src/semantic/datatype.rs | 18 ++-- compiler/src/semantic/mod.rs | 98 ++++++++++++++++++--- compiler/src/semantic/type_check.rs | 10 +-- compiler/src/symbol_table.rs | 12 +-- compiler/src/syntax/expression.rs | 2 +- compiler/src/syntax/mod.rs | 11 +-- compiler/src/token.rs | 5 +- 11 files changed, 131 insertions(+), 69 deletions(-) diff --git a/compiler/CHANGELOG.md b/compiler/CHANGELOG.md index 6b430dd..f9591f0 100644 --- a/compiler/CHANGELOG.md +++ b/compiler/CHANGELOG.md @@ -15,6 +15,7 @@ - Scan single line comments - Refactor String token to include double quotes (") in its content +- Refactor datachecking of semantic analysis ## v0.0.4 diff --git a/compiler/src/error_handling/syntax_error.rs b/compiler/src/error_handling/syntax_error.rs index 2f83b4d..84aa777 100644 --- a/compiler/src/error_handling/syntax_error.rs +++ b/compiler/src/error_handling/syntax_error.rs @@ -94,7 +94,7 @@ fn get_line( mod tests { use super::*; use crate::{ - error_handling::{PrintableError, SyntaxError, MistiError}, + error_handling::{MistiError, PrintableError, SyntaxError}, lexic::get_tokens, syntax::construct_ast, }; diff --git a/compiler/src/lexic/scanner/string.rs b/compiler/src/lexic/scanner/string.rs index f46e24b..75a4a9d 100755 --- a/compiler/src/lexic/scanner/string.rs +++ b/compiler/src/lexic/scanner/string.rs @@ -6,7 +6,7 @@ use crate::lexic::{token, utils, LexResult}; /// This function assumes that `start_pos` is after the first double quote, /// e.g. if the input is `"hello"`, `start_pos == 1` pub fn scan(chars: &Vec, start_pos: usize) -> LexResult { - scan_impl(chars, start_pos, String::from("")) + scan_impl(chars, start_pos, String::from("\"")) } /// Recursive function that does the scanning @@ -16,10 +16,11 @@ pub fn scan_impl(chars: &Vec, start_pos: usize, current: String) -> LexRes // start_pos is the position where the token ENDS, not where it STARTS, // so this is used to retrieve the original START position of the token // 1 is added to account for the opening `"` - let current_len = current.len() + 1; + let current_len = current.len(); + let final_str = format!("{}\"", current); LexResult::Some( - token::new_string(current, start_pos - current_len), + token::new_string(final_str, start_pos - current_len), start_pos + 1, ) } @@ -77,7 +78,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(2, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("", token.value); + assert_eq!("\"\"", token.value); assert_eq!(0, token.position); } else { panic!() @@ -91,7 +92,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(15, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("Hello, world!", token.value); + assert_eq!("\"Hello, world!\"", token.value); assert_eq!(0, token.position); } else { panic!() @@ -116,7 +117,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("Sample\\ntext", token.value); + assert_eq!("\"Sample\\ntext\"", token.value); assert_eq!(0, token.position); } else { panic!() @@ -127,7 +128,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("Sample\\\"text", token.value); + assert_eq!("\"Sample\\\"text\"", token.value); assert_eq!(0, token.position); } else { panic!() @@ -138,7 +139,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("Sample\\rtext", token.value); + assert_eq!("\"Sample\\rtext\"", token.value); assert_eq!(0, token.position); } else { panic!() @@ -149,7 +150,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("Sample\\\\text", token.value); + assert_eq!("\"Sample\\\\text\"", token.value); assert_eq!(0, token.position); } else { panic!() @@ -160,7 +161,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("Sample\\ttext", token.value); + assert_eq!("\"Sample\\ttext\"", token.value); assert_eq!(0, token.position); } else { panic!() @@ -171,7 +172,7 @@ mod tests { if let LexResult::Some(token, next) = scan(&input, start_pos) { assert_eq!(14, next); assert_eq!(TokenType::String, token.token_type); - assert_eq!("Sample\\ text", token.value); + assert_eq!("\"Sample\\ text\"", token.value); assert_eq!(0, token.position); } else { panic!() diff --git a/compiler/src/repl/mod.rs b/compiler/src/repl/mod.rs index 2e705cc..e17a128 100755 --- a/compiler/src/repl/mod.rs +++ b/compiler/src/repl/mod.rs @@ -10,12 +10,12 @@ use super::semantic; use super::syntax; /// Executes Lexical analysis, handles errors and calls build_ast for the next phase -fn compile(input: &String) { +fn compile(input: &String, symbol_table: &mut SymbolTable) { let tokens = lexic::get_tokens(input); match tokens { Ok(tokens) => { - build_ast(input, tokens); + build_ast(input, tokens, symbol_table); } Err(error) => { let chars: Vec = input.chars().into_iter().collect(); @@ -27,13 +27,12 @@ fn compile(input: &String) { /// Executes Syntax analysis, and for now, Semantic analysis and Code generation. /// /// Prints the generated code in stdin -fn build_ast(input: &String, tokens: Vec) { +fn build_ast(input: &String, tokens: Vec, symbol_table: &mut SymbolTable) { let ast = syntax::construct_ast(&tokens); match ast { - Ok(mut ast) => { - let mut symbol_table = SymbolTable::new(); - semantic::check_ast(&mut ast, &mut symbol_table); + Ok( ast) => { + semantic::check_ast(& ast, symbol_table); let js_code = codegen::codegen(&ast); println!("{}", js_code) @@ -45,17 +44,16 @@ fn build_ast(input: &String, tokens: Vec) { } } - - /// Executes the REPL, reading from stdin, compiling and emitting JS to stdout pub fn run() -> io::Result<()> { let stdin = io::stdin(); let mut buffer = String::new(); + let mut repl_symbol_table = SymbolTable::new(); println!("REPL: Enter expressions to evaluate. Type Ctrl-D to exit."); loop { print!("> "); - let _ = io::stdout().flush(); + io::stdout().flush()?; buffer.clear(); let read = stdin.read_line(&mut buffer); @@ -65,7 +63,7 @@ pub fn run() -> io::Result<()> { break Ok(()); } Ok(_) => { - compile(&buffer); + compile(&buffer, &mut repl_symbol_table); } Err(error) => { eprintln!("Error reading stdin."); diff --git a/compiler/src/semantic/datatype.rs b/compiler/src/semantic/datatype.rs index be59b44..5cc6689 100644 --- a/compiler/src/semantic/datatype.rs +++ b/compiler/src/semantic/datatype.rs @@ -1,28 +1,32 @@ - /// Represents a qualified datatype of the compiler. -/// +/// /// A datatype is composed of a path, e.g. `base.Str`, `base.Num` #[derive(PartialEq)] pub struct Datatype { t: String, } - impl Datatype { pub fn new(t: String) -> Datatype { Datatype { t } } pub fn str() -> Datatype { - Datatype { t: String::from("base.Str") } + Datatype { + t: String::from("base.Str"), + } } pub fn num() -> Datatype { - Datatype { t: String::from("base.Num") } + Datatype { + t: String::from("base.Num"), + } } pub fn bool() -> Datatype { - Datatype { t: String::from("base.Bool") } + Datatype { + t: String::from("base.Bool"), + } } pub fn clone(&self) -> Datatype { @@ -30,8 +34,6 @@ impl Datatype { } } - - #[cfg(test)] mod tests { use super::Datatype; diff --git a/compiler/src/semantic/mod.rs b/compiler/src/semantic/mod.rs index b9be116..a9275b8 100644 --- a/compiler/src/semantic/mod.rs +++ b/compiler/src/semantic/mod.rs @@ -1,25 +1,101 @@ -use super::ast_types::{Binding, Expression, ModuleAST}; -use super::symbol_table::{SymbolTable, _BOOLEAN, _NUMBER, _STRING}; +use super::ast_types::{Binding, ModuleAST}; +use super::symbol_table::SymbolTable; mod datatype; mod type_check; +use type_check::Typed; + pub use datatype::Datatype; /// Checks the AST. In the future should return a list of errors. -pub fn check_ast<'a>(ast: &'a mut ModuleAST, symbol_table: &'a mut SymbolTable) { - +pub fn check_ast<'a>(ast: &'a ModuleAST, symbol_table: &'a mut SymbolTable) { + for binding in ast.bindings.iter() { + match binding { + Binding::Val(b) => { + let datatype = b.expression.t(symbol_table); + let identifier = b.identifier; + + // TODO: check datatype of a explicit datatype, e.g. `Str val x = 322` + + symbol_table.insert(identifier.as_str(), datatype); + } + Binding::Var(b) => { + let datatype = b.expression.t(symbol_table); + let identifier = b.identifier; + + // TODO: check datatype of a explicit datatype, e.g. `Str val x = 322` + + symbol_table.insert(identifier.as_str(), datatype); + } + } + } } - - #[cfg(test)] -mod tests { - use crate::lexic; - use crate::symbol_table::_BOOLEAN; - use crate::symbol_table::_STRING; - use crate::syntax; +mod t { + use crate::ast_types::Expression; use super::*; + #[test] + fn should_insert_into_symbol_table() { + let s1 = String::from("id"); + let s2 = String::from("322"); + let binding = Binding::Val(crate::ast_types::ValBinding { + datatype: None, + identifier: &s1, + expression: Expression::Number(&s2), + }); + + let mut table = SymbolTable::new(); + + check_ast( + &ModuleAST { + bindings: vec![binding], + }, + &mut table, + ); + + assert!(table.has_id(&String::from("id"))); + assert!(table.check_type(&String::from("id"), Datatype::num())); + } + + #[test] + fn should_insert_id_reference() { + let s1 = String::from("id"); + let s2 = String::from("322"); + let binding = Binding::Val(crate::ast_types::ValBinding { + datatype: None, + identifier: &s1, + expression: Expression::Number(&s2), + }); + + let mut table = SymbolTable::new(); + + check_ast( + &ModuleAST { + bindings: vec![binding], + }, + &mut table, + ); + + let s1 = String::from("id2"); + let s2 = String::from("id"); + let binding = Binding::Val(crate::ast_types::ValBinding { + datatype: None, + identifier: &s1, + expression: Expression::Identifier(&s2), + }); + + check_ast( + &ModuleAST { + bindings: vec![binding], + }, + &mut table, + ); + + assert!(table.has_id(&String::from("id2"))); + assert!(table.check_type(&String::from("id2"), Datatype::num())); + } } diff --git a/compiler/src/semantic/type_check.rs b/compiler/src/semantic/type_check.rs index 43fefb4..450f6ca 100644 --- a/compiler/src/semantic/type_check.rs +++ b/compiler/src/semantic/type_check.rs @@ -2,26 +2,27 @@ use crate::{ast_types::Expression, symbol_table::SymbolTable}; use super::datatype::Datatype; -trait Typed<'a> { +pub trait Typed<'a> { fn t(&self, symbol_table: &'a mut SymbolTable) -> Datatype; } impl<'a> Typed<'a> for Expression<'a> { + /// Returns the Datatype of this Expression fn t(&self, symbol_table: &'a mut SymbolTable) -> Datatype { match self { Expression::Number(_) => Datatype::num(), Expression::String(_) => Datatype::str(), Expression::Boolean(_) => Datatype::bool(), Expression::Identifier(id) => { - let res = symbol_table.get_type(id).unwrap(); + let res = symbol_table + .get_type(id) + .expect("SEMANTIC: identifier doesn't exist in Symbol table"); res.clone() } } } } - - #[cfg(test)] mod t { use super::*; @@ -59,4 +60,3 @@ mod t { assert!(exp.t(&mut table) == Datatype::num()); } } - diff --git a/compiler/src/symbol_table.rs b/compiler/src/symbol_table.rs index c1e82f9..2c9b9bb 100644 --- a/compiler/src/symbol_table.rs +++ b/compiler/src/symbol_table.rs @@ -2,11 +2,6 @@ use std::collections::HashMap; use crate::semantic::Datatype; -// Primitive datatypes -pub const _NUMBER: &str = "Num"; -pub const _STRING: &str = "Str"; -pub const _BOOLEAN: &str = "Bool"; - pub struct SymbolTable { /// For now just stores identifiers and datatypes table: HashMap, @@ -22,8 +17,7 @@ impl SymbolTable { } pub fn insert(&mut self, identifier: &str, datatype: Datatype) { - self.table - .insert(String::from(identifier), datatype); + self.table.insert(String::from(identifier), datatype); } pub fn has_id(&self, identifier: &String) -> bool { @@ -45,9 +39,7 @@ impl SymbolTable { /// Returns the Datatype of a given identifier pub fn get_type(&self, identifier: &String) -> Option<&Datatype> { - self.table - .get(identifier) - .and_then(|value| Some(value)) + self.table.get(identifier).and_then(|value| Some(value)) } } diff --git a/compiler/src/syntax/expression.rs b/compiler/src/syntax/expression.rs index 4cd7e2e..d124a4d 100644 --- a/compiler/src/syntax/expression.rs +++ b/compiler/src/syntax/expression.rs @@ -41,7 +41,7 @@ mod tests { let expression = try_parse(&tokens, 0).unwrap(); match expression { - Expression::String(value) => assert_eq!("Hello", value), + Expression::String(value) => assert_eq!("\"Hello\"", value), _ => panic!(), } } diff --git a/compiler/src/syntax/mod.rs b/compiler/src/syntax/mod.rs index bd5ed52..5088113 100755 --- a/compiler/src/syntax/mod.rs +++ b/compiler/src/syntax/mod.rs @@ -1,5 +1,5 @@ use crate::ast_types::Binding; -use crate::error_handling::{SyntaxError, MistiError}; +use crate::error_handling::{MistiError, SyntaxError}; use super::token::Token; @@ -41,13 +41,8 @@ pub fn construct_ast<'a>(tokens: &'a Vec) -> Result, MistiE } fn next_construct<'a>(tokens: &'a Vec, current_pos: usize) -> SyntaxResult { - None.or_else(|| binding::try_parse(tokens, 0)) + None.or_else(|| binding::try_parse(tokens, current_pos)) .unwrap_or_else(|| { - SyntaxResult::Err(SyntaxError { - reason: String::from("Unrecognized token"), - // FIXME: This should get the position of the _token_ that current_pos points to - error_start: current_pos, - error_end: current_pos, - }) + SyntaxResult::None }) } diff --git a/compiler/src/token.rs b/compiler/src/token.rs index 7ad5c81..61648fd 100755 --- a/compiler/src/token.rs +++ b/compiler/src/token.rs @@ -30,10 +30,7 @@ pub struct Token { impl Token { pub fn get_end_position(&self) -> usize { - match self.token_type { - TokenType::String => self.position + self.value.len() + 2, - _ => self.position + self.value.len(), - } + self.position + self.value.len() } }