[Compiler] Refactor string lexer to include starting and closing double quotes

This commit is contained in:
Araozu 2023-04-15 17:17:27 -05:00
parent ccfb95956c
commit c445f8bb00
11 changed files with 131 additions and 69 deletions

View File

@ -15,6 +15,7 @@
- Scan single line comments - Scan single line comments
- Refactor String token to include double quotes (") in its content - Refactor String token to include double quotes (") in its content
- Refactor datatype checking of semantic analysis
## v0.0.4 ## v0.0.4

View File

@ -94,7 +94,7 @@ fn get_line(
mod tests { mod tests {
use super::*; use super::*;
use crate::{ use crate::{
error_handling::{PrintableError, SyntaxError, MistiError}, error_handling::{MistiError, PrintableError, SyntaxError},
lexic::get_tokens, lexic::get_tokens,
syntax::construct_ast, syntax::construct_ast,
}; };

View File

@ -6,7 +6,7 @@ use crate::lexic::{token, utils, LexResult};
/// This function assumes that `start_pos` is after the first double quote, /// This function assumes that `start_pos` is after the first double quote,
/// e.g. if the input is `"hello"`, `start_pos == 1` /// e.g. if the input is `"hello"`, `start_pos == 1`
pub fn scan(chars: &Vec<char>, start_pos: usize) -> LexResult { pub fn scan(chars: &Vec<char>, start_pos: usize) -> LexResult {
scan_impl(chars, start_pos, String::from("")) scan_impl(chars, start_pos, String::from("\""))
} }
/// Recursive function that does the scanning /// Recursive function that does the scanning
@ -16,10 +16,11 @@ pub fn scan_impl(chars: &Vec<char>, start_pos: usize, current: String) -> LexRes
// start_pos is the position where the token ENDS, not where it STARTS, // start_pos is the position where the token ENDS, not where it STARTS,
// so this is used to retrieve the original START position of the token // so this is used to retrieve the original START position of the token
// 1 is added to account for the opening `"` // 1 is added to account for the opening `"`
let current_len = current.len() + 1; let current_len = current.len();
let final_str = format!("{}\"", current);
LexResult::Some( LexResult::Some(
token::new_string(current, start_pos - current_len), token::new_string(final_str, start_pos - current_len),
start_pos + 1, start_pos + 1,
) )
} }
@ -77,7 +78,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(2, next); assert_eq!(2, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("", token.value); assert_eq!("\"\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -91,7 +92,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(15, next); assert_eq!(15, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("Hello, world!", token.value); assert_eq!("\"Hello, world!\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -116,7 +117,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("Sample\\ntext", token.value); assert_eq!("\"Sample\\ntext\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -127,7 +128,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("Sample\\\"text", token.value); assert_eq!("\"Sample\\\"text\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -138,7 +139,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("Sample\\rtext", token.value); assert_eq!("\"Sample\\rtext\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -149,7 +150,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("Sample\\\\text", token.value); assert_eq!("\"Sample\\\\text\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -160,7 +161,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("Sample\\ttext", token.value); assert_eq!("\"Sample\\ttext\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()
@ -171,7 +172,7 @@ mod tests {
if let LexResult::Some(token, next) = scan(&input, start_pos) { if let LexResult::Some(token, next) = scan(&input, start_pos) {
assert_eq!(14, next); assert_eq!(14, next);
assert_eq!(TokenType::String, token.token_type); assert_eq!(TokenType::String, token.token_type);
assert_eq!("Sample\\ text", token.value); assert_eq!("\"Sample\\ text\"", token.value);
assert_eq!(0, token.position); assert_eq!(0, token.position);
} else { } else {
panic!() panic!()

View File

@ -10,12 +10,12 @@ use super::semantic;
use super::syntax; use super::syntax;
/// Executes Lexical analysis, handles errors and calls build_ast for the next phase /// Executes Lexical analysis, handles errors and calls build_ast for the next phase
fn compile(input: &String) { fn compile(input: &String, symbol_table: &mut SymbolTable) {
let tokens = lexic::get_tokens(input); let tokens = lexic::get_tokens(input);
match tokens { match tokens {
Ok(tokens) => { Ok(tokens) => {
build_ast(input, tokens); build_ast(input, tokens, symbol_table);
} }
Err(error) => { Err(error) => {
let chars: Vec<char> = input.chars().into_iter().collect(); let chars: Vec<char> = input.chars().into_iter().collect();
@ -27,13 +27,12 @@ fn compile(input: &String) {
/// Executes Syntax analysis, and for now, Semantic analysis and Code generation. /// Executes Syntax analysis, and for now, Semantic analysis and Code generation.
/// ///
/// Prints the generated code to stdout /// Prints the generated code to stdout
fn build_ast(input: &String, tokens: Vec<Token>) { fn build_ast(input: &String, tokens: Vec<Token>, symbol_table: &mut SymbolTable) {
let ast = syntax::construct_ast(&tokens); let ast = syntax::construct_ast(&tokens);
match ast { match ast {
Ok(mut ast) => { Ok( ast) => {
let mut symbol_table = SymbolTable::new(); semantic::check_ast(& ast, symbol_table);
semantic::check_ast(&mut ast, &mut symbol_table);
let js_code = codegen::codegen(&ast); let js_code = codegen::codegen(&ast);
println!("{}", js_code) println!("{}", js_code)
@ -45,17 +44,16 @@ fn build_ast(input: &String, tokens: Vec<Token>) {
} }
} }
/// Executes the REPL, reading from stdin, compiling and emitting JS to stdout /// Executes the REPL, reading from stdin, compiling and emitting JS to stdout
pub fn run() -> io::Result<()> { pub fn run() -> io::Result<()> {
let stdin = io::stdin(); let stdin = io::stdin();
let mut buffer = String::new(); let mut buffer = String::new();
let mut repl_symbol_table = SymbolTable::new();
println!("REPL: Enter expressions to evaluate. Type Ctrl-D to exit."); println!("REPL: Enter expressions to evaluate. Type Ctrl-D to exit.");
loop { loop {
print!("> "); print!("> ");
let _ = io::stdout().flush(); io::stdout().flush()?;
buffer.clear(); buffer.clear();
let read = stdin.read_line(&mut buffer); let read = stdin.read_line(&mut buffer);
@ -65,7 +63,7 @@ pub fn run() -> io::Result<()> {
break Ok(()); break Ok(());
} }
Ok(_) => { Ok(_) => {
compile(&buffer); compile(&buffer, &mut repl_symbol_table);
} }
Err(error) => { Err(error) => {
eprintln!("Error reading stdin."); eprintln!("Error reading stdin.");

View File

@ -1,28 +1,32 @@
/// Represents a qualified datatype of the compiler. /// Represents a qualified datatype of the compiler.
/// ///
/// A datatype is composed of a path, e.g. `base.Str`, `base.Num` /// A datatype is composed of a path, e.g. `base.Str`, `base.Num`
#[derive(PartialEq)] #[derive(PartialEq)]
pub struct Datatype { pub struct Datatype {
t: String, t: String,
} }
impl Datatype { impl Datatype {
pub fn new(t: String) -> Datatype { pub fn new(t: String) -> Datatype {
Datatype { t } Datatype { t }
} }
pub fn str() -> Datatype { pub fn str() -> Datatype {
Datatype { t: String::from("base.Str") } Datatype {
t: String::from("base.Str"),
}
} }
pub fn num() -> Datatype { pub fn num() -> Datatype {
Datatype { t: String::from("base.Num") } Datatype {
t: String::from("base.Num"),
}
} }
pub fn bool() -> Datatype { pub fn bool() -> Datatype {
Datatype { t: String::from("base.Bool") } Datatype {
t: String::from("base.Bool"),
}
} }
pub fn clone(&self) -> Datatype { pub fn clone(&self) -> Datatype {
@ -30,8 +34,6 @@ impl Datatype {
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::Datatype; use super::Datatype;

View File

@ -1,25 +1,101 @@
use super::ast_types::{Binding, Expression, ModuleAST}; use super::ast_types::{Binding, ModuleAST};
use super::symbol_table::{SymbolTable, _BOOLEAN, _NUMBER, _STRING}; use super::symbol_table::SymbolTable;
mod datatype; mod datatype;
mod type_check; mod type_check;
use type_check::Typed;
pub use datatype::Datatype; pub use datatype::Datatype;
/// Checks the AST. In the future should return a list of errors. /// Checks the AST. In the future should return a list of errors.
pub fn check_ast<'a>(ast: &'a mut ModuleAST, symbol_table: &'a mut SymbolTable) { pub fn check_ast<'a>(ast: &'a ModuleAST, symbol_table: &'a mut SymbolTable) {
for binding in ast.bindings.iter() {
match binding {
Binding::Val(b) => {
let datatype = b.expression.t(symbol_table);
let identifier = b.identifier;
// TODO: check the expression's datatype against an explicit datatype annotation, e.g. `Str val x = 322`
symbol_table.insert(identifier.as_str(), datatype);
}
Binding::Var(b) => {
let datatype = b.expression.t(symbol_table);
let identifier = b.identifier;
// TODO: check the expression's datatype against an explicit datatype annotation, e.g. `Str var x = 322`
symbol_table.insert(identifier.as_str(), datatype);
}
}
}
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod t {
use crate::lexic; use crate::ast_types::Expression;
use crate::symbol_table::_BOOLEAN;
use crate::symbol_table::_STRING;
use crate::syntax;
use super::*; use super::*;
/// Checks that `check_ast` registers a `Binding::Val` in the symbol table
/// together with the datatype of its expression.
#[test]
fn should_insert_into_symbol_table() {
// The binding borrows its identifier and its value, so the backing
// strings must outlive it.
let s1 = String::from("id");
let s2 = String::from("322");
// A `Val` binding named `id`, with no explicit datatype, whose expression
// is `Expression::Number`.
let binding = Binding::Val(crate::ast_types::ValBinding {
datatype: None,
identifier: &s1,
expression: Expression::Number(&s2),
});
let mut table = SymbolTable::new();
// Run the semantic check over a module that contains only this binding.
check_ast(
&ModuleAST {
bindings: vec![binding],
},
&mut table,
);
// The identifier must now be present, and `check_type` must accept
// `Datatype::num()` for it.
assert!(table.has_id(&String::from("id")));
assert!(table.check_type(&String::from("id"), Datatype::num()));
}
/// Checks that a binding whose expression is an identifier is registered with
/// the datatype already stored for that identifier, using one symbol table
/// across two `check_ast` calls.
#[test]
fn should_insert_id_reference() {
// First module: a `Val` binding named `id` with an `Expression::Number`.
let s1 = String::from("id");
let s2 = String::from("322");
let binding = Binding::Val(crate::ast_types::ValBinding {
datatype: None,
identifier: &s1,
expression: Expression::Number(&s2),
});
let mut table = SymbolTable::new();
check_ast(
&ModuleAST {
bindings: vec![binding],
},
&mut table,
);
// Second module: a `Val` binding named `id2` whose expression is the
// identifier `id` inserted by the first call.
let s1 = String::from("id2");
let s2 = String::from("id");
let binding = Binding::Val(crate::ast_types::ValBinding {
datatype: None,
identifier: &s1,
expression: Expression::Identifier(&s2),
});
// Reuse the same table so the lookup of `id` can succeed.
check_ast(
&ModuleAST {
bindings: vec![binding],
},
&mut table,
);
// `id2` must be present, and `check_type` must accept `Datatype::num()`
// for it, the same datatype asserted for `id`.
assert!(table.has_id(&String::from("id2")));
assert!(table.check_type(&String::from("id2"), Datatype::num()));
}
} }

View File

@ -2,26 +2,27 @@ use crate::{ast_types::Expression, symbol_table::SymbolTable};
use super::datatype::Datatype; use super::datatype::Datatype;
trait Typed<'a> { pub trait Typed<'a> {
fn t(&self, symbol_table: &'a mut SymbolTable) -> Datatype; fn t(&self, symbol_table: &'a mut SymbolTable) -> Datatype;
} }
impl<'a> Typed<'a> for Expression<'a> { impl<'a> Typed<'a> for Expression<'a> {
/// Returns the Datatype of this Expression
fn t(&self, symbol_table: &'a mut SymbolTable) -> Datatype { fn t(&self, symbol_table: &'a mut SymbolTable) -> Datatype {
match self { match self {
Expression::Number(_) => Datatype::num(), Expression::Number(_) => Datatype::num(),
Expression::String(_) => Datatype::str(), Expression::String(_) => Datatype::str(),
Expression::Boolean(_) => Datatype::bool(), Expression::Boolean(_) => Datatype::bool(),
Expression::Identifier(id) => { Expression::Identifier(id) => {
let res = symbol_table.get_type(id).unwrap(); let res = symbol_table
.get_type(id)
.expect("SEMANTIC: identifier doesn't exist in Symbol table");
res.clone() res.clone()
} }
} }
} }
} }
#[cfg(test)] #[cfg(test)]
mod t { mod t {
use super::*; use super::*;
@ -59,4 +60,3 @@ mod t {
assert!(exp.t(&mut table) == Datatype::num()); assert!(exp.t(&mut table) == Datatype::num());
} }
} }

View File

@ -2,11 +2,6 @@ use std::collections::HashMap;
use crate::semantic::Datatype; use crate::semantic::Datatype;
// Primitive datatypes
pub const _NUMBER: &str = "Num";
pub const _STRING: &str = "Str";
pub const _BOOLEAN: &str = "Bool";
pub struct SymbolTable { pub struct SymbolTable {
/// For now just stores identifiers and datatypes /// For now just stores identifiers and datatypes
table: HashMap<String, Datatype>, table: HashMap<String, Datatype>,
@ -22,8 +17,7 @@ impl SymbolTable {
} }
pub fn insert(&mut self, identifier: &str, datatype: Datatype) { pub fn insert(&mut self, identifier: &str, datatype: Datatype) {
self.table self.table.insert(String::from(identifier), datatype);
.insert(String::from(identifier), datatype);
} }
pub fn has_id(&self, identifier: &String) -> bool { pub fn has_id(&self, identifier: &String) -> bool {
@ -45,9 +39,7 @@ impl SymbolTable {
/// Returns the Datatype of a given identifier /// Returns the Datatype of a given identifier
pub fn get_type(&self, identifier: &String) -> Option<&Datatype> { pub fn get_type(&self, identifier: &String) -> Option<&Datatype> {
self.table self.table.get(identifier).and_then(|value| Some(value))
.get(identifier)
.and_then(|value| Some(value))
} }
} }

View File

@ -41,7 +41,7 @@ mod tests {
let expression = try_parse(&tokens, 0).unwrap(); let expression = try_parse(&tokens, 0).unwrap();
match expression { match expression {
Expression::String(value) => assert_eq!("Hello", value), Expression::String(value) => assert_eq!("\"Hello\"", value),
_ => panic!(), _ => panic!(),
} }
} }

View File

@ -1,5 +1,5 @@
use crate::ast_types::Binding; use crate::ast_types::Binding;
use crate::error_handling::{SyntaxError, MistiError}; use crate::error_handling::{MistiError, SyntaxError};
use super::token::Token; use super::token::Token;
@ -41,13 +41,8 @@ pub fn construct_ast<'a>(tokens: &'a Vec<Token>) -> Result<ModuleAST<'a>, MistiE
} }
fn next_construct<'a>(tokens: &'a Vec<Token>, current_pos: usize) -> SyntaxResult { fn next_construct<'a>(tokens: &'a Vec<Token>, current_pos: usize) -> SyntaxResult {
None.or_else(|| binding::try_parse(tokens, 0)) None.or_else(|| binding::try_parse(tokens, current_pos))
.unwrap_or_else(|| { .unwrap_or_else(|| {
SyntaxResult::Err(SyntaxError { SyntaxResult::None
reason: String::from("Unrecognized token"),
// FIXME: This should get the position of the _token_ that current_pos points to
error_start: current_pos,
error_end: current_pos,
})
}) })
} }

View File

@ -30,10 +30,7 @@ pub struct Token {
impl Token { impl Token {
pub fn get_end_position(&self) -> usize { pub fn get_end_position(&self) -> usize {
match self.token_type { self.position + self.value.len()
TokenType::String => self.position + self.value.len() + 2,
_ => self.position + self.value.len(),
}
} }
} }