diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 34dd372..f91dc89 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -1,135 +1,210 @@
-use thiserror::Error;
+mod tree_node;
 use crate::tokenizer::{
-    token::{Keyword, Number, Symbol, Token, TokenType},
+    token::{Symbol, Token, TokenType},
     Tokenizer, TokenizerError,
 };
-use std::io::{Read, Seek};
+use std::{
+    collections::VecDeque,
+    io::{Read, Seek},
+};
+use thiserror::Error;
+use tree_node::*;
 
 #[derive(Debug, Error)]
 pub enum ParseError {
-    #[error("{0}")]
+    #[error(transparent)]
     TokenizerError(#[from] TokenizerError),
-    #[error("Unexpected EOF\n\nLine: {0}, Column: {1}", token.line, token.column)]
-    UnexpectedEOF { token: Token },
     #[error("Unexpected token\n\nLine: {0}, Column: {1}\nToken: {2}", token.line, token.column, token.token_type)]
     UnexpectedToken { token: Token },
+    #[error("Unexpected EOF")]
+    UnexpectedEOF,
     #[error("An unknown error has occurred")]
     UnknownError,
 }
 
-#[derive(Debug)]
-enum Literal {
-    Number(Number),
-    String(String),
-    Boolean(bool),
+pub struct Parser<R: Read + Seek> {
+    tokenizer: Tokenizer<R>,
+    current_token: Option<Token>,
 }
 
-#[derive(Debug)]
-struct Identifier(String);
-
-#[derive(Debug)]
-pub enum Expression {
-    Declaration {
-        identifier: Identifier,
-        value: Box<Expression>,
-    },
-    Assignment {
-        identifier: Identifier,
-        value: Box<Expression>,
-    },
-    Binary {
-        left: Box<Expression>,
-        operator: Symbol,
-        right: Box<Expression>,
-    },
-    Literal(Literal),
-}
-
-pub struct Parser<T>
+impl<R> Parser<R>
 where
-    T: Read + Seek,
+    R: Read + Seek,
 {
-    tokenizer: Tokenizer<T>,
-}
-
-impl<T> Parser<T>
-where
-    T: Read + Seek,
-{
-    pub fn new(tokenizer: Tokenizer<T>) -> Self {
-        Self { tokenizer }
-    }
-
-    pub fn parse(&mut self) -> Result<Option<Expression>, ParseError> {
-        while let Some(token) = self.tokenizer.next_token()? {
-            match token.token_type {
-                TokenType::Number(n) => {
-                    if let Some(Token {
-                        token_type: TokenType::Symbol(s),
-                        ..
-                    }) = self.tokenizer.peek_next()?
-                    {
-                        if s.is_operator() {
-                            self.tokenizer.next_token()?;
-                            return Ok(Some(Expression::Binary {
-                                left: Box::new(Expression::Literal(Literal::Number(n))),
-                                operator: s,
-                                right: Box::new(self.parse()?.ok_or(ParseError::UnknownError)?),
-                            }));
-                        }
-                    } else {
-                        return Ok(Some(Expression::Literal(Literal::Number(n))));
-                    }
-                }
-                _ => return Err(ParseError::UnexpectedToken { token }),
-            }
+    pub fn new(tokenizer: Tokenizer<R>) -> Self {
+        Parser {
+            tokenizer,
+            current_token: None,
         }
-        return Err(ParseError::UnknownError);
     }
 
-    fn parse_declaration(&mut self) -> Result<Expression, ParseError> {
-        let identifier = match self.tokenizer.next_token()? {
-            Some(token) => match token.token_type {
-                TokenType::Identifier(i) => Identifier(i),
-                _ => return Err(ParseError::UnexpectedToken { token }),
-            },
-            None => return Err(ParseError::UnknownError),
-        };
-        return Ok(Expression::Declaration {
-            identifier,
-            value: Box::new(self.parse()?.ok_or(ParseError::UnknownError)?),
-        });
+    pub fn parse(&mut self) -> Result<Expression, ParseError> {
+        self.current_token = self.tokenizer.next_token()?;
+        self.expression()
+    }
+
+    fn expression(&mut self) -> Result<Expression, ParseError> {
+        let current_token = self
+            .current_token
+            .as_ref()
+            .ok_or(ParseError::UnknownError)?;
+
+        Ok(match current_token.token_type {
+            // Match a number or string literal as long as the next token is not an operator
+            TokenType::Number(_) | TokenType::String(_)
+                if !matches!(
+                    self.tokenizer.peek_next()?,
+                    Some(Token { token_type: TokenType::Symbol(e), .. }) if e.is_operator()
+                ) =>
+            {
+                Expression::Literal(self.literal()?)
+            }
+
+            // Match a negation operator
+            TokenType::Symbol(Symbol::Minus) => Expression::Negation(Box::new(self.parse()?)),
+
+            // Any token followed by an operator starts a binary expression
+            _ if matches!(
+                self.tokenizer.peek_next()?,
+                Some(Token { token_type: TokenType::Symbol(e), .. }) if e.is_operator()
+            ) =>
+            {
+                Expression::BinaryExpression(self.binary()?)
+            }
+
+            // Something went wrong. Return an error
+            _ => {
+                return Err(ParseError::UnexpectedToken {
+                    token: current_token.clone(),
+                })
+            }
+        })
+    }
+
+    fn binary(&mut self) -> Result<BinaryExpression, ParseError> {
+        let literal = self.literal()?;
+
+        // literal() has already advanced the stream, so the current token is the operator
+        let Some(Token {
+            token_type: TokenType::Symbol(operator),
+            ..
+        }) = self.current_token
+        else {
+            return Err(ParseError::UnknownError);
+        };
+        self.current_token = self.tokenizer.next_token()?;
+
+        Ok(match operator {
+            Symbol::Plus => BinaryExpression::Add(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            Symbol::Asterisk => BinaryExpression::Multiply(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            Symbol::Slash => BinaryExpression::Divide(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            Symbol::Minus => BinaryExpression::Subtract(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            _ => {
+                return Err(ParseError::UnexpectedToken {
+                    token: Token {
+                        token_type: TokenType::Symbol(operator),
+                        line: 0,
+                        column: 0,
+                    },
+                })
+            }
+        })
+    }
+
+    fn literal(&mut self) -> Result<Literal, ParseError> {
+        let current_token = self
+            .current_token
+            .as_ref()
+            .ok_or(ParseError::UnknownError)?;
+
+        let to_return = match current_token.token_type {
+            TokenType::Number(ref number) => tree_node::Literal::Number(number.clone()),
+            TokenType::String(ref string) => tree_node::Literal::String(string.clone()),
+            _ => {
+                return Err(ParseError::UnexpectedToken {
+                    token: current_token.clone(),
+                })
+            }
+        };
+        self.current_token = self.tokenizer.next_token()?;
+        Ok(to_return)
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use super::tree_node::*;
     use super::*;
     use anyhow::Result;
 
     #[test]
-    fn test_parser() -> Result<()> {
-        let input = r#"
-            5.3245 + 5
+    fn test_add_expr() -> Result<()> {
+        let input = "123 + 456";
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
+        let formatted_output = format!("{}", result);
+        assert_eq!(formatted_output, "(123 + 456)");
+        Ok(())
+    }
 
-            45 - 2
-        "#;
+    #[test]
+    fn test_parse_number() -> Result<()> {
+        let input = "123";
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
 
-        let tokenizer = Tokenizer::from(input.to_owned());
-        let mut parser = Parser::new(tokenizer);
+        let formatted_output = format!("{}", result);
 
-        let expr = parser.parse()?;
+        assert_eq!(formatted_output, "123");
 
-        println!("{:?}", expr);
+        Ok(())
+    }
 
-        let expr = parser.parse()?;
+    #[test]
+    fn test_parse_negation() -> Result<()> {
+        let input = "-123";
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
 
-        println!("{:?}", expr);
+        let formatted_output = format!("{}", result);
+
+        assert_eq!(formatted_output, "(-123)");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_order_of_operations() -> Result<()> {
+        let input = "123 - 456 + 789";
+
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
+
+        let formatted_output = format!("{}", result);
+        println!("{}", formatted_output);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_chained_operators() -> Result<()> {
+        let input = "123 + 456 * 789";
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
+
+        let formatted_output = format!("{}", result);
+
+        assert_eq!(formatted_output, "(123 + (456 * 789))");
 
         Ok(())
     }
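The rewritten parser is driven exactly the way the new unit tests drive it: build a `Tokenizer`, hand it to `Parser::new`, and call `parse()` once per expression. A minimal sketch of that flow — the crate name `my_lang` is a placeholder, and it assumes `parser` and `tokenizer` are public modules of the crate; the `From<String>` constructor and `Display` impls are the ones the tests above rely on:

```rust
use my_lang::{parser::Parser, tokenizer::Tokenizer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build a tokenizer over an in-memory string, exactly as the tests do
    let tokenizer = Tokenizer::from("123 + 456 * 789".to_owned());
    let mut parser = Parser::new(tokenizer);

    // parse() reads the first token and recurses through expression(),
    // binary(), and literal(); Display parenthesizes every node, so this
    // prints "(123 + (456 * 789))"
    let expression = parser.parse()?;
    println!("{}", expression);

    Ok(())
}
```

Because `binary()` always recurses into `expression()` for its right-hand side, every operator parses right-associatively and no precedence is applied — which is presumably why `test_order_of_operations` above only prints its result rather than asserting on it.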
diff --git a/src/parser/tree_node.rs b/src/parser/tree_node.rs
new file mode 100644
index 0000000..a74ed18
--- /dev/null
+++ b/src/parser/tree_node.rs
@@ -0,0 +1,83 @@
+use crate::tokenizer::token::Number;
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum Literal {
+    Number(Number),
+    String(String),
+}
+
+impl std::fmt::Display for Literal {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Literal::Number(n) => write!(f, "{}", n),
+            Literal::String(s) => write!(f, "{}", s),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum BinaryExpression {
+    Add(Box<Expression>, Box<Expression>),
+    Multiply(Box<Expression>, Box<Expression>),
+    Divide(Box<Expression>, Box<Expression>),
+    Subtract(Box<Expression>, Box<Expression>),
+}
+
+impl std::fmt::Display for BinaryExpression {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            BinaryExpression::Add(l, r) => write!(f, "({} + {})", l, r),
+            BinaryExpression::Multiply(l, r) => write!(f, "({} * {})", l, r),
+            BinaryExpression::Divide(l, r) => write!(f, "({} / {})", l, r),
+            BinaryExpression::Subtract(l, r) => write!(f, "({} - {})", l, r),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum LogicalExpression {
+    And(Box<Expression>, Box<Expression>),
+    Or(Box<Expression>, Box<Expression>),
+    Not(Box<Expression>),
+    Equal(Box<Expression>, Box<Expression>),
+    NotEqual(Box<Expression>, Box<Expression>),
+    GreaterThan(Box<Expression>, Box<Expression>),
+    GreaterThanOrEqual(Box<Expression>, Box<Expression>),
+    LessThan(Box<Expression>, Box<Expression>),
+    LessThanOrEqual(Box<Expression>, Box<Expression>),
+}
+
+impl std::fmt::Display for LogicalExpression {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LogicalExpression::And(l, r) => write!(f, "({} && {})", l, r),
+            LogicalExpression::Or(l, r) => write!(f, "({} || {})", l, r),
+            LogicalExpression::Not(e) => write!(f, "(!{})", e),
+            LogicalExpression::Equal(l, r) => write!(f, "({} == {})", l, r),
+            LogicalExpression::NotEqual(l, r) => write!(f, "({} != {})", l, r),
+            LogicalExpression::GreaterThan(l, r) => write!(f, "({} > {})", l, r),
+            LogicalExpression::GreaterThanOrEqual(l, r) => write!(f, "({} >= {})", l, r),
+            LogicalExpression::LessThan(l, r) => write!(f, "({} < {})", l, r),
+            LogicalExpression::LessThanOrEqual(l, r) => write!(f, "({} <= {})", l, r),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Expression {
+    Literal(Literal),
+    Negation(Box<Expression>),
+    BinaryExpression(BinaryExpression),
+    LogicalExpression(LogicalExpression),
+}
+
+impl std::fmt::Display for Expression {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Expression::Literal(l) => write!(f, "{}", l),
+            Expression::Negation(e) => write!(f, "(-{})", e),
+            Expression::BinaryExpression(e) => write!(f, "{}", e),
+            Expression::LogicalExpression(e) => write!(f, "{}", e),
+        }
+    }
+}
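`tree_node` is declared as a private module (`mod tree_node;`), so the tree can only be built by hand from inside `parser` — for example from the parser test module above. A sketch of a test exercising the `Display` contract the parser tests lean on; it assumes `Number::Integer`'s `Display` impl (not shown in this diff) prints the bare value:

```rust
    #[test]
    fn test_display_parenthesizes_nodes() {
        use crate::tokenizer::token::Number;

        // (1 + 2), built by hand rather than through the parser
        let ast = Expression::BinaryExpression(BinaryExpression::Add(
            Box::new(Expression::Literal(Literal::Number(Number::Integer(1)))),
            Box::new(Expression::Literal(Literal::Number(Number::Integer(2)))),
        ));

        // Composite nodes wrap themselves in parentheses; leaves print bare
        assert_eq!(format!("{}", ast), "(1 + 2)");
        assert_eq!(
            format!("{}", Expression::Negation(Box::new(ast))),
            "(-(1 + 2))"
        );
    }
```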
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index ed6ef1c..a712bc3 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1,6 +1,7 @@
 pub mod token;
 
 use std::{
+    collections::VecDeque,
     fs::File,
     io::{BufReader, Cursor, Read, Seek, SeekFrom},
     path::PathBuf,
@@ -123,6 +124,8 @@ where
         Ok(())
     }
 
+    /// Consumes and returns the next token in the stream
+    /// If there are no more tokens in the stream, this function returns None
     pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
         while let Some(next_char) = self.next_char()? {
             // skip whitespace
@@ -167,6 +170,8 @@ where
         }
     }
 
+    /// Peeks the next token in the stream without consuming it
+    /// If there are no more tokens in the stream, this function returns None
    pub fn peek_next(&mut self) -> Result<Option<Token>, TokenizerError> {
         let current_pos = self.reader.stream_position()?;
         let column = self.column.clone();
@@ -408,6 +413,106 @@ where
     }
 }
 
+pub struct TokenizerBuffer<T>
+where
+    T: Read + Seek,
+{
+    tokenizer: Tokenizer<T>,
+    buffer: VecDeque<Token>,
+    history: VecDeque<Token>,
+}
+
+impl<T> TokenizerBuffer<T>
+where
+    T: Seek + Read,
+{
+    pub fn new(tokenizer: Tokenizer<T>) -> Self {
+        Self {
+            tokenizer,
+            buffer: VecDeque::new(),
+            history: VecDeque::with_capacity(128),
+        }
+    }
+
+    /// Reads the next token from the tokenizer, pushing the value to the back of the history
+    /// and returning the token
+    pub fn next(&mut self) -> Result<Option<Token>, TokenizerError> {
+        // Tokens pushed back by an earlier backwards seek are replayed first
+        if let Some(token) = self.buffer.pop_front() {
+            self.history.push_back(token.clone());
+            return Ok(Some(token));
+        }
+
+        let token = self.tokenizer.next_token()?;
+        if let Some(ref token) = token {
+            self.history.push_back(token.clone());
+        }
+        Ok(token)
+    }
+
+    /// Peeks the next token in the stream without adding to the history stack
+    pub fn peek(&mut self) -> Result<Option<Token>, TokenizerError> {
+        if let Some(token) = self.buffer.front() {
+            return Ok(Some(token.clone()));
+        }
+
+        self.tokenizer.peek_next()
+    }
+
+    fn seek_from_start(&mut self, pos: usize) -> Result<(), TokenizerError> {
+        // The current position is the number of tokens consumed so far, i.e. the
+        // length of the history stack, so a seek from the start is a relative
+        // seek by the difference
+        self.seek_from_current(pos as i64 - self.history.len() as i64)
+    }
+
+    fn seek_from_current(&mut self, seek_to: i64) -> Result<(), TokenizerError> {
+        // if seek_to > 0, drain the buffer first, then read from the tokenizer
+        // if seek_to < 0, pop from the history and push to the front of the buffer;
+        //     if there is not enough history, we throw (we reached the front of the history)
+        // if seek_to == 0, there is nothing to do
+
+        if seek_to > 0 {
+            for _ in 0..seek_to {
+                // Prefer tokens pushed back by a previous backwards seek over
+                // reading new tokens from the tokenizer
+                let next = match self.buffer.pop_front() {
+                    Some(token) => Some(token),
+                    None => self.tokenizer.next_token()?,
+                };
+                if let Some(token) = next {
+                    self.history.push_back(token);
+                } else {
+                    return Err(TokenizerError::IOError(std::io::Error::new(
+                        std::io::ErrorKind::UnexpectedEof,
+                        "Unexpected EOF",
+                    )));
+                }
+            }
+        } else if seek_to < 0 {
+            let seek_to = seek_to.unsigned_abs() as usize;
+            for _ in 0..seek_to {
+                if let Some(token) = self.history.pop_back() {
+                    // Pushing in pop order keeps the earliest token at the front of
+                    // the buffer, ahead of tokens from any earlier backwards seek
+                    self.buffer.push_front(token);
+                } else {
+                    return Err(TokenizerError::IOError(std::io::Error::new(
+                        std::io::ErrorKind::UnexpectedEof,
+                        "Reached the beginning of the token history",
+                    )));
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Adds to or removes from the history stack, allowing the caller to move back and forth in the stream
+    pub fn seek(&mut self, from: SeekFrom) -> Result<(), TokenizerError> {
+        match from {
+            SeekFrom::Start(pos) => self.seek_from_start(pos as usize),
+            SeekFrom::Current(seek_to) => self.seek_from_current(seek_to),
+            SeekFrom::End(_) => unimplemented!("SeekFrom::End will not be implemented"),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -422,6 +527,35 @@ mod tests {
     }
     "#;
 
+    #[test]
+    fn test_tokenizer_buffer_seek_from_start() -> Result<()> {
+        let tokenizer = Tokenizer::from(TEST_STRING.to_owned());
+        let mut buffer = TokenizerBuffer::new(tokenizer);
+
+        let token = buffer.next()?;
+        assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Fn));
+
+        let token = buffer.next()?;
+        assert_eq!(
+            token.unwrap().token_type,
+            TokenType::Identifier(String::from("test"))
+        );
+
+        buffer.seek(SeekFrom::Start(0))?;
+
+        let token = buffer.next()?;
+
+        assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Fn));
+
+        buffer.seek(SeekFrom::Start(16))?;
+
+        let token = buffer.next()?;
+
+        assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Let));
+
+        Ok(())
+    }
+
     #[test]
     fn test_tokenizer_from_path_ok() {
         let tokenizer = Tokenizer::from_path(TEST_FILE);
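`TokenizerBuffer` measures positions in tokens consumed — the length of the history stack — not in bytes, so `SeekFrom::Start(16)` in the test above means "the state after sixteen tokens". A sketch of the rewind pattern a parser could build on top of `next` and `seek`, assuming the types above (`peek_by_rewinding` is a hypothetical helper, not part of the diff):

```rust
use std::io::{Read, Seek, SeekFrom};

/// One token of lookahead with rollback: read a token, then seek back one so
/// the following call to next() yields the same token again
fn peek_by_rewinding<T: Read + Seek>(
    buffer: &mut TokenizerBuffer<T>,
) -> Result<Option<Token>, TokenizerError> {
    let token = buffer.next()?; // consumed: pushed onto the history stack
    if token.is_some() {
        // Pops the history and pushes onto the front of the buffer, so the
        // token is replayed rather than re-read from the underlying reader
        buffer.seek(SeekFrom::Current(-1))?;
    }
    Ok(token)
}
```

Because `seek_from_current` drains the pushed-back buffer before pulling fresh tokens from the tokenizer, seeking backwards and then forwards replays the same tokens instead of skipping or duplicating them.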
diff --git a/src/tokenizer/token.rs b/src/tokenizer/token.rs
index 58db8db..46465dd 100644
--- a/src/tokenizer/token.rs
+++ b/src/tokenizer/token.rs
@@ -1,4 +1,4 @@
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 pub struct Token {
     /// The type of the token
     pub token_type: TokenType,
@@ -18,7 +18,7 @@ impl Token {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone)]
 pub enum TokenType {
     /// Represents a string token
     String(String),
@@ -50,7 +50,7 @@ impl std::fmt::Display for TokenType {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
 pub enum Number {
     /// Represents an integer number
     Integer(u64),
@@ -67,7 +67,7 @@ impl std::fmt::Display for Number {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
 pub enum Symbol {
     // Single Character Symbols
     /// Represents the `(` symbol
@@ -157,7 +157,7 @@ impl Symbol {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
 pub enum Keyword {
     /// Represents the `let` keyword
     Let,
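The new derives tie the two diffs above together: `Token: Clone` is what lets `TokenizerBuffer` keep copies on its history stack and lets the parser embed the offending token in `ParseError::UnexpectedToken`, while `Copy` on `Symbol` is what allows `Parser::binary` to bind the operator out of `self.current_token` without moving the token. A small sketch of that last point (`operator_of` is a hypothetical helper, not part of the diff):

```rust
use crate::tokenizer::token::{Symbol, Token, TokenType};

/// Extract the operator from a borrowed token without taking ownership of it;
/// the binding copies the Symbol out because Symbol is Copy
fn operator_of(token: &Token) -> Option<Symbol> {
    match token.token_type {
        TokenType::Symbol(symbol) if symbol.is_operator() => Some(symbol),
        _ => None,
    }
}
```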