WIP: AST generation

This commit is contained in:
2024-11-20 16:52:40 -07:00
parent 609e7ccdb3
commit f756e3e29f
4 changed files with 388 additions and 96 deletions

View File

@@ -1,135 +1,210 @@
use thiserror::Error; mod tree_node;
use crate::tokenizer::{ use crate::tokenizer::{
token::{Keyword, Number, Symbol, Token, TokenType}, token::{Symbol, Token, TokenType},
Tokenizer, TokenizerError, Tokenizer, TokenizerError,
}; };
use std::io::{Read, Seek}; use std::{
collections::VecDeque,
io::{Read, Seek},
};
use thiserror::Error;
use tree_node::*;
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum ParseError { pub enum ParseError {
#[error("{0}")] #[error(transparent)]
TokenizerError(#[from] TokenizerError), TokenizerError(#[from] TokenizerError),
#[error("Unexpected EOF\n\nLine: {0}, Column: {1}", token.line, token.column)]
UnexpectedEOF { token: Token },
#[error("Unexpected token\n\nLine: {0}, Column: {1}\nToken: {2}", token.line, token.column, token.token_type)] #[error("Unexpected token\n\nLine: {0}, Column: {1}\nToken: {2}", token.line, token.column, token.token_type)]
UnexpectedToken { token: Token }, UnexpectedToken { token: Token },
#[error("Unexpected EOF")]
UnexpectedEOF,
#[error("An unknown error has occurred")] #[error("An unknown error has occurred")]
UnknownError, UnknownError,
} }
#[derive(Debug)] pub struct Parser<R: Read + Seek> {
enum Literal { tokenizer: Tokenizer<R>,
Number(Number), current_token: Option<Token>,
String(String),
Boolean(bool),
} }
#[derive(Debug)] impl<R> Parser<R>
struct Identifier(String);
#[derive(Debug)]
pub enum Expression {
Declaration {
identifier: Identifier,
value: Box<Expression>,
},
Assignment {
identifier: Identifier,
value: Box<Expression>,
},
Binary {
left: Box<Expression>,
operator: Symbol,
right: Box<Expression>,
},
Literal(Literal),
}
pub struct Parser<T>
where where
T: Read + Seek, R: Read + Seek,
{ {
tokenizer: Tokenizer<T>, pub fn new(tokenizer: Tokenizer<R>) -> Self {
} Parser {
tokenizer,
impl<T> Parser<T> current_token: None,
where }
T: Read + Seek,
{
pub fn new(tokenizer: Tokenizer<T>) -> Self {
Self { tokenizer }
} }
pub fn parse(&mut self) -> Result<Option<Expression>, ParseError> { pub fn parse(&mut self) -> Result<tree_node::Expression, ParseError> {
while let Some(token) = self.tokenizer.next_token()? { self.current_token = self.tokenizer.next_token()?;
match token.token_type { self.expression()
TokenType::Number(n) => { }
if let Some(Token {
token_type: TokenType::Symbol(s), fn expression(&mut self) -> Result<tree_node::Expression, ParseError> {
.. let current_token = self
}) = self.tokenizer.peek_next()? .current_token
.as_ref()
.ok_or(ParseError::UnknownError)?;
Ok(match current_token.token_type {
// Match a number or string literal as long as the next token is not an operator
TokenType::Number(_) | TokenType::String(_)
if !matches!(
self.tokenizer.peek_next()?, Some(Token { token_type: TokenType::Symbol(e), .. }) if e.is_operator()
) =>
{ {
if s.is_operator() { Expression::Literal(self.literal()?)
self.tokenizer.next_token()?;
return Ok(Some(Expression::Binary {
left: Box::new(Expression::Literal(Literal::Number(n))),
operator: s,
right: Box::new(self.parse()?.ok_or(ParseError::UnknownError)?),
}));
}
} else {
return Ok(Some(Expression::Literal(Literal::Number(n))));
}
}
_ => return Err(ParseError::UnexpectedToken { token }),
}
}
return Err(ParseError::UnknownError);
} }
fn parse_declaration(&mut self) -> Result<Expression, ParseError> { // Match a negation operator
let identifier = match self.tokenizer.next_token()? { TokenType::Symbol(Symbol::Minus) => Expression::Negation(Box::new(self.parse()?)),
Some(token) => match token.token_type {
TokenType::Identifier(i) => Identifier(i), _ if matches!(self.tokenizer.peek_next()?, Some(Token { token_type: TokenType::Symbol(e), .. }) if e.is_operator()) => {
_ => return Err(ParseError::UnexpectedToken { token }), Expression::BinaryExpression(self.binary()?)
}
// Something went wrong. Return an error
_ => {
return Err(ParseError::UnexpectedToken {
token: current_token.clone(),
})
}
})
}
fn binary(&mut self) -> Result<tree_node::BinaryExpression, ParseError> {
let literal = self.literal()?;
let Some(Token {
token_type: TokenType::Symbol(operator),
..
}) = self.current_token
else {
return Err(ParseError::UnknownError);
};
self.current_token = self.tokenizer.next_token()?;
Ok(match operator {
Symbol::Plus => BinaryExpression::Add(
Box::new(Expression::Literal(literal)),
Box::new(self.expression()?),
),
Symbol::Asterisk => BinaryExpression::Multiply(
Box::new(Expression::Literal(literal)),
Box::new(self.expression()?),
),
Symbol::Slash => BinaryExpression::Divide(
Box::new(Expression::Literal(literal)),
Box::new(self.expression()?),
),
Symbol::Minus => BinaryExpression::Subtract(
Box::new(Expression::Literal(literal)),
Box::new(self.expression()?),
),
_ => {
return Err(ParseError::UnexpectedToken {
token: Token {
token_type: TokenType::Symbol(operator),
line: 0,
column: 0,
}, },
None => return Err(ParseError::UnknownError), })
}
})
}
fn literal(&mut self) -> Result<tree_node::Literal, ParseError> {
let current_token = self
.current_token
.as_ref()
.ok_or(ParseError::UnknownError)?;
let to_return = match current_token.token_type {
TokenType::Number(ref number) => tree_node::Literal::Number(number.clone()),
TokenType::String(ref string) => tree_node::Literal::String(string.clone()),
_ => {
return Err(ParseError::UnexpectedToken {
token: current_token.clone(),
})
}
}; };
return Ok(Expression::Declaration { self.current_token = self.tokenizer.next_token()?;
identifier, Ok(to_return)
value: Box::new(self.parse()?.ok_or(ParseError::UnknownError)?),
});
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::tree_node::*;
use super::*; use super::*;
use anyhow::Result; use anyhow::Result;
#[test] #[test]
fn test_parser() -> Result<()> { fn test_add_expr() -> Result<()> {
let input = r#" let input = "123 + 456";
5.3245 + 5
let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
let result = parser.parse()?;
let formatted_output = format!("{}", result);
assert_eq!(formatted_output, "(123 + 456)");
Ok(())
}
45 - 2 #[test]
"#; fn test_parse_number() -> Result<()> {
let input = "123";
let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
let result = parser.parse()?;
let tokenizer = Tokenizer::from(input.to_owned()); let formatted_output = format!("{}", result);
let mut parser = Parser::new(tokenizer);
let expr = parser.parse()?; assert_eq!(formatted_output, "123");
println!("{:?}", expr); Ok(())
}
let expr = parser.parse()?; #[test]
fn test_parse_negation() -> Result<()> {
let input = "-123";
let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
let result = parser.parse()?;
println!("{:?}", expr); let formatted_output = format!("{}", result);
assert_eq!(formatted_output, "(-123)");
Ok(())
}
#[test]
fn test_order_of_operations() -> Result<()> {
let input = "123 - 456 + 789";
let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
let result = parser.parse()?;
let formatted_output = format!("{}", result);
println!("{}", formatted_output);
Ok(())
}
#[test]
fn test_chained_operators() -> Result<()> {
let input = "123 + 456 * 789";
let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
let result = parser.parse()?;
let formatted_output = format!("{}", result);
assert_eq!(formatted_output, "(123 + (456 * 789))");
Ok(()) Ok(())
} }

83
src/parser/tree_node.rs Normal file
View File

@@ -0,0 +1,83 @@
use crate::tokenizer::token::Number;
/// A literal value lifted directly from a single token.
///
/// `Clone` is derived for consistency with the token types (which gained
/// `Clone` in this change), so AST nodes holding literals can be duplicated.
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum Literal {
    /// A numeric literal (integer or float, per the tokenizer's `Number`)
    Number(Number),
    /// A string literal
    String(String),
}
impl std::fmt::Display for Literal {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Literal::Number(n) => write!(f, "{}", n),
Literal::String(s) => write!(f, "{}", s),
}
}
}
/// An arithmetic operation over two sub-expressions.
///
/// Each variant holds the left and right operands, in that order.
#[derive(Debug, PartialEq, Eq)]
pub enum BinaryExpression {
    /// `lhs + rhs`
    Add(Box<Expression>, Box<Expression>),
    /// `lhs * rhs`
    Multiply(Box<Expression>, Box<Expression>),
    /// `lhs / rhs`
    Divide(Box<Expression>, Box<Expression>),
    /// `lhs - rhs`
    Subtract(Box<Expression>, Box<Expression>),
}
impl std::fmt::Display for BinaryExpression {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
BinaryExpression::Add(l, r) => write!(f, "({} + {})", l, r),
BinaryExpression::Multiply(l, r) => write!(f, "({} * {})", l, r),
BinaryExpression::Divide(l, r) => write!(f, "({} / {})", l, r),
BinaryExpression::Subtract(l, r) => write!(f, "({} - {})", l, r),
}
}
}
/// A boolean or comparison operation over sub-expressions.
///
/// All variants are binary (left operand, right operand) except [`LogicalExpression::Not`].
#[derive(Debug, PartialEq, Eq)]
pub enum LogicalExpression {
    /// `lhs && rhs`
    And(Box<Expression>, Box<Expression>),
    /// `lhs || rhs`
    Or(Box<Expression>, Box<Expression>),
    /// `!expr` — the only unary variant
    Not(Box<Expression>),
    /// `lhs == rhs`
    Equal(Box<Expression>, Box<Expression>),
    /// `lhs != rhs`
    NotEqual(Box<Expression>, Box<Expression>),
    /// `lhs > rhs`
    GreaterThan(Box<Expression>, Box<Expression>),
    /// `lhs >= rhs`
    GreaterThanOrEqual(Box<Expression>, Box<Expression>),
    /// `lhs < rhs`
    LessThan(Box<Expression>, Box<Expression>),
    /// `lhs <= rhs`
    LessThanOrEqual(Box<Expression>, Box<Expression>),
}
impl std::fmt::Display for LogicalExpression {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
LogicalExpression::And(l, r) => write!(f, "({} && {})", l, r),
LogicalExpression::Or(l, r) => write!(f, "({} || {})", l, r),
LogicalExpression::Not(e) => write!(f, "(!{})", e),
LogicalExpression::Equal(l, r) => write!(f, "({} == {})", l, r),
LogicalExpression::NotEqual(l, r) => write!(f, "({} != {})", l, r),
LogicalExpression::GreaterThan(l, r) => write!(f, "({} > {})", l, r),
LogicalExpression::GreaterThanOrEqual(l, r) => write!(f, "({} >= {})", l, r),
LogicalExpression::LessThan(l, r) => write!(f, "({} < {})", l, r),
LogicalExpression::LessThanOrEqual(l, r) => write!(f, "({} <= {})", l, r),
}
}
}
/// A node in the parsed abstract syntax tree.
#[derive(Debug, PartialEq, Eq)]
pub enum Expression {
    /// A literal number or string value
    Literal(Literal),
    /// Arithmetic negation of a sub-expression (`-expr`)
    Negation(Box<Expression>),
    /// An arithmetic operation (`+`, `*`, `/`, `-`)
    BinaryExpression(BinaryExpression),
    /// A boolean or comparison operation
    LogicalExpression(LogicalExpression),
}
impl std::fmt::Display for Expression {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Expression::Literal(l) => write!(f, "{}", l),
Expression::Negation(e) => write!(f, "(-{})", e),
Expression::BinaryExpression(e) => write!(f, "{}", e),
Expression::LogicalExpression(e) => write!(f, "{}", e),
}
}
}

View File

@@ -1,6 +1,7 @@
pub mod token; pub mod token;
use std::{ use std::{
collections::VecDeque,
fs::File, fs::File,
io::{BufReader, Cursor, Read, Seek, SeekFrom}, io::{BufReader, Cursor, Read, Seek, SeekFrom},
path::PathBuf, path::PathBuf,
@@ -123,6 +124,8 @@ where
Ok(()) Ok(())
} }
/// Consumes the tokenizer and returns the next token in the stream
/// If there are no more tokens in the stream, this function returns None
pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> { pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
while let Some(next_char) = self.next_char()? { while let Some(next_char) = self.next_char()? {
// skip whitespace // skip whitespace
@@ -167,6 +170,8 @@ where
} }
} }
/// Peeks the next token in the stream without consuming it
/// If there are no more tokens in the stream, this function returns None
pub fn peek_next(&mut self) -> Result<Option<Token>, TokenizerError> { pub fn peek_next(&mut self) -> Result<Option<Token>, TokenizerError> {
let current_pos = self.reader.stream_position()?; let current_pos = self.reader.stream_position()?;
let column = self.column.clone(); let column = self.column.clone();
@@ -408,6 +413,106 @@ where
} }
} }
/// A buffered wrapper around a [`Tokenizer`] that records every token handed
/// out, so the stream position can be moved backwards and forwards again
/// via [`TokenizerBuffer::seek`].
pub struct TokenizerBuffer<T>
where
    T: Read + Seek,
{
    tokenizer: Tokenizer<T>,
    // Tokens that were rewound past; replayed (front first) before any new
    // token is pulled from the tokenizer.
    buffer: VecDeque<Token>,
    // Every token consumed so far, in stream order, newest at the back.
    history: VecDeque<Token>,
}
impl<T> TokenizerBuffer<T>
where
    T: Seek + Read,
{
    /// Creates a new `TokenizerBuffer` wrapping the given tokenizer.
    pub fn new(tokenizer: Tokenizer<T>) -> Self {
        Self {
            tokenizer,
            buffer: VecDeque::new(),
            history: VecDeque::with_capacity(128),
        }
    }

    /// Reads the next token from the tokenizer, pushing the value to the back of the history
    /// and returning the token
    pub fn next(&mut self) -> Result<Option<Token>, TokenizerError> {
        // Replay rewound tokens before pulling fresh ones from the tokenizer.
        if let Some(token) = self.buffer.pop_front() {
            self.history.push_back(token.clone());
            return Ok(Some(token));
        }
        let token = self.tokenizer.next_token()?;
        if let Some(ref token) = token {
            self.history.push_back(token.clone());
        }
        Ok(token)
    }

    /// Peeks the next token in the stream without adding to the history stack
    pub fn peek(&mut self) -> Result<Option<Token>, TokenizerError> {
        if let Some(token) = self.buffer.front() {
            return Ok(Some(token.clone()));
        }
        Ok(self.tokenizer.peek_next()?)
    }

    /// Seeks to an absolute position, measured in tokens from the start of the stream.
    ///
    /// The number of tokens consumed so far is exactly `history.len()`, so an
    /// absolute seek reduces to a relative seek by the difference.
    /// (Previously this was an unimplemented stub that silently returned `Ok(())`.)
    fn seek_from_start(&mut self, pos: usize) -> Result<(), TokenizerError> {
        let current = self.history.len();
        self.seek_from_current(pos as i64 - current as i64)
    }

    fn seek_from_current(&mut self, seek_to: i64) -> Result<(), TokenizerError> {
        // seek_to > 0: consume tokens forward, draining the replay buffer first.
        // seek_to < 0: pop from the history onto the front of the buffer; error
        //              if the history runs out (front of the stream reached).
        // seek_to == 0: nothing to do.
        if seek_to > 0 {
            for _ in 0..seek_to {
                // Tokens replayed from the buffer count as consumed again, so
                // they go back onto the history just like freshly-read tokens.
                // (Reading straight from the tokenizer here would silently skip
                // any tokens still sitting in the buffer after a backward seek.)
                let token = match self.buffer.pop_front() {
                    Some(token) => Some(token),
                    None => self.tokenizer.next_token()?,
                };
                match token {
                    Some(token) => self.history.push_back(token),
                    None => {
                        return Err(TokenizerError::IOError(std::io::Error::new(
                            std::io::ErrorKind::UnexpectedEof,
                            "Unexpected EOF",
                        )))
                    }
                }
            }
        } else if seek_to < 0 {
            for _ in 0..seek_to.unsigned_abs() {
                match self.history.pop_back() {
                    // push_front preserves stream order and stays correct even
                    // when the buffer already holds tokens from an earlier
                    // backward seek (extending at the back would interleave
                    // older tokens behind newer ones).
                    Some(token) => self.buffer.push_front(token),
                    None => {
                        return Err(TokenizerError::IOError(std::io::Error::new(
                            std::io::ErrorKind::UnexpectedEof,
                            "Unexpected EOF",
                        )))
                    }
                }
            }
        }
        Ok(())
    }

    /// Adds to or removes from the History stack, allowing the user to move back and forth in the stream
    pub fn seek(&mut self, from: SeekFrom) -> Result<(), TokenizerError> {
        match from {
            SeekFrom::Start(pos) => self.seek_from_start(pos as usize),
            SeekFrom::Current(seek_to) => self.seek_from_current(seek_to),
            SeekFrom::End(_) => unimplemented!("SeekFrom::End will not be implemented"),
        }
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@@ -422,6 +527,35 @@ mod tests {
} }
"#; "#;
// Exercises absolute (SeekFrom::Start) seeking on the TokenizerBuffer:
// consume two tokens, rewind to the beginning, then jump forward.
//
// NOTE(review): seek_from_start is currently an empty stub that returns
// Ok(()), so the assertions after the first seek() call cannot pass until
// it is implemented — confirm before relying on this test.
#[test]
fn test_tokenizer_buffer_seek_from_start() -> Result<()> {
    let tokenizer = Tokenizer::from(TEST_STRING.to_owned());
    let mut buffer = TokenizerBuffer::new(tokenizer);
    // First two tokens of TEST_STRING: `fn test`.
    let token = buffer.next()?;
    assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Fn));
    let token = buffer.next()?;
    assert_eq!(
        token.unwrap().token_type,
        TokenType::Identifier(String::from("test"))
    );
    // Rewind to the very start; the next token should be `fn` again.
    buffer.seek(SeekFrom::Start(0))?;
    let token = buffer.next()?;
    assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Fn));
    // Jump to an absolute token position — presumably position 16 lands on a
    // `let` in TEST_STRING; verify against the fixture.
    buffer.seek(SeekFrom::Start(16))?;
    let token = buffer.next()?;
    assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Let));
    Ok(())
}
#[test] #[test]
fn test_tokenizer_from_path_ok() { fn test_tokenizer_from_path_ok() {
let tokenizer = Tokenizer::from_path(TEST_FILE); let tokenizer = Tokenizer::from_path(TEST_FILE);

View File

@@ -1,4 +1,4 @@
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq, Clone)]
pub struct Token { pub struct Token {
/// The type of the token /// The type of the token
pub token_type: TokenType, pub token_type: TokenType,
@@ -18,7 +18,7 @@ impl Token {
} }
} }
#[derive(Debug, PartialEq, Hash, Eq)] #[derive(Debug, PartialEq, Hash, Eq, Clone)]
pub enum TokenType { pub enum TokenType {
/// Represents a string token /// Represents a string token
String(String), String(String),
@@ -50,7 +50,7 @@ impl std::fmt::Display for TokenType {
} }
} }
#[derive(Debug, PartialEq, Hash, Eq)] #[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
pub enum Number { pub enum Number {
/// Represents an integer number /// Represents an integer number
Integer(u64), Integer(u64),
@@ -67,7 +67,7 @@ impl std::fmt::Display for Number {
} }
} }
#[derive(Debug, PartialEq, Hash, Eq)] #[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
pub enum Symbol { pub enum Symbol {
// Single Character Symbols // Single Character Symbols
/// Represents the `(` symbol /// Represents the `(` symbol
@@ -157,7 +157,7 @@ impl Symbol {
} }
} }
#[derive(Debug, PartialEq, Hash, Eq)] #[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
pub enum Keyword { pub enum Keyword {
/// Represents the `let` keyword /// Represents the `let` keyword
Let, Let,