wip AST generation
@@ -1,135 +1,210 @@
-use thiserror::Error;
+mod tree_node;
 
 use crate::tokenizer::{
-    token::{Keyword, Number, Symbol, Token, TokenType},
+    token::{Symbol, Token, TokenType},
     Tokenizer, TokenizerError,
 };
-use std::io::{Read, Seek};
+use std::{
+    collections::VecDeque,
+    io::{Read, Seek},
+};
+use thiserror::Error;
+use tree_node::*;
 
 #[derive(Debug, Error)]
 pub enum ParseError {
-    #[error("{0}")]
+    #[error(transparent)]
     TokenizerError(#[from] TokenizerError),
+    #[error("Unexpected EOF\n\nLine: {}, Column: {}", .token.line, .token.column)]
+    UnexpectedEOF { token: Token },
     #[error("Unexpected token\n\nLine: {}, Column: {}\nToken: {}", .token.line, .token.column, .token.token_type)]
     UnexpectedToken { token: Token },
-    #[error("Unexpected EOF")]
-    UnexpectedEOF,
     #[error("An unknown error has occurred")]
     UnknownError,
 }
 
-#[derive(Debug)]
-enum Literal {
-    Number(Number),
-    String(String),
-    Boolean(bool),
-}
-
-#[derive(Debug)]
-struct Identifier(String);
-
-#[derive(Debug)]
-pub enum Expression {
-    Declaration {
-        identifier: Identifier,
-        value: Box<Expression>,
-    },
-    Assignment {
-        identifier: Identifier,
-        value: Box<Expression>,
-    },
-    Binary {
-        left: Box<Expression>,
-        operator: Symbol,
-        right: Box<Expression>,
-    },
-    Literal(Literal),
-}
-
-pub struct Parser<T>
-where
-    T: Read + Seek,
-{
-    tokenizer: Tokenizer<T>,
+pub struct Parser<R: Read + Seek> {
+    tokenizer: Tokenizer<R>,
+    current_token: Option<Token>,
 }
 
-impl<T> Parser<T>
+impl<R> Parser<R>
 where
-    T: Read + Seek,
+    R: Read + Seek,
 {
-    pub fn new(tokenizer: Tokenizer<T>) -> Self {
-        Self { tokenizer }
+    pub fn new(tokenizer: Tokenizer<R>) -> Self {
+        Parser {
+            tokenizer,
+            current_token: None,
+        }
     }
 
-    pub fn parse(&mut self) -> Result<Option<Expression>, ParseError> {
-        while let Some(token) = self.tokenizer.next_token()? {
-            match token.token_type {
-                TokenType::Number(n) => {
-                    if let Some(Token {
-                        token_type: TokenType::Symbol(s),
-                        ..
-                    }) = self.tokenizer.peek_next()?
-                    {
-                        if s.is_operator() {
-                            self.tokenizer.next_token()?;
-                            return Ok(Some(Expression::Binary {
-                                left: Box::new(Expression::Literal(Literal::Number(n))),
-                                operator: s,
-                                right: Box::new(self.parse()?.ok_or(ParseError::UnknownError)?),
-                            }));
-                        }
-                    } else {
-                        return Ok(Some(Expression::Literal(Literal::Number(n))));
-                    }
-                }
-                _ => return Err(ParseError::UnexpectedToken { token }),
-            }
-        }
-        return Err(ParseError::UnknownError);
-    }
-
-    fn parse_declaration(&mut self) -> Result<Expression, ParseError> {
-        let identifier = match self.tokenizer.next_token()? {
-            Some(token) => match token.token_type {
-                TokenType::Identifier(i) => Identifier(i),
-                _ => return Err(ParseError::UnexpectedToken { token }),
-            },
-            None => return Err(ParseError::UnknownError),
-        };
-
-        return Ok(Expression::Declaration {
-            identifier,
-            value: Box::new(self.parse()?.ok_or(ParseError::UnknownError)?),
-        });
+    pub fn parse(&mut self) -> Result<tree_node::Expression, ParseError> {
+        self.current_token = self.tokenizer.next_token()?;
+        self.expression()
+    }
+
+    fn expression(&mut self) -> Result<tree_node::Expression, ParseError> {
+        let current_token = self
+            .current_token
+            .as_ref()
+            .ok_or(ParseError::UnknownError)?;
+
+        Ok(match current_token.token_type {
+            // Match a number or string literal as long as the next token is not an operator
+            TokenType::Number(_) | TokenType::String(_)
+                if !matches!(
+                    self.tokenizer.peek_next()?, Some(Token { token_type: TokenType::Symbol(e), .. }) if e.is_operator()
+                ) =>
+            {
+                Expression::Literal(self.literal()?)
+            }
+
+            // Match a negation operator
+            TokenType::Symbol(Symbol::Minus) => Expression::Negation(Box::new(self.parse()?)),
+
+            _ if matches!(self.tokenizer.peek_next()?, Some(Token { token_type: TokenType::Symbol(e), .. }) if e.is_operator()) => {
+                Expression::BinaryExpression(self.binary()?)
+            }
+
+            // Something went wrong. Return an error
+            _ => {
+                return Err(ParseError::UnexpectedToken {
+                    token: current_token.clone(),
+                })
+            }
+        })
+    }
+
+    fn binary(&mut self) -> Result<tree_node::BinaryExpression, ParseError> {
+        let literal = self.literal()?;
+
+        let Some(Token {
+            token_type: TokenType::Symbol(operator),
+            ..
+        }) = self.current_token
+        else {
+            return Err(ParseError::UnknownError);
+        };
+        self.current_token = self.tokenizer.next_token()?;
+
+        Ok(match operator {
+            Symbol::Plus => BinaryExpression::Add(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            Symbol::Asterisk => BinaryExpression::Multiply(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            Symbol::Slash => BinaryExpression::Divide(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            Symbol::Minus => BinaryExpression::Subtract(
+                Box::new(Expression::Literal(literal)),
+                Box::new(self.expression()?),
+            ),
+            _ => {
+                return Err(ParseError::UnexpectedToken {
+                    token: Token {
+                        token_type: TokenType::Symbol(operator),
+                        line: 0,
+                        column: 0,
+                    },
+                })
+            }
+        })
+    }
+
+    fn literal(&mut self) -> Result<tree_node::Literal, ParseError> {
+        let current_token = self
+            .current_token
+            .as_ref()
+            .ok_or(ParseError::UnknownError)?;
+
+        let to_return = match current_token.token_type {
+            TokenType::Number(ref number) => tree_node::Literal::Number(number.clone()),
+            TokenType::String(ref string) => tree_node::Literal::String(string.clone()),
+            _ => {
+                return Err(ParseError::UnexpectedToken {
+                    token: current_token.clone(),
+                })
+            }
+        };
+
+        self.current_token = self.tokenizer.next_token()?;
+        Ok(to_return)
     }
 }
 
 #[cfg(test)]
 mod tests {
+    use super::tree_node::*;
     use super::*;
     use anyhow::Result;
 
     #[test]
-    fn test_parser() -> Result<()> {
-        let input = r#"
-        5.3245 + 5
-        45 - 2
-        "#;
-
-        let tokenizer = Tokenizer::from(input.to_owned());
-        let mut parser = Parser::new(tokenizer);
-
-        let expr = parser.parse()?;
-        println!("{:?}", expr);
-
-        let expr = parser.parse()?;
-        println!("{:?}", expr);
+    fn test_add_expr() -> Result<()> {
+        let input = "123 + 456";
+
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+
+        let result = parser.parse()?;
+        let formatted_output = format!("{}", result);
+
+        assert_eq!(formatted_output, "(123 + 456)");
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_number() -> Result<()> {
+        let input = "123";
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
+
+        let formatted_output = format!("{}", result);
+
+        assert_eq!(formatted_output, "123");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_negation() -> Result<()> {
+        let input = "-123";
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
+
+        let formatted_output = format!("{}", result);
+
+        assert_eq!(formatted_output, "(-123)");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_order_of_operations() -> Result<()> {
+        let input = "123 - 456 + 789";
+
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
+
+        let formatted_output = format!("{}", result);
+        println!("{}", formatted_output);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_chained_operators() -> Result<()> {
+        let input = "123 + 456 * 789";
+        let mut parser = Parser::new(Tokenizer::from(input.to_owned()));
+        let result = parser.parse()?;
+
+        let formatted_output = format!("{}", result);
+
+        assert_eq!(formatted_output, "(123 + (456 * 789))");
+
         Ok(())
     }
 }
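Worth spelling out about the shape of these ASTs: `binary()` hands its right operand back to `expression()`, so every operator chain nests to the right regardless of precedence. That is why `test_chained_operators` can assert `(123 + (456 * 789))`, and also why `test_order_of_operations` only prints its result: by the same recursion, `123 - 456 + 789` should come back as `(123 - (456 + 789))`, which is not what left-to-right arithmetic gives. A minimal sketch of that expectation, using only the API in this diff, inside a test returning anyhow::Result (the asserted string in the second case is inferred from the recursion, not taken from the test suite):

    // Sketch: right-recursion in binary() shapes the tree.
    let mut parser = Parser::new(Tokenizer::from("123 + 456 * 789".to_owned()));
    assert_eq!(format!("{}", parser.parse()?), "(123 + (456 * 789))");

    // Same nesting rule applied to subtraction; inferred, not asserted by the suite:
    let mut parser = Parser::new(Tokenizer::from("123 - 456 + 789".to_owned()));
    assert_eq!(format!("{}", parser.parse()?), "(123 - (456 + 789))");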
src/parser/tree_node.rs (new file, 83 lines)
@@ -0,0 +1,83 @@
+use crate::tokenizer::token::Number;
+
+#[derive(Debug, Eq, PartialEq)]
+pub enum Literal {
+    Number(Number),
+    String(String),
+}
+
+impl std::fmt::Display for Literal {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Literal::Number(n) => write!(f, "{}", n),
+            Literal::String(s) => write!(f, "{}", s),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum BinaryExpression {
+    Add(Box<Expression>, Box<Expression>),
+    Multiply(Box<Expression>, Box<Expression>),
+    Divide(Box<Expression>, Box<Expression>),
+    Subtract(Box<Expression>, Box<Expression>),
+}
+
+impl std::fmt::Display for BinaryExpression {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            BinaryExpression::Add(l, r) => write!(f, "({} + {})", l, r),
+            BinaryExpression::Multiply(l, r) => write!(f, "({} * {})", l, r),
+            BinaryExpression::Divide(l, r) => write!(f, "({} / {})", l, r),
+            BinaryExpression::Subtract(l, r) => write!(f, "({} - {})", l, r),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum LogicalExpression {
+    And(Box<Expression>, Box<Expression>),
+    Or(Box<Expression>, Box<Expression>),
+    Not(Box<Expression>),
+    Equal(Box<Expression>, Box<Expression>),
+    NotEqual(Box<Expression>, Box<Expression>),
+    GreaterThan(Box<Expression>, Box<Expression>),
+    GreaterThanOrEqual(Box<Expression>, Box<Expression>),
+    LessThan(Box<Expression>, Box<Expression>),
+    LessThanOrEqual(Box<Expression>, Box<Expression>),
+}
+
+impl std::fmt::Display for LogicalExpression {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LogicalExpression::And(l, r) => write!(f, "({} && {})", l, r),
+            LogicalExpression::Or(l, r) => write!(f, "({} || {})", l, r),
+            LogicalExpression::Not(e) => write!(f, "(!{})", e),
+            LogicalExpression::Equal(l, r) => write!(f, "({} == {})", l, r),
+            LogicalExpression::NotEqual(l, r) => write!(f, "({} != {})", l, r),
+            LogicalExpression::GreaterThan(l, r) => write!(f, "({} > {})", l, r),
+            LogicalExpression::GreaterThanOrEqual(l, r) => write!(f, "({} >= {})", l, r),
+            LogicalExpression::LessThan(l, r) => write!(f, "({} < {})", l, r),
+            LogicalExpression::LessThanOrEqual(l, r) => write!(f, "({} <= {})", l, r),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum Expression {
+    Literal(Literal),
+    Negation(Box<Expression>),
+    BinaryExpression(BinaryExpression),
+    LogicalExpression(LogicalExpression),
+}
+
+impl std::fmt::Display for Expression {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Expression::Literal(l) => write!(f, "{}", l),
+            Expression::Negation(e) => write!(f, "(-{})", e),
+            Expression::BinaryExpression(e) => write!(f, "{}", e),
+            Expression::LogicalExpression(e) => write!(f, "{}", e),
+        }
+    }
+}
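Because every node implements Display, an AST can be checked as a string without walking it; this is exactly what the parser tests above rely on. A minimal construction sketch, assuming `Number::Integer(n)` displays as a bare `n` (its Display impl is referenced but not shown in this diff):

    use crate::tokenizer::token::Number;

    let ast = Expression::BinaryExpression(BinaryExpression::Add(
        Box::new(Expression::Literal(Literal::Number(Number::Integer(1)))),
        Box::new(Expression::BinaryExpression(BinaryExpression::Multiply(
            Box::new(Expression::Literal(Literal::Number(Number::Integer(2)))),
            Box::new(Expression::Literal(Literal::Number(Number::Integer(3)))),
        ))),
    ));
    assert_eq!(format!("{}", ast), "(1 + (2 * 3))"); // assumes Integer(n) prints as n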
@@ -1,6 +1,7 @@
|
||||
pub mod token;
|
||||
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
fs::File,
|
||||
io::{BufReader, Cursor, Read, Seek, SeekFrom},
|
||||
path::PathBuf,
|
||||
@@ -123,6 +124,8 @@ where
         Ok(())
     }
 
+    /// Consumes and returns the next token in the stream.
+    /// If there are no more tokens in the stream, this function returns None.
     pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
         while let Some(next_char) = self.next_char()? {
             // skip whitespace
@@ -167,6 +170,8 @@ where
         }
     }
 
+    /// Peeks the next token in the stream without consuming it.
+    /// If there are no more tokens in the stream, this function returns None.
     pub fn peek_next(&mut self) -> Result<Option<Token>, TokenizerError> {
         let current_pos = self.reader.stream_position()?;
         let column = self.column.clone();
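The hunk shows only the bookkeeping at the top of `peek_next`; the saved position and column imply a save/read/restore shape. A sketch of that pattern, assuming the `reader` and `column` fields used elsewhere in this file (the actual restore code sits outside this hunk and may differ, e.g. a line counter would need the same treatment):

    pub fn peek_next(&mut self) -> Result<Option<Token>, TokenizerError> {
        // Save the cursor so the peek has no observable side effects.
        let current_pos = self.reader.stream_position()?;
        let column = self.column.clone();

        let token = self.next_token()?; // read one token ahead

        // Rewind the reader and counters so next_token sees the same token again.
        self.reader.seek(SeekFrom::Start(current_pos))?;
        self.column = column;
        Ok(token)
    }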
@@ -408,6 +413,106 @@ where
     }
 }
 
+pub struct TokenizerBuffer<T>
+where
+    T: Read + Seek,
+{
+    tokenizer: Tokenizer<T>,
+    buffer: VecDeque<Token>,
+    history: VecDeque<Token>,
+}
+
+impl<T> TokenizerBuffer<T>
+where
+    T: Seek + Read,
+{
+    pub fn new(tokenizer: Tokenizer<T>) -> Self {
+        Self {
+            tokenizer,
+            buffer: VecDeque::new(),
+            history: VecDeque::with_capacity(128),
+        }
+    }
+
+    /// Reads the next token from the tokenizer, pushing the value to the back of the history
+    /// and returning the token
+    pub fn next(&mut self) -> Result<Option<Token>, TokenizerError> {
+        if let Some(token) = self.buffer.pop_front() {
+            self.history.push_back(token.clone());
+            return Ok(Some(token));
+        }
+
+        let token = self.tokenizer.next_token()?;
+        if let Some(ref token) = token {
+            self.history.push_back(token.clone());
+        }
+        Ok(token)
+    }
+
+    /// Peeks the next token in the stream without adding to the history stack
+    pub fn peek(&mut self) -> Result<Option<Token>, TokenizerError> {
+        if let Some(token) = self.buffer.front() {
+            return Ok(Some(token.clone()));
+        }
+
+        let token = self.tokenizer.peek_next()?;
+        Ok(token)
+    }
+
+    fn seek_from_start(&mut self, pos: usize) -> Result<(), TokenizerError> {
+        // if pos
+
+        Ok(())
+    }
+
+    fn seek_from_current(&mut self, seek_to: i64) -> Result<(), TokenizerError> {
+        // if seek_to > 0 then we need to check if the buffer has enough tokens to pop, otherwise we need to read from the tokenizer
+        // if seek_to < 0 then we need to pop from the history and push to the front of the buffer. If not enough, then we throw (we reached the front of the history)
+        // if seek_to == 0 then we don't need to do anything
+
+        if seek_to > 0 {
+            let mut tokens = Vec::with_capacity(seek_to as usize);
+            for _ in 0..seek_to {
+                if let Some(token) = self.tokenizer.next_token()? {
+                    tokens.push(token);
+                } else {
+                    return Err(TokenizerError::IOError(std::io::Error::new(
+                        std::io::ErrorKind::UnexpectedEof,
+                        "Unexpected EOF",
+                    )));
+                }
+            }
+            self.history.extend(tokens);
+        } else if seek_to < 0 {
+            let seek_to = seek_to.unsigned_abs() as usize;
+            let mut tokens = Vec::with_capacity(seek_to);
+            for _ in 0..seek_to {
+                if let Some(token) = self.history.pop_back() {
+                    tokens.push(token);
+                } else {
+                    return Err(TokenizerError::IOError(std::io::Error::new(
+                        std::io::ErrorKind::UnexpectedEof,
+                        "Unexpected EOF",
+                    )));
+                }
+            }
+            self.buffer.extend(tokens.into_iter().rev());
+        }
+
+        Ok(())
+    }
+
+    /// Adds to or removes from the history stack, allowing the user to move back and forth in the stream
+    pub fn seek(&mut self, from: SeekFrom) -> Result<(), TokenizerError> {
+        Ok(match from {
+            SeekFrom::Start(pos) => self.seek_from_start(pos as usize)?,
+            SeekFrom::Current(seek_to) => self.seek_from_current(seek_to)?,
+            SeekFrom::End(_) => unimplemented!("SeekFrom::End will not be implemented"),
+        })
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
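A worked example of the relative-seek contract spelled out in the comments of `seek_from_current`: a negative offset pops tokens off `history` onto the front of `buffer`, so subsequent `next()` calls replay them in order. A sketch inside a test returning anyhow::Result, using the `fn test` prefix of TEST_STRING from the tests below:

    let mut buffer = TokenizerBuffer::new(Tokenizer::from("fn test".to_owned()));

    let first = buffer.next()?;   // `fn`   -> history: [fn]
    let _second = buffer.next()?; // `test` -> history: [fn, test]

    // Move back two tokens: history drains into the front of the buffer.
    buffer.seek(SeekFrom::Current(-2))?;

    // Replays from the buffer (and is pushed back onto history).
    assert_eq!(buffer.next()?, first);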
@@ -422,6 +527,35 @@ mod tests {
     }
     "#;
 
+    #[test]
+    fn test_tokenizer_buffer_seek_from_start() -> Result<()> {
+        let tokenizer = Tokenizer::from(TEST_STRING.to_owned());
+        let mut buffer = TokenizerBuffer::new(tokenizer);
+
+        let token = buffer.next()?;
+        assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Fn));
+
+        let token = buffer.next()?;
+        assert_eq!(
+            token.unwrap().token_type,
+            TokenType::Identifier(String::from("test"))
+        );
+
+        buffer.seek(SeekFrom::Start(0))?;
+
+        let token = buffer.next()?;
+
+        assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Fn));
+
+        buffer.seek(SeekFrom::Start(16))?;
+
+        let token = buffer.next()?;
+
+        assert_eq!(token.unwrap().token_type, TokenType::Keyword(Keyword::Let));
+
+        Ok(())
+    }
+
     #[test]
     fn test_tokenizer_from_path_ok() {
         let tokenizer = Tokenizer::from_path(TEST_FILE);
@@ -1,4 +1,4 @@
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 pub struct Token {
     /// The type of the token
     pub token_type: TokenType,
@@ -18,7 +18,7 @@ impl Token {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone)]
 pub enum TokenType {
     /// Represents a string token
     String(String),
@@ -50,7 +50,7 @@ impl std::fmt::Display for TokenType {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
 pub enum Number {
     /// Represents an integer number
     Integer(u64),
@@ -67,7 +67,7 @@ impl std::fmt::Display for Number {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
 pub enum Symbol {
     // Single Character Symbols
     /// Represents the `(` symbol
@@ -157,7 +157,7 @@ impl Symbol {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq)]
+#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)]
 pub enum Keyword {
     /// Represents the `let` keyword
     Let,
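These derives are what the parser half of the commit leans on: `Token` becomes `Clone` so `ParseError::UnexpectedToken` can own a copy of `current_token`, and the small enums become `Copy` so a `Symbol` can be lifted out of a borrowed token without a move. A minimal sketch (the field values are made up; field names match the parser's error construction above):

    let token = Token {
        token_type: TokenType::Symbol(Symbol::Plus),
        line: 1,
        column: 5,
    };

    // Clone: the error owns its own Token; the parser keeps the original.
    let err = ParseError::UnexpectedToken { token: token.clone() };

    // Copy: the Symbol is copied out of the borrowed TokenType, no move occurs.
    if let TokenType::Symbol(s) = &token.token_type {
        let (a, b) = (*s, *s);
        assert_eq!(a, b);
    }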