pub mod token;

use rust_decimal::Decimal;
use std::{
    cmp::Ordering,
    collections::VecDeque,
    io::{BufReader, Cursor, Read, Seek, SeekFrom},
    path::PathBuf,
};
use thiserror::Error;
use token::{Keyword, Number, Symbol, Token, TokenType};

#[derive(Error, Debug)]
pub enum TokenizerError {
    #[error("IO Error: {0}")]
    IOError(#[from] std::io::Error),

    #[error("Number Parse Error \"{0}\"\nLine: {1}, Column: {2}")]
    NumberParseError(std::num::ParseIntError, usize, usize),

    #[error("Decimal Parse Error \"{0}\"\nLine: {1}, Column: {2}")]
    DecimalParseError(rust_decimal::Error, usize, usize),

    #[error("Unknown Symbol \"{0}\"\nLine: {1}, Column: {2}")]
    UnknownSymbolError(char, usize, usize),

    #[error("Unknown Keyword or Identifier \"{0}\"\nLine: {1}, Column: {2}")]
    UnknownKeywordOrIdentifierError(String, usize, usize),
}

pub trait Tokenize: Read + Seek {}
impl<T> Tokenize for T where T: Read + Seek {}

pub(crate) struct Tokenizer {
    reader: BufReader<Box<dyn Tokenize>>,
    char_buffer: [u8; 1],
    line: usize,
    column: usize,
    returned_eof: bool,
}

impl Tokenizer {
    pub fn from_path(input_file: impl Into<PathBuf>) -> Result<Self, TokenizerError> {
        let file = std::fs::File::open(input_file.into())?;
        let reader = BufReader::new(Box::new(file) as Box<dyn Tokenize>);
        Ok(Self {
            reader,
            line: 1,
            column: 1,
            char_buffer: [0],
            returned_eof: false,
        })
    }
}

impl From<String> for Tokenizer {
    fn from(input: String) -> Self {
        let reader = BufReader::new(Box::new(Cursor::new(input)) as Box<dyn Tokenize>);
        Self {
            reader,
            line: 1,
            column: 1,
            char_buffer: [0],
            returned_eof: false,
        }
    }
}
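
// A minimal usage sketch (not a real caller in this crate; it assumes the
// token stream is drained by some parser loop). Both constructors produce the
// same streaming API:
//
//     let mut tokenizer = Tokenizer::from(String::from("let x = 10;"));
//     // or: let mut tokenizer = Tokenizer::from_path("tests/file.stlg")?;
//     while let Some(token) = tokenizer.next_token()? {
//         println!("{:?}", token.token_type);
//     }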

impl Tokenizer {
    /// Reads and consumes the next character in the stream.
    /// If there are no more characters in the stream, this function returns None.
    /// If there is an error reading the stream, this function returns an error.
    ///
    /// # Important
    /// This function will increment the line and column counters
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let bytes_read = self.reader.read(&mut self.char_buffer)?;
        if bytes_read == 0 {
            return Ok(None);
        }

        // The buffer is guaranteed to hold a value, as it is initialized with a size of 1
        let c = self.char_buffer[0] as char;
        if c == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }

        Ok(Some(c))
    }

    /// Peeks the next character in the stream without consuming it
    ///
    /// # Important
    /// This does not increment the line or column counters
    fn peek_next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let current_pos = self.reader.stream_position()?;
        let to_return = if self.reader.read(&mut self.char_buffer)? == 0 {
            None
        } else {
            self.reader.seek(SeekFrom::Start(current_pos))?;
            // The buffer is guaranteed to hold a value, as it is initialized with a size of 1
            Some(self.char_buffer[0] as char)
        };
        Ok(to_return)
    }

    /// Skips the current line in the stream.
    /// Useful for skipping comments or empty lines
    ///
    /// # Important
    /// This function will increment the line and column counters
    fn skip_line(&mut self) -> Result<(), TokenizerError> {
        while let Some(next_char) = self.next_char()? {
            if next_char == '\n' {
                break;
            }
        }
        Ok(())
    }

    /// Advances the tokenizer and returns the next token in the stream.
    /// Once the stream is exhausted, a single EOF token is returned;
    /// every call after that returns None
    pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        while let Some(next_char) = self.next_char()? {
            // skip whitespace
            if next_char.is_whitespace() {
                continue;
            }

            // skip `//` comments
            if next_char == '/' && self.peek_next_char()? == Some('/') {
                self.skip_line()?;
                continue;
            }

            match next_char {
                // numbers
                '0'..='9' => {
                    return self.tokenize_number(next_char).map(Some);
                }
                // strings
                '"' | '\'' => return self.tokenize_string(next_char).map(Some),
                // symbols excluding `"` and `'`
                char if !char.is_alphanumeric() && char != '"' && char != '\'' => {
                    return self.tokenize_symbol(next_char).map(Some)
                }
                // keywords and identifiers
                char if char.is_alphabetic() => {
                    return self.tokenize_keyword_or_identifier(next_char).map(Some)
                }
                _ => {
                    return Err(TokenizerError::UnknownSymbolError(
                        next_char, self.line, self.column,
                    ))
                }
            }
        }

        if self.returned_eof {
            Ok(None)
        } else {
            self.returned_eof = true;
            Ok(Some(Token::new(TokenType::EOF, self.line, self.column)))
        }
    }
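
    // A small sketch of the end-of-stream contract above (hypothetical input,
    // mirroring the implemented behaviour): the stream yields its tokens, then
    // exactly one EOF token, then None on every later call.
    //
    //     let mut t = Tokenizer::from(String::from("x"));
    //     t.next_token()?; // Some(Identifier("x"))
    //     t.next_token()?; // Some(EOF)
    //     t.next_token()?; // None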

    /// Peeks the next token in the stream without consuming it
    /// If there are no more tokens in the stream, this function returns None
    pub fn peek_next(&mut self) -> Result<Option<Token>, TokenizerError> {
        let current_pos = self.reader.stream_position()?;
        let column = self.column;
        let line = self.line;
        // `next_token` may emit the EOF token while peeking; remember the flag
        // so the real EOF token is still returned by a later `next_token` call
        let returned_eof = self.returned_eof;

        let token = self.next_token()?;

        self.reader.seek(SeekFrom::Start(current_pos))?;
        self.column = column;
        self.line = line;
        self.returned_eof = returned_eof;

        Ok(token)
    }

    /// Tokenizes a symbol
    fn tokenize_symbol(&mut self, first_symbol: char) -> Result<Token, TokenizerError> {
        /// Helper macro to create a symbol token
        macro_rules! symbol {
            ($symbol:ident) => {
                Ok(Token::new(
                    TokenType::Symbol(Symbol::$symbol),
                    self.line,
                    self.column,
                ))
            };
        }

        match first_symbol {
            // single character symbols
            '(' => symbol!(LParen),
            ')' => symbol!(RParen),
            '{' => symbol!(LBrace),
            '}' => symbol!(RBrace),
            '[' => symbol!(LBracket),
            ']' => symbol!(RBracket),
            ';' => symbol!(Semicolon),
            ':' => symbol!(Colon),
            ',' => symbol!(Comma),
            '+' => symbol!(Plus),
            '-' => symbol!(Minus),
            '/' => symbol!(Slash),
            '.' => symbol!(Dot),
            '^' => symbol!(Caret),

            // multi-character symbols
            '<' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(LessThanOrEqual)
            }
            '<' => symbol!(LessThan),
            '>' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(GreaterThanOrEqual)
            }
            '>' => symbol!(GreaterThan),
            '=' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(Equal)
            }
            '=' => symbol!(Assign),
            '!' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(NotEqual)
            }
            '!' => symbol!(LogicalNot),
            '*' if self.peek_next_char()? == Some('*') => {
                self.next_char()?;
                symbol!(Exp)
            }
            '*' => symbol!(Asterisk),
            '&' if self.peek_next_char()? == Some('&') => {
                self.next_char()?;
                symbol!(LogicalAnd)
            }
            '|' if self.peek_next_char()? == Some('|') => {
                self.next_char()?;
                symbol!(LogicalOr)
            }
            _ => Err(TokenizerError::UnknownSymbolError(
                first_symbol,
                self.line,
                self.column,
            )),
        }
    }

    /// Tokenizes a number literal
    fn tokenize_number(&mut self, first_char: char) -> Result<Token, TokenizerError> {
        let mut primary = String::with_capacity(16);
        let mut decimal: Option<String> = None;
        let mut reading_decimal = false;

        let column = self.column;
        let line = self.line;

        primary.push(first_char);

        while let Some(next_char) = self.peek_next_char()? {
            if next_char.is_whitespace() {
                break;
            }
            if next_char == '.' {
                reading_decimal = true;
                self.next_char()?;
                continue;
            }
            // support underscores in numbers for readability
            if next_char == '_' {
                self.next_char()?;
                continue;
            }
            // This is for the times when we have a number followed by a symbol (like a semicolon or =)
            if !next_char.is_numeric() {
                break;
            }

            if reading_decimal {
                decimal.get_or_insert_with(String::new).push(next_char);
            } else {
                primary.push(next_char);
            }
            self.next_char()?;
        }

        if let Some(decimal) = decimal {
            let decimal_scale = decimal.len() as u32;
            let number = format!("{}{}", primary, decimal)
                .parse::<i128>()
                .map_err(|e| TokenizerError::NumberParseError(e, line, column))?;

            Ok(Token::new(
                TokenType::Number(Number::Decimal(
                    Decimal::try_from_i128_with_scale(number, decimal_scale)
                        .map_err(|e| TokenizerError::DecimalParseError(e, line, column))?,
                )),
                line,
                column,
            ))
        } else {
            Ok(Token::new(
                TokenType::Number(Number::Integer(
                    primary
                        .parse()
                        .map_err(|e| TokenizerError::NumberParseError(e, line, column))?,
                )),
                line,
                column,
            ))
        }
    }
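
    // Worked example for the digits/scale split above: for the input `10.5`,
    // `primary` is "10" and `decimal` is "5", so the concatenated digits parse
    // to 105 with a scale of 1, and try_from_i128_with_scale(105, 1) == 10.5.
    // Underscores are dropped before digits are collected, so `1_000.000_6`
    // yields the digits 10000006 with a scale of 4, i.e. 1000.0006.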

    /// Tokenizes a string literal
    fn tokenize_string(&mut self, beginning_quote: char) -> Result<Token, TokenizerError> {
        let mut buffer = String::with_capacity(16);
        let column = self.column;
        let line = self.line;

        while let Some(next_char) = self.next_char()? {
            if next_char == beginning_quote {
                break;
            }
            buffer.push(next_char);
        }

        Ok(Token::new(TokenType::String(buffer), line, column))
    }

    /// Tokenizes a keyword or an identifier. Also handles boolean literals
    fn tokenize_keyword_or_identifier(
        &mut self,
        first_char: char,
    ) -> Result<Token, TokenizerError> {
        /// Helper macro to create a keyword token
        macro_rules! keyword {
            ($keyword:ident) => {{
                return Ok(Token::new(
                    TokenType::Keyword(Keyword::$keyword),
                    self.line,
                    self.column,
                ));
            }};
        }

        /// Helper macro to check if the next character is whitespace or not alphanumeric
        macro_rules! next_ws {
            () => {
                matches!(self.peek_next_char()?, Some(x) if x.is_whitespace() || !x.is_alphanumeric())
                    || matches!(self.peek_next_char()?, None)
            };
        }

        let mut buffer = String::with_capacity(16);
        let line = self.line;
        let column = self.column;

        let mut looped_char = Some(first_char);

        while let Some(next_char) = looped_char {
            if next_char.is_whitespace() {
                break;
            }
            if !next_char.is_alphanumeric() {
                break;
            }

            buffer.push(next_char);

            match buffer.as_str() {
                "let" if next_ws!() => keyword!(Let),
                "fn" if next_ws!() => keyword!(Fn),
                "if" if next_ws!() => keyword!(If),
                "else" if next_ws!() => keyword!(Else),
                "return" if next_ws!() => keyword!(Return),
                "enum" if next_ws!() => keyword!(Enum),
                "import" if next_ws!() => keyword!(Import),
                "export" if next_ws!() => keyword!(Export),

                // boolean literals
                "true" if next_ws!() => {
                    return Ok(Token::new(TokenType::Boolean(true), self.line, self.column))
                }
                "false" if next_ws!() => {
                    return Ok(Token::new(
                        TokenType::Boolean(false),
                        self.line,
                        self.column,
                    ))
                }

                // if the next character is whitespace or not alphanumeric, then we have an identifier
                // this is because keywords are checked first
                val if next_ws!() => {
                    return Ok(Token::new(
                        TokenType::Identifier(val.to_string()),
                        line,
                        column,
                    ));
                }
                _ => {}
            }

            looped_char = self.next_char()?;
        }

        Err(TokenizerError::UnknownKeywordOrIdentifierError(
            buffer, line, column,
        ))
    }
}

pub struct TokenizerBuffer {
    tokenizer: Tokenizer,
    buffer: VecDeque<Token>,
    history: VecDeque<Token>,
}

impl TokenizerBuffer {
    pub fn new(tokenizer: Tokenizer) -> Self {
        Self {
            tokenizer,
            buffer: VecDeque::new(),
            history: VecDeque::with_capacity(128),
        }
    }

    /// Reads the next token from the tokenizer, pushing the value to the back of the history
    /// and returning the token
    pub fn next(&mut self) -> Result<Option<Token>, TokenizerError> {
        if let Some(token) = self.buffer.pop_front() {
            self.history.push_back(token.clone());
            return Ok(Some(token));
        }

        let token = self.tokenizer.next_token()?;
        if let Some(ref token) = token {
            self.history.push_back(token.clone());
        }
        Ok(token)
    }

    /// Peeks the next token in the stream without adding to the history stack
    pub fn peek(&mut self) -> Result<Option<Token>, TokenizerError> {
        if let Some(token) = self.buffer.front() {
            return Ok(Some(token.clone()));
        }
        self.tokenizer.peek_next()
    }

    fn seek_from_current(&mut self, seek_to: i64) -> Result<(), TokenizerError> {
        use Ordering::*;

        // if seek_to > 0, move forward: drain tokens replayed into the buffer by an
        //   earlier backward seek first, then read fresh tokens from the tokenizer,
        //   pushing each onto the history
        // if seek_to < 0, pop from the history and push to the front of the buffer;
        //   if the history runs out, we error (we reached the front of the stream)
        // if seek_to == 0, there is nothing to do
        match seek_to.cmp(&0) {
            Greater => {
                for _ in 0..seek_to {
                    let token = match self.buffer.pop_front() {
                        Some(token) => Some(token),
                        None => self.tokenizer.next_token()?,
                    };
                    if let Some(token) = token {
                        self.history.push_back(token);
                    } else {
                        return Err(TokenizerError::IOError(std::io::Error::new(
                            std::io::ErrorKind::UnexpectedEof,
                            "Unexpected EOF",
                        )));
                    }
                }
            }
            Less => {
                let seek_to = seek_to.unsigned_abs() as usize;
                let mut tokens = Vec::with_capacity(seek_to);
                for _ in 0..seek_to {
                    if let Some(token) = self.history.pop_back() {
                        tokens.push(token);
                    } else {
                        return Err(TokenizerError::IOError(std::io::Error::new(
                            std::io::ErrorKind::UnexpectedEof,
                            "Unexpected EOF",
                        )));
                    }
                }
                self.buffer.extend(tokens.into_iter().rev());
            }
            _ => {}
        }

        Ok(())
    }

    /// Adds to or removes from the history stack, allowing the caller to move back and forth in the stream
    pub fn seek(&mut self, from: SeekFrom) -> Result<(), TokenizerError> {
        match from {
            SeekFrom::Current(seek_to) => self.seek_from_current(seek_to)?,
            SeekFrom::End(_) => unimplemented!("SeekFrom::End will not be implemented"),
            SeekFrom::Start(_) => unimplemented!("SeekFrom::Start will not be implemented"),
        }
        Ok(())
    }
}
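
// A hedged sketch of how the buffer cursor moves (it mirrors the tests below;
// the input is hypothetical): positive seeks consume tokens into the history,
// negative seeks replay tokens out of the history.
//
//     let tokenizer = Tokenizer::from(String::from("let x = 1;"));
//     let mut buffer = TokenizerBuffer::new(tokenizer);
//     buffer.next()?;                      // `let`    -> history: [let]
//     buffer.seek(SeekFrom::Current(1))?;  // skip `x` -> history: [let, x]
//     buffer.seek(SeekFrom::Current(-1))?; // step back; next() yields `x` again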

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use rust_decimal::Decimal;

    const TEST_FILE: &str = "tests/file.stlg";
    const TEST_STRING: &str = r#"
    fn test() {
        let x = 10;
        return x + 2;
    }
    "#;

    #[test]
    fn test_seek_from_current() -> Result<()> {
        let tokenizer = Tokenizer::from(TEST_STRING.to_owned());
        let mut buffer = TokenizerBuffer::new(tokenizer);

        let token = buffer.next()?.unwrap();
        assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn));

        buffer.seek(SeekFrom::Current(1))?;
        let token = buffer.next()?.unwrap();
        assert_eq!(token.token_type, TokenType::Symbol(Symbol::LParen));

        Ok(())
    }

    #[test]
    fn test_tokenizer_from_path_ok() {
        let tokenizer = Tokenizer::from_path(TEST_FILE);
        assert!(tokenizer.is_ok());
    }

    #[test]
    fn test_tokenizer_from_path_err() {
        let tokenizer = Tokenizer::from_path("non_existent_file.stlg");
        assert!(tokenizer.is_err());
    }

    #[test]
    fn test_next_char() -> Result<()> {
        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());
        let char = tokenizer.next_char()?;
        assert_eq!(char, Some('\n'));
        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        let mut tokenizer = Tokenizer::from(String::from("fn"));
        let char = tokenizer.next_char()?;
        assert_eq!(char, Some('f'));
        assert_eq!(tokenizer.line, 1);
        assert_eq!(tokenizer.column, 2);

        Ok(())
    }

    #[test]
    fn test_peek_next_char() -> Result<()> {
        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());

        let char = tokenizer.peek_next_char()?;
        assert_eq!(char, Some('\n'));
        assert_eq!(tokenizer.line, 1);
        assert_eq!(tokenizer.column, 1);

        let char = tokenizer.next_char()?;
        assert_eq!(char, Some('\n'));
        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        let char = tokenizer.peek_next_char()?;
        assert_eq!(char, Some(' '));
        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        Ok(())
    }

    #[test]
    fn test_skip_line() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from(
            r#"
This is a skippable line"#,
        ));
        tokenizer.skip_line()?;
        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        let next_char = tokenizer.next_char()?;
        assert_eq!(next_char, Some('T'));

        Ok(())
    }
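
    // An added check, sketched against the `//` comment-skipping branch in
    // `next_token`: everything up to the newline is discarded before the next
    // real token is produced.
    #[test]
    fn test_skip_comment() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("// a comment\nlet"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Keyword(Keyword::Let));
        Ok(())
    }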

    #[test]
    fn test_parse_integer() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("10"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Number(Number::Integer(10)));
        Ok(())
    }

    #[test]
    fn test_parse_integer_with_underscore() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("1_000"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Number(Number::Integer(1000)));
        Ok(())
    }

    #[test]
    fn test_parse_decimal() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("10.5"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(
            token.token_type,
            TokenType::Number(Number::Decimal(Decimal::new(105, 1))) // 10.5
        );
        Ok(())
    }

    #[test]
    fn test_parse_decimal_with_underscore() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("1_000.000_6"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(
            token.token_type,
            TokenType::Number(Number::Decimal(Decimal::new(10_000_006, 4))) // 1000.0006
        );
        Ok(())
    }

    #[test]
    fn test_parse_number_with_symbol() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("10;"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Number(Number::Integer(10)));

        let next_char = tokenizer.next_char()?;
        assert_eq!(next_char, Some(';'));

        Ok(())
    }

    #[test]
    fn test_string_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from(r#""Hello, World!""#));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(
            token.token_type,
            TokenType::String(String::from("Hello, World!"))
        );

        let mut tokenizer = Tokenizer::from(String::from(r#"'Hello, World!'"#));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(
            token.token_type,
            TokenType::String(String::from("Hello, World!"))
        );

        Ok(())
    }

    #[test]
    fn test_symbol_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from(
            "^ ! () [] {} , .
; : + - * / < > = != && || >= <=**",
        ));
        let expected_tokens = vec![
            TokenType::Symbol(Symbol::Caret),
            TokenType::Symbol(Symbol::LogicalNot),
            TokenType::Symbol(Symbol::LParen),
            TokenType::Symbol(Symbol::RParen),
            TokenType::Symbol(Symbol::LBracket),
            TokenType::Symbol(Symbol::RBracket),
            TokenType::Symbol(Symbol::LBrace),
            TokenType::Symbol(Symbol::RBrace),
            TokenType::Symbol(Symbol::Comma),
            TokenType::Symbol(Symbol::Dot),
            TokenType::Symbol(Symbol::Semicolon),
            TokenType::Symbol(Symbol::Colon),
            TokenType::Symbol(Symbol::Plus),
            TokenType::Symbol(Symbol::Minus),
            TokenType::Symbol(Symbol::Asterisk),
            TokenType::Symbol(Symbol::Slash),
            TokenType::Symbol(Symbol::LessThan),
            TokenType::Symbol(Symbol::GreaterThan),
            TokenType::Symbol(Symbol::Assign),
            TokenType::Symbol(Symbol::NotEqual),
            TokenType::Symbol(Symbol::LogicalAnd),
            TokenType::Symbol(Symbol::LogicalOr),
            TokenType::Symbol(Symbol::GreaterThanOrEqual),
            TokenType::Symbol(Symbol::LessThanOrEqual),
            TokenType::Symbol(Symbol::Exp),
        ];

        for expected_token in expected_tokens {
            let token = tokenizer.next_token()?.unwrap();
            assert_eq!(token.token_type, expected_token);
        }

        Ok(())
    }

    #[test]
    fn test_keyword_parse() -> Result<()> {
        let mut tokenizer =
            Tokenizer::from(String::from("let fn if else return enum import export"));
        let expected_tokens = vec![
            TokenType::Keyword(Keyword::Let),
            TokenType::Keyword(Keyword::Fn),
            TokenType::Keyword(Keyword::If),
            TokenType::Keyword(Keyword::Else),
            TokenType::Keyword(Keyword::Return),
            TokenType::Keyword(Keyword::Enum),
            TokenType::Keyword(Keyword::Import),
            TokenType::Keyword(Keyword::Export),
        ];

        for expected_token in expected_tokens {
            let token = tokenizer.next_token()?.unwrap();
            assert_eq!(token.token_type, expected_token);
        }

        Ok(())
    }

    #[test]
    fn test_identifier_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("fn test"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn));

        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(
            token.token_type,
            TokenType::Identifier(String::from("test"))
        );

        Ok(())
    }

    #[test]
    fn test_boolean_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("true false"));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Boolean(true));

        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Boolean(false));

        Ok(())
    }

    #[test]
    fn test_full_source() -> Result<()> {
        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());
        let expected_tokens = vec![
            TokenType::Keyword(Keyword::Fn),
            TokenType::Identifier(String::from("test")),
            TokenType::Symbol(Symbol::LParen),
            TokenType::Symbol(Symbol::RParen),
            TokenType::Symbol(Symbol::LBrace),
            TokenType::Keyword(Keyword::Let),
            TokenType::Identifier(String::from("x")),
            TokenType::Symbol(Symbol::Assign),
            TokenType::Number(Number::Integer(10)),
            TokenType::Symbol(Symbol::Semicolon),
            TokenType::Keyword(Keyword::Return),
            TokenType::Identifier(String::from("x")),
            TokenType::Symbol(Symbol::Plus),
            TokenType::Number(Number::Integer(2)),
            TokenType::Symbol(Symbol::Semicolon),
            TokenType::Symbol(Symbol::RBrace),
        ];

        for expected_token in expected_tokens {
            let token = tokenizer.next_token()?.unwrap();
            assert_eq!(token.token_type, expected_token);
        }

        Ok(())
    }
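
    // An added check for the backward branch of `seek_from_current` (a sketch
    // of the replay behaviour): stepping back by one should hand out the most
    // recently consumed token again.
    #[test]
    fn test_seek_backwards() -> Result<()> {
        let tokenizer = Tokenizer::from(TEST_STRING.to_owned());
        let mut buffer = TokenizerBuffer::new(tokenizer);

        let token = buffer.next()?.unwrap(); // `fn`
        assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn));
        let token = buffer.next()?.unwrap(); // `test`
        assert_eq!(
            token.token_type,
            TokenType::Identifier(String::from("test"))
        );

        buffer.seek(SeekFrom::Current(-1))?;
        let token = buffer.next()?.unwrap(); // `test` again
        assert_eq!(
            token.token_type,
            TokenType::Identifier(String::from("test"))
        );

        Ok(())
    }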

    #[test]
    fn test_peek_next() -> Result<()> {
        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());
        let column = tokenizer.column;
        let line = tokenizer.line;

        let peeked_token = tokenizer.peek_next()?;
        assert_eq!(
            peeked_token.unwrap().token_type,
            TokenType::Keyword(Keyword::Fn)
        );
        assert_eq!(tokenizer.column, column);
        assert_eq!(tokenizer.line, line);

        let next_token = tokenizer.next_token()?;
        assert_eq!(
            next_token.unwrap().token_type,
            TokenType::Keyword(Keyword::Fn)
        );
        assert_ne!(tokenizer.column, column);
        assert_ne!(tokenizer.line, line);

        Ok(())
    }
}