From 66064a21d7d7e3bc70440369c91685ab48ad58f8 Mon Sep 17 00:00:00 2001
From: Devin Bidwell
Date: Tue, 19 Nov 2024 23:29:01 -0700
Subject: [PATCH] tokenizer

---
 .gitignore             |   1 +
 Cargo.lock             | 265 ++++++++++++++++
 Cargo.toml             |  14 +
 src/main.rs            |  43 +++
 src/tokenizer/mod.rs   | 671 +++++++++++++++++++++++++++++++++++++++++
 src/tokenizer/token.rs | 120 ++++++++
 tests/file.stlg        |   3 +
 7 files changed, 1117 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/main.rs
 create mode 100644 src/tokenizer/mod.rs
 create mode 100644 src/tokenizer/token.rs
 create mode 100644 tests/file.stlg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..ee82b16
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,265 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "anstream"
+version = "0.6.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
+dependencies = [
+ "anstyle",
+ "windows-sys",
+]
+
+[[package]]
+name = "anyhow"
+version = "1.0.93"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775"
+
+[[package]]
+name = "clap"
+version = "4.5.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f"
+dependencies = [
+ "clap_builder",
+ "clap_derive",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.5.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec"
+dependencies = [
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+]
+
+[[package]]
+name = "clap_derive"
+version = "4.5.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"
+
+[[package]]
+name = "colorchoice"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
+
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.89"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "stationlang"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "clap",
+ "thiserror",
+]
+
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
+[[package]]
+name = "syn"
+version = "2.0.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
+
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
"09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..9ea3d16 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "stationlang" +version = "0.1.0" +edition = "2021" + +[profile.dev] +panic = "unwind" + +[dependencies] +clap = { version = "^4.5", features = ["derive"] } +thiserror = { version = "^2.0" } + +[dev-dependencies] +anyhow = { version = "^1.0" } \ No newline at end of file diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..fa831b9 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,43 @@ +mod tokenizer; + +use clap::Parser; +use tokenizer::{Tokenizer, TokenizerError}; + +#[derive(Debug, thiserror::Error)] +enum StationlangError { + #[error("{0}")] + TokenizerError(#[from] TokenizerError), +} + +#[derive(Parser, Debug)] +#[command(version, about, long_about = None)] +struct Args { + /// What file should be compiled + #[arg(short, long)] + input_file: String, + /// The default stack size for the program + #[arg(short, long, default_value_t = 512)] + stack_size: usize, + /// The output file for the compiled program. If not set, output will go to stdout + #[arg(short, long)] + output_file: Option, +} + +fn run_logic() -> Result<(), StationlangError> { + let args = Args::parse(); + let input_file = args.input_file; + + let mut tokenizer = Tokenizer::from_path(&input_file)?; + + while let Some(token) = tokenizer.next_token()? 
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
new file mode 100644
index 0000000..c913933
--- /dev/null
+++ b/src/tokenizer/mod.rs
@@ -0,0 +1,671 @@
+mod token;
+
+use std::{
+    fs::File,
+    io::{BufReader, Cursor, Read, Seek, SeekFrom},
+    path::PathBuf,
+};
+use thiserror::Error;
+use token::{Keyword, Number, Symbol, Token, TokenType};
+
+#[derive(Error, Debug)]
+pub enum TokenizerError {
+    #[error("IO Error: {0}")]
+    IOError(#[from] std::io::Error),
+    #[error("Number Parse Error \"{0}\"\nLine: {1}, Column: {2}")]
+    NumberParseError(std::num::ParseIntError, usize, usize),
+    #[error("Unknown Symbol \"{0}\"\nLine: {1}, Column: {2}")]
+    UnknownSymbolError(char, usize, usize),
+    #[error("Unknown Keyword or Identifier \"{0}\"\nLine: {1}, Column: {2}")]
+    UnknownKeywordOrIdentifierError(String, usize, usize),
+}
+
+pub(crate) struct Tokenizer<T>
+where
+    T: Read + Seek,
+{
+    reader: BufReader<T>,
+    char_buffer: [u8; 1],
+    line: usize,
+    column: usize,
+    returned_eof: bool,
+}
+
+impl From<String> for Tokenizer<Cursor<Vec<u8>>> {
+    fn from(input: String) -> Self {
+        let cursor = Cursor::new(input.into_bytes());
+        let reader = BufReader::new(cursor);
+
+        Self {
+            reader,
+            line: 1,
+            column: 1,
+            char_buffer: [0],
+            returned_eof: false,
+        }
+    }
+}
+
+impl Tokenizer<File> {
+    pub fn from_path(input_file: impl Into<PathBuf>) -> Result<Self, TokenizerError> {
+        let file = std::fs::File::open(input_file.into())?;
+        let reader = BufReader::new(file);
+
+        Ok(Self {
+            reader,
+            line: 1,
+            column: 1,
+            char_buffer: [0],
+            returned_eof: false,
+        })
+    }
+}
+
+impl<T> Tokenizer<T>
+where
+    T: Read + Seek,
+{
+    /// Reads and consumes the next character in the stream.
+    /// If the stream is exhausted, this function returns None.
+    /// If there is an error reading the stream, this function returns an error.
+    ///
+    /// # Important
+    /// This function will increment the line and column counters
+    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
+        let bytes_read = self.reader.read(&mut self.char_buffer)?;
+
+        if bytes_read == 0 {
+            return Ok(None);
+        }
+
+        // The buffer holds exactly one byte; note that this byte-to-char
+        // cast assumes ASCII input (multi-byte UTF-8 would be split up)
+        let c = self.char_buffer[0] as char;
+        if c == '\n' {
+            self.line += 1;
+            self.column = 1;
+        } else {
+            self.column += 1;
+        }
+
+        Ok(Some(c))
+    }
+
+    /// Peeks the next character in the stream without consuming it
+    ///
+    /// # Important
+    /// This does not increment the line or column counters
+    fn peek_next_char(&mut self) -> Result<Option<char>, TokenizerError> {
+        let current_pos = self.reader.stream_position()?;
+
+        let to_return = if self.reader.read(&mut self.char_buffer)? == 0 {
+            None
+        } else {
+            // Rewind so the peeked byte is read again by the next next_char
+            self.reader.seek(SeekFrom::Start(current_pos))?;
+
+            Some(self.char_buffer[0] as char)
+        };
+
+        Ok(to_return)
+    }
+
+    /// Skips the current line in the stream.
+    /// Useful for skipping comments or empty lines
+    ///
+    /// # Important
+    /// This function will increment the line and column counters
+    fn skip_line(&mut self) -> Result<(), TokenizerError> {
+        while let Some(next_char) = self.next_char()? {
+            if next_char == '\n' {
+                break;
+            }
+        }
+        Ok(())
+    }
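peek_next_char above rewinds with stream_position/seek, which is why the tokenizer carries a T: Read + Seek bound and pays a seek per peek. A common alternative is a one-character lookahead buffer, which would drop the Seek bound entirely. The sketch below is a hypothetical variant, not part of this patch; the names are illustrative, line/column bookkeeping is omitted, and it keeps the same byte-equals-ASCII-char assumption as the code above.

    // Hypothetical: lookahead buffer instead of Read + Seek (not in this patch).
    use std::io::{BufReader, Read, Result};

    struct BufferedTokenizer<T: Read> {
        reader: BufReader<T>,
        peeked: Option<char>, // a char read early by peek, not yet consumed
    }

    impl<T: Read> BufferedTokenizer<T> {
        fn next_char(&mut self) -> Result<Option<char>> {
            // Serve the stashed character first, if peek left one behind.
            if let Some(c) = self.peeked.take() {
                return Ok(Some(c));
            }
            let mut buf = [0u8; 1];
            match self.reader.read(&mut buf)? {
                0 => Ok(None),
                _ => Ok(Some(buf[0] as char)),
            }
        }

        fn peek_next_char(&mut self) -> Result<Option<char>> {
            // Read ahead once; the stashed value is consumed by next_char.
            if self.peeked.is_none() {
                self.peeked = self.next_char()?;
            }
            Ok(self.peeked)
        }
    }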
+
+    pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
+        while let Some(next_char) = self.next_char()? {
+            // skip whitespace
+            if next_char.is_whitespace() {
+                continue;
+            }
+            // skip comments
+            if next_char == '/' && self.peek_next_char()? == Some('/') {
+                self.skip_line()?;
+                continue;
+            }
+
+            match next_char {
+                // numbers
+                '0'..='9' => {
+                    return self.tokenize_number(next_char).map(Some);
+                }
+                // strings
+                '"' | '\'' => return self.tokenize_string(next_char).map(Some),
+                // symbols, excluding `"` and `'` (already handled by the string arm)
+                char if !char.is_alphanumeric() && char != '"' && char != '\'' => {
+                    return self.tokenize_symbol(next_char).map(Some)
+                }
+                // keywords and identifiers
+                char if char.is_alphabetic() => {
+                    return self.tokenize_keyword_or_identifier(next_char).map(Some)
+                }
+                _ => {
+                    return Err(TokenizerError::UnknownSymbolError(
+                        next_char,
+                        self.line,
+                        self.column,
+                    ))
+                }
+            }
+        }
+        if self.returned_eof {
+            Ok(None)
+        } else {
+            self.returned_eof = true;
+            Ok(Some(Token::new(TokenType::EOF, self.line, self.column)))
+        }
+    }
+
+    /// Tokenizes a symbol
+    fn tokenize_symbol(&mut self, first_symbol: char) -> Result<Token, TokenizerError> {
+        /// Helper macro to create a symbol token
+        macro_rules! symbol {
+            ($symbol:ident) => {
+                Ok(Token::new(
+                    TokenType::Symbol(Symbol::$symbol),
+                    self.line,
+                    self.column,
+                ))
+            };
+        }
+
+        match first_symbol {
+            // single character symbols
+            '(' => symbol!(LParen),
+            ')' => symbol!(RParen),
+            '{' => symbol!(LBrace),
+            '}' => symbol!(RBrace),
+            '[' => symbol!(LBracket),
+            ']' => symbol!(RBracket),
+            ';' => symbol!(Semicolon),
+            ':' => symbol!(Colon),
+            ',' => symbol!(Comma),
+            '+' => symbol!(Plus),
+            '-' => symbol!(Minus),
+            '/' => symbol!(Slash),
+            '*' => symbol!(Asterisk),
+            '.' => symbol!(Dot),
+
+            // multi-character symbols
+            '<' if self.peek_next_char()? == Some('=') => {
+                self.next_char()?;
+                symbol!(LessThanOrEqual)
+            }
+            '<' => symbol!(LessThan),
+
+            '>' if self.peek_next_char()? == Some('=') => {
+                self.next_char()?;
+                symbol!(GreaterThanOrEqual)
+            }
+            '>' => symbol!(GreaterThan),
+
+            '=' if self.peek_next_char()? == Some('=') => {
+                self.next_char()?;
+                symbol!(Equal)
+            }
+            '=' => symbol!(Assign),
+
+            '!' if self.peek_next_char()? == Some('=') => {
+                self.next_char()?;
+                symbol!(NotEqual)
+            }
+            '!' => symbol!(LogicalNot),
+
+            '&' if self.peek_next_char()? == Some('&') => {
+                self.next_char()?;
+                symbol!(LogicalAnd)
+            }
+            '|' if self.peek_next_char()? == Some('|') => {
+                self.next_char()?;
+                symbol!(LogicalOr)
+            }
+            _ => Err(TokenizerError::UnknownSymbolError(
+                first_symbol,
+                self.line,
+                self.column,
+            )),
+        }
+    }
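The guard ordering above is what gives two-character operators priority over their one-character prefixes: `<` only becomes LessThan after the `<=` arm has failed to see a following `=`. A sketch of a test pinning that down, written as if inside the test module further below (so `use super::*;` and anyhow's Result are in scope); it is not part of the patch:

    #[test]
    fn test_two_char_symbol_priority() -> Result<()> {
        // "<=" must come out as one LessThanOrEqual token, not LessThan + Assign.
        let mut tokenizer = Tokenizer::from(String::from("a<=b"));

        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Identifier(String::from("a")));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Symbol(Symbol::LessThanOrEqual));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Identifier(String::from("b")));

        Ok(())
    }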
+
+    /// Tokenizes a number literal
+    fn tokenize_number(&mut self, first_char: char) -> Result<Token, TokenizerError> {
+        let mut primary = String::with_capacity(16);
+        let mut decimal: Option<String> = None;
+        let mut reading_decimal = false;
+
+        let column = self.column;
+        let line = self.line;
+
+        primary.push(first_char);
+
+        while let Some(next_char) = self.peek_next_char()? {
+            if next_char.is_whitespace() {
+                break;
+            }
+
+            if next_char == '.' {
+                reading_decimal = true;
+                self.next_char()?;
+                continue;
+            }
+
+            // Stop when the number is followed by a symbol (like a semicolon or =)
+            if !next_char.is_numeric() {
+                break;
+            }
+
+            if reading_decimal {
+                decimal.get_or_insert_with(String::new).push(next_char);
+            } else {
+                primary.push(next_char);
+            }
+            self.next_char()?;
+        }
+
+        if let Some(decimal) = decimal {
+            Ok(Token::new(
+                TokenType::Number(Number::Decimal(
+                    primary
+                        .parse()
+                        .map_err(|e| TokenizerError::NumberParseError(e, line, column))?,
+                    decimal
+                        .parse()
+                        .map_err(|e| TokenizerError::NumberParseError(e, line, column))?,
+                )),
+                line,
+                column,
+            ))
+        } else {
+            Ok(Token::new(
+                TokenType::Number(Number::Integer(
+                    primary
+                        .parse()
+                        .map_err(|e| TokenizerError::NumberParseError(e, line, column))?,
+                )),
+                line,
+                column,
+            ))
+        }
+    }
+
+    /// Tokenizes a string literal
+    fn tokenize_string(&mut self, beginning_quote: char) -> Result<Token, TokenizerError> {
+        let mut buffer = String::with_capacity(16);
+
+        let column = self.column;
+        let line = self.line;
+
+        while let Some(next_char) = self.next_char()? {
+            if next_char == beginning_quote {
+                break;
+            }
+
+            buffer.push(next_char);
+        }
+
+        Ok(Token::new(TokenType::String(buffer), line, column))
+    }
+
+    /// Tokenizes a keyword or an identifier. Also handles boolean literals
+    fn tokenize_keyword_or_identifier(
+        &mut self,
+        first_char: char,
+    ) -> Result<Token, TokenizerError> {
+        /// Helper macro to create a keyword token
+        macro_rules! keyword {
+            ($keyword:ident) => {{
+                return Ok(Token::new(
+                    TokenType::Keyword(Keyword::$keyword),
+                    self.line,
+                    self.column,
+                ));
+            }};
+        }
+
+        /// Helper macro: true when the next character would end the current
+        /// word (whitespace, a symbol, or end of input)
+        macro_rules! next_ws {
+            () => {
+                !matches!(self.peek_next_char()?, Some(x) if x.is_alphanumeric())
+            };
+        }
+
+        let mut buffer = String::with_capacity(16);
+        let line = self.line;
+        let column = self.column;
+
+        let mut looped_char = Some(first_char);
+
+        while let Some(next_char) = looped_char {
+            if next_char.is_whitespace() {
+                break;
+            }
+
+            if !next_char.is_alphanumeric() {
+                break;
+            }
+            buffer.push(next_char);
+
+            match buffer.as_str() {
+                "let" if next_ws!() => keyword!(Let),
+                "fn" if next_ws!() => keyword!(Fn),
+                "if" if next_ws!() => keyword!(If),
+                "else" if next_ws!() => keyword!(Else),
+                "return" if next_ws!() => keyword!(Return),
+                "enum" if next_ws!() => keyword!(Enum),
+                "import" if next_ws!() => keyword!(Import),
+                "export" if next_ws!() => keyword!(Export),
+
+                // boolean literals
+                "true" if next_ws!() => {
+                    return Ok(Token::new(TokenType::Boolean(true), self.line, self.column))
+                }
+                "false" if next_ws!() => {
+                    return Ok(Token::new(
+                        TokenType::Boolean(false),
+                        self.line,
+                        self.column,
+                    ))
+                }
+                // keywords were checked first, so any other completed word
+                // is an identifier
+                val if next_ws!() => {
+                    return Ok(Token::new(
+                        TokenType::Identifier(val.to_string()),
+                        line,
+                        column,
+                    ));
+                }
+                _ => {}
+            }
+
+            looped_char = self.next_char()?;
+        }
+        Err(TokenizerError::UnknownKeywordOrIdentifierError(
+            buffer, line, column,
+        ))
+    }
+}
+
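One consequence of splitting a decimal into two integers is that leading zeros in the fractional part vanish when it is parsed: "10.05" and "10.5" both tokenize to Number(Decimal(10, 5)). A hypothetical test (not in the patch, same in-module assumptions as above) illustrating the collision:

    #[test]
    fn test_decimal_leading_zero_collision() -> Result<()> {
        // The fractional strings "05" and "5" both parse to the integer 5,
        // so these two distinct literals produce identical tokens.
        let mut a = Tokenizer::from(String::from("10.05"));
        let mut b = Tokenizer::from(String::from("10.5"));

        let token_a = a.next_token()?.unwrap();
        let token_b = b.next_token()?.unwrap();

        assert_eq!(token_a.token_type, TokenType::Number(Number::Decimal(10, 5)));
        assert_eq!(token_a.token_type, token_b.token_type);

        Ok(())
    }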
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use anyhow::Result;
+
+    const TEST_FILE: &str = "tests/file.stlg";
+
+    const TEST_STRING: &str = r#"
+    fn test() {
+        let x = 10;
+        return x + 2;
+    }
+    "#;
+
+    #[test]
+    fn test_tokenizer_from_path_ok() {
+        let tokenizer = Tokenizer::from_path(TEST_FILE);
+        assert!(tokenizer.is_ok());
+    }
+
+    #[test]
+    fn test_tokenizer_from_path_err() {
+        let tokenizer = Tokenizer::from_path("non_existent_file.stlg");
+        assert!(tokenizer.is_err());
+    }
+
+    #[test]
+    fn test_next_char() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());
+
+        let char = tokenizer.next_char()?;
+
+        assert_eq!(char, Some('\n'));
+        assert_eq!(tokenizer.line, 2);
+        assert_eq!(tokenizer.column, 1);
+
+        let mut tokenizer = Tokenizer::from(String::from("fn"));
+
+        let char = tokenizer.next_char()?;
+
+        assert_eq!(char, Some('f'));
+        assert_eq!(tokenizer.line, 1);
+        assert_eq!(tokenizer.column, 2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_peek_next_char() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());
+
+        let char = tokenizer.peek_next_char()?;
+
+        assert_eq!(char, Some('\n'));
+        assert_eq!(tokenizer.line, 1);
+        assert_eq!(tokenizer.column, 1);
+
+        let char = tokenizer.next_char()?;
+        assert_eq!(char, Some('\n'));
+        assert_eq!(tokenizer.line, 2);
+        assert_eq!(tokenizer.column, 1);
+
+        let char = tokenizer.peek_next_char()?;
+        assert_eq!(char, Some(' '));
+        assert_eq!(tokenizer.line, 2);
+        assert_eq!(tokenizer.column, 1);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_skip_line() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from(
+            r#"
+This is a skippable line"#,
+        ));
+
+        tokenizer.skip_line()?;
+
+        assert_eq!(tokenizer.line, 2);
+        assert_eq!(tokenizer.column, 1);
+
+        let next_char = tokenizer.next_char()?;
+        assert_eq!(next_char, Some('T'));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_integer() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from("10"));
+
+        let token = tokenizer.next_token()?.unwrap();
+
+        assert_eq!(token.token_type, TokenType::Number(Number::Integer(10)));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_decimal() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from("10.5"));
+
+        let token = tokenizer.next_token()?.unwrap();
+
+        assert_eq!(token.token_type, TokenType::Number(Number::Decimal(10, 5)));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_parse_number_with_symbol() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from("10;"));
+
+        let token = tokenizer.next_token()?.unwrap();
+
+        assert_eq!(token.token_type, TokenType::Number(Number::Integer(10)));
+
+        let next_char = tokenizer.next_char()?;
+
+        assert_eq!(next_char, Some(';'));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_string_parse() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from(r#""Hello, World!""#));
+
+        let token = tokenizer.next_token()?.unwrap();
+
+        assert_eq!(
+            token.token_type,
+            TokenType::String(String::from("Hello, World!"))
+        );
+
+        let mut tokenizer = Tokenizer::from(String::from(r#"'Hello, World!'"#));
+
+        let token = tokenizer.next_token()?.unwrap();
+
+        assert_eq!(
+            token.token_type,
+            TokenType::String(String::from("Hello, World!"))
+        );
+
+        Ok(())
+    }
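tokenize_string treats end of input as an implicit closing quote: there is no unterminated-string error and no escape-sequence handling. A sketch pinning down that current behavior (hypothetical test, not in the patch):

    #[test]
    fn test_unterminated_string_behavior() -> Result<()> {
        // The closing quote never arrives; tokenize_string stops at end of
        // input and still yields a String token with everything read so far.
        let mut tokenizer = Tokenizer::from(String::from("\"abc"));

        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::String(String::from("abc")));

        Ok(())
    }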
+
+    #[test]
+    fn test_symbol_parse() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from(
+            "! () [] {} , . ; : + - * / < > = != && || >= <=",
+        ));
+
+        let expected_tokens = vec![
+            TokenType::Symbol(Symbol::LogicalNot),
+            TokenType::Symbol(Symbol::LParen),
+            TokenType::Symbol(Symbol::RParen),
+            TokenType::Symbol(Symbol::LBracket),
+            TokenType::Symbol(Symbol::RBracket),
+            TokenType::Symbol(Symbol::LBrace),
+            TokenType::Symbol(Symbol::RBrace),
+            TokenType::Symbol(Symbol::Comma),
+            TokenType::Symbol(Symbol::Dot),
+            TokenType::Symbol(Symbol::Semicolon),
+            TokenType::Symbol(Symbol::Colon),
+            TokenType::Symbol(Symbol::Plus),
+            TokenType::Symbol(Symbol::Minus),
+            TokenType::Symbol(Symbol::Asterisk),
+            TokenType::Symbol(Symbol::Slash),
+            TokenType::Symbol(Symbol::LessThan),
+            TokenType::Symbol(Symbol::GreaterThan),
+            TokenType::Symbol(Symbol::Assign),
+            TokenType::Symbol(Symbol::NotEqual),
+            TokenType::Symbol(Symbol::LogicalAnd),
+            TokenType::Symbol(Symbol::LogicalOr),
+            TokenType::Symbol(Symbol::GreaterThanOrEqual),
+            TokenType::Symbol(Symbol::LessThanOrEqual),
+        ];
+
+        for expected_token in expected_tokens {
+            let token = tokenizer.next_token()?.unwrap();
+
+            assert_eq!(token.token_type, expected_token);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_keyword_parse() -> Result<()> {
+        let mut tokenizer =
+            Tokenizer::from(String::from("let fn if else return enum import export"));
+
+        let expected_tokens = vec![
+            TokenType::Keyword(Keyword::Let),
+            TokenType::Keyword(Keyword::Fn),
+            TokenType::Keyword(Keyword::If),
+            TokenType::Keyword(Keyword::Else),
+            TokenType::Keyword(Keyword::Return),
+            TokenType::Keyword(Keyword::Enum),
+            TokenType::Keyword(Keyword::Import),
+            TokenType::Keyword(Keyword::Export),
+        ];
+
+        for expected_token in expected_tokens {
+            let token = tokenizer.next_token()?.unwrap();
+
+            assert_eq!(token.token_type, expected_token);
+        }
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_identifier_parse() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from("fn test"));
+
+        let token = tokenizer.next_token()?.unwrap();
+        assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn));
+        let token = tokenizer.next_token()?.unwrap();
+        assert_eq!(
+            token.token_type,
+            TokenType::Identifier(String::from("test"))
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_boolean_parse() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(String::from("true false"));
+
+        let token = tokenizer.next_token()?.unwrap();
+        assert_eq!(token.token_type, TokenType::Boolean(true));
+        let token = tokenizer.next_token()?.unwrap();
+        assert_eq!(token.token_type, TokenType::Boolean(false));
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_full_source() -> Result<()> {
+        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());
+
+        let expected_tokens = vec![
+            TokenType::Keyword(Keyword::Fn),
+            TokenType::Identifier(String::from("test")),
+            TokenType::Symbol(Symbol::LParen),
+            TokenType::Symbol(Symbol::RParen),
+            TokenType::Symbol(Symbol::LBrace),
+            TokenType::Keyword(Keyword::Let),
+            TokenType::Identifier(String::from("x")),
+            TokenType::Symbol(Symbol::Assign),
+            TokenType::Number(Number::Integer(10)),
+            TokenType::Symbol(Symbol::Semicolon),
+            TokenType::Keyword(Keyword::Return),
+            TokenType::Identifier(String::from("x")),
+            TokenType::Symbol(Symbol::Plus),
+            TokenType::Number(Number::Integer(2)),
+            TokenType::Symbol(Symbol::Semicolon),
+            TokenType::Symbol(Symbol::RBrace),
+        ];
+
+        for expected_token in expected_tokens {
+            let token = tokenizer.next_token()?.unwrap();
+
+            assert_eq!(token.token_type, expected_token);
+        }
+
+        Ok(())
+    }
+}
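next_token signals end of input by emitting exactly one EOF token and returning None afterwards; that contract is what terminates the while let loop in main.rs. A hypothetical test (not in the patch, same in-module assumptions as the sketches above) pinning it down:

    #[test]
    fn test_eof_emitted_once() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::new());

        // First call on an empty stream: the single EOF marker...
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::EOF);

        // ...and every call after that yields None.
        assert!(tokenizer.next_token()?.is_none());

        Ok(())
    }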
diff --git a/src/tokenizer/token.rs b/src/tokenizer/token.rs
new file mode 100644
index 0000000..e7fe73c
--- /dev/null
+++ b/src/tokenizer/token.rs
@@ -0,0 +1,120 @@
+#[derive(Debug, PartialEq)]
+pub struct Token {
+    /// The type of the token
+    pub token_type: TokenType,
+    /// The line where the token was found
+    pub line: usize,
+    /// The column where the token was found
+    pub column: usize,
+}
+
+impl Token {
+    pub fn new(token_type: TokenType, line: usize, column: usize) -> Self {
+        Self {
+            token_type,
+            line,
+            column,
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Hash, Eq)]
+pub enum TokenType {
+    /// Represents a string token
+    String(String),
+    /// Represents a number token
+    Number(Number),
+    /// Represents a boolean token
+    Boolean(bool),
+    /// Represents a keyword token
+    Keyword(Keyword),
+    /// Represents an identifier token
+    Identifier(String),
+    /// Represents a symbol token
+    Symbol(Symbol),
+    /// Represents an end of file token
+    EOF,
+}
+
+#[derive(Debug, PartialEq, Hash, Eq)]
+pub enum Number {
+    /// Represents an integer number
+    Integer(u64),
+    /// Represents a decimal number stored as separate integer and
+    /// fractional parts, each a u64
+    Decimal(u64, u64),
+}
+
+#[derive(Debug, PartialEq, Hash, Eq)]
+pub enum Symbol {
+    // Single Character Symbols
+    /// Represents the `(` symbol
+    LParen,
+    /// Represents the `)` symbol
+    RParen,
+    /// Represents the `{` symbol
+    LBrace,
+    /// Represents the `}` symbol
+    RBrace,
+    /// Represents the `[` symbol
+    LBracket,
+    /// Represents the `]` symbol
+    RBracket,
+    /// Represents the `;` symbol
+    Semicolon,
+    /// Represents the `:` symbol
+    Colon,
+    /// Represents the `,` symbol
+    Comma,
+    /// Represents the `+` symbol
+    Plus,
+    /// Represents the `-` symbol
+    Minus,
+    /// Represents the `*` symbol
+    Asterisk,
+    /// Represents the `/` symbol
+    Slash,
+    /// Represents the `<` symbol
+    LessThan,
+    /// Represents the `>` symbol
+    GreaterThan,
+    /// Represents the `=` symbol
+    Assign,
+    /// Represents the `!` symbol
+    LogicalNot,
+    /// Represents the `.` symbol
+    Dot,
+
+    // Double Character Symbols
+    /// Represents the `==` symbol
+    Equal,
+    /// Represents the `!=` symbol
+    NotEqual,
+    /// Represents the `&&` symbol
+    LogicalAnd,
+    /// Represents the `||` symbol
+    LogicalOr,
+    /// Represents the `<=` symbol
+    LessThanOrEqual,
+    /// Represents the `>=` symbol
+    GreaterThanOrEqual,
+}
+
+#[derive(Debug, PartialEq, Hash, Eq)]
+pub enum Keyword {
+    /// Represents the `let` keyword
+    Let,
+    /// Represents the `fn` keyword
+    Fn,
+    /// Represents the `if` keyword
+    If,
+    /// Represents the `else` keyword
+    Else,
+    /// Represents the `return` keyword
+    Return,
+    /// Represents the `enum` keyword
+    Enum,
+    /// Represents the `import` keyword
+    Import,
+    /// Represents the `export` keyword
+    Export,
+}
diff --git a/tests/file.stlg b/tests/file.stlg
new file mode 100644
index 0000000..05b01a5
--- /dev/null
+++ b/tests/file.stlg
@@ -0,0 +1,3 @@
+export fn doThings() {
+    power.myPowerItem(12.45 + 5);
+}
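As a cross-check of the rules above, tests/file.stlg should produce the stream below: Export and Fn keywords, identifiers on either side of a Dot, and 12.45 split into Decimal(12, 45). This is a hypothetical integration test written against the existing API inside the test module (so TEST_FILE, super::*, and anyhow's Result are in scope); it is not part of the patch:

    #[test]
    fn test_sample_file_stream() -> Result<()> {
        let mut tokenizer = Tokenizer::from_path(TEST_FILE)?;

        let expected_tokens = vec![
            TokenType::Keyword(Keyword::Export),
            TokenType::Keyword(Keyword::Fn),
            TokenType::Identifier(String::from("doThings")),
            TokenType::Symbol(Symbol::LParen),
            TokenType::Symbol(Symbol::RParen),
            TokenType::Symbol(Symbol::LBrace),
            TokenType::Identifier(String::from("power")),
            TokenType::Symbol(Symbol::Dot),
            TokenType::Identifier(String::from("myPowerItem")),
            TokenType::Symbol(Symbol::LParen),
            // 12.45 is split into integer and fractional parts
            TokenType::Number(Number::Decimal(12, 45)),
            TokenType::Symbol(Symbol::Plus),
            TokenType::Number(Number::Integer(5)),
            TokenType::Symbol(Symbol::RParen),
            TokenType::Symbol(Symbol::Semicolon),
            TokenType::Symbol(Symbol::RBrace),
            TokenType::EOF,
        ];

        for expected_token in expected_tokens {
            let token = tokenizer.next_token()?.unwrap();
            assert_eq!(token.token_type, expected_token);
        }

        Ok(())
    }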