tokenizer
.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
/target
Cargo.lock (generated, new file, 265 lines)
@@ -0,0 +1,265 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "anstream"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
dependencies = [
 "anstyle",
 "anstyle-parse",
 "anstyle-query",
 "anstyle-wincon",
 "colorchoice",
 "is_terminal_polyfill",
 "utf8parse",
]

[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"

[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
 "utf8parse",
]

[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
 "windows-sys",
]

[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
 "anstyle",
 "windows-sys",
]

[[package]]
name = "anyhow"
version = "1.0.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775"

[[package]]
name = "clap"
version = "4.5.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f"
dependencies = [
 "clap_builder",
 "clap_derive",
]

[[package]]
name = "clap_builder"
version = "4.5.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec"
dependencies = [
 "anstream",
 "anstyle",
 "clap_lex",
 "strsim",
]

[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
 "heck",
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "clap_lex"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7"

[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"

[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"

[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"

[[package]]
name = "proc-macro2"
version = "1.0.89"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
dependencies = [
 "unicode-ident",
]

[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
 "proc-macro2",
]

[[package]]
name = "stationlang"
version = "0.1.0"
dependencies = [
 "anyhow",
 "clap",
 "thiserror",
]

[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"

[[package]]
name = "syn"
version = "2.0.87"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
]

[[package]]
name = "thiserror"
version = "2.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa"
dependencies = [
 "thiserror-impl",
]

[[package]]
name = "thiserror-impl"
version = "2.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568"
dependencies = [
 "proc-macro2",
 "quote",
 "syn",
]

[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"

[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"

[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
 "windows-targets",
]

[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
 "windows_aarch64_gnullvm",
 "windows_aarch64_msvc",
 "windows_i686_gnu",
 "windows_i686_gnullvm",
 "windows_i686_msvc",
 "windows_x86_64_gnu",
 "windows_x86_64_gnullvm",
 "windows_x86_64_msvc",
]

[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"

[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"

[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"

[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"

[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"

[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"

[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"

[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
Cargo.toml (new file, 14 lines)
@@ -0,0 +1,14 @@
[package]
name = "stationlang"
version = "0.1.0"
edition = "2021"

[profile.dev]
panic = "unwind"

[dependencies]
clap = { version = "^4.5", features = ["derive"] }
thiserror = { version = "^2.0" }

[dev-dependencies]
anyhow = { version = "^1.0" }
src/main.rs (new file, 43 lines)
@@ -0,0 +1,43 @@
mod tokenizer;

use clap::Parser;
use tokenizer::{Tokenizer, TokenizerError};

#[derive(Debug, thiserror::Error)]
enum StationlangError {
    #[error("{0}")]
    TokenizerError(#[from] TokenizerError),
}

#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
struct Args {
    /// What file should be compiled
    #[arg(short, long)]
    input_file: String,
    /// The default stack size for the program
    #[arg(short, long, default_value_t = 512)]
    stack_size: usize,
    /// The output file for the compiled program. If not set, output will go to stdout
    #[arg(short, long)]
    output_file: Option<String>,
}
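
// NOTE: `stack_size` and `output_file` are parsed but not consumed anywhere yet;
// this commit only wires the tokenizer up and prints the tokens.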

fn run_logic() -> Result<(), StationlangError> {
    let args = Args::parse();
    let input_file = args.input_file;

    let mut tokenizer = Tokenizer::from_path(&input_file)?;

    while let Some(token) = tokenizer.next_token()? {
        println!("{:?}", token);
    }

    Ok(())
}

fn main() {
    if let Err(e) = run_logic() {
        eprintln!("\n\n{}", e);
    }
}
src/tokenizer/mod.rs (new file, 671 lines)
@@ -0,0 +1,671 @@
mod token;

use std::{
    fs::File,
    io::{BufReader, Cursor, Read, Seek, SeekFrom},
    path::PathBuf,
};
use thiserror::Error;
use token::{Keyword, Number, Symbol, Token, TokenType};

#[derive(Error, Debug)]
pub enum TokenizerError {
    #[error("IO Error: {0}")]
    IOError(#[from] std::io::Error),
    #[error("Number Parse Error \"{0}\"\nLine: {1}, Column: {2}")]
    NumberParseError(std::num::ParseIntError, usize, usize),
    #[error("Unknown Symbol \"{0}\"\nLine: {1}, Column: {2}")]
    UnknownSymbolError(char, usize, usize),
    #[error("Unknown Keyword or Identifier \"{0}\"\nLine: {1}, Column: {2}")]
    UnknownKeywordOrIdentifierError(String, usize, usize),
}

pub(crate) struct Tokenizer<T>
where
    T: Read + Seek,
{
    reader: BufReader<T>,
    char_buffer: [u8; 1],
    line: usize,
    column: usize,
    returned_eof: bool,
}

impl From<String> for Tokenizer<Cursor<Vec<u8>>> {
    fn from(input: String) -> Self {
        let cursor = Cursor::new(input.into_bytes());
        let reader = BufReader::new(cursor);

        Self {
            reader,
            line: 1,
            column: 1,
            char_buffer: [0],
            returned_eof: false,
        }
    }
}

impl Tokenizer<File> {
    pub fn from_path(input_file: impl Into<PathBuf>) -> Result<Self, TokenizerError> {
        let file = std::fs::File::open(input_file.into())?;
        let reader = BufReader::new(file);

        Ok(Self {
            reader,
            line: 1,
            column: 1,
            char_buffer: [0],
            returned_eof: false,
        })
    }
}

impl<T> Tokenizer<T>
where
    T: Read + Seek,
{
    /// Reads and consumes the next character in the stream
    /// If there are no more characters in the stream, this function returns None
    /// If there is an error reading the stream, this function returns an error
    ///
    /// # Important
    /// This function will increment the line and column counters
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let bytes_read = self.reader.read(&mut self.char_buffer)?;

        if bytes_read == 0 {
            return Ok(None);
        }

        // The buffer holds exactly one byte since it is initialized with a size of 1;
        // note that casting that byte to `char` is only correct for ASCII input
        let c = self.char_buffer[0] as char;
        if c == '\n' {
            self.line += 1;
            self.column = 1;
        } else {
            self.column += 1;
        }

        Ok(Some(c))
    }

    /// Peeks the next character in the stream without consuming it
    ///
    /// # Important
    /// This does not increment the line or column counters
    fn peek_next_char(&mut self) -> Result<Option<char>, TokenizerError> {
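        // Remember the current position so the probing read below can be undone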
        let current_pos = self.reader.stream_position()?;

        let to_return = if self.reader.read(&mut self.char_buffer)? == 0 {
            None
        } else {
            self.reader.seek(SeekFrom::Start(current_pos))?;

            // The buffer holds exactly one byte; see `next_char` for the ASCII caveat
            Some(self.char_buffer[0] as char)
        };

        Ok(to_return)
    }

    /// Skips the current line in the stream.
    /// Useful for skipping comments or empty lines
    ///
    /// # Important
    /// This function will increment the line and column counters
    fn skip_line(&mut self) -> Result<(), TokenizerError> {
        while let Some(next_char) = self.next_char()? {
            if next_char == '\n' {
                break;
            }
        }
        Ok(())
    }

    pub fn next_token(&mut self) -> Result<Option<Token>, TokenizerError> {
        while let Some(next_char) = self.next_char()? {
            // skip whitespace
            if next_char.is_whitespace() {
                continue;
            }
            // skip comments
            if next_char == '/' && self.peek_next_char()? == Some('/') {
                self.skip_line()?;
                continue;
            }

            match next_char {
                // numbers
                '0'..='9' => {
                    return self.tokenize_number(next_char).map(Some);
                }
                // strings
                '"' | '\'' => return self.tokenize_string(next_char).map(Some),
                // symbols excluding `"` and `'`
                char if !char.is_alphanumeric() && char != '"' && char != '\'' => {
                    return self.tokenize_symbol(next_char).map(Some)
                }
                // keywords and identifiers
                char if char.is_alphabetic() => {
                    return self.tokenize_keyword_or_identifier(next_char).map(Some)
                }
                _ => {
                    return Err(TokenizerError::UnknownSymbolError(
                        next_char,
                        self.line,
                        self.column,
                    ))
                }
            }
        }
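        // The stream is exhausted; emit a single EOF token the first time, then None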
        if self.returned_eof {
            Ok(None)
        } else {
            self.returned_eof = true;
            Ok(Some(Token::new(TokenType::EOF, self.line, self.column)))
        }
    }

    /// Tokenizes a symbol
    fn tokenize_symbol(&mut self, first_symbol: char) -> Result<Token, TokenizerError> {
        /// Helper macro to create a symbol token
        macro_rules! symbol {
            ($symbol:ident) => {
                Ok(Token::new(
                    TokenType::Symbol(Symbol::$symbol),
                    self.line,
                    self.column,
                ))
            };
        }

        match first_symbol {
            // single character symbols
            '(' => symbol!(LParen),
            ')' => symbol!(RParen),
            '{' => symbol!(LBrace),
            '}' => symbol!(RBrace),
            '[' => symbol!(LBracket),
            ']' => symbol!(RBracket),
            ';' => symbol!(Semicolon),
            ':' => symbol!(Colon),
            ',' => symbol!(Comma),
            '+' => symbol!(Plus),
            '-' => symbol!(Minus),
            '/' => symbol!(Slash),
            '*' => symbol!(Asterisk),
            '.' => symbol!(Dot),

            // multi-character symbols
            '<' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(LessThanOrEqual)
            }
            '<' => symbol!(LessThan),

            '>' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(GreaterThanOrEqual)
            }
            '>' => symbol!(GreaterThan),

            '=' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(Equal)
            }
            '=' => symbol!(Assign),

            '!' if self.peek_next_char()? == Some('=') => {
                self.next_char()?;
                symbol!(NotEqual)
            }
            '!' => symbol!(LogicalNot),

            '&' if self.peek_next_char()? == Some('&') => {
                self.next_char()?;
                symbol!(LogicalAnd)
            }
            '|' if self.peek_next_char()? == Some('|') => {
                self.next_char()?;
                symbol!(LogicalOr)
            }
            _ => Err(TokenizerError::UnknownSymbolError(
                first_symbol,
                self.line,
                self.column,
            )),
        }
    }

    /// Tokenizes a number literal
    fn tokenize_number(&mut self, first_char: char) -> Result<Token, TokenizerError> {
        let mut primary = String::with_capacity(16);
        let mut decimal: Option<String> = None;
        let mut reading_decimal = false;

        let column = self.column;
        let line = self.line;

        primary.push(first_char);

        while let Some(next_char) = self.peek_next_char()? {
            if next_char.is_whitespace() {
                break;
            }

            if next_char == '.' {
                reading_decimal = true;
                self.next_char()?;
                continue;
            }

            // This is for the times when we have a number followed by a symbol (like a semicolon or =)
            if !next_char.is_numeric() {
                break;
            }

            if reading_decimal {
                decimal.get_or_insert_with(String::new).push(next_char);
            } else {
                primary.push(next_char);
            }
            self.next_char()?;
        }
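        // Note: the fractional digits are parsed as a plain integer, so leading zeros
        // in the fraction are lost ("1.05" and "1.5" both become Decimal(1, 5))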

        if let Some(decimal) = decimal {
            Ok(Token::new(
                TokenType::Number(Number::Decimal(
                    primary
                        .parse()
                        .map_err(|e| TokenizerError::NumberParseError(e, line, column))?,
                    decimal
                        .parse()
                        .map_err(|e| TokenizerError::NumberParseError(e, line, column))?,
                )),
                line,
                column,
            ))
        } else {
            Ok(Token::new(
                TokenType::Number(Number::Integer(
                    primary
                        .parse()
                        .map_err(|e| TokenizerError::NumberParseError(e, line, column))?,
                )),
                line,
                column,
            ))
        }
    }

    /// Tokenizes a string literal
    fn tokenize_string(&mut self, beginning_quote: char) -> Result<Token, TokenizerError> {
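        // Read until the matching closing quote; escape sequences are not handled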
        let mut buffer = String::with_capacity(16);

        let column = self.column;
        let line = self.line;

        while let Some(next_char) = self.next_char()? {
            if next_char == beginning_quote {
                break;
            }

            buffer.push(next_char);
        }

        Ok(Token::new(TokenType::String(buffer), line, column))
    }

    /// Tokenizes a keyword or an identifier. Also handles boolean literals
    fn tokenize_keyword_or_identifier(
        &mut self,
        first_char: char,
    ) -> Result<Token, TokenizerError> {
        macro_rules! keyword {
            ($keyword:ident) => {{
                return Ok(Token::new(
                    TokenType::Keyword(Keyword::$keyword),
                    self.line,
                    self.column,
                ));
            }};
        }

        /// Helper macro to check whether the next character ends the current word:
        /// whitespace, a non-alphanumeric symbol, or end of input
        macro_rules! next_ws {
            () => {
                self.peek_next_char()?
                    .map_or(true, |x| x.is_whitespace() || !x.is_alphanumeric())
            };
        }

        let mut buffer = String::with_capacity(16);
        let line = self.line;
        let column = self.column;

        let mut looped_char = Some(first_char);

        while let Some(next_char) = looped_char {
            if next_char.is_whitespace() {
                break;
            }

            if !next_char.is_alphanumeric() {
                break;
            }
            buffer.push(next_char);
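
            // Check after every appended character whether the buffer now spells a
            // complete keyword or literal that ends at a word boundary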
            match buffer.as_str() {
                "let" if next_ws!() => keyword!(Let),
                "fn" if next_ws!() => keyword!(Fn),
                "if" if next_ws!() => keyword!(If),
                "else" if next_ws!() => keyword!(Else),
                "return" if next_ws!() => keyword!(Return),
                "enum" if next_ws!() => keyword!(Enum),
                "import" if next_ws!() => keyword!(Import),
                "export" if next_ws!() => keyword!(Export),

                // boolean literals
                "true" if next_ws!() => {
                    return Ok(Token::new(TokenType::Boolean(true), self.line, self.column))
                }
                "false" if next_ws!() => {
                    return Ok(Token::new(
                        TokenType::Boolean(false),
                        self.line,
                        self.column,
                    ))
                }
                // if the next character is whitespace or not alphanumeric, then we have an identifier
                // this is because keywords are checked first
                val if next_ws!() => {
                    return Ok(Token::new(
                        TokenType::Identifier(val.to_string()),
                        line,
                        column,
                    ));
                }
                _ => {}
            }

            looped_char = self.next_char()?;
        }
        Err(TokenizerError::UnknownKeywordOrIdentifierError(
            buffer, line, column,
        ))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;

    const TEST_FILE: &str = "tests/file.stlg";

    const TEST_STRING: &str = r#"
    fn test() {
        let x = 10;
        return x + 2;
    }
    "#;

    #[test]
    fn test_tokenizer_from_path_ok() {
        let tokenizer = Tokenizer::from_path(TEST_FILE);
        assert!(tokenizer.is_ok());
    }

    #[test]
    fn test_tokenizer_from_path_err() {
        let tokenizer = Tokenizer::from_path("non_existent_file.stlg");
        assert!(tokenizer.is_err());
    }

    #[test]
    fn test_next_char() -> Result<()> {
        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());

        let char = tokenizer.next_char()?;

        assert_eq!(char, Some('\n'));
        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        let mut tokenizer = Tokenizer::from(String::from("fn"));

        let char = tokenizer.next_char()?;

        assert_eq!(char, Some('f'));
        assert_eq!(tokenizer.line, 1);
        assert_eq!(tokenizer.column, 2);

        Ok(())
    }

    #[test]
    fn test_peek_next_char() -> Result<()> {
        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());

        let char = tokenizer.peek_next_char()?;

        assert_eq!(char, Some('\n'));
        assert_eq!(tokenizer.line, 1);
        assert_eq!(tokenizer.column, 1);

        let char = tokenizer.next_char()?;
        assert_eq!(char, Some('\n'));
        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        let char = tokenizer.peek_next_char()?;
        assert_eq!(char, Some(' '));
        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        Ok(())
    }

    #[test]
    fn test_skip_line() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from(
            r#"
This is a skippable line"#,
        ));

        tokenizer.skip_line()?;

        assert_eq!(tokenizer.line, 2);
        assert_eq!(tokenizer.column, 1);

        let next_char = tokenizer.next_char()?;
        assert_eq!(next_char, Some('T'));

        Ok(())
    }

    #[test]
    fn test_parse_integer() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("10"));

        let token = tokenizer.next_token()?.unwrap();

        assert_eq!(token.token_type, TokenType::Number(Number::Integer(10)));

        Ok(())
    }

    #[test]
    fn test_parse_decimal() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("10.5"));

        let token = tokenizer.next_token()?.unwrap();

        assert_eq!(token.token_type, TokenType::Number(Number::Decimal(10, 5)));

        Ok(())
    }

    #[test]
    fn test_parse_number_with_symbol() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("10;"));

        let token = tokenizer.next_token()?.unwrap();

        assert_eq!(token.token_type, TokenType::Number(Number::Integer(10)));

        let next_char = tokenizer.next_char()?;

        assert_eq!(next_char, Some(';'));

        Ok(())
    }

    #[test]
    fn test_string_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from(r#""Hello, World!""#));

        let token = tokenizer.next_token()?.unwrap();

        assert_eq!(
            token.token_type,
            TokenType::String(String::from("Hello, World!"))
        );

        let mut tokenizer = Tokenizer::from(String::from(r#"'Hello, World!'"#));

        let token = tokenizer.next_token()?.unwrap();

        assert_eq!(
            token.token_type,
            TokenType::String(String::from("Hello, World!"))
        );

        Ok(())
    }

    #[test]
    fn test_symbol_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from(
            "! () [] {} , . ; : + - * / < > = != && || >= <=",
        ));

        let expected_tokens = vec![
            TokenType::Symbol(Symbol::LogicalNot),
            TokenType::Symbol(Symbol::LParen),
            TokenType::Symbol(Symbol::RParen),
            TokenType::Symbol(Symbol::LBracket),
            TokenType::Symbol(Symbol::RBracket),
            TokenType::Symbol(Symbol::LBrace),
            TokenType::Symbol(Symbol::RBrace),
            TokenType::Symbol(Symbol::Comma),
            TokenType::Symbol(Symbol::Dot),
            TokenType::Symbol(Symbol::Semicolon),
            TokenType::Symbol(Symbol::Colon),
            TokenType::Symbol(Symbol::Plus),
            TokenType::Symbol(Symbol::Minus),
            TokenType::Symbol(Symbol::Asterisk),
            TokenType::Symbol(Symbol::Slash),
            TokenType::Symbol(Symbol::LessThan),
            TokenType::Symbol(Symbol::GreaterThan),
            TokenType::Symbol(Symbol::Assign),
            TokenType::Symbol(Symbol::NotEqual),
            TokenType::Symbol(Symbol::LogicalAnd),
            TokenType::Symbol(Symbol::LogicalOr),
            TokenType::Symbol(Symbol::GreaterThanOrEqual),
            TokenType::Symbol(Symbol::LessThanOrEqual),
        ];

        for expected_token in expected_tokens {
            let token = tokenizer.next_token()?.unwrap();

            assert_eq!(token.token_type, expected_token);
        }

        Ok(())
    }

    #[test]
    fn test_keyword_parse() -> Result<()> {
        let mut tokenizer =
            Tokenizer::from(String::from("let fn if else return enum import export"));

        let expected_tokens = vec![
            TokenType::Keyword(Keyword::Let),
            TokenType::Keyword(Keyword::Fn),
            TokenType::Keyword(Keyword::If),
            TokenType::Keyword(Keyword::Else),
            TokenType::Keyword(Keyword::Return),
            TokenType::Keyword(Keyword::Enum),
            TokenType::Keyword(Keyword::Import),
            TokenType::Keyword(Keyword::Export),
        ];

        for expected_token in expected_tokens {
            let token = tokenizer.next_token()?.unwrap();

            assert_eq!(token.token_type, expected_token);
        }

        Ok(())
    }

    #[test]
    fn test_identifier_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("fn test"));

        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(
            token.token_type,
            TokenType::Identifier(String::from("test"))
        );

        Ok(())
    }

    #[test]
    fn test_boolean_parse() -> Result<()> {
        let mut tokenizer = Tokenizer::from(String::from("true false"));

        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Boolean(true));
        let token = tokenizer.next_token()?.unwrap();
        assert_eq!(token.token_type, TokenType::Boolean(false));

        Ok(())
    }

    #[test]
    fn test_full_source() -> Result<()> {
        let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned());

        let expected_tokens = vec![
            TokenType::Keyword(Keyword::Fn),
            TokenType::Identifier(String::from("test")),
            TokenType::Symbol(Symbol::LParen),
            TokenType::Symbol(Symbol::RParen),
            TokenType::Symbol(Symbol::LBrace),
            TokenType::Keyword(Keyword::Let),
            TokenType::Identifier(String::from("x")),
            TokenType::Symbol(Symbol::Assign),
            TokenType::Number(Number::Integer(10)),
            TokenType::Symbol(Symbol::Semicolon),
            TokenType::Keyword(Keyword::Return),
            TokenType::Identifier(String::from("x")),
            TokenType::Symbol(Symbol::Plus),
            TokenType::Number(Number::Integer(2)),
            TokenType::Symbol(Symbol::Semicolon),
            TokenType::Symbol(Symbol::RBrace),
        ];

        for expected_token in expected_tokens {
            let token = tokenizer.next_token()?.unwrap();

            assert_eq!(token.token_type, expected_token);
        }

        Ok(())
    }
}
src/tokenizer/token.rs (new file, 120 lines)
@@ -0,0 +1,120 @@
#[derive(Debug, PartialEq)]
pub struct Token {
    /// The type of the token
    pub token_type: TokenType,
    /// The line where the token was found
    pub line: usize,
    /// The column where the token was found
    pub column: usize,
}

impl Token {
    pub fn new(token_type: TokenType, line: usize, column: usize) -> Self {
        Self {
            token_type,
            line,
            column,
        }
    }
}

#[derive(Debug, PartialEq, Hash, Eq)]
pub enum TokenType {
    /// Represents a string token
    String(String),
    /// Represents a number token
    Number(Number),
    /// Represents a boolean token
    Boolean(bool),
    /// Represents a keyword token
    Keyword(Keyword),
    /// Represents an identifier token
    Identifier(String),
    /// Represents a symbol token
    Symbol(Symbol),
    /// Represents an end of file token
    EOF,
}

#[derive(Debug, PartialEq, Hash, Eq)]
pub enum Number {
    /// Represents an integer number
    Integer(u64),
    /// Represents a decimal number stored as its integer part and its fractional
    /// digits, each held in a 64-bit integer
    Decimal(u64, u64),
}

#[derive(Debug, PartialEq, Hash, Eq)]
pub enum Symbol {
    // Single Character Symbols
    /// Represents the `(` symbol
    LParen,
    /// Represents the `)` symbol
    RParen,
    /// Represents the `{` symbol
    LBrace,
    /// Represents the `}` symbol
    RBrace,
    /// Represents the `[` symbol
    LBracket,
    /// Represents the `]` symbol
    RBracket,
    /// Represents the `;` symbol
    Semicolon,
    /// Represents the `:` symbol
    Colon,
    /// Represents the `,` symbol
    Comma,
    /// Represents the `+` symbol
    Plus,
    /// Represents the `-` symbol
    Minus,
    /// Represents the `*` symbol
    Asterisk,
    /// Represents the `/` symbol
    Slash,
    /// Represents the `<` symbol
    LessThan,
    /// Represents the `>` symbol
    GreaterThan,
    /// Represents the `=` symbol
    Assign,
    /// Represents the `!` symbol
    LogicalNot,
    /// Represents the `.` symbol
    Dot,

    // Double Character Symbols
    /// Represents the `==` symbol
    Equal,
    /// Represents the `!=` symbol
    NotEqual,
    /// Represents the `&&` symbol
    LogicalAnd,
    /// Represents the `||` symbol
    LogicalOr,
    /// Represents the `<=` symbol
    LessThanOrEqual,
    /// Represents the `>=` symbol
    GreaterThanOrEqual,
}

#[derive(Debug, PartialEq, Hash, Eq)]
pub enum Keyword {
    /// Represents the `let` keyword
    Let,
    /// Represents the `fn` keyword
    Fn,
    /// Represents the `if` keyword
    If,
    /// Represents the `else` keyword
    Else,
    /// Represents the `return` keyword
    Return,
    /// Represents the `enum` keyword
    Enum,
    /// Represents the `import` keyword
    Import,
    /// Represents the `export` keyword
    Export,
}
tests/file.stlg (new file, 3 lines)
@@ -0,0 +1,3 @@
export fn doThings() {
    power.myPowerItem(12.45 + 5);
}