From 115a57128caa81e4e8698e282a0132052461913c Mon Sep 17 00:00:00 2001 From: Devin Bidwell Date: Mon, 8 Dec 2025 22:50:20 -0700 Subject: [PATCH] Before error type refactor --- rust_compiler/Cargo.lock | 22 + rust_compiler/libs/parser/Cargo.toml | 1 + rust_compiler/libs/parser/src/lib.rs | 7 +- rust_compiler/libs/tokenizer/Cargo.toml | 1 + rust_compiler/libs/tokenizer/src/lib.rs | 927 +--------------------- rust_compiler/libs/tokenizer/src/token.rs | 174 ++-- 6 files changed, 195 insertions(+), 937 deletions(-) diff --git a/rust_compiler/Cargo.lock b/rust_compiler/Cargo.lock index d5e79e8..c255c15 100644 --- a/rust_compiler/Cargo.lock +++ b/rust_compiler/Cargo.lock @@ -572,6 +572,7 @@ dependencies = [ "lsp-types", "pretty_assertions", "quick-error", + "thiserror", "tokenizer", ] @@ -998,6 +999,26 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "thiserror" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.111", +] + [[package]] name = "tinyvec" version = "1.10.0" @@ -1023,6 +1044,7 @@ dependencies = [ "lsp-types", "quick-error", "rust_decimal", + "thiserror", ] [[package]] diff --git a/rust_compiler/libs/parser/Cargo.toml b/rust_compiler/libs/parser/Cargo.toml index 336b498..e3c304b 100644 --- a/rust_compiler/libs/parser/Cargo.toml +++ b/rust_compiler/libs/parser/Cargo.toml @@ -8,6 +8,7 @@ quick-error = { workspace = true } tokenizer = { path = "../tokenizer" } helpers = { path = "../helpers" } lsp-types = { workspace = true } +thiserror = "2" [dev-dependencies] diff --git a/rust_compiler/libs/parser/src/lib.rs b/rust_compiler/libs/parser/src/lib.rs index 2011ebb..f00150d 100644 --- a/rust_compiler/libs/parser/src/lib.rs +++ b/rust_compiler/libs/parser/src/lib.rs @@ -111,7 +111,7 @@ macro_rules! self_matches_current { pub struct Parser<'a> { tokenizer: TokenizerBuffer<'a>, - current_token: Option, + current_token: Option>, pub errors: Vec, } @@ -126,12 +126,11 @@ impl<'a> Parser<'a> { /// Calculates a Span from a given Token reference. fn token_to_span(t: &Token) -> Span { - let len = t.original_string.as_ref().map(|s| s.len()).unwrap_or(0); Span { start_line: t.line, - start_col: t.column, + start_col: t.span.start, end_line: t.line, - end_col: t.column + len, + end_col: t.span.end, } } diff --git a/rust_compiler/libs/tokenizer/Cargo.toml b/rust_compiler/libs/tokenizer/Cargo.toml index 37b5611..9d50066 100644 --- a/rust_compiler/libs/tokenizer/Cargo.toml +++ b/rust_compiler/libs/tokenizer/Cargo.toml @@ -9,6 +9,7 @@ quick-error = { workspace = true } lsp-types = { workspace = true } helpers = { path = "../helpers" } logos = "0.16" +thiserror = "2" [dev-dependencies] anyhow = { version = "^1" } diff --git a/rust_compiler/libs/tokenizer/src/lib.rs b/rust_compiler/libs/tokenizer/src/lib.rs index 3d8dabb..44b2223 100644 --- a/rust_compiler/libs/tokenizer/src/lib.rs +++ b/rust_compiler/libs/tokenizer/src/lib.rs @@ -1,14 +1,15 @@ pub mod token; +use logos::{Lexer, Logos}; use quick_error::quick_error; -use rust_decimal::Decimal; use std::{ cmp::Ordering, collections::VecDeque, io::{BufReader, Cursor, Read, Seek, SeekFrom}, + iter::Peekable, path::PathBuf, }; -use token::{Keyword, Number, Symbol, Temperature, Token, TokenType}; +use token::{Token, TokenType}; quick_error! { #[derive(Debug)] @@ -18,19 +19,8 @@ quick_error! { display("IO Error: {}", err) source(err) } - NumberParseError(err: std::num::ParseIntError, line: usize, column: usize, original: String) { - display("Number Parse Error: {}", err) - source(err) - } - DecimalParseError(err: rust_decimal::Error, line: usize, column: usize, original: String) { - display("Decimal Parse Error: {}", err) - source(err) - } - UnknownSymbolError(char: char, line: usize, column: usize, original: String) { - display("Unknown Symbol: {}", char) - } - UnknownKeywordOrIdentifierError(val: String, line: usize, column: usize, original: String) { - display("Unknown Keyword or Identifier: {}", val) + LexError(err: token::LexError) { + from() } } } @@ -46,24 +36,7 @@ impl From for lsp_types::Diagnostic { severity: Some(DiagnosticSeverity::ERROR), ..Default::default() }, - NumberParseError(_, l, c, ref og) - | DecimalParseError(_, l, c, ref og) - | UnknownSymbolError(_, l, c, ref og) - | UnknownKeywordOrIdentifierError(_, l, c, ref og) => Diagnostic { - range: Range { - start: Position { - line: l as u32, - character: c as u32, - }, - end: Position { - line: l as u32, - character: (c + og.len()) as u32, - }, - }, - message: value.to_string(), - severity: Some(DiagnosticSeverity::ERROR), - ..Default::default() - }, + LexError(e) => e.into(), } } } @@ -73,452 +46,74 @@ pub trait Tokenize: Read + Seek {} impl Tokenize for T where T: Read + Seek {} pub struct Tokenizer<'a> { - reader: BufReader>, - char_buffer: [u8; 1], - line: usize, - column: usize, + lexer: Lexer<'a, TokenType<'a>>, returned_eof: bool, - string_buffer: String, -} - -impl<'a> Tokenizer<'a> { - pub fn from_path(input_file: impl Into) -> Result { - let file = std::fs::File::open(input_file.into())?; - let reader = BufReader::new(Box::new(file) as Box); - - Ok(Self { - reader, - line: 1, - column: 0, // Start at 0 so first char becomes 1 - char_buffer: [0], - returned_eof: false, - string_buffer: String::new(), - }) - } -} - -impl<'a> From for Tokenizer<'a> { - fn from(input: String) -> Self { - let reader = BufReader::new(Box::new(Cursor::new(input)) as Box); - - Self { - reader, - line: 1, - column: 0, - char_buffer: [0], - returned_eof: false, - string_buffer: String::new(), - } - } } impl<'a> From<&'a str> for Tokenizer<'a> { fn from(value: &'a str) -> Self { Self { - reader: BufReader::new(Box::new(Cursor::new(value)) as Box), - char_buffer: [0], - column: 0, - line: 1, + lexer: TokenType::lexer(value), returned_eof: false, - string_buffer: String::new(), } } } impl<'a> Tokenizer<'a> { - fn next_char(&mut self) -> Result, Error> { - let bytes_read = self.reader.read(&mut self.char_buffer)?; - - if bytes_read == 0 { - return Ok(None); - } - - let c = self.char_buffer[0] as char; - if c == '\n' { - self.line += 1; - self.column = 1; - } else { - self.column += 1; - } - - self.string_buffer.push(c); - Ok(Some(c)) + fn to_token(&mut self, t_type: TokenType<'a>) -> Token<'a> { + let mut span = self.lexer.span(); + span.start -= self.lexer.extras.line_start_index; + span.end -= self.lexer.extras.line_start_index; + Token::new(t_type, self.lexer.extras.line_count, span) } - fn peek_next_char(&mut self) -> Result, Error> { - let current_pos = self.reader.stream_position()?; - let to_return = if self.reader.read(&mut self.char_buffer)? == 0 { - None - } else { - self.reader.seek(SeekFrom::Start(current_pos))?; - Some(self.char_buffer[0] as char) - }; + pub fn next_token(&mut self) -> Result>, Error> { + let to_return = self + .lexer + .next() + .transpose() + .map(|t| t.map(|t| self.to_token(t)))?; + Ok(to_return) } - fn skip_line(&mut self) -> Result<(), Error> { - while let Some(next_char) = self.next_char()? { - if next_char == '\n' { - break; - } - } - Ok(()) - } - - pub fn next_token(&mut self) -> Result, Error> { - self.string_buffer.clear(); - - while let Some(next_char) = self.next_char()? { - if next_char.is_whitespace() { - self.string_buffer.clear(); - continue; - } - if next_char == '/' && self.peek_next_char()? == Some('/') { - self.skip_line()?; - self.string_buffer.clear(); - continue; - } - - // Capture start position before delegating - let start_line = self.line; - let start_col = self.column; - - match next_char { - '0'..='9' => { - return self - .tokenize_number(next_char, start_line, start_col) - .map(Some); - } - '"' | '\'' => { - return self - .tokenize_string(next_char, start_line, start_col) - .map(Some); - } - char if !char.is_alphanumeric() && char != '"' && char != '\'' => { - return self - .tokenize_symbol(next_char, start_line, start_col) - .map(Some); - } - char if char.is_alphabetic() || char == '_' => { - return self - .tokenize_keyword_or_identifier(next_char, start_line, start_col) - .map(Some); - } - _ => { - return Err(Error::UnknownSymbolError( - next_char, - start_line, - start_col, - std::mem::take(&mut self.string_buffer), - )); - } - } - } - if self.returned_eof { - Ok(None) - } else { - self.returned_eof = true; - Ok(Some(Token::new( - TokenType::EOF, - self.line, - self.column, - Some(std::mem::take(&mut self.string_buffer)), - ))) - } - } - - pub fn peek_next(&mut self) -> Result, Error> { - let current_pos = self.reader.stream_position()?; - let column = self.column; - let line = self.line; - let token = self.next_token()?; - self.reader.seek(SeekFrom::Start(current_pos))?; - self.column = column; - self.line = line; - Ok(token) - } - - // Updated helper functions to accept start_line and start_col - - fn tokenize_symbol( - &mut self, - first_symbol: char, - line: usize, - col: usize, - ) -> Result { - macro_rules! symbol { - ($symbol:ident) => { - Ok(Token::new( - TokenType::Symbol(Symbol::$symbol), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )) - }; - } - - match first_symbol { - '(' => symbol!(LParen), - ')' => symbol!(RParen), - '{' => symbol!(LBrace), - '}' => symbol!(RBrace), - '[' => symbol!(LBracket), - ']' => symbol!(RBracket), - ';' => symbol!(Semicolon), - ':' => symbol!(Colon), - ',' => symbol!(Comma), - '+' => symbol!(Plus), - '-' => symbol!(Minus), - '/' => symbol!(Slash), - '.' => symbol!(Dot), - '^' => symbol!(Caret), - '%' => symbol!(Percent), - '<' if self.peek_next_char()? == Some('=') => { - self.next_char()?; - symbol!(LessThanOrEqual) - } - '<' => symbol!(LessThan), - '>' if self.peek_next_char()? == Some('=') => { - self.next_char()?; - symbol!(GreaterThanOrEqual) - } - '>' => symbol!(GreaterThan), - '=' if self.peek_next_char()? == Some('=') => { - self.next_char()?; - symbol!(Equal) - } - '=' => symbol!(Assign), - '!' if self.peek_next_char()? == Some('=') => { - self.next_char()?; - symbol!(NotEqual) - } - '!' => symbol!(LogicalNot), - '*' if self.peek_next_char()? == Some('*') => { - self.next_char()?; - symbol!(Exp) - } - '*' => symbol!(Asterisk), - '&' if self.peek_next_char()? == Some('&') => { - self.next_char()?; - symbol!(LogicalAnd) - } - '|' if self.peek_next_char()? == Some('|') => { - self.next_char()?; - symbol!(LogicalOr) - } - _ => Err(Error::UnknownSymbolError( - first_symbol, - line, - col, - std::mem::take(&mut self.string_buffer), - )), - } - } - - fn tokenize_number( - &mut self, - first_char: char, - line: usize, - col: usize, - ) -> Result { - let mut primary = String::with_capacity(16); - let mut decimal: Option = None; - let mut reading_decimal = false; - primary.push(first_char); - - while let Some(next_char) = self.peek_next_char()? { - if next_char.is_whitespace() { - break; - } - if next_char == '.' { - reading_decimal = true; - self.next_char()?; - continue; - } - if next_char == '_' { - self.next_char()?; - continue; - } - if !next_char.is_numeric() { - break; - } - - if reading_decimal { - decimal.get_or_insert_with(String::new).push(next_char); - } else { - primary.push(next_char); - } - self.next_char()?; - } - - let number: Number = if let Some(decimal) = decimal { - let decimal_scale = decimal.len() as u32; - let number_str = format!("{}{}", primary, decimal); - let number = number_str.parse::().map_err(|e| { - Error::NumberParseError(e, line, col, std::mem::take(&mut self.string_buffer)) - })?; - Number::Decimal( - Decimal::try_from_i128_with_scale(number, decimal_scale).map_err(|e| { - Error::DecimalParseError(e, line, col, std::mem::take(&mut self.string_buffer)) - })?, - ) - } else { - Number::Integer(primary.parse().map_err(|e| { - Error::NumberParseError(e, line, col, std::mem::take(&mut self.string_buffer)) - })?) - }; - - if let Some(next_char) = self.peek_next_char()? { - let temperature = match next_char { - 'c' => Temperature::Celsius(number), - 'f' => Temperature::Fahrenheit(number), - 'k' => Temperature::Kelvin(number), - _ => { - return Ok(Token::new( - TokenType::Number(number), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )); - } - } - .to_kelvin(); - - self.next_char()?; - Ok(Token::new( - TokenType::Number(temperature), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )) - } else { - Ok(Token::new( - TokenType::Number(number), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )) - } - } - - fn tokenize_string( - &mut self, - beginning_quote: char, - line: usize, - col: usize, - ) -> Result { - let mut buffer = String::with_capacity(16); - while let Some(next_char) = self.next_char()? { - if next_char == beginning_quote { - break; - } - buffer.push(next_char); - } - Ok(Token::new( - TokenType::String(buffer), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )) - } - - fn tokenize_keyword_or_identifier( - &mut self, - first_char: char, - line: usize, - col: usize, - ) -> Result { - macro_rules! keyword { - ($keyword:ident) => {{ - return Ok(Token::new( - TokenType::Keyword(Keyword::$keyword), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )); - }}; - } - macro_rules! next_ws { - () => { matches!(self.peek_next_char()?, Some(x) if x.is_whitespace() || (!x.is_alphanumeric()) && x != '_') || self.peek_next_char()?.is_none() }; - } - - let mut buffer = String::with_capacity(16); - let mut looped_char = Some(first_char); - - while let Some(next_char) = looped_char { - // allow UNDERSCORE_IDENTS - if next_char.is_whitespace() || (!next_char.is_alphanumeric() && next_char != '_') { - break; - } - buffer.push(next_char); - - match buffer.as_str() { - "let" if next_ws!() => keyword!(Let), - "fn" if next_ws!() => keyword!(Fn), - "if" if next_ws!() => keyword!(If), - "else" if next_ws!() => keyword!(Else), - "return" if next_ws!() => keyword!(Return), - "enum" if next_ws!() => keyword!(Enum), - "device" if next_ws!() => keyword!(Device), - "loop" if next_ws!() => keyword!(Loop), - "break" if next_ws!() => keyword!(Break), - "while" if next_ws!() => keyword!(While), - "continue" if next_ws!() => keyword!(Continue), - "const" if next_ws!() => keyword!(Const), - "true" if next_ws!() => { - return Ok(Token::new( - TokenType::Boolean(true), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )); - } - "false" if next_ws!() => { - return Ok(Token::new( - TokenType::Boolean(false), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )); - } - val if next_ws!() => { - return Ok(Token::new( - TokenType::Identifier(val.to_string()), - line, - col, - Some(std::mem::take(&mut self.string_buffer)), - )); - } - _ => {} - } - looped_char = self.next_char()?; - } - Err(Error::UnknownKeywordOrIdentifierError( - buffer, - line, - col, - std::mem::take(&mut self.string_buffer), - )) + pub fn peek_next(&mut self) -> Result>, Error> { + todo!() } } // ... Iterator and TokenizerBuffer implementations remain unchanged ... // They just call the methods above which now use the passed-in start coordinates. impl<'a> Iterator for Tokenizer<'a> { - type Item = Result; + type Item = Result, Error>; fn next(&mut self) -> Option { - match self.next_token() { - Ok(Some(tok)) => Some(Ok(tok)), - Ok(None) => None, - Err(e) => Some(Err(e)), + match self.lexer.next() { + None => { + if self.returned_eof { + None + } else { + self.returned_eof = true; + Some(Ok(Token::new( + TokenType::EOF, + self.lexer.extras.line_count, + self.lexer.span(), + ))) + } + } + Some(t) => match t { + Err(e) => { + todo!() + } + Ok(t) => Some(Ok(self.to_token(t))), + }, } } } pub struct TokenizerBuffer<'a> { tokenizer: Tokenizer<'a>, - buffer: VecDeque, - history: VecDeque, + buffer: VecDeque>, + history: VecDeque>, index: i64, } @@ -601,437 +196,3 @@ impl<'a> TokenizerBuffer<'a> { Ok(()) } } - -#[cfg(test)] -mod tests { - use super::*; - use anyhow::Result; - use rust_decimal::Decimal; - - const TEST_FILE: &str = "tests/file.stlg"; - - const TEST_STRING: &str = r#" - fn test() { - let x = 10; - return x + 2; - } - "#; - - #[test] - fn test_seek_from_current() -> Result<()> { - let tokenizer = Tokenizer::from(TEST_STRING.to_owned()); - let mut buffer = TokenizerBuffer::new(tokenizer); - - let token = buffer.next_token()?.unwrap(); - assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn)); - - buffer.seek(SeekFrom::Current(1))?; - - let token = buffer.next_token()?.unwrap(); - - assert_eq!(token.token_type, TokenType::Symbol(Symbol::LParen)); - - Ok(()) - } - - #[test] - fn test_tokenizer_from_path_ok() { - let tokenizer = Tokenizer::from_path(TEST_FILE); - assert!(tokenizer.is_ok()); - } - - #[test] - fn test_tokenizer_from_path_err() { - let tokenizer = Tokenizer::from_path("non_existent_file.stlg"); - assert!(tokenizer.is_err()); - } - - #[test] - fn test_next_char() -> Result<()> { - let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); - - let char = tokenizer.next_char()?; - - assert_eq!(char, Some('\n')); - assert_eq!(tokenizer.line, 2); - assert_eq!(tokenizer.column, 1); - - let mut tokenizer = Tokenizer::from(String::from("fn")); - - let char = tokenizer.next_char()?; - - assert_eq!(char, Some('f')); - assert_eq!(tokenizer.line, 1); - assert_eq!(tokenizer.column, 1); - - Ok(()) - } - - #[test] - fn test_peek_next_char() -> Result<()> { - let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); - - let char = tokenizer.peek_next_char()?; - - assert_eq!(char, Some('\n')); - assert_eq!(tokenizer.line, 1); - assert_eq!(tokenizer.column, 0); - - let char = tokenizer.next_char()?; - assert_eq!(char, Some('\n')); - assert_eq!(tokenizer.line, 2); - assert_eq!(tokenizer.column, 1); - - let char = tokenizer.peek_next_char()?; - assert_eq!(char, Some(' ')); - assert_eq!(tokenizer.line, 2); - assert_eq!(tokenizer.column, 1); - - Ok(()) - } - - #[test] - fn test_temperature_unit() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("10c 14f 10k")); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!( - token.token_type, - TokenType::Number(Number::Decimal(Decimal::new(28315, 2))) - ); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!( - token.token_type, - TokenType::Number(Number::Decimal(Decimal::new(26315, 2))) - ); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!(token.token_type, TokenType::Number(Number::Integer(10))); - - Ok(()) - } - - #[test] - fn test_parse_integer() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("10")); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!(token.token_type, TokenType::Number(Number::Integer(10))); - - Ok(()) - } - - #[test] - fn test_parse_integer_with_underscore() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("1_000")); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!(token.token_type, TokenType::Number(Number::Integer(1000))); - - Ok(()) - } - - #[test] - fn test_parse_decimal() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("10.5")); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!( - token.token_type, - TokenType::Number(Number::Decimal(Decimal::new(105, 1))) // 10.5 - ); - - Ok(()) - } - - #[test] - fn test_parse_decimal_with_underscore() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("1_000.000_6")); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!( - token.token_type, - TokenType::Number(Number::Decimal(Decimal::new(10000006, 4))) // 1000.0006 - ); - - Ok(()) - } - - #[test] - fn test_parse_number_with_symbol() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("10;")); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!(token.token_type, TokenType::Number(Number::Integer(10))); - - let next_char = tokenizer.next_char()?; - - assert_eq!(next_char, Some(';')); - - Ok(()) - } - - #[test] - fn test_string_parse() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from(r#""Hello, World!""#)); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!( - token.token_type, - TokenType::String(String::from("Hello, World!")) - ); - - let mut tokenizer = Tokenizer::from(String::from(r#"'Hello, World!'"#)); - - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!( - token.token_type, - TokenType::String(String::from("Hello, World!")) - ); - - Ok(()) - } - - #[test] - fn test_symbol_parse() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from( - "^ ! () [] {} , . ; : + - * / < > = != && || >= <=**%", - )); - - let expected_tokens = vec![ - TokenType::Symbol(Symbol::Caret), - TokenType::Symbol(Symbol::LogicalNot), - TokenType::Symbol(Symbol::LParen), - TokenType::Symbol(Symbol::RParen), - TokenType::Symbol(Symbol::LBracket), - TokenType::Symbol(Symbol::RBracket), - TokenType::Symbol(Symbol::LBrace), - TokenType::Symbol(Symbol::RBrace), - TokenType::Symbol(Symbol::Comma), - TokenType::Symbol(Symbol::Dot), - TokenType::Symbol(Symbol::Semicolon), - TokenType::Symbol(Symbol::Colon), - TokenType::Symbol(Symbol::Plus), - TokenType::Symbol(Symbol::Minus), - TokenType::Symbol(Symbol::Asterisk), - TokenType::Symbol(Symbol::Slash), - TokenType::Symbol(Symbol::LessThan), - TokenType::Symbol(Symbol::GreaterThan), - TokenType::Symbol(Symbol::Assign), - TokenType::Symbol(Symbol::NotEqual), - TokenType::Symbol(Symbol::LogicalAnd), - TokenType::Symbol(Symbol::LogicalOr), - TokenType::Symbol(Symbol::GreaterThanOrEqual), - TokenType::Symbol(Symbol::LessThanOrEqual), - TokenType::Symbol(Symbol::Exp), - TokenType::Symbol(Symbol::Percent), - ]; - - for expected_token in expected_tokens { - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!(token.token_type, expected_token); - } - - Ok(()) - } - - #[test] - fn test_keyword_parse() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from( - "let fn if else return enum continue break const", - )); - - let expected_tokens = vec![ - TokenType::Keyword(Keyword::Let), - TokenType::Keyword(Keyword::Fn), - TokenType::Keyword(Keyword::If), - TokenType::Keyword(Keyword::Else), - TokenType::Keyword(Keyword::Return), - TokenType::Keyword(Keyword::Enum), - TokenType::Keyword(Keyword::Continue), - TokenType::Keyword(Keyword::Break), - TokenType::Keyword(Keyword::Const), - ]; - - for expected_token in expected_tokens { - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!(token.token_type, expected_token); - } - - Ok(()) - } - - #[test] - fn test_identifier_parse() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("fn test fn test_underscores")); - - let token = tokenizer.next_token()?.unwrap(); - assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn)); - let token = tokenizer.next_token()?.unwrap(); - assert_eq!( - token.token_type, - TokenType::Identifier(String::from("test")) - ); - let token = tokenizer.next_token()?.unwrap(); - assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn)); - let token = tokenizer.next_token()?.unwrap(); - assert_eq!( - token.token_type, - TokenType::Identifier(String::from("test_underscores")) - ); - - Ok(()) - } - - #[test] - fn test_boolean_parse() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("true false")); - - let token = tokenizer.next_token()?.unwrap(); - assert_eq!(token.token_type, TokenType::Boolean(true)); - let token = tokenizer.next_token()?.unwrap(); - assert_eq!(token.token_type, TokenType::Boolean(false)); - - Ok(()) - } - - #[test] - fn test_full_source() -> Result<()> { - let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); - - let expected_tokens = vec![ - TokenType::Keyword(Keyword::Fn), - TokenType::Identifier(String::from("test")), - TokenType::Symbol(Symbol::LParen), - TokenType::Symbol(Symbol::RParen), - TokenType::Symbol(Symbol::LBrace), - TokenType::Keyword(Keyword::Let), - TokenType::Identifier(String::from("x")), - TokenType::Symbol(Symbol::Assign), - TokenType::Number(Number::Integer(10)), - TokenType::Symbol(Symbol::Semicolon), - TokenType::Keyword(Keyword::Return), - TokenType::Identifier(String::from("x")), - TokenType::Symbol(Symbol::Plus), - TokenType::Number(Number::Integer(2)), - TokenType::Symbol(Symbol::Semicolon), - TokenType::Symbol(Symbol::RBrace), - ]; - - for expected_token in expected_tokens { - let token = tokenizer.next_token()?.unwrap(); - - assert_eq!(token.token_type, expected_token); - } - - Ok(()) - } - - #[test] - fn test_peek_next() -> Result<()> { - let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); - - let column = tokenizer.column; - let line = tokenizer.line; - - let peeked_token = tokenizer.peek_next()?; - - assert_eq!( - peeked_token.unwrap().token_type, - TokenType::Keyword(Keyword::Fn) - ); - assert_eq!(tokenizer.column, column); - assert_eq!(tokenizer.line, line); - - let next_token = tokenizer.next_token()?; - - assert_eq!( - next_token.unwrap().token_type, - TokenType::Keyword(Keyword::Fn) - ); - assert_ne!(tokenizer.column, column); - assert_ne!(tokenizer.line, line); - - Ok(()) - } - - #[test] - fn test_compact_syntax() -> Result<()> { - let mut tokenizer = Tokenizer::from(String::from("if(true) while(false)")); - - // if(true) - assert_eq!( - tokenizer.next_token()?.unwrap().token_type, - TokenType::Keyword(Keyword::If) - ); - assert_eq!( - tokenizer.next_token()?.unwrap().token_type, - TokenType::Symbol(Symbol::LParen) - ); - assert_eq!( - tokenizer.next_token()?.unwrap().token_type, - TokenType::Boolean(true) - ); - assert_eq!( - tokenizer.next_token()?.unwrap().token_type, - TokenType::Symbol(Symbol::RParen) - ); - - // while(false) - assert_eq!( - tokenizer.next_token()?.unwrap().token_type, - TokenType::Keyword(Keyword::While) - ); - assert_eq!( - tokenizer.next_token()?.unwrap().token_type, - TokenType::Symbol(Symbol::LParen) - ); - - Ok(()) - } - - #[test] - fn test_identifier_has_correct_length() -> Result<()> { - let mut tokenizer = Tokenizer::from("hello"); - assert_eq!( - tokenizer.next_token()?, - Some(Token { - token_type: TokenType::Identifier("hello".into()), - original_string: Some("hello".into()), - column: 1, - line: 1 - }) - ); - Ok(()) - } - - #[test] - fn test_keyword_token_has_correct_length() -> Result<()> { - let mut tokenizer = Tokenizer::from("while"); - - assert_eq!( - tokenizer.next_token()?, - Some(Token { - token_type: TokenType::Keyword(Keyword::While), - original_string: Some("while".into()), - column: 1, - line: 1 - }) - ); - - Ok(()) - } -} diff --git a/rust_compiler/libs/tokenizer/src/token.rs b/rust_compiler/libs/tokenizer/src/token.rs index 2233da3..53181d4 100644 --- a/rust_compiler/libs/tokenizer/src/token.rs +++ b/rust_compiler/libs/tokenizer/src/token.rs @@ -1,6 +1,57 @@ use helpers::prelude::*; -use logos::{Lexer, Logos}; +use logos::{Lexer, Logos, Skip, Span}; +use lsp_types::{Diagnostic, DiagnosticSeverity, Position, Range}; use rust_decimal::Decimal; +use thiserror::Error; + +#[derive(Debug, Error, Default, Clone, PartialEq)] +pub enum LexError { + #[error("Attempted to parse an invalid number: {2}")] + NumberParseError(usize, Span, String), + + #[error("An invalid character was found in token stream: {2}")] + InvalidInput(usize, Span, String), + + #[default] + #[error("An unknown error occurred")] + Other, +} + +impl From for Diagnostic { + fn from(value: LexError) -> Self { + match value { + LexError::NumberParseError(line, col, str) | LexError::InvalidInput(line, col, str) => { + Diagnostic { + range: Range { + start: Position { + character: col.start as u32, + line: line as u32, + }, + end: Position { + line: line as u32, + character: col.end as u32, + }, + }, + severity: Some(DiagnosticSeverity::ERROR), + message: str, + ..Default::default() + } + } + _ => todo!(), + } + } +} + +impl LexError { + pub fn from_lexer<'a>(lex: &mut Lexer<'a, TokenType<'a>>) -> Self { + let mut span = lex.span(); + let line = lex.extras.line_count; + span.start -= lex.extras.line_start_index; + span.end -= lex.extras.line_start_index; + + Self::InvalidInput(line, span, lex.slice().chars().as_str().to_string()) + } +} // Define a local macro to consume the list macro_rules! generate_check { @@ -11,29 +62,34 @@ macro_rules! generate_check { } } -#[derive(Debug, PartialEq, Eq, Clone)] -pub struct Token { - /// The type of the token - pub token_type: TokenType, - /// The line where the token was found - pub line: usize, - /// The column where the token was found - pub column: usize, - pub original_string: Option, +#[derive(Default)] +pub struct Extras { + pub line_count: usize, + pub line_start_index: usize, } -impl Token { - pub fn new( - token_type: TokenType, - line: usize, - column: usize, - original: Option, - ) -> Self { +fn update_line_index<'a>(lex: &mut Lexer<'a, TokenType<'a>>) -> Skip { + lex.extras.line_count += 1; + lex.extras.line_start_index = lex.span().end; + Skip +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Token<'a> { + /// The type of the token + pub token_type: TokenType<'a>, + /// The line where the token was found + pub line: usize, + /// The span where the token starts and ends + pub span: Span, +} + +impl<'a> Token<'a> { + pub fn new(token_type: TokenType<'a>, line: usize, span: Span) -> Self { Self { token_type, line, - column, - original_string: original, + span, } } } @@ -93,13 +149,19 @@ macro_rules! keyword { } #[derive(Debug, PartialEq, Hash, Eq, Clone, Logos)] -pub enum TokenType { +#[logos(skip r"[ \t\f]+")] +#[logos(extras = Extras)] +#[logos(error(LexError, LexError::from_lexer))] +pub enum TokenType<'a> { + #[regex(r"\n", update_line_index)] + Newline, + // matches strings with double quotes - #[regex(r#""(?:[^"\\]|\\.)*""#, |v| v.slice().to_string())] + #[regex(r#""(?:[^"\\]|\\.)*""#)] // matches strings with single quotes - #[regex(r#"'(?:[^'\\]|\\.)*'"#, |v| v.slice().to_string())] + #[regex(r#"'(?:[^'\\]|\\.)*'"#)] /// Represents a string token - String(String), + String(&'a str), #[regex(r"[0-9][0-9_]*(\.[0-9][0-9_]*)?([cfk])?", parse_number)] /// Represents a number token @@ -125,9 +187,9 @@ pub enum TokenType { /// Represents a keyword token Keyword(Keyword), - #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |v| v.slice().to_string())] + #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")] /// Represents an identifier token - Identifier(String), + Identifier(&'a str), #[token("(", symbol!(LParen))] #[token(")", symbol!(RParen))] @@ -159,10 +221,10 @@ pub enum TokenType { /// Represents a symbol token Symbol(Symbol), - #[regex(r"///[\n]*", |val| Comment::Doc(val.slice()[3..].trim().to_string()))] - #[regex(r"//[\n]*", |val| Comment::Line(val.slice()[2..].trim().to_string()))] + #[regex(r"///[\n]*", |val| Comment::Doc(val.slice()[3..].trim()))] + #[regex(r"//[\n]*", |val| Comment::Line(val.slice()[2..].trim()))] /// Represents a comment, both a line comment and a doc comment - Comment(Comment), + Comment(Comment<'a>), #[end] /// Represents an end of file token @@ -170,14 +232,14 @@ pub enum TokenType { } #[derive(Hash, Debug, Eq, PartialEq, Clone)] -pub enum Comment { - Line(String), - Doc(String), +pub enum Comment<'a> { + Line(&'a str), + Doc(&'a str), } -fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType>) -> Option { +fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType<'a>>) -> Result { let slice = lexer.slice(); - let last_char = slice.chars().last()?; + let last_char = slice.chars().last().unwrap_or_default(); let (num_str, suffix) = match last_char { 'c' | 'k' | 'f' => (&slice[..slice.len() - 1], Some(last_char)), _ => (slice, None), @@ -189,28 +251,39 @@ fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType>) -> Option { num_str.to_string() }; + let line = lexer.extras.line_count; + let mut span = lexer.span(); + span.end -= lexer.extras.line_start_index; + span.start -= lexer.extras.line_start_index; + let num = if clean_str.contains('.') { - Number::Decimal(clean_str.parse::().ok()?) + Number::Decimal( + clean_str + .parse::() + .map_err(|_| LexError::NumberParseError(line, span, slice.to_string()))?, + ) } else { - Number::Integer(clean_str.parse::().ok()?) + Number::Integer( + clean_str + .parse::() + .map_err(|_| LexError::NumberParseError(line, span, slice.to_string()))?, + ) }; if let Some(suffix) = suffix { - Some( - match suffix { - 'c' => Temperature::Celsius(num), - 'f' => Temperature::Fahrenheit(num), - 'k' => Temperature::Kelvin(num), - _ => unreachable!(), - } - .to_kelvin(), - ) + Ok(match suffix { + 'c' => Temperature::Celsius(num), + 'f' => Temperature::Fahrenheit(num), + 'k' => Temperature::Kelvin(num), + _ => unreachable!(), + } + .to_kelvin()) } else { - Some(num) + Ok(num) } } -impl std::fmt::Display for Comment { +impl<'a> std::fmt::Display for Comment<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Line(c) => write!(f, "// {}", c), @@ -227,7 +300,7 @@ impl std::fmt::Display for Comment { } } -impl Documentation for TokenType { +impl<'a> Documentation for TokenType<'a> { fn docs(&self) -> String { match self { Self::Keyword(k) => k.docs(), @@ -242,7 +315,7 @@ impl Documentation for TokenType { helpers::with_syscalls!(generate_check); -impl From for u32 { +impl<'a> From> for u32 { fn from(value: TokenType) -> Self { match value { TokenType::String(_) => 1, @@ -277,12 +350,12 @@ impl From for u32 { 7 } } - TokenType::EOF => 0, + _ => 0, } } } -impl std::fmt::Display for TokenType { +impl<'a> std::fmt::Display for TokenType<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { TokenType::String(s) => write!(f, "{}", s), @@ -293,6 +366,7 @@ impl std::fmt::Display for TokenType { TokenType::Symbol(s) => write!(f, "{}", s), TokenType::Comment(c) => write!(f, "{}", c), TokenType::EOF => write!(f, "EOF"), + _ => write!(f, ""), } } }