From 72cf9ea042b5835d1feed286ecdcc3605b75ead5 Mon Sep 17 00:00:00 2001
From: Devin Bidwell
Date: Tue, 9 Dec 2025 01:43:12 -0700
Subject: [PATCH] wip

---
 rust_compiler/libs/compiler/src/test/mod.rs |  4 +-
 rust_compiler/libs/parser/src/lib.rs        | 27 +++++------
 rust_compiler/libs/tokenizer/src/lib.rs     | 16 +++---
 rust_compiler/libs/tokenizer/src/token.rs   | 54 ++++++++++++---------
 rust_compiler/src/ffi/mod.rs                | 40 ++++++++-------
 rust_compiler/src/main.rs                   | 13 +++--
 6 files changed, 85 insertions(+), 69 deletions(-)

diff --git a/rust_compiler/libs/compiler/src/test/mod.rs b/rust_compiler/libs/compiler/src/test/mod.rs
index b3e51c2..77d771a 100644
--- a/rust_compiler/libs/compiler/src/test/mod.rs
+++ b/rust_compiler/libs/compiler/src/test/mod.rs
@@ -22,7 +22,7 @@ macro_rules! compile {
     (result $source:expr) => {{
         let mut writer = std::io::BufWriter::new(Vec::new());
         let compiler = crate::Compiler::new(
-            parser::Parser::new(tokenizer::Tokenizer::from(String::from($source))),
+            parser::Parser::new(tokenizer::Tokenizer::from($source)),
             &mut writer,
             Some(crate::CompilerConfig { debug: true }),
         );
@@ -32,7 +32,7 @@ macro_rules! compile {
     (debug $source:expr) => {{
         let mut writer = std::io::BufWriter::new(Vec::new());
         let compiler = crate::Compiler::new(
-            parser::Parser::new(tokenizer::Tokenizer::from(String::from($source))),
+            parser::Parser::new(tokenizer::Tokenizer::from($source)),
             &mut writer,
             Some(crate::CompilerConfig { debug: true }),
         );
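
The test macro now hands `$source` (a string literal, i.e. a `&'static str`) straight to `Tokenizer::from`, relying on the tokenizer's existing `From<&'a str>` impl instead of allocating an intermediate `String`. A minimal sketch of that call pattern, with a simplified stand-in for the real `Tokenizer` (the `source` and `pos` fields here are hypothetical; the real type wraps a `logos::Lexer`):

// Sketch only: a tokenizer that borrows its source, built directly from &str.
struct Tokenizer<'a> {
    source: &'a str, // hypothetical field standing in for the logos lexer
    pos: usize,
}

impl<'a> From<&'a str> for Tokenizer<'a> {
    fn from(source: &'a str) -> Self {
        Tokenizer { source, pos: 0 }
    }
}

fn main() {
    // String literals are &'static str, so tests can pass them through
    // without the String::from(...) allocation the old macro performed.
    let tok = Tokenizer::from("let x = 1;");
    assert_eq!(tok.source.len(), 10);
    assert_eq!(tok.pos, 0);
}
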
diff --git a/rust_compiler/libs/parser/src/lib.rs b/rust_compiler/libs/parser/src/lib.rs
index 2e5df7b..e866c37 100644
--- a/rust_compiler/libs/parser/src/lib.rs
+++ b/rust_compiler/libs/parser/src/lib.rs
@@ -5,7 +5,6 @@ pub mod sys_call;
 pub mod tree_node;
 
 use crate::sys_call::{Math, System};
-use quick_error::quick_error;
 use std::io::SeekFrom;
 use sys_call::SysCall;
 use thiserror::Error;
@@ -28,27 +27,27 @@ macro_rules! boxed {
 }
 
 #[derive(Error, Debug)]
-pub enum Error<'a> {
+pub enum Error {
     #[error("Tokenizer Error: {0}")]
     TokenizerError(#[from] tokenizer::Error),
 
     #[error("Unexpected token: {1}")]
-    UnexpectedToken(Span, Token<'a>),
+    UnexpectedToken(Span, Token),
 
     #[error("Duplicate identifier: {1}")]
-    DuplicateIdentifier(Span, Token<'a>),
+    DuplicateIdentifier(Span, Token),
 
     #[error("Invalid Syntax: {1}")]
-    InvalidSyntax(Span, Token<'a>),
+    InvalidSyntax(Span, String),
 
     #[error("Unsupported Keyword: {1}")]
-    UnsupportedKeyword(Span, Token<'a>),
+    UnsupportedKeyword(Span, Token),
 
     #[error("Unexpected End of File")]
     UnexpectedEOF,
 }
 
-impl<'a> From<Error<'a>> for lsp_types::Diagnostic {
+impl From<Error> for lsp_types::Diagnostic {
     fn from(value: Error) -> Self {
         use Error::*;
         use lsp_types::*;
@@ -107,8 +106,8 @@ macro_rules! self_matches_current {
 
 pub struct Parser<'a> {
     tokenizer: TokenizerBuffer<'a>,
-    current_token: Option<Token<'a>>,
-    pub errors: Vec<Error<'a>>,
+    current_token: Option<Token>,
+    pub errors: Vec<Error>,
 }
 
 impl<'a> Parser<'a> {
@@ -160,9 +159,10 @@ impl<'a> Parser<'a> {
 
         let node = parser(self)?;
 
-        let end_token = self.current_token;
+        let end_token = &self.current_token;
 
         let (end_line, end_col) = end_token
+            .clone()
             .map(|t| (t.line, t.span.end))
             .unwrap_or((start_line, start_col));
@@ -207,7 +207,7 @@ impl<'a> Parser<'a> {
         let first_token = self.tokenizer.peek().unwrap_or(None);
         let (start_line, start_col) = first_token
             .as_ref()
-            .map(|tok| (tok.line, tok.column))
+            .map(|tok| (tok.line, tok.span.start))
             .unwrap_or((1, 1));
 
         let mut expressions = Vec::>::new();
@@ -238,10 +238,7 @@ impl<'a> Parser<'a> {
 
         let end_token_opt = self.tokenizer.peek().unwrap_or(None);
         let (end_line, end_col) = end_token_opt
-            .map(|tok| {
-                let len = tok.original_string.as_ref().map(|s| s.len()).unwrap_or(0);
-                (tok.line, tok.column + len)
-            })
+            .map(|tok| (tok.line, tok.span.end))
             .unwrap_or((start_line, start_col));
 
         let span = Span {
diff --git a/rust_compiler/libs/tokenizer/src/lib.rs b/rust_compiler/libs/tokenizer/src/lib.rs
index adbe420..1e2b342 100644
--- a/rust_compiler/libs/tokenizer/src/lib.rs
+++ b/rust_compiler/libs/tokenizer/src/lib.rs
@@ -44,7 +44,7 @@ pub trait Tokenize: Read + Seek {}
 impl<T> Tokenize for T where T: Read + Seek {}
 
 pub struct Tokenizer<'a> {
-    lexer: Lexer<'a, TokenType<'a>>,
+    lexer: Lexer<'a, TokenType>,
     returned_eof: bool,
 }
 
@@ -58,14 +58,14 @@ impl<'a> From<&'a str> for Tokenizer<'a> {
 }
 
 impl<'a> Tokenizer<'a> {
-    fn get_token(&mut self, t_type: TokenType<'a>) -> Token<'a> {
+    fn get_token(&mut self, t_type: TokenType) -> Token {
         let mut span = self.lexer.span();
         span.start -= self.lexer.extras.line_start_index;
         span.end -= self.lexer.extras.line_start_index;
         Token::new(t_type, self.lexer.extras.line_count, span)
     }
 
-    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error> {
+    pub fn next_token(&mut self) -> Result<Option<Token>, Error> {
         let to_return = self
             .lexer
             .next()
@@ -79,7 +79,7 @@

// ... Iterator and TokenizerBuffer implementations remain unchanged ...
// They just call the methods above which now use the passed-in start coordinates.
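
The `get_token` change keeps spans line-relative: logos reports byte offsets from the start of the whole input, and the tokenizer rebases them against `line_start_index`, the value `update_line_index` records at each newline. A small sketch of that arithmetic; the `rebase_span` helper is illustrative, not part of the crate:

use std::ops::Range;

// Rebases an absolute byte span onto the current line, mirroring the
// two subtractions in get_token above.
fn rebase_span(absolute: Range<usize>, line_start_index: usize) -> Range<usize> {
    (absolute.start - line_start_index)..(absolute.end - line_start_index)
}

fn main() {
    // Input: "let x = 1;\nlet y = 2;". Suppose the lexer matched `y`,
    // which sits at absolute bytes 15..16, and the second line starts at
    // byte 11 (update_line_index stored lex.span().end there).
    let span = rebase_span(15..16, 11);
    assert_eq!(span, 4..5); // column 4 on line 2
}
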
 impl<'a> Iterator for Tokenizer<'a> {
-    type Item = Result<Token<'a>, Error>;
+    type Item = Result<Token, Error>;
 
     fn next(&mut self) -> Option<Self::Item> {
         match self.lexer.next() {
             None => {
@@ -104,8 +104,8 @@ impl<'a> Iterator for Tokenizer<'a> {
 
 pub struct TokenizerBuffer<'a> {
     tokenizer: Tokenizer<'a>,
-    buffer: VecDeque<Token<'a>>,
-    history: VecDeque<Token<'a>>,
+    buffer: VecDeque<Token>,
+    history: VecDeque<Token>,
     index: i64,
 }
 
@@ -118,7 +118,7 @@ impl<'a> TokenizerBuffer<'a> {
             index: 0,
         }
     }
 
-    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error> {
+    pub fn next_token(&mut self) -> Result<Option<Token>, Error> {
         if let Some(token) = self.buffer.pop_front() {
             self.history.push_back(token.clone());
             self.index += 1;
@@ -133,7 +133,7 @@
         self.index += 1;
         Ok(token)
     }
 
-    pub fn peek(&mut self) -> Result<Option<Token<'a>>, Error> {
+    pub fn peek(&mut self) -> Result<Option<Token>, Error> {
         if let Some(token) = self.buffer.front() {
             return Ok(Some(token.clone()));
         }
diff --git a/rust_compiler/libs/tokenizer/src/token.rs b/rust_compiler/libs/tokenizer/src/token.rs
index 7e5f4e8..e8ffd04 100644
--- a/rust_compiler/libs/tokenizer/src/token.rs
+++ b/rust_compiler/libs/tokenizer/src/token.rs
@@ -43,7 +43,7 @@ impl From<LexError> for Diagnostic {
 }
 
 impl LexError {
-    pub fn from_lexer<'a>(lex: &mut Lexer<'a, TokenType<'a>>) -> Self {
+    pub fn from_lexer<'a>(lex: &mut Lexer<'a, TokenType>) -> Self {
         let mut span = lex.span();
         let line = lex.extras.line_count;
         span.start -= lex.extras.line_start_index;
@@ -68,30 +68,30 @@ pub struct Extras {
     pub line_start_index: usize,
 }
 
-fn update_line_index<'a>(lex: &mut Lexer<'a, TokenType<'a>>) -> Skip {
+fn update_line_index<'a>(lex: &mut Lexer<'a, TokenType>) -> Skip {
     lex.extras.line_count += 1;
     lex.extras.line_start_index = lex.span().end;
     Skip
 }
 
 #[derive(Debug, PartialEq, Eq, Clone)]
-pub struct Token<'a> {
+pub struct Token {
     /// The type of the token
-    pub token_type: TokenType<'a>,
+    pub token_type: TokenType,
     /// The line where the token was found
     pub line: usize,
     /// The span where the token starts and ends
     pub span: Span,
 }
 
-impl<'a> std::fmt::Display for Token<'a> {
+impl std::fmt::Display for Token {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.token_type)
     }
 }
 
-impl<'a> Token<'a> {
-    pub fn new(token_type: TokenType<'a>, line: usize, span: Span) -> Self {
+impl Token {
+    pub fn new(token_type: TokenType, line: usize, span: Span) -> Self {
         Self {
             token_type,
             line,
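
Dropping the `<'a>` parameter means a `Token` now owns its text, so `TokenizerBuffer` can clone tokens into `history` and callers can keep them after the source buffer is gone. A minimal sketch of what that buys, using simplified stand-ins for the patched types (only the shape, owned `String` payloads, matches the real ones):

// Sketch only: an owned token can outlive the source it was lexed from.
#[derive(Debug, Clone)]
enum TokenType {
    Identifier(String),
}

#[derive(Debug, Clone)]
struct Token {
    token_type: TokenType,
    line: usize,
}

fn lex_first_word(source: String) -> Token {
    let word = source.split_whitespace().next().unwrap_or("").to_string();
    Token { token_type: TokenType::Identifier(word), line: 1 }
    // `source` is dropped here; with &'a str payloads this function
    // could not return the token at all.
}

fn main() {
    let tok = lex_first_word(String::from("foo bar"));
    println!("{:?}", tok); // Token { token_type: Identifier("foo"), line: 1 }
}
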
@@ -158,16 +158,22 @@ macro_rules! keyword {
 #[logos(skip r"[ \t\f]+")]
 #[logos(extras = Extras)]
 #[logos(error(LexError, LexError::from_lexer))]
-pub enum TokenType<'a> {
+pub enum TokenType {
     #[regex(r"\n", update_line_index)]
     Newline,
 
     // matches strings with double quotes
-    #[regex(r#""(?:[^"\\]|\\.)*""#)]
+    #[regex(r#""(?:[^"\\]|\\.)*""#, |v| {
+        let str = v.slice();
+        str[1..str.len() - 1].to_string()
+    })]
     // matches strings with single quotes
-    #[regex(r#"'(?:[^'\\]|\\.)*'"#)]
+    #[regex(r#"'(?:[^'\\]|\\.)*'"#, |v| {
+        let str = v.slice();
+        str[1..str.len() - 1].to_string()
+    })]
     /// Represents a string token
-    String(&'a str),
+    String(String),
 
     #[regex(r"[0-9][0-9_]*(\.[0-9][0-9_]*)?([cfk])?", parse_number)]
     /// Represents a number token
@@ -193,9 +199,9 @@
     /// Represents a keyword token
     Keyword(Keyword),
 
-    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
+    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |v| v.slice().to_string())]
     /// Represents an identifier token
-    Identifier(&'a str),
+    Identifier(String),
 
     #[token("(", symbol!(LParen))]
     #[token(")", symbol!(RParen))]
@@ -227,10 +233,10 @@
     /// Represents a symbol token
     Symbol(Symbol),
 
-    #[regex(r"///[\n]*", |val| Comment::Doc(val.slice()[3..].trim()))]
-    #[regex(r"//[\n]*", |val| Comment::Line(val.slice()[2..].trim()))]
+    #[regex(r"///[^\n]*", |val| Comment::Doc(val.slice()[3..].trim().to_string()))]
+    #[regex(r"//[^\n]*", |val| Comment::Line(val.slice()[2..].trim().to_string()))]
     /// Represents a comment, both a line comment and a doc comment
-    Comment(Comment<'a>),
+    Comment(Comment),
 
     #[end]
     /// Represents an end of file token
@@ -238,12 +244,12 @@
     EOF,
 }
 
 #[derive(Hash, Debug, Eq, PartialEq, Clone)]
-pub enum Comment<'a> {
-    Line(&'a str),
-    Doc(&'a str),
+pub enum Comment {
+    Line(String),
+    Doc(String),
 }
 
-fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType<'a>>) -> Result {
+fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType>) -> Result {
     let slice = lexer.slice();
     let last_char = slice.chars().last().unwrap_or_default();
     let (num_str, suffix) = match last_char {
@@ -289,7 +295,7 @@ fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType<'a>>) -> Result
 }
 
-impl<'a> std::fmt::Display for Comment<'a> {
+impl std::fmt::Display for Comment {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             Self::Line(c) => write!(f, "// {}", c),
@@ -306,7 +312,7 @@
     }
 }
 
-impl<'a> Documentation for TokenType<'a> {
+impl Documentation for TokenType {
     fn docs(&self) -> String {
         match self {
             Self::Keyword(k) => k.docs(),
@@ -321,7 +327,7 @@
 
 helpers::with_syscalls!(generate_check);
 
-impl<'a> From<TokenType<'a>> for u32 {
+impl From<TokenType> for u32 {
     fn from(value: TokenType) -> Self {
         match value {
             TokenType::String(_) => 1,
@@ -361,7 +367,7 @@
     }
 }
 
-impl<'a> std::fmt::Display for TokenType<'a> {
+impl std::fmt::Display for TokenType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             TokenType::String(s) => write!(f, "{}", s),
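
The `TokenType` changes all follow one pattern: every `&'a str` payload becomes a `String`, produced by a logos callback that copies `lex.slice()` into an owned value. A self-contained sketch of that callback style (requires the `logos` crate; the `Tok` enum and its regexes are simplified stand-ins, not the crate's real token set):

// Sketch of the owned-String callback pattern adopted above.
use logos::Logos;

#[derive(Logos, Debug, PartialEq)]
#[logos(skip r"[ \t\n]+")]
enum Tok {
    // Strip the surrounding quotes, then take ownership of the text;
    // this is what lets the enum drop its <'a> parameter.
    #[regex(r#""(?:[^"\\]|\\.)*""#, |lex| {
        let s = lex.slice();
        s[1..s.len() - 1].to_string()
    })]
    Str(String),

    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Ident(String),
}

fn main() {
    let mut lex = Tok::lexer(r#"foo "bar""#);
    assert_eq!(lex.next(), Some(Ok(Tok::Ident("foo".into()))));
    assert_eq!(lex.next(), Some(Ok(Tok::Str("bar".into()))));
}

Note that the `+` lines above also change the comment regexes from `[\n]*` to `[^\n]*`; with the original character class a `//` could only be followed by newlines, so the `val.slice()[2..].trim()` callbacks would never see any comment text.
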
diff --git a/rust_compiler/src/ffi/mod.rs b/rust_compiler/src/ffi/mod.rs
index ee31887..c09a539 100644
--- a/rust_compiler/src/ffi/mod.rs
+++ b/rust_compiler/src/ffi/mod.rs
@@ -4,7 +4,7 @@
 use parser::{sys_call::SysCall, Parser};
 use safer_ffi::prelude::*;
 use std::io::BufWriter;
 use tokenizer::{
-    token::{Token, TokenType},
+    token::{LexError, Token, TokenType},
     Tokenizer,
 };
@@ -96,9 +96,10 @@ pub fn free_docs_vec(v: safer_ffi::Vec) {
 
 #[ffi_export]
 pub fn compile_from_string(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::String {
     let res = std::panic::catch_unwind(|| {
+        let input = String::from_utf16_lossy(input.as_slice());
+
         let mut writer = BufWriter::new(Vec::new());
-        let tokenizer = Tokenizer::from(String::from_utf16_lossy(input.as_slice()));
+        let tokenizer = Tokenizer::from(input.as_str());
         let parser = Parser::new(tokenizer);
         let compiler = Compiler::new(parser, &mut writer, None);
@@ -120,7 +121,8 @@
 
 #[ffi_export]
 pub fn tokenize_line(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::Vec<FfiToken> {
     let res = std::panic::catch_unwind(|| {
-        let tokenizer = Tokenizer::from(String::from_utf16_lossy(input.as_slice()));
+        let input = String::from_utf16_lossy(input.as_slice());
+        let tokenizer = Tokenizer::from(input.as_str());
 
         let mut tokens = Vec::new();
@@ -136,34 +138,36 @@
         loop {
             match tokenizer.next_token() {
                 Err(e) => {
+                    use tokenizer::token::LexError;
                     use tokenizer::Error::*;
-                    let (err_str, col, og) = match e {
-                        NumberParseError(_, _, col, og)
-                        | DecimalParseError(_, _, col, og)
-                        | UnknownSymbolError(_, _, col, og)
-                        | UnknownKeywordOrIdentifierError(_, _, col, og) => {
-                            (e.to_string(), col, og)
-                        }
+                    let (err_str, line, span) = match e {
+                        tokenizer::Error::LexError(e) => match e {
+                            LexError::NumberParseError(line, span, err)
+                            | LexError::InvalidInput(line, span, err) => {
+                                (err.to_string(), line, span)
+                            }
+                            _ => continue,
+                        },
                         _ => continue,
                     };
 
                     tokens.push(FfiToken {
-                        column: *col as i32,
+                        column: span.start as i32,
                         error: err_str.into(),
                         tooltip: "".into(),
-                        length: og.len() as i32,
+                        length: (span.end - span.start) as i32,
                         token_kind: 0,
                     })
                 }
                 Ok(Token {
-                    column,
-                    original_string,
+                    line,
+                    span,
                     token_type,
                     ..
                 }) => tokens.push(FfiToken {
-                    column: column as i32,
+                    column: span.start as i32,
                     error: "".into(),
-                    length: (original_string.unwrap_or_default().len()) as i32,
+                    length: (span.end - span.start) as i32,
                     tooltip: token_type.docs().into(),
                     token_kind: token_type.into(),
                 }),
@@ -179,8 +183,10 @@
 
 #[ffi_export]
 pub fn lint_from_string(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::Vec {
     let res = std::panic::catch_unwind(|| {
+        let input = String::from_utf16_lossy(input.as_slice());
+
         let mut writer = BufWriter::new(Vec::new());
-        let tokenizer = Tokenizer::from(String::from_utf16_lossy(input.as_slice()));
+        let tokenizer = Tokenizer::from(input.as_str());
         let compiler = Compiler::new(Parser::new(tokenizer), &mut writer, None);
 
         let diagnosis = compiler.compile();
diff --git a/rust_compiler/src/main.rs b/rust_compiler/src/main.rs
index 730b2a2..2323398 100644
--- a/rust_compiler/src/main.rs
+++ b/rust_compiler/src/main.rs
@@ -50,8 +50,13 @@ fn run_logic() -> Result<(), StationlangError> {
     let args = Args::parse();
     let input_file = args.input_file;
 
-    let tokenizer: Tokenizer = match input_file {
-        Some(input_file) => Tokenizer::from_path(&input_file)?,
+    let input_string = match input_file {
+        Some(input_path) => {
+            let mut buf = String::new();
+            let mut file = std::fs::File::open(input_path).unwrap();
+            file.read_to_string(&mut buf).unwrap();
+            buf
+        }
         None => {
             let mut buf = String::new();
             let stdin = std::io::stdin();
@@ -62,10 +67,12 @@ fn run_logic() -> Result<(), StationlangError> {
                 return Ok(());
             }
 
-            Tokenizer::from(buf)
+            buf
         }
     };
 
+    let tokenizer = Tokenizer::from(input_string.as_str());
+
     let parser = ASTParser::new(tokenizer);
 
     let mut writer: BufWriter<Box<dyn Write>> = match args.output_file {
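
The FFI functions and `main.rs` now share one borrow discipline: decode (or read) the input into a named `String` first, then build the borrowing tokenizer from `input.as_str()`. A minimal sketch of why the named binding matters, with `Borrowing` standing in for `Tokenizer<'a>` (the struct and impl here are illustrative, not the crate's API):

// Sketch only: a type that borrows its source, like the patched Tokenizer.
struct Borrowing<'a> {
    source: &'a str,
}

impl<'a> From<&'a str> for Borrowing<'a> {
    fn from(source: &'a str) -> Self {
        Borrowing { source }
    }
}

fn main() {
    let utf16: Vec<u16> = "let x = 1;".encode_utf16().collect();

    // Decode into a named local first, as the patched FFI functions do.
    // `Borrowing::from(String::from_utf16_lossy(&utf16).as_str())` would be
    // rejected by the borrow checker: the temporary String is dropped at the
    // end of that statement, leaving `source` dangling.
    let input = String::from_utf16_lossy(&utf16);
    let t = Borrowing::from(input.as_str());

    assert_eq!(t.source, "let x = 1;");
}
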