wip
@@ -22,7 +22,7 @@ macro_rules! compile {
     (result $source:expr) => {{
         let mut writer = std::io::BufWriter::new(Vec::new());
         let compiler = crate::Compiler::new(
-            parser::Parser::new(tokenizer::Tokenizer::from(String::from($source))),
+            parser::Parser::new(tokenizer::Tokenizer::from($source)),
             &mut writer,
             Some(crate::CompilerConfig { debug: true }),
         );
@@ -32,7 +32,7 @@ macro_rules! compile {
     (debug $source:expr) => {{
         let mut writer = std::io::BufWriter::new(Vec::new());
         let compiler = crate::Compiler::new(
-            parser::Parser::new(tokenizer::Tokenizer::from(String::from($source))),
+            parser::Parser::new(tokenizer::Tokenizer::from($source)),
            &mut writer,
            Some(crate::CompilerConfig { debug: true }),
        );

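The test macros now hand `$source` (a string literal) straight to the tokenizer instead of allocating a `String` first. A minimal sketch, with a stand-in `Tokenizer` rather than the crate's real type, of the borrow-based construction this relies on:

```rust
// Stand-in Tokenizer: it borrows the source text for its lifetime instead of
// taking ownership of a copy, so a &str literal can be passed directly.
struct Tokenizer<'a> {
    source: &'a str,
}

impl<'a> From<&'a str> for Tokenizer<'a> {
    fn from(source: &'a str) -> Self {
        Tokenizer { source }
    }
}

fn main() {
    let source = "let x = 5";                 // what `$source` expands to in a test
    let tokenizer = Tokenizer::from(source);  // no String::from(..) copy needed
    assert_eq!(tokenizer.source.len(), source.len());
}
```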
@@ -5,7 +5,6 @@ pub mod sys_call;
 pub mod tree_node;
 
 use crate::sys_call::{Math, System};
 use quick_error::quick_error;
 use std::io::SeekFrom;
 use sys_call::SysCall;
 use thiserror::Error;
@@ -28,27 +27,27 @@ macro_rules! boxed {
 }
 
 #[derive(Error, Debug)]
-pub enum Error<'a> {
+pub enum Error {
     #[error("Tokenizer Error: {0}")]
     TokenizerError(#[from] tokenizer::Error),
 
     #[error("Unexpected token: {1}")]
-    UnexpectedToken(Span, Token<'a>),
+    UnexpectedToken(Span, Token),
 
     #[error("Duplicate identifier: {1}")]
-    DuplicateIdentifier(Span, Token<'a>),
+    DuplicateIdentifier(Span, Token),
 
     #[error("Invalid Syntax: {1}")]
-    InvalidSyntax(Span, Token<'a>),
+    InvalidSyntax(Span, String),
 
     #[error("Unsupported Keyword: {1}")]
-    UnsupportedKeyword(Span, Token<'a>),
+    UnsupportedKeyword(Span, Token),
 
     #[error("Unexpected End of File")]
     UnexpectedEOF,
 }
 
-impl<'a> From<Error<'a>> for lsp_types::Diagnostic {
+impl From<Error> for lsp_types::Diagnostic {
     fn from(value: Error) -> Self {
         use Error::*;
         use lsp_types::*;
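Dropping the `<'a>` parameter means a parse error no longer borrows from the source buffer. A sketch of what that buys, using simplified stand-in types rather than the crate's real `Token`/`Error`:

```rust
// With owned token payloads, errors can be collected, stored, and returned
// after the source text they came from has been dropped.
#[derive(Debug, Clone)]
struct Token {
    text: String, // owned, not a &'a str borrowed from the source
    line: usize,
}

#[derive(Debug)]
enum Error {
    UnexpectedToken(Token),
    UnexpectedEof,
}

fn collect_errors(source: &str) -> Vec<Error> {
    // With borrowed tokens this would have to return Vec<Error<'_>>,
    // tying every error to `source`; owned errors are free-standing.
    source
        .split_whitespace()
        .enumerate()
        .map(|(line, word)| Error::UnexpectedToken(Token { text: word.to_string(), line }))
        .collect()
}

fn main() {
    let errors = {
        let source = String::from("foo bar");
        collect_errors(&source)
        // `source` is dropped here; the owned errors remain valid.
    };
    println!("{errors:?}");
}
```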
@@ -107,8 +106,8 @@ macro_rules! self_matches_current {
 
 pub struct Parser<'a> {
     tokenizer: TokenizerBuffer<'a>,
-    current_token: Option<Token<'a>>,
-    pub errors: Vec<Error<'a>>,
+    current_token: Option<Token>,
+    pub errors: Vec<Error>,
 }
 
 impl<'a> Parser<'a> {
@@ -160,9 +159,10 @@ impl<'a> Parser<'a> {
 
         let node = parser(self)?;
 
-        let end_token = self.current_token;
+        let end_token = &self.current_token;
 
         let (end_line, end_col) = end_token
+            .clone()
             .map(|t| (t.line, t.span.end))
             .unwrap_or((start_line, start_col));
 
@@ -207,7 +207,7 @@ impl<'a> Parser<'a> {
         let first_token = self.tokenizer.peek().unwrap_or(None);
         let (start_line, start_col) = first_token
             .as_ref()
-            .map(|tok| (tok.line, tok.column))
+            .map(|tok| (tok.line, tok.span.start))
             .unwrap_or((1, 1));
 
         let mut expressions = Vec::<Spanned<Expression>>::new();
@@ -238,10 +238,7 @@ impl<'a> Parser<'a> {
 
         let end_token_opt = self.tokenizer.peek().unwrap_or(None);
         let (end_line, end_col) = end_token_opt
-            .map(|tok| {
-                let len = tok.original_string.as_ref().map(|s| s.len()).unwrap_or(0);
-                (tok.line, tok.column + len)
-            })
+            .map(|tok| (tok.line, tok.span.end))
             .unwrap_or((start_line, start_col));
 
         let span = Span {

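Both parser hunks above replace column arithmetic with the token's span: the start column is `span.start` and the end column is `span.end`, so there is no need to keep `column` and `original_string` around just to rebuild the end position. A small sketch of the new coordinate computation, with stand-in types:

```rust
#[derive(Clone, Copy)]
struct Span {
    start: usize,
    end: usize,
}

#[derive(Clone, Copy)]
struct Token {
    line: usize,
    span: Span,
}

// End coordinates come straight from the span; previously this was
// (tok.line, tok.column + original_string.len()).
fn end_coordinates(end_token: Option<Token>, fallback: (usize, usize)) -> (usize, usize) {
    end_token
        .map(|tok| (tok.line, tok.span.end))
        .unwrap_or(fallback)
}

fn main() {
    let tok = Token { line: 3, span: Span { start: 4, end: 9 } };
    assert_eq!(end_coordinates(Some(tok), (1, 1)), (3, 9));
    assert_eq!(end_coordinates(None, (1, 1)), (1, 1));
}
```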
@@ -44,7 +44,7 @@ pub trait Tokenize: Read + Seek {}
 impl<T> Tokenize for T where T: Read + Seek {}
 
 pub struct Tokenizer<'a> {
-    lexer: Lexer<'a, TokenType<'a>>,
+    lexer: Lexer<'a, TokenType>,
     returned_eof: bool,
 }
 
@@ -58,14 +58,14 @@ impl<'a> From<&'a str> for Tokenizer<'a> {
 }
 
 impl<'a> Tokenizer<'a> {
-    fn get_token(&mut self, t_type: TokenType<'a>) -> Token<'a> {
+    fn get_token(&mut self, t_type: TokenType) -> Token {
         let mut span = self.lexer.span();
         span.start -= self.lexer.extras.line_start_index;
         span.end -= self.lexer.extras.line_start_index;
         Token::new(t_type, self.lexer.extras.line_count, span)
     }
 
-    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error> {
+    pub fn next_token(&mut self) -> Result<Option<Token>, Error> {
         let to_return = self
             .lexer
             .next()
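`get_token` keeps turning the lexer's absolute byte span into a line-relative one by subtracting the index where the current line starts. A sketch of that adjustment in isolation (stand-in `Span`; the real code reads the line data from `lexer.extras`):

```rust
#[derive(Debug, PartialEq)]
struct Span {
    start: usize,
    end: usize,
}

// Shift an absolute span so that both ends are measured from the start of the
// current line, giving column-style coordinates.
fn line_relative(mut span: Span, line_start_index: usize) -> Span {
    span.start -= line_start_index;
    span.end -= line_start_index;
    span
}

fn main() {
    // A token at absolute bytes 27..32 on a line starting at byte 20
    // becomes columns 7..12 within that line.
    let span = line_relative(Span { start: 27, end: 32 }, 20);
    assert_eq!(span, Span { start: 7, end: 12 });
}
```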
@@ -79,7 +79,7 @@ impl<'a> Tokenizer<'a> {
 // ... Iterator and TokenizerBuffer implementations remain unchanged ...
 // They just call the methods above which now use the passed-in start coordinates.
 impl<'a> Iterator for Tokenizer<'a> {
-    type Item = Result<Token<'a>, Error>;
+    type Item = Result<Token, Error>;
     fn next(&mut self) -> Option<Self::Item> {
         match self.lexer.next() {
             None => {
@@ -104,8 +104,8 @@ impl<'a> Iterator for Tokenizer<'a> {
 
 pub struct TokenizerBuffer<'a> {
     tokenizer: Tokenizer<'a>,
-    buffer: VecDeque<Token<'a>>,
-    history: VecDeque<Token<'a>>,
+    buffer: VecDeque<Token>,
+    history: VecDeque<Token>,
     index: i64,
 }
 
@@ -118,7 +118,7 @@ impl<'a> TokenizerBuffer<'a> {
             index: 0,
         }
     }
-    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, Error> {
+    pub fn next_token(&mut self) -> Result<Option<Token>, Error> {
         if let Some(token) = self.buffer.pop_front() {
             self.history.push_back(token.clone());
             self.index += 1;
@@ -133,7 +133,7 @@ impl<'a> TokenizerBuffer<'a> {
         self.index += 1;
         Ok(token)
     }
-    pub fn peek(&mut self) -> Result<Option<Token<'a>>, Error> {
+    pub fn peek(&mut self) -> Result<Option<Token>, Error> {
         if let Some(token) = self.buffer.front() {
             return Ok(Some(token.clone()));
         }

@@ -43,7 +43,7 @@ impl From<LexError> for Diagnostic {
 }
 
 impl LexError {
-    pub fn from_lexer<'a>(lex: &mut Lexer<'a, TokenType<'a>>) -> Self {
+    pub fn from_lexer<'a>(lex: &mut Lexer<'a, TokenType>) -> Self {
         let mut span = lex.span();
         let line = lex.extras.line_count;
         span.start -= lex.extras.line_start_index;
@@ -68,30 +68,30 @@ pub struct Extras {
     pub line_start_index: usize,
 }
 
-fn update_line_index<'a>(lex: &mut Lexer<'a, TokenType<'a>>) -> Skip {
+fn update_line_index<'a>(lex: &mut Lexer<'a, TokenType>) -> Skip {
     lex.extras.line_count += 1;
     lex.extras.line_start_index = lex.span().end;
     Skip
 }
 
 #[derive(Debug, PartialEq, Eq, Clone)]
-pub struct Token<'a> {
+pub struct Token {
     /// The type of the token
-    pub token_type: TokenType<'a>,
+    pub token_type: TokenType,
     /// The line where the token was found
     pub line: usize,
     /// The span where the token starts and ends
     pub span: Span,
 }
 
-impl<'a> std::fmt::Display for Token<'a> {
+impl std::fmt::Display for Token {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self.token_type)
     }
 }
 
-impl<'a> Token<'a> {
-    pub fn new(token_type: TokenType<'a>, line: usize, span: Span) -> Self {
+impl Token {
+    pub fn new(token_type: TokenType, line: usize, span: Span) -> Self {
         Self {
             token_type,
             line,
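With the lifetime gone, `Token` is a plain owned value, which is what lets `TokenizerBuffer`'s `VecDeque<Token>` buffer and history drop their lifetime parameters and clone tokens freely. A sketch with a simplified `TokenType` (the real enum also carries keywords, symbols, numbers, and comments):

```rust
use std::collections::VecDeque;

#[derive(Debug, PartialEq, Eq, Clone)]
struct Span {
    start: usize,
    end: usize,
}

#[derive(Debug, PartialEq, Eq, Clone)]
enum TokenType {
    Identifier(String), // owned payload, no &'a str
    Eof,
}

#[derive(Debug, PartialEq, Eq, Clone)]
struct Token {
    token_type: TokenType,
    line: usize,
    span: Span,
}

fn main() {
    let token = Token {
        token_type: TokenType::Identifier("main".to_string()),
        line: 1,
        span: Span { start: 0, end: 4 },
    };

    // Because Token is fully owned, VecDeque<Token> needs no lifetime and the
    // clones pushed into the history are independent of the source text.
    let mut history: VecDeque<Token> = VecDeque::new();
    history.push_back(token.clone());
    assert_eq!(history.front(), Some(&token));
}
```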
@@ -158,16 +158,22 @@ macro_rules! keyword {
 #[logos(skip r"[ \t\f]+")]
 #[logos(extras = Extras)]
 #[logos(error(LexError, LexError::from_lexer))]
-pub enum TokenType<'a> {
+pub enum TokenType {
     #[regex(r"\n", update_line_index)]
     Newline,
 
     // matches strings with double quotes
-    #[regex(r#""(?:[^"\\]|\\.)*""#)]
+    #[regex(r#""(?:[^"\\]|\\.)*""#, |v| {
+        let str = v.slice();
+        str[1..str.len() - 1].to_string()
+    })]
     // matches strings with single quotes
-    #[regex(r#"'(?:[^'\\]|\\.)*'"#)]
+    #[regex(r#"'(?:[^'\\]|\\.)*'"#, |v| {
+        let str = v.slice();
+        str[1..str.len() - 1].to_string()
+    })]
     /// Represents a string token
-    String(&'a str),
+    String(String),
 
     #[regex(r"[0-9][0-9_]*(\.[0-9][0-9_]*)?([cfk])?", parse_number)]
     /// Represents a number token
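The new string-literal callbacks do one thing: strip the surrounding quotes from the matched slice and allocate an owned `String` for the token payload. Pulled out as a plain function (a sketch only; in the real code this runs inside the logos callback with `v.slice()` as input):

```rust
// The regex guarantees the slice starts and ends with a quote character,
// so dropping the first and last byte leaves the string's contents.
fn strip_quotes(slice: &str) -> String {
    slice[1..slice.len() - 1].to_string()
}

fn main() {
    assert_eq!(strip_quotes("\"hello\""), "hello");
    assert_eq!(strip_quotes("'x'"), "x");
}
```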
@@ -193,9 +199,9 @@ pub enum TokenType<'a> {
     /// Represents a keyword token
     Keyword(Keyword),
 
-    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*")]
+    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |v| v.slice().to_string())]
     /// Represents an identifier token
-    Identifier(&'a str),
+    Identifier(String),
 
     #[token("(", symbol!(LParen))]
     #[token(")", symbol!(RParen))]
@@ -227,10 +233,10 @@ pub enum TokenType<'a> {
     /// Represents a symbol token
     Symbol(Symbol),
 
-    #[regex(r"///[\n]*", |val| Comment::Doc(val.slice()[3..].trim()))]
-    #[regex(r"//[\n]*", |val| Comment::Line(val.slice()[2..].trim()))]
+    #[regex(r"///[\n]*", |val| Comment::Doc(val.slice()[3..].trim().to_string()))]
+    #[regex(r"//[\n]*", |val| Comment::Line(val.slice()[2..].trim().to_string()))]
     /// Represents a comment, both a line comment and a doc comment
-    Comment(Comment<'a>),
+    Comment(Comment),
 
     #[end]
     /// Represents an end of file token
@@ -238,12 +244,12 @@ pub enum TokenType<'a> {
 }
 
 #[derive(Hash, Debug, Eq, PartialEq, Clone)]
-pub enum Comment<'a> {
-    Line(&'a str),
-    Doc(&'a str),
+pub enum Comment {
+    Line(String),
+    Doc(String),
 }
 
-fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType<'a>>) -> Result<Number, LexError> {
+fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType>) -> Result<Number, LexError> {
     let slice = lexer.slice();
     let last_char = slice.chars().last().unwrap_or_default();
     let (num_str, suffix) = match last_char {
@@ -289,7 +295,7 @@ fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType<'a>>) -> Result<Number, LexE
     }
 }
 
-impl<'a> std::fmt::Display for Comment<'a> {
+impl std::fmt::Display for Comment {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             Self::Line(c) => write!(f, "// {}", c),
@@ -306,7 +312,7 @@ impl<'a> std::fmt::Display for Comment<'a> {
     }
 }
 
-impl<'a> Documentation for TokenType<'a> {
+impl Documentation for TokenType {
     fn docs(&self) -> String {
         match self {
             Self::Keyword(k) => k.docs(),
@@ -321,7 +327,7 @@ impl<'a> Documentation for TokenType<'a> {
 
 helpers::with_syscalls!(generate_check);
 
-impl<'a> From<TokenType<'a>> for u32 {
+impl From<TokenType> for u32 {
     fn from(value: TokenType) -> Self {
         match value {
             TokenType::String(_) => 1,
@@ -361,7 +367,7 @@ impl<'a> From<TokenType<'a>> for u32 {
     }
 }
 
-impl<'a> std::fmt::Display for TokenType<'a> {
+impl std::fmt::Display for TokenType {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             TokenType::String(s) => write!(f, "{}", s),

@@ -4,7 +4,7 @@ use parser::{sys_call::SysCall, Parser};
 use safer_ffi::prelude::*;
 use std::io::BufWriter;
 use tokenizer::{
-    token::{Token, TokenType},
+    token::{LexError, Token, TokenType},
     Tokenizer,
 };
 
@@ -96,9 +96,10 @@ pub fn free_docs_vec(v: safer_ffi::Vec<FfiDocumentedItem>) {
 #[ffi_export]
 pub fn compile_from_string(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::String {
     let res = std::panic::catch_unwind(|| {
+        let input = String::from_utf16_lossy(input.as_slice());
         let mut writer = BufWriter::new(Vec::new());
 
-        let tokenizer = Tokenizer::from(String::from_utf16_lossy(input.as_slice()));
+        let tokenizer = Tokenizer::from(input.as_str());
         let parser = Parser::new(tokenizer);
         let compiler = Compiler::new(parser, &mut writer, None);
 
@@ -120,7 +121,8 @@ pub fn compile_from_string(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::
 #[ffi_export]
 pub fn tokenize_line(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::Vec<FfiToken> {
     let res = std::panic::catch_unwind(|| {
-        let tokenizer = Tokenizer::from(String::from_utf16_lossy(input.as_slice()));
+        let input = String::from_utf16_lossy(input.as_slice());
+        let tokenizer = Tokenizer::from(input.as_str());
 
         let mut tokens = Vec::new();
 
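Both FFI entry points follow the same pattern now: bind the UTF-16 conversion to a local `String` first, then lend the tokenizer a `&str` view of it. Passing the temporary `String` directly into a borrowing tokenizer would not compile, because the borrow has to outlive the whole call. A sketch of the pattern with a stand-in tokenizer:

```rust
struct Tokenizer<'a> {
    source: &'a str, // stand-in for the real borrowing Tokenizer
}

fn tokenize(utf16: &[u16]) -> usize {
    // The owned String lives for the rest of the function, so the tokenizer
    // may borrow from it safely.
    let input = String::from_utf16_lossy(utf16);
    let tokenizer = Tokenizer { source: input.as_str() };
    tokenizer.source.chars().count()
}

fn main() {
    let utf16: Vec<u16> = "let x = 5".encode_utf16().collect();
    assert_eq!(tokenize(&utf16), 9);
}
```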
@@ -136,34 +138,36 @@ pub fn tokenize_line(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::Vec<Ff
         }
         match token {
             Err(ref e) => {
+                use tokenizer::token::LexError;
                 use tokenizer::Error::*;
-                let (err_str, col, og) = match e {
-                    NumberParseError(_, _, col, og)
-                    | DecimalParseError(_, _, col, og)
-                    | UnknownSymbolError(_, _, col, og)
-                    | UnknownKeywordOrIdentifierError(_, _, col, og) => {
-                        (e.to_string(), col, og)
+                let (err_str, line, span) = match e {
+                    LexError(e) => match e {
+                        LexError::NumberParseError(line, span, err)
+                        | LexError::InvalidInput(line, span, err) => {
+                            (err.to_string(), line, span)
                         }
+                        _ => continue,
+                    },
                     _ => continue,
                 };
 
                 tokens.push(FfiToken {
-                    column: *col as i32,
+                    column: span.start as i32,
                     error: err_str.into(),
                     tooltip: "".into(),
-                    length: og.len() as i32,
+                    length: (span.end - span.start) as i32,
                     token_kind: 0,
                 })
             }
             Ok(Token {
-                column,
-                original_string,
+                line,
+                span,
                 token_type,
                 ..
             }) => tokens.push(FfiToken {
-                column: column as i32,
+                column: span.start as i32,
                 error: "".into(),
-                length: (original_string.unwrap_or_default().len()) as i32,
+                length: (span.end - span.start) as i32,
                 tooltip: token_type.docs().into(),
                 token_kind: token_type.into(),
             }),
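The `FfiToken` geometry now comes entirely from the span: the column is `span.start` and the length is `span.end - span.start`, for both error and success tokens. A sketch of that derivation with stand-in types (`FfiToken` reduced to the two fields that changed):

```rust
struct Span {
    start: usize,
    end: usize,
}

struct FfiToken {
    column: i32,
    length: i32,
}

fn to_ffi(span: &Span) -> FfiToken {
    FfiToken {
        column: span.start as i32,              // was: the stored column field
        length: (span.end - span.start) as i32, // was: original_string length
    }
}

fn main() {
    let t = to_ffi(&Span { start: 4, end: 9 });
    assert_eq!((t.column, t.length), (4, 5));
}
```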
@@ -179,8 +183,10 @@ pub fn tokenize_line(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::Vec<Ff
 #[ffi_export]
 pub fn diagnose_source(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::Vec<FfiDiagnostic> {
     let res = std::panic::catch_unwind(|| {
+        let input = String::from_utf16_lossy(input.as_slice());
+
         let mut writer = BufWriter::new(Vec::new());
-        let tokenizer = Tokenizer::from(String::from_utf16_lossy(input.as_slice()));
+        let tokenizer = Tokenizer::from(input.as_str());
         let compiler = Compiler::new(Parser::new(tokenizer), &mut writer, None);
 
         let diagnosis = compiler.compile();

@@ -50,8 +50,13 @@ fn run_logic() -> Result<(), StationlangError> {
     let args = Args::parse();
     let input_file = args.input_file;
 
-    let tokenizer: Tokenizer = match input_file {
-        Some(input_file) => Tokenizer::from_path(&input_file)?,
+    let input_string = match input_file {
+        Some(input_path) => {
+            let mut buf = String::new();
+            let mut file = std::fs::File::open(input_path).unwrap();
+            file.read_to_string(&mut buf).unwrap();
+            buf
+        }
         None => {
             let mut buf = String::new();
             let stdin = std::io::stdin();
@@ -62,10 +67,12 @@ fn run_logic() -> Result<(), StationlangError> {
                 return Ok(());
             }
 
-            Tokenizer::from(buf)
+            buf
         }
     };
 
+    let tokenizer = Tokenizer::from(input_string.as_str());
+
     let parser = ASTParser::new(tokenizer);
 
     let mut writer: BufWriter<Box<dyn Write>> = match args.output_file {
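The CLI now reads the whole input, whether from a file path or stdin, into one `String` and builds the tokenizer from a borrow of that buffer. A sketch of the same flow with a stand-in `tokenize` function and `?`-based error handling instead of the `unwrap` calls in the diff:

```rust
use std::io::Read;

// Read the entire source into an owned buffer, from a file if a path is
// given, otherwise from stdin.
fn read_source(path: Option<&str>) -> Result<String, Box<dyn std::error::Error>> {
    let mut buf = String::new();
    match path {
        Some(path) => {
            std::fs::File::open(path)?.read_to_string(&mut buf)?;
        }
        None => {
            std::io::stdin().read_to_string(&mut buf)?;
        }
    }
    Ok(buf)
}

fn tokenize(source: &str) -> usize {
    source.split_whitespace().count() // stand-in for Tokenizer::from(source)
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let input_string = read_source(std::env::args().nth(1).as_deref())?;
    // The tokenizer borrows input_string, which stays alive for the rest of main.
    let token_count = tokenize(input_string.as_str());
    println!("{token_count} tokens");
    Ok(())
}
```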