diff --git a/rust_compiler/Cargo.lock b/rust_compiler/Cargo.lock
index 50bc630..d5e79e8 100644
--- a/rust_compiler/Cargo.lock
+++ b/rust_compiler/Cargo.lock
@@ -28,6 +28,15 @@ dependencies = [
  "version_check",
 ]
 
+[[package]]
+name = "aho-corasick"
+version = "1.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "anstream"
 version = "0.6.21"
@@ -114,6 +123,12 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "beef"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3a8241f3ebb85c056b509d4327ad0358fbbba6ffb340bf388f26350aeda225b1"
+
 [[package]]
 name = "bitflags"
 version = "1.3.2"
@@ -327,6 +342,12 @@ dependencies = [
  "bitflags",
 ]
 
+[[package]]
+name = "fnv"
+version = "1.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
+
 [[package]]
 name = "funty"
 version = "2.0.0"
@@ -434,6 +455,40 @@ version = "0.2.178"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091"
 
+[[package]]
+name = "logos"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a790d11254054e5dc83902dba85d253ff06ceb0cfafb12be8773435cb9dfb4f4"
+dependencies = [
+ "logos-derive",
+]
+
+[[package]]
+name = "logos-codegen"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f60337c43a38313b58871f8d5d76872b8e17aa9d51fad494b5e76092c0ce05f5"
+dependencies = [
+ "beef",
+ "fnv",
+ "proc-macro2",
+ "quote",
+ "regex-automata",
+ "regex-syntax",
+ "rustc_version",
+ "syn 2.0.111",
+]
+
+[[package]]
+name = "logos-derive"
+version = "0.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d151b2ae667f69e10b8738f5cac0c746faa22b2e15ea7e83b55476afec3767dc"
+dependencies = [
+ "logos-codegen",
+]
+
 [[package]]
 name = "lsp-types"
 version = "0.97.0"
@@ -644,6 +699,23 @@ dependencies = [
  "getrandom",
 ]
 
+[[package]]
+name = "regex-automata"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
+
 [[package]]
 name = "rend"
 version = "0.4.2"
@@ -947,6 +1019,7 @@ version = "0.1.0"
 dependencies = [
  "anyhow",
  "helpers",
+ "logos",
  "lsp-types",
  "quick-error",
  "rust_decimal",
diff --git a/rust_compiler/libs/tokenizer/Cargo.toml b/rust_compiler/libs/tokenizer/Cargo.toml
index 7433cab..37b5611 100644
--- a/rust_compiler/libs/tokenizer/Cargo.toml
+++ b/rust_compiler/libs/tokenizer/Cargo.toml
@@ -8,6 +8,7 @@ rust_decimal = { workspace = true }
 quick-error = { workspace = true }
 lsp-types = { workspace = true }
 helpers = { path = "../helpers" }
+logos = "0.16"
 
 [dev-dependencies]
 anyhow = { version = "^1" }
diff --git a/rust_compiler/libs/tokenizer/src/token.rs b/rust_compiler/libs/tokenizer/src/token.rs
index 9745ecd..2233da3 100644
--- a/rust_compiler/libs/tokenizer/src/token.rs
+++ b/rust_compiler/libs/tokenizer/src/token.rs
@@ -1,4 +1,5 @@
 use helpers::prelude::*;
+use logos::{Lexer, Logos};
 use rust_decimal::Decimal;
 
 // Define a local macro to consume the list
@@ -79,24 +80,153 @@ impl Temperature {
     }
 }
 
-#[derive(Debug, PartialEq, Hash, Eq, Clone)]
+macro_rules! symbol {
+    ($var:ident) => {
+        |_| Symbol::$var
+    };
+}
+
+macro_rules! keyword {
+    ($var:ident) => {
+        |_| Keyword::$var
+    };
+}
+
+#[derive(Debug, PartialEq, Hash, Eq, Clone, Logos)]
 pub enum TokenType {
+    // matches strings with double quotes
+    #[regex(r#""(?:[^"\\]|\\.)*""#, |v| v.slice().to_string())]
+    // matches strings with single quotes
+    #[regex(r#"'(?:[^'\\]|\\.)*'"#, |v| v.slice().to_string())]
     /// Represents a string token
     String(String),
+
+    #[regex(r"[0-9][0-9_]*(\.[0-9][0-9_]*)?([cfk])?", parse_number)]
     /// Represents a number token
     Number(Number),
+
+    #[token("true", |_| true)]
+    #[token("false", |_| false)]
     /// Represents a boolean token
     Boolean(bool),
+
+    #[token("continue", keyword!(Continue))]
+    #[token("const", keyword!(Const))]
+    #[token("let", keyword!(Let))]
+    #[token("fn", keyword!(Fn))]
+    #[token("if", keyword!(If))]
+    #[token("device", keyword!(Device))]
+    #[token("else", keyword!(Else))]
+    #[token("return", keyword!(Return))]
+    #[token("enum", keyword!(Enum))]
+    #[token("loop", keyword!(Loop))]
+    #[token("break", keyword!(Break))]
+    #[token("while", keyword!(While))]
     /// Represents a keyword token
     Keyword(Keyword),
+
+    #[regex(r"[a-zA-Z_][a-zA-Z0-9_]*", |v| v.slice().to_string())]
     /// Represents an identifier token
     Identifier(String),
+
+    #[token("(", symbol!(LParen))]
+    #[token(")", symbol!(RParen))]
+    #[token("{", symbol!(LBrace))]
+    #[token("}", symbol!(RBrace))]
+    #[token("[", symbol!(LBracket))]
+    #[token("]", symbol!(RBracket))]
+    #[token(";", symbol!(Semicolon))]
+    #[token(":", symbol!(Colon))]
+    #[token(",", symbol!(Comma))]
+    #[token("+", symbol!(Plus))]
+    #[token("-", symbol!(Minus))]
+    #[token("*", symbol!(Asterisk))]
+    #[token("/", symbol!(Slash))]
+    #[token("<", symbol!(LessThan))]
+    #[token(">", symbol!(GreaterThan))]
+    #[token("=", symbol!(Assign))]
+    #[token("!", symbol!(LogicalNot))]
+    #[token(".", symbol!(Dot))]
+    #[token("^", symbol!(Caret))]
+    #[token("%", symbol!(Percent))]
+    #[token("==", symbol!(Equal))]
+    #[token("!=", symbol!(NotEqual))]
+    #[token("&&", symbol!(LogicalAnd))]
+    #[token("||", symbol!(LogicalOr))]
+    #[token("<=", symbol!(LessThanOrEqual))]
+    #[token(">=", symbol!(GreaterThanOrEqual))]
+    #[token("**", symbol!(Exp))]
     /// Represents a symbol token
     Symbol(Symbol),
+
+    #[regex(r"///[^\n]*", |val| Comment::Doc(val.slice()[3..].trim().to_string()))]
+    #[regex(r"//[^\n]*", |val| Comment::Line(val.slice()[2..].trim().to_string()))]
+    /// Represents a comment, both a line comment and a doc comment
+    Comment(Comment),
+
+    #[end]
     /// Represents an end of file token
     EOF,
 }
 
+#[derive(Hash, Debug, Eq, PartialEq, Clone)]
+pub enum Comment {
+    Line(String),
+    Doc(String),
+}
+
+fn parse_number<'a>(lexer: &mut Lexer<'a, TokenType>) -> Option<Number> {
+    let slice = lexer.slice();
+    let last_char = slice.chars().last()?;
+    let (num_str, suffix) = match last_char {
+        'c' | 'k' | 'f' => (&slice[..slice.len() - 1], Some(last_char)),
+        _ => (slice, None),
+    };
+
+    let clean_str = if num_str.contains('_') {
+        num_str.replace('_', "")
+    } else {
+        num_str.to_string()
+    };
+
+    let num = if clean_str.contains('.') {
+        Number::Decimal(clean_str.parse::<Decimal>().ok()?)
+    } else {
+        Number::Integer(clean_str.parse().ok()?)
+    };
+
+    if let Some(suffix) = suffix {
+        Some(
+            match suffix {
+                'c' => Temperature::Celsius(num),
+                'f' => Temperature::Fahrenheit(num),
+                'k' => Temperature::Kelvin(num),
+                _ => unreachable!(),
+            }
+            .to_kelvin(),
+        )
+    } else {
+        Some(num)
+    }
+}
+
+impl std::fmt::Display for Comment {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Line(c) => write!(f, "// {}", c),
+            Self::Doc(d) => {
+                let lines = d
+                    .split('\n')
+                    .map(|s| format!("/// {s}"))
+                    .collect::<Vec<_>>()
+                    .join("\n");
+
+                write!(f, "{}", lines)
+            }
+        }
+    }
+}
+
 impl Documentation for TokenType {
     fn docs(&self) -> String {
         match self {
@@ -128,6 +258,7 @@ impl From<TokenType> for u32 {
                 | Keyword::Return => 4,
                 _ => 5,
             },
+            TokenType::Comment(_) => 8,
             TokenType::Identifier(s) => {
                 if is_syscall(&s) {
                     10
@@ -160,6 +291,7 @@ impl std::fmt::Display for TokenType {
             TokenType::Keyword(k) => write!(f, "{:?}", k),
             TokenType::Identifier(i) => write!(f, "{}", i),
             TokenType::Symbol(s) => write!(f, "{}", s),
+            TokenType::Comment(c) => write!(f, "{}", c),
             TokenType::EOF => write!(f, "EOF"),
         }
     }
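
Reviewer note: a minimal sketch of driving the derived lexer, assuming the crate is imported as `tokenizer` (path inferred from libs/tokenizer/src/token.rs). The enum above declares no #[logos(skip ...)] pattern, so whitespace between tokens surfaces as Err items that the caller has to drop:

    use logos::Logos;
    use tokenizer::token::TokenType;

    fn main() {
        // Logos yields Result<TokenType, _>; unmatched input (here: the
        // spaces between tokens) comes back as Err and is filtered out.
        let tokens: Vec<TokenType> = TokenType::lexer("let x = 42;")
            .filter_map(Result::ok)
            .collect();

        // Expected shape: Keyword(Let), Identifier("x"), Symbol(Assign),
        // Number(...), Symbol(Semicolon).
        for token in &tokens {
            println!("{token}");
        }
    }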
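Reviewer note: what parse_number implies for literals, written as a hypothetical test. The exact Number variants produced, and the claim that Temperature::to_kelvin returns a Number, are assumptions read off the callback's return type:

    #[test]
    fn number_literals() {
        use logos::Logos;
        use tokenizer::token::TokenType;

        // Underscore separators are stripped before parsing, so "1_000"
        // lexes to the same token as "1000".
        let mut lex = TokenType::lexer("1_000");
        assert!(matches!(lex.next(), Some(Ok(TokenType::Number(_)))));

        // A 'c' suffix wraps the value in Temperature::Celsius and folds
        // it to Kelvin before the token is produced ('f' and 'k' likewise).
        let mut lex = TokenType::lexer("21c");
        assert!(matches!(lex.next(), Some(Ok(TokenType::Number(_)))));
    }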
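Reviewer note: the new Display impl reinstates the comment markers, with a doc comment re-prefixed on every line. A small sketch of the behavior established above (assuming Comment is exported from tokenizer::token):

    use tokenizer::token::Comment;

    fn comment_display() {
        assert_eq!(Comment::Line("note".into()).to_string(), "// note");
        // Multi-line doc text gets one "/// " prefix per line.
        assert_eq!(Comment::Doc("a\nb".into()).to_string(), "/// a\n/// b");
    }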