From 245567c35463a10d4f5cf9bed779a7482be28fe8 Mon Sep 17 00:00:00 2001 From: Devin Bidwell Date: Thu, 12 Jun 2025 20:52:39 -0700 Subject: [PATCH] workspace libs --- Cargo.lock | 311 +++++--- Cargo.toml | 11 +- libs/tokenizer/Cargo.toml | 11 + libs/tokenizer/src/lib.rs | 897 ++++++++++++++++++++++ libs/tokenizer/src/token.rs | 221 ++++++ {tests => libs/tokenizer/tests}/file.stlg | 2 +- src/tokenizer/mod.rs | 5 +- 7 files changed, 1346 insertions(+), 112 deletions(-) create mode 100644 libs/tokenizer/Cargo.toml create mode 100644 libs/tokenizer/src/lib.rs create mode 100644 libs/tokenizer/src/token.rs rename {tests => libs/tokenizer/tests}/file.stlg (96%) diff --git a/Cargo.lock b/Cargo.lock index 62124d4..94811fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13,9 +13,9 @@ dependencies = [ [[package]] name = "adler2" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "ahash" @@ -30,9 +30,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.18" +version = "0.6.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" dependencies = [ "anstyle", "anstyle-parse", @@ -45,43 +45,44 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.6" +version = "3.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", + "once_cell_polyfill", "windows-sys", ] [[package]] name = "anyhow" -version = "1.0.93" +version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" dependencies = [ "backtrace", ] @@ -100,9 +101,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "backtrace" -version = "0.3.74" +version = "0.3.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" dependencies = [ "addr2line", "cfg-if", @@ -127,9 +128,9 @@ dependencies = [ [[package]] name = "borsh" -version = "1.5.3" +version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2506947f73ad44e344215ccd6403ac2ae18cd8e046e581a441bf8d199f257f03" +checksum = "ad8646f98db542e39fc66e68a20b2144f6a732636df7c2354e74645faaa433ce" dependencies = [ "borsh-derive", 
"cfg_aliases", @@ -137,17 +138,23 @@ dependencies = [ [[package]] name = "borsh-derive" -version = "1.5.3" +version = "1.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2593a3b8b938bd68373196c9832f516be11fa487ef4ae745eb282e6a56a7244" +checksum = "fdd1d3c0c2f5833f22386f252fe8ed005c7f59fdcddeef025c01b4c3b9fd9ac3" dependencies = [ "once_cell", "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.102", ] +[[package]] +name = "bumpalo" +version = "3.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" + [[package]] name = "bytecheck" version = "0.6.12" @@ -170,23 +177,17 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - [[package]] name = "bytes" -version = "1.8.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" [[package]] name = "cfg-if" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" [[package]] name = "cfg_aliases" @@ -196,9 +197,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "clap" -version = "4.5.21" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" +checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" dependencies = [ "clap_builder", 
"clap_derive", @@ -206,9 +207,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.21" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" +checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" dependencies = [ "anstream", "anstyle", @@ -218,33 +219,33 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.18" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.102", ] [[package]] name = "clap_lex" -version = "0.7.3" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "colorchoice" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "funty" @@ -254,9 +255,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "getrandom" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", "libc", @@ -280,9 +281,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.2" +version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" [[package]] name = "heck" @@ -292,12 +293,12 @@ checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "indexmap" -version = "2.6.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" dependencies = [ "equivalent", - "hashbrown 0.15.2", + "hashbrown 0.15.4", ] [[package]] @@ -308,27 +309,43 @@ checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itoa" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] [[package]] name = "libc" -version = "0.2.164" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "log" +version = "0.4.27" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "memchr" -version = "2.7.4" +version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "miniz_oxide" -version = "0.8.0" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", ] @@ -344,42 +361,48 @@ dependencies = [ [[package]] name = "object" -version = "0.36.5" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" dependencies = [ "memchr", ] [[package]] name = "once_cell" -version = "1.20.2" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" [[package]] name = "ppv-lite86" -version = "0.2.20" +version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ "zerocopy", ] [[package]] name = "proc-macro-crate" -version = 
"3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" +checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35" dependencies = [ "toml_edit", ] [[package]] name = "proc-macro2" -version = "1.0.89" +version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] @@ -412,9 +435,9 @@ checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" [[package]] name = "quote" -version = "1.0.37" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] @@ -495,9 +518,9 @@ dependencies = [ [[package]] name = "rust_decimal" -version = "1.36.0" +version = "1.37.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b082d80e3e3cc52b2ed634388d436fe1f4de6af5786cc2de9ba9737527bdf555" +checksum = "faa7de2ba56ac291bd90c6b9bece784a52ae1411f9506544b3eae36dd2356d50" dependencies = [ "arrayvec", "borsh", @@ -511,15 +534,21 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" + +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" [[package]] name = "ryu" -version = "1.0.18" 
+version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" [[package]] name = "seahash" @@ -529,29 +558,29 @@ checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b" [[package]] name = "serde" -version = "1.0.215" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.102", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "itoa", "memchr", @@ -594,9 +623,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.87" +version = "2.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +checksum = "f6397daf94fa90f058bd0fd88429dd9e5738999cca8d701813c80723add80462" dependencies = [ "proc-macro2", "quote", @@ -611,9 +640,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tinyvec" -version = "1.8.0" +version = "1.9.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" dependencies = [ "tinyvec_macros", ] @@ -624,17 +653,26 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizer" +version = "0.1.0" +dependencies = [ + "anyhow", + "quick-error", + "rust_decimal", +] + [[package]] name = "toml_datetime" -version = "0.6.8" +version = "0.6.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dd7358ecb8fc2f8d014bf86f6f638ce72ba252a2c3a2572f2a795f1d23efb41" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ "indexmap", "toml_datetime", @@ -643,9 +681,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "utf8parse" @@ -655,9 +693,13 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.11.0" +version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" +dependencies = [ + "js-sys", + "wasm-bindgen", +] 
[[package]] name = "version_check" @@ -667,9 +709,67 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" +version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn 2.0.102", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.102", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] [[package]] name = "windows-sys" @@ -746,9 +846,9 @@ 
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.20" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36c1fec1a2bb5866f07c25f68c26e565c4c200aebb96d7e55710c19d3e8ac49b" +checksum = "74c7b26e3480b707944fc872477815d29a8e429d2f93a1ce000f5fa84a15cbcd" dependencies = [ "memchr", ] @@ -764,21 +864,20 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.7.35" +version = "0.8.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" dependencies = [ - "byteorder", "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.35" +version = "0.8.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" dependencies = [ "proc-macro2", "quote", - "syn 2.0.87", + "syn 2.0.102", ] diff --git a/Cargo.toml b/Cargo.toml index 32ae60f..b7059ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,14 +3,21 @@ name = "stationlang" version = "0.1.0" edition = "2021" +[workspace] +members = ["libs/*"] + +[workspace.dependencies] +quick-error = "2" +rust_decimal = "1" + [[bin]] name = "slang" path = "src/main.rs" [dependencies] clap = { version = "^4.5", features = ["derive"] } -quick-error = "2.0.1" -rust_decimal = "1.36.0" +quick-error = { workspace = true } +rust_decimal = { workspace = true } [dev-dependencies] anyhow = { version = "^1.0", features = ["backtrace"] } diff --git a/libs/tokenizer/Cargo.toml b/libs/tokenizer/Cargo.toml new file mode 100644 index 0000000..100b2b7 --- /dev/null +++ b/libs/tokenizer/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "tokenizer" +version = "0.1.0" +edition = "2024" + 
+[dependencies] +rust_decimal = { workspace = true } +quick-error = { workspace = true } + +[dev-dependencies] +anyhow = { version = "^1" } diff --git a/libs/tokenizer/src/lib.rs b/libs/tokenizer/src/lib.rs new file mode 100644 index 0000000..7410c14 --- /dev/null +++ b/libs/tokenizer/src/lib.rs @@ -0,0 +1,897 @@ +pub mod token; + +use quick_error::quick_error; +use rust_decimal::Decimal; +use std::{ + cmp::Ordering, + collections::VecDeque, + io::{BufReader, Cursor, Read, Seek, SeekFrom}, + path::PathBuf, +}; +use token::{Keyword, Number, Symbol, Temperature, Token, TokenType}; + +quick_error! { + #[derive(Debug)] + pub enum TokenizerError { + IOError(err: std::io::Error) { + from() + display("IO Error: {}", err) + source(err) + } + NumberParseError(err: std::num::ParseIntError, line: usize, column: usize) { + display("Number Parse Error: {}\nLine: {}, Column: {}", err, line, column) + source(err) + } + DecimalParseError(err: rust_decimal::Error, line: usize, column: usize) { + display("Decimal Parse Error: {}\nLine: {}, Column: {}", err, line, column) + source(err) + } + UnknownSymbolError(char: char, line: usize, column: usize) { + display("Unknown Symbol: {}\nLine: {}, Column: {}", char, line, column) + } + UnknownKeywordOrIdentifierError(val: String, line: usize, column: usize) { + display("Unknown Keyword or Identifier: {}\nLine: {}, Column: {}", val, line, column) + } + } +} + +pub trait Tokenize: Read + Seek {} + +impl Tokenize for T where T: Read + Seek {} + +pub(crate) struct Tokenizer { + reader: BufReader>, + char_buffer: [u8; 1], + line: usize, + column: usize, + returned_eof: bool, +} + +impl Tokenizer { + pub fn from_path(input_file: impl Into) -> Result { + let file = std::fs::File::open(input_file.into())?; + let reader = BufReader::new(Box::new(file) as Box); + + Ok(Self { + reader, + line: 1, + column: 1, + char_buffer: [0], + returned_eof: false, + }) + } +} + +impl From for Tokenizer { + fn from(input: String) -> Self { + let reader = 
BufReader::new(Box::new(Cursor::new(input)) as Box); + + Self { + reader, + line: 1, + column: 1, + char_buffer: [0], + returned_eof: false, + } + } +} + +impl Tokenizer { + /// Consumes the tokenizer and returns the next token in the stream + /// If there are no more tokens in the stream, this function returns None + /// If there is an error reading the stream, this function returns an error + /// + /// # Important + /// This function will increment the line and column counters + fn next_char(&mut self) -> Result, TokenizerError> { + let bytes_read = self.reader.read(&mut self.char_buffer)?; + + if bytes_read == 0 { + return Ok(None); + } + + // Safety: The buffer is guaranteed to have 1 value as it is initialized with a size of 1 + let c = self.char_buffer[0] as char; + if c == '\n' { + self.line += 1; + self.column = 1; + } else { + self.column += 1; + } + + Ok(Some(c)) + } + + /// Peeks the next character in the stream without consuming it + /// + /// # Important + /// This does not increment the line or column counters + fn peek_next_char(&mut self) -> Result, TokenizerError> { + let current_pos = self.reader.stream_position()?; + + let to_return = if self.reader.read(&mut self.char_buffer)? == 0 { + None + } else { + self.reader.seek(SeekFrom::Start(current_pos))?; + + // Safety: The buffer is guaranteed to have 1 value as it is initialized with a size of 1 + Some(self.char_buffer[0] as char) + }; + + Ok(to_return) + } + + /// Skips the current line in the stream. + /// Useful for skipping comments or empty lines + /// + /// # Important + /// This function will increment the line and column counters + fn skip_line(&mut self) -> Result<(), TokenizerError> { + while let Some(next_char) = self.next_char()? 
{ + if next_char == '\n' { + break; + } + } + Ok(()) + } + + /// Consumes the tokenizer and returns the next token in the stream + /// If there are no more tokens in the stream, this function returns None + pub fn next_token(&mut self) -> Result, TokenizerError> { + while let Some(next_char) = self.next_char()? { + // skip whitespace + if next_char.is_whitespace() { + continue; + } + // skip comments + if next_char == '/' && self.peek_next_char()? == Some('/') { + self.skip_line()?; + continue; + } + + match next_char { + // numbers + '0'..='9' => { + return self.tokenize_number(next_char).map(Some); + } + // strings + '"' | '\'' => return self.tokenize_string(next_char).map(Some), + // symbols excluding `"` and `'` + char if !char.is_alphanumeric() && char != '"' && char != '\'' => { + return self.tokenize_symbol(next_char).map(Some); + } + // keywords and identifiers + char if char.is_alphabetic() => { + return self.tokenize_keyword_or_identifier(next_char).map(Some); + } + _ => { + return Err(TokenizerError::UnknownSymbolError( + next_char, + self.line, + self.column, + )); + } + } + } + if self.returned_eof { + Ok(None) + } else { + self.returned_eof = true; + Ok(Some(Token::new(TokenType::EOF, self.line, self.column))) + } + } + + /// Peeks the next token in the stream without consuming it + /// If there are no more tokens in the stream, this function returns None + pub fn peek_next(&mut self) -> Result, TokenizerError> { + let current_pos = self.reader.stream_position()?; + let column = self.column.clone(); + let line = self.line.clone(); + + let token = self.next_token()?; + self.reader.seek(SeekFrom::Start(current_pos))?; + self.column = column; + self.line = line; + Ok(token) + } + + /// Tokenizes a symbol + fn tokenize_symbol(&mut self, first_symbol: char) -> Result { + /// Helper macro to create a symbol token + macro_rules! 
symbol { + ($symbol:ident) => { + Ok(Token::new( + TokenType::Symbol(Symbol::$symbol), + self.line, + self.column, + )) + }; + } + + match first_symbol { + // single character symbols + '(' => symbol!(LParen), + ')' => symbol!(RParen), + '{' => symbol!(LBrace), + '}' => symbol!(RBrace), + '[' => symbol!(LBracket), + ']' => symbol!(RBracket), + ';' => symbol!(Semicolon), + ':' => symbol!(Colon), + ',' => symbol!(Comma), + '+' => symbol!(Plus), + '-' => symbol!(Minus), + '/' => symbol!(Slash), + + '.' => symbol!(Dot), + '^' => symbol!(Caret), + + // multi-character symbols + '<' if self.peek_next_char()? == Some('=') => { + self.next_char()?; + symbol!(LessThanOrEqual) + } + '<' => symbol!(LessThan), + + '>' if self.peek_next_char()? == Some('=') => { + self.next_char()?; + symbol!(GreaterThanOrEqual) + } + '>' => symbol!(GreaterThan), + + '=' if self.peek_next_char()? == Some('=') => { + self.next_char()?; + symbol!(Equal) + } + '=' => symbol!(Assign), + + '!' if self.peek_next_char()? == Some('=') => { + self.next_char()?; + symbol!(NotEqual) + } + '!' => symbol!(LogicalNot), + + '*' if self.peek_next_char()? == Some('*') => { + self.next_char()?; + symbol!(Exp) + } + '*' => symbol!(Asterisk), + + '&' if self.peek_next_char()? == Some('&') => { + self.next_char()?; + symbol!(LogicalAnd) + } + '|' if self.peek_next_char()? == Some('|') => { + self.next_char()?; + symbol!(LogicalOr) + } + + _ => Err(TokenizerError::UnknownSymbolError( + first_symbol, + self.line, + self.column, + )), + } + } + + /// Tokenizes a number literal. Also handles temperatures with a suffix of `c`, `f`, or `k`. + fn tokenize_number(&mut self, first_char: char) -> Result { + let mut primary = String::with_capacity(16); + let mut decimal: Option = None; + let mut reading_decimal = false; + + let column = self.column.clone(); + let line = self.line.clone(); + + primary.push(first_char); + + while let Some(next_char) = self.peek_next_char()? 
{ + if next_char.is_whitespace() { + break; + } + + if next_char == '.' { + reading_decimal = true; + self.next_char()?; + continue; + } + + // support underscores in numbers for readability + if next_char == '_' { + self.next_char()?; + continue; + } + + // This is for the times when we have a number followed by a symbol (like a semicolon or =) + if !next_char.is_numeric() { + break; + } + + if reading_decimal { + decimal.get_or_insert_with(String::new).push(next_char); + } else { + primary.push(next_char); + } + self.next_char()?; + } + + let number: Number = if let Some(decimal) = decimal { + let decimal_scale = decimal.len() as u32; + let number = format!("{}{}", primary, decimal) + .parse::() + .map_err(|e| TokenizerError::NumberParseError(e, self.line, self.column))?; + Number::Decimal( + Decimal::try_from_i128_with_scale(number, decimal_scale) + .map_err(|e| TokenizerError::DecimalParseError(e, line, column))?, + ) + } else { + Number::Integer( + primary + .parse() + .map_err(|e| TokenizerError::NumberParseError(e, line, column))?, + ) + }; + + // check if the next char is a temperature suffix + if let Some(next_char) = self.peek_next_char()? { + let temperature = match next_char { + 'c' => Temperature::Celsius(number), + 'f' => Temperature::Fahrenheit(number), + 'k' => Temperature::Kelvin(number), + _ => return Ok(Token::new(TokenType::Number(number), line, column)), + } + .to_kelvin(); + + self.next_char()?; + Ok(Token::new(TokenType::Number(temperature), line, column)) + } else { + Ok(Token::new(TokenType::Number(number), line, column)) + } + } + + /// Tokenizes a string literal + fn tokenize_string(&mut self, beginning_quote: char) -> Result { + let mut buffer = String::with_capacity(16); + + let column = self.column.clone(); + let line = self.line.clone(); + + while let Some(next_char) = self.next_char()? 
{ + if next_char == beginning_quote { + break; + } + + buffer.push(next_char); + } + + Ok(Token::new(TokenType::String(buffer), line, column)) + } + + /// Tokenizes a keyword or an identifier. Also handles boolean literals + fn tokenize_keyword_or_identifier( + &mut self, + first_char: char, + ) -> Result { + macro_rules! keyword { + ($keyword:ident) => {{ + return Ok(Token::new( + TokenType::Keyword(Keyword::$keyword), + self.line, + self.column, + )); + }}; + } + + /// Helper macro to check if the next character is whitespace or not alphanumeric + macro_rules! next_ws { + () => { + matches!(self.peek_next_char()?, Some(x) if x.is_whitespace() || !x.is_alphanumeric()) || matches!(self.peek_next_char()?, None) + }; + } + + let mut buffer = String::with_capacity(16); + let line = self.line.clone(); + let column = self.column.clone(); + + let mut looped_char = Some(first_char); + + while let Some(next_char) = looped_char { + if next_char.is_whitespace() { + break; + } + + if !next_char.is_alphanumeric() { + break; + } + buffer.push(next_char); + + match buffer.as_str() { + "let" if next_ws!() => keyword!(Let), + "fn" if next_ws!() => keyword!(Fn), + "if" if next_ws!() => keyword!(If), + "else" if next_ws!() => keyword!(Else), + "return" if next_ws!() => keyword!(Return), + "enum" if next_ws!() => keyword!(Enum), + "device" if next_ws!() => keyword!(Device), + "loop" if next_ws!() => keyword!(Loop), + "break" if next_ws!() => keyword!(Break), + + // boolean literals + "true" if next_ws!() => { + return Ok(Token::new(TokenType::Boolean(true), self.line, self.column)); + } + "false" if next_ws!() => { + return Ok(Token::new( + TokenType::Boolean(false), + self.line, + self.column, + )); + } + // if the next character is whitespace or not alphanumeric, then we have an identifier + // this is because keywords are checked first + val if next_ws!() => { + return Ok(Token::new( + TokenType::Identifier(val.to_string()), + line, + column, + )); + } + _ => {} + } + + 
looped_char = self.next_char()?; + } + Err(TokenizerError::UnknownKeywordOrIdentifierError( + buffer, line, column, + )) + } +} + +pub struct TokenizerBuffer { + tokenizer: Tokenizer, + buffer: VecDeque, + history: VecDeque, +} + +impl TokenizerBuffer { + pub fn new(tokenizer: Tokenizer) -> Self { + Self { + tokenizer, + buffer: VecDeque::new(), + history: VecDeque::with_capacity(128), + } + } + + /// Reads the next token from the tokenizer, pushing the value to the back of the history + /// and returning the token + pub fn next(&mut self) -> Result, TokenizerError> { + if let Some(token) = self.buffer.pop_front() { + self.history.push_back(token.clone()); + return Ok(Some(token)); + } + + let token = self.tokenizer.next_token()?; + if let Some(ref token) = token { + self.history.push_back(token.clone()); + } + Ok(token) + } + + /// Peeks the next token in the stream without adding to the history stack + pub fn peek(&mut self) -> Result, TokenizerError> { + if let Some(token) = self.buffer.front() { + return Ok(Some(token.clone())); + } + + let token = self.tokenizer.peek_next()?; + Ok(token) + } + + fn seek_from_current(&mut self, seek_to: i64) -> Result<(), TokenizerError> { + use Ordering::*; + // if seek_to > 0 then we need to check if the buffer has enough tokens to pop, otherwise we need to read from the tokenizer + // if seek_to < 0 then we need to pop from the history and push to the front of the buffer. If not enough, then we throw (we reached the front of the history) + // if seek_to == 0 then we don't need to do anything + + match seek_to.cmp(&0) { + Greater => { + let mut tokens = Vec::with_capacity(seek_to as usize); + for _ in 0..seek_to { + if let Some(token) = self.tokenizer.next_token()? 
{ + tokens.push(token); + } else { + return Err(TokenizerError::IOError(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "Unexpected EOF", + ))); + } + } + self.history.extend(tokens); + } + Less => { + let seek_to = seek_to.unsigned_abs() as usize; + let mut tokens = Vec::with_capacity(seek_to); + for _ in 0..seek_to { + if let Some(token) = self.history.pop_back() { + tokens.push(token); + } else { + return Err(TokenizerError::IOError(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "Unexpected EOF", + ))); + } + } + self.buffer.extend(tokens.into_iter().rev()); + } + _ => {} + } + + Ok(()) + } + + /// Adds to or removes from the History stack, allowing the user to move back and forth in the stream + pub fn seek(&mut self, from: SeekFrom) -> Result<(), TokenizerError> { + match from { + SeekFrom::Current(seek_to) => self.seek_from_current(seek_to)?, + SeekFrom::End(_) => unimplemented!("SeekFrom::End will not be implemented"), + SeekFrom::Start(_) => unimplemented!("SeekFrom::Start will not be implemented"), + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use anyhow::Result; + use rust_decimal::Decimal; + + const TEST_FILE: &str = "tests/file.stlg"; + + const TEST_STRING: &str = r#" + fn test() { + let x = 10; + return x + 2; + } + "#; + + #[test] + fn test_seek_from_current() -> Result<()> { + let tokenizer = Tokenizer::from(TEST_STRING.to_owned()); + let mut buffer = TokenizerBuffer::new(tokenizer); + + let token = buffer.next()?.unwrap(); + assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn)); + + buffer.seek(SeekFrom::Current(1))?; + + let token = buffer.next()?.unwrap(); + + assert_eq!(token.token_type, TokenType::Symbol(Symbol::LParen)); + + Ok(()) + } + + #[test] + fn test_tokenizer_from_path_ok() { + let tokenizer = Tokenizer::from_path(TEST_FILE); + assert!(tokenizer.is_ok()); + } + + #[test] + fn test_tokenizer_from_path_err() { + let tokenizer = Tokenizer::from_path("non_existent_file.stlg"); + 
assert!(tokenizer.is_err()); + } + + #[test] + fn test_next_char() -> Result<()> { + let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); + + let char = tokenizer.next_char()?; + + assert_eq!(char, Some('\n')); + assert_eq!(tokenizer.line, 2); + assert_eq!(tokenizer.column, 1); + + let mut tokenizer = Tokenizer::from(String::from("fn")); + + let char = tokenizer.next_char()?; + + assert_eq!(char, Some('f')); + assert_eq!(tokenizer.line, 1); + assert_eq!(tokenizer.column, 2); + + Ok(()) + } + + #[test] + fn test_peek_next_char() -> Result<()> { + let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); + + let char = tokenizer.peek_next_char()?; + + assert_eq!(char, Some('\n')); + assert_eq!(tokenizer.line, 1); + assert_eq!(tokenizer.column, 1); + + let char = tokenizer.next_char()?; + assert_eq!(char, Some('\n')); + assert_eq!(tokenizer.line, 2); + assert_eq!(tokenizer.column, 1); + + let char = tokenizer.peek_next_char()?; + assert_eq!(char, Some(' ')); + assert_eq!(tokenizer.line, 2); + assert_eq!(tokenizer.column, 1); + + Ok(()) + } + + #[test] + fn test_temperature_unit() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("10c 14f 10k")); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!( + token.token_type, + TokenType::Number(Number::Decimal(Decimal::new(28315, 2))) + ); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!( + token.token_type, + TokenType::Number(Number::Decimal(Decimal::new(26315, 2))) + ); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!(token.token_type, TokenType::Number(Number::Integer(10))); + + Ok(()) + } + + #[test] + fn test_parse_integer() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("10")); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!(token.token_type, TokenType::Number(Number::Integer(10))); + + Ok(()) + } + + #[test] + fn test_parse_integer_with_underscore() -> Result<()> { + let mut tokenizer = 
Tokenizer::from(String::from("1_000")); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!(token.token_type, TokenType::Number(Number::Integer(1000))); + + Ok(()) + } + + #[test] + fn test_parse_decimal() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("10.5")); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!( + token.token_type, + TokenType::Number(Number::Decimal(Decimal::new(105, 1))) // 10.5 + ); + + Ok(()) + } + + #[test] + fn test_parse_decimal_with_underscore() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("1_000.000_6")); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!( + token.token_type, + TokenType::Number(Number::Decimal(Decimal::new(10000006, 4))) // 1000.0006 + ); + + Ok(()) + } + + #[test] + fn test_parse_number_with_symbol() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("10;")); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!(token.token_type, TokenType::Number(Number::Integer(10))); + + let next_char = tokenizer.next_char()?; + + assert_eq!(next_char, Some(';')); + + Ok(()) + } + + #[test] + fn test_string_parse() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from(r#""Hello, World!""#)); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!( + token.token_type, + TokenType::String(String::from("Hello, World!")) + ); + + let mut tokenizer = Tokenizer::from(String::from(r#"'Hello, World!'"#)); + + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!( + token.token_type, + TokenType::String(String::from("Hello, World!")) + ); + + Ok(()) + } + + #[test] + fn test_symbol_parse() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from( + "^ ! () [] {} , . 
; : + - * / < > = != && || >= <=**", + )); + + let expected_tokens = vec![ + TokenType::Symbol(Symbol::Caret), + TokenType::Symbol(Symbol::LogicalNot), + TokenType::Symbol(Symbol::LParen), + TokenType::Symbol(Symbol::RParen), + TokenType::Symbol(Symbol::LBracket), + TokenType::Symbol(Symbol::RBracket), + TokenType::Symbol(Symbol::LBrace), + TokenType::Symbol(Symbol::RBrace), + TokenType::Symbol(Symbol::Comma), + TokenType::Symbol(Symbol::Dot), + TokenType::Symbol(Symbol::Semicolon), + TokenType::Symbol(Symbol::Colon), + TokenType::Symbol(Symbol::Plus), + TokenType::Symbol(Symbol::Minus), + TokenType::Symbol(Symbol::Asterisk), + TokenType::Symbol(Symbol::Slash), + TokenType::Symbol(Symbol::LessThan), + TokenType::Symbol(Symbol::GreaterThan), + TokenType::Symbol(Symbol::Assign), + TokenType::Symbol(Symbol::NotEqual), + TokenType::Symbol(Symbol::LogicalAnd), + TokenType::Symbol(Symbol::LogicalOr), + TokenType::Symbol(Symbol::GreaterThanOrEqual), + TokenType::Symbol(Symbol::LessThanOrEqual), + TokenType::Symbol(Symbol::Exp), + ]; + + for expected_token in expected_tokens { + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!(token.token_type, expected_token); + } + + Ok(()) + } + + #[test] + fn test_keyword_parse() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("let fn if else return enum")); + + let expected_tokens = vec![ + TokenType::Keyword(Keyword::Let), + TokenType::Keyword(Keyword::Fn), + TokenType::Keyword(Keyword::If), + TokenType::Keyword(Keyword::Else), + TokenType::Keyword(Keyword::Return), + TokenType::Keyword(Keyword::Enum), + ]; + + for expected_token in expected_tokens { + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!(token.token_type, expected_token); + } + + Ok(()) + } + + #[test] + fn test_identifier_parse() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("fn test")); + + let token = tokenizer.next_token()?.unwrap(); + assert_eq!(token.token_type, TokenType::Keyword(Keyword::Fn)); + let 
token = tokenizer.next_token()?.unwrap(); + assert_eq!( + token.token_type, + TokenType::Identifier(String::from("test")) + ); + + Ok(()) + } + + #[test] + fn test_boolean_parse() -> Result<()> { + let mut tokenizer = Tokenizer::from(String::from("true false")); + + let token = tokenizer.next_token()?.unwrap(); + assert_eq!(token.token_type, TokenType::Boolean(true)); + let token = tokenizer.next_token()?.unwrap(); + assert_eq!(token.token_type, TokenType::Boolean(false)); + + Ok(()) + } + + #[test] + fn test_full_source() -> Result<()> { + let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); + + let expected_tokens = vec![ + TokenType::Keyword(Keyword::Fn), + TokenType::Identifier(String::from("test")), + TokenType::Symbol(Symbol::LParen), + TokenType::Symbol(Symbol::RParen), + TokenType::Symbol(Symbol::LBrace), + TokenType::Keyword(Keyword::Let), + TokenType::Identifier(String::from("x")), + TokenType::Symbol(Symbol::Assign), + TokenType::Number(Number::Integer(10)), + TokenType::Symbol(Symbol::Semicolon), + TokenType::Keyword(Keyword::Return), + TokenType::Identifier(String::from("x")), + TokenType::Symbol(Symbol::Plus), + TokenType::Number(Number::Integer(2)), + TokenType::Symbol(Symbol::Semicolon), + TokenType::Symbol(Symbol::RBrace), + ]; + + for expected_token in expected_tokens { + let token = tokenizer.next_token()?.unwrap(); + + assert_eq!(token.token_type, expected_token); + } + + Ok(()) + } + + #[test] + fn test_peek_next() -> Result<()> { + let mut tokenizer = Tokenizer::from(TEST_STRING.to_owned()); + + let column = tokenizer.column.clone(); + let line = tokenizer.line.clone(); + + let peeked_token = tokenizer.peek_next()?; + + assert_eq!( + peeked_token.unwrap().token_type, + TokenType::Keyword(Keyword::Fn) + ); + assert_eq!(tokenizer.column, column); + assert_eq!(tokenizer.line, line); + + let next_token = tokenizer.next_token()?; + + assert_eq!( + next_token.unwrap().token_type, + TokenType::Keyword(Keyword::Fn) + ); + 
assert_ne!(tokenizer.column, column); + assert_ne!(tokenizer.line, line); + + Ok(()) + } +} + diff --git a/libs/tokenizer/src/token.rs b/libs/tokenizer/src/token.rs new file mode 100644 index 0000000..5e1c970 --- /dev/null +++ b/libs/tokenizer/src/token.rs @@ -0,0 +1,221 @@ +use rust_decimal::Decimal; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Token { + /// The type of the token + pub token_type: TokenType, + /// The line where the token was found + pub line: usize, + /// The column where the token was found + pub column: usize, +} + +impl Token { + pub fn new(token_type: TokenType, line: usize, column: usize) -> Self { + Self { + token_type, + line, + column, + } + } +} + +#[derive(Debug, PartialEq, Hash, Eq, Clone)] +pub enum Temperature { + Celsius(Number), + Fahrenheit(Number), + Kelvin(Number), +} + +impl std::fmt::Display for Temperature { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Temperature::Celsius(n) => write!(f, "{}°C", n), + Temperature::Fahrenheit(n) => write!(f, "{}°F", n), + Temperature::Kelvin(n) => write!(f, "{}K", n), + } + } +} + +impl Temperature { + pub fn to_kelvin(self) -> Number { + match self { + Temperature::Celsius(n) => { + let n = match n { + Number::Integer(i) => Decimal::new(i as i64, 0), + Number::Decimal(d) => d, + }; + Number::Decimal(n + Decimal::new(27315, 2)) + } + Temperature::Fahrenheit(n) => { + let n = match n { + Number::Integer(i) => Decimal::new(i as i64, 0), + Number::Decimal(d) => d, + }; + + let a = n - Decimal::new(32, 0); + let b = Decimal::new(5, 0) / Decimal::new(9, 0); + Number::Decimal(a * b + Decimal::new(27315, 2)) + } + Temperature::Kelvin(n) => n, + } + } +} + +#[derive(Debug, PartialEq, Hash, Eq, Clone)] +pub enum TokenType { + /// Represents a string token + String(String), + /// Represents a number token + Number(Number), + /// Represents a boolean token + Boolean(bool), + /// Represents a keyword token + Keyword(Keyword), + /// Represents an 
identifier token + Identifier(String), + /// Represents a symbol token + Symbol(Symbol), + /// Represents an end of file token + EOF, +} + +impl std::fmt::Display for TokenType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + TokenType::String(s) => write!(f, "{}", s), + TokenType::Number(n) => write!(f, "{}", n), + TokenType::Boolean(b) => write!(f, "{}", b), + TokenType::Keyword(k) => write!(f, "{:?}", k), + TokenType::Identifier(i) => write!(f, "{}", i), + TokenType::Symbol(s) => write!(f, "{:?}", s), + TokenType::EOF => write!(f, "EOF"), + } + } +} + +#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)] +pub enum Number { + /// Represents an integer number + Integer(u128), + /// Represents a fixed-point decimal number backed by `rust_decimal::Decimal` (96-bit integer mantissa, scale of 0 to 28) + Decimal(Decimal), +} + +impl std::fmt::Display for Number { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Number::Integer(i) => write!(f, "{}", i), + Number::Decimal(d) => write!(f, "{}", d.to_string()), + } + } +} + +#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)] +pub enum Symbol { + // Single Character Symbols + /// Represents the `(` symbol + LParen, + /// Represents the `)` symbol + RParen, + /// Represents the `{` symbol + LBrace, + /// Represents the `}` symbol + RBrace, + /// Represents the `[` symbol + LBracket, + /// Represents the `]` symbol + RBracket, + /// Represents the `;` symbol + Semicolon, + /// Represents the `:` symbol + Colon, + /// Represents the `,` symbol + Comma, + /// Represents the `+` symbol + Plus, + /// Represents the `-` symbol + Minus, + /// Represents the `*` symbol + Asterisk, + /// Represents the `/` symbol + Slash, + /// Represents the `<` symbol + LessThan, + /// Represents the `>` symbol + GreaterThan, + /// Represents the `=` symbol + Assign, + /// Represents the `!` symbol + LogicalNot, + /// Represents the `.` symbol + Dot, + /// Represents the `^` symbol + Caret, + + // Double Character Symbols + ///
Represents the `==` symbol + Equal, + /// Represents the `!=` symbol + NotEqual, + /// Represents the `&&` symbol + LogicalAnd, + /// Represents the `||` symbol + LogicalOr, + /// Represents the `<=` symbol + LessThanOrEqual, + /// Represents the `>=` symbol + GreaterThanOrEqual, + /// Represents the `**` symbol + Exp, +} + +impl Symbol { + pub fn is_operator(&self) -> bool { + match self { + Symbol::Plus | Symbol::Minus | Symbol::Asterisk | Symbol::Slash | Symbol::Exp => true, + _ => false, + } + } + + pub fn is_comparison(&self) -> bool { + match self { + Symbol::LessThan + | Symbol::GreaterThan + | Symbol::Equal + | Symbol::NotEqual + | Symbol::LessThanOrEqual + | Symbol::GreaterThanOrEqual => true, + _ => false, + } + } + + pub fn is_logical(&self) -> bool { + match self { + Symbol::LogicalAnd | Symbol::LogicalOr => true, + _ => false, + } + } +} + +#[derive(Debug, PartialEq, Hash, Eq, Clone, Copy)] +pub enum Keyword { + /// Represents the `let` keyword + Let, + /// Represents the `fn` keyword + Fn, + /// Represents the `if` keyword + If, + /// Represents the `device` keyword. Useful for defining a device at a specific address (ex. d0, d1, d2, etc.)
+ Device, + /// Represents the `else` keyword + Else, + /// Represents the `return` keyword + Return, + /// Represents the `enum` keyword + Enum, + /// Represents the `loop` keyword + Loop, + /// Represents the `break` keyword + Break, +} diff --git a/tests/file.stlg b/libs/tokenizer/tests/file.stlg similarity index 96% rename from tests/file.stlg rename to libs/tokenizer/tests/file.stlg index 1532af2..dd186bf 100644 --- a/tests/file.stlg +++ b/libs/tokenizer/tests/file.stlg @@ -6,4 +6,4 @@ let roomTemperatureMin = 20c; let roomTemperatureMax = 30c; -let averageTemperature = (roomTemperatureMax + roomTemperatureMin) / 2; \ No newline at end of file +let averageTemperature = (roomTemperatureMax + roomTemperatureMin) / 2; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 643dd6a..a2578e5 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -1,6 +1,5 @@ pub mod token; -use crate::boxed; use rust_decimal::Decimal; use std::{ cmp::Ordering, @@ -50,7 +49,7 @@ pub(crate) struct Tokenizer { impl Tokenizer { pub fn from_path(input_file: impl Into) -> Result { let file = std::fs::File::open(input_file.into())?; - let reader = BufReader::new(boxed!(file) as Box); + let reader = BufReader::new(Box::new(file) as Box); Ok(Self { reader, @@ -64,7 +63,7 @@ impl Tokenizer { impl From for Tokenizer { fn from(input: String) -> Self { - let reader = BufReader::new(boxed!(Cursor::new(input)) as Box); + let reader = BufReader::new(Box::new(Cursor::new(input)) as Box); Self { reader,