Merge pull request #7 from dbidwell94/tokenization-refactor

Tokenization refactor
Committed by GitHub on 2025-11-29 15:36:21 -07:00
8 changed files with 409 additions and 297 deletions

View File

@@ -39,6 +39,7 @@ echo "--------------------"
 RUST_WIN_EXE="$RUST_DIR/target/x86_64-pc-windows-gnu/release/slang.exe"
 RUST_LINUX_BIN="$RUST_DIR/target/x86_64-unknown-linux-gnu/release/slang"
 CHARP_DLL="$CSHARP_DIR/bin/Release/net46/StationeersSlang.dll"
+CHARP_PDB="$CSHARP_DIR/bin/Release/net46/StationeersSlang.pdb"

 # Check if the release dir exists, if not: create it.
 if [[ ! -d "$RELEASE_DIR" ]]; then
@@ -48,3 +49,4 @@ fi
 cp "$RUST_WIN_EXE" "$RELEASE_DIR/slang.exe"
 cp "$RUST_LINUX_BIN" "$RELEASE_DIR/slang"
 cp "$CHARP_DLL" "$RELEASE_DIR/StationeersSlang.dll"
+cp "$CHARP_PDB" "$RELEASE_DIR/StationeersSlang.pdb"

View File

@@ -1,66 +1,103 @@
-using System;
-using System.Text;
-using StationeersIC10Editor;
-
-namespace Slang
-{
-    public static unsafe class SlangExtensions
-    {
-        /**
-         * <summary>
-         * This is a helper method to convert a Rust struct for a string pointer
-         * into a C# style string.
-         * </summary>
-         */
-        public static string AsString(this Vec_uint8_t vec)
-        {
-            if (vec.ptr == null || vec.len == UIntPtr.Zero)
-            {
-                return string.Empty;
-            }
-
-            // Rust strings are UTF-8. Read bytes from raw pointer.
-            var toReturn = Encoding.UTF8.GetString(vec.ptr, (int)vec.len);
-
-            return toReturn;
-        }
-
-        /**
-         * <summary>This will free a Rust string struct. Because this is a pointer to a struct, this memory
-         * is managed by Rust, therefor it must be freed by Rust
-         * </summary>
-         */
-        public static void Drop(this Vec_uint8_t vec)
-        {
-            Ffi.free_string(vec);
-        }
-
-        /**
-         * <summary>This helper converts a Rust vec to a C# List. This handles freeing the
-         * Rust allocation after the List is created, there is no need to Drop this memory.
-         * </summary>
-         */
-        public static Line AsList(this Vec_FfiToken_t vec)
-        {
-            var list = new Line();
-            list.Capacity = (int)vec.len;
-
-            var currentPtr = vec.ptr;
-
-            // Iterate through the raw memory array
-            for (int i = 0; i < (int)vec.len; i++)
-            {
-                // Dereference pointer to get the struct at index i
-                FfiToken_t token = currentPtr[i];
-
-                var newToken = new Token(token.text.AsString(), token.column);
-
-                list.Add(newToken);
-            }
-            Ffi.free_ffi_token_vec(vec);
-            return list;
-        }
-    }
-}
+namespace Slang;
+
+using System;
+using System.Text;
+using StationeersIC10Editor;
+
+public static unsafe class SlangExtensions
+{
+    /**
+     * <summary>
+     * This is a helper method to convert a Rust struct for a string pointer
+     * into a C# style string.
+     * </summary>
+     */
+    public static string AsString(this Vec_uint8_t vec)
+    {
+        if (vec.ptr == null || vec.len == UIntPtr.Zero)
+            return string.Empty;
+
+        // Rust strings are UTF-8. Read bytes from raw pointer.
+        var toReturn = Encoding.UTF8.GetString(vec.ptr, (int)vec.len);
+
+        return toReturn;
+    }
+
+    /**
+     * <summary>This will free a Rust string struct. Because this is a pointer to a struct, this memory
+     * is managed by Rust, therefor it must be freed by Rust
+     * </summary>
+     */
+    public static void Drop(this Vec_uint8_t vec)
+    {
+        Ffi.free_string(vec);
+    }
+
+    /**
+     * <summary>This helper converts a Rust vec to a C# List. This handles freeing the
+     * Rust allocation after the List is created, there is no need to Drop this memory.
+     * </summary>
+     */
+    public static Line ToLine(this Vec_FfiToken_t vec, string sourceText)
+    {
+        var list = new Line(sourceText);
+        var currentPtr = vec.ptr;
+
+        // Iterate through the raw memory array
+        for (int i = 0; i < (int)vec.len; i++)
+        {
+            var token = currentPtr[i];
+            var color = GetColorForKind(token.token_kind);
+
+            int colIndex = token.column;
+            if (colIndex < 0)
+                colIndex = 0;
+
+            var semanticToken = new SemanticToken(
+                0,
+                colIndex,
+                token.length,
+                color,
+                token.token_kind
+            );
+
+            string errMsg = token.error.AsString();
+            if (!string.IsNullOrEmpty(errMsg))
+            {
+                semanticToken.IsError = true;
+                semanticToken.Data = errMsg;
+                semanticToken.Color = ICodeFormatter.ColorError;
+            }
+
+            list.AddToken(semanticToken);
+        }
+        Ffi.free_ffi_token_vec(vec);
+
+        return list;
+    }
+
+    private static uint GetColorForKind(uint kind)
+    {
+        switch (kind)
+        {
+            case 1:
+                return SlangFormatter.ColorInstruction; // Keyword
+            case 2:
+                return SlangFormatter.ColorDefault; // Identifier
+            case 3:
+                return SlangFormatter.ColorNumber; // Number
+            case 4:
+                return SlangFormatter.ColorString; // String
+            case 5:
+                return SlangFormatter.ColorInstruction; // Boolean
+            case 6:
+                return SlangFormatter.ColorDefault; // Symbol
+            default:
+                return SlangFormatter.ColorDefault;
+        }
+    }
+}
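The extension methods encode a strict ownership contract: anything allocated by Rust (the string bytes, the token array) must be handed back across the boundary so Rust can free it. As a hedged sketch of what the matching exports look like on the Rust side (the real signatures live in the crate and may differ), safer-ffi makes each one a move-and-drop:

use safer_ffi::prelude::*;

// Hypothetical mirror of Ffi.free_string: moving the Vec back into Rust
// lets Rust's Drop impl release the allocation that C# only borrowed.
#[ffi_export]
fn free_string(vec: safer_ffi::Vec<u8>) {
    drop(vec);
}

// Hypothetical mirror of Ffi.free_ffi_token_vec: frees the token array and,
// transitively, every tooltip/error string each FfiToken owns.
#[ffi_export]
fn free_ffi_token_vec(vec: safer_ffi::Vec<FfiToken>) {
    drop(vec);
}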

View File

@@ -83,17 +83,17 @@ public unsafe partial class Ffi {
     slice_ref_uint16_t input);
 }

-[StructLayout(LayoutKind.Sequential, Size = 104)]
+[StructLayout(LayoutKind.Sequential, Size = 64)]
 public unsafe struct FfiToken_t {
-    public Vec_uint8_t text;
     public Vec_uint8_t tooltip;
     public Vec_uint8_t error;
-    public Vec_uint8_t status;
     public Int32 column;
+    public Int32 length;
+    public UInt32 token_kind;
 }

 /// <summary>
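The two Size values check out arithmetically if safer-ffi's Vec_uint8_t marshals as three pointer-sized fields (ptr/len/cap, 24 bytes on x86_64): the old struct was 4 × 24 + 4 = 100, padded to 104, and the new one is 2 × 24 + 3 × 4 = 60, padded to 64. A standalone Rust sketch of that layout math (VecRepr and the struct names are illustrative stand-ins, not the generated bindings):

// Stand-in for safer-ffi's C layout of Vec<u8> on a 64-bit target.
#[repr(C)]
struct VecRepr {
    ptr: *mut u8,
    len: usize,
    cap: usize,
}

#[repr(C)]
struct OldFfiToken {
    text: VecRepr,
    tooltip: VecRepr,
    error: VecRepr,
    status: VecRepr,
    column: i32,
}

#[repr(C)]
struct NewFfiToken {
    tooltip: VecRepr,
    error: VecRepr,
    column: i32,
    length: i32,
    token_kind: u32,
}

// 4 * 24 + 4 = 100, padded to 104; 2 * 24 + 12 = 60, padded to 64.
const _: () = assert!(std::mem::size_of::<OldFfiToken>() == 104);
const _: () = assert!(std::mem::size_of::<NewFfiToken>() == 64);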

View File

@@ -1,12 +1,24 @@
-using StationeersIC10Editor;
-
-namespace Slang
-{
-    public class SlangFormatter : ICodeFormatter
-    {
-        public override Line ParseLine(string line)
-        {
-            return Marshal.TokenizeLine(line);
-        }
-    }
-}
+namespace Slang;
+
+using StationeersIC10Editor;
+
+public class SlangFormatter : ICodeFormatter
+{
+    public static readonly uint ColorInstruction = ColorFromHTML("#ffff00");
+    public static readonly uint ColorString = ColorFromHTML("#ce9178");
+
+    public override Line ParseLine(string line)
+    {
+        return Marshal.TokenizeLine(line);
+    }
+
+    public override string Compile()
+    {
+        if (Marshal.CompileFromString(this.Lines.RawText, out string compiled))
+        {
+            return compiled;
+        }
+
+        return string.Empty;
+    }
+}

View File

@@ -1,59 +1,157 @@
-using System;
-using StationeersIC10Editor;
-
-namespace Slang
-{
-    public static class Marshal
-    {
-        public static unsafe Line TokenizeLine(string source)
-        {
-            if (String.IsNullOrEmpty(source))
-            {
-                return new Line();
-            }
-
-            fixed (char* ptrString = source)
-            {
-                var input = new slice_ref_uint16_t
-                {
-                    ptr = (ushort*)ptrString,
-                    len = (UIntPtr)source.Length,
-                };
-                return Ffi.tokenize_line(input).AsList();
-            }
-        }
-
-        public static unsafe bool CompileFromString(string inputString, out string compiledString)
-        {
-            if (String.IsNullOrEmpty(inputString))
-            {
-                compiledString = String.Empty;
-                return false;
-            }
-
-            fixed (char* ptrString = inputString)
-            {
-                var input = new slice_ref_uint16_t
-                {
-                    ptr = (ushort*)ptrString,
-                    len = (UIntPtr)inputString.Length,
-                };
-                var result = Ffi.compile_from_string(input);
-                try
-                {
-                    if ((ulong)result.len < 1)
-                    {
-                        compiledString = String.Empty;
-                        return false;
-                    }
-                    compiledString = result.AsString();
-                    return true;
-                }
-                finally
-                {
-                    result.Drop();
-                }
-            }
-        }
-    }
-}
+namespace Slang;
+
+using System;
+using System.IO;
+using System.Reflection;
+using System.Runtime.InteropServices;
+using StationeersIC10Editor;
+
+public static class Marshal
+{
+    private static IntPtr _libraryHandle = IntPtr.Zero;
+
+    [DllImport("kernel32", SetLastError = true, CharSet = CharSet.Ansi)]
+    private static extern IntPtr LoadLibrary([MarshalAs(UnmanagedType.LPStr)] string lpFileName);
+
+    [DllImport("kernel32", SetLastError = true)]
+    private static extern bool FreeLibrary(IntPtr hModule);
+
+    private static bool EnsureLibLoaded()
+    {
+        if (_libraryHandle != IntPtr.Zero)
+        {
+            return true;
+        }
+
+        try
+        {
+            _libraryHandle = LoadLibrary(ExtractNativeLibrary(Ffi.RustLib));
+            CodeFormatters.RegisterFormatter("Slang", typeof(SlangFormatter), true);
+            return true;
+        }
+        catch (Exception ex)
+        {
+            L.Error($"Failed to init slang compiler: {ex.Message}");
+            return false;
+        }
+    }
+
+    public static bool Init()
+    {
+        return EnsureLibLoaded();
+    }
+
+    public static bool Destroy()
+    {
+        if (_libraryHandle == IntPtr.Zero)
+        {
+            return true;
+        }
+
+        try
+        {
+            FreeLibrary(_libraryHandle);
+            _libraryHandle = IntPtr.Zero;
+            return true;
+        }
+        catch (Exception ex)
+        {
+            L.Warning($"Unable to free handle to slang compiler's dll. {ex.Message}");
+            return false;
+        }
+    }
+
+    public static unsafe Line TokenizeLine(string source)
+    {
+        if (String.IsNullOrEmpty(source))
+        {
+            return new Line(source);
+        }
+
+        if (!EnsureLibLoaded())
+        {
+            return new Line(source);
+        }
+
+        fixed (char* ptrString = source)
+        {
+            var input = new slice_ref_uint16_t
+            {
+                ptr = (ushort*)ptrString,
+                len = (UIntPtr)source.Length,
+            };
+            return Ffi.tokenize_line(input).ToLine(source);
+        }
+    }
+
+    public static unsafe bool CompileFromString(string inputString, out string compiledString)
+    {
+        if (String.IsNullOrEmpty(inputString))
+        {
+            compiledString = String.Empty;
+            return false;
+        }
+
+        if (!EnsureLibLoaded())
+        {
+            compiledString = String.Empty;
+            return false;
+        }
+
+        fixed (char* ptrString = inputString)
+        {
+            var input = new slice_ref_uint16_t
+            {
+                ptr = (ushort*)ptrString,
+                len = (UIntPtr)inputString.Length,
+            };
+            var result = Ffi.compile_from_string(input);
+            try
+            {
+                if ((ulong)result.len < 1)
+                {
+                    compiledString = String.Empty;
+                    return false;
+                }
+                compiledString = result.AsString();
+                return true;
+            }
+            finally
+            {
+                result.Drop();
+            }
+        }
+    }
+
+    private static string ExtractNativeLibrary(string libName)
+    {
+        string destinationPath = Path.Combine(Path.GetTempPath(), libName);
+        Assembly assembly = Assembly.GetExecutingAssembly();
+
+        using (Stream stream = assembly.GetManifestResourceStream(libName))
+        {
+            if (stream == null)
+            {
+                L.Error(
+                    $"{libName} not found. This means it was not embedded in the mod. Please contact the mod author!"
+                );
+                return "";
+            }
+
+            try
+            {
+                using (FileStream fileStream = new FileStream(destinationPath, FileMode.Create))
+                {
+                    stream.CopyTo(fileStream);
+                }
+                return destinationPath;
+            }
+            catch (IOException e)
+            {
+                L.Warning($"Could not overwrite {libName} (it might be in use): {e.Message}");
+                return "";
+            }
+        }
+    }
+}
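Marshal now owns the whole native-library lifecycle: extract the embedded DLL to the temp directory, LoadLibrary it once, FreeLibrary it on plugin teardown, and gate every FFI entry point behind EnsureLibLoaded(). For comparison, the same extract-then-load dance written from the Rust side would be a few lines with the libloading crate (a hedged sketch for illustration only, not part of this PR):

use std::{error::Error, fs, path::PathBuf};

// Hypothetical equivalent of ExtractNativeLibrary + LoadLibrary: write the
// embedded bytes to a temp file, then load that file as a dynamic library.
fn extract_and_load(name: &str, embedded: &[u8]) -> Result<libloading::Library, Box<dyn Error>> {
    let dest: PathBuf = std::env::temp_dir().join(name);
    fs::write(&dest, embedded)?; // overwrites a stale copy, like FileMode.Create
    // Safety: loading a library runs its initializers; the caller must trust them.
    let lib = unsafe { libloading::Library::new(&dest)? };
    Ok(lib)
}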

View File

@@ -1,9 +1,6 @@
-using System.IO;
-using System.Reflection;
 using System.Text.RegularExpressions;
 using BepInEx;
 using HarmonyLib;
-using StationeersIC10Editor;

 namespace Slang
 {
@@ -44,6 +41,8 @@ namespace Slang
         public const string PluginGuid = "com.biddydev.slang";
         public const string PluginName = "Slang";

+        private Harmony? _harmony;
+
         private static Regex? _slangSourceCheck = null;
         private static Regex SlangSourceCheck
@@ -89,44 +88,28 @@ namespace Slang
         private void Awake()
         {
             L.SetLogger(Logger);
+            this._harmony = new Harmony(PluginGuid);

-            L.Info("slang loaded");
-
-            if (ExtractNativeDll(Ffi.RustLib))
-            {
-                var harmony = new Harmony(PluginGuid);
-                harmony.PatchAll();
-                CodeFormatters.RegisterFormatter("slang", () => new SlangFormatter(), true);
-            }
-        }
-
-        private bool ExtractNativeDll(string fileName)
-        {
-            string destinationPath = Path.Combine(Path.GetDirectoryName(Info.Location), fileName);
-            Assembly assembly = Assembly.GetExecutingAssembly();
-
-            using (Stream stream = assembly.GetManifestResourceStream(fileName))
-            {
-                if (stream == null)
-                {
-                    L.Error(
-                        $"{Ffi.RustLib} not found. This means it was not embedded in the mod. Please contact the mod author!"
-                    );
-                    return false;
-                }
-                try
-                {
-                    using (FileStream fileStream = new FileStream(destinationPath, FileMode.Create))
-                    {
-                        stream.CopyTo(fileStream);
-                    }
-                    return true;
-                }
-                catch (IOException e)
-                {
-                    L.Warning($"Could not overwrite {fileName} (it might be in use): {e.Message}");
-                    return false;
-                }
-            }
+            // If we failed to load the compiler, bail from the rest of the patches. It won't matter,
+            // as the compiler itself has failed to load.
+            if (!Marshal.Init())
+            {
+                return;
+            }
+
+            this._harmony.PatchAll();
+        }
+
+        private void OnDestroy()
+        {
+            if (Marshal.Destroy())
+            {
+                L.Info("FFI references cleaned up.");
+            }
+
+            if (this._harmony is not null)
+            {
+                this._harmony.UnpatchSelf();
+            }
         }
     }
 }

View File

@@ -56,7 +56,7 @@ impl<'a> Tokenizer<'a> {
         Ok(Self {
             reader,
             line: 1,
-            column: 1,
+            column: 0, // Start at 0 so first char becomes 1
             char_buffer: [0],
             returned_eof: false,
             string_buffer: String::new(),
@@ -71,7 +71,7 @@ impl<'a> From<String> for Tokenizer<'a> {
         Self {
             reader,
             line: 1,
-            column: 1,
+            column: 0,
             char_buffer: [0],
             returned_eof: false,
             string_buffer: String::new(),
@@ -84,7 +84,7 @@ impl<'a> From<&'a str> for Tokenizer<'a> {
         Self {
             reader: BufReader::new(Box::new(Cursor::new(value)) as Box<dyn Tokenize>),
             char_buffer: [0],
-            column: 1,
+            column: 0,
             line: 1,
             returned_eof: false,
             string_buffer: String::new(),
@@ -93,12 +93,6 @@ impl<'a> From<&'a str> for Tokenizer<'a> {
 }

 impl<'a> Tokenizer<'a> {
-    /// Consumes the tokenizer and returns the next token in the stream
-    /// If there are no more tokens in the stream, this function returns None
-    /// If there is an error reading the stream, this function returns an error
-    ///
-    /// # Important
-    /// This function will increment the line and column counters
     fn next_char(&mut self) -> Result<Option<char>, Error> {
         let bytes_read = self.reader.read(&mut self.char_buffer)?;
@@ -106,7 +100,6 @@ impl<'a> Tokenizer<'a> {
             return Ok(None);
         }

-        // Safety: The buffer is guaranteed to have 1 value as it is initialized with a size of 1
         let c = self.char_buffer[0] as char;
         if c == '\n' {
             self.line += 1;
@@ -119,30 +112,17 @@ impl<'a> Tokenizer<'a> {
         Ok(Some(c))
     }

-    /// Peeks the next character in the stream without consuming it
-    ///
-    /// # Important
-    /// This does not increment the line or column counters
     fn peek_next_char(&mut self) -> Result<Option<char>, Error> {
         let current_pos = self.reader.stream_position()?;
         let to_return = if self.reader.read(&mut self.char_buffer)? == 0 {
             None
         } else {
             self.reader.seek(SeekFrom::Start(current_pos))?;
-            // Safety: The buffer is guaranteed to have 1 value as it is initialized with a size of 1
             Some(self.char_buffer[0] as char)
         };

         Ok(to_return)
     }

-    /// Skips the current line in the stream.
-    /// Useful for skipping comments or empty lines
-    ///
-    /// # Important
-    /// This function will increment the line and column counters
     fn skip_line(&mut self) -> Result<(), Error> {
         while let Some(next_char) = self.next_char()? {
             if next_char == '\n' {
@@ -152,40 +132,50 @@ impl<'a> Tokenizer<'a> {
         Ok(())
     }

-    /// Consumes the tokenizer and returns the next token in the stream
-    /// If there are no more tokens in the stream, this function returns None
     pub fn next_token(&mut self) -> Result<Option<Token>, Error> {
+        self.string_buffer.clear();
+
         while let Some(next_char) = self.next_char()? {
-            // skip whitespace
             if next_char.is_whitespace() {
+                self.string_buffer.clear();
                 continue;
             }

-            // skip comments
             if next_char == '/' && self.peek_next_char()? == Some('/') {
                 self.skip_line()?;
+                self.string_buffer.clear();
                 continue;
             }

+            // Capture start position before delegating
+            let start_line = self.line;
+            let start_col = self.column;
+
             match next_char {
-                // numbers
                 '0'..='9' => {
-                    return self.tokenize_number(next_char).map(Some);
+                    return self
+                        .tokenize_number(next_char, start_line, start_col)
+                        .map(Some);
                 }
-                // strings
-                '"' | '\'' => return self.tokenize_string(next_char).map(Some),
-                // symbols excluding `"` and `'`
+                '"' | '\'' => {
+                    return self
+                        .tokenize_string(next_char, start_line, start_col)
+                        .map(Some);
+                }
                 char if !char.is_alphanumeric() && char != '"' && char != '\'' => {
-                    return self.tokenize_symbol(next_char).map(Some);
+                    return self
+                        .tokenize_symbol(next_char, start_line, start_col)
+                        .map(Some);
                 }
-                // keywords and identifiers
                 char if char.is_alphabetic() => {
-                    return self.tokenize_keyword_or_identifier(next_char).map(Some);
+                    return self
+                        .tokenize_keyword_or_identifier(next_char, start_line, start_col)
+                        .map(Some);
                 }
                 _ => {
                     return Err(Error::UnknownSymbolError(
                         next_char,
-                        self.line,
-                        self.column,
+                        start_line,
+                        start_col,
                         std::mem::take(&mut self.string_buffer),
                     ));
                 }
@@ -204,13 +194,10 @@ impl<'a> Tokenizer<'a> {
         }
     }

-    /// Peeks the next token in the stream without consuming it
-    /// If there are no more tokens in the stream, this function returns None
     pub fn peek_next(&mut self) -> Result<Option<Token>, Error> {
         let current_pos = self.reader.stream_position()?;
         let column = self.column;
         let line = self.line;
-
         let token = self.next_token()?;
         self.reader.seek(SeekFrom::Start(current_pos))?;
         self.column = column;
@@ -218,22 +205,26 @@ impl<'a> Tokenizer<'a> {
         Ok(token)
     }

-    /// Tokenizes a symbol
-    fn tokenize_symbol(&mut self, first_symbol: char) -> Result<Token, Error> {
-        /// Helper macro to create a symbol token
+    // Updated helper functions to accept start_line and start_col
+    fn tokenize_symbol(
+        &mut self,
+        first_symbol: char,
+        line: usize,
+        col: usize,
+    ) -> Result<Token, Error> {
         macro_rules! symbol {
             ($symbol:ident) => {
                 Ok(Token::new(
                     TokenType::Symbol(Symbol::$symbol),
-                    self.line,
-                    self.column,
+                    line,
+                    col,
                     Some(std::mem::take(&mut self.string_buffer)),
                 ))
             };
         }

         match first_symbol {
-            // single character symbols
             '(' => symbol!(LParen),
             ')' => symbol!(RParen),
             '{' => symbol!(LBrace),
@@ -246,42 +237,34 @@ impl<'a> Tokenizer<'a> {
             '+' => symbol!(Plus),
             '-' => symbol!(Minus),
             '/' => symbol!(Slash),
             '.' => symbol!(Dot),
             '^' => symbol!(Caret),
             '%' => symbol!(Percent),
-            // multi-character symbols
             '<' if self.peek_next_char()? == Some('=') => {
                 self.next_char()?;
                 symbol!(LessThanOrEqual)
             }
             '<' => symbol!(LessThan),
             '>' if self.peek_next_char()? == Some('=') => {
                 self.next_char()?;
                 symbol!(GreaterThanOrEqual)
             }
             '>' => symbol!(GreaterThan),
             '=' if self.peek_next_char()? == Some('=') => {
                 self.next_char()?;
                 symbol!(Equal)
             }
             '=' => symbol!(Assign),
             '!' if self.peek_next_char()? == Some('=') => {
                 self.next_char()?;
                 symbol!(NotEqual)
             }
             '!' => symbol!(LogicalNot),
             '*' if self.peek_next_char()? == Some('*') => {
                 self.next_char()?;
                 symbol!(Exp)
             }
             '*' => symbol!(Asterisk),
             '&' if self.peek_next_char()? == Some('&') => {
                 self.next_char()?;
                 symbol!(LogicalAnd)
@@ -290,45 +273,39 @@ impl<'a> Tokenizer<'a> {
                 self.next_char()?;
                 symbol!(LogicalOr)
             }
             _ => Err(Error::UnknownSymbolError(
                 first_symbol,
-                self.line,
-                self.column,
+                line,
+                col,
                 std::mem::take(&mut self.string_buffer),
             )),
         }
     }

-    /// Tokenizes a number literal. Also handles temperatures with a suffix of `c`, `f`, or `k`.
-    fn tokenize_number(&mut self, first_char: char) -> Result<Token, Error> {
+    fn tokenize_number(
+        &mut self,
+        first_char: char,
+        line: usize,
+        col: usize,
+    ) -> Result<Token, Error> {
         let mut primary = String::with_capacity(16);
         let mut decimal: Option<String> = None;
         let mut reading_decimal = false;
-        let column = self.column;
-        let line = self.line;

         primary.push(first_char);

         while let Some(next_char) = self.peek_next_char()? {
             if next_char.is_whitespace() {
                 break;
             }

             if next_char == '.' {
                 reading_decimal = true;
                 self.next_char()?;
                 continue;
             }

-            // support underscores in numbers for readability
             if next_char == '_' {
                 self.next_char()?;
                 continue;
             }

-            // This is for the times when we have a number followed by a symbol (like a semicolon or =)
             if !next_char.is_numeric() {
                 break;
             }
@@ -343,33 +320,21 @@ impl<'a> Tokenizer<'a> {
         let number: Number = if let Some(decimal) = decimal {
             let decimal_scale = decimal.len() as u32;
-            let number = format!("{}{}", primary, decimal)
-                .parse::<i128>()
-                .map_err(|e| {
-                    Error::NumberParseError(
-                        e,
-                        self.line,
-                        self.column,
-                        std::mem::take(&mut self.string_buffer),
-                    )
-                })?;
+            let number_str = format!("{}{}", primary, decimal);
+            let number = number_str.parse::<i128>().map_err(|e| {
+                Error::NumberParseError(e, line, col, std::mem::take(&mut self.string_buffer))
+            })?;
             Number::Decimal(
                 Decimal::try_from_i128_with_scale(number, decimal_scale).map_err(|e| {
-                    Error::DecimalParseError(
-                        e,
-                        line,
-                        column,
-                        std::mem::take(&mut self.string_buffer),
-                    )
+                    Error::DecimalParseError(e, line, col, std::mem::take(&mut self.string_buffer))
                 })?,
             )
         } else {
             Number::Integer(primary.parse().map_err(|e| {
-                Error::NumberParseError(e, line, column, std::mem::take(&mut self.string_buffer))
+                Error::NumberParseError(e, line, col, std::mem::take(&mut self.string_buffer))
             })?)
         };

-        // check if the next char is a temperature suffix
         if let Some(next_char) = self.peek_next_char()? {
             let temperature = match next_char {
                 'c' => Temperature::Celsius(number),
@@ -379,7 +344,7 @@ impl<'a> Tokenizer<'a> {
                 return Ok(Token::new(
                     TokenType::Number(number),
                     line,
-                    column,
+                    col,
                     Some(std::mem::take(&mut self.string_buffer)),
                 ));
             }
@@ -390,74 +355,65 @@ impl<'a> Tokenizer<'a> {
             Ok(Token::new(
                 TokenType::Number(temperature),
                 line,
-                column,
+                col,
                 Some(std::mem::take(&mut self.string_buffer)),
             ))
         } else {
             Ok(Token::new(
                 TokenType::Number(number),
                 line,
-                column,
+                col,
                 Some(std::mem::take(&mut self.string_buffer)),
             ))
         }
     }

-    /// Tokenizes a string literal
-    fn tokenize_string(&mut self, beginning_quote: char) -> Result<Token, Error> {
+    fn tokenize_string(
+        &mut self,
+        beginning_quote: char,
+        line: usize,
+        col: usize,
+    ) -> Result<Token, Error> {
         let mut buffer = String::with_capacity(16);
-        let column = self.column;
-        let line = self.line;

         while let Some(next_char) = self.next_char()? {
             if next_char == beginning_quote {
                 break;
             }
             buffer.push(next_char);
         }

         Ok(Token::new(
             TokenType::String(buffer),
             line,
-            column,
+            col,
             Some(std::mem::take(&mut self.string_buffer)),
         ))
     }

-    /// Tokenizes a keyword or an identifier. Also handles boolean literals
-    fn tokenize_keyword_or_identifier(&mut self, first_char: char) -> Result<Token, Error> {
+    fn tokenize_keyword_or_identifier(
+        &mut self,
+        first_char: char,
+        line: usize,
+        col: usize,
+    ) -> Result<Token, Error> {
         macro_rules! keyword {
             ($keyword:ident) => {{
                 return Ok(Token::new(
                     TokenType::Keyword(Keyword::$keyword),
-                    self.line,
-                    self.column,
+                    line,
+                    col,
                     Some(std::mem::take(&mut self.string_buffer)),
                 ));
             }};
         }

-        /// Helper macro to check if the next character is whitespace or not alphanumeric
         macro_rules! next_ws {
-            () => {
-                matches!(self.peek_next_char()?, Some(x) if x.is_whitespace() || !x.is_alphanumeric()) || self.peek_next_char()?.is_none()
-            };
+            () => { matches!(self.peek_next_char()?, Some(x) if x.is_whitespace() || !x.is_alphanumeric()) || self.peek_next_char()?.is_none() };
         }

         let mut buffer = String::with_capacity(16);
-        let line = self.line;
-        let column = self.column;

         let mut looped_char = Some(first_char);

         while let Some(next_char) = looped_char {
-            if next_char.is_whitespace() {
-                break;
-            }
-
-            if !next_char.is_alphanumeric() {
+            if next_char.is_whitespace() || !next_char.is_alphanumeric() {
                 break;
             }
             buffer.push(next_char);
@@ -474,51 +430,47 @@ impl<'a> Tokenizer<'a> {
"break" if next_ws!() => keyword!(Break), "break" if next_ws!() => keyword!(Break),
"while" if next_ws!() => keyword!(While), "while" if next_ws!() => keyword!(While),
"continue" if next_ws!() => keyword!(Continue), "continue" if next_ws!() => keyword!(Continue),
// boolean literals
"true" if next_ws!() => { "true" if next_ws!() => {
return Ok(Token::new( return Ok(Token::new(
TokenType::Boolean(true), TokenType::Boolean(true),
self.line, line,
self.column, col,
Some(std::mem::take(&mut self.string_buffer)), Some(std::mem::take(&mut self.string_buffer)),
)); ));
} }
"false" if next_ws!() => { "false" if next_ws!() => {
return Ok(Token::new( return Ok(Token::new(
TokenType::Boolean(false), TokenType::Boolean(false),
self.line, line,
self.column, col,
Some(std::mem::take(&mut self.string_buffer)), Some(std::mem::take(&mut self.string_buffer)),
)); ));
} }
// if the next character is whitespace or not alphanumeric, then we have an identifier
// this is because keywords are checked first
val if next_ws!() => { val if next_ws!() => {
return Ok(Token::new( return Ok(Token::new(
TokenType::Identifier(val.to_string()), TokenType::Identifier(val.to_string()),
line, line,
column, col,
Some(std::mem::take(&mut self.string_buffer)), Some(std::mem::take(&mut self.string_buffer)),
)); ));
} }
_ => {} _ => {}
} }
looped_char = self.next_char()?; looped_char = self.next_char()?;
} }
Err(Error::UnknownKeywordOrIdentifierError( Err(Error::UnknownKeywordOrIdentifierError(
buffer, buffer,
line, line,
column, col,
std::mem::take(&mut self.string_buffer), std::mem::take(&mut self.string_buffer),
)) ))
} }
} }
// ... Iterator and TokenizerBuffer implementations remain unchanged ...
// They just call the methods above which now use the passed-in start coordinates.
impl<'a> Iterator for Tokenizer<'a> { impl<'a> Iterator for Tokenizer<'a> {
type Item = Result<Token, Error>; type Item = Result<Token, Error>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
match self.next_token() { match self.next_token() {
Ok(Some(tok)) => Some(Ok(tok)), Ok(Some(tok)) => Some(Ok(tok)),
@@ -542,38 +494,26 @@ impl<'a> TokenizerBuffer<'a> {
             history: VecDeque::with_capacity(128),
         }
     }

-    /// Reads the next token from the tokenizer, pushing the value to the back of the history
-    /// and returning the token
     pub fn next_token(&mut self) -> Result<Option<Token>, Error> {
         if let Some(token) = self.buffer.pop_front() {
             self.history.push_back(token.clone());
             return Ok(Some(token));
         }

         let token = self.tokenizer.next_token()?;
         if let Some(ref token) = token {
             self.history.push_back(token.clone());
         }

         Ok(token)
     }

-    /// Peeks the next token in the stream without adding to the history stack
     pub fn peek(&mut self) -> Result<Option<Token>, Error> {
         if let Some(token) = self.buffer.front() {
             return Ok(Some(token.clone()));
         }

         let token = self.tokenizer.peek_next()?;
         Ok(token)
     }

     fn seek_from_current(&mut self, seek_to: i64) -> Result<(), Error> {
         use Ordering::*;

-        // if seek_to > 0 then we need to check if the buffer has enough tokens to pop, otherwise we need to read from the tokenizer
-        // if seek_to < 0 then we need to pop from the history and push to the front of the buffer. If not enough, then we throw (we reached the front of the history)
-        // if seek_to == 0 then we don't need to do anything
         match seek_to.cmp(&0) {
             Greater => {
                 let mut tokens = Vec::with_capacity(seek_to as usize);
@@ -606,18 +546,13 @@ impl<'a> TokenizerBuffer<'a> {
             }
             _ => {}
         }

         Ok(())
     }

-    /// Adds to or removes from the History stack, allowing the user to move back and forth in the stream
     pub fn seek(&mut self, from: SeekFrom) -> Result<(), Error> {
         match from {
             SeekFrom::Current(seek_to) => self.seek_from_current(seek_to)?,
-            SeekFrom::End(_) => unimplemented!("SeekFrom::End will not be implemented"),
-            SeekFrom::Start(_) => unimplemented!("SeekFrom::Start will not be implemented"),
+            _ => unimplemented!("SeekFrom::End/Start not implemented"),
         }
         Ok(())
     }
 }
@@ -682,7 +617,7 @@ mod tests {
         assert_eq!(char, Some('f'));
         assert_eq!(tokenizer.line, 1);
-        assert_eq!(tokenizer.column, 2);
+        assert_eq!(tokenizer.column, 1);

         Ok(())
     }
@@ -695,7 +630,7 @@ mod tests {
         assert_eq!(char, Some('\n'));
         assert_eq!(tokenizer.line, 1);
-        assert_eq!(tokenizer.column, 1);
+        assert_eq!(tokenizer.column, 0);

         let char = tokenizer.next_char()?;
         assert_eq!(char, Some('\n'));
@@ -1010,4 +945,36 @@ mod tests {
         Ok(())
     }

+    #[test]
+    fn test_identifier_has_correct_length() -> Result<()> {
+        let mut tokenizer = Tokenizer::from("hello");
+
+        assert_eq!(
+            tokenizer.next_token()?,
+            Some(Token {
+                token_type: TokenType::Identifier("hello".into()),
+                original_string: Some("hello".into()),
+                column: 1,
+                line: 1
+            })
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_keyword_token_has_correct_length() -> Result<()> {
+        let mut tokenizer = Tokenizer::from("while");
+
+        assert_eq!(
+            tokenizer.next_token()?,
+            Some(Token {
+                token_type: TokenType::Keyword(Keyword::While),
+                original_string: Some("while".into()),
+                column: 1,
+                line: 1
+            })
+        );
+
+        Ok(())
+    }
 }
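The core bookkeeping change in this file: the column counter now starts at 0 and is incremented before a character is handed out, so the first character of a line lands on column 1, and each tokenize_* helper receives the start coordinates captured before it consumed anything, instead of reading self.column after the fact. A minimal standalone sketch of that convention (illustrative only, not the crate's API):

// Cursor mimics the tokenizer's counters: column starts at 0 so the
// pre-increment in advance() makes the first character column 1.
struct Cursor {
    line: usize,
    column: usize,
}

impl Cursor {
    fn new() -> Self {
        Self { line: 1, column: 0 }
    }

    // Returns the (line, column) of the character just consumed.
    fn advance(&mut self, c: char) -> (usize, usize) {
        if c == '\n' {
            self.line += 1;
            self.column = 0;
        } else {
            self.column += 1;
        }
        (self.line, self.column)
    }
}

fn main() {
    let mut cur = Cursor::new();
    let positions: Vec<_> = "ab".chars().map(|c| cur.advance(c)).collect();
    // 1-based token columns fall out naturally, matching the new tests.
    assert_eq!(positions, vec![(1, 1), (1, 2)]);
}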

View File

@@ -7,11 +7,24 @@ use tokenizer::{token::TokenType, Error as TokenizerError, Tokenizer};
 #[derive_ReprC]
 #[repr(C)]
 pub struct FfiToken {
-    pub text: safer_ffi::String,
     pub tooltip: safer_ffi::String,
     pub error: safer_ffi::String,
-    pub status: safer_ffi::String,
     pub column: i32,
+    pub length: i32,
+    pub token_kind: u32,
+}
+
+fn map_token_kind(t: &TokenType) -> u32 {
+    use TokenType::*;
+    match t {
+        Keyword(_) => 1,
+        Identifier(_) => 2,
+        Number(_) => 3,
+        String(_) => 4,
+        Boolean(_) => 5,
+        Symbol(_) => 6,
+        _ => 0,
+    }
 }

 /// C# handles strings as UTF16. We do NOT want to allocate that memory in C# because
@@ -49,29 +62,29 @@ pub fn tokenize_line(input: safer_ffi::slice::Ref<'_, u16>) -> safer_ffi::Vec<FfiToken>
     for token in tokenizer {
         match token {
-            Err(TokenizerError::NumberParseError(_, _, col, ref original))
-            | Err(TokenizerError::UnknownSymbolError(_, _, col, ref original))
-            | Err(TokenizerError::DecimalParseError(_, _, col, ref original))
-            | Err(TokenizerError::UnknownKeywordOrIdentifierError(_, _, col, ref original)) => {
+            Err(TokenizerError::NumberParseError(_, _, col, ref str))
+            | Err(TokenizerError::UnknownSymbolError(_, _, col, ref str))
+            | Err(TokenizerError::DecimalParseError(_, _, col, ref str))
+            | Err(TokenizerError::UnknownKeywordOrIdentifierError(_, _, col, ref str)) => {
                 tokens.push(FfiToken {
-                    column: col as i32,
-                    text: original.to_string().into(),
+                    column: col as i32 - 1,
                     tooltip: "".into(),
+                    length: str.len() as i32,
+                    token_kind: 0,
                     // Safety: it's okay to unwrap the err here because we are matching on the `Err` variant
                     error: token.unwrap_err().to_string().into(),
-                    status: "".into(),
                 });
             }
             Err(_) => return safer_ffi::Vec::EMPTY,
             Ok(token) if !matches!(token.token_type, TokenType::EOF) => tokens.push(FfiToken {
-                text: token
-                    .original_string
-                    .unwrap_or(token.token_type.to_string())
-                    .into(),
                 tooltip: "".into(),
                 error: "".into(),
-                status: "".into(),
-                column: token.column as i32,
+                length: token
+                    .original_string
+                    .map(|s| s.len() as i32)
+                    .unwrap_or_default(),
+                token_kind: map_token_kind(&token.token_type),
+                column: token.column as i32 - 1,
             }),
             _ => {}
         }
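With text gone from FfiToken, the editor reconstructs each token's text by slicing the source line it already holds: the tokenizer's 1-based column is shifted to 0-based here (col as i32 - 1), and length spans the token's original characters. A hedged sketch of how those coordinates compose (FfiTokenLite is a stand-in for the generated binding, and the slice assumes single-byte columns for simplicity):

// Stand-in for the trimmed-down FfiToken: position + length, no text payload.
struct FfiTokenLite {
    column: i32, // already 0-based after the `- 1` in tokenize_line
    length: i32,
    token_kind: u32, // 1 = Keyword, per map_token_kind
}

// What the C# side effectively does when building a SemanticToken: recover
// the span from the line it already owns instead of copying text over FFI.
fn token_text<'a>(line: &'a str, t: &FfiTokenLite) -> &'a str {
    let start = t.column as usize;
    &line[start..start + t.length as usize]
}

fn main() {
    let tok = FfiTokenLite { column: 0, length: 5, token_kind: 1 };
    assert_eq!(token_text("while x > 1 {}", &tok), "while");
}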