// Tokenizer for a small Lisp dialect: turns source text into a flat list of
// `Token`s, reporting malformed input as a `TokenizerError` with a source span.

use std::fmt;

/// A lexical token of the Lisp dialect.
///
/// `Integer` is declared but the tokenizer does not produce it yet.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// A run of alphabetic characters, e.g. `print`.
    Identifier(String),
    /// The contents of a double-quoted string literal, escapes resolved.
    String(String),
    /// An integer literal (reserved; not emitted by `Tokenizer::tokenize`).
    Integer(i32),
}

/// A tokenization failure together with the source span it covers.
///
/// Lines and columns are 1-based. `line`/`column` mark where the offending
/// construct starts, `line_end`/`column_end` where the problem was detected.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TokenizerError {
    line: u64,
    column: u64,

    line_end: u64,
    column_end: u64,

    message: String,
}

impl fmt::Display for TokenizerError {
    /// Formats the error as `Error at L:C, 'msg'` for a single position, or
    /// `Error from L:C to L:C, 'msg'` when the span covers more than one.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if (self.line, self.column) == (self.line_end, self.column_end) {
            write!(f, "Error at {}:{}, '{}'", self.line, self.column, self.message)
        } else {
            write!(
                f,
                "Error from {}:{} to {}:{}, '{}'",
                self.line, self.column, self.line_end, self.column_end, self.message
            )
        }
    }
}

impl std::error::Error for TokenizerError {}

/// What the lexer is currently in the middle of reading.
#[derive(Debug, Clone, Copy)]
enum LexState {
    /// Between tokens.
    Normal,
    /// Inside a `;` comment; everything up to the next newline is skipped.
    Comment,
    /// Accumulating an identifier.
    Identifier,
    /// Inside a string literal that opened at `start_line:start_column`.
    /// `escaped` is true when the previous character was a backslash.
    Text {
        escaped: bool,
        start_line: u64,
        start_column: u64,
    },
}

/// Splits Lisp source text into [`Token`]s.
#[derive(Debug)]
pub struct Tokenizer {
    code: String,
}

impl Tokenizer {
    /// Creates a tokenizer over `code`.
    pub fn new(code: String) -> Tokenizer {
        Self { code }
    }

    /// Tokenizes the whole input.
    ///
    /// Recognised syntax:
    /// * `(` and `)`;
    /// * identifiers made of alphabetic characters;
    /// * double-quoted, single-line strings, where a backslash makes the next
    ///   character literal (`\"` yields `"`, `\\` yields `\`);
    /// * `;` comments running to the end of the line.
    ///
    /// Any other character (whitespace, digits, punctuation) is skipped.
    ///
    /// All lexer state is local to the call, so calling this repeatedly
    /// always yields the same result.
    ///
    /// # Errors
    ///
    /// Returns a [`TokenizerError`] spanning from the opening quote to the
    /// point of failure when a string literal contains a newline or is still
    /// open at the end of the input.
    ///
    /// # Examples
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new(String::from("(print \"Hello, World\")"));
    /// // OpenParen, Identifier, String, CloseParen
    /// println!("{:?}", tokenizer.tokenize());
    /// ```
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut tokens = Vec::new();
        let mut buffer = String::new();
        let mut state = LexState::Normal;

        // Position of the *next* character to be read (1-based).
        let mut line: u64 = 1;
        let mut column: u64 = 1;

        for c in self.code.chars() {
            // Position of the character being processed right now.
            let (at_line, at_column) = (line, column);
            if c == '\n' {
                line += 1;
                column = 1;
            } else {
                column += 1;
            }

            match state {
                LexState::Comment => {
                    if c == '\n' {
                        state = LexState::Normal;
                    }
                    continue;
                }
                LexState::Text {
                    escaped,
                    start_line,
                    start_column,
                } => {
                    // Strings are single-line, escaped or not.
                    if c == '\n' {
                        return Err(TokenizerError {
                            line: start_line,
                            column: start_column,
                            line_end: at_line,
                            column_end: at_column,
                            message: String::from("Expected \", got \\n"),
                        });
                    }

                    if escaped {
                        // The escape applies to exactly one character.
                        buffer.push(c);
                        state = LexState::Text {
                            escaped: false,
                            start_line,
                            start_column,
                        };
                    } else if c == '"' {
                        tokens.push(Token::String(std::mem::take(&mut buffer)));
                        state = LexState::Normal;
                    } else if c == '\\' {
                        state = LexState::Text {
                            escaped: true,
                            start_line,
                            start_column,
                        };
                    } else {
                        buffer.push(c);
                    }
                    continue;
                }
                LexState::Identifier if c.is_alphabetic() => {
                    buffer.push(c);
                    continue;
                }
                LexState::Identifier => {
                    // The identifier ended; emit it and fall through so the
                    // terminating character is still tokenized (e.g. `)`).
                    tokens.push(Token::Identifier(std::mem::take(&mut buffer)));
                    state = LexState::Normal;
                }
                LexState::Normal => {}
            }

            match c {
                ';' => state = LexState::Comment,
                '(' => tokens.push(Token::OpenParen),
                ')' => tokens.push(Token::CloseParen),
                '"' => {
                    state = LexState::Text {
                        escaped: false,
                        start_line: at_line,
                        start_column: at_column,
                    }
                }
                c if c.is_alphabetic() => {
                    buffer.push(c);
                    state = LexState::Identifier;
                }
                // Whitespace and not-yet-supported characters are skipped.
                _ => {}
            }
        }

        // Flush or reject whatever was still in progress at end of input.
        match state {
            LexState::Identifier => tokens.push(Token::Identifier(buffer)),
            LexState::Text {
                start_line,
                start_column,
                ..
            } => {
                return Err(TokenizerError {
                    line: start_line,
                    column: start_column,
                    line_end: line,
                    column_end: column,
                    message: String::from("Expected \", got end of input"),
                });
            }
            LexState::Normal | LexState::Comment => {}
        }

        Ok(tokens)
    }
}