Very basic tokenizer implementation

Apache 2024-06-07 16:21:54 -05:00
parent 71f8fbd748
commit d7314de57a
Signed by: apache
GPG key ID: 6B10F3EAF14F4C77
3 changed files with 149 additions and 11 deletions

View file

@@ -1,3 +1,6 @@
+use std::fmt;
+
+#[derive(Debug)]
 pub enum Token {
     OpenParen,
     CloseParen,
@@ -6,16 +9,143 @@ pub enum Token {
     Integer(i32)
 }
 
-pub trait Tokenizable {
-    fn tokenize(&self) -> Vec<Token>;
-}
-
-impl Tokenizable for String {
-    fn tokenize(&self) -> Vec<Token> {
-        let tokens = Vec::new();
+pub struct TokenizerError {
+    line: u64,
+    column: u64,
+    line_end: u64,
+    column_end: u64,
+    message: String,
+}
+
+impl fmt::Display for TokenizerError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if self.line != self.line_end || self.column != self.column_end {
+            return write!(f, "Error from {}:{} to {}:{}, '{}'", self.line, self.column, self.line_end, self.column_end, self.message);
+        }
+        write!(f, "Error at {}:{}, '{}'", self.line, self.column, self.message)
+    }
+}
+
+pub struct Tokenizer {
+    code: String,
+    line: u64,
+    column: u64,
+    reading_string: bool,
+    escape_next_char: bool,
+    reading_identifier: bool,
+    skipping_comment: bool,
+    storage: Vec<char>,
+}
+
+impl Tokenizer {
+    pub fn new(code: String) -> Tokenizer {
+        Self {
+            code,
+            line: 1,
+            column: 1,
+            reading_string: false,
+            escape_next_char: false,
+            reading_identifier: false,
+            skipping_comment: false,
+            storage: Vec::new()
+        }
+    }
+
+    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
+        let mut tokens = Vec::new();
+        for char in self.code.chars() {
+            let line = self.line;
+            let column = self.column;
+            // Advance the cursor; a newline starts the next line at column 1.
+            self.column += 1;
+            if char == '\n' {
+                self.line += 1;
+                self.column = 1;
+            }
+            if self.skipping_comment {
+                match char {
+                    '\n' => {
+                        self.skipping_comment = false;
+                    },
+                    _ => continue
+                }
+            }
+            if self.reading_identifier {
+                if char.is_alphabetic() {
+                    self.storage.push(char);
+                    continue;
+                }
+                // A non-alphabetic character terminates the identifier; fall
+                // through so the terminating character itself still gets tokenized.
+                self.reading_identifier = false;
+                tokens.push(Token::Identifier(self.storage.iter().collect()));
+                self.storage.clear();
+            }
+            if self.reading_string {
+                if self.escape_next_char {
+                    self.escape_next_char = false;
+                    self.storage.push(char);
+                    continue;
+                }
+                if char == '"' {
+                    self.reading_string = false;
+                    tokens.push(Token::String(self.storage.iter().collect()));
+                    self.storage.clear();
+                    continue;
+                }
+                if char == '\n' {
+                    // Report a span from the opening quote to the newline.
+                    return Err(TokenizerError {
+                        line,
+                        column: column - self.storage.len() as u64 - 1,
+                        line_end: line,
+                        column_end: column,
+                        message: String::from("Expected \", got \\n"),
+                    });
+                }
+                if char == '\\' {
+                    self.escape_next_char = true;
+                    continue;
+                }
+                self.storage.push(char);
+                continue;
+            }
+            match char {
+                ';' => self.skipping_comment = true,
+                '(' => tokens.push(Token::OpenParen),
+                ')' => tokens.push(Token::CloseParen),
+                '"' => {
+                    self.reading_string = true;
+                    self.storage.clear();
+                },
+                c => {
+                    if c.is_alphabetic() {
+                        self.reading_identifier = true;
+                        self.storage.clear();
+                        self.storage.push(c);
+                        continue;
+                    }
+                    // Unhandled characters (whitespace, digits, ...) are
+                    // echoed for now rather than tokenized.
+                    print!("{}", char)
+                }
+            }
+        }
+        Ok(tokens)
+    }
 }
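
A quick sanity check of the new tokenizer, as a hedged sketch: it assumes the Token variants elided by the hunk above include Identifier(String) and String(String), which the pushes in tokenize() imply.

// Hypothetical usage sketch, not part of the commit. Assumes
// Token::Identifier(String) and Token::String(String) exist.
fn demo() {
    let mut tokenizer = Tokenizer::new(String::from("(print \"hi\")"));
    let tokens = tokenizer.tokenize().expect("no raw newline inside a string");
    // Prints: [OpenParen, Identifier("print"), String("hi"), CloseParen]
    // (the space between the atoms is echoed to stdout by the catch-all print!)
    println!("{:?}", tokens);
}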

View file

@@ -1,4 +1,14 @@
+use lisp_stuff::*;
+
 fn main() {
     let source = std::fs::read_to_string("src/test.lisp").unwrap();
-    println!("{}", source);
+    let mut tokenizer = Tokenizer::new(source);
+    let tokens = match tokenizer.tokenize() {
+        Ok(tokens) => tokens,
+        Err(e) => {
+            println!("{}", e);
+            Vec::new()
+        }
+    };
+    println!("{:?}", tokens);
 }
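
The error path can be exercised the same way; a rough sketch, assuming the span arithmetic in tokenize() above:

// Hypothetical error-path sketch, not part of the commit.
fn demo_error() {
    let mut tokenizer = Tokenizer::new(String::from("(print \"oops\n\")"));
    if let Err(e) = tokenizer.tokenize() {
        // The span runs from the opening quote to the offending newline:
        // Error from 1:8 to 1:13, 'Expected ", got \n'
        println!("{}", e);
    }
}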

View file

@@ -1,6 +1,4 @@
 ; This is a comment
-(print "Hello, World")
-(print (add 1 2))
-; OpenParen, Identifier, String, CloseParen
-; OpenParen, Identifier, OpenParen, Identifier, Int, Int, CloseParen, CloseParen
+(print "Hello, World") ; OpenParen, Identifier, String, CloseParen
+; (print (add 1 2)) ; OpenParen, Identifier, OpenParen, Identifier, Int, Int, CloseParen, CloseParen
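
If the walkthrough above is right, running the binary against this file should print the token stream for the one active form, roughly:

// Expected Debug output (assumed, not from the commit):
// [OpenParen, Identifier("print"), String("Hello, World"), CloseParen]

The (print (add 1 2)) form presumably stays commented out because digits are not tokenized yet: they would fall through to the catch-all print! instead of becoming Integer tokens.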