jlx/libjlx/modules/tokenizer.cppm

378 lines
7.1 KiB
C++

module;
#include <algorithm>
#include <array>
#include <cctype>
#include <cstddef>
#include <format>
#include <iostream>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
export module jlx:tokenizer;
import :source_stream;
import utils;
namespace jlx {
/// Lexical category carried by every `token`.
///
/// This is an unscoped enum, so the enumerators are visible directly in
/// namespace `jlx` as well as through `token_type::`.
export enum token_type {
    Invalid     = 0,  ///< Zero value; also the name reported for unlisted values.
    Punctuation = 1,  ///< Single structural character such as `.`, `{` or `;`.
    Number      = 2,  ///< Run of decimal digits with an optional period.
    String      = 3,  ///< Double-quoted string literal.
    Boolean     = 4,  ///< The words `true` and `false`.
    Keyword     = 5,  ///< Reserved word such as `if`, `fun` or `return`.
    Identifier  = 6,  ///< Any other word made of letters, digits and `_`.
    Operator    = 7   ///< Arithmetic, comparison or assignment operator.
};
/// Returns the enumerator name of `t` as text.
///
/// @param t Token category to describe.
/// @return The enumerator's spelling (e.g. "Number"); "Invalid" for
///         `Invalid` itself and for any value that is not a named enumerator.
export constexpr std::string token_type_to_string(token_type t) {
    // Start from the fallback so the switch only has to handle named values.
    const char* label = "Invalid";
    switch (t) {
        case Punctuation: label = "Punctuation"; break;
        case Number:      label = "Number";      break;
        case String:      label = "String";      break;
        case Boolean:     label = "Boolean";     break;
        case Keyword:     label = "Keyword";     break;
        case Identifier:  label = "Identifier";  break;
        case Operator:    label = "Operator";    break;
        default:                                 break;
    }
    return label;
}
/// One lexical unit together with the place it was read from.
export struct token {
    token_type type;          ///< Lexical category.
    std::string source_file;  ///< Name of the file the token was read from.
    std::string content;      ///< Token text (for strings: the unescaped body).
    std::size_t line;         ///< Line reported by the source stream at the token's start.
    std::size_t col;          ///< Column reported by the source stream at the token's start.

    /// Builds a token from its parts.
    ///
    /// The two strings are sink parameters: they are taken by value and moved
    /// into place, so temporaries passed by the caller are not copied again.
    token(token_type type, std::string source_file, std::string content, std::size_t line, std::size_t col)
        : type(type),
          source_file(std::move(source_file)),
          content(std::move(content)),
          line(line),
          col(col) {
    }

    token(const token&) = default;
    token& operator=(const token&) = default;
    // Declaring the copy operations suppresses the implicit move operations,
    // so they are spelled out; without them every "move" of a token (for
    // example into a std::optional<token>) would deep-copy both strings.
    token(token&&) noexcept = default;
    token& operator=(token&&) noexcept = default;
};
/// Renders a token as `Type(content)`, e.g. `Number(42)`, for debugging output.
///
/// @param t Token to describe.
/// @return The category name followed by the token text in parentheses.
///
/// NOTE(review): the function is declared constexpr but calls std::format,
/// which is a run-time facility in C++20/23, so in practice it is only usable
/// at run time.
export constexpr std::string token_to_string(const token& t) {
    const std::string type_name = token_type_to_string(t.type);
    return std::format("{}({})", type_name, t.content);
}
/// Error raised when the tokenizer meets input it cannot turn into a token.
///
/// The class does not derive from std::exception, so it must be caught by its
/// own type; `what()` returns a `std::string` rather than a C string.
export class tokenizer_exception {
protected:
    /// Fully formatted message, including the source position.
    std::string msg;

public:
    /// @param msg  Description of the problem.
    /// @param line Line at which the problem was detected.
    /// @param col  Column at which the problem was detected.
    ///
    /// std::format uses `{}` replacement fields; printf-style `%d`/`%s` would
    /// be copied into the message verbatim and the arguments dropped.
    tokenizer_exception(std::string msg, std::size_t line, std::size_t col)
        : msg(std::format("Tokenizer exception at {}:{}. {}", line, col, msg)) {
    }

    /// @return The message in the form "Tokenizer exception at LINE:COL. TEXT".
    const std::string& what() const {
        return msg;
    }
};
/// Hand-written lexer: each call to read_token() consumes and returns the next
/// token of the source text it was constructed with.
///
/// Every token is stamped with the placeholder file name "mono_src"; the
/// tokenizer is not told which file the text came from.
export class tokenizer {
    source_stream<char> source;

    /// File name recorded in every token.
    static constexpr const char* source_name = "mono_src";

    /// Reserved words: a word spelled like one of these becomes a Keyword.
    /// Stored as string_view so the table is a plain compile-time constant.
    static constexpr std::array<std::string_view, 6> keywords = {{
        "if",
        "else",
        "fun",
        //"struct",
        "let",
        "var",
        "return"
    }};

    /// Characters that form a Punctuation token on their own.
    static constexpr std::array<char, 7> punctuations = {{
        '.',
        '(',
        ')',
        '{',
        '}',
        ':',
        ';'
    }};

    /// Operator spellings. The table is prefix-closed: every two-character
    /// operator starts with a character that is itself an operator. That is
    /// what allows read_operator_token() to find the longest match while only
    /// peeking one character ahead.
    static constexpr std::array<std::string_view, 13> operators = {{
        "=",
        "+",
        "-",
        "*",
        "/",
        "%",
        "==",
        "!=",
        "<=",
        ">=",
        ">",
        "<",
        "!"
    }};

    /// True when `word` is one of the reserved words.
    static bool is_keyword(std::string_view word) {
        return std::find(keywords.begin(), keywords.end(), word) != keywords.end();
    }

    /// True when `word` is exactly one of the operator spellings.
    static bool is_operator(std::string_view word) {
        return std::find(operators.begin(), operators.end(), word) != operators.end();
    }

    /// Builds a token stamped with the placeholder source file name.
    static token make_token(token_type type, std::string content, std::size_t line, std::size_t col) {
        return token{type, source_name, std::move(content), line, col};
    }

    /// Consumes characters until the next one is not whitespace (or input ends).
    void skip_whitespace() {
        while (!source.eof()) {
            const auto ch = source.peek();
            // <cctype> functions require a value representable as unsigned
            // char; passing a sign-extended negative char is undefined.
            if (!ch.has_value() || !std::isspace(static_cast<unsigned char>(ch.value()))) {
                return;
            }
            source.next();
        }
    }

    /// Reads a double-quoted string literal; the stream must be positioned on
    /// the opening quote.
    ///
    /// Recognised escapes are `\"`, `\\` and `\n`. Newline characters inside
    /// the literal are dropped from the content.
    ///
    /// @return A String token holding the unescaped text.
    /// @throws tokenizer_exception on an unknown escape sequence, or when the
    ///         input ends before the closing quote.
    token read_string_token() {
        const auto start_line = source.current_line();
        const auto start_col = source.current_col();
        source.next();  // opening quote

        std::string text;
        bool escape = false;
        bool closed = false;
        while (!source.eof()) {
            const auto ch = source.next();
            if (!ch.has_value()) {
                break;
            }
            const auto val = ch.value();
            if (!escape && val == '"') {
                closed = true;
                break;
            }
            if (val == '\n') {
                // Skipped without touching `escape`, so a backslash directly
                // before a newline still applies to the next character.
                continue;
            }
            if (escape) {
                switch (val) {
                    case '"':
                        text.push_back('"');
                        break;
                    case '\\':
                        text.push_back('\\');
                        break;
                    case 'n':
                        text.push_back('\n');
                        break;
                    default:
                        throw tokenizer_exception("Invalid escape sequence", source.current_line(), source.current_col());
                }
                escape = false;
            } else if (val == '\\') {
                escape = true;
            } else {
                text.push_back(val);
            }
        }
        if (!closed) {
            // Reported at the opening quote, which is where the fix belongs.
            throw tokenizer_exception("Unterminated string literal", start_line, start_col);
        }
        return make_token(token_type::String, std::move(text), start_line, start_col);
    }

    /// Reads a run of decimal digits containing at most one period.
    ///
    /// @return A Number token, or std::nullopt when no input is left.
    /// @throws tokenizer_exception when a second period is found.
    std::optional<token> read_decimal_token() {
        auto next = source.peek();
        if (!next.has_value()) {
            return std::nullopt;
        }
        const std::size_t start_line = source.current_line();
        const std::size_t start_col = source.current_col();

        std::string digits;
        bool found_period = false;
        while (next.has_value() && (std::isdigit(static_cast<unsigned char>(next.value())) || next.value() == '.')) {
            const auto val = next.value();
            if (val == '.') {
                if (found_period) {
                    throw tokenizer_exception("Too many periods in numeric value", source.current_line(), source.current_col());
                }
                found_period = true;
            }
            digits.push_back(val);
            source.next();
            next = source.peek();
        }
        return make_token(token_type::Number, std::move(digits), start_line, start_col);
    }

    /// True when `ch` may begin an identifier: an underscore or a letter.
    constexpr bool is_valid_identifier_start(char ch) {
        return ch == '_' || isletter(ch);
    }

    /// Reads a word made of letters, digits and underscores and classifies it
    /// as Boolean (`true`/`false`), Keyword or Identifier.
    std::optional<token> read_identifier() {
        const auto start_line = source.current_line();
        const auto start_col = source.current_col();

        std::string word;
        while (!source.eof()) {
            const auto next = source.peek();
            if (!next.has_value()) {
                break;
            }
            const auto val = next.value();
            const bool is_word_char =
                val == '_' || isletter(val) || std::isdigit(static_cast<unsigned char>(val));
            if (!is_word_char) {
                break;
            }
            word.push_back(val);
            source.next();
        }

        token_type type = token_type::Identifier;
        if (word == "true" || word == "false") {
            type = token_type::Boolean;
        } else if (is_keyword(word)) {
            type = token_type::Keyword;
        }
        return make_token(type, std::move(word), start_line, start_col);
    }

    /// Reads one punctuation character.
    ///
    /// @return A Punctuation token, or std::nullopt (consuming nothing) when
    ///         the next character is not punctuation or no input is left.
    std::optional<token> read_punctuation_token() {
        const auto next = source.peek();
        if (!next.has_value()) {
            return std::nullopt;
        }
        const auto val = next.value();
        if (std::find(punctuations.begin(), punctuations.end(), val) == punctuations.end()) {
            return std::nullopt;
        }
        const auto line = source.current_line();
        const auto col = source.current_col();
        source.next();
        return make_token(token_type::Punctuation, std::string(1, val), line, col);
    }

    /// Reads the longest operator that starts at the current position.
    ///
    /// A character is consumed only if the text read so far plus that
    /// character is still an operator, so adjacent operators such as `=-` or
    /// `==-` come out as separate tokens instead of one unknown operator.
    ///
    /// @return An Operator token, or std::nullopt (consuming nothing) when the
    ///         next character does not start an operator.
    std::optional<token> read_operator_token() {
        const auto line = source.current_line();
        const auto col = source.current_col();

        std::string word;
        while (!source.eof()) {
            const auto next = source.peek();
            if (!next.has_value()) {
                break;
            }
            word.push_back(next.value());
            if (!is_operator(word)) {
                word.pop_back();  // the peeked character belongs to the next token
                break;
            }
            source.next();
        }
        if (word.empty()) {
            return std::nullopt;
        }
        return make_token(token_type::Operator, std::move(word), line, col);
    }

public:
    /// @param source Complete text to tokenize.
    tokenizer(std::string source) : source(std::move(source)) {
    }

    /// Skips leading whitespace and reads the next token.
    ///
    /// The first character decides which reader runs: `"` starts a string, a
    /// digit a number, `_` or a letter a word; anything else is tried as
    /// punctuation and then as an operator.
    ///
    /// @return The next token, or std::nullopt once the input is exhausted.
    /// @throws tokenizer_exception when the input cannot be tokenized; an
    ///         unknown character is reported without being consumed.
    std::optional<token> read_token() {
        skip_whitespace();
        if (source.eof()) {
            return std::nullopt;
        }
        const auto first = source.peek();
        if (!first.has_value()) {
            return std::nullopt;
        }
        const auto val = first.value();

        if (val == '"') {
            return read_string_token();
        }
        if (std::isdigit(static_cast<unsigned char>(val))) {
            return read_decimal_token();
        }
        if (is_valid_identifier_start(val)) {
            return read_identifier();
        }
        if (auto punctuation = read_punctuation_token(); punctuation.has_value()) {
            return punctuation;
        }
        if (auto op = read_operator_token(); op.has_value()) {
            return op;
        }
        throw tokenizer_exception(std::format("Unknown character '{}'", val), source.current_line(), source.current_col());
    }
};
}