// jlx:tokenizer — module interface unit for the jlx lexer.
module;
|
|
|
|
#include <algorithm>
#include <array>
#include <cctype>
#include <format>
#include <iostream>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
|
|
|
|
export module jlx:tokenizer;
|
|
import :source_stream;
|
|
import utils;
|
|
|
|
namespace jlx {
|
|
export enum token_type {
|
|
Invalid = 0,
|
|
Punctuation,
|
|
Number,
|
|
String,
|
|
Boolean,
|
|
Keyword,
|
|
Identifier,
|
|
Operator
|
|
};
|
|
|
|
export constexpr std::string token_type_to_string(token_type t) {
|
|
switch(t) {
|
|
case Punctuation:
|
|
return "Punctuation";
|
|
case Number:
|
|
return "Number";
|
|
case String:
|
|
return "String";
|
|
case Boolean:
|
|
return "Boolean";
|
|
case Keyword:
|
|
return "Keyword";
|
|
case Identifier:
|
|
return "Identifier";
|
|
case Operator:
|
|
return "Operator";
|
|
default:
|
|
return "Invalid";
|
|
}
|
|
}
|
|
|
|
export struct token {
|
|
token_type type;
|
|
std::string source_file;
|
|
std::string content;
|
|
std::size_t line;
|
|
std::size_t col;
|
|
|
|
token(token_type type, const std::string& source_file, const std::string& content, std::size_t line, std::size_t col) :
|
|
type(type), source_file(source_file), content(content), line(line), col(col) {
|
|
|
|
}
|
|
|
|
token(const token&) = default;
|
|
token& operator=(const token&) = default;
|
|
};
|
|
|
|
export constexpr std::string token_to_string(const token& t) {
|
|
return std::format("{}({})", token_type_to_string(t.type), t.content);
|
|
}
|
|
|
|
export class tokenizer_exception : public std::runtime_error {
|
|
public:
|
|
tokenizer_exception(std::string msg, std::size_t line, std::size_t col) : std::runtime_error(std::format("Tokenizer exception at {}:{}. {}", line, col, msg).c_str()) {
|
|
|
|
}
|
|
};
|
|
|
|
export class tokenizer {
|
|
source_stream<char> source;
|
|
|
|
static constexpr std::array<std::string, 6> keywords = {{
|
|
"if",
|
|
"else",
|
|
"fun",
|
|
//"struct",
|
|
"let",
|
|
"var",
|
|
"return"
|
|
}};
|
|
|
|
static constexpr std::array<char, 8> punctuations = {{
|
|
'.',
|
|
'(',
|
|
')',
|
|
'{',
|
|
'}',
|
|
':',
|
|
';',
|
|
','
|
|
}};
|
|
|
|
static constexpr std::array<std::string, 13> operators = {{
|
|
"=",
|
|
"+",
|
|
"-",
|
|
"*",
|
|
"/",
|
|
"%",
|
|
"==",
|
|
"!=",
|
|
"<=",
|
|
">=",
|
|
">",
|
|
"<",
|
|
"!"
|
|
}};
|
|
|
|
void skip_whitespace() {
|
|
while(!source.eof()) {
|
|
auto ch = source.peek();
|
|
if (!ch.has_value() || !std::isspace(static_cast<unsigned int>(ch.value()))) {
|
|
return;
|
|
}
|
|
|
|
source.next();
|
|
}
|
|
}
|
|
|
|
token read_string_token() {
|
|
auto start_line = source.current_line();
|
|
auto start_col = source.current_col();
|
|
source.next();
|
|
bool escape = false;
|
|
std::stringstream buffer;
|
|
while(!source.eof()) {
|
|
auto ch = source.next();
|
|
if (!ch.has_value() || (!escape && ch.value() == '"')) {
|
|
break;
|
|
}
|
|
|
|
auto val = ch.value();
|
|
|
|
if (val == '\n') {
|
|
continue;
|
|
}
|
|
|
|
if (escape) {
|
|
switch(val) {
|
|
case '"':
|
|
buffer.put('"');
|
|
break;
|
|
case '\\':
|
|
buffer.put('\\');
|
|
break;
|
|
case 'n':
|
|
buffer.put('\n');
|
|
break;
|
|
default:
|
|
throw tokenizer_exception("Invalid escape sequance ", source.current_line(), source.current_col());
|
|
}
|
|
escape = false;
|
|
} else if (val == '\\') {
|
|
escape = true;
|
|
} else {
|
|
buffer.put(val);
|
|
}
|
|
}
|
|
|
|
return {
|
|
token_type::String,
|
|
"mono_src",
|
|
buffer.str(),
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
|
|
std::optional<token> read_decimal_token() {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
std::stringstream buffer;
|
|
std::size_t start_line = source.current_line();
|
|
std::size_t start_col = source.current_col();
|
|
bool found_period = false;
|
|
|
|
while(res.has_value() && (std::isdigit(static_cast<unsigned char>(res.value())) || res.value() == '.')) {
|
|
auto val = res.value();
|
|
if (val == '.') {
|
|
if (found_period) {
|
|
throw tokenizer_exception("Too many periods in numeric value", source.current_line(), source.current_col());
|
|
} else {
|
|
found_period = true;
|
|
}
|
|
}
|
|
buffer.put(val);
|
|
source.next();
|
|
res = source.peek();
|
|
}
|
|
|
|
return token {
|
|
token_type::Number,
|
|
"mono_src",
|
|
buffer.str(),
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
|
|
constexpr bool is_valid_identifier_start(char ch) {
|
|
return ch == '_' || isletter(ch);
|
|
}
|
|
|
|
std::optional<token> read_identifier() {
|
|
std::stringstream buffer;
|
|
|
|
auto start_line = source.current_line();
|
|
auto start_col = source.current_col();
|
|
|
|
while(!source.eof()) {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
break;
|
|
}
|
|
|
|
auto val = res.value();
|
|
|
|
if (val != '_' && !isletter(val) && !std::isdigit(static_cast<unsigned char>(val))) {
|
|
break;
|
|
}
|
|
|
|
buffer.put(val);
|
|
source.next();
|
|
}
|
|
|
|
auto word = buffer.str();
|
|
|
|
if (word == "true" || word == "false") {
|
|
return token {
|
|
token_type::Boolean,
|
|
"mono_src",
|
|
word,
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
|
|
if (std::find(keywords.begin(), keywords.end(), word) != keywords.end()) {
|
|
return token {
|
|
token_type::Keyword,
|
|
"mono_src",
|
|
word,
|
|
start_line,
|
|
start_col
|
|
};
|
|
} else {
|
|
return token {
|
|
token_type::Identifier,
|
|
"mono_src",
|
|
word,
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
}
|
|
|
|
std::optional<token> read_punctuation_token() {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
auto val = res.value();
|
|
|
|
if (std::find(punctuations.begin(), punctuations.end(), val) != punctuations.end()) {
|
|
auto line = source.current_line();
|
|
auto col = source.current_col();
|
|
source.next();
|
|
return token {
|
|
token_type::Punctuation,
|
|
"mono_src",
|
|
std::string() + val,
|
|
line,
|
|
col
|
|
};
|
|
}
|
|
|
|
return std::nullopt;
|
|
}
|
|
|
|
std::optional<token> read_operator_token() {
|
|
std::stringstream buffer;
|
|
|
|
auto line = source.current_line();
|
|
auto col = source.current_col();
|
|
while(!source.eof()) {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
break;
|
|
}
|
|
|
|
auto val = res.value();
|
|
|
|
if (!is_valid_character_from_set(operators, val)) {
|
|
break;
|
|
}
|
|
|
|
buffer.put(val);
|
|
source.next();
|
|
}
|
|
|
|
|
|
auto word = buffer.str();
|
|
|
|
if (std::find(operators.begin(), operators.end(), word) != operators.end()) {
|
|
return token {
|
|
token_type::Operator,
|
|
"mono_src",
|
|
word,
|
|
line,
|
|
col
|
|
};
|
|
}
|
|
|
|
//throw tokenizer_exception(std::format("Unknown operator '{}'", word), line, col);
|
|
return std::nullopt;
|
|
}
|
|
|
|
public:
|
|
tokenizer(std::string source) : source(std::move(source)) {
|
|
|
|
}
|
|
|
|
std::optional<token> read_token() {
|
|
skip_whitespace();
|
|
if (source.eof()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
auto result = source.peek();
|
|
if (!result.has_value()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
auto val = result.value();
|
|
|
|
if (val == '"') {
|
|
return read_string_token();
|
|
}
|
|
|
|
if (std::isdigit(static_cast<unsigned char>(val))) {
|
|
return read_decimal_token();
|
|
}
|
|
|
|
if (is_valid_identifier_start(val)) {
|
|
return read_identifier();
|
|
}
|
|
|
|
auto punctuation_res = read_punctuation_token();
|
|
if (punctuation_res.has_value()) {
|
|
return punctuation_res;
|
|
}
|
|
|
|
auto op_res = read_operator_token();
|
|
if (op_res.has_value()) {
|
|
return op_res;
|
|
}
|
|
|
|
throw tokenizer_exception(std::format("Unknown character '{}'", val), source.current_line(), source.current_col());
|
|
}
|
|
};
|
|
}
|