module;

#include <algorithm>
#include <array>
#include <cctype>
#include <cstddef>
#include <format>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>

export module jlx:tokenizer;

import :source_stream;
import utils;

namespace jlx {

    /// Lexical category assigned to every token produced by `tokenizer`.
    export enum token_type {
        Invalid = 0,
        Punctuation,
        Number,
        String,
        Boolean,
        Keyword,
        Identifier,
        Operator
    };

    /// @brief Returns the enumerator name of `t` as text.
    /// @return The enumerator's name, or "Invalid" for `Invalid` and for any
    ///         value that is not a named enumerator.
    export constexpr std::string token_type_to_string(token_type t) {
        switch (t) {
            case Punctuation: return "Punctuation";
            case Number:      return "Number";
            case String:      return "String";
            case Boolean:     return "Boolean";
            case Keyword:     return "Keyword";
            case Identifier:  return "Identifier";
            case Operator:    return "Operator";
            default:          return "Invalid";
        }
    }

    /// A single lexeme together with the position at which it started.
    export struct token {
        token_type type;
        std::string source_file;
        std::string content;
        std::size_t line;
        std::size_t col;

        token(token_type type, const std::string& source_file, const std::string& content, std::size_t line, std::size_t col)
            : type(type), source_file(source_file), content(content), line(line), col(col) {
        }

        token(const token&) = default;
        token& operator=(const token&) = default;

        // Declaring the copy operations suppresses the implicit moves; bring
        // them back so tokens returned through std::optional are moved
        // instead of deep-copying two strings.
        token(token&&) = default;
        token& operator=(token&&) = default;
    };

    /// @brief Formats a token as `Type(content)`, e.g. `Number(42)`.
    ///
    /// Not `constexpr`: `std::format` cannot be evaluated at compile time, so
    /// a `constexpr` specifier here could never be honoured.
    export inline std::string token_to_string(const token& t) {
        return std::format("{}({})", token_type_to_string(t.type), t.content);
    }

    /// Error raised by `tokenizer` for malformed input. `what()` carries the
    /// line/column passed to the constructor followed by the message.
    export class tokenizer_exception : public std::runtime_error {
    public:
        tokenizer_exception(std::string msg, std::size_t line, std::size_t col)
            : std::runtime_error(std::format("Tokenizer exception at {}:{}. {}", line, col, msg)) {
        }
    };

    /// Pull-style lexer: each call to `read_token()` consumes and returns the
    /// next token from the wrapped `source_stream`.
    export class tokenizer {
        source_stream source;

        // Name recorded in every token's `source_file` field.
        static constexpr const char* source_name = "mono_src";

        // NOTE(review): "struct" was commented out of this list in the
        // original source and is therefore lexed as a plain identifier.
        static constexpr std::array<std::string_view, 6> keywords = {{
            "if", "else", "fun",
            //"struct",
            "let", "var", "return"
        }};

        static constexpr std::array<char, 8> punctuations = {{
            '.', '(', ')', '{', '}', ':', ';', ','
        }};

        static constexpr std::array<std::string_view, 13> operators = {{
            "=", "+", "-", "*", "/", "%",
            "==", "!=", "<=", ">=", ">", "<", "!"
        }};

        /// True when `candidate` is a leading substring of at least one
        /// known operator (an empty candidate trivially is).
        static bool is_operator_prefix(std::string_view candidate) {
            return std::any_of(operators.begin(), operators.end(),
                               [candidate](std::string_view op) { return op.starts_with(candidate); });
        }

        /// Consumes characters while the next one is whitespace.
        void skip_whitespace() {
            while (!source.eof()) {
                const auto ch = source.peek();
                // <cctype> classifiers require a value representable as unsigned char.
                if (!ch.has_value() || !std::isspace(static_cast<unsigned char>(ch.value()))) {
                    return;
                }
                source.next();
            }
        }

        /// @brief Reads a double-quoted string literal; the stream must be
        ///        positioned on the opening quote.
        ///
        /// Recognised escapes are `\"`, `\\` and `\n`. Raw newline characters
        /// inside the literal are dropped from the token content.
        ///
        /// @return A `String` token holding the unescaped content, positioned
        ///         at the opening quote.
        /// @throws tokenizer_exception on an unknown escape sequence, or when
        ///         the input ends before the closing quote.
        token read_string_token() {
            const auto start_line = source.current_line();
            const auto start_col = source.current_col();
            source.next(); // opening quote

            bool escape = false;
            std::string buffer;
            while (!source.eof()) {
                const auto ch = source.next();
                if (!ch.has_value()) {
                    break;
                }
                const auto val = ch.value();
                if (!escape && val == '"') {
                    return { token_type::String, source_name, buffer, start_line, start_col };
                }
                if (val == '\n') {
                    // Skipped without touching `escape`, so a backslash
                    // followed by a newline still applies to the next character.
                    continue;
                }
                if (escape) {
                    switch (val) {
                        case '"':  buffer.push_back('"');  break;
                        case '\\': buffer.push_back('\\'); break;
                        case 'n':  buffer.push_back('\n'); break;
                        default:
                            throw tokenizer_exception("Invalid escape sequance ", source.current_line(), source.current_col());
                    }
                    escape = false;
                } else if (val == '\\') {
                    escape = true;
                } else {
                    buffer.push_back(val);
                }
            }
            // Running out of input is an error, not an implicit closing quote.
            throw tokenizer_exception("Unterminated string literal", start_line, start_col);
        }

        /// @brief Reads a run of digits containing at most one '.'.
        /// @return A `Number` token, or `std::nullopt` if no character is available.
        /// @throws tokenizer_exception when a second '.' is encountered.
        std::optional<token> read_decimal_token() {
            auto res = source.peek();
            if (!res.has_value()) {
                return std::nullopt;
            }

            std::string buffer;
            const std::size_t start_line = source.current_line();
            const std::size_t start_col = source.current_col();
            bool found_period = false;
            while (res.has_value() && (std::isdigit(static_cast<unsigned char>(res.value())) || res.value() == '.')) {
                const auto val = res.value();
                if (val == '.') {
                    if (found_period) {
                        throw tokenizer_exception("Too many periods in numeric value", source.current_line(), source.current_col());
                    }
                    found_period = true;
                }
                buffer.push_back(val);
                source.next();
                res = source.peek();
            }
            return token { token_type::Number, source_name, buffer, start_line, start_col };
        }

        /// An identifier may start with '_' or anything `isletter` accepts.
        static constexpr bool is_valid_identifier_start(char ch) {
            return ch == '_' || isletter(ch);
        }

        /// @brief Reads a word made of '_', letters and digits and classifies it.
        /// @return A `Boolean` token for `true`/`false`, a `Keyword` token for
        ///         entries of `keywords`, otherwise an `Identifier` token.
        std::optional<token> read_identifier() {
            std::string word;
            const auto start_line = source.current_line();
            const auto start_col = source.current_col();
            while (!source.eof()) {
                const auto res = source.peek();
                if (!res.has_value()) {
                    break;
                }
                const auto val = res.value();
                if (val != '_' && !isletter(val) && !std::isdigit(static_cast<unsigned char>(val))) {
                    break;
                }
                word.push_back(val);
                source.next();
            }

            if (word == "true" || word == "false") {
                return token { token_type::Boolean, source_name, word, start_line, start_col };
            }
            if (std::find(keywords.begin(), keywords.end(), std::string_view{word}) != keywords.end()) {
                return token { token_type::Keyword, source_name, word, start_line, start_col };
            }
            return token { token_type::Identifier, source_name, word, start_line, start_col };
        }

        /// @brief Consumes one character if it is listed in `punctuations`.
        /// @return A `Punctuation` token, or `std::nullopt` (nothing consumed)
        ///         when the next character is not punctuation.
        std::optional<token> read_punctuation_token() {
            const auto res = source.peek();
            if (!res.has_value()) {
                return std::nullopt;
            }
            const auto val = res.value();
            if (std::find(punctuations.begin(), punctuations.end(), val) == punctuations.end()) {
                return std::nullopt;
            }
            const auto line = source.current_line();
            const auto col = source.current_col();
            source.next();
            return token { token_type::Punctuation, source_name, std::string(1, val), line, col };
        }

        /// @brief Reads the longest operator starting at the current position.
        ///
        /// A character is consumed only if the text read so far plus that
        /// character is still a prefix of some entry in `operators`. This
        /// keeps adjacent operators apart: `=-` yields `=` and then `-`
        /// rather than one unknown lexeme.
        ///
        /// @return An `Operator` token, or `std::nullopt` when no operator
        ///         starts here; `read_token` then reports the character.
        std::optional<token> read_operator_token() {
            const auto line = source.current_line();
            const auto col = source.current_col();

            std::string word;
            while (!source.eof()) {
                const auto res = source.peek();
                if (!res.has_value()) {
                    break;
                }
                word.push_back(res.value());
                if (!is_operator_prefix(word)) {
                    word.pop_back(); // leave the character in the stream
                    break;
                }
                source.next();
            }

            if (std::find(operators.begin(), operators.end(), std::string_view{word}) != operators.end()) {
                return token { token_type::Operator, source_name, word, line, col };
            }
            return std::nullopt;
        }

    public:
        /// @param source Text handed to the underlying `source_stream`.
        tokenizer(std::string source) : source(std::move(source)) {
        }

        /// @brief Skips leading whitespace and reads the next token.
        /// @return The token, or `std::nullopt` once the input is exhausted.
        /// @throws tokenizer_exception on malformed literals or on a character
        ///         that starts no known token.
        std::optional<token> read_token() {
            skip_whitespace();
            if (source.eof()) {
                return std::nullopt;
            }

            const auto result = source.peek();
            if (!result.has_value()) {
                return std::nullopt;
            }

            const auto val = result.value();
            if (val == '"') {
                return read_string_token();
            }
            if (std::isdigit(static_cast<unsigned char>(val))) {
                return read_decimal_token();
            }
            if (is_valid_identifier_start(val)) {
                return read_identifier();
            }

            if (auto punctuation_res = read_punctuation_token(); punctuation_res.has_value()) {
                return punctuation_res;
            }
            if (auto op_res = read_operator_token(); op_res.has_value()) {
                return op_res;
            }

            throw tokenizer_exception(std::format("Unknown character '{}'", val), source.current_line(), source.current_col());
        }
    };
}