// jlx:tokenizer — module interface unit for the jlx lexer.
module;
|
|
|
|
#include <algorithm>
#include <array>
#include <cctype>
#include <format>
#include <iostream>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
|
|
|
|
export module jlx:tokenizer;
|
|
import :source_stream;
|
|
import utils;
|
|
|
|
namespace jlx {
|
|
export enum token_type {
|
|
Invalid = 0,
|
|
Punctuation,
|
|
Number,
|
|
String,
|
|
Boolean,
|
|
Keyword,
|
|
Identifier,
|
|
Operator
|
|
};
|
|
|
|
export constexpr std::string token_type_to_string(token_type t) {
|
|
switch(t) {
|
|
case Punctuation:
|
|
return "Punctuation";
|
|
case Number:
|
|
return "Number";
|
|
case String:
|
|
return "String";
|
|
case Boolean:
|
|
return "Boolean";
|
|
case Keyword:
|
|
return "Keyword";
|
|
case Identifier:
|
|
return "Identifier";
|
|
case Operator:
|
|
return "Operator";
|
|
default:
|
|
return "Invalid";
|
|
}
|
|
}
|
|
|
|
export struct token {
|
|
token_type type;
|
|
std::string source_file;
|
|
std::string content;
|
|
std::size_t line;
|
|
std::size_t col;
|
|
|
|
token(token_type type, const std::string& source_file, const std::string& content, std::size_t line, std::size_t col) :
|
|
type(type), source_file(source_file), content(content), line(line), col(col) {
|
|
|
|
}
|
|
|
|
token(const token&) = default;
|
|
token& operator=(const token&) = default;
|
|
};
|
|
|
|
export constexpr std::string token_to_string(const token& t) {
|
|
return std::format("{}({})", token_type_to_string(t.type), t.content);
|
|
}
|
|
|
|
export class tokenizer_exception : public std::runtime_error {
|
|
public:
|
|
tokenizer_exception(std::string msg, std::size_t line, std::size_t col) : std::runtime_error(std::format("Tokenizer exception at {}:{}. {}", line, col, msg).c_str()) {
|
|
|
|
}
|
|
};
|
|
|
|
export class tokenizer {
|
|
source_stream<char> source;
|
|
|
|
static constexpr std::array<std::string, 6> keywords = {{
|
|
"if",
|
|
"else",
|
|
"fun",
|
|
//"struct",
|
|
"let",
|
|
"var",
|
|
"return"
|
|
}};
|
|
|
|
static constexpr std::array<char, 8> punctuations = {{
|
|
'.',
|
|
'(',
|
|
')',
|
|
'{',
|
|
'}',
|
|
':',
|
|
';',
|
|
','
|
|
}};
|
|
|
|
static constexpr std::array<std::string, 13> operators = {{
|
|
"=",
|
|
"+",
|
|
"-",
|
|
"*",
|
|
"/",
|
|
"%",
|
|
"==",
|
|
"!=",
|
|
"<=",
|
|
">=",
|
|
">",
|
|
"<",
|
|
"!"
|
|
}};
|
|
|
|
void skip_whitespace() {
|
|
while(!source.eof()) {
|
|
auto ch = source.peek();
|
|
if (!ch.has_value() || !std::isspace(static_cast<unsigned int>(ch.value()))) {
|
|
return;
|
|
}
|
|
|
|
source.next();
|
|
}
|
|
}
|
|
|
|
token read_string_token() {
|
|
auto start_line = source.current_line();
|
|
auto start_col = source.current_col();
|
|
source.next();
|
|
bool escape = false;
|
|
std::stringstream buffer;
|
|
while(!source.eof()) {
|
|
auto ch = source.next();
|
|
if (!ch.has_value() || (!escape && ch.value() == '"')) {
|
|
break;
|
|
}
|
|
|
|
auto val = ch.value();
|
|
|
|
if (val == '\n') {
|
|
continue;
|
|
}
|
|
|
|
if (escape) {
|
|
switch(val) {
|
|
case '"':
|
|
buffer.put('"');
|
|
break;
|
|
case '\\':
|
|
buffer.put('\\');
|
|
break;
|
|
case 'n':
|
|
buffer.put('\n');
|
|
break;
|
|
default:
|
|
throw tokenizer_exception("Invalid escape sequance ", source.current_line(), source.current_col());
|
|
}
|
|
escape = false;
|
|
} else if (val == '\\') {
|
|
escape = true;
|
|
} else {
|
|
buffer.put(val);
|
|
}
|
|
}
|
|
|
|
return {
|
|
token_type::String,
|
|
"mono_src",
|
|
buffer.str(),
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
|
|
std::optional<token> read_decimal_token() {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
std::stringstream buffer;
|
|
std::size_t start_line = source.current_line();
|
|
std::size_t start_col = source.current_col();
|
|
bool found_period = false;
|
|
|
|
while(res.has_value() && (std::isdigit(static_cast<unsigned char>(res.value())) || res.value() == '.')) {
|
|
auto val = res.value();
|
|
if (val == '.') {
|
|
if (found_period) {
|
|
throw tokenizer_exception("Too many periods in numeric value", source.current_line(), source.current_col());
|
|
} else {
|
|
found_period = true;
|
|
}
|
|
}
|
|
buffer.put(val);
|
|
source.next();
|
|
res = source.peek();
|
|
}
|
|
|
|
return token {
|
|
token_type::Number,
|
|
"mono_src",
|
|
buffer.str(),
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
|
|
constexpr bool is_valid_identifier_start(char ch) {
|
|
return ch == '_' || isletter(ch);
|
|
}
|
|
|
|
std::optional<token> read_identifier() {
|
|
std::stringstream buffer;
|
|
|
|
auto start_line = source.current_line();
|
|
auto start_col = source.current_col();
|
|
|
|
while(!source.eof()) {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
break;
|
|
}
|
|
|
|
auto val = res.value();
|
|
|
|
if (val != '_' && !isletter(val) && !std::isdigit(static_cast<unsigned char>(val))) {
|
|
break;
|
|
}
|
|
|
|
buffer.put(val);
|
|
source.next();
|
|
}
|
|
|
|
auto word = buffer.str();
|
|
|
|
if (word == "true" || word == "false") {
|
|
return token {
|
|
token_type::Boolean,
|
|
"mono_src",
|
|
word,
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
|
|
if (std::find(keywords.begin(), keywords.end(), word) != keywords.end()) {
|
|
return token {
|
|
token_type::Keyword,
|
|
"mono_src",
|
|
word,
|
|
start_line,
|
|
start_col
|
|
};
|
|
} else {
|
|
return token {
|
|
token_type::Identifier,
|
|
"mono_src",
|
|
word,
|
|
start_line,
|
|
start_col
|
|
};
|
|
}
|
|
}
|
|
|
|
std::optional<token> read_punctuation_token() {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
auto val = res.value();
|
|
|
|
if (std::find(punctuations.begin(), punctuations.end(), val) != punctuations.end()) {
|
|
auto line = source.current_line();
|
|
auto col = source.current_col();
|
|
source.next();
|
|
return token {
|
|
token_type::Punctuation,
|
|
"mono_src",
|
|
std::string() + val,
|
|
line,
|
|
col
|
|
};
|
|
}
|
|
|
|
return std::nullopt;
|
|
}
|
|
|
|
std::optional<token> read_operator_token() {
|
|
std::stringstream buffer;
|
|
|
|
auto line = source.current_line();
|
|
auto col = source.current_col();
|
|
while(!source.eof()) {
|
|
auto res = source.peek();
|
|
|
|
if (!res.has_value()) {
|
|
break;
|
|
}
|
|
|
|
auto val = res.value();
|
|
|
|
if (!is_valid_character_from_set(operators, val)) {
|
|
break;
|
|
}
|
|
|
|
buffer.put(val);
|
|
source.next();
|
|
}
|
|
|
|
|
|
auto word = buffer.str();
|
|
|
|
if (std::find(operators.begin(), operators.end(), word) != operators.end()) {
|
|
return token {
|
|
token_type::Operator,
|
|
"mono_src",
|
|
word,
|
|
line,
|
|
col
|
|
};
|
|
}
|
|
|
|
//throw tokenizer_exception(std::format("Unknown operator '{}'", word), line, col);
|
|
return std::nullopt;
|
|
}
|
|
|
|
public:
|
|
tokenizer(std::string source) : source(std::move(source)) {
|
|
|
|
}
|
|
|
|
std::optional<token> read_token() {
|
|
skip_whitespace();
|
|
if (source.eof()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
auto result = source.peek();
|
|
if (!result.has_value()) {
|
|
return std::nullopt;
|
|
}
|
|
|
|
auto val = result.value();
|
|
|
|
if (val == '"') {
|
|
return read_string_token();
|
|
}
|
|
|
|
if (std::isdigit(static_cast<unsigned char>(val))) {
|
|
return read_decimal_token();
|
|
}
|
|
|
|
if (is_valid_identifier_start(val)) {
|
|
return read_identifier();
|
|
}
|
|
|
|
auto punctuation_res = read_punctuation_token();
|
|
if (punctuation_res.has_value()) {
|
|
return punctuation_res;
|
|
}
|
|
|
|
auto op_res = read_operator_token();
|
|
if (op_res.has_value()) {
|
|
return op_res;
|
|
}
|
|
|
|
throw tokenizer_exception(std::format("Unknown character '{}'", val), source.current_line(), source.current_col());
|
|
}
|
|
};
|
|
}
|