From b9921d410873b5255eef1e8ac18c715b834aeeb8 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sat, 10 Jan 2026 12:39:09 -0500 Subject: [PATCH 01/33] + xo-tokenizer2 xo-reader2 xo-expression2 xo-interpreter2 2nd gen schematika interpreter using fomo --- CMakeLists.txt | 37 + README.md | 1 - cmake/xo-bootstrap-macros.cmake | 33 + cmake/xo_tokenizer2Config.cmake.in | 12 + example/CMakeLists.txt | 1 + example/tokenrepl/CMakeLists.txt | 15 + example/tokenrepl/tokenrepl.cpp | 128 ++++ include/xo/tokenizer2/.gitkeep | 0 include/xo/tokenizer2/TkInputState.hpp | 230 +++++++ include/xo/tokenizer2/Token.hpp | 226 ++++++ include/xo/tokenizer2/Tokenizer.hpp | 167 +++++ include/xo/tokenizer2/TokenizerError.hpp | 114 ++++ include/xo/tokenizer2/buffer.hpp | 328 +++++++++ include/xo/tokenizer2/scan_result.hpp | 81 +++ include/xo/tokenizer2/span.hpp | 291 ++++++++ include/xo/tokenizer2/tokentype.hpp | 192 ++++++ src/tokenizer2/CMakeLists.txt | 15 + src/tokenizer2/TkInputState.cpp | 151 ++++ src/tokenizer2/Token.cpp | 259 +++++++ src/tokenizer2/Tokenizer.cpp | 836 +++++++++++++++++++++++ src/tokenizer2/TokenizerError.cpp | 60 ++ src/tokenizer2/scan_result.cpp | 43 ++ src/tokenizer2/tokentype.cpp | 74 ++ 23 files changed, 3293 insertions(+), 1 deletion(-) create mode 100644 CMakeLists.txt delete mode 100644 README.md create mode 100644 cmake/xo-bootstrap-macros.cmake create mode 100644 cmake/xo_tokenizer2Config.cmake.in create mode 100644 example/CMakeLists.txt create mode 100644 example/tokenrepl/CMakeLists.txt create mode 100644 example/tokenrepl/tokenrepl.cpp create mode 100644 include/xo/tokenizer2/.gitkeep create mode 100644 include/xo/tokenizer2/TkInputState.hpp create mode 100644 include/xo/tokenizer2/Token.hpp create mode 100644 include/xo/tokenizer2/Tokenizer.hpp create mode 100644 include/xo/tokenizer2/TokenizerError.hpp create mode 100644 include/xo/tokenizer2/buffer.hpp create mode 100644 include/xo/tokenizer2/scan_result.hpp create mode 100644 
include/xo/tokenizer2/span.hpp create mode 100644 include/xo/tokenizer2/tokentype.hpp create mode 100644 src/tokenizer2/CMakeLists.txt create mode 100644 src/tokenizer2/TkInputState.cpp create mode 100644 src/tokenizer2/Token.cpp create mode 100644 src/tokenizer2/Tokenizer.cpp create mode 100644 src/tokenizer2/TokenizerError.cpp create mode 100644 src/tokenizer2/scan_result.cpp create mode 100644 src/tokenizer2/tokentype.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..9eee1160 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,37 @@ +# xo-tokenizer2/CMakeLists.txt + +cmake_minimum_required(VERSION 3.10) + +project(xo_tokenizer2 VERSION 1.0) +enable_language(CXX) + +include(GNUInstallDirs) +include(cmake/xo-bootstrap-macros.cmake) + +xo_cxx_toplevel_options3() + +# ---------------------------------------------------------------- +# c++ settings + +# one-time project-specific c++ flags. usually empty +set(PROJECT_CXX_FLAGS "") +add_definitions(${PROJECT_CXX_FLAGS}) + +# ---------------------------------------------------------------- +# output targets + +add_subdirectory(src/tokenizer2) +add_subdirectory(example) +#add_subdirectory(utest) +xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets) + +if (XO_ENABLE_EXAMPLES) + install(TARGETS xo_tokenizer2_repl DESTINATION bin/xo/example/tokenizer2) +endif() + +# ---------------------------------------------------------------- +# docs targets depends on all the other library/utest targets +# +#add_subdirectory(docs) + +# end CMakeLists.txt diff --git a/README.md b/README.md deleted file mode 100644 index d64791cf..00000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# xo-tokenizer2 diff --git a/cmake/xo-bootstrap-macros.cmake b/cmake/xo-bootstrap-macros.cmake new file mode 100644 index 00000000..2cf387e5 --- /dev/null +++ b/cmake/xo-bootstrap-macros.cmake @@ -0,0 +1,33 @@ +# ---------------------------------------------------------------- +# for example: +# $ 
PREFIX=/usr/local # for example +# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build +# +# will get +# CMAKE_MODULE_PATH +# from xo-cmake-config --cmake-module-path +# +# and expect .cmake macros in +# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake +# ---------------------------------------------------------------- + +find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED) + +if (NOT XO_CMAKE_CONFIG_EXECUTABLE) # REQUIRED is only honored by cmake >= 3.18 (project minimum is 3.10), so check explicitly + message(FATAL_ERROR "could not find xo-cmake-config executable") +endif() + +if (NOT XO_SUBMODULE_BUILD) + if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL "prefix")) + # default to typical install location for xo-project-macros; strip trailing newline from tool output + execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path OUTPUT_VARIABLE CMAKE_MODULE_PATH OUTPUT_STRIP_TRAILING_WHITESPACE) + message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") + endif() +endif() + +# needs to have been installed somewhere on CMAKE_MODULE_PATH, +# (e.g. 
from xo-cmake with the same value for CMAKE_INSTALL_PREFIX) +# +include(xo_macros/xo_cxx) + +xo_cxx_bootstrap_message() diff --git a/cmake/xo_tokenizer2Config.cmake.in b/cmake/xo_tokenizer2Config.cmake.in new file mode 100644 index 00000000..b5c3cd5c --- /dev/null +++ b/cmake/xo_tokenizer2Config.cmake.in @@ -0,0 +1,12 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) + +# note: changes to find_dependency() calls here +# must coordinate with xo_dependency() calls +# in CMakeLists.txt +# +#find_dependency(xo_flatstring) + +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") +check_required_components("@PROJECT_NAME@") diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 00000000..e761ade5 --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(tokenrepl) diff --git a/example/tokenrepl/CMakeLists.txt b/example/tokenrepl/CMakeLists.txt new file mode 100644 index 00000000..e7a8c8f3 --- /dev/null +++ b/example/tokenrepl/CMakeLists.txt @@ -0,0 +1,15 @@ +# xo-tokenizer2/example/tokenrepl/CMakeLists.txt + +set(SELF_EXE xo_tokenizer2_repl) +set(SELF_SRCS tokenrepl.cpp) + +if (XO_ENABLE_EXAMPLES) + xo_add_executable(${SELF_EXE} ${SELF_SRCS}) + xo_self_dependency(${SELF_EXE} xo_tokenizer2) + xo_external_target_dependency(${SELF_EXE} replxx replxx::replxx) + + find_package(Threads REQUIRED) # replxx needs this + target_link_libraries(${SELF_EXE} PUBLIC Threads::Threads) +endif() + +# end CMakeLists.txt diff --git a/example/tokenrepl/tokenrepl.cpp b/example/tokenrepl/tokenrepl.cpp new file mode 100644 index 00000000..f97b9cd0 --- /dev/null +++ b/example/tokenrepl/tokenrepl.cpp @@ -0,0 +1,128 @@ +/** @file tokenrepl.cpp **/ + +#include +#include +#include +#include +#include +#include +#include +#include // for isatty + +// presumeably replxx assumes input is a tty +// +bool replxx_getline(bool interactive, + std::size_t parser_stack_size, + replxx::Replxx & rx, + std::string& input) +{ + using namespace 
std; + + char const * prompt = ""; + + if (interactive) { + if (parser_stack_size <= 1) + prompt = "> "; + else + prompt = ". "; + } + + const char * input_cstr = rx.input(prompt); + + bool retval = (input_cstr != nullptr); + + if (retval) { + //cerr << "got reval->true" << endl; + + input = input_cstr; + + } else { + //cerr << "got retval->false" << endl; + } + + rx.history_add(input); + + // we want tokenizer to see newline, it's syntax + input.push_back('\n'); + + return retval; +} + +#ifdef OBSOLETE +bool repl_getline(bool interactive, + std::istream & in, + std::ostream & out, + std::string & input) +{ + if (interactive) { + out << "> "; + std::flush(out); + } + + return static_cast(std::getline(in, input)); +} +#endif + +int +main() { + using xo::scm::Tokenizer; + using xo::scm::span; + using xo::scm::operator<<; + using replxx::Replxx; + + using namespace std; + + using span_type = span; + + xo::log_config::min_log_level = xo::log_level::severe; + + bool interactive = isatty(STDIN_FILENO); + + Replxx rx; + rx.set_max_history_size(1000); + rx.history_load("repl_history.txt"); + + Tokenizer tkz(xo::log_config::min_log_level <= xo::log_level::info); + + string input_str; + + size_t line_no = 1; + + constexpr std::size_t c_maxlines = 25; + + while ( + //repl_getline(interactive, cin, cout, input_str) // once upon a time + replxx_getline(interactive, 0 /*parser_stack_size*/, rx, input_str)) + { + span_type input = span_type::from_string(input_str); + + //cout << "input: " << input << endl; + + // reminder: input may contain multiple tokens + while (!input.empty()) { + auto [tk, consumed, error] = tkz.scan(input, false /*!eof*/); + + if (tk.is_valid()) { + cout << tk << endl; + } else if (error.is_error()) { + cout << "tokenizer error: " << endl; + error.report(cout); + + break; + } + + input = input.after_prefix(consumed); + } + + /* here: input.empty() or error encountered */ + + ++line_no; + + if (line_no > c_maxlines) { + cout << "always exit after " << 
c_maxlines << " lines of input" << endl; + break; + } + } +} + +/** end tokenrepl.cpp */ diff --git a/include/xo/tokenizer2/.gitkeep b/include/xo/tokenizer2/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/include/xo/tokenizer2/TkInputState.hpp b/include/xo/tokenizer2/TkInputState.hpp new file mode 100644 index 00000000..531585a1 --- /dev/null +++ b/include/xo/tokenizer2/TkInputState.hpp @@ -0,0 +1,230 @@ +/* @file TkInputState.hpp + * + * author: Roland Conybeare, Jun 2025 + */ + +#pragma once + +#include "span.hpp" + +namespace xo { + namespace scm { + /** enum to report outcome of @ref capture_current_line **/ + enum class input_error { + /** normal return, input line successfully identified and captured **/ + ok = 0, + /** incomplete input; should not have been submitted + * to @ref capture_current_line. + * note: submit last line of input with eof_flag=true + **/ + incomplete, + N + }; + + /** @class TkInputState + * @brief Track detailed input position for use in error messages + * + * input characters fall into two categories: + * - consumed: memory can be reclaimed/recycled + * - buffered: memory will be retained unaltered until consumed + * + * remarks: + * - always in one of two states: + * - empty + * - contains exactly one line of input + * - also record current input position. + * Use this for example to identify where tokenizer rejected input. + * - .current_pos advances by one token + * + * - buffered characters always form a single contiguous range. 
+ * - TkInputState does not own any storage; storage is owned elsewhere + * + * @text + * + * <------------------.current_line------------------> + * > <-- .whitespace + * cccccccccccccccccccccccccccccccc__TTTTTTTTxxxxxxxxx + * ^ ^ ^ + * .current_line.lo | .current_line.hi + * .current_pos + * + * <----prev_line----> <----current_line----> + * > <--whitespace + * ppppppppppppppppppp cccccccccccc__TTTTTTTT + * ^ + * + * @endtext + **/ + class TkInputState { + public: + /** @defgroup input-state-type-traits input-state type traits **/ + ///@{ + + using CharT = char; + + /** type representing a contiguous span of tokenizer input characters **/ + using span_type = span; + + ///@} + + public: + /** @defgroup input-state-ctors input_state constructors **/ + ///@{ + + TkInputState() = default; + explicit TkInputState(bool debug_flag) : debug_flag_{debug_flag} {} + /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. + * Introduced for unit tests, not used in tokenizer. + **/ + explicit TkInputState(const span& current_line, + size_t current_pos, + size_t whitespace) : current_line_{current_line}, + current_pos_{current_pos}, + whitespace_{whitespace} {} + + ///@} + + /** @defgroup input-state-static-methods input_state static methods **/ + ///@{ + + /** recognize the newline character '\n' **/ + static bool is_newline(CharT ch); + /** identifies whitespace chars. + * These are chars that do not belong to any token. + * They are not permitted to appear within + * a symbol or string token. + * Appearance of a whitespace char forces completion of + * preceding token. 
+ **/ + static bool is_whitespace(CharT ch); + + ///@} + + /** @defgroup input-state-access-methods **/ + ///@{ + +#pragma GCC diagnostic push +#ifndef __APPLE__ +#pragma GCC diagnostic ignored "-Wchanges-meaning" +#endif + const span_type & current_line() const { return current_line_; } +#pragma GCC diagnostic pop + size_t tk_start() const { return tk_start_; } + size_t current_pos() const { return current_pos_; } + size_t whitespace() const { return whitespace_; } + bool debug_flag() const { return debug_flag_; } + + ///@} + + /** @defgroup input-state-general-methods **/ + ///@{ + + /** Input state less @p n chars. + * Use to recover input state before a complete but error-triggering token + **/ + TkInputState rewind(std::size_t n) const; + + /** Capture prefix of @p input up to first newline. + * Set read position to start of line. + * + * Alters: + * .current_line + * .current_pos + * + * Return pair comprising error code and input span representing first line + * (including trailing newline) from @p input. + **/ + std::pair capture_current_line(const span_type & input, + bool eof_flag); + + /** atomically return current line while discarding it from input state + * + * Alters + * .current_line + * .current_pos + * .whitespace + **/ + span_type consume_current_line(); + + /** Reset input state for start of next line. + * Expression parser may use this to discard remainder of input line + * after a parsing error. + * + * Alters: + * .current_line + * .current_pos + * .whitespace + **/ + void discard_current_line(); + + /** Advance input position by @p z + * + * Alters: + * .current_pos + **/ + void advance(size_t z); + + /** Advance .current_pos to pos. + * Require: pos in @ref current_line_ + **/ + void advance_until(const CharT * pos); + + /** Skip prefix of input, starting at current read position, + * comprising only whitespace. 
+ * + * Presume input position is at end of token; + * on return @ref whitespace_ counts number of whitespace characters + * skipped. + * + * Return pointer to first non-whitespace character after @ref current_pos_ + * or @ref current_line_.hi if reached end of buffered line. + * + * Alters: + * .whitespace + **/ + const CharT * skip_leading_whitespace(); + + ///@} + + private: + /** @defgroup input-state-instance-vars input_state instance variables **/ + ///@{ + + /** remember current input line. Used only to report errors **/ + span current_line_ = span(); + /** start of last token within @ref current_line_ **/ + size_t tk_start_ = 0; + /** input position within @ref current_line_ **/ + size_t current_pos_ = 0; + /** number of whitespace chars since end of preceding token, + * or last newline, whichever is less + **/ + size_t whitespace_ = 0; + + /** true to log input activity */ + bool debug_flag_ = false; + + ///@} + }; /*TkInputState*/ + + inline std::ostream & + operator<<(std::ostream & os, + const TkInputState & x) + { + using xo::print::unq; + // NOTE(review): as written this streams an empty string and never uses unq or x -- confirm body was not lost + os << ""; + + return os; + } + } /*namespace scm*/ +} /*namespace xo*/ + +/* end TkInputState.hpp */ diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp new file mode 100644 index 00000000..0994e3b8 --- /dev/null +++ b/include/xo/tokenizer2/Token.hpp @@ -0,0 +1,226 @@ +/* file Token.hpp + * + * author: Roland Conybeare, Jul 2024 + */ + +#pragma once + +#include "tokentype.hpp" +#include "xo/indentlog/print/tag.hpp" +#include +#include +#include +#include + +namespace xo { + namespace scm { + namespace detail { + /* compute a * b^p, p >= 0, by repeated squaring; returns a unchanged when p <= 0 */ + constexpr double + pow_aux(double a, double b, int p) { + while (p > 0) { + if (p % 2 == 1) { + /* a * b^p = a * b^(2q + 1) = a.b * b^(2q) */ + a *= b; + p -= 1; + } else { + /* a * b^p = a * b^(2q) = a * (b^2)^q */ + b = b * b; + p /= 2; + } + } + + /* a * b^0 = a */ + return a; + } + + constexpr double + pow10(int p) { + if (p >= 0) + return 
pow_aux(1.0, 10.0, p); + else + return 1.0 / pow_aux(1.0, 10.0, -p); + } + } + + /** @class Token + * @brief Represent a Schematika lexical token + **/ + class Token { + public: + /** @defgroup token-ctors token constructors **/ + ///@{ + + /** default ctor creates token with type @c tk_invalid **/ + Token() = default; + /** create token with type @c tk_type and input text @c text **/ + Token(tokentype tk_type, const std::string & text = "") + : tk_type_{tk_type}, text_{text} {} + + /** create invalid token (same as null ctor, but explicit) **/ + static Token invalid() { return Token(); } + /** Create token representing a boolean literal from text @p txt + * @p txt must be @c true or @c false + **/ + static Token bool_token(const std::string & txt) { + return Token(tokentype::tk_bool, txt); + } + /** Create token representing 64-bit signed integer literal parsed from decimal @p txt. + * The string @p txt must be a decimal integer literal, since @ref i64_value re-parses @p txt. + **/ + static Token i64_token(const std::string & txt) { + return Token(tokentype::tk_i64, txt); + } + /** create token representing 64-bit floating-point literal parsed from decimal @p txt + * The string @p txt must be a decimal floating-point literal, since @ref f64_value re-parses @p txt. + **/ + static Token f64_token(const std::string & txt) { + return Token(tokentype::tk_f64, txt); + } + /** create token representing literal string parsed from @p txt **/ + static Token string_token(const std::string & txt) { + return Token(tokentype::tk_string, txt); + } + /** create token representing a symbol parsed from @p txt. + * Note that not all strings are valid symbol names. 
+ **/ + static Token symbol_token(const std::string & txt) { + return Token(tokentype::tk_symbol, txt); + } + /** token representing left angle bracket @c "<" **/ + static Token leftangle() { return Token(tokentype::tk_leftangle); } + /** token representing right angle bracket @c ">" **/ + static Token rightangle() { return Token(tokentype::tk_rightangle); } + /** token representing left parenthesis @c "(" **/ + static Token leftparen() { return Token(tokentype::tk_leftparen); } + /** Token representing right parenthesis @c ")" **/ + static Token rightparen() { return Token(tokentype::tk_rightparen); } + /** token representing left bracket @c "[" **/ + static Token leftbracket() { return Token(tokentype::tk_leftbracket); } + /** token representing right bracket @c "]" **/ + static Token rightbracket() { return Token(tokentype::tk_rightbracket); } + /** token representing left brace @c "{" **/ + static Token leftbrace() { return Token(tokentype::tk_leftbrace); } + /** token representing right brace @c "}" **/ + static Token rightbrace() { return Token(tokentype::tk_rightbrace); } + /** token representing period @c "." 
**/ + static Token dot() { return Token(tokentype::tk_dot); } + /** token representing comma @c "," **/ + static Token comma() { return Token(tokentype::tk_comma); } + /** token representing colon @c ":" **/ + static Token colon() { return Token(tokentype::tk_colon); } + /** token representing double-colon @c "::" **/ + static Token doublecolon() { return Token(tokentype::tk_doublecolon); } + /** token representing semicolon @c ";" **/ + static Token semicolon() { return Token(tokentype::tk_semicolon); } + /** token representing single-assignment @c "=" **/ + static Token singleassign() { return Token(tokentype::tk_singleassign); } + /** token representing unrestricted assignment @c ":=" **/ + static Token assign_token() { return Token(tokentype::tk_assign); } + /** token representing indirection @c "->" **/ + static Token yields() { return Token(tokentype::tk_yields); } + + /** token for @c "+" **/ + static Token plus_token() { return Token(tokentype::tk_plus); } + /** token for @c "-" **/ + static Token minus_token() { return Token(tokentype::tk_minus); } + /** token for @c "*" **/ + static Token star_token() { return Token(tokentype::tk_star); } + /** token for @c "/" **/ + static Token slash_token() { return Token(tokentype::tk_slash); } + + /** token representing keyword @c type **/ + static Token type() { return Token(tokentype::tk_type); } + /** token representing keyword @c def **/ + static Token def() { return Token(tokentype::tk_def); } + /** token representing keyword @c lambda **/ + static Token lambda() { return Token(tokentype::tk_lambda); } + /** token representing keyword @c if **/ + static Token if_token() { return Token(tokentype::tk_if); } + /** token representing keyword @c else **/ + static Token else_token() { return Token(tokentype::tk_else); } + /** token representing keyword @c let **/ + static Token let() { return Token(tokentype::tk_let); } + /** token representing keyword @c in **/ + static Token in() { return Token(tokentype::tk_in); } + 
/** token representing keyword @c end **/ + static Token end() { return Token(tokentype::tk_end); } + + ///@} + + /** @defgroup token-access-methods **/ + ///@{ + + tokentype tk_type() const { return tk_type_; } + const std::string & text() const { return text_; } + + ///@} + + /** @defgroup token-general-methods **/ + ///@{ + + /** true if token understood to represent valid input + * i.e. any token type except @c tk_invalid + **/ + bool is_valid() const { return tk_type_ != tokentype::tk_invalid; } + /** true for sentinel token with type tk_invalid **/ + bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; } + + /** true for tokens with variable text. false for those with fixed textual representation **/ + bool has_variable_text() const { return (tk_type_ == tokentype::tk_i64 + || tk_type_ == tokentype::tk_f64 + || tk_type_ == tokentype::tk_string + || tk_type_ == tokentype::tk_symbol); } + + /** expect input matching @c true or @c false **/ + bool bool_value() const; + + /** expect input matching @c [+|-][0-9][0-9]* **/ + std::int64_t i64_value() const; + + /** expect input matching @c [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]* **/ + double f64_value() const; + + /** print human-readable token representation on stream @p os **/ + void print(std::ostream & os) const; + + ///@} + + private: + /** @defgroup token-instance-vars **/ + ///@{ + + /** category for this token **/ + tokentype tk_type_ = tokentype::tk_invalid; + + /** characters comprising this token. 
+ * only provided for certain token types: + * + * tk_i64 + * tk_f64 + * tk_string + * tk_symbol + **/ + std::string text_; + + ///@} + }; + + inline std::ostream & + operator<< (std::ostream & os, + const Token & tk) + { + tk.print(os); + return os; + } + } /*namespace scm*/ + +#ifndef ppdetail_atomic + namespace print { + PPDETAIL_ATOMIC(xo::scm::token); + } +#endif + +} /*namespace xo*/ + +/* end Token.hpp */ diff --git a/include/xo/tokenizer2/Tokenizer.hpp b/include/xo/tokenizer2/Tokenizer.hpp new file mode 100644 index 00000000..99005fee --- /dev/null +++ b/include/xo/tokenizer2/Tokenizer.hpp @@ -0,0 +1,167 @@ +/* file Tokenizer.hpp + * + * author: Roland Conybeare, Jul 2024 + */ + +#pragma once + +#include "Token.hpp" +#include "TkInputState.hpp" +#include "span.hpp" +#include "scan_result.hpp" +#include "xo/indentlog/scope.hpp" +#include "xo/indentlog/print/ppdetail_atomic.hpp" +#include + +namespace xo { + namespace scm { + /** @class Tokenizer + * @brief Parse a Schematika character stream into lexical tokens + * + * Use: + * + * @code + * // see xo-tokenizer2/example/tokenrepl/tokenrepl.cpp + * // for exact working code + * + * using tokenizer_type = tokenizer; + * using span_type = tokenizer_type::span_type; + * + * tokenizer_type tkz; + * span_type input = ...; + * + * while (!input.empty()) { + * auto [tk, consumed, error] = tkz.scan(input); + * + * if (tk.is_valid()) { + * // do something with tk + * } else if (error.is_error()) { + * error.report(cout); + * break; + * } + * + * input = input.after_prefix(consumed); + * } + * + * if endofinput { + * auto [tk, consumed, error] = tzk.notify_eof() + * + * // do something with (final) tk if tk.is_valid() + * } + * + * @endcode + * + * See tokentype.hpp for token types + **/ + class Tokenizer { + public: + using CharT = char; + using token_type = Token; + using error_type = TokenizerError; + using span_type = span; + using input_state_type = TkInputState; + using result_type = scan_result; + + public: + 
/** @defgroup tokenizer-ctors tokenizer constructors **/ + ///@{ + + Tokenizer(bool debug_flag = false); + + ///@} + + /** @defgroup tokenizer-access-methods tokenizer access methods **/ + ///@{ + +#pragma GCC diagnostic push +#ifndef __APPLE__ +#pragma GCC diagnostic ignored "-Wchanges-meaning" +#endif + const TkInputState & input_state() const { return input_state_; } +#pragma GCC diagnostic pop + + ///@} + + /** @defgroup tokenizer-general-methods tokenizer methods **/ + ///@{ + + /** identifies punctuation chars. + * These are chars that are not permitted to appear within + * a symbol token. Instead they force completion of + * a preceding token, and start a new token with themselves + **/ + static bool is_1char_punctuation(CharT ch); + + /** more-relaxed version of is_1char_punctuation. + * Chars that are not permitted to appear within a symbol token, + * but may form token combined with next character + **/ + static bool is_2char_punctuation(CharT ch); + + /** assemble token from text @p token_text. + * @p initial_whitespace Amount of whitespace input being consumed from input. + * @p token_text subset of input_line representing a single token. + * @p p_input_state input state containing input_line. On exit current line cleared + * if error + * + * retval.consumed will represent some possibly-empty prefix of @p input + **/ + static scan_result assemble_token(std::size_t initial_whitespace, + const span_type & token_text, + TkInputState * p_input_state); + + /** degenerate version of assemble_token() on reaching end-of-file **/ + static scan_result assemble_final_token(const span_type & token_text, + TkInputState * p_input_state); + + /** true if tokenizer contains stored prefix of + * possibly-incomplete token + **/ + bool has_prefix() const { return !prefix_.empty(); } + + /** scan for next input token, given @p input. + * Note: + * - tokenizer can consume input (e.g. 
whitespace) + * without completing a token + * - input will remember the extent of the last line of input + * for which parsing has begun, but not completed. + * It's required that at least that portion of the input span + * remain valid across scan() calls + * @p eof_flag pass true when submitting the last line of input + * @return {parsed token, consumed span, error} + **/ + scan_result scan(const span_type & input, + bool eof_flag); + + /** discard current line after error. Just cleans up error-reporting state **/ + void discard_current_line(); + + ///@} + + private: + /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ + ///@{ + + /** track input state (line#,pos,..) for error messages. + * There's an ordering problem here: + * 1. input_state_.skip_leading_whitespace() advances + * current line automagically when it sees \n + * 2. need to capture value of @ref input_state_ _before_ newline + * 3. but need newline to end token + * Also recall input_state_type needed for reporting errors. + **/ + input_state_type input_state_; + /** Accumulate partial token here. 
+ * This will happen if input sent to @ref Tokenizer::scan + * ends without whitespace such that last available token's + * extent is not determined + **/ + std::string prefix_; + + ///@} + }; /*tokenizer*/ + + } /*namespace scm*/ +} /*namespace xo*/ + +/* end Tokenizer.hpp */ diff --git a/include/xo/tokenizer2/TokenizerError.hpp b/include/xo/tokenizer2/TokenizerError.hpp new file mode 100644 index 00000000..a7fab3c2 --- /dev/null +++ b/include/xo/tokenizer2/TokenizerError.hpp @@ -0,0 +1,114 @@ +/* file TokenizerError.hpp + * + * author: Roland Conybeare, Jun 2025 + */ + +#pragma once + +#include "TkInputState.hpp" +#include "tokentype.hpp" +#include "span.hpp" +#include + +namespace xo { + namespace scm { + /** @class TokenizerError + * @brief represent a lexing error, with context + * + * Not a template: character type is fixed by the member alias CharT = char + **/ + class TokenizerError { + public: + using CharT = char; + using span_type = span; + + public: + /** @defgroup tokenizer-error-ctors **/ + ///@{ + + /** Default ctor represents a not-an-error sentinel object **/ + TokenizerError() = default; + /** Constructor to capture parsing error context + * @p input_state input state on entry to scanner; its current_pos is what @ref tk_start reports + * @p error_pos error location relative to token start + **/ + TokenizerError(const char * src_function, + std::string error_description, + const TkInputState & input_state, + size_t error_pos) + : src_function_{src_function}, + error_description_{std::move(error_description)}, + input_state_{input_state}, + error_pos_{error_pos} + { + scope log(XO_DEBUG(input_state.debug_flag())); + + log && log(xtag("input_state.current_pos", input_state.current_pos()), + xtag("error_pos", error_pos)); + } + ///@} + + /** @defgroup tokenizer-error-access-methods **/ + ///@{ + + const char * src_function() const { return src_function_; } + const std::string & error_description() const { return error_description_; } +#pragma GCC diagnostic push +#ifndef __APPLE__ +#pragma GCC diagnostic ignored 
"-Wchanges-meaning" +#endif + const TkInputState & input_state() const { return input_state_; } +#pragma GCC diagnostic pop + size_t tk_start() const { return input_state_.current_pos(); } + size_t whitespace() const { return input_state_.whitespace(); } + size_t error_pos() const { return error_pos_; } + + ///@} + + /** @defgroup tokenizer-error-general-methods **/ + ///@{ + + /** true, except for a sentinel error object **/ + bool is_error() const { return !error_description_.empty(); } + /** false except for object in sentinel state **/ + bool is_not_an_error() const { return error_description_.empty(); } + + /** Print representation to stream @p os. Intended for tokenizer diagnostics. + * For Schematika errors prefer @ref report + **/ + void print(std::ostream & os) const; + + /** Print human-oriented error report on @p os. **/ + void report(std::ostream & os) const; + + ///@} + + private: + /** @defgroup tokenizer-error-vars **/ + ///@{ + + /** source location (in tokenizer) at which error identified **/ + char const * src_function_ = nullptr; + /** static error description **/ + std::string error_description_; + /** input state associated with this error. + * Sufficient to precisely locate it with context. 
+ **/ + TkInputState input_state_; + /** position of error, relative to token start **/ + size_t error_pos_ = 0; + + ///@} + }; /*TokenizerError*/ + + inline std::ostream & + operator<< (std::ostream & os, + const TokenizerError & tkerr) + { + tkerr.print(os); + return os; + } + } /*namespace scm*/ +} /*namespace xo*/ + +/* end TokenizerError.hpp */ diff --git a/include/xo/tokenizer2/buffer.hpp b/include/xo/tokenizer2/buffer.hpp new file mode 100644 index 00000000..7b19316b --- /dev/null +++ b/include/xo/tokenizer2/buffer.hpp @@ -0,0 +1,328 @@ +/** @file buffer.hpp **/ + +#pragma once + +#include "span.hpp" +#include +#include +#include +#include + +namespace xo { + namespace scm { + /** + * @class buffer buffer.hpp + * + * @brief Container for a (possibly owned) FIFO queue of chars + * + * @tparam CharT. buffer element type. + * + * @code + * .buf + * + * +------------------------------------------+ + * | | ... | | X| ... | X| | ... | | + * +------------------------------------------+ + * ^ ^ ^ ^ + * 0 .lo .hi .buf_z + * + * <-contents-><----avail-----> + * @endcode + * + * Buffer does not support wrapped content: + * content that has not been consumed always occupies contiguous memory. + * + * Example: + * @code + * // 1. 
+ * buffer buf(64*1024); + * buf.empty() -> true + * buf.buf_z() -> 65536 + * buf.lo_pos() -> 0 + * buf.hi_pos() -> 65536 + * buf.contents() -> empty span + * buf.avail() -> span entire buffer memory + * + * // write to (a prefix of) buf.avail() + * ::strncpy(buf.buf(), "hello, world\n", 13); + * buf.produce(span_type(buf.buf(), buf.buf() + 13)); + * + * buf.lo_pos() -> 0 + * buf.hi_pos() -> 13 + * buf.contents() -> "hello, world\n"; + * + * + * // examine stored content (does not change buffer state) + * auto span = buf.contents(); + * cerr << string_view(span.lo(), span.hi()); // "hello, world\n" + * + * // consume (a prefix of) stored content + * buf.consume(span.prefix(7); + * + * buf.lo_pos() -> 7 + * buf.hi_pos() -> 13 + * buf.contents() -> "world\n" + * + * // consuming all remain content resets to original state + * buf.consume(buf.contents()); + * + * buf.empty() -> true + * buf.hi_pos() -> 0 // not 13! + * + * // 2. + * buffer buf; + * buf.empty() -> true + * buf.buf_z() -> 0 + * buf.lo_pos() -> 0 + * buf.hi_pos() -> 0 + * buf.contents() -> empty span + * buf.avail() -> empty span + * + * // allocate memory separately from ctor + * buf.alloc(64*1024); + * @endcode + **/ + template + class buffer { + public: + /** @brief typealias for span of CharT **/ + using span_type = span; + /** @brief typealias for buffer size (counts CharT's, not bytes) **/ + using size_type = std::uint64_t; + + public: + /** @brief create empty buffer. + + Does not allocate any storage; @see alloc + **/ + buffer() = default; + /** @brief create empty buffer, and possibly allocate storage. + + @param buf_z Buffer size. allocate storage (owned by this buffer) if >0. + @param align_z Align to this value, e.g. 8 to align storage on an 8-byte boundary + **/ + buffer(size_type buf_z, + size_type align_z = sizeof(char)) + : is_owner_{true}, + buf_{buf_z ? 
(new (std::align_val_t(align_z)) CharT [buf_z]) : nullptr}, + buf_z_{buf_z}, + lo_pos_{0}, + hi_pos_{0} + {} + /** @brief buffer is not copyable **/ + buffer(buffer const & x) = delete; + /** @brief destructor. Release storage if owned **/ + ~buffer() { this->reset(); } + + /** @name Access methods **/ + ///@{ + + /** @brief start of buffer memory **/ + CharT * buf() const { return buf_; } + /** @brief buffer size (number of characters) **/ + size_type buf_z() const { return buf_z_; } + /** @brief current start position within buffer **/ + size_type lo_pos() const { return lo_pos_; } + /** @brief current end position within buffer **/ + size_type hi_pos() const { return hi_pos_; } + + ///@} + + /** @brief readonly access to a single buffer element. + + Relative to start of buffer (ignores current consume position) + **/ + CharT const & operator[](size_type i) const { return buf_[i]; } + + /** @brief return span for current buffer contents **/ + span_type contents() const { return span_type(buf_ + lo_pos_, + buf_ + hi_pos_); } + /** @brief returns span for writable buffer contents (unused prefix following produce position **/ + span_type avail() const { return span_type(buf_ + hi_pos_, + buf_ + buf_z_); } + + /** @brief @c true iff buffer is empty **/ + bool empty() const { return lo_pos_ == hi_pos_; } + + + /** + @brief update buffer produce position, after (independently) writing contents of span to it + + @pre left endpoint of @p span equals buffer produce position (@c .hi_pos) + @pre right endpoint of @p span within bounds of buffer memory range + @post right endpoint of @p span equals buffer produce position. 
+ **/ + void produce(span_type const & span) { + assert(span.lo() == buf_ + hi_pos_); + + hi_pos_ += span.size(); + } + + /** + @brief update buffer consume position, when done with contents of span + + @pre left endpoint of @p span equals buffer consume position (@c .lo_pos) + @pre right endpoint of @p span within bounds of buffer memory range + @post Either + buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0. + buffer is non-empty, right endpoint of @p span equals new buffer consume position. + **/ + void consume(span_type const & span) { + if (span.size()) { + assert(span.lo() == buf_ + lo_pos_); + + lo_pos_ += span.size(); + } else { + /* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos, + * we don't want to blow up when called with an empty span -- argument + * may represent some pre-reset location in buffer + */ + } + + if (lo_pos_ == hi_pos_) { + lo_pos_ = 0; + hi_pos_ = 0; + } + } + + /** + @brief allocate buffer with desired amount of memory + + @param buf_z desired buffer size + @param align_z alignment; buffer memory will be aligned on this byte-boundary. + **/ + void alloc(size_type buf_z, size_type align_z = sizeof(char)) { + /* properly reset (+ discard) any existing state */ + this->reset(); + + is_owner_ = true; + if (buf_z) + buf_ = new (std::align_val_t(align_z)) CharT [buf_z]; + buf_z_ = buf_z; + lo_pos_ = 0; + hi_pos_ = 0; + } + + /** + @brief attach buffer to (unowned) range of @p buf_z bytes starting at @p buf[0] + + Buffer is not responsible for managing storage. + + @post + 1. buffer is empty + @post + 2. buffer read position = buffer write position = 0 + **/ + void setbuf(CharT * buf, size_type buf_z) { + /* properly reset (+ discard) any existing state */ + this->reset(); + + is_owner_ = false; + lo_pos_ = 0; + hi_pos_ = 0; + buf_ = buf; + buf_z_ = buf_z; + } + + /** + @brief revert buffer to empty state and possibly zero it + + @param zero_buffer_flag Zero buffer contents iff this is true + + @post + 1. 
buffer is empty + @post + 2. buffer read position = buffer write position = 0 + **/ + void clear2empty(bool zero_buffer_flag) { + if (buf_ && zero_buffer_flag) + explicit_bzero(buf_, buf_z_ * sizeof(CharT)); + + lo_pos_ = 0; + hi_pos_ = 0; + } + + /** + @brief swap representation with another buffer instance. + **/ + void swap (buffer & x) { + std::swap(is_owner_, x.is_owner_); + std::swap(buf_, x.buf_); + std::swap(buf_z_, x.buf_z_); + std::swap(lo_pos_, x.lo_pos_); + std::swap(hi_pos_, x.hi_pos_); + } + + /** + @brief reset buffer to an empty state and recover owned storage + **/ + void reset() { + if (is_owner_ && buf_) + delete [] buf_; + + is_owner_ = false; + buf_ = nullptr; + buf_z_ = 0; + lo_pos_ = 0; + hi_pos_ = 0; + } + + /** + @brief move-assignment operator. + @param x right-hand-side to move from. + + @post + @p x is in a valid, empty, + **/ + buffer & operator= (buffer && x) { + is_owner_ = x.is_owner_; + buf_ = x.buf_; + buf_z_ = x.buf_z_; + lo_pos_ = x.lo_pos_; + hi_pos_ = x.hi_pos_; + + x.is_owner_ = false; + x.lo_pos_ = 0; + x.hi_pos_ = 0; + x.buf_ = nullptr; + x.buf_z_ = 0; + + return *this; + } + + /** @brief buffer is not assignable */ + buffer & operator= (buffer & x) = delete; + + private: + /** @brief true iff buffer is responsible for freeing storage at @c buf_ **/ + bool is_owner_ = false; + /** @brief buffer contents. 
buffer memory comprises @c buf_[0] to @c buf_[buf_z_] **/ + CharT * buf_ = nullptr; + /** @brief buffer size (in units of CharT) **/ + size_type buf_z_ = 0; + + /** @brief buffer read (consume) position + + @invariant + 0 <= lo_pos_ <= hi_pos_ < buf_z_ + **/ + size_type lo_pos_ = 0; + /** @brief buffer write (produce) position + + @invariant + 0 <= hi_pos_ < hi_pos_ < buf_z_ + **/ + size_type hi_pos_ = 0; + }; + + /** @brief Overload for @c swap, so that @c buffer swappable **/ + template + inline void + swap(buffer & lhs, + buffer & rhs) { + lhs.swap(rhs); + } + } /*namespace scm*/ +} /*namespace xo*/ + +/* end buffer.hpp */ diff --git a/include/xo/tokenizer2/scan_result.hpp b/include/xo/tokenizer2/scan_result.hpp new file mode 100644 index 00000000..971e4b93 --- /dev/null +++ b/include/xo/tokenizer2/scan_result.hpp @@ -0,0 +1,81 @@ +/* file scan_result.hpp + * + * author: Roland Conybeare, Jun 2025 + */ + +#pragma once + +#include "Token.hpp" +#include "TokenizerError.hpp" +#include "TkInputState.hpp" + +namespace xo { + namespace scm { + /** @class scan_result + * @brief Represent result of parsing one input token. 
+         *
+         *  @code
+         *  Possible outcomes fall into several categories
+         *  (with T: @c token_.is_valid(),  E: @c error_.is_error())
+         *
+         *  | T     | E     | description                         |
+         *  |-------+-------+-------------------------------------|
+         *  | false | false | end of input, including end of line |
+         *  | true  | false | parsed token in T                   |
+         *  | false | true  | parse error in E                    |
+         *
+         *  @endcode
+         **/
+        class scan_result {
+        public:
+            using CharT = char;
+            using token_type = Token;
+            /* read-only view of input text; element type matches the
+             * (const CharT *) endpoints produced by TkInputState
+             */
+            using span_type = span<const CharT>;
+            using error_type = TokenizerError;
+            using input_state_type = TkInputState;
+
+        public:
+            /** Assemble a result from its parts.
+             *  @p token     parsed token (invalid when nothing was parsed)
+             *  @p consumed  input span accounted for by this result
+             *  @p error     error description; defaults to a default-constructed TokenizerError
+             **/
+            scan_result(const Token & token,
+                        const span_type & consumed,
+                        const TokenizerError & error = TokenizerError())
+                : token_{token}, consumed_{consumed}, error_{error} {}
+
+            static scan_result make_whitespace(const span_type & prefix_input);
+            static scan_result make_partial(const span_type & prefix_input);
+            /**
+             *  @p error_src  can be __FUNCTION__ from site where error generated.
+             *  @p error_msg  error message
+             *  @p error_pos  error position, relative to start of token
+             *  @p input_state_ref  input state object;
+             *     copied into scan_result, and leaving input_state_ref.current_line cleared
+             **/
+            static scan_result make_error_consume_current_line(const char * error_src,
+                                                               std::string error_msg,
+                                                               size_t error_pos,
+                                                               input_state_type & input_state_ref);
+
+            /** true iff neither a token nor an error is present (row 1 of table above) **/
+            bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); }
+            /** true iff a token was parsed **/
+            bool is_token() const { return token_.is_valid(); }
+            /** true iff an error was recorded **/
+            bool is_error() const { return error_.is_error(); }
+
+            const Token & get_token() const { return token_; }
+            const span_type & consumed() const { return consumed_; }
+            const TokenizerError & error() const { return error_; }
+
+        public:
+            /** Successfully parsed token, whenever tk_type != tokentype::tk_invalid.
+             *  Will be tokentype::tk_invalid in normal course of events for valid input,
+             *  when consuming whitespace
+             **/
+            token_type token_;
+            /** input span represented by .token, on success. 
Otherwise not defined **/
+            span_type consumed_;
+            /** error description, whenever .error_.is_error() is true **/
+            TokenizerError error_;
+        };
+
+
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end scan_result.hpp */
diff --git a/include/xo/tokenizer2/span.hpp b/include/xo/tokenizer2/span.hpp
new file mode 100644
index 00000000..8cf7a4a7
--- /dev/null
+++ b/include/xo/tokenizer2/span.hpp
@@ -0,0 +1,291 @@
+/** @file span.hpp **/
+
+#pragma once
+
+#include "xo/indentlog/scope.hpp"
+#include "xo/indentlog/print/ppdetail_atomic.hpp"
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+
+namespace xo {
+    namespace scm {
+        /** @class span span.hpp
+         *
+         *  @brief A contiguous range of characters, without ownership.
+         *
+         *  @tparam CharT  type for elements referred to by this span.
+         **/
+        template <typename CharT>
+        class span {
+        public:
+            /** @defgroup span-type-traits span type traits **/
+            ///@{
+
+            /** typealias for span size (in units of CharT) **/
+            using size_type = std::uint64_t;
+
+            ///@}
+
+        public:
+            /** @defgroup span-ctors span constructors **/
+            ///@{
+
+            /** null span **/
+            span() : lo_{nullptr}, hi_{nullptr} {}
+
+            /** Create span for the contiguous memory range [@p lo, @p hi) **/
+            span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
+
+            /** conversion from span<OtherT>, when OtherT* converts to CharT*
+             *  (e.g. span<char> -> span<const char>)
+             **/
+            template <typename OtherT>
+            span(const span<OtherT> & other,
+                 std::enable_if_t<std::is_convertible_v<OtherT *, CharT *>
+                                  && !std::is_same_v<OtherT, CharT>> * = nullptr)
+                : lo_{other.lo()}, hi_{other.hi()} {}
+
+            /** copy ctor (explicit to avoid ambiguity with template ctor) **/
+            span(const span & other) = default;
+            span & operator=(const span & other) = default;
+
+            /** Create a null span (i.e. with null @p lo, @p hi pointers)
+             *  A null span can be concatenated with any other span
+             *  without triggering matching-endpoint asserts.
+             **/
+            static span make_null() { return span(static_cast<CharT *>(nullptr), static_cast<CharT *>(nullptr)); }
+
+            /** @brief create span for C-style string @p cstr
+             *
+             *  Only usable when CharT is const-qualified char
+             *  (the endpoints are taken directly from the const argument).
+             *  A null @p cstr yields a null span.
+             **/
+            static span from_cstr(const CharT * cstr) {
+                CharT * lo = cstr;
+                CharT * hi = cstr ? cstr + strlen(cstr) : nullptr;
+
+                return span(lo, hi);
+            }
+
+            /** @brief create span from std::string @p str
+             *
+             *  Span refers to storage owned by @p str; it is invalidated
+             *  whenever @p str is modified or destroyed.
+             **/
+            static span from_string(const std::string& str) {
+                /* use data()/size(): dereferencing str.end() is undefined behavior */
+                CharT * lo = str.data();
+                CharT * hi = str.data() + str.size();
+
+                return span(lo, hi);
+            }
+
+            /** @brief concatenate two contiguous spans */
+            static span concat(const span & span1, const span & span2) {
+                if (span1.is_null())
+                    return span2;
+                if (span2.is_null())
+                    return span1;
+
+                if (span1.hi() != span2.lo()) {
+                    scope log(XO_DEBUG(true));
+
+                    log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo()));
+                }
+
+                assert(span1.hi() == span2.lo());
+
+                CharT * lo = span1.lo();
+                CharT * hi = span2.hi();
+
+                return span(lo, hi);
+            }
+
+            ///@}
+
+            /** @defgroup span-access-methods **/
+            ///@{
+
+            CharT * lo() const { return lo_; } /* get member span::lo_ */
+            CharT * hi() const { return hi_; } /* get member span::hi_ */
+
+            ///@}
+
+            /** @defgroup span-general-methods **/
+            ///@{
+
+            /** @brief strip prefix until first occurrence of '\n', including the newline.
+             *
+             *  Span becomes empty (lo = hi) if it contains no newline.
+             **/
+            void discard_until_newline() {
+                /* iterate with CharT* (not const CharT*) so assignment to lo_
+                 * also compiles for spans over non-const elements
+                 */
+                for (CharT * p = lo_; p < hi_; ++p) {
+                    if (*p == '\n') {
+                        lo_ = p + 1;
+                        return;
+                    }
+                }
+
+                lo_ = hi_;
+            }
+
+            /** Create new span over supplied type,
+             *  with identical (possibly misaligned) endpoints.
+             *
+             *  @warning
+             *  1. New span uses exactly the same memory addresses.
+             *     Endpoint pointers may not be aligned.
+             *  2. Implementation assumes code compiled with
+             *     @code -fno-strict-aliasing @endcode enabled.
+             *
+             *  @tparam OtherT  element type for new span
+             **/
+            template <typename OtherT>
+            span<OtherT>
+            cast() const { return span<OtherT>(reinterpret_cast<OtherT *>(lo_),
+                                               reinterpret_cast<OtherT *>(hi_)); }
+
+            /** @brief create span including the first @p z members of this span.
+             *
+             *  @p z is clamped to @c size(), so result never extends past @c hi().
+             **/
+            span prefix(size_type z) const {
+                if (z > size())
+                    z = size();
+
+                return span(lo_, lo_ + z);
+            }
+
+            /** @brief create span representing prefix up to (but not including) @p *p
+             *
+             *  @p p is clamped to @c [lo(), hi()].
+             **/
+            span prefix_upto(CharT * p) const {
+                if (p < lo_)
+                    return span(lo_, lo_);
+                else if (p <= hi_)
+                    return span(lo_, p);
+                else
+                    return span(lo_, hi_);
+            }
+
+            /** @brief create span with first @p z members of this span removed **/
+            span after_prefix(size_type z) const {
+                /* compare sizes rather than pointers: (lo_ + z) may not be a valid pointer */
+                if (z > size())
+                    z = size();
+
+                return span(lo_ + z, hi_);
+            }
+
+            /** @brief create span with @p prefix of this span removed
+             *
+             *  @throws std::runtime_error if non-null @p prefix does not start at @c lo()
+             **/
+            span after_prefix(const span & prefix) const {
+                if (!prefix.is_null() && (prefix.lo() != lo_)) {
+                    throw std::runtime_error
+                        ("after_prefix: expected prefix of this span");
+                }
+
+                return after_prefix(prefix.size());
+            }
+
+            /** Create span starting with position @p p.
+             *  Does boundary checking; will return empty span if @p p is outside @c [lo_,hi_]
+             **/
+            span suffix_from(CharT * p) const {
+                if ((lo_ <= p) && (p <= hi_))
+                    return span(p, hi_);
+                else
+                    return span(hi_, hi_);
+            }
+
+            /** true iff this span is null.  distinct from empty. **/
+            bool is_null() const { return lo_ == nullptr && hi_ == nullptr; }
+            /** true iff this span is empty (comprises 0 elements). **/
+            bool empty() const { return lo_ == hi_; }
+            /** report the number of elements (of type CharT) in this span. **/
+            size_type size() const { return hi_ - lo_; }
+
+            /** increase extent of this span to include @p x.
+             *  Requires @c hi() == @c x.lo(), unless either side is null:
+             *  appending a null span is a no-op, and appending to a null span adopts @p x
+             *  (same rule as @ref concat).
+             **/
+            span & operator+=(const span & x) {
+                if (x.is_null()) {
+                    /* nothing to append */
+                } else if (this->is_null()) {
+                    *this = x;
+                } else if (hi_ == x.lo_) {
+                    hi_ = x.hi_;
+                } else {
+                    assert(false);
+                }
+
+                return *this;
+            }
+
+            /** print representation for this span on stream @p os **/
+            void print(std::ostream & os) const {
+                /* NOTE(review): original text of this statement was lost in extraction
+                 * (everything between '<' and '>' was stripped); reconstructed -- confirm format
+                 */
+                os << "<span"
+                   << xtag("addr", (void*)lo_)
+                   << xtag("size", size())
+                   << ">";
+            }
+            ///@}
+
+        private:
+            /** @defgroup span-instance-vars **/
+            ///@{
+
+            /** start of span.
+                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
+            **/
+            CharT * lo_ = nullptr;
+
+            /** @brief end of span.
+                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
+            **/
+            CharT * hi_ = nullptr;
+
+            ///@}
+        }; /*span*/
+
+        /** @defgroup span-operators **/
+        ///@{
+
+        /** compare spans for equality.
+         *  Two spans are equal iff both endpoints match exactly.
+         **/
+        template <typename CharT>
+        inline bool
+        operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
+            return ((lhs.lo() == rhs.lo())
+                    && (lhs.hi() == rhs.hi()));
+        }
+
+        /** compare spans for inequality.
+         *  Two spans are unequal if either paired endpoint differs.
+         **/
+        template <typename CharT>
+        inline bool
+        operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
+            return ((lhs.lo() != rhs.lo())
+                    || (lhs.hi() != rhs.hi()));
+        }
+
+        /** print a summary of @p x on stream @p os.  Intended for diagnostics **/
+        template <typename CharT>
+        inline std::ostream &
+        operator<<(std::ostream & os,
+                   const span<CharT> & x) {
+            x.print(os);
+            return os;
+        }
+
+        ///@}
+    } /*namespace scm*/
+
+    namespace print {
+        /** stream-insertable wrapper that prints the characters of a span (not its endpoints) **/
+        template <typename CharT>
+        class printspan_impl {
+        public:
+            printspan_impl(xo::scm::span<CharT> x) : span_{x} {}
+
+            xo::scm::span<CharT> span_;
+        };
+
+        template <typename CharT>
+        printspan_impl<CharT> printspan(const xo::scm::span<CharT>& span) {
+            return printspan_impl<CharT>(span);
+        }
+
+        template <typename CharT>
+        inline std::ostream &
+        operator<< (std::ostream & os,
+                    const printspan_impl<CharT> & x)
+        {
+            for (const CharT * p = x.span_.lo(); p < x.span_.hi(); ++p)
+                os << *p;
+
+            return os;
+        }
+
+#ifndef ppdetail_atomic
+        template <typename CharT> \
+        PPDETAIL_ATOMIC_BODY(printspan_impl<CharT>);
+
+        template <typename CharT> \
+        PPDETAIL_ATOMIC_BODY(xo::scm::span<CharT>);
+#endif
+
+    }
+} /*namespace xo*/
diff --git a/include/xo/tokenizer2/tokentype.hpp b/include/xo/tokenizer2/tokentype.hpp
new file mode 100644
index 00000000..eeeb7dd0
--- /dev/null
+++ b/include/xo/tokenizer2/tokentype.hpp
@@ -0,0 +1,192 @@
+/** @file tokentype.hpp
+ *
+ *  author: Roland Conybeare, Jul 2024
+ **/
+
+#pragma once
+
+#include "xo/indentlog/print/tag.hpp" // for STRINGIFY
+#include "xo/indentlog/print/ppdetail_atomic.hpp"
+#include <iostream>
+
+namespace xo {
+    namespace scm {
+        /** @enum tokentype
+         *  Enum to 
identify different schematika input token types
+         *
+         *  Schematika code examples:
+         *
+         *  @code
+         *    type point :: { xcoord : f64, ycoord : f64 };
+         *    type matrix :: array;  // 2-d array
+         *
+         *    decl hypot(x : f64, y : f64) -> f64;
+         *
+         *    def hypot(x : f64, y : f64) {
+         *      let
+         *        x2 = (x * x);
+         *        y2 = (y * y);
+         *        hypot2 = (x2 + y2);
+         *      in
+         *        sqrt(hypot2);
+         *    };
+         *
+         *    def someconst 4;
+         *
+         *    def foo(v : vec) {
+         *      def (pi : f64) = 3.1415926;
+         *      def (h : (f64,f64) -> f64) = hypot;
+         *
+         *      h = hypot3;
+         *    };
+         *
+         *    def matrixproduct(x : matrix, y : matrix) {
+         *      [i, j : x.row(i) * y.col(j)];
+         *    };
+         *  @endcode
+         **/
+        enum class tokentype {
+            /** sentinel value **/
+            tk_invalid = -1,
+
+            /** a boolean constant **/
+            tk_bool,
+
+            /** an integer constant (signed 64-bit integer) **/
+            tk_i64,
+
+            /** a 64-bit floating-point constant **/
+            tk_f64,
+
+            /** a string literal **/
+            tk_string,
+
+            /** a symbol **/
+            tk_symbol,
+
+            /** left-hand parenthesis @c '(' **/
+            tk_leftparen,
+
+            /** right-hand parenthesis @c ')' **/
+            tk_rightparen,
+
+            /** left-hand bracket @c '[' **/
+            tk_leftbracket,
+
+            /** right-hand bracket @c ']' **/
+            tk_rightbracket,
+
+            /** left-hand brace @c '{' **/
+            tk_leftbrace,
+
+            /** right-hand brace @c '}' **/
+            tk_rightbrace,
+
+            /** left-hand angle bracket @c '<' **/
+            tk_leftangle,
+
+            /** right-hand angle bracket @c '>' **/
+            tk_rightangle,
+
+            /** less-or-equal @c '<=' **/
+            tk_lessequal,
+
+            /** greater-or-equal @c '>=' **/
+            tk_greatequal,
+
+            /** dot @c '.' **/
+            tk_dot,
+
+            /** comma @c ',' **/
+            tk_comma,
+
+            /** colon @c ':' **/
+            tk_colon,
+
+            /** double-colon @c '::' **/
+            tk_doublecolon,
+
+            /** semi-colon @c ';' **/
+            tk_semicolon,
+
+            /** single equals sign @c '=' **/
+            tk_singleassign,
+
+            /** assignment @c ':=' **/
+            tk_assign,
+
+            /** indirection @c '->' **/
+            tk_yields,
+
+            /** note: operators not treated as punctuation
+             *        'do-always' is a legal variable name,
+             *        as is 'maybe*2', 'maybe+1', 'path/to/foo'
+             **/
+
+            /** operator @c '+' **/
+            tk_plus,
+            /** operator @c '-' **/
+            tk_minus,
+            /** operator @c '*' **/
+            tk_star,
+            /** operator @c '/' **/
+            tk_slash,
+
+            /** operator @c '==' **/
+            tk_cmpeq,
+            /** operator @c '!=' **/
+            tk_cmpne,
+
+            /** keyword @c 'type' **/
+            tk_type,
+
+            /** keyword @c 'def' **/
+            tk_def,
+
+            /** keyword @c 'lambda' **/
+            tk_lambda,
+
+            /** keyword @c 'if' **/
+            tk_if,
+
+            /** keyword @c 'then' **/
+            tk_then,
+
+            /** keyword @c 'else' **/
+            tk_else,
+
+            /** keyword @c 'let' **/
+            tk_let,
+
+            /** keyword @c 'in' **/
+            tk_in,
+
+            /** keyword @c 'end' **/
+            tk_end,
+
+            /** counts number of entries (excluding the tk_invalid sentinel); keep last **/
+            n_tokentype
+        }; /*tokentype*/
+
+        /** String representation for enum value.
+         *  For example @c tokentype_descr(tokentype::tk_if) -> @c "if"
+         **/
+        extern char const *
+        tokentype_descr(tokentype tk_type);
+
+        /** Print enum value for @p tk_type on stream @p os **/
+        inline std::ostream &
+        operator<< (std::ostream & os, tokentype tk_type) {
+            os << tokentype_descr(tk_type);
+            return os;
+        }
+    } /*namespace scm*/
+
+#ifndef ppdetail_atomic
+    namespace print {
+        PPDETAIL_ATOMIC(xo::scm::tokentype);
+    } /*namespace print*/
+#endif
+} /*namespace xo*/
+
+/* end tokentype.hpp */
diff --git a/src/tokenizer2/CMakeLists.txt b/src/tokenizer2/CMakeLists.txt
new file mode 100644
index 00000000..967535e2
--- /dev/null
+++ b/src/tokenizer2/CMakeLists.txt
@@ -0,0 +1,15 @@
+# tokenizer2/CMakeLists.txt
+
+set(SELF_LIB xo_tokenizer2)
+set(SELF_SRCS
+    Tokenizer.cpp
+    TokenizerError.cpp
+    TkInputState.cpp
+    scan_result.cpp
+    Token.cpp
+    tokentype.cpp)
+
+xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS})
+xo_dependency(${SELF_LIB} indentlog)
+
+# end CMakeLists.txt
diff --git a/src/tokenizer2/TkInputState.cpp b/src/tokenizer2/TkInputState.cpp
new file mode 100644
index 00000000..30db1dbb
--- /dev/null
+++ b/src/tokenizer2/TkInputState.cpp
@@ -0,0 +1,151 @@
+/** @file TkInputState.cpp
+ *
+ *  @author Roland Conybeare, Jun 2025
+ **/
+
+#include "TkInputState.hpp"
+
+namespace xo {
+    namespace scm {
+        using CharT = char;
+
+        /** true iff @p ch is the line terminator '\n' **/
+        bool
+        TkInputState::is_newline(CharT ch) {
+            return (ch == '\n');
+        }
+
+        /** true iff @p ch is one of: space, tab, newline, carriage return **/
+        bool
+        TkInputState::is_whitespace(CharT ch) {
+            switch(ch) {
+            case ' ': return true;
+            case '\t': return true;
+            case '\n': return true;
+            case '\r': return true;
+            }
+
+            return false;
+        }
+
+        /** Copy of this input state with position moved back by @p n chars
+         *  (stopping at start of line), and whitespace count zeroed.
+         **/
+        TkInputState
+        TkInputState::rewind(std::size_t n) const
+        {
+            return TkInputState(this->current_line_,
+                                (n <= current_pos_) ? current_pos_ - n : 0,
+                                0 /*whitespace*/);
+        }
+
+        /** Move current position forward by @p z chars **/
+        void
+        TkInputState::advance(size_t z)
+        {
+            scope log(XO_DEBUG(debug_flag_));
+
+            this->current_pos_ += z;
+
+            log && log(xtag("z", z), xtag("current_pos", current_pos_));
+        }
+
+        /** Move current position to @p pos, which must lie within the current line **/
+        void
+        TkInputState::advance_until(const CharT * pos)
+        {
+            scope log(XO_DEBUG(debug_flag_));
+
+            assert(current_line_.lo() <= pos && pos <= current_line_.hi());
+
+            this->current_pos_ = pos - current_line_.lo();
+
+            log && log(xtag("current_pos", current_pos_));
+        }
+
+        /** Return the current line, and forget it (see discard_current_line) **/
+        auto
+        TkInputState::consume_current_line() -> span_type
+        {
+            span_type retval = current_line_;
+
+            this->discard_current_line();
+
+            return retval;
+        }
+
+        /** Forget current line: null span, position and whitespace count zeroed **/
+        void
+        TkInputState::discard_current_line()
+        {
+            this->current_line_ = span_type::make_null();
+            this->current_pos_ = 0;
+            this->whitespace_ = 0;
+        }
+
+        /** Stash the first line of @p input as current line.
+         *
+         *  A line ends at (and includes) the first '\n'.  Input without a
+         *  newline is accepted as a complete line only when @p eof_flag is set.
+         *
+         *  @return (input_error::ok, captured line) on success;
+         *          (input_error::incomplete, empty span) when @p input holds
+         *          no newline and @p eof_flag is false.
+         **/
+        auto
+        TkInputState::capture_current_line(const span_type & input,
+                                           bool eof_flag)
+            -> std::pair<input_error, span_type>
+        {
+            // see also discard_current_line()
+            // note: must capture entirety of first line,
+            //       for example including leading whitespace.
+            //       See discussion in tokenizer scan() method
+
+            scope log(XO_DEBUG(debug_flag_));
+
+            /* look ahead to {end of line, end of input}, whichever comes first */
+            const CharT * sol = input.lo();
+            const CharT * eol = sol;
+
+            if (sol == current_line_.lo()) {
+                log && log("short-circuit - current line already stashed");
+
+                /* nothing to do here */
+                return std::make_pair(input_error::ok, current_line_);
+            }
+
+            while ((eol < input.hi()) && (*eol != '\n'))
+                ++eol;
+
+            /* eol may equal input.hi() here: must not read *eol in that case */
+            if ((eol < input.hi()) && (*eol == '\n')) {
+                /* include \n at end-of-line */
+                ++eol;
+            } else {
+                if (!eof_flag) {
+                    /* caller expected to provide complete line of input. complain and ignore */
+                    return std::make_pair(input_error::incomplete,
+                                          input.prefix(0ul));
+                }
+            }
+
+            this->current_line_ = span_type(sol, eol);
+            this->current_pos_ = 0;
+            this->whitespace_ = 0;
+
+            log && log(xtag("current_line", print::printspan(current_line_)),
+                       xtag("current_pos", current_pos_));
+
+            return std::make_pair(input_error::ok,
+                                  span_type(sol, eol));
+        }
+
+        /** Advance past whitespace starting at the current position.
+         *
+         *  Records number of chars skipped in whitespace_, and sets both
+         *  tk_start_ and current_pos_ to the first non-whitespace position
+         *  (or end of line).
+         *
+         *  @return pointer to first non-whitespace char, or end of current line
+         **/
+        const CharT *
+        TkInputState::skip_leading_whitespace()
+        {
+            scope log(XO_DEBUG(debug_flag_));
+
+            const CharT * ix = current_line_.lo() + current_pos_;
+
+            this->whitespace_ = 0;
+
+            /* skip whitespace + remember beginning of most recent line.
+             * bounds test comes first, so *ix is never read at end of line
+             */
+            while ((ix != current_line_.hi()) && is_whitespace(*ix)) {
+                ++ix;
+
+                ++(this->whitespace_);
+            }
+
+            this->tk_start_ = ix - current_line_.lo();
+            this->current_pos_ = ix - current_line_.lo();
+
+            return ix;
+        }
+
+
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end TkInputState.cpp */
diff --git a/src/tokenizer2/Token.cpp b/src/tokenizer2/Token.cpp
new file mode 100644
index 00000000..f228d56e
--- /dev/null
+++ b/src/tokenizer2/Token.cpp
@@ -0,0 +1,259 @@
+/** @file token.cpp
+ *
+ *  author: Roland Conybeare
+ **/
+
+#include "Token.hpp"
+#include "xo/indentlog/print/tag.hpp"
+
+namespace xo {
+    namespace scm {
+
+        /** Value of a tk_bool token.
+         *  @throws std::runtime_error if token is not tk_bool,
+         *          or its text is neither "true" nor "false"
+         **/
+        bool
+        Token::bool_value() const
+        {
+            if (tk_type_ != tokentype::tk_bool) {
+                throw (std::runtime_error
+                       (tostr("token::bool_value",
+                              ": token with type tk found where tk_bool expected",
+                              xtag("tk", tk_type_))));
+            }
+
+            if (text_ == "true")
+                return true;
+            if (text_ == "false")
+                return false;
+
+            throw (std::runtime_error
+                   (tostr("token::bool_value",
+                          ": unexpected input string tk_bool token",
+                          xtag("text", text_))));
+
+            return false;
+        }
+
+        std::int64_t
+        Token::i64_value() const
+        {
+            if (tk_type_ != tokentype::tk_i64) {
+                throw (std::runtime_error
+                       (tostr("token::i64_value",
+                              ": token with type tk found where tk_i64 expected",
+                              xtag("tk", tk_type_))));
+            }
+
+            if (text_.empty()) {
+                throw (std::runtime_error
+                       
(tostr("token::i64_value",
+                              ": unexpected empty input string for tk_i64 token")));
+            }
+
+            int sign = 1;
+            /* magnitude accumulates unsigned in 64 bits.
+             * (an int accumulator overflows on literals that fit in i64)
+             */
+            std::uint64_t magnitude = 0;
+            {
+                auto ix = text_.begin();
+                auto end_ix = text_.end();
+
+                char ch = *ix;
+
+                if (ch == '+') {
+                    ++ix;
+                } else if (ch == '-') {
+                    sign = -1;
+                    ++ix;
+                }
+
+                if (ix == end_ix) {
+                    throw (std::runtime_error
+                           (tostr("token::i64_value",
+                                  ": input text found where at least one digit expected",
+                                  xtag("text", text_))));
+                }
+
+                /* largest representable magnitude: 2^63 - 1 when positive, 2^63 when negative */
+                const std::uint64_t limit
+                    = static_cast<std::uint64_t>(INT64_MAX) + ((sign < 0) ? 1u : 0u);
+
+                for (; ix != end_ix; ++ix) {
+                    char ch = *ix;
+
+                    if ((ch >= '0') && (ch <= '9')) {
+                        const std::uint64_t digit = static_cast<std::uint64_t>(ch - '0');
+
+                        /* reject before (magnitude * 10 + digit) would exceed limit */
+                        if (magnitude > (limit - digit) / 10) {
+                            throw (std::runtime_error
+                                   (tostr("token::i64_value",
+                                          ": integer literal out of range for 64-bit signed integer",
+                                          xtag("text", text_))));
+                        }
+
+                        magnitude *= 10;
+                        magnitude += digit;
+                    } else {
+                        throw (std::runtime_error
+                               (tostr("token::i64_value",
+                                      ": unexpected char ch in integer token",
+                                      xtag("ch", ch))));
+                    }
+                }
+            }
+
+            if (sign < 0) {
+                /* magnitude 2^63 has no positive i64 counterpart; map it directly */
+                if (magnitude == static_cast<std::uint64_t>(INT64_MAX) + 1u)
+                    return INT64_MIN;
+
+                return -static_cast<std::int64_t>(magnitude);
+            }
+
+            return static_cast<std::int64_t>(magnitude);
+        } /*i64_value*/
+
+        /** Value of a tk_f64 token.
+         *
+         *  Accepts: optional sign, digits with at most one decimal point,
+         *  optional exponent (e|E, optional sign, at least one digit).
+         *
+         *  @throws std::runtime_error if token is not tk_f64, or text is malformed
+         **/
+        double
+        Token::f64_value() const
+        {
+            if (tk_type_ != tokentype::tk_f64) {
+                throw (std::runtime_error
+                       (tostr("token::f64_value",
+                              ": token with type tk found where tk_f64 expected",
+                              xtag("tk", tk_type_))));
+            }
+
+            if (text_.empty()) {
+                throw (std::runtime_error
+                       (tostr("token::f64_value",
+                              ": unexpected empty input string for tk_f64 token")));
+            }
+
+            int sign = 1;
+            /* integer representing denormalized unsigned mantissa
+             * (mantissa scaled by smallest power of 10 sufficient to make
+             *  it an integer)
+             */
+            std::int64_t mantissa = 0;
+            /* counts #of digits to the right of decimal point '.' */
+            int rh_digits = 0;
+            /* sign of exponent */
+            int exp_sign = 1;
+            /* value of exponent = integer to the right of 'e' or 'E' */
+            int exponent = 0;
+
+            /* floating-point value will represent
+             *   sign * mantissa * 10^(sign*exponent - rh_digits)
+             */
+            {
+                auto ix = text_.begin();
+                auto end_ix = text_.end();
+
+                char ch = *ix;
+
+                if (ch == '+') {
+                    ++ix;
+                } else if (ch == '-') {
+                    sign = -1;
+                    ++ix;
+                }
+
+                if (ix == end_ix) {
+                    throw (std::runtime_error
+                           (tostr("token::f64_value",
+                                  ": input text found where at least one digit expected",
+                                  xtag("text", text_))));
+                }
+
+                /* true iff decimal point '.' present in mantissa */
+                bool have_decimal_point = false;
+                /* true iff exponent prefix 'e' or 'E' present */
+                //bool have_exponent = false;
+                /* counts number of digits in mantissa
+                 * (both before and after, but not including, any decimal point
+                 */
+                int m_digits = 0;
+                /* digits to the left of decimal point */
+                int lh_digits = 0;
+
+                /* loop over mantissa digits */
+                for (; ix != end_ix; ++ix) {
+                    char ch = *ix;
+
+                    if (ch == '.') {
+                        if (have_decimal_point) {
+                            throw (std::runtime_error
+                                   (tostr("token::f64_value",
+                                          ": input text found where at most one decimal point expected",
+                                          xtag("text", text_))));
+                        }
+
+                        have_decimal_point = true;
+                        lh_digits = m_digits;
+                    } else if ((ch >= '0') && (ch <= '9')) {
+                        mantissa *= 10;
+                        mantissa += (ch - '0');
+                        ++m_digits;
+                    } else if (ch == 'e' || ch == 'E') {
+                        //have_exponent = true;
+                        break; // done with mantissa
+                    } else {
+                        /* report under this function's name (was mislabelled as i64_value) */
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": unexpected char ch in floating-point token",
+                                      xtag("ch", ch))));
+                    }
+                }
+
+                if (have_decimal_point)
+                    rh_digits = m_digits - lh_digits;
+
+                if (ix != end_ix) {
+                    /* continue to read exponent */
+
+                    /* skip e|E */
+                    ++ix;
+
+                    if (ix == end_ix) {
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": on input text, expect at least one digit following exponent marker e|E",
+                                      xtag("text", text_))));
+                    }
+
+                    char ch = *ix;
+
+                    if (ch == '+') {
+                        ++ix; /*skip*/
+                    } else if (ch == '-') {
+                        exp_sign = -1;
+                        ++ix;
+                    }
+
+                    /* a bare sign (e.g. "1e+") is not an exponent */
+                    if (ix == end_ix) {
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": on input text, expect at least one digit following exponent marker e|E",
+                                      xtag("text", text_))));
+                    }
+
+                    for (; ix != end_ix; ++ix) {
+                        char ch = *ix;
+
+                        if ((ch >= '0') && (ch <= '9')) {
+                            exponent *= 10;
+                            exponent += (ch - '0');
+                        } else {
+                            throw (std::runtime_error
+                                   (tostr("token::f64_value",
+                                          "; on input text, expect only digits following"
+                                          " (possibly signed) exponenct marker",
+                                          xtag("text", text_))));
+                        }
+                    }
+                }
+            }
+
+            /* floating-point value will represent
+             *   sign * mantissa * 10^(sign*exponent - rh_digits)
+             */
+
+            double mantissa_f64 = sign * mantissa;
+
+#ifdef OBSOLETE_DEBUG
+            std::cerr << xtag("text", 
text_)
+                      << xtag("rh_digits", rh_digits)
+                      << xtag("mantissa_f64", mantissa_f64)
+                      << xtag("exp_sign", exp_sign)
+                      << xtag("exponent", exponent)
+                      << std::endl;
+#endif
+
+            double retval = (mantissa_f64
+                             * detail::pow10((exp_sign * exponent)
+                                             - rh_digits));
+
+            return retval;
+        } /*f64_value*/
+
+        /** Write a diagnostic description of this token on stream @p os **/
+        void
+        Token::print(std::ostream & os) const
+        {
+            /* NOTE(review): original text of this statement was lost in extraction
+             * (everything between '<' and '>' was stripped); reconstructed -- confirm format
+             */
+            os << "<token"
+               << xtag("type", tk_type_)
+               << xtag("text", text_)
+               << ">";
+        } /*print*/
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end token.cpp */
diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp
new file mode 100644
index 00000000..00ef4eec
--- /dev/null
+++ b/src/tokenizer2/Tokenizer.cpp
@@ -0,0 +1,836 @@
+/** @file Tokenizer.cpp
+ *
+ *  @author Roland Conybeare, Jul 2024
+ **/
+
+#include "Tokenizer.hpp"
+
+namespace xo {
+    namespace scm {
+        /** Create tokenizer; @p debug_flag is forwarded to the input state **/
+        Tokenizer::Tokenizer(bool debug_flag)
+            : input_state_{debug_flag}
+        {}
+
+        /** Forget the line currently held by the input state **/
+        void
+        Tokenizer::discard_current_line()
+        {
+            this->input_state_.discard_current_line();
+        }
+
+        /** true iff @p ch always forms a complete token by itself.
+         *
+         *  Characters that can begin (or appear inside) a longer token
+         *  are deliberately excluded; reasons are listed per case below.
+         **/
+        bool
+        Tokenizer::is_1char_punctuation(CharT ch)
+        {
+            switch(ch) {
+            case '(':
+            case ')':
+            case '[':
+            case ']':
+            case '{':
+            case '}':
+            case ',':
+            case ';':
+                return true;
+
+            case '<':
+                /* can't be 1char punctuation -- can begin lessequal token */
+            case '>':
+                /* can't be 1char punctuation -- can begin greatequal token,
+                 * and appears in tk_yields token
+                 */
+            case ':':
+                /* can't be 1char punctuation -- can begin assignment token */
+            case '=':
+                /* can't be 1char punctuation -- can begin comparison token '==' */
+            case '!':
+                /* can't be 1char punctuation -- can begin comparison token '!=' */
+            case '-':
+                /* can't be punctuation
+                 * - can appear inside f64 token: e.g. 1.23e-9.
+                 * - begins tk_yields token: ->
+                 */
+            case '+':
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */
+            case '*':
+                /* not punctuation -- allowed in symbol */
+            case '/':
+                /* not punctuation -- for symmetry with +,- */
+            case '.':
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23 */
+                return false;
+            }
+
+            return false;
+        }
+
+        /** true iff @p ch can begin a two-character punctuation token
+         *  (one of: <=  >=  :=  ==  !=)
+         **/
+        bool
+        Tokenizer::is_2char_punctuation(CharT ch)
+        {
+            /* can't put '-' here, because of the way it appears in numeric literals
+             * characters here may not appear in symbol names
+             */
+
+            switch(ch) {
+            case '<':  /* can begin <= */
+            case '>':  /* can begin >= */
+            case ':':  /* can begin := */
+            case '=':  /* can begin == */
+            case '!':  /* can begin != */
+                return true;
+            }
+
+            return false;
+        }
+
+        auto
+        Tokenizer::assemble_token(std::size_t initial_whitespace,
+                                  const span_type & token_text,
+                                  input_state_type * p_input_state) -> result_type
+        {
+            /* literal|pretty|streamlined */
+            log_config::style = function_style::streamlined;
+
+            scope log(XO_DEBUG(p_input_state->debug_flag()));
+            log && log(xtag("token_text", token_text),
+                       xtag("initial_whitespace", initial_whitespace),
+                       xtag("input_state", *p_input_state));
+
+            tokentype tk_type = tokentype::tk_invalid;
+            std::string tk_text;
+
+            const CharT * tk_start = token_text.lo();
+            const CharT * tk_end = token_text.hi();
+
+            const CharT * ix = tk_start;
+
+            /* switch here applies to the first character in a token */
+            switch (*ix) {
+            case '-':
+            case '+':
+                if (token_text.size() == 1) {
+                    /* standalone '+' or '-' */
+                    if (*ix == '+')
+                        tk_type = tokentype::tk_plus;
+                    else if(*ix == '-')
+                        tk_type = tokentype::tk_minus;
+                }
+
+                /** fall through to numeric literal code below **/
+                [[fallthrough]];
+            case '.':
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                /* examples of valid floating-point numbers:
+                 *   .0
+                 *   1e0
+                 *   1e
+                 *   0.
+ * +1e0 + * -1e0 + * +1E+2 + * -1E+2 + * -0.123e-10 + * non-examples: + * . + * - + * + + * e0 + * .e0 + * -.e-0 + * +.e+0 + * + * in particular: to be recognized as a number, + * must contain at least one digit + */ + + log && log("possible number-token"); + + /* true if initial sign -/+ encountered */ + bool sign_flag = false; + /* true if '.' encountered */ + bool period_flag = false; + /* true if 'e' | 'E' encountered. + */ + bool exponent_flag = false; + /* true when sign '-' | '+' precedes exponenct digits */ + bool exponent_sign_flag = false; + /* true when at least one digit follows exponent marker */ + bool exponent_digit_flag = false; + /* true if at least one digit encountered */ + bool number_flag = false; + + log && log(xtag("*ix", *ix), + xtag("tk.length", token_text.size())); + if (log && (ix + 1 < tk_end)) + log(xtag("*(ix+1)", *(ix + 1))); + + if ((*ix == '-') && (ix + 2 == token_text.hi()) && (*(ix + 1) == '>')) { + /* composing exactly '->' */ + tk_type = tokentype::tk_yields; + } else { + /* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */ + for (; ix != token_text.hi(); ++ix) { + if ((*ix == '-') || (*ix == '+')) { + /* sign allowed: + * 1. before period and before first digit + * 2. 
after exponent + */ + if (!period_flag && !number_flag && !sign_flag) { + sign_flag = true; + } else if (exponent_flag && !exponent_digit_flag) { + exponent_sign_flag = true; + } else { + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "improperly placed sign indicator", + (ix - tk_start), + *p_input_state); + } + } else if (*ix == '.') { + if (period_flag) { + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "duplicate decimal point in numeric literal", + (ix - tk_start), + *p_input_state); + } + + period_flag = true; + } else if ((*ix == 'e') || (*ix == 'E')) { + if (exponent_flag) { + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "duplicate exponent marker in numeric literal", + (ix - tk_start), + *p_input_state); + } + + exponent_flag = true; + } else if (isdigit(*ix)) { + if (exponent_flag) { + /* need digit before exponent to recognize as number */ + exponent_digit_flag = true; + } else { + number_flag = true; + } + } else { + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "unexpected character in numeric constant" /*error_description*/, + (ix - tk_start), + *p_input_state); + } + } + + if (number_flag) { + if (period_flag || exponent_flag) { + tk_type = tokentype::tk_f64; + } else { + tk_type = tokentype::tk_i64; + } + } else if (period_flag && !exponent_flag) { + tk_type = tokentype::tk_dot; + } else { + /* not a valid token */ + } + + log && log(xtag("sign_flag", sign_flag)); + log && log(xtag("period_flag", period_flag), + xtag("exponent_flag", exponent_flag), + xtag("exponent_sign_flag", exponent_sign_flag), + xtag("number_flag", number_flag)); + log && log(xtag("tk_type", tk_type)); + } + + break; + } + case '*': + if (token_text.size() == 1) { + /* standalone '*' */ + tk_type = tokentype::tk_star; + ++ix; + } else { + /* '*' isn't punctuation -- but may allow appearance in a longer token + * + 
* thinking that x*y is a symbol with an embedded '*' character; + * in particular want to support kebab-case symbols like 'foo-config' + */ + } + break; + case '/': + if (token_text.size() == 1) { + /* standalone '/' */ + tk_type = tokentype::tk_slash; + ++ix; + } + break; + case '=': + log && log("singleassign or cmpeq token"); + + if (*(ix + 1) == '=') { + tk_type = tokentype::tk_cmpeq; + ++ix; + ++ix; + } else { + /* standalone '=' */ + tk_type = tokentype::tk_singleassign; + ++ix; + } + break; + case '!': + if (*(ix + 1) == '=') { + tk_type = tokentype::tk_cmpne; + ++ix; + ++ix; + } else { + /* standlone '!' */ + + // TODO + } + break; + case '"': + { + log && log("recognize string-token"); + + tk_type = tokentype::tk_string; + + tk_text.reserve(token_text.hi() - token_text.lo()); + + ++ix; /*skip initial " char*/ + + /* true on final " */ + bool endofstring = false; + + for (; ix != token_text.hi(); ++ix) { + log && log(xtag("*ix", *ix)); + + switch(*ix) { + case '"': + endofstring = true; + + /* skip final " char, don't capture */ + ++ix; + + break; + case '\\': + /* skip escape char, don't capture */ + ++ix; + + if (ix == token_text.hi()) { + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "expecting key following escape character \\", + (ix - tk_start), + *p_input_state); + } + + switch(*ix) { + case '\\': + log && log(xtag("*ix", *ix), xtag("escaped", "t")); + tk_text.push_back(*ix); + break; + case 'n': + log && log(xtag("*ix", *ix), xtag("newline", "t")); + tk_text.push_back('\n'); + break; + case 't': + log && log(xtag("*ix", *ix), xtag("tab", "t")); + tk_text.push_back('\t'); + break; + case 'r': + log && log(xtag("*ix", *ix), xtag("cr", "t")); + tk_text.push_back('\r'); + break; + case '"': + log && log(xtag("*ix", *ix), xtag("quote", "t")); + tk_text.push_back('"'); + break; + default: + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "expecting one of n|r|\"|\\ following 
escape \\", + (ix - tk_start), + *p_input_state); + } + break; + default: + tk_text.push_back(*ix); + break; + } + + if (endofstring) + break; + } + + if (!endofstring) { + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "missing terminating '\"' to complete literal string", + (ix - tk_start), + *p_input_state); + } + + log && log(tostr("tokenizer::assemble_token", + xtag("tk_text", tk_text))); + + break; + } + case 'a': case 'A': + case 'b': case 'B': + case 'c': case 'C': + case 'd': case 'D': + case 'e': case 'E': + case 'f': case 'F': + case 'g': case 'G': + case 'h': case 'H': + case 'i': case 'I': + case 'j': case 'J': + case 'k': case 'K': + case 'l': case 'L': + case 'm': case 'M': + case 'n': case 'N': + case 'o': case 'O': + case 'p': case 'P': + case 'q': case 'Q': + case 'r': case 'R': + case 's': case 'S': + case 't': case 'T': + case 'u': case 'U': + case 'v': case 'V': + case 'w': case 'W': + case 'x': case 'X': + case 'y': case 'Y': + case 'z': case 'Z': + { + /* symbol/identifier must begin with a letter? + * we want to accept some other chars too. + * specifically want to allow identifiers: + * this-is-the-way + * this+is+also+the+way + * how/much/is/that/doggy + * put*an*asterisk*in*that + * something%special% + * + * like pure lisp, we don't allow: + * - identifier beginning with digit + * - period . 
+ * + * unlike pure lisp, we don't allow anywhere in a symbol: + * - colon : + * - semicolon ; + * - comma , + * + * also we don't allow symbols to begin with special chars + */ + + tk_type = tokentype::tk_symbol; + break; + } + case '<': + { + log && log("leftangle or lessequal token"); + + if (*(ix + 1) == '=') { + tk_type = tokentype::tk_lessequal; + ++ix; + ++ix; + } else { + tk_type = tokentype::tk_leftangle; + ++ix; + } + break; + } + case '>': + { + log && log("rightangle or greatequal token"); + + if (*(ix + 1) == '=') { + tk_type = tokentype::tk_greatequal; + ++ix; + ++ix; + } else { + tk_type = tokentype::tk_rightangle; + ++ix; + } + break; + } + case '(': + tk_type = tokentype::tk_leftparen; + ++ix; + break; + case ')': + tk_type = tokentype::tk_rightparen; + ++ix; + break; + case '[': + tk_type = tokentype::tk_leftbracket; + ++ix; + break; + case ']': + tk_type = tokentype::tk_rightbracket; + ++ix; + break; + case '{': + tk_type = tokentype::tk_leftbrace; + ++ix; + break; + case '}': + tk_type = tokentype::tk_rightbrace; + ++ix; + break; + case ',': + tk_type = tokentype::tk_comma; + ++ix; + break; + case ';': + tk_type = tokentype::tk_semicolon; + ++ix; + break; + case ':': + { + log && log("colon or assignment token"); + + if (*(ix + 1) == '=') { + tk_type = tokentype::tk_assign; + ++ix; + ++ix; + } else { + tk_type = tokentype::tk_colon; + ++ix; + } + break; + } + default: + break; + } + + if (tk_type == tokentype::tk_invalid) { + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "illegal input character", + (ix - tk_start), + *p_input_state); + } + + if ((tk_type == tokentype::tk_i64) + || (tk_type == tokentype::tk_f64) + || (tk_type == tokentype::tk_symbol)) + { + /* note: capturing token text here; + * for numeric literals will re-parse in token::i64_value() / token::f64_value() + */ + tk_text = std::string(tk_start, tk_end); + } else if (tk_type == tokentype::tk_string) { + ; /* nothing to do here -- desired 
tk_text already constructed */ + } + + if (tk_type == tokentype::tk_symbol) { + /* check for keywords */ + + bool keep_text = false; + + if ((tk_text == "true") || (tk_text == "false")) { + tk_type = tokentype::tk_bool; + keep_text = true; + } else if (tk_text == "type") { + tk_type = tokentype::tk_type; + } else if (tk_text == "def") { + tk_type = tokentype::tk_def; + } else if (tk_text == "lambda") { + tk_type = tokentype::tk_lambda; + } else if (tk_text == "if") { + tk_type = tokentype::tk_if; + } else if (tk_text == "then") { + tk_type = tokentype::tk_then; + } else if (tk_text == "else") { + tk_type = tokentype::tk_else; + } else if (tk_text == "let") { + tk_type = tokentype::tk_let; + } else if (tk_text == "in") { + tk_type = tokentype::tk_in; + } else if (tk_text == "end") { + tk_type = tokentype::tk_end; + } else { + /* keep as symbol */ + keep_text = true; + } + + if (!keep_text) + tk_text.clear(); + } + + /* input.prefix(0): + * require caller preserves current input line until it's entirely exhausted + */ + return result_type(token_type(tk_type, std::move(tk_text)), + p_input_state->current_line().prefix(0)); + } /*assemble_token*/ + + auto + Tokenizer::assemble_final_token(const span_type & token_text, + input_state_type * p_input_state) -> result_type + { + return assemble_token(0 /*initial_whitespace*/, + token_text, + p_input_state); + } + + auto + Tokenizer::scan(const span_type & input, + bool eof_flag) -> result_type + { + scope log(XO_DEBUG(input_state_.debug_flag())); + + log && log(xtag("input", input)); + + /* - Always at beginning of token when scan() invoked + * - scan will not report any portion of line as consumed until it has + * emitted all tokens in that line. + * rationale: caller is allowed to discard storage that + * scan() reports as consumed. But will be holding that line + * until all tokens have been read. 
+ * - this means caller will typically call scan() + * with the same input span multiple times + */ + + /* automagically no-ops when the same input presented twice */ + this->input_state_.capture_current_line(input, eof_flag); + + const CharT * ix = this->input_state_.skip_leading_whitespace(); + + if(ix == input.hi()) { + log && log("end input -> consume current line"); + + /* entirety of current line has been tokenized + * -> caller may consume it + */ + return result_type::make_whitespace(this->input_state_.consume_current_line()); + } + + /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */ + + // TODO: + // 1. hoist complete_flag up here + // 2. use in each branch + // 3. common check for prefix-capturing after if-cascade below done + + /* here: *ix is not whitespace */ + + auto whitespace_z = input_state_.whitespace(); + + log && log(xtag("whitespace_z", whitespace_z)); + + /* tk_start points to known beginning of token + * (after any whitespace) + * + * goal is to leave ix pointing to 1 char past the end of the token + */ + const CharT * tk_start = ix; + + if (is_1char_punctuation(*ix)) { + /* 1-character token */ + ++ix; + } else if (is_2char_punctuation(*ix)) { + CharT ch1 = *ix; + + (void)ch1; + + ++ix; + +#ifdef OBSOLETE // no longer a thing. either input ends in whitespace, or ends translation unit + if (ix == input.hi()) { + /* need more input to know if/when token complete */ + this->prefix_ += std::string(tk_start, input.hi()); + + log && log(xtag("captured-prefix1", this->prefix_)); + } else +#endif + { + CharT ch2 = *ix; + + if (((ch2 >= '0') && (ch2 <= '9')) + || ((ch2 >= 'A') && (ch2 <= 'Z')) + || ((ch2 >= 'a') && (ch2 <= 'z'))) + { + /* treat as 1 char punctuation */ + ; + } else { + /* include next char */ + ++ix; + } + } + } else if (*ix == '"') { + bool complete_flag = false; + + /* 1. embedded space/tab allowed in string literal. + * 2. embedded newline/cr not allowed. 
+ */ + CharT prev_ch = '"'; + + ++ix; + + for (; ix != input.hi(); ++ix) { + /* looking for unescaped " char to end literal */ + if (*ix == '"') { + if (prev_ch != '\\') { + ++ix; /* include terminating " for assemble_token */ + complete_flag = true; + break; + } + } else if ((*ix == '\n') || (*ix == '\r')) { + log && log ("string literal with naked newline or CR"); + + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "must use \\n or \\r to encode newline/cr in string literal", + (ix - tk_start), + this->input_state_); + } + + prev_ch = *ix; + } + + if (!complete_flag) { + log && log("unterminated string literal"); + + return result_type::make_error_consume_current_line + (__FUNCTION__ /*src_function*/, + "unterminated string literal", + (ix - tk_start), + this->input_state_); + } + } else { + /* ix is start of some token */ + + if (*ix == '-') { + /* this section load-bearing for input '->' scanning from beginning of token */ + ++ix; + + if (ix == input.hi()) { + /* need more input to know if/when token complete -- see captured-prefix5 below */ + } else { + CharT ch2 = *ix; + + if (ch2 == '>') { + /* include next char and complete token */ + ++ix; + + log && log("complete '->' token"); + + this->input_state_.advance_until(ix); + + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + &(this->input_state_)); + } + + /* here: -123, -.5e-21 for example */ + } + } else if (*ix == '>') { + /* this section load-bearing for input '>=' scanning from beginning of token. 
+ * Need this because '>' necessarily excluded from is_1char_punctuation() + */ + ++ix; + + if (ix == input.hi()) { + /* need more input to know if/when token complete -- see captured-prefix5 below */ + } else { + CharT ch2 = *ix; + + if (ch2 != '=') { + log && log("complete '>=' token"); + + this->input_state_.advance_until(ix); + + /* ignore next char and complete token */ + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + &(this->input_state_)); + } + + /* here: >= for example */ + } + } + + /* scan until: + * - whitespace + * - punctuation + */ + for (; ix != input.hi(); ++ix) { + if (input_state_type::is_whitespace(*ix) + || is_1char_punctuation(*ix) + || is_2char_punctuation(*ix)) + { + break; + } + + /* this section load-bearing for input '>' after beginning of a token, e.g. p> */ + if ((ix > tk_start) && (*ix == '>')) + break; + + /* this section load-bearing for input '->' at the end of another token, e.g. p->q */ + if (*ix == '-') { + if (ix + 1 == input.hi()) { + /* need more input to know if/when token complete + * + * apple-banana parses as: {tk_symbol: apple-banana} + * apple-> parses as: {tk_symbol: apple} {tk_yields} + * apple- illegal (may not end symbol with '-') + */ + break; + } + + if (*(ix + 1) == '>') { + /* treat '->' as punctuation; complete preceding token */ + break; + } + } + } + } + + log && log("assemble token z", xtag("token_z", ix - tk_start)); + + assert(tk_start < ix); + + this->input_state_.advance_until(ix); + + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + &(this->input_state_)); + } /*scan*/ + } /*namespace scm*/ +} /*namespace xo*/ + +/* end Tokenizer.cpp */ diff --git a/src/tokenizer2/TokenizerError.cpp b/src/tokenizer2/TokenizerError.cpp new file mode 100644 index 00000000..ffe3c8b4 --- /dev/null +++ b/src/tokenizer2/TokenizerError.cpp @@ -0,0 +1,60 @@ +/** @file TokenizerError.cpp + * + * @author Roland Conybeare, Jun 2025 + **/ + +#include "TokenizerError.hpp" + 
+namespace xo {
+    namespace scm {
+
+        /** Write a representation of this error on @p os.
+         *  NOTE(review): as written this emits an empty string -- confirm that is intended.
+         **/
+        void
+        TokenizerError::print(std::ostream & os) const
+        {
+            os << "";
+        }
+
+        /** Write a human-readable report for this error on @p os:
+         *  column summary, the offending input line, a marker line
+         *  ending in '^', then the error description.
+         *  Writes nothing when no error description is present.
+         **/
+        void
+        TokenizerError::report(std::ostream & os) const
+        {
+            /* no description -> nothing to report */
+            if (error_description_.empty())
+                return;
+
+            const char * label = "input: ";
+
+            /* input_state.tk_start: position of first character in token
+             * error_pos_: position (relative to token start) at which failure detected
+             */
+            const std::size_t token_col = input_state_.tk_start();
+            /* marker line is padded to where the token begins beneath the echoed input */
+            const std::size_t marker_indent = (std::strlen(label) + token_col);
+            const std::size_t error_col = 1 + token_col + error_pos_;
+
+            os << "token col: " << token_col << ", error col: " << error_col << "\n";
+
+            /* echo input line, one character at a time */
+            os << label;
+            const char * cursor = input_state_.current_line().lo();
+            const char * line_end = input_state_.current_line().hi();
+            while (cursor < line_end) {
+                os << *cursor;
+                ++cursor;
+            }
+
+            /* no newline written here; marker line follows the echoed input directly */
+            os << std::setw(marker_indent) << " ";
+
+            for (std::size_t n = 0; n < error_pos_; ++n)
+                os << '_';
+
+            os << '^' << std::endl;
+
+            os << error_description_ << std::endl;
+        }
+
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end TokenizerError.cpp */
diff --git a/src/tokenizer2/scan_result.cpp b/src/tokenizer2/scan_result.cpp
new file mode 100644
index 00000000..05c5c0f7
--- /dev/null
+++ b/src/tokenizer2/scan_result.cpp
@@ -0,0 +1,43 @@
+/** @file scan_result.cpp
+ *
+ * @author Roland Conybeare, 2025
+ **/
+
+#include "scan_result.hpp"
+
+namespace xo {
+    namespace scm {
+        /** Result carrying no token, with @p whitespace_input as the consumed span **/
+        scan_result
+        scan_result::make_whitespace(const span_type& whitespace_input)
+        {
+            scan_result result(token_type::invalid(),
+                               whitespace_input /*consumed*/);
+
+            return result;
+        }
+
+        /** Result carrying no token, with @p prefix_input as the consumed span **/
+        scan_result
+        scan_result::make_partial(const span_type& prefix_input)
+        {
+            scan_result result(token_type::invalid(),
+                               prefix_input /*consumed*/);
+
+            return result;
+        }
+
+        /** Error result that reports + consumes the entire current input line.
+         *
+         *  @p error_src        name of the function reporting the error
+         *  @p error_msg        error description
+         *  @p error_pos        error position, relative to start of token
+         *  @p input_state_ref  input state; its current line is consumed by this call
+         **/
+        scan_result
+        scan_result::make_error_consume_current_line(const char * error_src,
+                                                     std::string error_msg,
+                                                     size_t error_pos,
+                                                     input_state_type & input_state_ref)
+        {
+            /* snapshot input state first: the error must describe the line
+             * as it stood before .consume_current_line() alters it
+             */
+            input_state_type state_at_error = input_state_ref;
+
+            auto consumed = input_state_ref.consume_current_line();
+
+            return scan_result(token_type::invalid(),
+                               consumed,
+                               error_type(error_src,
+                                          error_msg,
+                                          state_at_error,
+                                          error_pos));
+        }
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end scan_result.cpp */
diff --git a/src/tokenizer2/tokentype.cpp b/src/tokenizer2/tokentype.cpp
new file mode 100644
index 00000000..33d683de
--- /dev/null
+++ b/src/tokenizer2/tokentype.cpp
@@ -0,0 +1,74 @@
+/* file tokentype.cpp
+ *
+ * author: Roland Conybeare
+ */
+
+#include "tokentype.hpp"
+
+namespace xo {
+    namespace scm {
+        /** Name of enumerator @p tk_type, as a C string.
+         *  "?tokentype" for tk_invalid / n_tokentype, "???" for any other value
+         **/
+        char const *
+        tokentype_descr(tokentype tk_type)
+        {
+            /* one switch arm per enumerator: returns the enumerator's own spelling */
+#define TOKENTYPE_NAME(x) case tokentype::x: return STRINGIFY(x)
+
+            switch(tk_type) {
+                /* literals + symbols */
+                TOKENTYPE_NAME(tk_bool);
+                TOKENTYPE_NAME(tk_i64);
+                TOKENTYPE_NAME(tk_f64);
+                TOKENTYPE_NAME(tk_string);
+                TOKENTYPE_NAME(tk_symbol);
+
+                /* brackets */
+                TOKENTYPE_NAME(tk_leftparen);
+                TOKENTYPE_NAME(tk_rightparen);
+                TOKENTYPE_NAME(tk_leftbracket);
+                TOKENTYPE_NAME(tk_rightbracket);
+                TOKENTYPE_NAME(tk_leftbrace);
+                TOKENTYPE_NAME(tk_rightbrace);
+                TOKENTYPE_NAME(tk_leftangle);
+                TOKENTYPE_NAME(tk_rightangle);
+
+                /* other punctuation */
+                TOKENTYPE_NAME(tk_lessequal);
+                TOKENTYPE_NAME(tk_greatequal);
+                TOKENTYPE_NAME(tk_dot);
+                TOKENTYPE_NAME(tk_comma);
+                TOKENTYPE_NAME(tk_colon);
+                TOKENTYPE_NAME(tk_doublecolon);
+                TOKENTYPE_NAME(tk_semicolon);
+                TOKENTYPE_NAME(tk_singleassign);
+                TOKENTYPE_NAME(tk_assign);
+                TOKENTYPE_NAME(tk_yields);
+
+                /* operators */
+                TOKENTYPE_NAME(tk_plus);
+                TOKENTYPE_NAME(tk_minus);
+                TOKENTYPE_NAME(tk_star);
+                TOKENTYPE_NAME(tk_slash);
+                TOKENTYPE_NAME(tk_cmpeq);
+                TOKENTYPE_NAME(tk_cmpne);
+
+                /* keywords */
+                TOKENTYPE_NAME(tk_type);
+                TOKENTYPE_NAME(tk_def);
+                TOKENTYPE_NAME(tk_lambda);
+                TOKENTYPE_NAME(tk_if);
+                TOKENTYPE_NAME(tk_then);
+                TOKENTYPE_NAME(tk_else);
+                TOKENTYPE_NAME(tk_let);
+                TOKENTYPE_NAME(tk_in);
+                TOKENTYPE_NAME(tk_end);
+
+            case tokentype::tk_invalid:
+            case tokentype::n_tokentype:
+                return "?tokentype";
+            }
+
+#undef TOKENTYPE_NAME
+
+            /* value outside the enumeration */
+            return "???";
+        } /*tokentype_descr*/
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end tokentype.cpp */
From 1575f8a14736c9f5e1d42b5a82cee360c4d88ce0 Mon Sep 17 00:00:00 2001
From: Roland Conybeare
Date: Sun, 11 Jan 2026 18:42:08 -0500
Subject: [PATCH 02/33] xo-tokenizer2: use xo-arena DCircularBuffer to buffer input line
---
cmake/xo_tokenizer2Config.cmake.in | 5 +- example/tokenrepl/tokenrepl.cpp | 66 ++--- include/xo/tokenizer2/TkInputState.hpp | 6 +- include/xo/tokenizer2/Tokenizer.hpp | 32 ++- include/xo/tokenizer2/TokenizerError.hpp | 2 +- include/xo/tokenizer2/scan_result.hpp | 2 +- include/xo/tokenizer2/span.hpp | 291 ----------------------- src/tokenizer2/CMakeLists.txt | 2 + src/tokenizer2/TkInputState.cpp | 3 +- src/tokenizer2/Tokenizer.cpp | 56 ++++- 10 files changed, 106 insertions(+), 359 deletions(-) delete mode 100644 include/xo/tokenizer2/span.hpp diff --git a/cmake/xo_tokenizer2Config.cmake.in b/cmake/xo_tokenizer2Config.cmake.in index b5c3cd5c..13f1dac1 100644 --- a/cmake/xo_tokenizer2Config.cmake.in +++ b/cmake/xo_tokenizer2Config.cmake.in @@ -4,9 +4,10 @@ include(CMakeFindDependencyMacro) # note: changes to find_dependency() calls here # must coordinate with xo_dependency() calls -# in CMakeLists.txt +# in src/tokenizer2/CMakeLists.txt # -#find_dependency(xo_flatstring) +find_dependency(xo_arena) +find_dependency(indentlog) include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") check_required_components("@PROJECT_NAME@") diff --git a/example/tokenrepl/tokenrepl.cpp b/example/tokenrepl/tokenrepl.cpp index f97b9cd0..0852f028 100644 --- a/example/tokenrepl/tokenrepl.cpp +++ b/example/tokenrepl/tokenrepl.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include @@ -14,7 +14,7 @@ bool replxx_getline(bool interactive, std::size_t parser_stack_size, replxx::Replxx & rx, - std::string& input) + const char ** p_input) { using namespace std; @@ -34,40 +34,23 @@ bool replxx_getline(bool interactive, if (retval) { //cerr << "got reval->true" << endl; - input = input_cstr; + *p_input = input_cstr; } else { //cerr << "got retval->false" << endl; } - rx.history_add(input); - - // we want tokenizer to see newline, it's syntax - input.push_back('\n'); + rx.history_add(input_cstr); return retval; } -#ifdef OBSOLETE -bool repl_getline(bool 
interactive, - std::istream & in, - std::ostream & out, - std::string & input) -{ - if (interactive) { - out << "> "; - std::flush(out); - } - - return static_cast(std::getline(in, input)); -} -#endif - int main() { using xo::scm::Tokenizer; - using xo::scm::span; using xo::scm::operator<<; + using xo::mm::CircularBufferConfig; + using xo::mm::span; using replxx::Replxx; using namespace std; @@ -82,36 +65,39 @@ main() { rx.set_max_history_size(1000); rx.history_load("repl_history.txt"); - Tokenizer tkz(xo::log_config::min_log_level <= xo::log_level::info); + Tokenizer tkz(CircularBufferConfig{.name_ = "tokenrepl-input", + .max_capacity_ = 4*1024, + .max_captured_span_ = 128}, + true /*debug_flag*/); - string input_str; + const char * input_cstr = nullptr;; size_t line_no = 1; constexpr std::size_t c_maxlines = 25; - while ( - //repl_getline(interactive, cin, cout, input_str) // once upon a time - replxx_getline(interactive, 0 /*parser_stack_size*/, rx, input_str)) + while (replxx_getline(interactive, 0 /*parser_stack_size*/, rx, &input_cstr)) { - span_type input = span_type::from_string(input_str); - //cout << "input: " << input << endl; // reminder: input may contain multiple tokens - while (!input.empty()) { - auto [tk, consumed, error] = tkz.scan(input, false /*!eof*/); + if (input_cstr && *input_cstr) { + auto [error, input] = tkz.buffer_input_line(input_cstr, false /*!eof*/); - if (tk.is_valid()) { - cout << tk << endl; - } else if (error.is_error()) { - cout << "tokenizer error: " << endl; - error.report(cout); + { + auto [tk, consumed, error] = tkz.scan(input); - break; + if (tk.is_valid()) { + cout << tk << endl; + } else if (error.is_error()) { + cout << "tokenizer error: " << endl; + error.report(cout); + + break; + } + + input = input.after_prefix(consumed); } - - input = input.after_prefix(consumed); } /* here: input.empty() or error encountered */ diff --git a/include/xo/tokenizer2/TkInputState.hpp b/include/xo/tokenizer2/TkInputState.hpp index 
531585a1..ea315a0a 100644 --- a/include/xo/tokenizer2/TkInputState.hpp +++ b/include/xo/tokenizer2/TkInputState.hpp @@ -63,7 +63,7 @@ namespace xo { using CharT = char; /** type representing a contiguous span of tokenizer input characters **/ - using span_type = span; + using span_type = xo::mm::span; ///@} @@ -76,7 +76,7 @@ namespace xo { /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. * Introduced for unit tests, not used in tokenizer. **/ - explicit TkInputState(const span& current_line, + explicit TkInputState(const span_type & current_line, size_t current_pos, size_t whitespace) : current_line_{current_line}, current_pos_{current_pos}, @@ -191,7 +191,7 @@ namespace xo { ///@{ /** remember current input line. Used only to report errors **/ - span current_line_ = span(); + span_type current_line_ = span_type(); /** start of last token within @ref current_line_ **/ size_t tk_start_ = 0; /** input position within @ref current_line_ **/ diff --git a/include/xo/tokenizer2/Tokenizer.hpp b/include/xo/tokenizer2/Tokenizer.hpp index 99005fee..40a98cd9 100644 --- a/include/xo/tokenizer2/Tokenizer.hpp +++ b/include/xo/tokenizer2/Tokenizer.hpp @@ -9,8 +9,9 @@ #include "TkInputState.hpp" #include "span.hpp" #include "scan_result.hpp" -#include "xo/indentlog/scope.hpp" -#include "xo/indentlog/print/ppdetail_atomic.hpp" +#include +#include +#include #include namespace xo { @@ -58,15 +59,24 @@ namespace xo { using CharT = char; using token_type = Token; using error_type = TokenizerError; - using span_type = span; - using input_state_type = TkInputState; + using DCircularBuffer = xo::mm::DCircularBuffer; + using CircularBufferConfig = xo::mm::CircularBufferConfig; + using span_type = xo::mm::span; + //using input_state_type = TkInputState; using result_type = scan_result; public: /** @defgroup tokenizer-ctors tokenizer constructors **/ ///@{ - Tokenizer(bool debug_flag = false); + /** + * @p config gives configuration for circular input buffer 
+ * @p debug_flag enables tokenizer debug output + **/ + Tokenizer(const CircularBufferConfig & config = CircularBufferConfig{.name_ = "tkz-input", + .max_capacity_ = 4*1024, + .max_captured_span_ = 128}, + bool debug_flag = false); ///@} @@ -119,6 +129,11 @@ namespace xo { **/ bool has_prefix() const { return !prefix_.empty(); } + /** buffer contents of input_cstr. + * May throw if buffer space exhausted + **/ + std::pair buffer_input_line(const char * input_cstr, bool eof_flag); + /** scan for next input token, given @p input. * Note: * - tokenizer can consume input (e.g. whitespace) @@ -130,8 +145,7 @@ namespace xo { * * @return {parsed token, consumed span} **/ - scan_result scan(const span_type & input, - bool eof_flag); + scan_result scan(const span_type & input); /** discard current line after error. Just cleans up error-reporting state **/ void discard_current_line(); @@ -142,6 +156,8 @@ namespace xo { /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ ///@{ + /** Buffer input here. vm-aware. uses mmap directly **/ + DCircularBuffer input_buffer_; /** track input state (line#,pos,..) for error messages. * There's an ordering problem here: * 1. input_state_.skip_leading_whitespace() advances @@ -150,7 +166,7 @@ namespace xo { * 3. but neeed newline to end token * Also recall input_state_type needed for reporting errors. **/ - input_state_type input_state_; + TkInputState input_state_; /** Accumulate partial token here. 
* This will happen if input sent to @ref tokenizer::scan * ends without whitespace such that last available token's diff --git a/include/xo/tokenizer2/TokenizerError.hpp b/include/xo/tokenizer2/TokenizerError.hpp index a7fab3c2..a1cb99ee 100644 --- a/include/xo/tokenizer2/TokenizerError.hpp +++ b/include/xo/tokenizer2/TokenizerError.hpp @@ -20,7 +20,7 @@ namespace xo { class TokenizerError { public: using CharT = char; - using span_type = span; + using span_type = xo::mm::span; public: /** @defgroup tokenizer-error-ctors **/ diff --git a/include/xo/tokenizer2/scan_result.hpp b/include/xo/tokenizer2/scan_result.hpp index 971e4b93..249154f1 100644 --- a/include/xo/tokenizer2/scan_result.hpp +++ b/include/xo/tokenizer2/scan_result.hpp @@ -30,7 +30,7 @@ namespace xo { public: using CharT = char; using token_type = Token; - using span_type = span; + using span_type = xo::mm::span; using error_type = TokenizerError; using input_state_type = TkInputState; diff --git a/include/xo/tokenizer2/span.hpp b/include/xo/tokenizer2/span.hpp deleted file mode 100644 index 8cf7a4a7..00000000 --- a/include/xo/tokenizer2/span.hpp +++ /dev/null @@ -1,291 +0,0 @@ -/** @file span.hpp **/ - -#pragma once - -#include "xo/indentlog/scope.hpp" -#include "xo/indentlog/print/ppdetail_atomic.hpp" -#include -#include -#include - -namespace xo { - namespace scm { - /** @class span compression/span.hpp - * - * @brief A contiguous range of characters, without ownership. - * - * @tparam CharT type for elements referred to by this span. 
- **/ - template - class span { - public: - /** @defgroup span-type-traits span type traits **/ - ///@{ - - /** typealias for span size (in units of CharT) **/ - using size_type = std::uint64_t; - - ///@} - - public: - /** @defgroup span-ctors span constructors **/ - ///@{ - - /** null span **/ - span() : lo_{nullptr}, hi_{nullptr} {} - - /** Create span for the contiguous memory range [@p lo, @p hi) **/ - span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} - - /** explicit conversion from span **/ - template - span(const span & other, - std::enable_if_t - && !std::is_same_v> * = nullptr) - : lo_{other.lo()}, hi_{other.hi()} {} - - /** copy ctor (explicit to avoid ambiguity with template ctor) **/ - span(const span & other) = default; - span & operator=(const span & other) = default; - - /** Create a null span (i.e. with null @p lo, @p hi pointers) - * A null span can be concatenated with any other span - * without triggering matching-endpoint asserts. - **/ - static span make_null() { return span(static_cast(nullptr), static_cast(nullptr)); } - - /** @brief create span for C-style string @p cstr **/ - static span from_cstr(const CharT * cstr) { - CharT * lo = cstr; - CharT * hi = cstr ? 
cstr + strlen(cstr) : nullptr; - - return span(lo, hi); - } - - /** @brief create span from std::string @p str **/ - static span from_string(const std::string& str) { - CharT * lo = &(*str.begin()); - CharT * hi = &(*str.end()); - - return span(lo, hi); - } - - /** @brief concatenate two contiguous spans */ - static span concat(const span & span1, const span & span2) { - if (span1.is_null()) - return span2; - if (span2.is_null()) - return span1; - - if (span1.hi() != span2.lo()) { - scope log(XO_DEBUG(true)); - - log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo())); - } - - assert(span1.hi() == span2.lo()); - - CharT * lo = span1.lo(); - CharT * hi = span2.hi(); - - return span(lo, hi); - } - - ///@} - - /** @defgroup span-access-methods **/ - ///@{ - - CharT * lo() const { return lo_; } /* get member span::lo_ */ - CharT * hi() const { return hi_; } /* get member span::hi_ */ - - ///@} - - /** @defgroup span-general-methods **/ - ///@{ - - /** @brief strip prefix until first occurence of '\n', including the newline **/ - void discard_until_newline() { - for (const CharT * p = lo_; p < hi_; ++p) { - if (*p == '\n') { - lo_ = p + 1; - return; - } - } - - lo_ = hi_; - } - - /** Create new span over supplied type, - * with identical (possibly misaligned) endpoints. - * - * @warning - * 1. New span uses exactly the same memory addresses. - * Endpoint pointers may not be aligned. - * 2. Implementation assumes code compiled with - * @code -fno-strict-aliasing @endcode enabled. - * - * @tparam OtherT element type for new span - **/ - template - span - cast() const { return span(reinterpret_cast(lo_), - reinterpret_cast(hi_)); } - - /** @brief create span including the first @p z members of this span. 
**/ - span prefix(size_type z) const { return span(lo_, lo_ + z); } - - /** @brief create span representing prefix up to (but not including) @p *p - **/ - span prefix_upto(CharT * p) const { - if (p <= hi_) - return span(lo_, p); - else - return span(lo_, hi_); - } - - /** @brief create span with first @p z members of this span removed **/ - span after_prefix(size_type z) const { - if (lo_ + z > hi_) - z = hi_ - lo_; - - return span(lo_ + z, hi_); - } - - /** @brief create span with @p prefix of this span removed **/ - span after_prefix(const span & prefix) const { - if (!prefix.is_null() && (prefix.lo() != lo_)) { - throw std::runtime_error - ("after_prefix: expected prefix of this span"); - } - - return after_prefix(prefix.size()); - } - - /** Create span starting with position @p p. - * Does boundary checking; will return empty span if @p p is outside @c [lo_,hi) - **/ - span suffix_from(CharT * p) const { - if ((lo_ <= p) && (p <= hi_)) - return span(p, hi_); - else - return span(hi_, hi_); - } - - /** true iff this span is null. distinct from empty. **/ - bool is_null() const { return lo_ == nullptr && hi_ == nullptr; } - /** true iff this span is empty (comprises 0 elements). **/ - bool empty() const { return lo_ == hi_; } - /** report the number of elements (of type CharT) in this span. **/ - size_type size() const { return hi_ - lo_; } - - /** increase extent of this spans to include @p x. - * Requires @c hi() == @c x.lo() - **/ - span & operator+=(const span & x) { - if (hi_ == x.lo_) { - hi_ = x.hi_; - } else if (!x.is_null()) { - assert(false); - } - - return *this; - } - - /** print representation for this span on stream @p os **/ - void print(std::ostream & os) const { - os << ""; - } - ///@} - - private: - /** @defgroup span-instance-vars **/ - ///@{ - - /** start of span. - Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) - **/ - CharT * lo_ = nullptr; - - /** @brief end of span. 
- Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) - **/ - CharT * hi_ = nullptr; - - ///@} - }; /*span*/ - - /** @defgroup span-operators **/ - ///@{ - - /** compare spans for equality. - * Two spans are equal iff both endpoints match exactly. - **/ - template - inline bool - operator==(const span & lhs, const span & rhs) { - return ((lhs.lo() == rhs.lo()) - && (lhs.hi() == rhs.hi())); - } - - /** compare spans for inequality. - * Two spans are unequal if either paired endpoint differs. - **/ - template - inline bool - operator!=(const span & lhs, const span & rhs) { - return ((lhs.lo() != rhs.lo()) - || (lhs.hi() != rhs.hi())); - } - - /** print a summary of @p x on stream @p os. Intended for diagnostics **/ - template - inline std::ostream & - operator<<(std::ostream & os, - const span & x) { - x.print(os); - return os; - } - - ///@} - } /*namespace scm*/ - - namespace print { - template - class printspan_impl { - public: - printspan_impl(xo::scm::span x) : span_{x} {} - - xo::scm::span span_; - }; - - template - printspan_impl printspan(const xo::scm::span& span) { - return printspan_impl(span); - } - - template - inline std::ostream & - operator<< (std::ostream & os, - const printspan_impl & x) - { - for (const CharT * p = x.span_.lo(); p < x.span_.hi(); ++p) - os << *p; - - return os; - } - -#ifndef ppdetail_atomic - template \ - PPDETAIL_ATOMIC_BODY(printspan_impl); - - template \ - PPDETAIL_ATOMIC_BODY(xo::scm::span); -#endif - - } -} /*namespace xo*/ diff --git a/src/tokenizer2/CMakeLists.txt b/src/tokenizer2/CMakeLists.txt index 967535e2..ccf1b551 100644 --- a/src/tokenizer2/CMakeLists.txt +++ b/src/tokenizer2/CMakeLists.txt @@ -10,6 +10,8 @@ set(SELF_SRCS tokentype.cpp) xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS}) +# deps must coordinate with xo-tokenizer/cmake/xo_tokenizer2Config.cmake.in +xo_dependency(${SELF_LIB} xo_arena) xo_dependency(${SELF_LIB} indentlog) # end 
CMakeLists.txt diff --git a/src/tokenizer2/TkInputState.cpp b/src/tokenizer2/TkInputState.cpp index 30db1dbb..1eca02dd 100644 --- a/src/tokenizer2/TkInputState.cpp +++ b/src/tokenizer2/TkInputState.cpp @@ -84,7 +84,8 @@ namespace xo { // for example including leading whitespace. // See discussion in tokenizer scan() method - scope log(XO_DEBUG(debug_flag_)); + scope log(XO_DEBUG(debug_flag_), + xtag("input", input)); /* look ahead to {end of line, end of input}, whichever comes first */ const CharT * sol = input.lo(); diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 00ef4eec..888a0c43 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -6,9 +6,13 @@ #include "Tokenizer.hpp" namespace xo { + using std::byte; + namespace scm { - Tokenizer::Tokenizer(bool debug_flag) - : input_state_{debug_flag} + Tokenizer::Tokenizer(const CircularBufferConfig & config, + bool debug_flag) + : input_buffer_{DCircularBuffer::map(config)}, + input_state_{debug_flag} {} void @@ -108,7 +112,7 @@ namespace xo { auto Tokenizer::assemble_token(std::size_t initial_whitespace, const span_type & token_text, - input_state_type * p_input_state) -> result_type + TkInputState * p_input_state) -> result_type { /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; @@ -600,7 +604,7 @@ namespace xo { auto Tokenizer::assemble_final_token(const span_type & token_text, - input_state_type * p_input_state) -> result_type + TkInputState * p_input_state) -> result_type { return assemble_token(0 /*initial_whitespace*/, token_text, @@ -608,12 +612,43 @@ namespace xo { } auto - Tokenizer::scan(const span_type & input, - bool eof_flag) -> result_type + Tokenizer::buffer_input_line(const char * input_cstr, + bool eof_flag) -> std::pair { scope log(XO_DEBUG(input_state_.debug_flag())); - log && log(xtag("input", input)); + log && log(xtag("input", input_cstr)); + + auto buf_input_0 = input_buffer_.input_range().hi(); + + auto 
remainder = input_buffer_.append + (DCircularBuffer::const_span_type + ((const byte *)input_cstr, + (const byte *)input_cstr + strlen(input_cstr))); + + const char * newline_cstr = "\n"; + auto remainder2 = input_buffer_.append + (DCircularBuffer::const_span_type + ((const byte *)newline_cstr, + (const byte *)newline_cstr + strlen(newline_cstr))); + + if (!remainder.empty() || !remainder2.empty()) { + throw std::runtime_error(tostr("Tokenizer::buffer_line: line too long!", + xtag("remainder.size", remainder.size()))); + } + + auto buf_input_1 = input_buffer_.input_range().hi(); + + span_type input = span_type((const char *)buf_input_0, + (const char *)buf_input_1); + + return this->input_state_.capture_current_line(input, eof_flag); + } + + auto + Tokenizer::scan(const span_type & input) -> result_type + { + scope log(XO_DEBUG(input_state_.debug_flag())); /* - Always at beginning of token when scan() invoked * - scan will not report any portion of line as consumed until it has @@ -625,9 +660,6 @@ namespace xo { * with the same input span multiple times */ - /* automagically no-ops when the same input presented twice */ - this->input_state_.capture_current_line(input, eof_flag); - const CharT * ix = this->input_state_.skip_leading_whitespace(); if(ix == input.hi()) { @@ -789,7 +821,7 @@ namespace xo { * - punctuation */ for (; ix != input.hi(); ++ix) { - if (input_state_type::is_whitespace(*ix) + if (TkInputState::is_whitespace(*ix) || is_1char_punctuation(*ix) || is_2char_punctuation(*ix)) { @@ -829,7 +861,7 @@ namespace xo { return assemble_token(whitespace_z, span_type(tk_start, ix) /*token*/, &(this->input_state_)); - } /*scan*/ + } /*_scan_aux*/ } /*namespace scm*/ } /*namespace xo*/ From a7ed10c16a6011cc59ef0e4ce30da9bbfdfac4a0 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sun, 11 Jan 2026 19:10:42 -0500 Subject: [PATCH 03/33] xo-tokenizer: example tokenrepl restored to wokring order Now with CBufferedInput in Tokenizer --- 
example/tokenrepl/tokenrepl.cpp | 15 ++++++++++++++- src/tokenizer2/Tokenizer.cpp | 14 ++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/example/tokenrepl/tokenrepl.cpp b/example/tokenrepl/tokenrepl.cpp index 0852f028..1cf02244 100644 --- a/example/tokenrepl/tokenrepl.cpp +++ b/example/tokenrepl/tokenrepl.cpp @@ -51,6 +51,8 @@ main() { using xo::scm::operator<<; using xo::mm::CircularBufferConfig; using xo::mm::span; + using xo::scope; + using xo::xtag; using replxx::Replxx; using namespace std; @@ -65,10 +67,13 @@ main() { rx.set_max_history_size(1000); rx.history_load("repl_history.txt"); + constexpr bool c_debug_flag = true; + scope log(XO_DEBUG(c_debug_flag)); + Tokenizer tkz(CircularBufferConfig{.name_ = "tokenrepl-input", .max_capacity_ = 4*1024, .max_captured_span_ = 128}, - true /*debug_flag*/); + c_debug_flag); const char * input_cstr = nullptr;; @@ -84,9 +89,17 @@ main() { if (input_cstr && *input_cstr) { auto [error, input] = tkz.buffer_input_line(input_cstr, false /*!eof*/); + if (log) { + log(xtag("msg", "buffered input line")); + log(xtag("input", input)); + } + + while (!input.empty()) { auto [tk, consumed, error] = tkz.scan(input); + log && log(xtag("consumed", consumed), xtag("tk", tk)); + if (tk.is_valid()) { cout << tk << endl; } else if (error.is_error()) { diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 888a0c43..4fa98a97 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -622,15 +622,9 @@ namespace xo { auto buf_input_0 = input_buffer_.input_range().hi(); auto remainder = input_buffer_.append - (DCircularBuffer::const_span_type - ((const byte *)input_cstr, - (const byte *)input_cstr + strlen(input_cstr))); - - const char * newline_cstr = "\n"; + (DCircularBuffer::const_span_type::from_cstr(input_cstr)); auto remainder2 = input_buffer_.append - (DCircularBuffer::const_span_type - ((const byte *)newline_cstr, - (const byte *)newline_cstr + 
strlen(newline_cstr))); + (DCircularBuffer::const_span_type::from_cstr("\n")); if (!remainder.empty() || !remainder2.empty()) { throw std::runtime_error(tostr("Tokenizer::buffer_line: line too long!", @@ -639,8 +633,8 @@ namespace xo { auto buf_input_1 = input_buffer_.input_range().hi(); - span_type input = span_type((const char *)buf_input_0, - (const char *)buf_input_1); + span_type input = span_type(buf_input_0, + buf_input_1); return this->input_state_.capture_current_line(input, eof_flag); } From f25d1fb7384f60eddd416898a47f775ca4dd21bd Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sun, 18 Jan 2026 17:59:46 -0500 Subject: [PATCH 04/33] xo-reader2 scaffold (fomo+arena version of xo-reader/) [WIP] --- include/xo/tokenizer2/tokentype.hpp | 4 ++-- src/tokenizer2/tokentype.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xo/tokenizer2/tokentype.hpp b/include/xo/tokenizer2/tokentype.hpp index eeeb7dd0..91cb3622 100644 --- a/include/xo/tokenizer2/tokentype.hpp +++ b/include/xo/tokenizer2/tokentype.hpp @@ -164,8 +164,8 @@ namespace xo { /** keyword @c 'end' **/ tk_end, - /** counts number of entries **/ - n_tokentype + /** comes last, counts number of entries **/ + N }; /*tokentype*/ /** String representation for enum value. 
diff --git a/src/tokenizer2/tokentype.cpp b/src/tokenizer2/tokentype.cpp index 33d683de..40c2dbfb 100644 --- a/src/tokenizer2/tokentype.cpp +++ b/src/tokenizer2/tokentype.cpp @@ -60,7 +60,7 @@ namespace xo { CASE(tk_end); case tokentype::tk_invalid: - case tokentype::n_tokentype: + case tokentype::N: return "?tokentype"; } From e3be7ed2de57f76f05643f8061c61c8fce237bf1 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Mon, 19 Jan 2026 00:39:16 -0500 Subject: [PATCH 05/33] xo-reader: + DDefineSsm + utest --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 0994e3b8..7ed490cc 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -132,7 +132,7 @@ namespace xo { /** token representing keyword @c type **/ static Token type() { return Token(tokentype::tk_type); } /** token representing keyword @c def **/ - static Token def() { return Token(tokentype::tk_def); } + static Token def_token() { return Token(tokentype::tk_def); } /** token representing keyword @c lambda **/ static Token lambda() { return Token(tokentype::tk_lambda); } /** token representing keyword @c if **/ From 9d4b50ede4d4b94fd78309ef58cede5b5fd8e603 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Mon, 19 Jan 2026 11:33:14 -0500 Subject: [PATCH 06/33] xo-tokenizer2: cosmetic / minor --- src/tokenizer2/Tokenizer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 4fa98a97..7076a95d 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -595,10 +595,13 @@ namespace xo { tk_text.clear(); } + // TOOD: report tk_text as span, + // but must pin / unpin + /* input.prefix(0): * require caller preserves current input line until it's entirely exhausted */ - return result_type(token_type(tk_type, std::move(tk_text)), + return 
result_type(Token(tk_type, std::move(tk_text)), p_input_state->current_line().prefix(0)); } /*assemble_token*/ From 7fadf9662e807c984b4a365d5a3ed6475a7edb11 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Tue, 20 Jan 2026 22:22:45 -0500 Subject: [PATCH 07/33] xo-reader2: DefineSsm handles colon token after lhs var example: def foo : f64 = 3.14; --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 7ed490cc..fc448106 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -108,7 +108,7 @@ namespace xo { /** token representing comma @c "," **/ static Token comma() { return Token(tokentype::tk_comma); } /** token representing colon @c ":" **/ - static Token colon() { return Token(tokentype::tk_colon); } + static Token colon_token() { return Token(tokentype::tk_colon); } /** token representing double-colo @c "::" **/ static Token doublecolon() { return Token(tokentype::tk_doublecolon); } /** token representing semicolon @c ";" **/ From b738afac9deee9eb074695b38202bf8724f92489 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Wed, 21 Jan 2026 12:59:06 -0500 Subject: [PATCH 08/33] xo-reader2: scaffold on_singleassign_token() in PSM --- include/xo/tokenizer2/Token.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index fc448106..d47b311d 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -101,7 +101,7 @@ namespace xo { static Token rightbracket() { return Token(tokentype::tk_rightbracket); } /** token representing left brace @c "{" **/ static Token leftbrace() { return Token(tokentype::tk_leftbrace); } - /** token representing right brace @c "}' **/ + /** token representing right brace @c "}" **/ static Token rightbrace() { return Token(tokentype::tk_rightbrace); } /** token representing 
period @c "." **/ static Token dot() { return Token(tokentype::tk_dot); } @@ -113,8 +113,8 @@ namespace xo { static Token doublecolon() { return Token(tokentype::tk_doublecolon); } /** token representing semicolon @c ";" **/ static Token semicolon() { return Token(tokentype::tk_semicolon); } - /** token representing single-assignment @c "=" **/ - static Token singleassign() { return Token(tokentype::tk_singleassign); } + /** token representing single-assignment @c "=" (editor bait: equal_token) **/ + static Token singleassign_token() { return Token(tokentype::tk_singleassign); } /** token representing unrestricted assignment @c ":=" **/ static Token assign_token() { return Token(tokentype::tk_assign); } /** token representing indirection @c "->" **/ From 81dcd2eb714fb910f6438d82f91366014895dd02 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Thu, 22 Jan 2026 17:15:05 -0500 Subject: [PATCH 09/33] xo-reader2: + on_parsed_expression_with_semicolon + DefineSsm works --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index d47b311d..cc6e13d9 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -112,7 +112,7 @@ namespace xo { /** token representing double-colo @c "::" **/ static Token doublecolon() { return Token(tokentype::tk_doublecolon); } /** token representing semicolon @c ";" **/ - static Token semicolon() { return Token(tokentype::tk_semicolon); } + static Token semicolon_token() { return Token(tokentype::tk_semicolon); } /** token representing single-assignment @c "=" (editor bait: equal_token) **/ static Token singleassign_token() { return Token(tokentype::tk_singleassign); } /** token representing unrestricted assignment @c ":=" **/ From 5d8f4b4b92ceb5a0a94829ad988d393551d84c22 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Thu, 22 Jan 2026 21:03:40 -0500 Subject: [PATCH 10/33] xo-reader2: working on 
example parser repl --- include/xo/tokenizer2/TokenizerError.hpp | 2 +- include/xo/tokenizer2/scan_result.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xo/tokenizer2/TokenizerError.hpp b/include/xo/tokenizer2/TokenizerError.hpp index a1cb99ee..b8a50988 100644 --- a/include/xo/tokenizer2/TokenizerError.hpp +++ b/include/xo/tokenizer2/TokenizerError.hpp @@ -99,7 +99,7 @@ namespace xo { size_t error_pos_ = 0; ///@} - }; /*error_token*/ + }; inline std::ostream & operator<< (std::ostream & os, diff --git a/include/xo/tokenizer2/scan_result.hpp b/include/xo/tokenizer2/scan_result.hpp index 249154f1..45718c5c 100644 --- a/include/xo/tokenizer2/scan_result.hpp +++ b/include/xo/tokenizer2/scan_result.hpp @@ -28,9 +28,9 @@ namespace xo { **/ class scan_result { public: - using CharT = char; + //using CharT = char; using token_type = Token; - using span_type = xo::mm::span; + using span_type = xo::mm::span; using error_type = TokenizerError; using input_state_type = TkInputState; From 9044e1d196418a3189d982b8c564556ea38e106c Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 23 Jan 2026 11:54:32 -0500 Subject: [PATCH 11/33] xo-reader2: + example app 'readerreplxx' --- example/tokenrepl/tokenrepl.cpp | 45 ++++++++++++------------ include/xo/tokenizer2/Tokenizer.hpp | 5 +-- include/xo/tokenizer2/TokenizerError.hpp | 16 ++++++--- src/tokenizer2/Tokenizer.cpp | 17 +++++---- 4 files changed, 46 insertions(+), 37 deletions(-) diff --git a/example/tokenrepl/tokenrepl.cpp b/example/tokenrepl/tokenrepl.cpp index 1cf02244..d8ddbd7f 100644 --- a/example/tokenrepl/tokenrepl.cpp +++ b/example/tokenrepl/tokenrepl.cpp @@ -85,32 +85,33 @@ main() { { //cout << "input: " << input << endl; + auto input_ext = Tokenizer::span_type::from_cstr(input_cstr); + // reminder: input may contain multiple tokens - if (input_cstr && *input_cstr) { - auto [error, input] = tkz.buffer_input_line(input_cstr, false /*!eof*/); + auto [error, input] = 
tkz.buffer_input_line(input_ext, false /*!eof*/); - if (log) { - log(xtag("msg", "buffered input line")); - log(xtag("input", input)); + if (log) { + log(xtag("msg", "buffered input line")); + log(xtag("input", input)); + } + + while (!input.empty()) + { + auto [tk, consumed, error] = tkz.scan(input); + + log && log(xtag("consumed", consumed), xtag("tk", tk)); + + if (tk.is_valid()) { + cout << tk << endl; + } else if (error.is_error()) { + cout << "tokenizer error: " << endl; + + error.report(cout); + + break; } - while (!input.empty()) - { - auto [tk, consumed, error] = tkz.scan(input); - - log && log(xtag("consumed", consumed), xtag("tk", tk)); - - if (tk.is_valid()) { - cout << tk << endl; - } else if (error.is_error()) { - cout << "tokenizer error: " << endl; - error.report(cout); - - break; - } - - input = input.after_prefix(consumed); - } + input = input.after_prefix(consumed); } /* here: input.empty() or error encountered */ diff --git a/include/xo/tokenizer2/Tokenizer.hpp b/include/xo/tokenizer2/Tokenizer.hpp index 40a98cd9..69843a5a 100644 --- a/include/xo/tokenizer2/Tokenizer.hpp +++ b/include/xo/tokenizer2/Tokenizer.hpp @@ -129,10 +129,11 @@ namespace xo { **/ bool has_prefix() const { return !prefix_.empty(); } - /** buffer contents of input_cstr. + /** copy into buffer the contents of @p input. * May throw if buffer space exhausted **/ - std::pair buffer_input_line(const char * input_cstr, bool eof_flag); + std::pair buffer_input_line(span_type input, + bool eof_flag); /** scan for next input token, given @p input. 
* Note: diff --git a/include/xo/tokenizer2/TokenizerError.hpp b/include/xo/tokenizer2/TokenizerError.hpp index b8a50988..bf7702b1 100644 --- a/include/xo/tokenizer2/TokenizerError.hpp +++ b/include/xo/tokenizer2/TokenizerError.hpp @@ -32,7 +32,7 @@ namespace xo { * @p tk_start current position on entry to scanner * @p error_pos error location relative to token start **/ - TokenizerError(const char * src_function, + TokenizerError(std::string_view src_function, std::string error_description, const TkInputState & input_state, size_t error_pos) @@ -46,12 +46,20 @@ namespace xo { log && log(xtag("input_state.current_pos", input_state.current_pos()), xtag("error_pos", error_pos)); } + + TokenizerError with_error(std::string_view error_src_fn, + std::string error_msg) { + return TokenizerError(error_src_fn, + std::string(error_msg), + this->input_state_, + 0 /*error_pos*/); + } ///@} /** @defgroup tokenizer-error-access-methods **/ ///@{ - const char * src_function() const { return src_function_; } + std::string_view src_function() const { return src_function_; } const std::string & error_description() const { return error_description_; } #pragma GCC diagnostic push #ifndef __APPLE__ @@ -88,8 +96,8 @@ namespace xo { ///@{ /** source location (in tokenizer) at which error identified **/ - char const * src_function_ = nullptr; - /** static error description **/ + std::string_view src_function_; + /** error description **/ std::string error_description_; /** input state associated with this error. * Sufficient to precisely locate it with context. 
diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 7076a95d..2784072a 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -615,19 +615,18 @@ namespace xo { } auto - Tokenizer::buffer_input_line(const char * input_cstr, + Tokenizer::buffer_input_line(span_type input_ext, bool eof_flag) -> std::pair { scope log(XO_DEBUG(input_state_.debug_flag())); - log && log(xtag("input", input_cstr)); + log && log(xtag("input_ext", input_ext)); auto buf_input_0 = input_buffer_.input_range().hi(); - auto remainder = input_buffer_.append - (DCircularBuffer::const_span_type::from_cstr(input_cstr)); - auto remainder2 = input_buffer_.append - (DCircularBuffer::const_span_type::from_cstr("\n")); + auto remainder = input_buffer_.append(input_ext); + auto remainder2 = input_buffer_.append(span_type::from_cstr("\n")); + //(DCircularBuffer::const_span_type::from_cstr("\n")); if (!remainder.empty() || !remainder2.empty()) { throw std::runtime_error(tostr("Tokenizer::buffer_line: line too long!", @@ -636,10 +635,10 @@ namespace xo { auto buf_input_1 = input_buffer_.input_range().hi(); - span_type input = span_type(buf_input_0, - buf_input_1); + span_type input_ours = span_type(buf_input_0, + buf_input_1); - return this->input_state_.capture_current_line(input, eof_flag); + return this->input_state_.capture_current_line(input_ours, eof_flag); } auto From 7432a0bd1d75350e640d18d50c76d6c8bddc9dc8 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 23 Jan 2026 14:57:43 -0500 Subject: [PATCH 12/33] xo-reader2: readerreplxx works + streamline debugging --- include/xo/tokenizer2/Tokenizer.hpp | 8 ++--- src/tokenizer2/Tokenizer.cpp | 51 ++++++++++++----------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/include/xo/tokenizer2/Tokenizer.hpp b/include/xo/tokenizer2/Tokenizer.hpp index 69843a5a..3dc6da11 100644 --- a/include/xo/tokenizer2/Tokenizer.hpp +++ b/include/xo/tokenizer2/Tokenizer.hpp @@ -109,19 +109,19 @@ 
namespace xo { static bool is_2char_punctuation(CharT ch); /** assemble token from text @p token_text. - * @p initial_whitespace Amount of whitespace input being consumed from input. + * @p ws_span whitespace preceding token * @p token_text subset of input_line representing a single token. * @p p_input_state input state containing input_line. On exit current line cleared * if error * * retval.consumed will represent some possibly-empty prefix of @p input **/ - static scan_result assemble_token(std::size_t initial_whitespace, - const span_type & token_text, + static scan_result assemble_token( span_type ws_span, + span_type token_text, TkInputState * p_input_state); /** degenerate version of assemble_token() on reaching end-of-file **/ - static scan_result assemble_final_token(const span_type & token_text, + static scan_result assemble_final_token(span_type token_text, TkInputState * p_input_state); /** true if tokenizer contains stored prefix of diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 2784072a..c79e10c3 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -110,8 +110,8 @@ namespace xo { } auto - Tokenizer::assemble_token(std::size_t initial_whitespace, - const span_type & token_text, + Tokenizer::assemble_token(span_type ws_span, + span_type token_text, TkInputState * p_input_state) -> result_type { /* literal|pretty|streamlined */ @@ -119,7 +119,7 @@ namespace xo { scope log(XO_DEBUG(p_input_state->debug_flag())); log && log(xtag("token_text", token_text), - xtag("initial_whitespace", initial_whitespace), + xtag("initial_whitespace", ws_span.size()), xtag("input_state", *p_input_state)); tokentype tk_type = tokentype::tk_invalid; @@ -598,18 +598,16 @@ namespace xo { // TOOD: report tk_text as span, // but must pin / unpin - /* input.prefix(0): - * require caller preserves current input line until it's entirely exhausted - */ return result_type(Token(tk_type, std::move(tk_text)), - 
p_input_state->current_line().prefix(0)); + span_type::concat(ws_span, + span_type(tk_start, tk_end))); } /*assemble_token*/ auto - Tokenizer::assemble_final_token(const span_type & token_text, + Tokenizer::assemble_final_token(span_type token_text, TkInputState * p_input_state) -> result_type { - return assemble_token(0 /*initial_whitespace*/, + return assemble_token(token_text.prefix(0) /*ws_span*/, token_text, p_input_state); } @@ -645,6 +643,7 @@ namespace xo { Tokenizer::scan(const span_type & input) -> result_type { scope log(XO_DEBUG(input_state_.debug_flag())); + log && log(xtag("input", input)); /* - Always at beginning of token when scan() invoked * - scan will not report any portion of line as consumed until it has @@ -659,12 +658,14 @@ namespace xo { const CharT * ix = this->input_state_.skip_leading_whitespace(); if(ix == input.hi()) { - log && log("end input -> consume current line"); + log && log("end buffered input -> consume current line"); /* entirety of current line has been tokenized * -> caller may consume it */ - return result_type::make_whitespace(this->input_state_.consume_current_line()); + this->input_state_.consume_current_line(); + + return result_type::make_whitespace(input); } /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */ @@ -697,27 +698,17 @@ namespace xo { ++ix; -#ifdef OBSOLETE // no longer a thing. 
either input ends in whitespace, or ends translation unit - if (ix == input.hi()) { - /* need more input to know if/when token complete */ - this->prefix_ += std::string(tk_start, input.hi()); + CharT ch2 = *ix; - log && log(xtag("captured-prefix1", this->prefix_)); - } else -#endif - { - CharT ch2 = *ix; - - if (((ch2 >= '0') && (ch2 <= '9')) - || ((ch2 >= 'A') && (ch2 <= 'Z')) - || ((ch2 >= 'a') && (ch2 <= 'z'))) + if (((ch2 >= '0') && (ch2 <= '9')) + || ((ch2 >= 'A') && (ch2 <= 'Z')) + || ((ch2 >= 'a') && (ch2 <= 'z'))) { /* treat as 1 char punctuation */ ; } else { - /* include next char */ - ++ix; - } + /* include next char */ + ++ix; } } else if (*ix == '"') { bool complete_flag = false; @@ -779,7 +770,7 @@ namespace xo { this->input_state_.advance_until(ix); - return assemble_token(whitespace_z, + return assemble_token(span_type(input.lo(), tk_start), span_type(tk_start, ix) /*token*/, &(this->input_state_)); } @@ -803,7 +794,7 @@ namespace xo { this->input_state_.advance_until(ix); /* ignore next char and complete token */ - return assemble_token(whitespace_z, + return assemble_token(span_type(input.lo(), tk_start), span_type(tk_start, ix) /*token*/, &(this->input_state_)); } @@ -854,7 +845,7 @@ namespace xo { this->input_state_.advance_until(ix); - return assemble_token(whitespace_z, + return assemble_token(span_type(input.lo(), tk_start), span_type(tk_start, ix) /*token*/, &(this->input_state_)); } /*_scan_aux*/ From f0cd32c05f3c2ac1d7e3d29eb520215ba4958b55 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Tue, 27 Jan 2026 15:50:10 -0500 Subject: [PATCH 13/33] xo-reader2: support if-then-else expressions. 
+ detailed utest --- include/xo/tokenizer2/Token.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index cc6e13d9..38e73902 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -137,6 +137,8 @@ namespace xo { static Token lambda() { return Token(tokentype::tk_lambda); } /** token representing keyword @c if **/ static Token if_token() { return Token(tokentype::tk_if); } + /** token representing keyword @c then **/ + static Token then_token() { return Token(tokentype::tk_then); } /** token representing keyword @c else **/ static Token else_token() { return Token(tokentype::tk_else); } /** token representing keyword @c let **/ From a28b45e4f2c51d7df9b1397666e9d1d6db808d5d Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Wed, 28 Jan 2026 10:57:55 -0500 Subject: [PATCH 14/33] xo-reader2 xo-expression2: + DLambdaSsm [WIP] --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 38e73902..f9807d05 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -134,7 +134,7 @@ namespace xo { /** token representing keyword @c def **/ static Token def_token() { return Token(tokentype::tk_def); } /** token representing keyword @c lambda **/ - static Token lambda() { return Token(tokentype::tk_lambda); } + static Token lambda_token() { return Token(tokentype::tk_lambda); } /** token representing keyword @c if **/ static Token if_token() { return Token(tokentype::tk_if); } /** token representing keyword @c then **/ From 0f4e270707dc8a2278d70211b4a4d9b996a58c0f Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Wed, 28 Jan 2026 17:40:57 -0500 Subject: [PATCH 15/33] xo-reader2: + DExpectFormalArgSsm [WIP] --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index f9807d05..55a2d57d 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -92,7 +92,7 @@ namespace xo { /** token representing right angle bracket @c ">" **/ static Token rightangle() { return Token(tokentype::tk_rightangle); } /** token representing left parenthesis @c "(" **/ - static Token leftparen() { return Token(tokentype::tk_leftparen); } + static Token leftparen_token() { return Token(tokentype::tk_leftparen); } /** Token representing right parenthesis @c ")" **/ static Token rightparen() { return Token(tokentype::tk_rightparen); } /** token representing left bracket @c "[" **/ From 83d210b96824d863690b2dffd5f5fddb215aad65 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Thu, 29 Jan 2026 13:48:24 -0500 Subject: [PATCH 16/33] xo-reader2: DExpectFormalArglistSsm parses multiple formals --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 55a2d57d..607bc0a4 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -106,7 +106,7 @@ namespace xo { /** token representing period @c "." 
**/ static Token dot() { return Token(tokentype::tk_dot); } /** token representing comma @c "," **/ - static Token comma() { return Token(tokentype::tk_comma); } + static Token comma_token() { return Token(tokentype::tk_comma); } /** token representing colon @c ":" **/ static Token colon_token() { return Token(tokentype::tk_colon); } /** token representing double-colo @c "::" **/ From 900d675caac058e24e5113d78dc5e7d99c99be93 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 30 Jan 2026 10:26:35 -0500 Subject: [PATCH 17/33] xo-expression2 xo-reader2: local symtab stack in PSM --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 607bc0a4..66427c3e 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -94,7 +94,7 @@ namespace xo { /** token representing left parenthesis @c "(" **/ static Token leftparen_token() { return Token(tokentype::tk_leftparen); } /** Token representing right parenthesis @c ")" **/ - static Token rightparen() { return Token(tokentype::tk_rightparen); } + static Token rightparen_token() { return Token(tokentype::tk_rightparen); } /** token representing left bracket @c "[" **/ static Token leftbracket() { return Token(tokentype::tk_leftbracket); } /** token representing right bracket @c "]" **/ From e3006f32666983e2040811f5512a1b93d9b35f15 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 30 Jan 2026 12:41:09 -0500 Subject: [PATCH 18/33] xo-reader2: + assemble lambda function type in DLambdaSsm --- include/xo/tokenizer2/Token.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 66427c3e..fc5dfc9a 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -100,7 +100,7 @@ namespace xo { /** token representing right bracket @c "]" **/ static Token rightbracket() 
{ return Token(tokentype::tk_rightbracket); } /** token representing left brace @c "{" **/ - static Token leftbrace() { return Token(tokentype::tk_leftbrace); } + static Token leftbrace_token() { return Token(tokentype::tk_leftbrace); } /** token representing right brace @c "}" **/ static Token rightbrace() { return Token(tokentype::tk_rightbrace); } /** token representing period @c "." **/ @@ -117,8 +117,8 @@ namespace xo { static Token singleassign_token() { return Token(tokentype::tk_singleassign); } /** token representing unrestricted assignment @c ":=" **/ static Token assign_token() { return Token(tokentype::tk_assign); } - /** token representing indirection @c "->" **/ - static Token yields() { return Token(tokentype::tk_yields); } + /** token representing indirection @c "->" / function return type **/ + static Token yields_token() { return Token(tokentype::tk_yields); } /** token for @c "+" **/ static Token plus_token() { return Token(tokentype::tk_plus); } From 75b9e62c842cf1777e7dab1236256dc65df11826 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sat, 31 Jan 2026 21:33:39 -0500 Subject: [PATCH 19/33] xo-reader2: DLambdaSsm work towards producing DLambdaExpr [WIP] --- include/xo/tokenizer2/Token.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index fc5dfc9a..ab3f0cb7 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -102,7 +102,7 @@ namespace xo { /** token representing left brace @c "{" **/ static Token leftbrace_token() { return Token(tokentype::tk_leftbrace); } /** token representing right brace @c "}" **/ - static Token rightbrace() { return Token(tokentype::tk_rightbrace); } + static Token rightbrace_token() { return Token(tokentype::tk_rightbrace); } /** token representing period @c "." 
**/ static Token dot() { return Token(tokentype::tk_dot); } /** token representing comma @c "," **/ From 415a382442e496d2ac04fd2942aa68f84e9ee1a0 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Mon, 2 Feb 2026 21:55:34 -0500 Subject: [PATCH 20/33] xo-interpreter2: scaffold repl + alloc measurement framework --- include/xo/tokenizer2/Tokenizer.hpp | 6 ++++++ src/tokenizer2/Tokenizer.cpp | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/xo/tokenizer2/Tokenizer.hpp b/include/xo/tokenizer2/Tokenizer.hpp index 3dc6da11..83015c03 100644 --- a/include/xo/tokenizer2/Tokenizer.hpp +++ b/include/xo/tokenizer2/Tokenizer.hpp @@ -61,6 +61,7 @@ namespace xo { using error_type = TokenizerError; using DCircularBuffer = xo::mm::DCircularBuffer; using CircularBufferConfig = xo::mm::CircularBufferConfig; + using MemorySizeInfo = xo::mm::MemorySizeInfo; using span_type = xo::mm::span; //using input_state_type = TkInputState; using result_type = scan_result; @@ -90,6 +91,11 @@ namespace xo { const TkInputState & input_state() const { return input_state_; } #pragma GCC diagnostic pop + /** number of distinct memory pools owned by tokenizer **/ + std::size_t _n_store() const noexcept; + /** memory consumption for i'th memory pool **/ + MemorySizeInfo _store_info(std::size_t i) const noexcept; + ///@} /** @defgroup tokenizer-general-methods tokenizer methods **/ diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index c79e10c3..c36d85a5 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -6,6 +6,7 @@ #include "Tokenizer.hpp" namespace xo { + using xo::mm::MemorySizeInfo; using std::byte; namespace scm { @@ -21,6 +22,18 @@ namespace xo { this->input_state_.discard_current_line(); } + std::size_t + Tokenizer::_n_store() const noexcept + { + return input_buffer_._n_store(); + } + + MemorySizeInfo + Tokenizer::_store_info(std::size_t i) const noexcept + { + return input_buffer_._store_info(i); + } + bool
Tokenizer::is_1char_punctuation(CharT ch) { From 3f5bd39ed8e5a38a7d44052c7be29b73a62d5de3 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Tue, 3 Feb 2026 01:05:36 -0500 Subject: [PATCH 21/33] xo-interpreter2 .. xo-arena. memory pool introspection --- include/xo/tokenizer2/Tokenizer.hpp | 8 +++----- src/tokenizer2/Tokenizer.cpp | 12 +++--------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/include/xo/tokenizer2/Tokenizer.hpp b/include/xo/tokenizer2/Tokenizer.hpp index 83015c03..0a212a8f 100644 --- a/include/xo/tokenizer2/Tokenizer.hpp +++ b/include/xo/tokenizer2/Tokenizer.hpp @@ -61,7 +61,7 @@ namespace xo { using error_type = TokenizerError; using DCircularBuffer = xo::mm::DCircularBuffer; using CircularBufferConfig = xo::mm::CircularBufferConfig; - using MemorySizeInfo = xo::mm::MemorySizeInfo; + using MemorySizeVisitor = xo::mm::MemorySizeVisitor; using span_type = xo::mm::span; //using input_state_type = TkInputState; using result_type = scan_result; @@ -91,10 +91,8 @@ namespace xo { const TkInputState & input_state() const { return input_state_; } #pragma GCC diagnostic pop - /** number of distinct memory pools owned by tokenizer **/ - std::size_t _n_store() const noexcept; - /** memory consumption for i'th memory pool **/ - MemorySizeInfo _store_info(std::size_t i) const noexcept; + /** visit tokenizer-owned memory pools; invoke visitor(info) for each one **/ + void visit_pools(const MemorySizeVisitor & visitor) const; ///@} diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index c36d85a5..f176a88f 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -22,16 +22,10 @@ namespace xo { this->input_state_.discard_current_line(); } - std::size_t - Tokenizer::_n_store() const noexcept + void + Tokenizer::visit_pools(const MemorySizeVisitor & visitor) const { - return input_buffer_._n_store(); - } - - MemorySizeInfo - Tokenizer::_store_info(std::size_t i) const noexcept - { - return 
input_buffer_._store_info(i); + input_buffer_.visit_pools(visitor); } bool From 0baa458c5b2770c8ff5bb49f35c0426237d04abf Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 13 Feb 2026 17:24:23 -0500 Subject: [PATCH 22/33] xo-reader2 stack: handle comparison expression (x == y) --- include/xo/tokenizer2/Token.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index ab3f0cb7..b211f967 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -129,6 +129,9 @@ namespace xo { /** token for @c "/" **/ static Token slash_token() { return Token(tokentype::tk_slash); } + /** token for @c "==" **/ + static Token cmpeq_token() { return Token(tokentype::tk_cmpeq); } + /** token representing keyword @c type **/ static Token type() { return Token(tokentype::tk_type); } /** token representing keyword @c def **/ From 6d039c03e6420168003aa48b19f459ef78b9578a Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 27 Feb 2026 19:38:53 +1100 Subject: [PATCH 23/33] xo-cmake: setup to make share target available via cmake install --- cmake/xo_tokenizer2Config.cmake.in | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/xo_tokenizer2Config.cmake.in b/cmake/xo_tokenizer2Config.cmake.in index 13f1dac1..eccd2745 100644 --- a/cmake/xo_tokenizer2Config.cmake.in +++ b/cmake/xo_tokenizer2Config.cmake.in @@ -10,4 +10,5 @@ find_dependency(xo_arena) find_dependency(indentlog) include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Share.cmake") check_required_components("@PROJECT_NAME@") From 9920812d4297b0f63abe160264ad1a5c707ca4d8 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sun, 1 Mar 2026 13:06:57 +1100 Subject: [PATCH 24/33] xo-reader2 stack: + #q token + QuoteSsm [WIP - not functional] --- include/xo/tokenizer2/Token.hpp | 2 ++ include/xo/tokenizer2/tokentype.hpp | 3 +++ src/tokenizer2/Tokenizer.cpp | 13 
+++++++++++++ src/tokenizer2/tokentype.cpp | 5 +++-- 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index b211f967..9ddd0181 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -87,6 +87,8 @@ namespace xo { static Token symbol_token(const std::string & txt) { return Token(tokentype::tk_symbol, txt); } + /** token representing quote @c "'" **/ + static Token quote() { return Token(tokentype::tk_quote); } /** token representing left angle bracket @c "<" **/ static Token leftangle() { return Token(tokentype::tk_leftangle); } /** token representing right angle bracket @c ">" **/ diff --git a/include/xo/tokenizer2/tokentype.hpp b/include/xo/tokenizer2/tokentype.hpp index 91cb3622..3f259f8d 100644 --- a/include/xo/tokenizer2/tokentype.hpp +++ b/include/xo/tokenizer2/tokentype.hpp @@ -64,6 +64,9 @@ namespace xo { /** a symbol **/ tk_symbol, + /** quote @c ' **/ + tk_quote, + /** left-hand parenthesis @c '(' **/ tk_leftparen, diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index f176a88f..8821cd65 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -32,6 +32,8 @@ namespace xo { Tokenizer::is_1char_punctuation(CharT ch) { switch(ch) { + case '\'': + return true; case '(': return true; case ')': @@ -418,6 +420,15 @@ namespace xo { break; } + case '\'': + { + log && log("quote token"); + + tk_type = tokentype::tk_quote; + ++ix; + + break; + } case 'a': case 'A': case 'b': case 'B': case 'c': case 'C': @@ -593,6 +604,8 @@ namespace xo { tk_type = tokentype::tk_in; } else if (tk_text == "end") { tk_type = tokentype::tk_end; + } else if (tk_text == "#q") { + tk_type = tokentype::tk_quote; } else { /* keep as symbol */ keep_text = true; diff --git a/src/tokenizer2/tokentype.cpp b/src/tokenizer2/tokentype.cpp index 40c2dbfb..0831940f 100644 --- a/src/tokenizer2/tokentype.cpp +++ b/src/tokenizer2/tokentype.cpp @@ 
-18,16 +18,17 @@ namespace xo { CASE(tk_f64); CASE(tk_string); CASE(tk_symbol); - CASE(tk_leftparen); + CASE(tk_quote); + CASE(tk_leftparen); CASE(tk_rightparen); CASE(tk_leftbracket); CASE(tk_rightbracket); CASE(tk_leftbrace); CASE(tk_rightbrace); - CASE(tk_leftangle); CASE(tk_rightangle); + CASE(tk_lessequal); CASE(tk_greatequal); CASE(tk_dot); From 650a9fa95f1851dc92720a289470ab37ec44ac8f Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Mon, 2 Mar 2026 11:05:12 +1100 Subject: [PATCH 25/33] xo-interpreter2 stack: handle operator expressions w/ qliterals --- include/xo/tokenizer2/Token.hpp | 2 +- src/tokenizer2/Tokenizer.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 9ddd0181..9c748933 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -88,7 +88,7 @@ namespace xo { return Token(tokentype::tk_symbol, txt); } /** token representing quote @c "'" **/ - static Token quote() { return Token(tokentype::tk_quote); } + static Token quote_token() { return Token(tokentype::tk_quote); } /** token representing left angle bracket @c "<" **/ static Token leftangle() { return Token(tokentype::tk_leftangle); } /** token representing right angle bracket @c ">" **/ diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 8821cd65..323c2d8d 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -429,6 +429,7 @@ namespace xo { break; } + case '#': case 'a': case 'A': case 'b': case 'B': case 'c': case 'C': From 74642dfcfafc540e99a1b92a710f802025ebd91d Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Tue, 3 Mar 2026 12:12:09 +1100 Subject: [PATCH 26/33] xo-interpreter2 stack: + literal array parsing --- include/xo/tokenizer2/Token.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 9c748933..5f8be733 100644 
--- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -98,9 +98,9 @@ namespace xo { /** Token representing right parenthesis @c ")" **/ static Token rightparen_token() { return Token(tokentype::tk_rightparen); } /** token representing left bracket @c "[" **/ - static Token leftbracket() { return Token(tokentype::tk_leftbracket); } + static Token leftbracket_token() { return Token(tokentype::tk_leftbracket); } /** token representing right bracket @c "]" **/ - static Token rightbracket() { return Token(tokentype::tk_rightbracket); } + static Token rightbracket_token() { return Token(tokentype::tk_rightbracket); } /** token representing left brace @c "{" **/ static Token leftbrace_token() { return Token(tokentype::tk_leftbrace); } /** token representing right brace @c "}" **/ From 458fd04ca2b89c48673dd216d34ff930bd0d065b Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Wed, 11 Mar 2026 07:49:14 -0500 Subject: [PATCH 27/33] xo-reader2 stack: expand symbol table to store typedefs + typedef utest + misc qol policy choices --- include/xo/tokenizer2/Token.hpp | 6 ++++-- include/xo/tokenizer2/tokentype.hpp | 3 +++ src/tokenizer2/Tokenizer.cpp | 2 ++ src/tokenizer2/tokentype.cpp | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 5f8be733..c7c43287 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -111,8 +111,8 @@ namespace xo { static Token comma_token() { return Token(tokentype::tk_comma); } /** token representing colon @c ":" **/ static Token colon_token() { return Token(tokentype::tk_colon); } - /** token representing double-colo @c "::" **/ - static Token doublecolon() { return Token(tokentype::tk_doublecolon); } + /** token representing double-colon @c "::" **/ + static Token doublecolon_token() { return Token(tokentype::tk_doublecolon); } /** token representing semicolon @c ";" **/ static Token semicolon_token() { 
return Token(tokentype::tk_semicolon); } /** token representing single-assignment @c "=" (editor bait: equal_token) **/ @@ -138,6 +138,8 @@ namespace xo { static Token type() { return Token(tokentype::tk_type); } /** token representing keyword @c def **/ static Token def_token() { return Token(tokentype::tk_def); } + /** token representing keyword @c deftype **/ + static Token deftype_token() { return Token(tokentype::tk_deftype); } /** token representing keyword @c lambda **/ static Token lambda_token() { return Token(tokentype::tk_lambda); } /** token representing keyword @c if **/ diff --git a/include/xo/tokenizer2/tokentype.hpp b/include/xo/tokenizer2/tokentype.hpp index 3f259f8d..d0290b05 100644 --- a/include/xo/tokenizer2/tokentype.hpp +++ b/include/xo/tokenizer2/tokentype.hpp @@ -146,6 +146,9 @@ namespace xo { /** keyword @c 'def' **/ tk_def, + /** keyword @c 'deftype' **/ + tk_deftype, + /** keyword @c 'lambda' **/ tk_lambda, diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 323c2d8d..f6ac7c2f 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -591,6 +591,8 @@ namespace xo { tk_type = tokentype::tk_type; } else if (tk_text == "def") { tk_type = tokentype::tk_def; + } else if (tk_text == "deftype") { + tk_type = tokentype::tk_deftype; } else if (tk_text == "lambda") { tk_type = tokentype::tk_lambda; } else if (tk_text == "if") { diff --git a/src/tokenizer2/tokentype.cpp b/src/tokenizer2/tokentype.cpp index 0831940f..7df59eec 100644 --- a/src/tokenizer2/tokentype.cpp +++ b/src/tokenizer2/tokentype.cpp @@ -51,6 +51,7 @@ namespace xo { CASE(tk_type); CASE(tk_def); + CASE(tk_deftype); CASE(tk_lambda); CASE(tk_if); CASE(tk_then); From caa8e31d025ad962ca34e31fc5597ea0e076af26 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Wed, 11 Mar 2026 14:13:48 -0500 Subject: [PATCH 28/33] xo-reader2: parse list types + utest --- include/xo/tokenizer2/Token.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index c7c43287..7537a2b0 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -90,9 +90,9 @@ namespace xo { /** token representing quote @c "'" **/ static Token quote_token() { return Token(tokentype::tk_quote); } /** token representing left angle bracket @c "<" **/ - static Token leftangle() { return Token(tokentype::tk_leftangle); } + static Token leftangle_token() { return Token(tokentype::tk_leftangle); } /** token representing right angle bracket @c ">" **/ - static Token rightangle() { return Token(tokentype::tk_rightangle); } + static Token rightangle_token() { return Token(tokentype::tk_rightangle); } /** token representing left parenthesis @c "(" **/ static Token leftparen_token() { return Token(tokentype::tk_leftparen); } /** Token representing right parenthesis @c ")" **/ From 6f95f38373e4c8d0bcf6b3de106d1422e56d2845 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Wed, 11 Mar 2026 16:19:40 -0500 Subject: [PATCH 29/33] xo-interpreter2: + nil + cons --- include/xo/tokenizer2/Token.hpp | 2 ++ include/xo/tokenizer2/tokentype.hpp | 3 +++ src/tokenizer2/Tokenizer.cpp | 2 ++ src/tokenizer2/tokentype.cpp | 1 + 4 files changed, 8 insertions(+) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 7537a2b0..0968a9e9 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -134,6 +134,8 @@ namespace xo { /** token for @c "==" **/ static Token cmpeq_token() { return Token(tokentype::tk_cmpeq); } + /** token representing keyword @c nil **/ + static Token nil_token() { return Token(tokentype::tk_nil); } /** token representing keyword @c type **/ static Token type() { return Token(tokentype::tk_type); } /** token representing keyword @c def **/ diff --git a/include/xo/tokenizer2/tokentype.hpp b/include/xo/tokenizer2/tokentype.hpp index d0290b05..5f7e1937 100644 --- 
a/include/xo/tokenizer2/tokentype.hpp +++ b/include/xo/tokenizer2/tokentype.hpp @@ -140,6 +140,9 @@ namespace xo { /** operator @c '!=' **/ tk_cmpne, + /** keyword @c 'nil' **/ + tk_nil, + /** keyword @c 'type' **/ tk_type, diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index f6ac7c2f..2a6f8ed9 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -587,6 +587,8 @@ namespace xo { if ((tk_text == "true") || (tk_text == "false")) { tk_type = tokentype::tk_bool; keep_text = true; + } else if (tk_text == "nil") { + tk_type = tokentype::tk_nil; } else if (tk_text == "type") { tk_type = tokentype::tk_type; } else if (tk_text == "def") { diff --git a/src/tokenizer2/tokentype.cpp b/src/tokenizer2/tokentype.cpp index 7df59eec..b8a013da 100644 --- a/src/tokenizer2/tokentype.cpp +++ b/src/tokenizer2/tokentype.cpp @@ -49,6 +49,7 @@ namespace xo { CASE(tk_cmpeq); CASE(tk_cmpne); + CASE(tk_nil); CASE(tk_type); CASE(tk_def); CASE(tk_deftype); From 41f704f7ab484a94154d82ba3c4919affeda7f4e Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Thu, 12 Mar 2026 21:08:58 -0500 Subject: [PATCH 30/33] xo-reader2 stack: support op<= --- include/xo/tokenizer2/tokentype.hpp | 2 +- src/tokenizer2/Tokenizer.cpp | 2 +- src/tokenizer2/tokentype.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xo/tokenizer2/tokentype.hpp b/include/xo/tokenizer2/tokentype.hpp index 5f7e1937..18857af9 100644 --- a/include/xo/tokenizer2/tokentype.hpp +++ b/include/xo/tokenizer2/tokentype.hpp @@ -92,7 +92,7 @@ namespace xo { tk_rightangle, /** less-equal @c '<=' **/ - tk_lessequal, + tk_cmple, /** great-equal @c '>=' **/ tk_greatequal, diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 2a6f8ed9..76881e7f 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -486,7 +486,7 @@ namespace xo { log && log("leftangle or lessequal token"); if (*(ix + 1) == '=') { - tk_type = 
tokentype::tk_lessequal; + tk_type = tokentype::tk_cmple; ++ix; ++ix; } else { diff --git a/src/tokenizer2/tokentype.cpp b/src/tokenizer2/tokentype.cpp index b8a013da..e704b38d 100644 --- a/src/tokenizer2/tokentype.cpp +++ b/src/tokenizer2/tokentype.cpp @@ -29,7 +29,7 @@ namespace xo { CASE(tk_leftangle); CASE(tk_rightangle); - CASE(tk_lessequal); + CASE(tk_cmple); CASE(tk_greatequal); CASE(tk_dot); CASE(tk_comma); From 4e7b58e3c95d0726d777f5310fd4eca5f5f93b76 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Thu, 12 Mar 2026 23:41:21 -0500 Subject: [PATCH 31/33] xo-reader2: + op>= support --- include/xo/tokenizer2/tokentype.hpp | 2 +- src/tokenizer2/Tokenizer.cpp | 2 +- src/tokenizer2/tokentype.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/xo/tokenizer2/tokentype.hpp b/include/xo/tokenizer2/tokentype.hpp index 18857af9..a5538131 100644 --- a/include/xo/tokenizer2/tokentype.hpp +++ b/include/xo/tokenizer2/tokentype.hpp @@ -95,7 +95,7 @@ namespace xo { tk_cmple, /** great-equal @c '>=' **/ - tk_greatequal, + tk_cmpge, /** dot @c '.' 
**/ tk_dot, diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 76881e7f..88f03755 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -500,7 +500,7 @@ namespace xo { log && log("rightangle or greatequal token"); if (*(ix + 1) == '=') { - tk_type = tokentype::tk_greatequal; + tk_type = tokentype::tk_cmpge; ++ix; ++ix; } else { diff --git a/src/tokenizer2/tokentype.cpp b/src/tokenizer2/tokentype.cpp index e704b38d..c9749a34 100644 --- a/src/tokenizer2/tokentype.cpp +++ b/src/tokenizer2/tokentype.cpp @@ -30,7 +30,7 @@ namespace xo { CASE(tk_rightangle); CASE(tk_cmple); - CASE(tk_greatequal); + CASE(tk_cmpge); CASE(tk_dot); CASE(tk_comma); CASE(tk_colon); From 1d3af64a7a09589f7b9a2be0d0ac66d737114ab0 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Tue, 24 Mar 2026 23:32:09 -0400 Subject: [PATCH 32/33] xo-tokenizer2: + op!= utest --- include/xo/tokenizer2/Token.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/xo/tokenizer2/Token.hpp b/include/xo/tokenizer2/Token.hpp index 0968a9e9..f2d8eb08 100644 --- a/include/xo/tokenizer2/Token.hpp +++ b/include/xo/tokenizer2/Token.hpp @@ -133,6 +133,8 @@ namespace xo { /** token for @c "==" **/ static Token cmpeq_token() { return Token(tokentype::tk_cmpeq); } + /** token for @c "!=" **/ + static Token cmpne_token() { return Token(tokentype::tk_cmpne); } /** token representing keyword @c nil **/ static Token nil_token() { return Token(tokentype::tk_nil); } From ff471bbc72eb51533a03d73f6c623697e1d8e7a3 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 27 Mar 2026 11:16:28 -0400 Subject: [PATCH 33/33] xo-interpreter2 stack: wrap TokenizerError as DRuntimeError Also fix _read_eval_print() to report them! 
--- cmake/xo_tokenizer2Config.cmake.in | 1 + include/xo/tokenizer2/TokenizerError.hpp | 6 ++++++ src/tokenizer2/CMakeLists.txt | 1 + src/tokenizer2/TokenizerError.cpp | 14 ++++++++++++++ 4 files changed, 22 insertions(+) diff --git a/cmake/xo_tokenizer2Config.cmake.in b/cmake/xo_tokenizer2Config.cmake.in index eccd2745..0c0dad0b 100644 --- a/cmake/xo_tokenizer2Config.cmake.in +++ b/cmake/xo_tokenizer2Config.cmake.in @@ -6,6 +6,7 @@ include(CMakeFindDependencyMacro) # must coordinate with xo_dependency() calls # in src/tokenizer2/CMakeLists.txt # +find_dependency(xo_stringtable2) find_dependency(xo_arena) find_dependency(indentlog) diff --git a/include/xo/tokenizer2/TokenizerError.hpp b/include/xo/tokenizer2/TokenizerError.hpp index bf7702b1..b08889bd 100644 --- a/include/xo/tokenizer2/TokenizerError.hpp +++ b/include/xo/tokenizer2/TokenizerError.hpp @@ -8,6 +8,8 @@ #include "TkInputState.hpp" #include "tokentype.hpp" #include "span.hpp" +#include +#include #include namespace xo { @@ -19,6 +21,7 @@ namespace xo { **/ class TokenizerError { public: + using AAllocator = xo::mm::AAllocator; using CharT = char; using span_type = xo::mm::span; @@ -89,6 +92,9 @@ namespace xo { /** Print human-oriented error report on @p os. 
**/ void report(std::ostream & os) const; + /** Similar to report, but capture as string, allocated from @p mm **/ + DString * report_to_string(obj mm) const; + ///@} private: diff --git a/src/tokenizer2/CMakeLists.txt b/src/tokenizer2/CMakeLists.txt index ccf1b551..3a748e70 100644 --- a/src/tokenizer2/CMakeLists.txt +++ b/src/tokenizer2/CMakeLists.txt @@ -11,6 +11,7 @@ set(SELF_SRCS xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS}) # deps must coordinate with xo-tokenizer/cmake/xo_tokenizer2Config.cmake.in +xo_dependency(${SELF_LIB} xo_stringtable2) xo_dependency(${SELF_LIB} xo_arena) xo_dependency(${SELF_LIB} indentlog) diff --git a/src/tokenizer2/TokenizerError.cpp b/src/tokenizer2/TokenizerError.cpp index ffe3c8b4..c80996d9 100644 --- a/src/tokenizer2/TokenizerError.cpp +++ b/src/tokenizer2/TokenizerError.cpp @@ -54,6 +54,20 @@ namespace xo { } } + DString * + TokenizerError::report_to_string(obj dest_mm) const + { + // FIXME: + // using heap here for scratch space. + // Would prefer to checkpoint + realloc. + + std::stringstream ss; + + this->report(ss); + + return DString::from_str(dest_mm, ss.str()); + } + } /*namespace scm*/ } /*namespace xo*/