From 8daf562b298084b78dd5124ecf68e261560f5ee4 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sun, 11 Jan 2026 18:42:08 -0500 Subject: [PATCH] xo-tokenizer2: use xo-arena DCircularBuffer to buffer input line --- xo-arena/include/xo/arena/DCircularBuffer.hpp | 4 +- xo-arena/src/arena/DCircularBuffer.cpp | 2 +- .../cmake/xo_tokenizer2Config.cmake.in | 5 +- xo-tokenizer2/example/tokenrepl/tokenrepl.cpp | 66 ++-- .../include/xo/tokenizer2/TkInputState.hpp | 6 +- .../include/xo/tokenizer2/Tokenizer.hpp | 32 +- .../include/xo/tokenizer2/TokenizerError.hpp | 2 +- .../include/xo/tokenizer2/scan_result.hpp | 2 +- xo-tokenizer2/include/xo/tokenizer2/span.hpp | 291 ------------------ xo-tokenizer2/src/tokenizer2/CMakeLists.txt | 2 + xo-tokenizer2/src/tokenizer2/TkInputState.cpp | 3 +- xo-tokenizer2/src/tokenizer2/Tokenizer.cpp | 56 +++- 12 files changed, 110 insertions(+), 361 deletions(-) delete mode 100644 xo-tokenizer2/include/xo/tokenizer2/span.hpp diff --git a/xo-arena/include/xo/arena/DCircularBuffer.hpp b/xo-arena/include/xo/arena/DCircularBuffer.hpp index c7c8abf9..0cb007ad 100644 --- a/xo-arena/include/xo/arena/DCircularBuffer.hpp +++ b/xo-arena/include/xo/arena/DCircularBuffer.hpp @@ -58,8 +58,10 @@ namespace xo { size_type page_z, size_type buffer_align_z, span_type reserved_range); +#ifdef NOT_YET /** constructor */ DCircularBuffer(const CircularBufferConfig & config); +#endif /** non-copyable **/ DCircularBuffer(const DCircularBuffer & other) = delete; /** move ctor **/ @@ -107,7 +109,7 @@ namespace xo { * If buffer memory exhausted, may copy a prefix of @p r. * In that case returns the remaining suffix of @p r. **/ - span_type append(span_type r); + const_span_type append(const_span_type r); /** DMA version of @ref append_span : get mapped span A at which * buffer will receive new content. Upstream may write into diff --git a/xo-arena/src/arena/DCircularBuffer.cpp b/xo-arena/src/arena/DCircularBuffer.cpp index 70efcd2d..5ad7e021 100644 --- a/xo-arena/src/arena/DCircularBuffer.cpp +++ b/xo-arena/src/arena/DCircularBuffer.cpp @@ -146,7 +146,7 @@ namespace xo { } auto - DCircularBuffer::append(span_type src) -> span_type + DCircularBuffer::append(const_span_type src) -> const_span_type { span_type dest = get_append_span(src.size()); diff --git a/xo-tokenizer2/cmake/xo_tokenizer2Config.cmake.in b/xo-tokenizer2/cmake/xo_tokenizer2Config.cmake.in index b5c3cd5c..13f1dac1 100644 --- a/xo-tokenizer2/cmake/xo_tokenizer2Config.cmake.in +++ b/xo-tokenizer2/cmake/xo_tokenizer2Config.cmake.in @@ -4,9 +4,10 @@ include(CMakeFindDependencyMacro) # note: changes to find_dependency() calls here # must coordinate with xo_dependency() calls -# in CMakeLists.txt +# in src/tokenizer2/CMakeLists.txt # -#find_dependency(xo_flatstring) +find_dependency(xo_arena) +find_dependency(indentlog) include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") check_required_components("@PROJECT_NAME@") diff --git a/xo-tokenizer2/example/tokenrepl/tokenrepl.cpp b/xo-tokenizer2/example/tokenrepl/tokenrepl.cpp index f97b9cd0..0852f028 100644 --- a/xo-tokenizer2/example/tokenrepl/tokenrepl.cpp +++ b/xo-tokenizer2/example/tokenrepl/tokenrepl.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include @@ -14,7 +14,7 @@ bool replxx_getline(bool interactive, std::size_t parser_stack_size, replxx::Replxx & rx, - std::string& input) + const char ** p_input) { using namespace std; @@ -34,40 +34,23 @@ bool replxx_getline(bool interactive, if (retval) { //cerr << "got reval->true" << endl; - input = input_cstr; + *p_input = input_cstr; } else { //cerr << "got retval->false" << endl; } - rx.history_add(input); - - // we want tokenizer to see newline, it's syntax - input.push_back('\n'); + rx.history_add(input_cstr); return retval; } -#ifdef OBSOLETE -bool repl_getline(bool interactive, - std::istream & in, - std::ostream & out, - std::string & input) -{ - if (interactive) { - out << "> "; - std::flush(out); - } - - return static_cast(std::getline(in, input)); -} -#endif - int main() { using xo::scm::Tokenizer; - using xo::scm::span; using xo::scm::operator<<; + using xo::mm::CircularBufferConfig; + using xo::mm::span; using replxx::Replxx; using namespace std; @@ -82,36 +65,39 @@ main() { rx.set_max_history_size(1000); rx.history_load("repl_history.txt"); - Tokenizer tkz(xo::log_config::min_log_level <= xo::log_level::info); + Tokenizer tkz(CircularBufferConfig{.name_ = "tokenrepl-input", + .max_capacity_ = 4*1024, + .max_captured_span_ = 128}, + true /*debug_flag*/); - string input_str; + const char * input_cstr = nullptr;; size_t line_no = 1; constexpr std::size_t c_maxlines = 25; - while ( - //repl_getline(interactive, cin, cout, input_str) // once upon a time - replxx_getline(interactive, 0 /*parser_stack_size*/, rx, input_str)) + while (replxx_getline(interactive, 0 /*parser_stack_size*/, rx, &input_cstr)) { - span_type input = span_type::from_string(input_str); - //cout << "input: " << input << endl; // reminder: input may contain multiple tokens - while (!input.empty()) { - auto [tk, consumed, error] = tkz.scan(input, false /*!eof*/); + if (input_cstr && *input_cstr) { + auto [error, input] = tkz.buffer_input_line(input_cstr, false /*!eof*/); - if (tk.is_valid()) { - cout << tk << endl; - } else if (error.is_error()) { - cout << "tokenizer error: " << endl; - error.report(cout); + { + auto [tk, consumed, error] = tkz.scan(input); - break; + if (tk.is_valid()) { + cout << tk << endl; + } else if (error.is_error()) { + cout << "tokenizer error: " << endl; + error.report(cout); + + break; + } + + input = input.after_prefix(consumed); } - - input = input.after_prefix(consumed); } /* here: input.empty() or error encountered */ diff --git a/xo-tokenizer2/include/xo/tokenizer2/TkInputState.hpp b/xo-tokenizer2/include/xo/tokenizer2/TkInputState.hpp index 531585a1..ea315a0a 100644 --- a/xo-tokenizer2/include/xo/tokenizer2/TkInputState.hpp +++ b/xo-tokenizer2/include/xo/tokenizer2/TkInputState.hpp @@ -63,7 +63,7 @@ namespace xo { using CharT = char; /** type representing a contiguous span of tokenizer input characters **/ - using span_type = span; + using span_type = xo::mm::span; ///@} @@ -76,7 +76,7 @@ namespace xo { /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. * Introduced for unit tests, not used in tokenizer. **/ - explicit TkInputState(const span& current_line, + explicit TkInputState(const span_type & current_line, size_t current_pos, size_t whitespace) : current_line_{current_line}, current_pos_{current_pos}, @@ -191,7 +191,7 @@ namespace xo { ///@{ /** remember current input line. Used only to report errors **/ - span current_line_ = span(); + span_type current_line_ = span_type(); /** start of last token within @ref current_line_ **/ size_t tk_start_ = 0; /** input position within @ref current_line_ **/ diff --git a/xo-tokenizer2/include/xo/tokenizer2/Tokenizer.hpp b/xo-tokenizer2/include/xo/tokenizer2/Tokenizer.hpp index 99005fee..40a98cd9 100644 --- a/xo-tokenizer2/include/xo/tokenizer2/Tokenizer.hpp +++ b/xo-tokenizer2/include/xo/tokenizer2/Tokenizer.hpp @@ -9,8 +9,9 @@ #include "TkInputState.hpp" #include "span.hpp" #include "scan_result.hpp" -#include "xo/indentlog/scope.hpp" -#include "xo/indentlog/print/ppdetail_atomic.hpp" +#include +#include +#include #include namespace xo { @@ -58,15 +59,24 @@ namespace xo { using CharT = char; using token_type = Token; using error_type = TokenizerError; - using span_type = span; - using input_state_type = TkInputState; + using DCircularBuffer = xo::mm::DCircularBuffer; + using CircularBufferConfig = xo::mm::CircularBufferConfig; + using span_type = xo::mm::span; + //using input_state_type = TkInputState; using result_type = scan_result; public: /** @defgroup tokenizer-ctors tokenizer constructors **/ ///@{ - Tokenizer(bool debug_flag = false); + /** + * @p config gives configuration for circular input buffer + * @p debug_flag enables tokenizer debug output + **/ + Tokenizer(const CircularBufferConfig & config = CircularBufferConfig{.name_ = "tkz-input", + .max_capacity_ = 4*1024, + .max_captured_span_ = 128}, + bool debug_flag = false); ///@} @@ -119,6 +129,11 @@ namespace xo { **/ bool has_prefix() const { return !prefix_.empty(); } + /** buffer contents of input_cstr. + * May throw if buffer space exhausted + **/ + std::pair buffer_input_line(const char * input_cstr, bool eof_flag); + /** scan for next input token, given @p input. * Note: * - tokenizer can consume input (e.g. whitespace) @@ -130,8 +145,7 @@ namespace xo { * * @return {parsed token, consumed span} **/ - scan_result scan(const span_type & input, - bool eof_flag); + scan_result scan(const span_type & input); /** discard current line after error. Just cleans up error-reporting state **/ void discard_current_line(); @@ -142,6 +156,8 @@ namespace xo { /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ ///@{ + /** Buffer input here. vm-aware. uses mmap directly **/ + DCircularBuffer input_buffer_; /** track input state (line#,pos,..) for error messages. * There's an ordering problem here: * 1. input_state_.skip_leading_whitespace() advances @@ -150,7 +166,7 @@ namespace xo { * 3. but neeed newline to end token * Also recall input_state_type needed for reporting errors. **/ - input_state_type input_state_; + TkInputState input_state_; /** Accumulate partial token here. * This will happen if input sent to @ref tokenizer::scan * ends without whitespace such that last available token's diff --git a/xo-tokenizer2/include/xo/tokenizer2/TokenizerError.hpp b/xo-tokenizer2/include/xo/tokenizer2/TokenizerError.hpp index a7fab3c2..a1cb99ee 100644 --- a/xo-tokenizer2/include/xo/tokenizer2/TokenizerError.hpp +++ b/xo-tokenizer2/include/xo/tokenizer2/TokenizerError.hpp @@ -20,7 +20,7 @@ namespace xo { class TokenizerError { public: using CharT = char; - using span_type = span; + using span_type = xo::mm::span; public: /** @defgroup tokenizer-error-ctors **/ diff --git a/xo-tokenizer2/include/xo/tokenizer2/scan_result.hpp b/xo-tokenizer2/include/xo/tokenizer2/scan_result.hpp index 971e4b93..249154f1 100644 --- a/xo-tokenizer2/include/xo/tokenizer2/scan_result.hpp +++ b/xo-tokenizer2/include/xo/tokenizer2/scan_result.hpp @@ -30,7 +30,7 @@ namespace xo { public: using CharT = char; using token_type = Token; - using span_type = span; + using span_type = xo::mm::span; using error_type = TokenizerError; using input_state_type = TkInputState; diff --git a/xo-tokenizer2/include/xo/tokenizer2/span.hpp b/xo-tokenizer2/include/xo/tokenizer2/span.hpp deleted file mode 100644 index 8cf7a4a7..00000000 --- a/xo-tokenizer2/include/xo/tokenizer2/span.hpp +++ /dev/null @@ -1,291 +0,0 @@ -/** @file span.hpp **/ - -#pragma once - -#include "xo/indentlog/scope.hpp" -#include "xo/indentlog/print/ppdetail_atomic.hpp" -#include -#include -#include - -namespace xo { - namespace scm { - /** @class span compression/span.hpp - * - * @brief A contiguous range of characters, without ownership. - * - * @tparam CharT type for elements referred to by this span. - **/ - template - class span { - public: - /** @defgroup span-type-traits span type traits **/ - ///@{ - - /** typealias for span size (in units of CharT) **/ - using size_type = std::uint64_t; - - ///@} - - public: - /** @defgroup span-ctors span constructors **/ - ///@{ - - /** null span **/ - span() : lo_{nullptr}, hi_{nullptr} {} - - /** Create span for the contiguous memory range [@p lo, @p hi) **/ - span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} - - /** explicit conversion from span **/ - template - span(const span & other, - std::enable_if_t - && !std::is_same_v> * = nullptr) - : lo_{other.lo()}, hi_{other.hi()} {} - - /** copy ctor (explicit to avoid ambiguity with template ctor) **/ - span(const span & other) = default; - span & operator=(const span & other) = default; - - /** Create a null span (i.e. with null @p lo, @p hi pointers) - * A null span can be concatenated with any other span - * without triggering matching-endpoint asserts. - **/ - static span make_null() { return span(static_cast(nullptr), static_cast(nullptr)); } - - /** @brief create span for C-style string @p cstr **/ - static span from_cstr(const CharT * cstr) { - CharT * lo = cstr; - CharT * hi = cstr ? cstr + strlen(cstr) : nullptr; - - return span(lo, hi); - } - - /** @brief create span from std::string @p str **/ - static span from_string(const std::string& str) { - CharT * lo = &(*str.begin()); - CharT * hi = &(*str.end()); - - return span(lo, hi); - } - - /** @brief concatenate two contiguous spans */ - static span concat(const span & span1, const span & span2) { - if (span1.is_null()) - return span2; - if (span2.is_null()) - return span1; - - if (span1.hi() != span2.lo()) { - scope log(XO_DEBUG(true)); - - log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo())); - } - - assert(span1.hi() == span2.lo()); - - CharT * lo = span1.lo(); - CharT * hi = span2.hi(); - - return span(lo, hi); - } - - ///@} - - /** @defgroup span-access-methods **/ - ///@{ - - CharT * lo() const { return lo_; } /* get member span::lo_ */ - CharT * hi() const { return hi_; } /* get member span::hi_ */ - - ///@} - - /** @defgroup span-general-methods **/ - ///@{ - - /** @brief strip prefix until first occurence of '\n', including the newline **/ - void discard_until_newline() { - for (const CharT * p = lo_; p < hi_; ++p) { - if (*p == '\n') { - lo_ = p + 1; - return; - } - } - - lo_ = hi_; - } - - /** Create new span over supplied type, - * with identical (possibly misaligned) endpoints. - * - * @warning - * 1. New span uses exactly the same memory addresses. - * Endpoint pointers may not be aligned. - * 2. Implementation assumes code compiled with - * @code -fno-strict-aliasing @endcode enabled. - * - * @tparam OtherT element type for new span - **/ - template - span - cast() const { return span(reinterpret_cast(lo_), - reinterpret_cast(hi_)); } - - /** @brief create span including the first @p z members of this span. **/ - span prefix(size_type z) const { return span(lo_, lo_ + z); } - - /** @brief create span representing prefix up to (but not including) @p *p - **/ - span prefix_upto(CharT * p) const { - if (p <= hi_) - return span(lo_, p); - else - return span(lo_, hi_); - } - - /** @brief create span with first @p z members of this span removed **/ - span after_prefix(size_type z) const { - if (lo_ + z > hi_) - z = hi_ - lo_; - - return span(lo_ + z, hi_); - } - - /** @brief create span with @p prefix of this span removed **/ - span after_prefix(const span & prefix) const { - if (!prefix.is_null() && (prefix.lo() != lo_)) { - throw std::runtime_error - ("after_prefix: expected prefix of this span"); - } - - return after_prefix(prefix.size()); - } - - /** Create span starting with position @p p. - * Does boundary checking; will return empty span if @p p is outside @c [lo_,hi) - **/ - span suffix_from(CharT * p) const { - if ((lo_ <= p) && (p <= hi_)) - return span(p, hi_); - else - return span(hi_, hi_); - } - - /** true iff this span is null. distinct from empty. **/ - bool is_null() const { return lo_ == nullptr && hi_ == nullptr; } - /** true iff this span is empty (comprises 0 elements). **/ - bool empty() const { return lo_ == hi_; } - /** report the number of elements (of type CharT) in this span. **/ - size_type size() const { return hi_ - lo_; } - - /** increase extent of this spans to include @p x. - * Requires @c hi() == @c x.lo() - **/ - span & operator+=(const span & x) { - if (hi_ == x.lo_) { - hi_ = x.hi_; - } else if (!x.is_null()) { - assert(false); - } - - return *this; - } - - /** print representation for this span on stream @p os **/ - void print(std::ostream & os) const { - os << ""; - } - ///@} - - private: - /** @defgroup span-instance-vars **/ - ///@{ - - /** start of span. - Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) - **/ - CharT * lo_ = nullptr; - - /** @brief end of span. - Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) - **/ - CharT * hi_ = nullptr; - - ///@} - }; /*span*/ - - /** @defgroup span-operators **/ - ///@{ - - /** compare spans for equality. - * Two spans are equal iff both endpoints match exactly. - **/ - template - inline bool - operator==(const span & lhs, const span & rhs) { - return ((lhs.lo() == rhs.lo()) - && (lhs.hi() == rhs.hi())); - } - - /** compare spans for inequality. - * Two spans are unequal if either paired endpoint differs. - **/ - template - inline bool - operator!=(const span & lhs, const span & rhs) { - return ((lhs.lo() != rhs.lo()) - || (lhs.hi() != rhs.hi())); - } - - /** print a summary of @p x on stream @p os. Intended for diagnostics **/ - template - inline std::ostream & - operator<<(std::ostream & os, - const span & x) { - x.print(os); - return os; - } - - ///@} - } /*namespace scm*/ - - namespace print { - template - class printspan_impl { - public: - printspan_impl(xo::scm::span x) : span_{x} {} - - xo::scm::span span_; - }; - - template - printspan_impl printspan(const xo::scm::span& span) { - return printspan_impl(span); - } - - template - inline std::ostream & - operator<< (std::ostream & os, - const printspan_impl & x) - { - for (const CharT * p = x.span_.lo(); p < x.span_.hi(); ++p) - os << *p; - - return os; - } - -#ifndef ppdetail_atomic - template \ - PPDETAIL_ATOMIC_BODY(printspan_impl); - - template \ - PPDETAIL_ATOMIC_BODY(xo::scm::span); -#endif - - } -} /*namespace xo*/ diff --git a/xo-tokenizer2/src/tokenizer2/CMakeLists.txt b/xo-tokenizer2/src/tokenizer2/CMakeLists.txt index 967535e2..ccf1b551 100644 --- a/xo-tokenizer2/src/tokenizer2/CMakeLists.txt +++ b/xo-tokenizer2/src/tokenizer2/CMakeLists.txt @@ -10,6 +10,8 @@ set(SELF_SRCS tokentype.cpp) xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS}) +# deps must coordinate with xo-tokenizer/cmake/xo_tokenizer2Config.cmake.in +xo_dependency(${SELF_LIB} xo_arena) xo_dependency(${SELF_LIB} indentlog) # end CMakeLists.txt diff --git a/xo-tokenizer2/src/tokenizer2/TkInputState.cpp b/xo-tokenizer2/src/tokenizer2/TkInputState.cpp index 30db1dbb..1eca02dd 100644 --- a/xo-tokenizer2/src/tokenizer2/TkInputState.cpp +++ b/xo-tokenizer2/src/tokenizer2/TkInputState.cpp @@ -84,7 +84,8 @@ namespace xo { // for example including leading whitespace. // See discussion in tokenizer scan() method - scope log(XO_DEBUG(debug_flag_)); + scope log(XO_DEBUG(debug_flag_), + xtag("input", input)); /* look ahead to {end of line, end of input}, whichever comes first */ const CharT * sol = input.lo(); diff --git a/xo-tokenizer2/src/tokenizer2/Tokenizer.cpp b/xo-tokenizer2/src/tokenizer2/Tokenizer.cpp index 00ef4eec..888a0c43 100644 --- a/xo-tokenizer2/src/tokenizer2/Tokenizer.cpp +++ b/xo-tokenizer2/src/tokenizer2/Tokenizer.cpp @@ -6,9 +6,13 @@ #include "Tokenizer.hpp" namespace xo { + using std::byte; + namespace scm { - Tokenizer::Tokenizer(bool debug_flag) - : input_state_{debug_flag} + Tokenizer::Tokenizer(const CircularBufferConfig & config, + bool debug_flag) + : input_buffer_{DCircularBuffer::map(config)}, + input_state_{debug_flag} {} void @@ -108,7 +112,7 @@ namespace xo { auto Tokenizer::assemble_token(std::size_t initial_whitespace, const span_type & token_text, - input_state_type * p_input_state) -> result_type + TkInputState * p_input_state) -> result_type { /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; @@ -600,7 +604,7 @@ namespace xo { auto Tokenizer::assemble_final_token(const span_type & token_text, - input_state_type * p_input_state) -> result_type + TkInputState * p_input_state) -> result_type { return assemble_token(0 /*initial_whitespace*/, token_text, @@ -608,12 +612,43 @@ namespace xo { } auto - Tokenizer::scan(const span_type & input, - bool eof_flag) -> result_type + Tokenizer::buffer_input_line(const char * input_cstr, + bool eof_flag) -> std::pair { scope log(XO_DEBUG(input_state_.debug_flag())); - log && log(xtag("input", input)); + log && log(xtag("input", input_cstr)); + + auto buf_input_0 = input_buffer_.input_range().hi(); + + auto remainder = input_buffer_.append + (DCircularBuffer::const_span_type + ((const byte *)input_cstr, + (const byte *)input_cstr + strlen(input_cstr))); + + const char * newline_cstr = "\n"; + auto remainder2 = input_buffer_.append + (DCircularBuffer::const_span_type + ((const byte *)newline_cstr, + (const byte *)newline_cstr + strlen(newline_cstr))); + + if (!remainder.empty() || !remainder2.empty()) { + throw std::runtime_error(tostr("Tokenizer::buffer_line: line too long!", + xtag("remainder.size", remainder.size()))); + } + + auto buf_input_1 = input_buffer_.input_range().hi(); + + span_type input = span_type((const char *)buf_input_0, + (const char *)buf_input_1); + + return this->input_state_.capture_current_line(input, eof_flag); + } + + auto + Tokenizer::scan(const span_type & input) -> result_type + { + scope log(XO_DEBUG(input_state_.debug_flag())); /* - Always at beginning of token when scan() invoked * - scan will not report any portion of line as consumed until it has @@ -625,9 +660,6 @@ namespace xo { * with the same input span multiple times */ - /* automagically no-ops when the same input presented twice */ - this->input_state_.capture_current_line(input, eof_flag); - const CharT * ix = this->input_state_.skip_leading_whitespace(); if(ix == input.hi()) { @@ -789,7 +821,7 @@ namespace xo { * - punctuation */ for (; ix != input.hi(); ++ix) { - if (input_state_type::is_whitespace(*ix) + if (TkInputState::is_whitespace(*ix) || is_1char_punctuation(*ix) || is_2char_punctuation(*ix)) { @@ -829,7 +861,7 @@ namespace xo { return assemble_token(whitespace_z, span_type(tk_start, ix) /*token*/, &(this->input_state_)); - } /*scan*/ + } /*_scan_aux*/ } /*namespace scm*/ } /*namespace xo*/