From 84c5a75b289e641e9589fc4939681199e8007453 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sat, 22 Nov 2025 20:13:33 -0500 Subject: [PATCH] xo-tokenizer: refactor to correct accounting for line/consume/errpos --- xo-alloc/include/xo/alloc/ArenaAlloc.hpp | 86 ++++++- xo-alloc/src/alloc/ArenaAlloc.cpp | 107 ++++++-- xo-reader/src/reader/reader.cpp | 24 +- xo-tokenizer/example/tokenrepl/tokenrepl.cpp | 29 +-- .../include/xo/tokenizer/input_state.hpp | 209 ++++++++++++--- .../include/xo/tokenizer/scan_result.hpp | 29 ++- .../include/xo/tokenizer/tokenizer.hpp | 242 ++++++++---------- .../include/xo/tokenizer/tokenizer_error.hpp | 10 +- xo-tokenizer/utest/tokenizer.test.cpp | 8 +- 9 files changed, 501 insertions(+), 243 deletions(-) diff --git a/xo-alloc/include/xo/alloc/ArenaAlloc.hpp b/xo-alloc/include/xo/alloc/ArenaAlloc.hpp index da67f8f2..e0bfed2f 100644 --- a/xo-alloc/include/xo/alloc/ArenaAlloc.hpp +++ b/xo-alloc/include/xo/alloc/ArenaAlloc.hpp @@ -18,11 +18,11 @@ namespace xo { * allocation order: * -----------------------> * - * <----------------- .size() ------------------> - * <----------------- .committed() ---------------> + * <----------------- .size(), .reserved() ---------------------------> + * <----------------- .committed() -------------> * - * <-------allocated------><--------free--------> <---uncommitted----> - * XXXXXXXXXXXXXXXXXXXXXXXX______________________ .................... + * <-------allocated------><--------free--------><-----uncommitted----> + * XXXXXXXXXXXXXXXXXXXXXXXX______________________...................... * ^ ^ ^ ^ ^ * lo checkpoint free limit hi * @@ -31,12 +31,77 @@ namespace xo { * > < .before_checkpoint() * > < .after_checkpoint() * + * lifetime: + * + * 1. initial state after ctor + * + * >< committed()=0 + * <---------------------------uncommitted----------------------------> + * .................................................................... + * ^ ^ + * lo hi + * checkpoint + * free + * limit + * + * 1a. 
one call to ::mmap() + * 1b. vm address space [lo,hi) is reserved + * 1c. address space [lo,hi) is inaccessible. no read|write|execute permission + * + * 2. after first allocation of n bytes + * + * <--committed---> + * <--free--><--------------------uncommitted--------------------> + * > <- allocated + * XXXXXX__________..................................................... + * ^ ^ ^ ^ + * lo lo+n limit hi + * ^ free + * checkpoint + * + * 2a. committed just enough hugepages (2mb each) to accommodate n, + * i.e. expand-on-demand: + * - one call to ::mprotect() + * - .limit = .lo + (k+1) * .hugepage_z for some integer k>=0 + * - k * .hugepage_z < n <= (k+1) * .hugepage_z + * 2b. expect immediate cost 1-5us, includes: + * - TLB flush + * invalidate TLB entries for committed range on all cores that this + * process' threads have run on since process inception. + * Also, if a kernel thread has run on one of said cores, it may + * have borrowed our TLB entries + * - page table update + * write to entry for each vm page + * - kernel overhead 100-1000 cycles (< 1us) + * 2c. expect deferred cost 1us-2us per hugepage: + * - committed pages aren't backed by physical memory until + * first touched; minor page fault on first access for each page. + * - so about 256-512us for 1MB + * 3. after .expand(z) + * + * <-------------committed------------> + * <------------free------------><----------uncommitted----------> + * > <- allocated + * XXXXXX______________________________................................. + * ^ ^ ^ ^ + * lo lo+n limit hi + * ^ free + * checkpoint + * + * 3a. same as case 2. but without advancing .free pointer. + * + * 4. after dtor + * + * 4a. all memory returned to o/s, no longer reserved. 
+ * - one call to ::munmap() + * * @endtext * * Design Notes: * - non-copyable, non-moveable - * - always heap-allocated * - @ref lo_ <= @ref checkpoint_ <= @ref free_ <= @ref limit_ <= @ref hi_ + * - memory for ArenaAlloc itself (not the memory it allocates), ~100 bytes + * always heap allocated. Use ArenaAlloc::make() * - memory obtained from mmap(), not heap * - memory addresses are stable. Expand storage by committing VM pages. * - @ref lo_ is aligned on VM page size (guaranteed by mmap()) @@ -55,7 +120,7 @@ namespace xo { /** Create allocator with capacity @p z, * Reserve memory addresses for @p z bytes, - * but don't commit them until needed + * (but don't commit them until needed) **/ static up make(const std::string & name, std::size_t z, @@ -127,7 +192,12 @@ namespace xo { std::string name_; /** size of a VM page (from getpagesize()) **/ - std::size_t page_z_; + std::size_t page_z_ = 0; + + /** size of a huge VM page. hardwiring this in ctor (to 2MB). + * larger pages relieve pressure on TLB, but suboptimal if use << 2MB + **/ + std::size_t hugepage_z_ = 0; /** allocator owns memory in range [@ref lo_, @ref hi_) **/ std::byte * lo_ = nullptr; @@ -139,7 +209,7 @@ namespace xo { * older (addresses below checkpoint) * and younger (addresses above checkpoint) **/ - std::byte * checkpoint_; + std::byte * checkpoint_ = nullptr; /** free pointer. 
memory in range [@ref free_, @ref limit_) available **/ std::byte * free_ptr_ = nullptr; /** soft limit: end of committed virtual memory **/ diff --git a/xo-alloc/src/alloc/ArenaAlloc.cpp b/xo-alloc/src/alloc/ArenaAlloc.cpp index febbcb61..0a0365e2 100644 --- a/xo-alloc/src/alloc/ArenaAlloc.cpp +++ b/xo-alloc/src/alloc/ArenaAlloc.cpp @@ -13,37 +13,101 @@ #include namespace xo { + using std::byte; + namespace gc { + namespace { + /* alignment better be a power of 2 */ + std::size_t + align_lub(std::size_t x, std::size_t align) + { + /* e.g: + * align = 4096, x%align = 100 -> dx = 3996 + * align = 4096, x%align = 0 -> dx = 0 + */ + std::size_t dx = (align - (x % align)) % align; + + return x + dx; + } + } + ArenaAlloc::ArenaAlloc(const std::string & name, - std::size_t z, bool debug_flag) + std::size_t z, + bool debug_flag) { scope log(XO_DEBUG(debug_flag), xtag("name", name)); + constexpr size_t c_hugepage_z = 2 * 1024 * 1024; + this->name_ = name; this->page_z_ = getpagesize(); + this->hugepage_z_ = c_hugepage_z; - // reserve virtual memory + // 1. need k pagetable entries where k is lub {k | k * .page_z >= z} + // 2. base will be aligned with .page_z but likely not with .hugepage_z + // 3. bad to have misalignment, because misaligned {prefix, suffix} of [base, base+z) + // will use 4k pages instead of 2mb pages + // + // strategy: + // 4. round up z to multiple of c_hugepage_z + // 5. over-request so reserved range contains an aligned subrange of size z + // 6. unmap misaligned prefix + // 7. unmap misaligned suffix. + // 8. enable huge pages for now-aligned remainder of reserved range + // + // Z. note: rejecting inferior MAP_HUGETLB|MAP_HUGE_2MB flags on ::mmap here: + // Za. requires previously-reserved memory in /proc/sys/vm/nr_hugepages + // Zb. reserved pages permenently resident in RAM, never swapped + // Zc. 
memory cost incurred even if no application is using said pages - void * base = mmap(nullptr, z, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + z = align_lub(z, c_hugepage_z); // 4. + + // 5. + byte * base = reinterpret_cast(::mmap(nullptr, + z + c_hugepage_z, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0)); log && log("acquired memory [lo,hi) using mmap", xtag("lo", base), xtag("z", z), - xtag("hi", reinterpret_cast(base) + z)); - - // could use this as fallback.. - //base = (new std::byte [z]); + xtag("hi", reinterpret_cast(base) + z)); if (base == MAP_FAILED) { throw std::runtime_error(tostr("ArenaAlloc: uncommitted allocation failed", xtag("size", z))); } - this->lo_ = reinterpret_cast(base); + byte * aligned_base = reinterpret_cast(align_lub(reinterpret_cast(base), + c_hugepage_z)); + + assert(reinterpret_cast(aligned_base) % c_hugepage_z == 0); + assert(aligned_base >= base); + assert(aligned_base < base + c_hugepage_z); + + if (base < aligned_base) { + size_t prefix = aligned_base - base; + + ::munmap(base, prefix); // 6. + } + + byte * aligned_hi = aligned_base + z; + byte * hi = base + z + c_hugepage_z; + + if (aligned_hi < hi) { + size_t suffix = hi - aligned_hi; + + ::munmap(aligned_hi, suffix); // 7. + } + + ::madvise(aligned_base, z, MADV_HUGEPAGE); // 8. 
+ + this->lo_ = aligned_base; this->committed_z_ = 0; this->checkpoint_ = lo_; this->free_ptr_ = lo_; - this->limit_ = lo_ + z; + this->limit_ = lo_; this->hi_ = lo_ + z; this->debug_flag_ = debug_flag; @@ -52,7 +116,9 @@ namespace xo { xtag("size", z))); } - log && log(xtag("lo", (void*)lo_), xtag("page_z", page_z_)); + log && log(xtag("lo", (void*)lo_), + xtag("page_z", page_z_), + xtag("hugepage_z", hugepage_z_)); } ArenaAlloc::~ArenaAlloc() @@ -64,7 +130,7 @@ namespace xo { if (lo_) { log && log("unmap [lo,hi)", xtag("lo", lo_), xtag("z", hi_ - lo_), xtag("hi", hi_)); - munmap(lo_, hi_ - lo_); + ::munmap(lo_, hi_ - lo_); } // could use this as fallback if we dropped the uncommitted technique //delete [] this->lo_; @@ -86,21 +152,6 @@ namespace xo { z, debug_flag)); } - namespace { - /* alignment better be a power of 2 */ - std::size_t - align_lub(std::size_t x, std::size_t align) - { - /* e.g: - * align = 4096, x%align = 100 -> dx = 3996 - * align = 4096, x%align = 0 -> dx = 0 - */ - std::size_t dx = (align - (x % align)) % align; - - return x + dx; - } - } - bool ArenaAlloc::expand(size_t offset_z) { @@ -118,7 +169,7 @@ namespace xo { xtag("requested", offset_z), xtag("reserved", reserved()))); } - std::size_t aligned_offset_z = align_lub(offset_z, page_z_); + std::size_t aligned_offset_z = align_lub(offset_z, hugepage_z_); std::byte * commit_start = lo_ + committed_z_; std::size_t add_commit_z = aligned_offset_z - committed_z_; @@ -130,7 +181,7 @@ namespace xo { xtag("add_commit_z", add_commit_z), xtag("commit_end", commit_start + add_commit_z)); - if (mprotect(commit_start, add_commit_z, PROT_READ | PROT_WRITE) != 0) { + if (::mprotect(commit_start, add_commit_z, PROT_READ | PROT_WRITE) != 0) { throw std::runtime_error(tostr("ArenaAlloc::expand: commit failure", xtag("committed_z", committed_z_), xtag("add_commit_z", add_commit_z))); diff --git a/xo-reader/src/reader/reader.cpp b/xo-reader/src/reader/reader.cpp index 6931cd5f..2ea9b4fd 100644 --- 
a/xo-reader/src/reader/reader.cpp +++ b/xo-reader/src/reader/reader.cpp @@ -5,7 +5,8 @@ namespace xo { namespace scm { reader::reader(bool debug_flag) : - tokenizer_{debug_flag}, parser_{debug_flag} + tokenizer_{debug_flag}, + parser_{debug_flag} {} void @@ -29,7 +30,7 @@ namespace xo { } reader_result - reader::read_expr(const span_type & input_arg, bool eof) + reader::read_expr(const span_type & input_arg, bool eof_flag) { scope log(XO_DEBUG(this->debug_flag())); @@ -38,20 +39,25 @@ namespace xo { /* input text-span consumed by this call. * Always comprises some number (possibly 0) * of complete tokens, along with any leading - * whitespace + * whitespace. + * + * expr_span may also begin and end part way through + * distinct input lines */ span_type expr_span = input.prefix(0ul); while (!input.empty()) { - /* each loop iterations reads one token */ + /* each loop iteration reads one token */ - /* read one token from input */ - auto [tk, used_span, error1] = this->tokenizer_.scan2(input, eof); + /* read one token from input. + * tokenizer stashes one line at a time, but used_span only + * reports in used_span the portion representing the first token. + */ + auto [tk, used_span, error1] = this->tokenizer_.scan(input, eof_flag); log && log(xtag("consumed", used_span)); log && log(xtag("input.pre", input)); - input = this->tokenizer_.consume(used_span, input); expr_span += used_span; if (tk.is_valid()) { @@ -76,7 +82,7 @@ namespace xo { expr_span, parser_.stack_size(), reader_error()); } else if (parser_result.is_error()) { /* 1. parser detected error. - * 2. tokenizer_.input_state() refers to position just after offending token + * 2. tokenizer_.input_state().current_pos refers to position just after offending token * 3. error_pos here is 0 because error detected at token boundary */ reader_error error2(parser_result.error_src_function(), @@ -122,7 +128,7 @@ namespace xo { * 1. input.empty (perhaps ate some whitespace, ok) * 2. 
missing or incomplete token (ok unless eof) */ - if (eof) { + if (eof_flag) { if (parser_.has_incomplete_expr()) { throw std::runtime_error ("reader::read_expr" diff --git a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp index d6eacfea..bc73de3f 100644 --- a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp +++ b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp @@ -29,6 +29,10 @@ main() { tokenizer_type tkz(xo::log_config::min_log_level <= xo::log_level::info); string input_str; + size_t line_no = 1; + + constexpr std::size_t c_maxlines = 25; + while (repl_getline(interactive, cin, cout, input_str)) { // we want tokenizer to see newline, it's syntax input_str.push_back('\n'); @@ -36,7 +40,7 @@ main() { // reminder: input may contain multiple tokens while (!input.empty()) { - auto [tk, consumed, error] = tkz.scan(input); + auto [tk, consumed, error] = tkz.scan(input, false /*!eof*/); if (tk.is_valid()) { cout << tk << endl; @@ -47,29 +51,16 @@ main() { break; } - input = tkz.consume(consumed, input); + input = input.after_prefix(consumed); } /* here: input.empty() or error encountered */ - /* discard stashed remainder of input line - * (for nicely-formatted errors) - */ - tkz.discard_current_line(); - } + ++line_no; - { - span_type input = span_type::from_string(input_str); - - auto [tk, consumed, error] = tkz.notify_eof(input); - - input = tkz.consume(consumed, input); - - if (tk.is_valid()) { - cout << tk << endl; - } else if (error.is_error()) { - cout << "parsing error: " << endl; - error.report(cout); + if (line_no > c_maxlines) { + cout << "always exit after " << c_maxlines << " lines of input" << endl; + break; } } } diff --git a/xo-tokenizer/include/xo/tokenizer/input_state.hpp b/xo-tokenizer/include/xo/tokenizer/input_state.hpp index 0e93512d..0cea1155 100644 --- a/xo-tokenizer/include/xo/tokenizer/input_state.hpp +++ b/xo-tokenizer/include/xo/tokenizer/input_state.hpp @@ -9,9 +9,50 @@ namespace xo { namespace scm { + /** 
enum to report outcome of @ref capture_current_line **/ + enum class input_error { + /** normal return, input line successfully identified and captured **/ + ok = 0, + /** incomplete input; should not have been submitted to @ref capture_current_line. + * note: submit last line of input with eof_flag=true + **/ + incomplete, + N + }; + /** @class input_state * @brief Track detailed input position for use in error messages * + * input characters fall into two categories: + * - consumed: memory can be reclaimed/recycled + * - buffered: memory will be retained unaltered until consumed + * + * remarks: + * - always in one of two states: + * - empty + * - contains exactly one line of input + * - also record current input position. + * Use this for example to identify where tokenizer rejected input. + * - .current_pos advances by one token + * + * - buffered characters always form a single contiguous range. + * - input_state does not own any storage; storage is owned elsewhere + * + * @text + * + * <------------------.current_line------------------> + * > <-- .whitespace + * cccccccccccccccccccccccccccccccc__TTTTTTTTxxxxxxxxx + * ^ ^ ^ + * .current_line.lo | .current_line.hi + * .current_pos + * + * <----prev_line----> <----current_line----> + * > <--whitespace + * ppppppppppppppppppp cccccccccccc__TTTTTTTT + * ^ + * + * @endtext **/ template class input_state { @@ -33,8 +74,11 @@ namespace xo { /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. * Introduced for unit tests, not used in tokenizer. 
**/ - explicit input_state(const span& current_line, size_t current_pos, size_t whitespace) - : current_line_{current_line}, current_pos_{current_pos}, whitespace_{whitespace} {} + explicit input_state(const span& current_line, + size_t current_pos, + size_t whitespace) : current_line_{current_line}, + current_pos_{current_pos}, + whitespace_{whitespace} {} ///@} @@ -63,6 +107,7 @@ namespace xo { #endif const span_type & current_line() const { return current_line_; } #pragma GCC diagnostic pop + size_t tk_start() const { return tk_start_; } size_t current_pos() const { return current_pos_; } size_t whitespace() const { return whitespace_; } bool debug_flag() const { return debug_flag_; } @@ -77,27 +122,65 @@ namespace xo { **/ input_state rewind(std::size_t n) const; - /** Capture prefix of @p input up to first newline **/ - void capture_current_line(const span_type & input); + /** Capture prefix of @p input up to first newline. + * Set read position to start of line. + * + * Alters: + * .current_line + * .current_pos + * + * Return pair comprising error code and input span representing first line + * (including trailing newline) from @p input. + **/ + std::pair capture_current_line(const span_type & input, + bool eof_flag); + + /** atomically return current line while discarding it from input state + * + * Alters + * .current_line + * .current_pos + * .whitespace + **/ + span_type consume_current_line(); /** Reset input state for start of next line. * Expression parser may use this to discard remainder of input line * after a parsing error. + * + * Alters: + * .current_line + * .current_pos + * .whitespace **/ void discard_current_line(); - /** Add @p z to current position **/ - void consume(size_t z); - - /** Skip prefix of input comprising whitespace. - * Return pointer to first non-whitespace character in @p input, - * or @c input.hi if input contains only whitespace. 
- * - * if @p input contains any newlines, preserves suffix after last - * such newilne in @p current_line_ + /** Advance input position by @p z * + * Alters: + * .current_pos **/ - const CharT * skip_leading_whitespace(const span_type & input); + void advance(size_t z); + + /** Advance .current_pos to pos. + * Require: pos in @ref current_line_ + **/ + void advance_until(const CharT * pos); + + /** Skip prefix of input, starting at current read position, + * comprising only whitespace. + * + * Presume input position is at end of token; + * on return @ref whitespace_ counts number of whitespace characters + * skipped. + * + * Return pointer to first non-whitespace character after @ref current_pos_ + * or @ref current_line_.hi if reached end of buffered line. + * + * Alters: + * .whitespace + **/ + const CharT * skip_leading_whitespace(); ///@} @@ -107,7 +190,9 @@ namespace xo { /** remember current input line. Used only to report errors **/ span current_line_ = span(); - /** current input position within @ref current_line_ **/ + /** start of last token within @ref current_line_ **/ + size_t tk_start_ = 0; + /** input position within @ref current_line_ **/ size_t current_pos_ = 0; /** number of whitespace chars since end of preceding token, * or last newline, whichever is less @@ -149,7 +234,7 @@ namespace xo { template void - input_state::consume(size_t z) { + input_state::advance(size_t z) { scope log(XO_DEBUG(debug_flag_)); this->current_pos_ += z; @@ -157,6 +242,28 @@ namespace xo { log && log(xtag("z", z), xtag("current_pos", current_pos_)); } + template + void + input_state::advance_until(const CharT * pos) { + scope log(XO_DEBUG(debug_flag_)); + + assert(current_line_.lo() <= pos && pos < current_line_.hi()); + + this->current_pos_ = pos - current_line_.lo(); + + log && log(xtag("current_pos", current_pos_)); + } + + template + auto + input_state::consume_current_line() -> span_type { + span_type retval = current_line_; + + this->discard_current_line(); + + 
return retval; + } + template void input_state::discard_current_line() { @@ -166,10 +273,14 @@ namespace xo { } template - void - input_state::capture_current_line(const span_type & input) + auto + input_state::capture_current_line(const span_type & input, + bool eof_flag) -> std::pair { // see also discard_current_line() + // note: must capture entirety of first line, + // for example including leading whitespace. + // See discussion in tokenizer scan() method scope log(XO_DEBUG(debug_flag_)); @@ -177,44 +288,76 @@ namespace xo { const CharT * sol = input.lo(); const CharT * eol = sol; + if (sol == current_line_.lo()) { + log && log("short-circuit - current line already stashed"); + + /* nothing to do here */ + return std::make_pair(input_error::ok, current_line_); + } + while ((eol < input.hi()) && (*eol != '\n')) ++eol; + if (*eol == '\n') { + /* include \n at end-of-line */ + ++eol; + } else { + if (!eof_flag) { + /* caller expected to provide complete line of input. complain and ignore */ + return std::make_pair(input_error::incomplete, + input.prefix(0ul)); + } + } + this->current_line_ = span_type(sol, eol); this->current_pos_ = 0; + this->whitespace_ = 0; log && log(xtag("current_line", print::printspan(current_line_)), xtag("current_pos", current_pos_)); + + return std::make_pair(input_error::ok, + span_type(sol, eol)); } template const CharT * - input_state::skip_leading_whitespace(const span_type & input) + input_state::skip_leading_whitespace() { scope log(XO_DEBUG(debug_flag_)); - const CharT * ix = input.lo(); - - if (this->current_line().is_null()) { - this->capture_current_line(input); - } + const CharT * ix = current_line_.lo() + current_pos_; this->whitespace_ = 0; /* skip whitespace + remember beginning of most recent line */ - while (is_whitespace(*ix) && (ix != input.hi())) { - if (is_newline(*ix)) { - ++ix; + while (is_whitespace(*ix) && (ix != current_line_.hi())) { + ++ix; - this->capture_current_line(span_type(ix, input.hi())); - } else { - 
++ix; - - ++(this->whitespace_); - } + ++(this->whitespace_); } + this->tk_start_ = ix - current_line_.lo(); + this->current_pos_ = ix - current_line_.lo(); + return ix; } + + template + inline std::ostream & + operator<<(std::ostream & os, + const input_state& x) + { + using xo::print::unq; + + os << ""; + + return os; + } } } diff --git a/xo-tokenizer/include/xo/tokenizer/scan_result.hpp b/xo-tokenizer/include/xo/tokenizer/scan_result.hpp index fbc29105..79846d3c 100644 --- a/xo-tokenizer/include/xo/tokenizer/scan_result.hpp +++ b/xo-tokenizer/include/xo/tokenizer/scan_result.hpp @@ -7,6 +7,7 @@ #include "token.hpp" #include "tokenizer_error.hpp" +#include "input_state.hpp" namespace xo { namespace scm { @@ -17,11 +18,11 @@ namespace xo { * Possible outcomes fall into several categories * (with T: @c token_.is_valid(), E: @cerror_.is_error()) * - * | T | E | description | - * |-------+-------+-------------------| - * | false | false | end of input | - * | true | false | parsed token in T | - * | false | true | parse error in E | + * | T | E | description | + * |-------+-------+-------------------------------------| + * | false | false | end of input, including end of line | + * | true | false | parsed token in T | + * | false | true | parse error in E | * * @endcode **/ @@ -31,6 +32,7 @@ namespace xo { using token_type = token; using span_type = span; using error_type = tokenizer_error; + using input_state_type = input_state; public: scan_result(const token_type & token, @@ -40,7 +42,8 @@ namespace xo { static scan_result make_whitespace(const span_type & prefix_input); static scan_result make_partial(const span_type & prefix_input); - static scan_result make_error(const error_type & error); + static scan_result make_error(const error_type & error, + input_state_type & input_state_ref); bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); } bool is_token() const { return token_.is_valid(); } @@ -51,7 +54,10 @@ namespace xo { 
const error_type & error() const { return error_; } public: - /** successfully parsed token, whenever tk_type != tokentype::tk_invalid **/ + /** Successfully parsed token, whenever tk_type != tokentype::tk_invalid. + * Will be tokentype::tk_invalid in normal cause of events for valid input, + * when consuming whitespace + **/ token_type token_; /** input span represented by .token, on success. Otherwise not defined **/ span_type consumed_; @@ -72,9 +78,14 @@ namespace xo { } template - auto scan_result::make_error(const error_type & error) -> scan_result + auto scan_result::make_error(const error_type & error, + input_state_type & input_state_ref) -> scan_result { - return scan_result(token_type::invalid(), span_type::make_null(), error); + /* report+consume entire input line */ + + return scan_result(token_type::invalid(), + input_state_ref.consume_current_line(), + error); } } /*namespace scm*/ diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp index 0dd46877..2ea695dc 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -99,22 +99,15 @@ namespace xo { static bool is_2char_punctuation(CharT ch); /** assemble token from text @p token_text. - * @p token_text will often (but not always) represent a subset of @p input. - * (For example consider multi-line string literals) - * Also the span @p token_text may (in uncommon cases) - * have been copied to separate storage from @p input - * * @p initial_whitespace Amount of whitespace input being consumed from input. - * @p initial_token_prefix_from_input Amount of non-whitespace input being - * consumed from input. Not counting any stashed-and-already-consumed input + * @p token_text subset of input_line representing a single token. 
+ * @p input_state input state containing input_line * * retval.consumed will represent some possibly-empty prefix of @p input **/ static result_type assemble_token(std::size_t initial_whitespace, - std::size_t initial_token_prefix_from_input, const span_type & token_text, - const span_type & input, - const input_state_type & input_state); + input_state_type & input_state); /** degenerate version of assemble_token() on reaching end-of-file **/ static result_type assemble_final_token(const span_type & token_text, @@ -136,35 +129,14 @@ namespace xo { * * @return {parsed token, consumed span} **/ - result_type scan(const span_type & input); - - /** When eof is false, same as scan(input). - * When eof is true and scan(input) does not report a token, - * return notify_eof() - **/ - result_type scan2(const span_type & input, bool eof); - - /** @retval span with @p consumed permanently removed from @p input. - * - * Purpose of this method is to update @ref current_pos_. - **/ - span_type consume(const span_type & consumed, const span_type & input); + result_type scan(const span_type & input, + bool eof_flag); /** discard current line after error. 
Just cleans up error-reporting state **/ void discard_current_line(); - /** notify end of input, resolving any ambiguous input stashed in .prefix - **/ - result_type notify_eof(const span_type & input); - ///@} - private: - result_type scan_completion(const span_type & whitespace, - const CharT* token_end, - const span_type & input, - const input_state_type & input_state); - private: /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ ///@{ @@ -283,19 +255,16 @@ namespace xo { template auto tokenizer::assemble_token(std::size_t initial_whitespace, - std::size_t initial_token_prefix_from_input, const span_type & token_text, - const span_type & input, - const input_state_type & input_state) -> result_type + input_state_type & input_state_ref) -> result_type { /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; - scope log(XO_DEBUG(input_state.debug_flag())); + scope log(XO_DEBUG(input_state_ref.debug_flag())); log && log(xtag("token_text", token_text), xtag("initial_whitespace", initial_whitespace), - xtag("initial_token_prefix_from_input", initial_token_prefix_from_input), - xtag("input", input)); + xtag("input_state", input_state_ref)); tokentype tk_type = tokentype::tk_invalid; std::string tk_text; @@ -394,17 +363,19 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "improperly placed sign indicator", - input_state, + input_state_ref, (ix - tk_start) - )); + ), + input_state_ref); } } else if (*ix == '.') { if (period_flag) { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate decimal point in numeric literal", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } period_flag = true; @@ -413,8 +384,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate exponent marker in numeric literal", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - 
tk_start)), + input_state_ref); } exponent_flag = true; @@ -429,8 +401,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "unexpected character in numeric constant" /*error_description*/, - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } } @@ -532,8 +505,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting key following escape character \\", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } switch(*ix) { @@ -561,8 +535,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting one of n|r|\"|\\ following escape \\", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } break; default: @@ -578,8 +553,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "missing terminating '\"' to complete literal string", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } log && log(tostr("tokenizer::assemble_token", @@ -720,8 +696,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "illegal input character", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } if ((tk_type == tokentype::tk_i64) @@ -771,8 +748,11 @@ namespace xo { tk_text.clear(); } + /* input.prefix(0): + * require caller preserves current input line until it's entirely exhausted + */ return result_type(token_type(tk_type, std::move(tk_text)), - input.prefix(initial_whitespace + initial_token_prefix_from_input)); + input_state_ref.current_line().prefix(0)); } /*assemble_token*/ /* TODO: input_state_ as argument ? 
*/ @@ -782,67 +762,44 @@ namespace xo { const input_state_type & input_state) -> result_type { return assemble_token(0 /*initial_whitespace*/, - 0 /*initial_token_prefix_from_input*/, token_text, - span_type::make_null(), input_state); } - /* TODO: prefix_, input_state_ as arguments */ template auto - tokenizer::scan_completion(const span_type & whitespace, - const CharT* token_end, - const span_type & input, - const input_state_type & input_state) -> result_type { - - auto token_span = input.after_prefix(whitespace).prefix_upto(token_end); - - if (this->prefix_.empty()) { - return assemble_token(whitespace.size(), - token_span.size() /*initial_token_prefix_from_input*/, - token_span, - input, - input_state); - } else { - /* whatever we stashed in .prefix_, should be consumed from input. - * control here implies reached end of input with either - * - input for which parsing outcome depends on existence of more input, - * and presence of eof now resolves - * - malformed input (that might represent prefix of a valid token. Say "#incl" in C) - * - * That means stashed .prefix will represent copied range of characters that - * ends at the same position as input - */ - return result_type::make_partial(input); - } - - } - -#ifdef NOT_USING - template - void - tokenizer::capture_current_line(const span_type & input) - { - this->input_state_.capture_current_line(input); - } -#endif - - template - auto - tokenizer::scan(const span_type & input) -> result_type + tokenizer::scan(const span_type & input, bool eof_flag) -> result_type { scope log(XO_DEBUG(input_state_.debug_flag())); log && log(xtag("input", input)); - const CharT * ix = this->input_state_.skip_leading_whitespace(input); + /* - Always at beginning of token when scan() invoked + * - scan will not report any portion of line as consumed until it has + * emitted all tokens in that line. + * rationale: caller is allowed to discard storage that + * scan() reports as consumed. 
But will be holding that line + * until all tokens have been read. + * - this means caller will typically call scan() + * with the same input span multiple times + */ + + /* automagically no-ops when the same input presented twice */ + this->input_state_.capture_current_line(input, eof_flag); + + const CharT * ix = this->input_state_.skip_leading_whitespace(); if(ix == input.hi()) { - /* no-op */ - return result_type::make_whitespace(input.prefix_upto(ix)); + log && log("end input -> consume current line"); + + /* entirety of current line has been tokenized + * -> caller may consume it + */ + return result_type::make_whitespace(this->input_state_.consume_current_line()); } + /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */ + // TODO: // 1. hoist complete_flag up here // 2. use in each branch @@ -850,9 +807,9 @@ namespace xo { /* here: *ix is not whitespace */ - auto whitespace_span = input.prefix_upto(ix); + auto whitespace_z = input_state_.whitespace(); - log && log(xtag("whitespace.size", input_state_.whitespace())); + log && log(xtag("whitespace_z", whitespace_z)); /* tk_start points to known beginning of token * (after any whitespace) @@ -871,12 +828,15 @@ namespace xo { ++ix; +#ifdef OBSOLETE // no longer a thing. 
either input ends in whitespace, or ends translation unit if (ix == input.hi()) { /* need more input to know if/when token complete */ this->prefix_ += std::string(tk_start, input.hi()); log && log(xtag("captured-prefix1", this->prefix_)); - } else { + } else +#endif + { CharT ch2 = *ix; if (((ch2 >= '0') && (ch2 <= '9')) @@ -909,21 +869,28 @@ namespace xo { break; } } else if ((*ix == '\n') || (*ix == '\r')) { + log && log ("string literal with naked newline or CR"); + return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "must use \\n or \\r to encode newline/cr in string literal", input_state_, - (ix - tk_start))); + (ix - tk_start)), + this->input_state_); } prev_ch = *ix; } if (!complete_flag) { - /* need more input to know if/when token complete */ - this->prefix_ += std::string(tk_start, input.hi()); + log && log("unterminated string literal"); - log && log(xtag("captured-prefix2", this->prefix_)); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "unterminated string literal", + input_state_, + (ix - tk_start)), + this->input_state_); } } else { /* ix is start of some token */ @@ -941,8 +908,13 @@ namespace xo { /* include next char and complete token */ ++ix; - return scan_completion(whitespace_span, ix /*token_end*/, input, - this->input_state_); + log && log("complete '->' token"); + + this->input_state_.advance_until(ix); + + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + input_state_); } /* here: -123, -.5e-21 for example */ @@ -959,9 +931,14 @@ namespace xo { CharT ch2 = *ix; if (ch2 != '=') { + log && log("complete '>=' token"); + + this->input_state_.advance_until(ix); + /* ignore next char and complete token */ - return scan_completion(whitespace_span, ix /*token_end*/, input, - this->input_state_); + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + this->input_state_); } /* here: >= for example */ @@ -1003,18 +980,28 @@ namespace xo { } } 
+#ifdef OBSOLETE if (ix == input.hi()) { /* need more input to know if/when token complete */ this->prefix_ += std::string(tk_start, input.hi()); log && log(xtag("captured-prefix5", this->prefix_)); } +#endif } - return scan_completion(whitespace_span, ix /*token_end*/, input, - this->input_state_); + log && log("assemble token z", xtag("token_z", ix - tk_start)); + + assert(tk_start < ix); + + this->input_state_.advance_until(ix); + + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + this->input_state_); } /*scan*/ +#ifdef OBSOLETE template auto tokenizer::scan2(const span_type & input, bool eof) -> result_type { @@ -1039,15 +1026,19 @@ namespace xo { span_type::concat(sr.consumed(), sr2.consumed()), sr2.error()); } +#endif +#ifdef OBSOLETE template auto - tokenizer::consume(const span_type & consumed, const span_type & input) -> span_type + tokenizer::consume(const span_type & consumed, + const span_type & input) -> span_type { this->input_state_.consume(consumed.size()); return input.after_prefix(consumed); } +#endif template void @@ -1056,6 +1047,7 @@ namespace xo { this->input_state_.discard_current_line(); } +#ifdef OBSOLETE template auto tokenizer::notify_eof(const span_type & input) -> result_type { @@ -1063,20 +1055,12 @@ namespace xo { log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input)); - if (this->prefix_.empty()) { - /* almost meretricious to include input here, - * when called from scan2() it can only be whitespace - */ - return result_type::make_whitespace(input); - } else { - auto retval = assemble_final_token(span_type::from_string(prefix_), - this->input_state_); - - this->prefix_.clear(); - - return retval; - } + /* almost meretricious to include input here, + * when called from scan2() it can only be whitespace + */ + return result_type::make_whitespace(input); } /*notify_eof*/ +#endif } /*namespace scm*/ } /*namespace xo*/ diff --git 
a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp index ebcf2a0f..6a673e53 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp @@ -121,22 +121,22 @@ namespace xo { if (!error_description_.empty()) { const char * prefix = "input: "; - /* input_state.current_pos: position of first character following preceding token. - * input_state.whitespace: whitespace between current_pos and start of failing token + /* input_state.tk_start: position of first character in token + * input_state.current_pos: position of first character following preceding token. * error_pos: position (relative to start) at which failure detected */ - const size_t tk_start = input_state_.current_pos() + input_state_.whitespace(); + const size_t tk_start = input_state_.tk_start(); const size_t tk_indent = (strlen(prefix) + tk_start); const size_t error_pos = 1 + tk_start + error_pos_; - os << "char: " << error_pos << endl; + os << "token col: " << tk_start << ", error col: " << error_pos << "\n"; os << prefix; for (const char *p = input_state_.current_line().lo(), *e = input_state_.current_line().hi(); p < e; ++p) { os << *p; } - os << endl; + //os << endl; os << std::setw(tk_indent) << " "; for (size_t i = 0; i < error_pos_; ++i) { diff --git a/xo-tokenizer/utest/tokenizer.test.cpp b/xo-tokenizer/utest/tokenizer.test.cpp index ec7d394f..604b9d25 100644 --- a/xo-tokenizer/utest/tokenizer.test.cpp +++ b/xo-tokenizer/utest/tokenizer.test.cpp @@ -232,7 +232,7 @@ namespace xo { in_span(testcase.input_.c_str(), testcase.input_.c_str() + testcase.input_.size()); - auto sr = tkz.scan2(in_span, true /*eof*/); + auto sr = tkz.scan(in_span, true /*eof*/); REHEARSE(rh, sr.get_token().tk_type() == testcase.expected_tk_.tk_type()); if (sr.get_token().tk_type() == tokentype::tk_i64) @@ -408,7 +408,7 @@ namespace xo { { log && log(xtag("i_tk", i_tk)); - auto sr = tkz.scan2(in_span, 
in_span.empty()); + auto sr = tkz.scan(in_span, in_span.empty()); const auto & tk = sr.get_token(); if (tk.is_valid()) { @@ -454,6 +454,8 @@ namespace xo { make_testcase(const char * input, const char * src_function, const char * error_descr, size_t tk_start, size_t whitespace, size_t error_pos) { + size_t line_no = 1; + testcase_error retval; retval.input_ = input; retval.expect_error_ = tkz_error_type(src_function, error_descr, @@ -548,7 +550,7 @@ namespace xo { auto in_span = tokenizer::span_type::from_string(testcase.input_); - auto sr = tkz.scan2(in_span, true /*eof*/); + auto sr = tkz.scan(in_span, true /*eof*/); REHEARSE(rh, sr.is_error());