From 7432a0bd1d75350e640d18d50c76d6c8bddc9dc8 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 23 Jan 2026 14:57:43 -0500 Subject: [PATCH] xo-reader2: readerreplxx works + streamline debugging --- include/xo/tokenizer2/Tokenizer.hpp | 8 ++--- src/tokenizer2/Tokenizer.cpp | 51 ++++++++++++----------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/include/xo/tokenizer2/Tokenizer.hpp b/include/xo/tokenizer2/Tokenizer.hpp index 69843a5a..3dc6da11 100644 --- a/include/xo/tokenizer2/Tokenizer.hpp +++ b/include/xo/tokenizer2/Tokenizer.hpp @@ -109,19 +109,19 @@ namespace xo { static bool is_2char_punctuation(CharT ch); /** assemble token from text @p token_text. - * @p initial_whitespace Amount of whitespace input being consumed from input. + * @p ws_span whitespace preceding token * @p token_text subset of input_line representing a single token. * @p p_input_state input state containing input_line. On exit current line cleared * if error * * retval.consumed will represent some possibly-empty prefix of @p input **/ - static scan_result assemble_token(std::size_t initial_whitespace, - const span_type & token_text, + static scan_result assemble_token( span_type ws_span, + span_type token_text, TkInputState * p_input_state); /** degenerate version of assemble_token() on reaching end-of-file **/ - static scan_result assemble_final_token(const span_type & token_text, + static scan_result assemble_final_token(span_type token_text, TkInputState * p_input_state); /** true if tokenizer contains stored prefix of diff --git a/src/tokenizer2/Tokenizer.cpp b/src/tokenizer2/Tokenizer.cpp index 2784072a..c79e10c3 100644 --- a/src/tokenizer2/Tokenizer.cpp +++ b/src/tokenizer2/Tokenizer.cpp @@ -110,8 +110,8 @@ namespace xo { } auto - Tokenizer::assemble_token(std::size_t initial_whitespace, - const span_type & token_text, + Tokenizer::assemble_token(span_type ws_span, + span_type token_text, TkInputState * p_input_state) -> result_type { /* literal|pretty|streamlined */ @@ -119,7 +119,7 @@ namespace xo { scope log(XO_DEBUG(p_input_state->debug_flag())); log && log(xtag("token_text", token_text), - xtag("initial_whitespace", initial_whitespace), + xtag("initial_whitespace", ws_span.size()), xtag("input_state", *p_input_state)); tokentype tk_type = tokentype::tk_invalid; @@ -598,18 +598,16 @@ namespace xo { // TOOD: report tk_text as span, // but must pin / unpin - /* input.prefix(0): - * require caller preserves current input line until it's entirely exhausted - */ return result_type(Token(tk_type, std::move(tk_text)), - p_input_state->current_line().prefix(0)); + span_type::concat(ws_span, + span_type(tk_start, tk_end))); } /*assemble_token*/ auto - Tokenizer::assemble_final_token(const span_type & token_text, + Tokenizer::assemble_final_token(span_type token_text, TkInputState * p_input_state) -> result_type { - return assemble_token(0 /*initial_whitespace*/, + return assemble_token(token_text.prefix(0) /*ws_span*/, token_text, p_input_state); } @@ -645,6 +643,7 @@ namespace xo { Tokenizer::scan(const span_type & input) -> result_type { scope log(XO_DEBUG(input_state_.debug_flag())); + log && log(xtag("input", input)); /* - Always at beginning of token when scan() invoked * - scan will not report any portion of line as consumed until it has @@ -659,12 +658,14 @@ namespace xo { const CharT * ix = this->input_state_.skip_leading_whitespace(); if(ix == input.hi()) { - log && log("end input -> consume current line"); + log && log("end buffered input -> consume current line"); /* entirety of current line has been tokenized * -> caller may consume it */ - return result_type::make_whitespace(this->input_state_.consume_current_line()); + this->input_state_.consume_current_line(); + + return result_type::make_whitespace(input); } /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */ @@ -697,27 +698,17 @@ namespace xo { ++ix; -#ifdef OBSOLETE // no longer a thing. either input ends in whitespace, or ends translation unit - if (ix == input.hi()) { - /* need more input to know if/when token complete */ - this->prefix_ += std::string(tk_start, input.hi()); + CharT ch2 = *ix; - log && log(xtag("captured-prefix1", this->prefix_)); - } else -#endif - { - CharT ch2 = *ix; - - if (((ch2 >= '0') && (ch2 <= '9')) - || ((ch2 >= 'A') && (ch2 <= 'Z')) - || ((ch2 >= 'a') && (ch2 <= 'z'))) + if (((ch2 >= '0') && (ch2 <= '9')) + || ((ch2 >= 'A') && (ch2 <= 'Z')) + || ((ch2 >= 'a') && (ch2 <= 'z'))) { /* treat as 1 char punctuation */ ; } else { - /* include next char */ - ++ix; - } + /* include next char */ + ++ix; } } else if (*ix == '"') { bool complete_flag = false; @@ -779,7 +770,7 @@ namespace xo { this->input_state_.advance_until(ix); - return assemble_token(whitespace_z, + return assemble_token(span_type(input.lo(), tk_start), span_type(tk_start, ix) /*token*/, &(this->input_state_)); } @@ -803,7 +794,7 @@ namespace xo { this->input_state_.advance_until(ix); /* ignore next char and complete token */ - return assemble_token(whitespace_z, + return assemble_token(span_type(input.lo(), tk_start), span_type(tk_start, ix) /*token*/, &(this->input_state_)); } @@ -854,7 +845,7 @@ namespace xo { this->input_state_.advance_until(ix); - return assemble_token(whitespace_z, + return assemble_token(span_type(input.lo(), tk_start), span_type(tk_start, ix) /*token*/, &(this->input_state_)); } /*_scan_aux*/