From 093f8a4b7ce178e7ab54f3883a7f01b78a8f7087 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Tue, 24 Jun 2025 23:50:21 -0500 Subject: [PATCH] tokenizer: + input_state helper --- .../include/xo/tokenizer/error_token.hpp | 0 .../include/xo/tokenizer/input_state.hpp | 82 +++++++++++++ xo-tokenizer/include/xo/tokenizer/span.hpp | 16 ++- .../include/xo/tokenizer/tokenizer.hpp | 110 +++++++++--------- .../include/xo/tokenizer/tokenizer_error.hpp | 47 ++++---- xo-tokenizer/utest/tokenizer.test.cpp | 8 +- 6 files changed, 182 insertions(+), 81 deletions(-) delete mode 100644 xo-tokenizer/include/xo/tokenizer/error_token.hpp create mode 100644 xo-tokenizer/include/xo/tokenizer/input_state.hpp diff --git a/xo-tokenizer/include/xo/tokenizer/error_token.hpp b/xo-tokenizer/include/xo/tokenizer/error_token.hpp deleted file mode 100644 index e69de29b..00000000 diff --git a/xo-tokenizer/include/xo/tokenizer/input_state.hpp b/xo-tokenizer/include/xo/tokenizer/input_state.hpp new file mode 100644 index 00000000..12c570d5 --- /dev/null +++ b/xo-tokenizer/include/xo/tokenizer/input_state.hpp @@ -0,0 +1,82 @@ +/* @file input_state.hpp + * + * author: Roland Conybeare, Jun 2025 + */ + +#pragma once + +#include "span.hpp" + +namespace xo { + namespace scm { + /** @class input_state + * @brief Track detailed input position for use in error messages + * + **/ + template + class input_state { + public: + using span_type = span; + + public: + input_state() = default; + explicit input_state(const span& x, size_t cpos, size_t ws) + : current_line_{x}, current_pos_{cpos}, whitespace_{ws} {} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wchanges-meaning" + const span_type & current_line() const { return current_line_; } +#pragma GCC diagnostic pop + size_t current_pos() const { return current_pos_; } + size_t whitespace() const { return whitespace_; } + + void capture_current_line(const span_type & input); + void discard_current_line(); + + void consume(size_t z) { current_pos_ += z; } + + void reset_whitespace() { whitespace_ = 0; } + void increment_whitespace() { ++whitespace_; } + + private: + /** remember current input line. Used only to report errors **/ + span current_line_ = span(); + /** current input position within @ref current_line_ **/ + size_t current_pos_ = 0; + /** whitespace since end of preceding token, + * or last newline, whichever is less + **/ + size_t whitespace_ = 0; + + bool debug_flag_ = false; + }; + + template + void + input_state::discard_current_line() { + this->current_line_ = span_type::make_null(); + this->current_pos_ = 0; + } + + template + void + input_state::capture_current_line(const span_type & input) + { + // see also discard_current_line() + + scope log(XO_DEBUG(debug_flag_)); + + /* look ahead to {end of line, end of input}, whichever comes first */ + const CharT * sol = input.lo(); + const CharT * eol = sol; + + while ((eol < input.hi()) && (*eol != '\n')) + ++eol; + + this->current_line_ = span_type(sol, eol); +// this->current_pos_ = 0; + + log && log(xtag("current_line", print::printspan(current_line_))); + } + } +} diff --git a/xo-tokenizer/include/xo/tokenizer/span.hpp b/xo-tokenizer/include/xo/tokenizer/span.hpp index 5381a440..2b9166c6 100644 --- a/xo-tokenizer/include/xo/tokenizer/span.hpp +++ b/xo-tokenizer/include/xo/tokenizer/span.hpp @@ -30,14 +30,28 @@ namespace xo { /** @defgroup span-ctors span constructors **/ ///@{ + /** null span **/ + span() : lo_{nullptr}, hi_{nullptr} {} + /** Create span for the contiguous memory range [@p lo, @p hi) **/ span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} + /** explicit conversion from span **/ + template + span(const span & other, + std::enable_if_t + && !std::is_same_v> * = nullptr) + : lo_{other.lo()}, hi_{other.hi()} {} + + /** copy ctor (explicit to avoid ambiguity with template ctor) **/ + span(const span & other) = default; + span & operator=(const span & other) = default; + /** Create a null span (i.e. with null @p lo, @p hi pointers) * A null span can be concatenated with any other span * without triggering matching-endpoint asserts. **/ - static span make_null() { return span(nullptr, nullptr); } + static span make_null() { return span(static_cast(nullptr), static_cast(nullptr)); } /** @brief create span for C-style string @p cstr **/ static span from_cstr(const CharT * cstr) { diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp index f0ebb4be..7dc0e750 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -6,6 +6,7 @@ #pragma once #include "token.hpp" +#include "input_state.hpp" #include "span.hpp" #include "scan_result.hpp" #include "xo/indentlog/scope.hpp" @@ -53,6 +54,7 @@ namespace xo { using token_type = token; using error_type = tokenizer_error; using span_type = span; + using input_state_type = input_state; using result_type = scan_result; public: @@ -150,10 +152,8 @@ namespace xo { private: /** true to log tokenizer activity to stdout **/ bool debug_flag_ = false; - /** remember current input line. Used only to report errors **/ - span_type current_line_ = span_type::make_null(); - /** current input position within @ref current_line_ **/ - size_t current_pos_ = 0; + /** track input state (line#,pos,..) for error messages **/ + input_state_type input_state_; /** Accumulate partial token here. * This will happen if input sent to @ref tokenizer::scan * ends without a determinate token boundary. @@ -369,9 +369,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "improperly placed sign indicator", - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } } else if (*ix == '.') { @@ -379,9 +380,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate decimal point in numeric literal", - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } @@ -391,9 +393,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate exponent marker in numeric literal", - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } @@ -409,9 +412,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "unexpected character in numeric constant" /*error_description*/, - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } } @@ -490,9 +494,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting key following escape character \\", - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } @@ -521,9 +526,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting one of n|r|\"|\\ following escape \\", - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } break; @@ -540,9 +546,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "missing terminating '\"' to complete literal string", - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } @@ -668,9 +675,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "illegal input character", - current_line_, - current_pos_, - initial_whitespace, + input_state_, + //current_line_, + //current_pos_, + //initial_whitespace, (ix - tk_start))); } @@ -760,21 +768,7 @@ namespace xo { void tokenizer::capture_current_line(const span_type & input) { - // see discard_current_line() - - scope log(XO_DEBUG(debug_flag_)); - - /* look ahead to {end of line, end of input}, whichever comes first */ - const CharT * sol = input.lo(); - const CharT * eol = sol; - - while ((eol < input.hi()) && (*eol != '\n')) - ++eol; - - this->current_line_ = span_type(sol, eol); - this->current_pos_ = 0; - - log && log(xtag("current_line", print::printspan(current_line_))); + this->input_state_.capture_current_line(input); } template @@ -787,22 +781,23 @@ namespace xo { const CharT * ix = input.lo(); - if (this->current_line_.is_null()) { + if (this->input_state_.current_line().is_null()) { this->capture_current_line(input); } + this->input_state_.reset_whitespace(); + /* skip whitespace + remember beginning of most recent line */ while (is_whitespace(*ix) && (ix != input.hi())) { if (is_newline(*ix)) { ++ix; this->capture_current_line(span_type(ix, input.hi())); + this->input_state_.reset_whitespace(); } else { ++ix; -#ifdef OBSOLETE - ++(this->current_pos_); -#endif + this->input_state_.increment_whitespace(); } } @@ -818,9 +813,9 @@ namespace xo { /* here: *ix is not whitespace */ - auto whitespace = input.prefix_upto(ix); + auto whitespace_span = input.prefix_upto(ix); - log && log(xtag("whitespace.size", whitespace.size())); + log && log(xtag("whitespace.size", input_state_.whitespace())); /* tk_start points to known beginning of token * (after any whitespace) @@ -880,8 +875,10 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "must use \\n or \\r to encode newline/cr in string literal", - current_line_, current_pos_, - whitespace.size(), + input_state_, + //current_line_, + //current_pos_, + //whitespace.size(), (ix - tk_start))); } @@ -910,7 +907,7 @@ namespace xo { /* include next char and complete token */ ++ix; - return scan_completion(whitespace, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input); } /* here: -123, -.5e-21 for example */ @@ -928,7 +925,7 @@ namespace xo { if (ch2 != '=') { /* ignore next char and complete token */ - return scan_completion(whitespace, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input); } /* here: >= for example */ @@ -978,7 +975,7 @@ namespace xo { } } - return scan_completion(whitespace, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input); } /*scan*/ template @@ -1010,7 +1007,7 @@ namespace xo { auto tokenizer::consume(const span_type & consumed, const span_type & input) -> span_type { - this->current_pos_ += consumed.size(); + this->input_state_.consume(consumed.size()); return input.after_prefix(consumed); } @@ -1021,8 +1018,7 @@ namespace xo { { // see capture_current_line() - this->current_line_ = span_type::make_null(); - this->current_pos_ = 0; + this->input_state_.discard_current_line(); } template diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp index c12fad72..5b83e27f 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp @@ -5,6 +5,7 @@ #pragma once +#include "input_state.hpp" #include "tokentype.hpp" #include "span.hpp" #include @@ -19,6 +20,7 @@ namespace xo { template class tokenizer_error { public: + using input_state_type = input_state; using span_type = span; public: @@ -29,20 +31,20 @@ namespace xo { tokenizer_error() = default; /** Constructor to capture parsing error context * @p tk_start current position on entry to scanner - * @p whitespace number of chars initial whitespace * @p error_pos error location relative to token start **/ tokenizer_error(const char * src_function, const char * error_description, - span_type input_line, - size_t tk_start, - size_t whitespace, + const input_state_type & input_state, + //span_type input_line, + //size_t tk_start, + //size_t whitespace, size_t error_pos) : src_function_{src_function}, error_description_{error_description}, - input_line_{input_line}, - tk_entry_{tk_start}, - whitespace_{whitespace}, + input_state_{input_state}, + //tk_entry_{tk_start}, + //whitespace_{whitespace}, error_pos_{error_pos} {} ///@} @@ -51,9 +53,13 @@ namespace xo { const char * src_function() const { return src_function_; } const char * error_description() const { return error_description_; } - const span_type& input_line() const { return input_line_; } - size_t tk_start() const { return tk_entry_; } - size_t whitespace() const { return whitespace_; } +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wchanges-meaning" + const input_state_type & input_state() const { return input_state_; } +#pragma GCC diagnostic pop + //const span_type& input_line() const { return input_line_; } + size_t tk_start() const { return input_state_.current_pos(); } + size_t whitespace() const { return input_state_.whitespace(); } size_t error_pos() const { return error_pos_; } ///@} @@ -84,14 +90,12 @@ namespace xo { char const * src_function_ = nullptr; /** static error description **/ char const * error_description_ = nullptr; - /** complete current input line (to the extent captured) - * that contains error + /** input state associated with this error. + * Sufficient to precisely locate it with context. **/ - span_type input_line_ = span_type::make_null(); + input_state_type input_state_; /** position (relative to line_.lo) of token start where error encountered **/ size_t tk_entry_ = 0; - /** number of characters of initial whitespace skipped before token start **/ - size_t whitespace_ = 0; /** position (relative to @ref tk_entry_) of error **/ size_t error_pos_ = 0; @@ -104,8 +108,8 @@ namespace xo { os << ""; @@ -118,15 +122,18 @@ namespace xo { if (error_description_) { const char * prefix = "input: "; - const size_t tk_indent = strlen(prefix) + tk_entry_ + whitespace_; + const size_t tk_indent = strlen(prefix) + tk_entry_ + input_state_.whitespace(); //const size_t msg_length = strlen(error_description_); - const size_t error_pos = 1 + tk_entry_ + whitespace_ + error_pos_; + const size_t error_pos = 1 + tk_entry_ + input_state_.whitespace() + error_pos_; os << "char: " << error_pos << endl; os << prefix; - for (const char *p = input_line_.lo(), *e = input_line_.hi(); p < e; ++p) + for (const char *p = input_state_.current_line().lo(), + *e = input_state_.current_line().hi(); p < e; ++p) + { os << *p; + } os << endl; os << std::setw(tk_indent) << " "; diff --git a/xo-tokenizer/utest/tokenizer.test.cpp b/xo-tokenizer/utest/tokenizer.test.cpp index 4c8d360b..dbcc74f0 100644 --- a/xo-tokenizer/utest/tokenizer.test.cpp +++ b/xo-tokenizer/utest/tokenizer.test.cpp @@ -442,6 +442,7 @@ namespace xo { namespace { using tkz_error_type = xo::scm::tokenizer_error; + using input_state_type = xo::scm::input_state; using span_type = xo::scm::span; struct testcase_error { @@ -456,8 +457,9 @@ namespace xo { testcase_error retval; retval.input_ = input; retval.expect_error_ = tkz_error_type(src_function, error_descr, - span_type::from_string(retval.input_), - tk_start, whitespace, error_pos); + input_state_type(span_type::from_string(retval.input_), + tk_start, whitespace), + error_pos); return retval; } @@ -481,7 +483,7 @@ namespace xo { "assemble_token", "duplicate decimal point in numeric literal", 0, 0, 2), - // 0123456 + // o 0123456 // ------v make_testcase("1.23e4e", "assemble_token",