tokenizer: + input_state helper

This commit is contained in:
Roland Conybeare 2025-06-24 23:50:21 -05:00
commit 093f8a4b7c
6 changed files with 182 additions and 81 deletions

View file

@ -0,0 +1,82 @@
/* @file input_state.hpp
*
* author: Roland Conybeare, Jun 2025
*/
#pragma once
#include "span.hpp"
namespace xo {
namespace scm {
/** @class input_state
* @brief Track detailed input position for use in error messages
*
**/
template <typename CharT>
class input_state {
public:
    /** read-only span over the characters being scanned **/
    using span_type = span<const CharT>;

public:
    /** Default state: null current line, position 0, whitespace count 0 **/
    input_state() = default;

    /** Create state for line @p x,
     *  with current position @p cpos and whitespace count @p ws
     **/
    explicit input_state(const span_type & x, size_t cpos, size_t ws)
        : current_line_{x},
          current_pos_{cpos},
          whitespace_{ws}
    {}

    /** remembered input line (null span if nothing captured) **/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wchanges-meaning"
    const span_type & current_line() const { return current_line_; }
#pragma GCC diagnostic pop

    /** current input position, see @ref current_pos_ **/
    size_t current_pos() const { return current_pos_; }

    /** whitespace counter, see @ref whitespace_ **/
    size_t whitespace() const { return whitespace_; }

    /** Remember leading portion of @p input, up to (not including)
     *  the first newline, or all of @p input if it contains no newline.
     **/
    void capture_current_line(const span_type & input);

    /** Forget remembered line and set current position back to 0 **/
    void discard_current_line();

    /** advance current position by @p z **/
    void consume(size_t z) { current_pos_ += z; }

    /** set whitespace counter back to 0 **/
    void reset_whitespace() { whitespace_ = 0; }

    /** add 1 to whitespace counter **/
    void increment_whitespace() { ++whitespace_; }

private:
    /** remember current input line. Used only to report errors **/
    span_type current_line_ = span_type();

    /** current input position within @ref current_line_ **/
    size_t current_pos_ = 0;

    /** whitespace since end of preceding token,
     *  or last newline, whichever is less
     **/
    size_t whitespace_ = 0;

    /** passed to XO_DEBUG() in capture_current_line() to control logging there.
     *  No setter is visible in this class, so it keeps its default value.
     **/
    bool debug_flag_ = false;
};
/** Forget the remembered input line.
 *
 *  Afterwards the current line is a null span and the current position is 0.
 *  The whitespace counter is left as-is.
 *  See also capture_current_line().
 **/
template <typename CharT>
void
input_state<CharT>::discard_current_line() {
    current_pos_ = 0;
    current_line_ = span_type::make_null();
}
/** Remember the first line of @p input for use in error messages.
 *
 *  The remembered span starts at @c input.lo() and stops just before the
 *  first '\n', or at @c input.hi() when @p input contains no newline.
 *  The newline itself is never part of the remembered span.
 **/
template <typename CharT>
void
input_state<CharT>::capture_current_line(const span_type & input)
{
    // counterpart: discard_current_line()

    scope log(XO_DEBUG(debug_flag_));

    const CharT * const line_begin = input.lo();
    const CharT * const input_end = input.hi();

    /* scan forward to {end of line, end of input}, whichever comes first */
    const CharT * line_end = line_begin;
    for (; line_end < input_end; ++line_end) {
        if (*line_end == '\n')
            break;
    }

    current_line_ = span_type(line_begin, line_end);

    // NOTE(review): current_pos_ is intentionally left untouched here
    // (the reset to 0 is disabled); confirm callers expect that.
    // this->current_pos_ = 0;

    log && log(xtag("current_line", print::printspan(current_line_)));
}
}
}

View file

@ -30,14 +30,28 @@ namespace xo {
/** @defgroup span-ctors span constructors **/
///@{
/** null span **/
span() : lo_{nullptr}, hi_{nullptr} {}
/** Create span for the contiguous memory range [@p lo, @p hi) **/
span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
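/** converting ctor from span<CharU> (not marked explicit); enabled only when CharU* converts to CharT* and CharU differs from CharT, e.g. span<char> -> span<const char> **/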
template<typename CharU>
span(const span<CharU> & other,
std::enable_if_t<std::is_convertible_v<CharU*, CharT*>
&& !std::is_same_v<CharU, CharT>> * = nullptr)
: lo_{other.lo()}, hi_{other.hi()} {}
/** copy ctor (explicit to avoid ambiguity with template ctor) **/
span(const span & other) = default;
span & operator=(const span & other) = default;
/** Create a null span (i.e. with null @p lo, @p hi pointers)
* A null span can be concatenated with any other span
* without triggering matching-endpoint asserts.
**/
static span make_null() { return span(nullptr, nullptr); }
static span make_null() { return span(static_cast<CharT*>(nullptr), static_cast<CharT*>(nullptr)); }
/** @brief create span for C-style string @p cstr **/
static span from_cstr(const CharT * cstr) {

View file

@ -6,6 +6,7 @@
#pragma once
#include "token.hpp"
#include "input_state.hpp"
#include "span.hpp"
#include "scan_result.hpp"
#include "xo/indentlog/scope.hpp"
@ -53,6 +54,7 @@ namespace xo {
using token_type = token<CharT>;
using error_type = tokenizer_error<CharT>;
using span_type = span<const CharT>;
using input_state_type = input_state<CharT>;
using result_type = scan_result<CharT>;
public:
@ -150,10 +152,8 @@ namespace xo {
private:
/** true to log tokenizer activity to stdout **/
bool debug_flag_ = false;
/** remember current input line. Used only to report errors **/
span_type current_line_ = span_type::make_null();
/** current input position within @ref current_line_ **/
size_t current_pos_ = 0;
/** track input state (line#,pos,..) for error messages **/
input_state_type input_state_;
/** Accumulate partial token here.
* This will happen if input sent to @ref tokenizer::scan
* ends without a determinate token boundary.
@ -369,9 +369,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"improperly placed sign indicator",
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
} else if (*ix == '.') {
@ -379,9 +380,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"duplicate decimal point in numeric literal",
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
@ -391,9 +393,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"duplicate exponent marker in numeric literal",
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
@ -409,9 +412,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"unexpected character in numeric constant" /*error_description*/,
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
}
@ -490,9 +494,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"expecting key following escape character \\",
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
@ -521,9 +526,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"expecting one of n|r|\"|\\ following escape \\",
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
break;
@ -540,9 +546,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"missing terminating '\"' to complete literal string",
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
@ -668,9 +675,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"illegal input character",
current_line_,
current_pos_,
initial_whitespace,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
@ -760,21 +768,7 @@ namespace xo {
void
tokenizer<CharT>::capture_current_line(const span_type & input)
{
// see discard_current_line()
scope log(XO_DEBUG(debug_flag_));
/* look ahead to {end of line, end of input}, whichever comes first */
const CharT * sol = input.lo();
const CharT * eol = sol;
while ((eol < input.hi()) && (*eol != '\n'))
++eol;
this->current_line_ = span_type(sol, eol);
this->current_pos_ = 0;
log && log(xtag("current_line", print::printspan(current_line_)));
this->input_state_.capture_current_line(input);
}
template <typename CharT>
@ -787,22 +781,23 @@ namespace xo {
const CharT * ix = input.lo();
if (this->current_line_.is_null()) {
if (this->input_state_.current_line().is_null()) {
this->capture_current_line(input);
}
this->input_state_.reset_whitespace();
/* skip whitespace + remember beginning of most recent line */
while (is_whitespace(*ix) && (ix != input.hi())) {
if (is_newline(*ix)) {
++ix;
this->capture_current_line(span_type(ix, input.hi()));
this->input_state_.reset_whitespace();
} else {
++ix;
#ifdef OBSOLETE
++(this->current_pos_);
#endif
this->input_state_.increment_whitespace();
}
}
@ -818,9 +813,9 @@ namespace xo {
/* here: *ix is not whitespace */
auto whitespace = input.prefix_upto(ix);
auto whitespace_span = input.prefix_upto(ix);
log && log(xtag("whitespace.size", whitespace.size()));
log && log(xtag("whitespace.size", input_state_.whitespace()));
/* tk_start points to known beginning of token
* (after any whitespace)
@ -880,8 +875,10 @@ namespace xo {
return result_type::make_error
(error_type(__FUNCTION__ /*src_function*/,
"must use \\n or \\r to encode newline/cr in string literal",
current_line_, current_pos_,
whitespace.size(),
input_state_,
//current_line_,
//current_pos_,
//whitespace.size(),
(ix - tk_start)));
}
@ -910,7 +907,7 @@ namespace xo {
/* include next char and complete token */
++ix;
return scan_completion(whitespace, ix /*token_end*/, input);
return scan_completion(whitespace_span, ix /*token_end*/, input);
}
/* here: -123, -.5e-21 for example */
@ -928,7 +925,7 @@ namespace xo {
if (ch2 != '=') {
/* ignore next char and complete token */
return scan_completion(whitespace, ix /*token_end*/, input);
return scan_completion(whitespace_span, ix /*token_end*/, input);
}
/* here: >= for example */
@ -978,7 +975,7 @@ namespace xo {
}
}
return scan_completion(whitespace, ix /*token_end*/, input);
return scan_completion(whitespace_span, ix /*token_end*/, input);
} /*scan*/
template <typename CharT>
@ -1010,7 +1007,7 @@ namespace xo {
auto
tokenizer<CharT>::consume(const span_type & consumed, const span_type & input) -> span_type
{
this->current_pos_ += consumed.size();
this->input_state_.consume(consumed.size());
return input.after_prefix(consumed);
}
@ -1021,8 +1018,7 @@ namespace xo {
{
// see capture_current_line()
this->current_line_ = span_type::make_null();
this->current_pos_ = 0;
this->input_state_.discard_current_line();
}
template <typename CharT>

View file

@ -5,6 +5,7 @@
#pragma once
#include "input_state.hpp"
#include "tokentype.hpp"
#include "span.hpp"
#include <iomanip>
@ -19,6 +20,7 @@ namespace xo {
template <typename CharT>
class tokenizer_error {
public:
using input_state_type = input_state<CharT>;
using span_type = span<const CharT>;
public:
@ -29,20 +31,20 @@ namespace xo {
tokenizer_error() = default;
/** Constructor to capture parsing error context
* @p input_state input state on entry to scanner: current input line,
*    current position (token start) and number of chars initial whitespace
* @p error_pos error location relative to token start
**/
tokenizer_error(const char * src_function,
const char * error_description,
span_type input_line,
size_t tk_start,
size_t whitespace,
const input_state_type & input_state,
//span_type input_line,
//size_t tk_start,
//size_t whitespace,
size_t error_pos)
: src_function_{src_function},
error_description_{error_description},
input_line_{input_line},
tk_entry_{tk_start},
whitespace_{whitespace},
input_state_{input_state},
//tk_entry_{tk_start},
//whitespace_{whitespace},
error_pos_{error_pos} {}
///@}
@ -51,9 +53,13 @@ namespace xo {
const char * src_function() const { return src_function_; }
const char * error_description() const { return error_description_; }
const span_type& input_line() const { return input_line_; }
size_t tk_start() const { return tk_entry_; }
size_t whitespace() const { return whitespace_; }
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wchanges-meaning"
const input_state_type & input_state() const { return input_state_; }
#pragma GCC diagnostic pop
//const span_type& input_line() const { return input_line_; }
size_t tk_start() const { return input_state_.current_pos(); }
size_t whitespace() const { return input_state_.whitespace(); }
size_t error_pos() const { return error_pos_; }
///@}
@ -84,14 +90,12 @@ namespace xo {
char const * src_function_ = nullptr;
/** static error description **/
char const * error_description_ = nullptr;
/** complete current input line (to the extent captured)
* that contains error
/** input state associated with this error.
* Sufficient to precisely locate it with context.
**/
span_type input_line_ = span_type::make_null();
input_state_type input_state_;
/** position (relative to line_.lo) of token start where error encountered **/
size_t tk_entry_ = 0;
/** number of characters of initial whitespace skipped before token start **/
size_t whitespace_ = 0;
/** position (relative to @ref tk_entry_) of error **/
size_t error_pos_ = 0;
@ -104,8 +108,8 @@ namespace xo {
os << "<tokenizer-error"
<< xtag("src-function", src_function_)
<< xtag("message", error_description_)
<< xtag("input", input_line_)
<< xtag("whitespace", whitespace_)
<< xtag("input", input_state_.current_line())
<< xtag("whitespace", input_state_.whitespace())
<< xtag("tk-start", tk_entry_)
<< xtag("error-pos", error_pos_)
<< ">";
@ -118,15 +122,18 @@ namespace xo {
if (error_description_) {
const char * prefix = "input: ";
const size_t tk_indent = strlen(prefix) + tk_entry_ + whitespace_;
const size_t tk_indent = strlen(prefix) + tk_entry_ + input_state_.whitespace();
//const size_t msg_length = strlen(error_description_);
const size_t error_pos = 1 + tk_entry_ + whitespace_ + error_pos_;
const size_t error_pos = 1 + tk_entry_ + input_state_.whitespace() + error_pos_;
os << "char: " << error_pos << endl;
os << prefix;
for (const char *p = input_line_.lo(), *e = input_line_.hi(); p < e; ++p)
for (const char *p = input_state_.current_line().lo(),
*e = input_state_.current_line().hi(); p < e; ++p)
{
os << *p;
}
os << endl;
os << std::setw(tk_indent) << " ";

View file

@ -442,6 +442,7 @@ namespace xo {
namespace {
using tkz_error_type = xo::scm::tokenizer_error<char>;
using input_state_type = xo::scm::input_state<char>;
using span_type = xo::scm::span<const char>;
struct testcase_error {
@ -456,8 +457,9 @@ namespace xo {
testcase_error retval;
retval.input_ = input;
retval.expect_error_ = tkz_error_type(src_function, error_descr,
span_type::from_string(retval.input_),
tk_start, whitespace, error_pos);
input_state_type(span_type::from_string(retval.input_),
tk_start, whitespace),
error_pos);
return retval;
}
@ -481,7 +483,7 @@ namespace xo {
"assemble_token",
"duplicate decimal point in numeric literal",
0, 0, 2),
// 0123456
// 0123456
// ------v
make_testcase("1.23e4e",
"assemble_token",