tokenizer: + input_state helper
This commit is contained in:
parent
f162b48461
commit
093f8a4b7c
6 changed files with 182 additions and 81 deletions
82
xo-tokenizer/include/xo/tokenizer/input_state.hpp
Normal file
82
xo-tokenizer/include/xo/tokenizer/input_state.hpp
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
/* @file input_state.hpp
|
||||
*
|
||||
* author: Roland Conybeare, Jun 2025
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "span.hpp"
|
||||
|
||||
namespace xo {
|
||||
namespace scm {
|
||||
/** @class input_state
|
||||
* @brief Track detailed input position for use in error messages
|
||||
*
|
||||
**/
|
||||
template <typename CharT>
|
||||
class input_state {
|
||||
public:
|
||||
using span_type = span<const CharT>;
|
||||
|
||||
public:
|
||||
input_state() = default;
|
||||
explicit input_state(const span<const CharT>& x, size_t cpos, size_t ws)
|
||||
: current_line_{x}, current_pos_{cpos}, whitespace_{ws} {}
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wchanges-meaning"
|
||||
const span_type & current_line() const { return current_line_; }
|
||||
#pragma GCC diagnostic pop
|
||||
size_t current_pos() const { return current_pos_; }
|
||||
size_t whitespace() const { return whitespace_; }
|
||||
|
||||
void capture_current_line(const span_type & input);
|
||||
void discard_current_line();
|
||||
|
||||
void consume(size_t z) { current_pos_ += z; }
|
||||
|
||||
void reset_whitespace() { whitespace_ = 0; }
|
||||
void increment_whitespace() { ++whitespace_; }
|
||||
|
||||
private:
|
||||
/** remember current input line. Used only to report errors **/
|
||||
span<const CharT> current_line_ = span<const CharT>();
|
||||
/** current input position within @ref current_line_ **/
|
||||
size_t current_pos_ = 0;
|
||||
/** whitespace since end of preceding token,
|
||||
* or last newline, whichever is less
|
||||
**/
|
||||
size_t whitespace_ = 0;
|
||||
|
||||
bool debug_flag_ = false;
|
||||
};
|
||||
|
||||
template <typename CharT>
|
||||
void
|
||||
input_state<CharT>::discard_current_line() {
|
||||
this->current_line_ = span_type::make_null();
|
||||
this->current_pos_ = 0;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
void
|
||||
input_state<CharT>::capture_current_line(const span_type & input)
|
||||
{
|
||||
// see also discard_current_line()
|
||||
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
|
||||
/* look ahead to {end of line, end of input}, whichever comes first */
|
||||
const CharT * sol = input.lo();
|
||||
const CharT * eol = sol;
|
||||
|
||||
while ((eol < input.hi()) && (*eol != '\n'))
|
||||
++eol;
|
||||
|
||||
this->current_line_ = span_type(sol, eol);
|
||||
// this->current_pos_ = 0;
|
||||
|
||||
log && log(xtag("current_line", print::printspan(current_line_)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -30,14 +30,28 @@ namespace xo {
|
|||
/** @defgroup span-ctors span constructors **/
|
||||
///@{
|
||||
|
||||
/** null span **/
|
||||
span() : lo_{nullptr}, hi_{nullptr} {}
|
||||
|
||||
/** Create span for the contiguous memory range [@p lo, @p hi) **/
|
||||
span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
|
||||
|
||||
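/** converting constructor from span<CharU>; enabled only when CharU* converts to CharT* (e.g. span<char> -> span<const char>). Not marked explicit, so the conversion is implicit **/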
|
||||
template<typename CharU>
|
||||
span(const span<CharU> & other,
|
||||
std::enable_if_t<std::is_convertible_v<CharU*, CharT*>
|
||||
&& !std::is_same_v<CharU, CharT>> * = nullptr)
|
||||
: lo_{other.lo()}, hi_{other.hi()} {}
|
||||
|
||||
/** copy ctor (explicitly declared to avoid ambiguity with template ctor) **/
|
||||
span(const span & other) = default;
|
||||
span & operator=(const span & other) = default;
|
||||
|
||||
/** Create a null span (i.e. with null @p lo, @p hi pointers)
|
||||
* A null span can be concatenated with any other span
|
||||
* without triggering matching-endpoint asserts.
|
||||
**/
|
||||
static span make_null() { return span(nullptr, nullptr); }
|
||||
static span make_null() { return span(static_cast<CharT*>(nullptr), static_cast<CharT*>(nullptr)); }
|
||||
|
||||
/** @brief create span for C-style string @p cstr **/
|
||||
static span from_cstr(const CharT * cstr) {
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "token.hpp"
|
||||
#include "input_state.hpp"
|
||||
#include "span.hpp"
|
||||
#include "scan_result.hpp"
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
|
|
@ -53,6 +54,7 @@ namespace xo {
|
|||
using token_type = token<CharT>;
|
||||
using error_type = tokenizer_error<CharT>;
|
||||
using span_type = span<const CharT>;
|
||||
using input_state_type = input_state<CharT>;
|
||||
using result_type = scan_result<CharT>;
|
||||
|
||||
public:
|
||||
|
|
@ -150,10 +152,8 @@ namespace xo {
|
|||
private:
|
||||
/** true to log tokenizer activity to stdout **/
|
||||
bool debug_flag_ = false;
|
||||
/** remember current input line. Used only to report errors **/
|
||||
span_type current_line_ = span_type::make_null();
|
||||
/** current input position within @ref current_line_ **/
|
||||
size_t current_pos_ = 0;
|
||||
/** track input state (line#,pos,..) for error messages **/
|
||||
input_state_type input_state_;
|
||||
/** Accumulate partial token here.
|
||||
* This will happen if input sent to @ref tokenizer::scan
|
||||
* ends without a determinate token boundary.
|
||||
|
|
@ -369,9 +369,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"improperly placed sign indicator",
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
} else if (*ix == '.') {
|
||||
|
|
@ -379,9 +380,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"duplicate decimal point in numeric literal",
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
|
||||
|
|
@ -391,9 +393,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"duplicate exponent marker in numeric literal",
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
|
||||
|
|
@ -409,9 +412,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"unexpected character in numeric constant" /*error_description*/,
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
}
|
||||
|
|
@ -490,9 +494,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"expecting key following escape character \\",
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
|
||||
|
|
@ -521,9 +526,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"expecting one of n|r|\"|\\ following escape \\",
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
break;
|
||||
|
|
@ -540,9 +546,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"missing terminating '\"' to complete literal string",
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
|
||||
|
|
@ -668,9 +675,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"illegal input character",
|
||||
current_line_,
|
||||
current_pos_,
|
||||
initial_whitespace,
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//initial_whitespace,
|
||||
(ix - tk_start)));
|
||||
}
|
||||
|
||||
|
|
@ -760,21 +768,7 @@ namespace xo {
|
|||
void
|
||||
tokenizer<CharT>::capture_current_line(const span_type & input)
|
||||
{
|
||||
// see discard_current_line()
|
||||
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
|
||||
/* look ahead to {end of line, end of input}, whichever comes first */
|
||||
const CharT * sol = input.lo();
|
||||
const CharT * eol = sol;
|
||||
|
||||
while ((eol < input.hi()) && (*eol != '\n'))
|
||||
++eol;
|
||||
|
||||
this->current_line_ = span_type(sol, eol);
|
||||
this->current_pos_ = 0;
|
||||
|
||||
log && log(xtag("current_line", print::printspan(current_line_)));
|
||||
this->input_state_.capture_current_line(input);
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
|
|
@ -787,22 +781,23 @@ namespace xo {
|
|||
|
||||
const CharT * ix = input.lo();
|
||||
|
||||
if (this->current_line_.is_null()) {
|
||||
if (this->input_state_.current_line().is_null()) {
|
||||
this->capture_current_line(input);
|
||||
}
|
||||
|
||||
this->input_state_.reset_whitespace();
|
||||
|
||||
/* skip whitespace + remember beginning of most recent line */
|
||||
while (is_whitespace(*ix) && (ix != input.hi())) {
|
||||
if (is_newline(*ix)) {
|
||||
++ix;
|
||||
|
||||
this->capture_current_line(span_type(ix, input.hi()));
|
||||
this->input_state_.reset_whitespace();
|
||||
} else {
|
||||
++ix;
|
||||
|
||||
#ifdef OBSOLETE
|
||||
++(this->current_pos_);
|
||||
#endif
|
||||
this->input_state_.increment_whitespace();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -818,9 +813,9 @@ namespace xo {
|
|||
|
||||
/* here: *ix is not whitespace */
|
||||
|
||||
auto whitespace = input.prefix_upto(ix);
|
||||
auto whitespace_span = input.prefix_upto(ix);
|
||||
|
||||
log && log(xtag("whitespace.size", whitespace.size()));
|
||||
log && log(xtag("whitespace.size", input_state_.whitespace()));
|
||||
|
||||
/* tk_start points to known beginning of token
|
||||
* (after any whitespace)
|
||||
|
|
@ -880,8 +875,10 @@ namespace xo {
|
|||
return result_type::make_error
|
||||
(error_type(__FUNCTION__ /*src_function*/,
|
||||
"must use \\n or \\r to encode newline/cr in string literal",
|
||||
current_line_, current_pos_,
|
||||
whitespace.size(),
|
||||
input_state_,
|
||||
//current_line_,
|
||||
//current_pos_,
|
||||
//whitespace.size(),
|
||||
(ix - tk_start)));
|
||||
}
|
||||
|
||||
|
|
@ -910,7 +907,7 @@ namespace xo {
|
|||
/* include next char and complete token */
|
||||
++ix;
|
||||
|
||||
return scan_completion(whitespace, ix /*token_end*/, input);
|
||||
return scan_completion(whitespace_span, ix /*token_end*/, input);
|
||||
}
|
||||
|
||||
/* here: -123, -.5e-21 for example */
|
||||
|
|
@ -928,7 +925,7 @@ namespace xo {
|
|||
|
||||
if (ch2 != '=') {
|
||||
/* ignore next char and complete token */
|
||||
return scan_completion(whitespace, ix /*token_end*/, input);
|
||||
return scan_completion(whitespace_span, ix /*token_end*/, input);
|
||||
}
|
||||
|
||||
/* here: >= for example */
|
||||
|
|
@ -978,7 +975,7 @@ namespace xo {
|
|||
}
|
||||
}
|
||||
|
||||
return scan_completion(whitespace, ix /*token_end*/, input);
|
||||
return scan_completion(whitespace_span, ix /*token_end*/, input);
|
||||
} /*scan*/
|
||||
|
||||
template <typename CharT>
|
||||
|
|
@ -1010,7 +1007,7 @@ namespace xo {
|
|||
auto
|
||||
tokenizer<CharT>::consume(const span_type & consumed, const span_type & input) -> span_type
|
||||
{
|
||||
this->current_pos_ += consumed.size();
|
||||
this->input_state_.consume(consumed.size());
|
||||
|
||||
return input.after_prefix(consumed);
|
||||
}
|
||||
|
|
@ -1021,8 +1018,7 @@ namespace xo {
|
|||
{
|
||||
// see capture_current_line()
|
||||
|
||||
this->current_line_ = span_type::make_null();
|
||||
this->current_pos_ = 0;
|
||||
this->input_state_.discard_current_line();
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "input_state.hpp"
|
||||
#include "tokentype.hpp"
|
||||
#include "span.hpp"
|
||||
#include <iomanip>
|
||||
|
|
@ -19,6 +20,7 @@ namespace xo {
|
|||
template <typename CharT>
|
||||
class tokenizer_error {
|
||||
public:
|
||||
using input_state_type = input_state<CharT>;
|
||||
using span_type = span<const CharT>;
|
||||
|
||||
public:
|
||||
|
|
@ -29,20 +31,20 @@ namespace xo {
|
|||
tokenizer_error() = default;
|
||||
/** Constructor to capture parsing error context
|
||||
* @p input_state input state when the error was detected; supplies the
|
||||
*   input line, token start position and initial whitespace count
|
||||
* @p error_pos error location relative to token start
|
||||
**/
|
||||
tokenizer_error(const char * src_function,
|
||||
const char * error_description,
|
||||
span_type input_line,
|
||||
size_t tk_start,
|
||||
size_t whitespace,
|
||||
const input_state_type & input_state,
|
||||
//span_type input_line,
|
||||
//size_t tk_start,
|
||||
//size_t whitespace,
|
||||
size_t error_pos)
|
||||
: src_function_{src_function},
|
||||
error_description_{error_description},
|
||||
input_line_{input_line},
|
||||
tk_entry_{tk_start},
|
||||
whitespace_{whitespace},
|
||||
input_state_{input_state},
|
||||
//tk_entry_{tk_start},
|
||||
//whitespace_{whitespace},
|
||||
error_pos_{error_pos} {}
|
||||
///@}
|
||||
|
||||
|
|
@ -51,9 +53,13 @@ namespace xo {
|
|||
|
||||
const char * src_function() const { return src_function_; }
|
||||
const char * error_description() const { return error_description_; }
|
||||
const span_type& input_line() const { return input_line_; }
|
||||
size_t tk_start() const { return tk_entry_; }
|
||||
size_t whitespace() const { return whitespace_; }
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wchanges-meaning"
|
||||
const input_state_type & input_state() const { return input_state_; }
|
||||
#pragma GCC diagnostic pop
|
||||
//const span_type& input_line() const { return input_line_; }
|
||||
size_t tk_start() const { return input_state_.current_pos(); }
|
||||
size_t whitespace() const { return input_state_.whitespace(); }
|
||||
size_t error_pos() const { return error_pos_; }
|
||||
|
||||
///@}
|
||||
|
|
@ -84,14 +90,12 @@ namespace xo {
|
|||
char const * src_function_ = nullptr;
|
||||
/** static error description **/
|
||||
char const * error_description_ = nullptr;
|
||||
/** complete current input line (to the extent captured)
|
||||
* that contains error
|
||||
/** input state associated with this error.
|
||||
* Sufficient to precisely locate it with context.
|
||||
**/
|
||||
span_type input_line_ = span_type::make_null();
|
||||
input_state_type input_state_;
|
||||
/** position (relative to line_.lo) of token start where error encountered **/
|
||||
size_t tk_entry_ = 0;
|
||||
/** number of characters of initial whitespace skipped before token start **/
|
||||
size_t whitespace_ = 0;
|
||||
/** position (relative to @ref tk_entry_) of error **/
|
||||
size_t error_pos_ = 0;
|
||||
|
||||
|
|
@ -104,8 +108,8 @@ namespace xo {
|
|||
os << "<tokenizer-error"
|
||||
<< xtag("src-function", src_function_)
|
||||
<< xtag("message", error_description_)
|
||||
<< xtag("input", input_line_)
|
||||
<< xtag("whitespace", whitespace_)
|
||||
<< xtag("input", input_state_.current_line())
|
||||
<< xtag("whitespace", input_state_.whitespace())
|
||||
<< xtag("tk-start", tk_entry_)
|
||||
<< xtag("error-pos", error_pos_)
|
||||
<< ">";
|
||||
|
|
@ -118,15 +122,18 @@ namespace xo {
|
|||
|
||||
if (error_description_) {
|
||||
const char * prefix = "input: ";
|
||||
const size_t tk_indent = strlen(prefix) + tk_entry_ + whitespace_;
|
||||
const size_t tk_indent = strlen(prefix) + tk_entry_ + input_state_.whitespace();
|
||||
//const size_t msg_length = strlen(error_description_);
|
||||
|
||||
const size_t error_pos = 1 + tk_entry_ + whitespace_ + error_pos_;
|
||||
const size_t error_pos = 1 + tk_entry_ + input_state_.whitespace() + error_pos_;
|
||||
|
||||
os << "char: " << error_pos << endl;
|
||||
os << prefix;
|
||||
for (const char *p = input_line_.lo(), *e = input_line_.hi(); p < e; ++p)
|
||||
for (const char *p = input_state_.current_line().lo(),
|
||||
*e = input_state_.current_line().hi(); p < e; ++p)
|
||||
{
|
||||
os << *p;
|
||||
}
|
||||
os << endl;
|
||||
os << std::setw(tk_indent) << " ";
|
||||
|
||||
|
|
|
|||
|
|
@ -442,6 +442,7 @@ namespace xo {
|
|||
|
||||
namespace {
|
||||
using tkz_error_type = xo::scm::tokenizer_error<char>;
|
||||
using input_state_type = xo::scm::input_state<char>;
|
||||
using span_type = xo::scm::span<const char>;
|
||||
|
||||
struct testcase_error {
|
||||
|
|
@ -456,8 +457,9 @@ namespace xo {
|
|||
testcase_error retval;
|
||||
retval.input_ = input;
|
||||
retval.expect_error_ = tkz_error_type(src_function, error_descr,
|
||||
span_type::from_string(retval.input_),
|
||||
tk_start, whitespace, error_pos);
|
||||
input_state_type(span_type::from_string(retval.input_),
|
||||
tk_start, whitespace),
|
||||
error_pos);
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
|
@ -481,7 +483,7 @@ namespace xo {
|
|||
"assemble_token",
|
||||
"duplicate decimal point in numeric literal",
|
||||
0, 0, 2),
|
||||
// 0123456
|
||||
// o 0123456
|
||||
// ------v
|
||||
make_testcase("1.23e4e",
|
||||
"assemble_token",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue