xo-tokenizer: + doc for tokenizer + other doc-related improvements

This commit is contained in:
Roland Conybeare 2025-06-25 23:50:30 -05:00
commit 573afb6030
9 changed files with 143 additions and 30 deletions

View file

@ -38,7 +38,7 @@ namespace xo {
///@}
/** @defgroup input-state static methods **/
/** @defgroup input-state-static-methods input_state static methods **/
///@{
/** recognize the newline character '\n' **/
@ -80,7 +80,7 @@ namespace xo {
void discard_current_line();
/** Add @p z to current position **/
void consume(size_t z) { current_pos_ += z; }
void consume(size_t z);
/** Skip prefix of input comprising whitespace.
* Return pointer to first non-whitespace character in @p input,
@ -91,7 +91,7 @@ namespace xo {
///@}
private:
/** @defgroup input-state-instance-vars **/
/** @defgroup input-state-instance-vars input_state instance variables **/
///@{
/** remember current input line. Used only to report errors **/
@ -128,6 +128,16 @@ namespace xo {
return false;
}
/** Advance the current position by @p z.
 *  When the debug log scope is active, report @p z
 *  along with the updated position.
 **/
template <typename CharT>
void input_state<CharT>::consume(size_t z)
{
    scope log(XO_DEBUG(debug_flag_));

    current_pos_ += z;

    if (log) {
        log(xtag("z", z), xtag("current_pos", current_pos_));
    }
}
template <typename CharT>
void
input_state<CharT>::discard_current_line() {

View file

@ -58,8 +58,16 @@ namespace xo {
using result_type = scan_result<CharT>;
public:
/** @defgroup tokenizer-ctors tokenizer constructors **/
///@{
tokenizer(bool debug_flag = false);
///@}
/** @defgroup tokenizer-general-methods tokenizer methods **/
///@{
/** identifies punctuation chars.
* These are chars that are not permitted to appear within
* a symbol token. Instead they force completion of
@ -130,19 +138,26 @@ namespace xo {
**/
result_type notify_eof(const span_type & input);
///@}
private:
result_type scan_completion(const span_type & whitespace,
const CharT* token_end,
const span_type & input);
private:
/** @defgroup tokenizer-instance-vars tokenizer instance variables **/
///@{
/** track input state (line#,pos,..) for error messages **/
input_state_type input_state_;
/** Accumulate partial token here.
* This will happen if input sent to @ref tokenizer::scan
* ends without a determinate token boundary.
* ends without trailing whitespace, so that the extent of the last available token is not yet determined
**/
std::string prefix_;
///@}
}; /*tokenizer*/
template <typename CharT>
@ -338,7 +353,8 @@ namespace xo {
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
(ix - tk_start)
));
}
} else if (*ix == '.') {
if (period_flag) {
@ -378,9 +394,6 @@ namespace xo {
(error_type(__FUNCTION__ /*src_function*/,
"unexpected character in numeric constant" /*error_description*/,
input_state_,
//current_line_,
//current_pos_,
//initial_whitespace,
(ix - tk_start)));
}
}

View file

@ -36,16 +36,17 @@ namespace xo {
tokenizer_error(const char * src_function,
const char * error_description,
const input_state_type & input_state,
//span_type input_line,
//size_t tk_start,
//size_t whitespace,
size_t error_pos)
: src_function_{src_function},
error_description_{error_description},
input_state_{input_state},
//tk_entry_{tk_start},
//whitespace_{whitespace},
error_pos_{error_pos} {}
error_pos_{error_pos}
{
scope log(XO_DEBUG(input_state.debug_flag()));
log && log(xtag("input_state.current_pos", input_state.current_pos()),
xtag("error_pos", error_pos));
}
///@}
/** @defgroup tokenizer-error-access-methods tokenizer_error access methods **/
@ -57,7 +58,6 @@ namespace xo {
#pragma GCC diagnostic ignored "-Wchanges-meaning"
const input_state_type & input_state() const { return input_state_; }
#pragma GCC diagnostic pop
//const span_type& input_line() const { return input_line_; }
size_t tk_start() const { return input_state_.current_pos(); }
size_t whitespace() const { return input_state_.whitespace(); }
size_t error_pos() const { return error_pos_; }
@ -94,8 +94,6 @@ namespace xo {
* Sufficient to precisely locate it with context.
**/
input_state_type input_state_;
/** position (relative to line_.lo) of token start where error encountered **/
size_t tk_entry_ = 0;
/** position of error, relative to start of the failing token **/
size_t error_pos_ = 0;
@ -110,7 +108,6 @@ namespace xo {
<< xtag("message", error_description_)
<< xtag("input", input_state_.current_line())
<< xtag("whitespace", input_state_.whitespace())
<< xtag("tk-start", tk_entry_)
<< xtag("error-pos", error_pos_)
<< ">";
}
@ -122,10 +119,13 @@ namespace xo {
if (error_description_) {
const char * prefix = "input: ";
const size_t tk_indent = strlen(prefix) + tk_entry_ + input_state_.whitespace();
//const size_t msg_length = strlen(error_description_);
const size_t error_pos = 1 + tk_entry_ + input_state_.whitespace() + error_pos_;
/* input_state.current_pos: position of first character following preceding token.
* input_state.whitespace: whitespace between current_pos and start of failing token
* error_pos: position (relative to start of failing token) at which failure detected
*/
const size_t tk_start = input_state_.current_pos() + input_state_.whitespace();
const size_t tk_indent = (strlen(prefix) + tk_start);
const size_t error_pos = 1 + tk_start + error_pos_;
os << "char: " << error_pos << endl;
os << prefix;