diff --git a/example/tokenrepl/tokenrepl.cpp b/example/tokenrepl/tokenrepl.cpp index d6eacfea..bc73de3f 100644 --- a/example/tokenrepl/tokenrepl.cpp +++ b/example/tokenrepl/tokenrepl.cpp @@ -29,6 +29,10 @@ main() { tokenizer_type tkz(xo::log_config::min_log_level <= xo::log_level::info); string input_str; + size_t line_no = 1; + + constexpr std::size_t c_maxlines = 25; + while (repl_getline(interactive, cin, cout, input_str)) { // we want tokenizer to see newline, it's syntax input_str.push_back('\n'); @@ -36,7 +40,7 @@ main() { // reminder: input may contain multiple tokens while (!input.empty()) { - auto [tk, consumed, error] = tkz.scan(input); + auto [tk, consumed, error] = tkz.scan(input, false /*!eof*/); if (tk.is_valid()) { cout << tk << endl; @@ -47,29 +51,16 @@ main() { break; } - input = tkz.consume(consumed, input); + input = input.after_prefix(consumed); } /* here: input.empty() or error encountered */ - /* discard stashed remainder of input line - * (for nicely-formatted errors) - */ - tkz.discard_current_line(); - } + ++line_no; - { - span_type input = span_type::from_string(input_str); - - auto [tk, consumed, error] = tkz.notify_eof(input); - - input = tkz.consume(consumed, input); - - if (tk.is_valid()) { - cout << tk << endl; - } else if (error.is_error()) { - cout << "parsing error: " << endl; - error.report(cout); + if (line_no > c_maxlines) { + cout << "always exit after " << c_maxlines << " lines of input" << endl; + break; } } } diff --git a/include/xo/tokenizer/input_state.hpp b/include/xo/tokenizer/input_state.hpp index 0e93512d..0cea1155 100644 --- a/include/xo/tokenizer/input_state.hpp +++ b/include/xo/tokenizer/input_state.hpp @@ -9,9 +9,50 @@ namespace xo { namespace scm { + /** enum to report outcome of @ref capture_current_line **/ + enum class input_error { + /** normal return, input line successfully identified and captured **/ + ok = 0, + /** incomplete input; should not have been submitted to @ref capture_current_line. 
+ * note: submit last line of input with eof_flag=true + **/ + incomplete, + N + }; + /** @class input_state * @brief Track detailed input position for use in error messages * + * input characters fall into two categories: + * - consumed: memory can be reclaimed/recycled + * - buffered: memory will be retained unaltered until consumed + * + * remarks: + * - always in one of two states: + * - empty + * - contains exactly one line of input + * - also record current input position. + * Use this for example to identify where tokenizer rejected input. + * - .current_pos advances by one token + * + * - buffered characters always form a single contiguous range. + * - input_state does not own any storage; storage is owned elsewhere + * + * @text + * + * <------------------.current_line------------------> + * > <-- .whitespace + * cccccccccccccccccccccccccccccccc__TTTTTTTTxxxxxxxxx + * ^ ^ ^ + * .current_line.lo | .current_line.hi + * .current_pos + * + * <----prev_line----> <----current_line----> + * > <--whitespace + * ppppppppppppppppppp cccccccccccc__TTTTTTTT + * ^ + * + * @endtext **/ template class input_state { @@ -33,8 +74,11 @@ namespace xo { /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. * Introduced for unit tests, not used in tokenizer. 
**/ - explicit input_state(const span& current_line, size_t current_pos, size_t whitespace) - : current_line_{current_line}, current_pos_{current_pos}, whitespace_{whitespace} {} + explicit input_state(const span& current_line, + size_t current_pos, + size_t whitespace) : current_line_{current_line}, + current_pos_{current_pos}, + whitespace_{whitespace} {} ///@} @@ -63,6 +107,7 @@ namespace xo { #endif const span_type & current_line() const { return current_line_; } #pragma GCC diagnostic pop + size_t tk_start() const { return tk_start_; } size_t current_pos() const { return current_pos_; } size_t whitespace() const { return whitespace_; } bool debug_flag() const { return debug_flag_; } @@ -77,27 +122,65 @@ namespace xo { **/ input_state rewind(std::size_t n) const; - /** Capture prefix of @p input up to first newline **/ - void capture_current_line(const span_type & input); + /** Capture prefix of @p input up to first newline. + * Set read position to start of line. + * + * Alters: + * .current_line + * .current_pos + * + * Return pair comprising error code and input span representing first line + * (including trailing newline) from @p input. + **/ + std::pair capture_current_line(const span_type & input, + bool eof_flag); + + /** atomically return current line while discarding it from input state + * + * Alters + * .current_line + * .current_pos + * .whitespace + **/ + span_type consume_current_line(); /** Reset input state for start of next line. * Expression parser may use this to discard remainder of input line * after a parsing error. + * + * Alters: + * .current_line + * .current_pos + * .whitespace **/ void discard_current_line(); - /** Add @p z to current position **/ - void consume(size_t z); - - /** Skip prefix of input comprising whitespace. - * Return pointer to first non-whitespace character in @p input, - * or @c input.hi if input contains only whitespace. 
- * - * if @p input contains any newlines, preserves suffix after last - * such newilne in @p current_line_ + /** Advance input position by @p z * + * Alters: + * .current_pos **/ - const CharT * skip_leading_whitespace(const span_type & input); + void advance(size_t z); + + /** Advance .current_pos to pos. + * Require: pos in @ref current_line_; one-past-the-end (.hi) is allowed, since a token may end exactly at end of line + **/ + void advance_until(const CharT * pos); + + /** Skip prefix of input, starting at current read position, + * comprising only whitespace. + * + * Presume input position is at end of token; + * on return @ref whitespace_ counts number of whitespace characters + * skipped. + * + * Return pointer to first non-whitespace character after @ref current_pos_ + * or @ref current_line_.hi if reached end of buffered line. + * + * Alters: + * .whitespace + **/ + const CharT * skip_leading_whitespace(); ///@} @@ -107,7 +190,9 @@ namespace xo { /** remember current input line. Used only to report errors **/ span current_line_ = span(); - /** current input position within @ref current_line_ **/ + /** start of last token within @ref current_line_ **/ + size_t tk_start_ = 0; + /** input position within @ref current_line_ **/ size_t current_pos_ = 0; /** number of whitespace chars since end of preceding token, * or last newline, whichever is less @@ -149,7 +234,7 @@ namespace xo { template void - input_state::consume(size_t z) { + input_state::advance(size_t z) { scope log(XO_DEBUG(debug_flag_)); this->current_pos_ += z; @@ -157,6 +242,28 @@ namespace xo { log && log(xtag("z", z), xtag("current_pos", current_pos_)); } + template + void + input_state::advance_until(const CharT * pos) { + scope log(XO_DEBUG(debug_flag_)); + + assert(current_line_.lo() <= pos && pos <= current_line_.hi()); + + this->current_pos_ = pos - current_line_.lo(); + + log && log(xtag("current_pos", current_pos_)); + } + + template + auto + input_state::consume_current_line() -> span_type { + span_type retval = current_line_; + + this->discard_current_line(); + + 
return retval; + } + template void input_state::discard_current_line() { @@ -166,10 +273,14 @@ namespace xo { } template - void - input_state::capture_current_line(const span_type & input) + auto + input_state::capture_current_line(const span_type & input, + bool eof_flag) -> std::pair { // see also discard_current_line() + // note: must capture entirety of first line, + // for example including leading whitespace. + // See discussion in tokenizer scan() method scope log(XO_DEBUG(debug_flag_)); @@ -177,44 +288,76 @@ namespace xo { const CharT * sol = input.lo(); const CharT * eol = sol; + if (sol == current_line_.lo()) { + log && log("short-circuit - current line already stashed"); + + /* nothing to do here */ + return std::make_pair(input_error::ok, current_line_); + } + while ((eol < input.hi()) && (*eol != '\n')) ++eol; + if ((eol < input.hi()) && (*eol == '\n')) { + /* include \n at end-of-line; bound is tested first because the scan above can stop at input.hi() */ + ++eol; + } else { + if (!eof_flag) { + /* caller expected to provide complete line of input. complain and ignore */ + return std::make_pair(input_error::incomplete, + input.prefix(0ul)); + } + } + this->current_line_ = span_type(sol, eol); this->current_pos_ = 0; + this->whitespace_ = 0; log && log(xtag("current_line", print::printspan(current_line_)), xtag("current_pos", current_pos_)); + + return std::make_pair(input_error::ok, + span_type(sol, eol)); } template const CharT * - input_state::skip_leading_whitespace(const span_type & input) + input_state::skip_leading_whitespace() { scope log(XO_DEBUG(debug_flag_)); - const CharT * ix = input.lo(); - - if (this->current_line().is_null()) { - this->capture_current_line(input); - } + const CharT * ix = current_line_.lo() + current_pos_; this->whitespace_ = 0; /* skip whitespace + remember beginning of most recent line */ - while (is_whitespace(*ix) && (ix != input.hi())) { - if (is_newline(*ix)) { - ++ix; + while ((ix != current_line_.hi()) && is_whitespace(*ix)) { /* bound tested before dereference */ + ++ix; - this->capture_current_line(span_type(ix, input.hi())); - } else { - 
++ix; - - ++(this->whitespace_); - } + ++(this->whitespace_); } + this->tk_start_ = ix - current_line_.lo(); + this->current_pos_ = ix - current_line_.lo(); + return ix; } + + template + inline std::ostream & + operator<<(std::ostream & os, + const input_state& x) + { + using xo::print::unq; + + os << ""; + + return os; + } } } diff --git a/include/xo/tokenizer/scan_result.hpp b/include/xo/tokenizer/scan_result.hpp index fbc29105..79846d3c 100644 --- a/include/xo/tokenizer/scan_result.hpp +++ b/include/xo/tokenizer/scan_result.hpp @@ -7,6 +7,7 @@ #include "token.hpp" #include "tokenizer_error.hpp" +#include "input_state.hpp" namespace xo { namespace scm { @@ -17,11 +18,11 @@ namespace xo { * Possible outcomes fall into several categories * (with T: @c token_.is_valid(), E: @cerror_.is_error()) * - * | T | E | description | - * |-------+-------+-------------------| - * | false | false | end of input | - * | true | false | parsed token in T | - * | false | true | parse error in E | + * | T | E | description | + * |-------+-------+-------------------------------------| + * | false | false | end of input, including end of line | + * | true | false | parsed token in T | + * | false | true | parse error in E | * * @endcode **/ @@ -31,6 +32,7 @@ namespace xo { using token_type = token; using span_type = span; using error_type = tokenizer_error; + using input_state_type = input_state; public: scan_result(const token_type & token, @@ -40,7 +42,8 @@ namespace xo { static scan_result make_whitespace(const span_type & prefix_input); static scan_result make_partial(const span_type & prefix_input); - static scan_result make_error(const error_type & error); + static scan_result make_error(const error_type & error, + input_state_type & input_state_ref); bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); } bool is_token() const { return token_.is_valid(); } @@ -51,7 +54,10 @@ namespace xo { const error_type & error() const { return error_; } 
public: - /** successfully parsed token, whenever tk_type != tokentype::tk_invalid **/ + /** Successfully parsed token, whenever tk_type != tokentype::tk_invalid. + * Will be tokentype::tk_invalid in normal course of events for valid input, + * when consuming whitespace + **/ token_type token_; /** input span represented by .token, on success. Otherwise not defined **/ span_type consumed_; @@ -72,9 +78,14 @@ namespace xo { } template - auto scan_result::make_error(const error_type & error) -> scan_result + auto scan_result::make_error(const error_type & error, + input_state_type & input_state_ref) -> scan_result { - return scan_result(token_type::invalid(), span_type::make_null(), error); + /* report+consume entire input line */ + + return scan_result(token_type::invalid(), + input_state_ref.consume_current_line(), + error); } } /*namespace scm*/ diff --git a/include/xo/tokenizer/tokenizer.hpp b/include/xo/tokenizer/tokenizer.hpp index 0dd46877..2ea695dc 100644 --- a/include/xo/tokenizer/tokenizer.hpp +++ b/include/xo/tokenizer/tokenizer.hpp @@ -99,22 +99,15 @@ namespace xo { static bool is_2char_punctuation(CharT ch); /** assemble token from text @p token_text. - * @p token_text will often (but not always) represent a subset of @p input. - * (For example consider multi-line string literals) - * Also the span @p token_text may (in uncommon cases) - * have been copied to separate storage from @p input - * * @p initial_whitespace Amount of whitespace input being consumed from input. - * @p initial_token_prefix_from_input Amount of non-whitespace input being - * consumed from input. Not counting any stashed-and-already-consumed input + * @p token_text subset of input_line representing a single token. 
+ * @p input_state input state containing input_line * * retval.consumed will represent some possibly-empty prefix of @p input **/ static result_type assemble_token(std::size_t initial_whitespace, - std::size_t initial_token_prefix_from_input, const span_type & token_text, - const span_type & input, - const input_state_type & input_state); + input_state_type & input_state); /** degenerate version of assemble_token() on reaching end-of-file **/ static result_type assemble_final_token(const span_type & token_text, @@ -136,35 +129,14 @@ namespace xo { * * @return {parsed token, consumed span} **/ - result_type scan(const span_type & input); - - /** When eof is false, same as scan(input). - * When eof is true and scan(input) does not report a token, - * return notify_eof() - **/ - result_type scan2(const span_type & input, bool eof); - - /** @retval span with @p consumed permanently removed from @p input. - * - * Purpose of this method is to update @ref current_pos_. - **/ - span_type consume(const span_type & consumed, const span_type & input); + result_type scan(const span_type & input, + bool eof_flag); /** discard current line after error. 
Just cleans up error-reporting state **/ void discard_current_line(); - /** notify end of input, resolving any ambiguous input stashed in .prefix - **/ - result_type notify_eof(const span_type & input); - ///@} - private: - result_type scan_completion(const span_type & whitespace, - const CharT* token_end, - const span_type & input, - const input_state_type & input_state); - private: /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ ///@{ @@ -283,19 +255,16 @@ namespace xo { template auto tokenizer::assemble_token(std::size_t initial_whitespace, - std::size_t initial_token_prefix_from_input, const span_type & token_text, - const span_type & input, - const input_state_type & input_state) -> result_type + input_state_type & input_state_ref) -> result_type { /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; - scope log(XO_DEBUG(input_state.debug_flag())); + scope log(XO_DEBUG(input_state_ref.debug_flag())); log && log(xtag("token_text", token_text), xtag("initial_whitespace", initial_whitespace), - xtag("initial_token_prefix_from_input", initial_token_prefix_from_input), - xtag("input", input)); + xtag("input_state", input_state_ref)); tokentype tk_type = tokentype::tk_invalid; std::string tk_text; @@ -394,17 +363,19 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "improperly placed sign indicator", - input_state, + input_state_ref, (ix - tk_start) - )); + ), + input_state_ref); } } else if (*ix == '.') { if (period_flag) { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate decimal point in numeric literal", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } period_flag = true; @@ -413,8 +384,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate exponent marker in numeric literal", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - 
tk_start)), + input_state_ref); } exponent_flag = true; @@ -429,8 +401,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "unexpected character in numeric constant" /*error_description*/, - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } } @@ -532,8 +505,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting key following escape character \\", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } switch(*ix) { @@ -561,8 +535,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting one of n|r|\"|\\ following escape \\", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } break; default: @@ -578,8 +553,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "missing terminating '\"' to complete literal string", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } log && log(tostr("tokenizer::assemble_token", @@ -720,8 +696,9 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "illegal input character", - input_state, - (ix - tk_start))); + input_state_ref, + (ix - tk_start)), + input_state_ref); } if ((tk_type == tokentype::tk_i64) @@ -771,8 +748,11 @@ namespace xo { tk_text.clear(); } + /* input.prefix(0): + * require caller preserves current input line until it's entirely exhausted + */ return result_type(token_type(tk_type, std::move(tk_text)), - input.prefix(initial_whitespace + initial_token_prefix_from_input)); + input_state_ref.current_line().prefix(0)); } /*assemble_token*/ /* TODO: input_state_ as argument ? 
*/ @@ -782,67 +762,44 @@ namespace xo { const input_state_type & input_state) -> result_type { return assemble_token(0 /*initial_whitespace*/, - 0 /*initial_token_prefix_from_input*/, token_text, - span_type::make_null(), input_state); } - /* TODO: prefix_, input_state_ as arguments */ template auto - tokenizer::scan_completion(const span_type & whitespace, - const CharT* token_end, - const span_type & input, - const input_state_type & input_state) -> result_type { - - auto token_span = input.after_prefix(whitespace).prefix_upto(token_end); - - if (this->prefix_.empty()) { - return assemble_token(whitespace.size(), - token_span.size() /*initial_token_prefix_from_input*/, - token_span, - input, - input_state); - } else { - /* whatever we stashed in .prefix_, should be consumed from input. - * control here implies reached end of input with either - * - input for which parsing outcome depends on existence of more input, - * and presence of eof now resolves - * - malformed input (that might represent prefix of a valid token. Say "#incl" in C) - * - * That means stashed .prefix will represent copied range of characters that - * ends at the same position as input - */ - return result_type::make_partial(input); - } - - } - -#ifdef NOT_USING - template - void - tokenizer::capture_current_line(const span_type & input) - { - this->input_state_.capture_current_line(input); - } -#endif - - template - auto - tokenizer::scan(const span_type & input) -> result_type + tokenizer::scan(const span_type & input, bool eof_flag) -> result_type { scope log(XO_DEBUG(input_state_.debug_flag())); log && log(xtag("input", input)); - const CharT * ix = this->input_state_.skip_leading_whitespace(input); + /* - Always at beginning of token when scan() invoked + * - scan will not report any portion of line as consumed until it has + * emitted all tokens in that line. + * rationale: caller is allowed to discard storage that + * scan() reports as consumed. 
But will be holding that line + * until all tokens have been read. + * - this means caller will typically call scan() + * with the same input span multiple times + */ + + /* automagically no-ops when the same input presented twice */ + this->input_state_.capture_current_line(input, eof_flag); + + const CharT * ix = this->input_state_.skip_leading_whitespace(); if(ix == input.hi()) { - /* no-op */ - return result_type::make_whitespace(input.prefix_upto(ix)); + log && log("end input -> consume current line"); + + /* entirety of current line has been tokenized + * -> caller may consume it + */ + return result_type::make_whitespace(this->input_state_.consume_current_line()); } + /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */ + // TODO: // 1. hoist complete_flag up here // 2. use in each branch @@ -850,9 +807,9 @@ namespace xo { /* here: *ix is not whitespace */ - auto whitespace_span = input.prefix_upto(ix); + auto whitespace_z = input_state_.whitespace(); - log && log(xtag("whitespace.size", input_state_.whitespace())); + log && log(xtag("whitespace_z", whitespace_z)); /* tk_start points to known beginning of token * (after any whitespace) @@ -871,12 +828,15 @@ namespace xo { ++ix; +#ifdef OBSOLETE // no longer a thing. 
either input ends in whitespace, or ends translation unit if (ix == input.hi()) { /* need more input to know if/when token complete */ this->prefix_ += std::string(tk_start, input.hi()); log && log(xtag("captured-prefix1", this->prefix_)); - } else { + } else +#endif + { CharT ch2 = *ix; if (((ch2 >= '0') && (ch2 <= '9')) @@ -909,21 +869,28 @@ namespace xo { break; } } else if ((*ix == '\n') || (*ix == '\r')) { + log && log ("string literal with naked newline or CR"); + return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "must use \\n or \\r to encode newline/cr in string literal", input_state_, - (ix - tk_start))); + (ix - tk_start)), + this->input_state_); } prev_ch = *ix; } if (!complete_flag) { - /* need more input to know if/when token complete */ - this->prefix_ += std::string(tk_start, input.hi()); + log && log("unterminated string literal"); - log && log(xtag("captured-prefix2", this->prefix_)); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "unterminated string literal", + input_state_, + (ix - tk_start)), + this->input_state_); } } else { /* ix is start of some token */ @@ -941,8 +908,13 @@ namespace xo { /* include next char and complete token */ ++ix; - return scan_completion(whitespace_span, ix /*token_end*/, input, - this->input_state_); + log && log("complete '->' token"); + + this->input_state_.advance_until(ix); + + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + input_state_); } /* here: -123, -.5e-21 for example */ @@ -959,9 +931,14 @@ namespace xo { CharT ch2 = *ix; if (ch2 != '=') { + log && log("complete '>=' token"); + + this->input_state_.advance_until(ix); + /* ignore next char and complete token */ - return scan_completion(whitespace_span, ix /*token_end*/, input, - this->input_state_); + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + this->input_state_); } /* here: >= for example */ @@ -1003,18 +980,28 @@ namespace xo { } } 
+#ifdef OBSOLETE if (ix == input.hi()) { /* need more input to know if/when token complete */ this->prefix_ += std::string(tk_start, input.hi()); log && log(xtag("captured-prefix5", this->prefix_)); } +#endif } - return scan_completion(whitespace_span, ix /*token_end*/, input, - this->input_state_); + log && log("assemble token z", xtag("token_z", ix - tk_start)); + + assert(tk_start < ix); + + this->input_state_.advance_until(ix); + + return assemble_token(whitespace_z, + span_type(tk_start, ix) /*token*/, + this->input_state_); } /*scan*/ +#ifdef OBSOLETE template auto tokenizer::scan2(const span_type & input, bool eof) -> result_type { @@ -1039,15 +1026,19 @@ namespace xo { span_type::concat(sr.consumed(), sr2.consumed()), sr2.error()); } +#endif +#ifdef OBSOLETE template auto - tokenizer::consume(const span_type & consumed, const span_type & input) -> span_type + tokenizer::consume(const span_type & consumed, + const span_type & input) -> span_type { this->input_state_.consume(consumed.size()); return input.after_prefix(consumed); } +#endif template void @@ -1056,6 +1047,7 @@ namespace xo { this->input_state_.discard_current_line(); } +#ifdef OBSOLETE template auto tokenizer::notify_eof(const span_type & input) -> result_type { @@ -1063,20 +1055,12 @@ namespace xo { log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input)); - if (this->prefix_.empty()) { - /* almost meretricious to include input here, - * when called from scan2() it can only be whitespace - */ - return result_type::make_whitespace(input); - } else { - auto retval = assemble_final_token(span_type::from_string(prefix_), - this->input_state_); - - this->prefix_.clear(); - - return retval; - } + /* almost meretricious to include input here, + * when called from scan2() it can only be whitespace + */ + return result_type::make_whitespace(input); } /*notify_eof*/ +#endif } /*namespace scm*/ } /*namespace xo*/ diff --git a/include/xo/tokenizer/tokenizer_error.hpp 
b/include/xo/tokenizer/tokenizer_error.hpp index ebcf2a0f..6a673e53 100644 --- a/include/xo/tokenizer/tokenizer_error.hpp +++ b/include/xo/tokenizer/tokenizer_error.hpp @@ -121,22 +121,22 @@ namespace xo { if (!error_description_.empty()) { const char * prefix = "input: "; - /* input_state.current_pos: position of first character following preceding token. - * input_state.whitespace: whitespace between current_pos and start of failing token + /* input_state.tk_start: position of first character in token + * input_state.current_pos: position of first character following preceding token. * error_pos: position (relative to start) at which failure detected */ - const size_t tk_start = input_state_.current_pos() + input_state_.whitespace(); + const size_t tk_start = input_state_.tk_start(); const size_t tk_indent = (strlen(prefix) + tk_start); const size_t error_pos = 1 + tk_start + error_pos_; - os << "char: " << error_pos << endl; + os << "token col: " << tk_start << ", error col: " << error_pos << "\n"; os << prefix; for (const char *p = input_state_.current_line().lo(), *e = input_state_.current_line().hi(); p < e; ++p) { os << *p; } - os << endl; + //os << endl; os << std::setw(tk_indent) << " "; for (size_t i = 0; i < error_pos_; ++i) { diff --git a/utest/tokenizer.test.cpp b/utest/tokenizer.test.cpp index ec7d394f..604b9d25 100644 --- a/utest/tokenizer.test.cpp +++ b/utest/tokenizer.test.cpp @@ -232,7 +232,7 @@ namespace xo { in_span(testcase.input_.c_str(), testcase.input_.c_str() + testcase.input_.size()); - auto sr = tkz.scan2(in_span, true /*eof*/); + auto sr = tkz.scan(in_span, true /*eof*/); REHEARSE(rh, sr.get_token().tk_type() == testcase.expected_tk_.tk_type()); if (sr.get_token().tk_type() == tokentype::tk_i64) @@ -408,7 +408,7 @@ namespace xo { { log && log(xtag("i_tk", i_tk)); - auto sr = tkz.scan2(in_span, in_span.empty()); + auto sr = tkz.scan(in_span, in_span.empty()); const auto & tk = sr.get_token(); if (tk.is_valid()) { @@ -454,6 +454,8 @@ 
namespace xo { make_testcase(const char * input, const char * src_function, const char * error_descr, size_t tk_start, size_t whitespace, size_t error_pos) { + size_t line_no = 1; + testcase_error retval; retval.input_ = input; retval.expect_error_ = tkz_error_type(src_function, error_descr, @@ -548,7 +550,7 @@ namespace xo { auto in_span = tokenizer::span_type::from_string(testcase.input_); - auto sr = tkz.scan2(in_span, true /*eof*/); + auto sr = tkz.scan(in_span, true /*eof*/); REHEARSE(rh, sr.is_error());