diff --git a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp index 77606a0b..b1a2fc55 100644 --- a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp +++ b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp @@ -51,6 +51,8 @@ main() { //input = input.after_prefix(consumed.size()); } + /* here: input.empty() or error encountered */ + /* discard stashed remainder of input line * (for nicely-formatted errors) */ diff --git a/xo-tokenizer/include/xo/tokenizer/input_state.hpp b/xo-tokenizer/include/xo/tokenizer/input_state.hpp index 12c570d5..bd321382 100644 --- a/xo-tokenizer/include/xo/tokenizer/input_state.hpp +++ b/xo-tokenizer/include/xo/tokenizer/input_state.hpp @@ -20,8 +20,23 @@ namespace xo { public: input_state() = default; - explicit input_state(const span& x, size_t cpos, size_t ws) - : current_line_{x}, current_pos_{cpos}, whitespace_{ws} {} + explicit input_state(bool debug_flag) : debug_flag_{debug_flag} {} + /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. + * Introduced for unit tests, not used in tokenizer. + **/ + explicit input_state(const span& current_line, size_t current_pos, size_t whitespace) + : current_line_{current_line}, current_pos_{current_pos}, whitespace_{whitespace} {} + + /** recognize the newline character '\n' **/ + static bool is_newline(CharT ch); + /** identifies whitespace chars. + * These are chars that do not belong to any token. + * They are not permitted to appear within + * a symbol or string token. + * Appearance of a whitespace char forces completion of + * preceding token. 
+ **/ + static bool is_whitespace(CharT ch); #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wchanges-meaning" @@ -29,14 +44,20 @@ namespace xo { #pragma GCC diagnostic pop size_t current_pos() const { return current_pos_; } size_t whitespace() const { return whitespace_; } + bool debug_flag() const { return debug_flag_; } + /** capture prefix of @p input up to first newline **/ void capture_current_line(const span_type & input); + + /** reset input state for start of next line **/ void discard_current_line(); void consume(size_t z) { current_pos_ += z; } - void reset_whitespace() { whitespace_ = 0; } - void increment_whitespace() { ++whitespace_; } + const CharT * skip_leading_whitespace(const span_type & input); + + private: + //void reset_whitespace() { whitespace_ = 0; } private: /** remember current input line. Used only to report errors **/ @@ -48,14 +69,35 @@ namespace xo { **/ size_t whitespace_ = 0; + /** true to log input activity */ bool debug_flag_ = false; }; + template + bool + input_state::is_newline(CharT ch) { + return (ch == '\n'); + } + + template + bool + input_state::is_whitespace(CharT ch) { + switch(ch) { + case ' ': return true; + case '\t': return true; + case '\n': return true; + case '\r': return true; + } + + return false; + } + template void input_state::discard_current_line() { this->current_line_ = span_type::make_null(); this->current_pos_ = 0; + this->whitespace_ = 0; } template @@ -74,9 +116,37 @@ namespace xo { ++eol; this->current_line_ = span_type(sol, eol); -// this->current_pos_ = 0; log && log(xtag("current_line", print::printspan(current_line_))); } + + template + const CharT * + input_state::skip_leading_whitespace(const span_type & input) + { + const CharT * ix = input.lo(); + + if (this->current_line().is_null()) { + this->capture_current_line(input); + } + + this->whitespace_ = 0; + + /* skip whitespace + remember beginning of most recent line */ + while ((ix != input.hi()) && is_whitespace(*ix)) { /* test bound first so *ix is never read at input.hi() */ + if 
(is_newline(*ix)) { + ++ix; + + this->capture_current_line(span_type(ix, input.hi())); + this->whitespace_ = 0; /* count restarts on each new line, as in the tokenizer::scan loop this replaces */ + } else { + ++ix; + + ++(this->whitespace_); + } + } + + return ix; + } } } diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp index 7dc0e750..1eda4f10 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -60,18 +60,6 @@ namespace xo { public: tokenizer(bool debug_flag = false); - /** recognize the newline character '\n' **/ - bool is_newline(CharT ch) const; - - /** identifies whitespace chars. - * These are chars that do not belong to any token. - * They are not permitted to appear within - * a symbol or string token. - * Appearance of a whitespace char forces completioon of - * preceding token. - **/ - bool is_whitespace(CharT ch) const; - /** identifies punctuation chars. * These are chars that are not permitted to appear within * a symbol token. Instead they force completion of @@ -143,15 +131,11 @@ namespace xo { result_type notify_eof(const span_type & input); private: - void capture_current_line(const span_type & input); - result_type scan_completion(const span_type & whitespace, const CharT* token_end, const span_type & input); private: - /** true to log tokenizer activity to stdout **/ - bool debug_flag_ = false; /** track input state (line#,pos,..) for error messages **/ input_state_type input_state_; /** Accumulate partial token here. 
@@ -163,28 +147,9 @@ namespace xo { template tokenizer::tokenizer(bool debug_flag) - : debug_flag_{debug_flag} + : input_state_{debug_flag} {} - template - bool - tokenizer::is_newline(CharT ch) const { - return (ch == '\n'); - } - - template - bool - tokenizer::is_whitespace(CharT ch) const { - switch(ch) { - case ' ': return true; - case '\t': return true; - case '\n': return true; - case '\r': return true; - } - - return false; - } - template bool tokenizer::is_1char_punctuation(CharT ch) const { @@ -266,7 +231,7 @@ namespace xo { /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; - scope log(XO_DEBUG(debug_flag_)); + scope log(XO_DEBUG(input_state_.debug_flag())); log && log(xtag("token_text", token_text), xtag("initial_whitespace", initial_whitespace), xtag("initial_token_prefix_from_input", initial_token_prefix_from_input), @@ -764,42 +729,24 @@ namespace xo { } +#ifdef NOT_USING template void tokenizer::capture_current_line(const span_type & input) { this->input_state_.capture_current_line(input); } +#endif template auto tokenizer::scan(const span_type & input) -> result_type { - scope log(XO_DEBUG(debug_flag_)); + scope log(XO_DEBUG(input_state_.debug_flag())); log && log(xtag("input", input)); - const CharT * ix = input.lo(); - - if (this->input_state_.current_line().is_null()) { - this->capture_current_line(input); - } - - this->input_state_.reset_whitespace(); - - /* skip whitespace + remember beginning of most recent line */ - while (is_whitespace(*ix) && (ix != input.hi())) { - if (is_newline(*ix)) { - ++ix; - - this->capture_current_line(span_type(ix, input.hi())); - this->input_state_.reset_whitespace(); - } else { - ++ix; - - this->input_state_.increment_whitespace(); - } - } + const CharT * ix = this->input_state_.skip_leading_whitespace(input); if(ix == input.hi()) { /* no-op */ @@ -937,7 +884,7 @@ namespace xo { * - punctuation */ for (; ix != input.hi(); ++ix) { - if (is_whitespace(*ix) + if 
(input_state_type::is_whitespace(*ix) || is_1char_punctuation(*ix) || is_2char_punctuation(*ix)) { @@ -981,7 +928,7 @@ namespace xo { template auto tokenizer::scan2(const span_type & input, bool eof) -> result_type { - scope log(XO_DEBUG(debug_flag_)); + scope log(XO_DEBUG(input_state_.debug_flag())); auto sr = this->scan(input); @@ -1016,15 +963,13 @@ namespace xo { void tokenizer::discard_current_line() { - // see capture_current_line() - this->input_state_.discard_current_line(); } template auto tokenizer::notify_eof(const span_type & input) -> result_type { - scope log(XO_DEBUG(debug_flag_)); + scope log(XO_DEBUG(input_state_.debug_flag())); log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input));