/* @file input_state.hpp * * author: Roland Conybeare, Jun 2025 */ #pragma once #include "span.hpp" namespace xo { namespace scm { /** enum to report outcome of @ref capture_current_line **/ enum class input_error { /** normal return, input line successfully identified and captured **/ ok = 0, /** incomplete input; should not have been submitted to @ref capture_current_line. * note: submit last line of input with eof_flag=true **/ incomplete, N }; /** @class input_state * @brief Track detailed input position for use in error messages * * input characters fall into two categories: * - consumed: memory can be reclaimed/recycled * - buffered: memory will be retained unaltered until consumed * * remarks: * - always in one of two states: * - empty * - contains exactly one line of input * - also record current input position. * Use this for example to identify where tokenizer rejected input. * - .current_pos advances by one token * * - buffered characters always form a single contiguous range. * - input_state does not own any storage; storage is owned elsewhere * * @text * * <------------------.current_line------------------> * > <-- .whitespace * cccccccccccccccccccccccccccccccc__TTTTTTTTxxxxxxxxx * ^ ^ ^ * .current_line.lo | .current_line.hi * .current_pos * * <----prev_line----> <----current_line----> * > <--whitespace * ppppppppppppppppppp cccccccccccc__TTTTTTTT * ^ * * @endtext **/ template class input_state { public: /** @defgroup input-state-type-traits input-state type straits **/ ///@{ /** type representing a contiguous span of tokenizer input characters **/ using span_type = span; ///@} public: /** @defgroup input-state-ctors input_state constructors **/ ///@{ input_state() = default; explicit input_state(bool debug_flag) : debug_flag_{debug_flag} {} /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. * Introduced for unit tests, not used in tokenizer. **/ explicit input_state(const span& current_line, size_t current_pos, size_t whitespace) : current_line_{current_line}, current_pos_{current_pos}, whitespace_{whitespace} {} ///@} /** @defgroup input-state-static-methods input_state static methods **/ ///@{ /** recognize the newline character '\n' **/ static bool is_newline(CharT ch); /** identifies whitespace chars. * These are chars that do not belong to any token. * They are not permitted to appear within * a symbol or string token. * Appearance of a whitespace char forces completioon of * preceding token. **/ static bool is_whitespace(CharT ch); ///@} /** @defgroup input-state-access-methods **/ ///@{ #pragma GCC diagnostic push #ifndef __APPLE__ #pragma GCC diagnostic ignored "-Wchanges-meaning" #endif const span_type & current_line() const { return current_line_; } #pragma GCC diagnostic pop size_t tk_start() const { return tk_start_; } size_t current_pos() const { return current_pos_; } size_t whitespace() const { return whitespace_; } bool debug_flag() const { return debug_flag_; } ///@} /** @defgroup input-state-general-methods **/ ///@{ /** Input state less @p n chars. * Use to recover input state before a complete but error-triggering token **/ input_state rewind(std::size_t n) const; /** Capture prefix of @p input up to first newline. * Set read position to start of line. * * Alters: * .current_line * .current_pos * * Return pair comprising error code and input span representing first line * (including trailing newline) from @p input. **/ std::pair capture_current_line(const span_type & input, bool eof_flag); /** atomically return current line while discarding it from input state * * Alters * .current_line * .current_pos * .whitespace **/ span_type consume_current_line(); /** Reset input state for start of next line. * Expression parser may use this to discard remainder of input line * after a parsing error. * * Alters: * .current_line * .current_pos * .whitespace **/ void discard_current_line(); /** Advance input position by @p z * * Alters: * .current_pos **/ void advance(size_t z); /** Advance .current_pos to pos. * Require: pos in @ref current_line_ **/ void advance_until(const CharT * pos); /** Skip prefix of input, starting at current read position, * comprising only whitespace. * * Presume input position is at end of token; * on return @ref whitespace_ counts number of whitespace characters * skipped. * * Return pointer to first non-whitespace character after @ref current_pos_ * or @ref current_line_.hi if reached end of buffered line. * * Alters: * .whitespace **/ const CharT * skip_leading_whitespace(); ///@} private: /** @defgroup input-state-instance-vars input_state instance variables **/ ///@{ /** remember current input line. Used only to report errors **/ span current_line_ = span(); /** start of last token within @ref current_line_ **/ size_t tk_start_ = 0; /** input position within @ref current_line_ **/ size_t current_pos_ = 0; /** number of whitespace chars since end of preceding token, * or last newline, whichever is less **/ size_t whitespace_ = 0; /** true to log input activity */ bool debug_flag_ = false; ///@} }; /*input_state*/ template bool input_state::is_newline(CharT ch) { return (ch == '\n'); } template bool input_state::is_whitespace(CharT ch) { switch(ch) { case ' ': return true; case '\t': return true; case '\n': return true; case '\r': return true; } return false; } template input_state input_state::rewind(std::size_t n) const { return input_state(this->current_line_, (n <= current_pos_) ? current_pos_ - n : 0, 0 /*whitespace*/); } template void input_state::advance(size_t z) { scope log(XO_DEBUG(debug_flag_)); this->current_pos_ += z; log && log(xtag("z", z), xtag("current_pos", current_pos_)); } template void input_state::advance_until(const CharT * pos) { scope log(XO_DEBUG(debug_flag_)); assert(current_line_.lo() <= pos && pos <= current_line_.hi()); this->current_pos_ = pos - current_line_.lo(); log && log(xtag("current_pos", current_pos_)); } template auto input_state::consume_current_line() -> span_type { span_type retval = current_line_; this->discard_current_line(); return retval; } template void input_state::discard_current_line() { this->current_line_ = span_type::make_null(); this->current_pos_ = 0; this->whitespace_ = 0; } template auto input_state::capture_current_line(const span_type & input, bool eof_flag) -> std::pair { // see also discard_current_line() // note: must capture entirety of first line, // for example including leading whitespace. // See discussion in tokenizer scan() method scope log(XO_DEBUG(debug_flag_)); /* look ahead to {end of line, end of input}, whichever comes first */ const CharT * sol = input.lo(); const CharT * eol = sol; if (sol == current_line_.lo()) { log && log("short-circuit - current line already stashed"); /* nothing to do here */ return std::make_pair(input_error::ok, current_line_); } while ((eol < input.hi()) && (*eol != '\n')) ++eol; if (*eol == '\n') { /* include \n at end-of-line */ ++eol; } else { if (!eof_flag) { /* caller expected to provide complete line of input. complain and ignore */ return std::make_pair(input_error::incomplete, input.prefix(0ul)); } } this->current_line_ = span_type(sol, eol); this->current_pos_ = 0; this->whitespace_ = 0; log && log(xtag("current_line", print::printspan(current_line_)), xtag("current_pos", current_pos_)); return std::make_pair(input_error::ok, span_type(sol, eol)); } template const CharT * input_state::skip_leading_whitespace() { scope log(XO_DEBUG(debug_flag_)); const CharT * ix = current_line_.lo() + current_pos_; this->whitespace_ = 0; /* skip whitespace + remember beginning of most recent line */ while (is_whitespace(*ix) && (ix != current_line_.hi())) { ++ix; ++(this->whitespace_); } this->tk_start_ = ix - current_line_.lo(); this->current_pos_ = ix - current_line_.lo(); return ix; } template inline std::ostream & operator<<(std::ostream & os, const input_state& x) { using xo::print::unq; os << ""; return os; } } }