xo-tokenizer: streamline input_state/tokenizer interaction + docs
This commit is contained in:
parent
093f8a4b7c
commit
21aa8978cf
3 changed files with 85 additions and 69 deletions
|
|
@ -51,6 +51,8 @@ main() {
|
|||
//input = input.after_prefix(consumed.size());
|
||||
}
|
||||
|
||||
/* here: input.empty() or error encountered */
|
||||
|
||||
/* discard stashed remainder of input line
|
||||
* (for nicely-formatted errors)
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -20,8 +20,23 @@ namespace xo {
|
|||
|
||||
public:
|
||||
input_state() = default;
|
||||
explicit input_state(const span<const CharT>& x, size_t cpos, size_t ws)
|
||||
: current_line_{x}, current_pos_{cpos}, whitespace_{ws} {}
|
||||
explicit input_state(bool debug_flag) : debug_flag_{debug_flag} {}
|
||||
/** Create instance with supplied @p current_line, @p current_pos, @p whitespace.
|
||||
* Introduced for unit tests, not used in tokenizer.
|
||||
**/
|
||||
explicit input_state(const span<const CharT>& current_line, size_t current_pos, size_t whitespace)
|
||||
: current_line_{current_line}, current_pos_{current_pos}, whitespace_{whitespace} {}
|
||||
|
||||
/** recognize the newline character '\n' **/
|
||||
static bool is_newline(CharT ch);
|
||||
/** identifies whitespace chars.
|
||||
* These are chars that do not belong to any token.
|
||||
* They are not permitted to appear within
|
||||
* a symbol or string token.
|
||||
* Appearance of a whitespace char forces completion of
|
||||
* preceding token.
|
||||
**/
|
||||
static bool is_whitespace(CharT ch);
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wchanges-meaning"
|
||||
|
|
@ -29,14 +44,20 @@ namespace xo {
|
|||
#pragma GCC diagnostic pop
|
||||
size_t current_pos() const { return current_pos_; }
|
||||
size_t whitespace() const { return whitespace_; }
|
||||
bool debug_flag() const { return debug_flag_; }
|
||||
|
||||
/** capture prefix of @p input up to first newline **/
|
||||
void capture_current_line(const span_type & input);
|
||||
|
||||
/** reset input state for start of next line **/
|
||||
void discard_current_line();
|
||||
|
||||
void consume(size_t z) { current_pos_ += z; }
|
||||
|
||||
void reset_whitespace() { whitespace_ = 0; }
|
||||
void increment_whitespace() { ++whitespace_; }
|
||||
const CharT * skip_leading_whitespace(const span_type & input);
|
||||
|
||||
private:
|
||||
//void reset_whitespace() { whitespace_ = 0; }
|
||||
|
||||
private:
|
||||
/** remember current input line. Used only to report errors **/
|
||||
|
|
@ -48,14 +69,35 @@ namespace xo {
|
|||
**/
|
||||
size_t whitespace_ = 0;
|
||||
|
||||
/** true to log input activity */
|
||||
bool debug_flag_ = false;
|
||||
};
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
input_state<CharT>::is_newline(CharT ch) {
|
||||
return (ch == '\n');
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
input_state<CharT>::is_whitespace(CharT ch) {
|
||||
switch(ch) {
|
||||
case ' ': return true;
|
||||
case '\t': return true;
|
||||
case '\n': return true;
|
||||
case '\r': return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
void
|
||||
input_state<CharT>::discard_current_line() {
|
||||
this->current_line_ = span_type::make_null();
|
||||
this->current_pos_ = 0;
|
||||
this->whitespace_ = 0;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
|
|
@ -74,9 +116,36 @@ namespace xo {
|
|||
++eol;
|
||||
|
||||
this->current_line_ = span_type(sol, eol);
|
||||
// this->current_pos_ = 0;
|
||||
|
||||
log && log(xtag("current_line", print::printspan(current_line_)));
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
const CharT *
|
||||
input_state<CharT>::skip_leading_whitespace(const span_type & input)
|
||||
{
|
||||
const CharT * ix = input.lo();
|
||||
|
||||
if (this->current_line().is_null()) {
|
||||
this->capture_current_line(input);
|
||||
}
|
||||
|
||||
this->whitespace_ = 0;
|
||||
|
||||
/* skip whitespace + remember beginning of most recent line */
|
||||
while (is_whitespace(*ix) && (ix != input.hi())) {
|
||||
if (is_newline(*ix)) {
|
||||
++ix;
|
||||
|
||||
this->capture_current_line(span_type(ix, input.hi()));
|
||||
} else {
|
||||
++ix;
|
||||
|
||||
++(this->whitespace_);
|
||||
}
|
||||
}
|
||||
|
||||
return ix;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -60,18 +60,6 @@ namespace xo {
|
|||
public:
|
||||
tokenizer(bool debug_flag = false);
|
||||
|
||||
/** recognize the newline character '\n' **/
|
||||
bool is_newline(CharT ch) const;
|
||||
|
||||
/** identifies whitespace chars.
|
||||
* These are chars that do not belong to any token.
|
||||
* They are not permitted to appear within
|
||||
* a symbol or string token.
|
||||
* Appearance of a whitespace char forces completion of
|
||||
* preceding token.
|
||||
**/
|
||||
bool is_whitespace(CharT ch) const;
|
||||
|
||||
/** identifies punctuation chars.
|
||||
* These are chars that are not permitted to appear within
|
||||
* a symbol token. Instead they force completion of
|
||||
|
|
@ -143,15 +131,11 @@ namespace xo {
|
|||
result_type notify_eof(const span_type & input);
|
||||
|
||||
private:
|
||||
void capture_current_line(const span_type & input);
|
||||
|
||||
result_type scan_completion(const span_type & whitespace,
|
||||
const CharT* token_end,
|
||||
const span_type & input);
|
||||
|
||||
private:
|
||||
/** true to log tokenizer activity to stdout **/
|
||||
bool debug_flag_ = false;
|
||||
/** track input state (line#,pos,..) for error messages **/
|
||||
input_state_type input_state_;
|
||||
/** Accumulate partial token here.
|
||||
|
|
@ -163,28 +147,9 @@ namespace xo {
|
|||
|
||||
template <typename CharT>
|
||||
tokenizer<CharT>::tokenizer(bool debug_flag)
|
||||
: debug_flag_{debug_flag}
|
||||
: input_state_{debug_flag}
|
||||
{}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_newline(CharT ch) const {
|
||||
return (ch == '\n');
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_whitespace(CharT ch) const {
|
||||
switch(ch) {
|
||||
case ' ': return true;
|
||||
case '\t': return true;
|
||||
case '\n': return true;
|
||||
case '\r': return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_1char_punctuation(CharT ch) const {
|
||||
|
|
@ -266,7 +231,7 @@ namespace xo {
|
|||
/* literal|pretty|streamlined */
|
||||
log_config::style = function_style::streamlined;
|
||||
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
scope log(XO_DEBUG(input_state_.debug_flag()));
|
||||
log && log(xtag("token_text", token_text),
|
||||
xtag("initial_whitespace", initial_whitespace),
|
||||
xtag("initial_token_prefix_from_input", initial_token_prefix_from_input),
|
||||
|
|
@ -764,42 +729,24 @@ namespace xo {
|
|||
|
||||
}
|
||||
|
||||
#ifdef NOT_USING
|
||||
template <typename CharT>
|
||||
void
|
||||
tokenizer<CharT>::capture_current_line(const span_type & input)
|
||||
{
|
||||
this->input_state_.capture_current_line(input);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::scan(const span_type & input) -> result_type
|
||||
{
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
scope log(XO_DEBUG(input_state_.debug_flag()));
|
||||
|
||||
log && log(xtag("input", input));
|
||||
|
||||
const CharT * ix = input.lo();
|
||||
|
||||
if (this->input_state_.current_line().is_null()) {
|
||||
this->capture_current_line(input);
|
||||
}
|
||||
|
||||
this->input_state_.reset_whitespace();
|
||||
|
||||
/* skip whitespace + remember beginning of most recent line */
|
||||
while (is_whitespace(*ix) && (ix != input.hi())) {
|
||||
if (is_newline(*ix)) {
|
||||
++ix;
|
||||
|
||||
this->capture_current_line(span_type(ix, input.hi()));
|
||||
this->input_state_.reset_whitespace();
|
||||
} else {
|
||||
++ix;
|
||||
|
||||
this->input_state_.increment_whitespace();
|
||||
}
|
||||
}
|
||||
const CharT * ix = this->input_state_.skip_leading_whitespace(input);
|
||||
|
||||
if(ix == input.hi()) {
|
||||
/* no-op */
|
||||
|
|
@ -937,7 +884,7 @@ namespace xo {
|
|||
* - punctuation
|
||||
*/
|
||||
for (; ix != input.hi(); ++ix) {
|
||||
if (is_whitespace(*ix)
|
||||
if (input_state_type::is_whitespace(*ix)
|
||||
|| is_1char_punctuation(*ix)
|
||||
|| is_2char_punctuation(*ix))
|
||||
{
|
||||
|
|
@ -981,7 +928,7 @@ namespace xo {
|
|||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::scan2(const span_type & input, bool eof) -> result_type {
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
scope log(XO_DEBUG(input_state_.debug_flag()));
|
||||
|
||||
auto sr = this->scan(input);
|
||||
|
||||
|
|
@ -1016,15 +963,13 @@ namespace xo {
|
|||
void
|
||||
tokenizer<CharT>::discard_current_line()
|
||||
{
|
||||
// see capture_current_line()
|
||||
|
||||
this->input_state_.discard_current_line();
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::notify_eof(const span_type & input) -> result_type {
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
scope log(XO_DEBUG(input_state_.debug_flag()));
|
||||
|
||||
log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input));
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue