From 27ef5701acb6b8196f22db7e3d71f9a82020a6c5 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sun, 22 Jun 2025 16:16:23 -0500 Subject: [PATCH] xo-tokenizer: bugfix: yields token works + 2phase utest --- xo-reader/src/reader/reader.cpp | 6 +- xo-tokenizer/CMakeLists.txt | 8 +- xo-tokenizer/include/xo/tokenizer/span.hpp | 35 ++ xo-tokenizer/include/xo/tokenizer/token.hpp | 4 +- .../include/xo/tokenizer/tokenizer.hpp | 462 +++++++++++++----- .../include/xo/tokenizer/tokentype.hpp | 6 +- xo-tokenizer/utest/token.test.cpp | 106 ++-- xo-tokenizer/utest/tokenizer.test.cpp | 405 ++++++++++----- 8 files changed, 721 insertions(+), 311 deletions(-) diff --git a/xo-reader/src/reader/reader.cpp b/xo-reader/src/reader/reader.cpp index d08b9552..eb012541 100644 --- a/xo-reader/src/reader/reader.cpp +++ b/xo-reader/src/reader/reader.cpp @@ -32,10 +32,10 @@ namespace xo { while (!input.empty()) { /* read one token from input */ auto sr = this->tokenizer_.scan2(input, eof); - const auto & tk = sr.first; - const span_type & used_span = sr.second; + const auto & tk = sr.get_token(); + const span_type & used_span = sr.consumed(); - log && log(xtag("used_span", used_span)); + log && log(xtag("consumed", used_span)); log && log(xtag("input.pre", input)); input = input.after_prefix(used_span); diff --git a/xo-tokenizer/CMakeLists.txt b/xo-tokenizer/CMakeLists.txt index 147e16c7..74d93c9e 100644 --- a/xo-tokenizer/CMakeLists.txt +++ b/xo-tokenizer/CMakeLists.txt @@ -19,9 +19,11 @@ add_definitions(${PROJECT_CXX_FLAGS}) # ---------------------------------------------------------------- add_subdirectory(src/tokenizer) +add_subdirectory(example) add_subdirectory(utest) +xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets) # ---------------------------------------------------------------- -# provide find_package() support - -xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets) +# docs targets depend on all the other library/utest 
targets +# +add_subdirectory(docs) diff --git a/xo-tokenizer/include/xo/tokenizer/span.hpp b/xo-tokenizer/include/xo/tokenizer/span.hpp index 22695ec5..6c9c5262 100644 --- a/xo-tokenizer/include/xo/tokenizer/span.hpp +++ b/xo-tokenizer/include/xo/tokenizer/span.hpp @@ -2,6 +2,7 @@ #pragma once +#include "xo/indentlog/scope.hpp" #include #include #include @@ -24,6 +25,9 @@ namespace xo { /** @brief create span for the contiguous memory range [@p lo, @p hi) **/ span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} + /** @brief create a null span (i.e. with null @p lo, @p hi pointers) **/ + static span make_null() { return span(nullptr, nullptr); } + /** @brief create span for C-style string @p cstr **/ static span from_cstr(const CharT * cstr) { CharT * lo = cstr; @@ -32,6 +36,35 @@ namespace xo { return span(lo, hi); } + /** @brief create span from std::string @p str **/ + static span from_string(const std::string& str) { + CharT * lo = &(*str.begin()); + CharT * hi = &(*str.end()); + + return span(lo, hi); + } + + /** @brief concatenate two contiguous spans */ + static span concat(const span & span1, const span & span2) { + if (span1.is_null()) + return span2; + if (span2.is_null()) + return span1; + + if (span1.hi() != span2.lo()) { + scope log(XO_DEBUG(true)); + + log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo())); + } + + assert(span1.hi() == span2.lo()); + + CharT * lo = span1.lo(); + CharT * hi = span2.hi(); + + return span(lo, hi); + } + ///@{ /** @name getters **/ @@ -96,6 +129,8 @@ namespace xo { return span(hi_, hi_); } + /** @brief true iff this span is null. distinct from empty. **/ + bool is_null() const { return lo_ == nullptr && hi_ == nullptr; } /** @brief true iff this span is empty (comprises 0 elements). **/ bool empty() const { return lo_ == hi_; } /** @brief report the number of elements (of type CharT) in this span. 
**/ diff --git a/xo-tokenizer/include/xo/tokenizer/token.hpp b/xo-tokenizer/include/xo/tokenizer/token.hpp index 988b4976..c9132183 100644 --- a/xo-tokenizer/include/xo/tokenizer/token.hpp +++ b/xo-tokenizer/include/xo/tokenizer/token.hpp @@ -80,7 +80,10 @@ namespace xo { static token assign_token() { return token(tokentype::tk_assign); } static token yields() { return token(tokentype::tk_yields); } + static token plus_token() { return token(tokentype::tk_plus); } + static token minus_token() { return token(tokentype::tk_minus); } static token star_token() { return token(tokentype::tk_star); } + static token slash_token() { return token(tokentype::tk_slash); } static token type() { return token(tokentype::tk_type); } static token def() { return token(tokentype::tk_def); } @@ -355,5 +358,4 @@ namespace xo { } /*Namespace scm*/ } /*namespace xo*/ - /* end token.hpp */ diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp index b9782b10..e36a54af 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -7,6 +7,7 @@ #include "token.hpp" #include "span.hpp" +#include "scan_result.hpp" #include "xo/indentlog/scope.hpp" #include @@ -21,7 +22,7 @@ namespace xo { * tokenizer_type tkz; * span_type input = ...; * - * while !input.empty() { + * while (!input.empty()) { * auto res = tkz.scan(input); * const auto & tk = res.first; * @@ -39,22 +40,27 @@ namespace xo { * // expect !tkz.has_prefix() * * @endcode + * + * See tokentype.hpp for token types **/ template class tokenizer { public: using token_type = token; using span_type = span; - using scan_result = std::pair; + using result_type = scan_result; public: - tokenizer() = default; + tokenizer(bool debug_flag = false); + + /** recognize the newline character '\n' **/ + bool is_newline(CharT ch) const; /** identifies whitespace chars. * These are chars that do not belong to any token. 
* They are not permitted to appear within * a symbol or string token. - * Appearance of a whitespace char forces completion of + * Appearance of a whitespace char forces completion of * preceding token. **/ bool is_whitespace(CharT ch) const; @@ -77,28 +83,59 @@ namespace xo { **/ bool has_prefix() const { return !prefix_.empty(); } - /** assemble token from text @p token_text + /** assemble token from text @p token_text. + * @p token_text will often but not always represent a subset of @p input. + * (For example consider multi-line string literals) + * Also the span @p token_text may (in uncommon cases) + * have been copied to separate storage from @p input + * + * @p initial_whitespace Amount of whitespace input being consumed from input. + * @p initial_token_prefix_from_input Amount of non-whitespace input being + * consumed from input. Not counting any stashed-and-already-consumed input + * + * retval.consumed will represent some possibly-empty prefix of @p input **/ - token_type assemble_token(const span_type & token_text) const; + result_type assemble_token(std::size_t initial_whitespace, + std::size_t initial_token_prefix_from_input, + const span_type & token_text, + const span_type & input) const; + + /** degenerate version of assemble_token() on reaching end-of-file **/ + result_type assemble_final_token(const span_type & token_text) const; /** scan for next input token, given @p input. - * Note tokenizer can consume input (e.g. whitespace) - * without completing a token + * Note: + * - tokenizer can consume input (e.g. whitespace) + * without completing a token + * - tokenizer will remember the extent of the last line of input + * for which parsing has begun, but not completed. 
+ * It's required that at least that portion of the input span + * remain valid across scan(), scan2() calls * * @return {parsed token, consumed span} **/ - scan_result scan(const span_type & input); + result_type scan(const span_type & input); /** When eof is false, same as scan(input). * When eof is true and scan(input) does not report a token, * return notify_eof() **/ - scan_result scan2(const span_type & input, bool eof); + result_type scan2(const span_type & input, bool eof); - /** notify end of input, resolve any stored input **/ - token_type notify_eof(); + /** notify end of input, resolving any ambiguous input stashed in .prefix + **/ + result_type notify_eof(const span_type & input); private: + result_type scan_completion(const span_type & whitespace, + const CharT* token_end, + const span_type & input); + + private: + /** true to log tokenizer activity to stdout **/ + bool debug_flag_ = false; + /** remember start of current line here **/ + span_type current_line_ = span_type::make_null(); /** Accumulate partial token here. * This will happen if input sent to @ref tokenizer::scan * ends without a determinate token boundary. @@ -106,6 +143,17 @@ namespace xo { std::string prefix_; }; /*tokenizer*/ + template + tokenizer::tokenizer(bool debug_flag) + : debug_flag_{debug_flag} + {} + + template + bool + tokenizer::is_newline(CharT ch) const { + return (ch == '\n'); + } + template bool tokenizer::is_whitespace(CharT ch) const { @@ -126,7 +174,10 @@ namespace xo { case '<': return true; case '>': - return true; + /* can't be punctuation + * - appears in tk_yields token: -> + */ + return false; case '(': return true; case ')': @@ -149,7 +200,10 @@ namespace xo { case '=': return true; case '-': - /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */ + /* can't be punctuation + * - can appear inside f64 token: e.g. 1.23e-9. 
+ * - begins tk_yields token: -> + */ return false; case '+': /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */ @@ -171,6 +225,10 @@ namespace xo { template bool tokenizer::is_2char_punctuation(CharT ch) const { + /* can't put '-' here, because of the way it appears in numeric literals + * characters here may not appear in symbol names + */ + switch(ch) { case ':': /* can begin := */ @@ -182,15 +240,19 @@ namespace xo { template auto - tokenizer::assemble_token(const span_type & token_text) const -> token_type + tokenizer::assemble_token(std::size_t initial_whitespace, + std::size_t initial_token_prefix_from_input, + const span_type & token_text, + const span_type & input) const -> result_type { - constexpr bool c_debug_flag = true; - /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; - scope log(XO_DEBUG(c_debug_flag)); - log && log(xtag("token_text", token_text)); + scope log(XO_DEBUG(debug_flag_)); + log && log(xtag("token_text", token_text), + xtag("initial_whitespace", initial_whitespace), + xtag("initial_token_prefix_from_input", initial_token_prefix_from_input), + xtag("input", input)); tokentype tk_type = tokentype::tk_invalid; std::string tk_text; @@ -265,79 +327,89 @@ namespace xo { /* true if at least one digit encountered */ bool number_flag = false; - /* token will be one of: {i64, f64, dot}: */ - for(; ix != token_text.hi(); ++ix) { - if((*ix == '-') || (*ix == '+')) { - /* sign allowed: - * 1. before period and before first digit - * 2. 
after exponent - */ - if (!period_flag && !number_flag && !sign_flag) { - sign_flag = true; - } else if (exponent_flag && !exponent_digit_flag) { - exponent_sign_flag = true; - } else { - throw std::runtime_error - (tostr("tokenizer::assemble_token", - ": improperly placed sign indicator", - xtag("pos", ix - tk_start), - xtag("char", *ix))); - } - } else if(*ix == '.') { - if (period_flag) { - throw (std::runtime_error - (tostr("tokenizer::assemble_token", - ": duplicate decimal point", - xtag("pos", ix - tk_start), - xtag("char", *ix)))); - } + log && log(xtag("*ix", *ix), + xtag("tk.length", token_text.size())); + if (log && (ix + 1 < tk_end)) + log(xtag("*(ix+1)", *(ix + 1))); - period_flag = true; - } else if((*ix == 'e') || (*ix == 'E')) { - if (exponent_flag) { - throw (std::runtime_error - (tostr("tokenizer::assemble_token", - ": duplicate exponent marker", - xtag("pos", ix - tk_start), - xtag("char", *ix)))); - } - - exponent_flag = true; - } else if(isdigit(*ix)) { - if (exponent_flag) { - /* need digit before exponent to recognize as number */ - exponent_digit_flag = true; - } else { - number_flag = true; - } - } else { - /* invalid input */ - throw (std::runtime_error - (tostr("tokenizer::assemble_token", - ": unexpected character in numeric constant", - xtag("pos", ix - tk_start), - xtag("char", *ix)))); - } - } - - if (number_flag) { - if (period_flag || exponent_flag) { - tk_type = tokentype::tk_f64; - } else { - tk_type = tokentype::tk_i64; - } - } else if (period_flag && !exponent_flag) { - tk_type = tokentype::tk_dot; + if ((*ix == '-') && (ix + 2 == token_text.hi()) && (*(ix + 1) == '>')) { + /* composing exactly '->' */ + tk_type = tokentype::tk_yields; } else { - /* not a valid token */ - } + /* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */ + for (; ix != token_text.hi(); ++ix) { + if ((*ix == '-') || (*ix == '+')) { + /* sign allowed: + * 1. before period and before first digit + * 2. 
after exponent + */ + if (!period_flag && !number_flag && !sign_flag) { + sign_flag = true; + } else if (exponent_flag && !exponent_digit_flag) { + exponent_sign_flag = true; + } else { + throw std::runtime_error + (tostr("tokenizer::assemble_token", + ": improperly placed sign indicator", + xtag("pos", ix - tk_start), + xtag("char", *ix))); + } + } else if (*ix == '.') { + if (period_flag) { + throw (std::runtime_error + (tostr("tokenizer::assemble_token", + ": duplicate decimal point", + xtag("pos", ix - tk_start), + xtag("char", *ix)))); + } - log && log(xtag("sign_flag", sign_flag)); - log && log(xtag("period_flag", period_flag), - xtag("exponent_flag", exponent_flag), - xtag("exponent_sign_flag", exponent_sign_flag), - xtag("number_flag", number_flag)); - log && log(xtag("tk_type", tk_type)); + period_flag = true; + } else if ((*ix == 'e') || (*ix == 'E')) { + if (exponent_flag) { + throw (std::runtime_error + (tostr("tokenizer::assemble_token", + ": duplicate exponent marker", + xtag("pos", ix - tk_start), + xtag("char", *ix)))); + } + + exponent_flag = true; + } else if (isdigit(*ix)) { + if (exponent_flag) { + /* need digit before exponent to recognize as number */ + exponent_digit_flag = true; + } else { + number_flag = true; + } + } else { + /* invalid input */ + throw (std::runtime_error + (tostr("tokenizer::assemble_token", + ": unexpected character in numeric constant", + xtag("pos", ix - tk_start), + xtag("char", *ix)))); + } + } + + if (number_flag) { + if (period_flag || exponent_flag) { + tk_type = tokentype::tk_f64; + } else { + tk_type = tokentype::tk_i64; + } + } else if (period_flag && !exponent_flag) { + tk_type = tokentype::tk_dot; + } else { + /* not a valid token */ + } + + log && log(xtag("sign_flag", sign_flag)); + log && log(xtag("period_flag", period_flag), + xtag("exponent_flag", exponent_flag), + xtag("exponent_sign_flag", exponent_sign_flag), + xtag("number_flag", number_flag)); + log && log(xtag("tk_type", tk_type)); + } break; } @@ 
-569,7 +641,9 @@ namespace xo { || (tk_type == tokentype::tk_f64) || (tk_type == tokentype::tk_symbol)) { - /* re-parse in token::i64_value() / token::f64_value() */ + /* note: capturing token text here; + * for numeric literals will re-parse in token::i64_value() / token::f64_value() + */ tk_text = std::string(tk_start, tk_end); } else if (tk_type == tokentype::tk_string) { ; /* nothing to do here -- desired tk_text already constructed */ @@ -603,40 +677,96 @@ namespace xo { tk_text.clear(); } - return token_type(tk_type, std::move(tk_text)); + return result_type(token_type(tk_type, std::move(tk_text)), + input.prefix(initial_whitespace + initial_token_prefix_from_input)); } /*assemble_token*/ template auto - tokenizer::scan(const span_type & input) -> scan_result + tokenizer::assemble_final_token(const span_type & token_text) const -> result_type { - constexpr bool c_debug_flag = true; - scope log(XO_DEBUG(c_debug_flag)); + return assemble_token(0 /*initial_whitespace*/, + 0 /*initial_token_prefix_from_input*/, + token_text, + span_type::make_null()); + } + + template + auto + tokenizer::scan_completion(const span_type & whitespace, + const CharT* token_end, + const span_type & input) -> result_type { + + auto token_span = input.after_prefix(whitespace).prefix_upto(token_end); + + if (this->prefix_.empty()) { + return assemble_token(whitespace.size(), + token_span.size() /*initial_token_prefix_from_input*/, + token_span, + input); + } else { + /* whatever we stashed in .prefix_, should be consumed from input. + * control here implies reached end of input with either + * - input for which parsing outcome depends on existence of more input, + * and presence of eof now resolves + * - malformed input (that might represent prefix of a valid token. 
Say "#incl" in C) + * + * That means stashed .prefix will represent copied range of characters that + * ends at the same position as input + */ + return result_type::make_partial(input); + } + + } + + template + auto + tokenizer::scan(const span_type & input) -> result_type + { + scope log(XO_DEBUG(debug_flag_)); log && log(xtag("input", input)); const CharT * ix = input.lo(); - /* skip whitespace */ - while (is_whitespace(*ix) && (ix != input.hi())) - ++ix; + /* skip whitespace + remember beginning of most recent line */ + while (is_whitespace(*ix) && (ix != input.hi())) { + + if (is_newline(*ix)) { + ++ix; + /* look ahead to {end of line, end of input}, whichever comes first */ + const CharT * sol = ix; + const CharT * eol = ix; + + while ((eol < input.hi()) && (*eol != '\n')) + ++eol; + + this->current_line_ = span_type(sol, eol); + } else { + ++ix; + } + } if(ix == input.hi()) { /* no-op */ - return { - token_type::invalid(), - input.prefix_upto(ix) - }; + return result_type::make_whitespace(input.prefix_upto(ix)); } + // TODO: + // 1. hoist complete_flag up here + // 2. use in each branch + // 3. 
common check for prefix-capturing after if-cascade below done + /* here: *ix is not whitespace */ auto whitespace = input.prefix_upto(ix); log && log(xtag("whitespace.size", whitespace.size())); - /* tk_start points to beginning of token + /* tk_start points to known beginning of token * (after any whitespace) + * + * goal is to leave ix pointing to 1 char past the end of the token */ const CharT * tk_start = ix; @@ -654,7 +784,7 @@ namespace xo { /* need more input to know if/when token complete */ this->prefix_ += std::string(tk_start, input.hi()); - log && log(xtag("captured-prefix", this->prefix_)); + log && log(xtag("captured-prefix1", this->prefix_)); } else { CharT ch2 = *ix; @@ -701,9 +831,49 @@ namespace xo { /* need more input to know if/when token complete */ this->prefix_ += std::string(tk_start, input.hi()); - log && log(xtag("captured-prefix", this->prefix_)); + log && log(xtag("captured-prefix2", this->prefix_)); } } else { + /* ix is start of some token */ + + if (*ix == '-') { + /* this section load-bearing for input '->' scanning from beginning of token */ + ++ix; + + if (ix == input.hi()) { + /* need more input to know if/when token complete -- see captured-prefix5 below */ + } else { + CharT ch2 = *ix; + + if (ch2 == '>') { + /* include next char and complete token */ + ++ix; + + return scan_completion(whitespace, ix /*token_end*/, input); + } + + /* here: -123, -.5e-21 for example */ + } + } else if (*ix == '>') { + /* this section load-bearing for input '>=' scanning from beginning of token. 
+ * Need this because '>' necessarily excluded from is_1char_punctuation() + */ + ++ix; + + if (ix == input.hi()) { + /* need more input to know if/when token complete -- see captured-prefix5 below */ + } else { + CharT ch2 = *ix; + + if (ch2 != '=') { + /* ignore next char and complete token */ + return scan_completion(whitespace, ix /*token_end*/, input); + } + + /* here: >= for example */ + } + } + /* scan until: * - whitespace * - punctuation @@ -715,59 +885,85 @@ namespace xo { { break; } + + /* this section load-bearing for input '>' after beginning of a token, e.g. p> */ + if ((ix > tk_start) && (*ix == '>')) + break; + + /* this section load-bearing for input '->' at the end of another token, e.g. p->q */ + if (*ix == '-') { + if (ix + 1 == input.hi()) { + /* need more input to know if/when token complete + * + * apple-banana parses as: {tk_symbol: apple-banana} + * apple-> parses as: {tk_symbol: apple} {tk_yields} + * apple- illegal (may not end symbol with '-') + */ + break; + } + + if (*(ix + 1) == '>') { + /* treat '->' as punctuation; complete preceding token */ + break; + } + } } if (ix == input.hi()) { /* need more input to know if/when token complete */ this->prefix_ += std::string(tk_start, input.hi()); - log && log(xtag("captured-prefix", this->prefix_)); + log && log(xtag("captured-prefix5", this->prefix_)); } } - auto token_span = input.after_prefix(whitespace).prefix_upto(ix); - - token tk - = (this->prefix_.empty() - ? 
assemble_token(token_span) - : token_type(tokentype::tk_invalid)); - - return scan_result - { tk, input.prefix(whitespace.size() + token_span.size()) }; + return scan_completion(whitespace, ix /*token_end*/, input); } /*scan*/ template auto - tokenizer::scan2(const span_type & input, bool eof) -> scan_result { + tokenizer::scan2(const span_type & input, bool eof) -> result_type { + scope log(XO_DEBUG(debug_flag_)); + auto sr = this->scan(input); - if (!sr.first.is_valid() && eof) { - sr.first = this->notify_eof(); - /* always consume remainder of input here. - * ambiguous prefix can represent at most one token - */ - sr.second = input; - } + if (sr.is_token() || sr.is_error() || !eof) + return sr; - return sr; + /* control here only if input contains no unambiguous tokens. + * This implies it contains _at most one_ final token. + */ + + span_type input2 = input.after_prefix(sr.consumed()); + + /* need to include sr.consumed() in retval */ + + auto sr2 = this->notify_eof(input2); + + return result_type(sr2.get_token(), + span_type::concat(sr.consumed(), sr2.consumed()), + sr2.error()); } template auto - tokenizer::notify_eof() -> token_type { - constexpr bool c_debug_flag = true; + tokenizer::notify_eof(const span_type & input) -> result_type { + scope log(XO_DEBUG(debug_flag_)); - scope log(XO_DEBUG(c_debug_flag)); + log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input)); - token tk - = (this->prefix_.empty() - ? 
token_type(tokentype::tk_invalid) - : assemble_token(span_type(&prefix_[0], - &prefix_[prefix_.size()]))); + if (this->prefix_.empty()) { + /* almost meretricious to include input here, + * when called from scan2() it can only be whitespace + */ + return result_type::make_whitespace(input); + } else { + auto retval = assemble_final_token(span_type::from_string(prefix_)); - this->prefix_.clear(); + this->prefix_.clear(); - return tk; + return retval; + } } /*notify_eof*/ } /*namespace scm*/ } /*namespace xo*/ diff --git a/xo-tokenizer/include/xo/tokenizer/tokentype.hpp b/xo-tokenizer/include/xo/tokenizer/tokentype.hpp index 6da013d9..6a3ef8a6 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokentype.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokentype.hpp @@ -15,7 +15,7 @@ namespace xo { * * Schematica code examples: * - * type point :: { xcoord : f64, ycoord: f64 }; + * type point :: { xcoord : f64, ycoord : f64 }; * type matrix :: array; // 2-d array * * decl hypot(x : f64, y : f64) -> f64; @@ -39,7 +39,7 @@ namespace xo { * }; * * def matrixproduct(x : matrix, y : matrix) { - * [i,j : x.row(i) * y.col(j)]; + * [i, j : x.row(i) * y.col(j)]; * }; **/ enum class tokentype { @@ -120,7 +120,7 @@ namespace xo { /** operator '/' **/ tk_slash, - /** keyworkd 'type' **/ + /** keyword 'type' **/ tk_type, /** keyword 'def' **/ diff --git a/xo-tokenizer/utest/token.test.cpp b/xo-tokenizer/utest/token.test.cpp index 160420b0..80ee6e4f 100644 --- a/xo-tokenizer/utest/token.test.cpp +++ b/xo-tokenizer/utest/token.test.cpp @@ -12,70 +12,76 @@ namespace xo { using xo::scm::tokentype; namespace ut { - struct testcase_i64 { - std::string text_; - bool expect_throw_; - std::int64_t expected_; - }; + // also see tokenizer.test.cpp for syntax - std::vector s_testcase_v = { - {"", true, 0}, - {"0", false, 0}, - {"-", true, 0}, - {"+", true, 0}, - {"-0", false, 0}, - {"+0", false, 0}, - {"1", false, 1}, - {"-1", false, -1}, - {"9", false, 9}, - {"-9", false, -9}, - {"12", false, 12}, - 
{"+12", false, 12}, - {"-12", false, -12}, - {"99", false, 99}, - {"-99", false, -99}, - {"123x", true, 0}, - }; + namespace test2 { + struct testcase_i64 { + std::string text_; + bool expect_throw_; + std::int64_t expected_; + }; - TEST_CASE("parse-i64", "[token]") { - for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { - INFO(xtag("i_tc", i_tc)); + std::vector s_testcase_v = { + {"", true, 0}, + {"0", false, 0}, + {"-", true, 0}, + {"+", true, 0}, + {"-0", false, 0}, + {"+0", false, 0}, + {"1", false, 1}, + {"-1", false, -1}, + {"9", false, 9}, + {"-9", false, -9}, + {"12", false, 12}, + {"+12", false, 12}, + {"-12", false, -12}, + {"99", false, 99}, + {"-99", false, -99}, + {"123x", true, 0}, + }; - auto const & testcase = s_testcase_v[i_tc]; + TEST_CASE("parse-i64", "[token]") { + for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { + INFO(xtag("i_tc", i_tc)); - token tk(tokentype::tk_i64, - testcase.text_); + auto const & testcase = s_testcase_v[i_tc]; - REQUIRE(tk.tk_type() == tokentype::tk_i64); + token tk(tokentype::tk_i64, + testcase.text_); + + REQUIRE(tk.tk_type() == tokentype::tk_i64); + + bool throw_flag = false; + try { + std::int64_t x = tk.i64_value(); + + REQUIRE(x == testcase.expected_); + } catch (std::exception & ex) { + throw_flag = true; + } + + REQUIRE(throw_flag == testcase.expect_throw_); + } + } + } + + namespace test3 { + TEST_CASE("error-i64", "[token]") { + token tk(tokentype::tk_i64, "+"); bool throw_flag = false; - try { - std::int64_t x = tk.i64_value(); - REQUIRE(x == testcase.expected_); - } catch (std::exception & ex) { + try { + tk.i64_value(); + } catch(std::exception & ex) { throw_flag = true; } - REQUIRE(throw_flag == testcase.expect_throw_); + REQUIRE(throw_flag); } } - TEST_CASE("error-i64", "[token]") { - token tk(tokentype::tk_i64, "+"); - - bool throw_flag = false; - - try { - tk.i64_value(); - } catch(std::exception & ex) { - throw_flag = true; - } - - 
REQUIRE(throw_flag); - } - - namespace { + namespace test4 { struct testcase_f64 { std::string text_; bool expect_throw_; diff --git a/xo-tokenizer/utest/tokenizer.test.cpp b/xo-tokenizer/utest/tokenizer.test.cpp index 44600e7f..ed2cb515 100644 --- a/xo-tokenizer/utest/tokenizer.test.cpp +++ b/xo-tokenizer/utest/tokenizer.test.cpp @@ -12,6 +12,79 @@ namespace xo { using xo::scm::span; namespace ut { + /** Two-pass test harness. + * + * First pass - verify test assertions. + * Second pass only if first pass failed. + * On second pass, enable verbose logging + **/ + struct rehearser { + /* expect at most one iterator to exist per rehearser instance **/ + struct iterator { + iterator(rehearser* parent, std::uint32_t attention) : parent_{parent}, attention_{attention} {} + + iterator& operator++(); + std::uint32_t operator*() { return attention_; } + + bool operator==(const iterator& ix2) const { + return (parent_ == ix2.parent_) && (attention_ == ix2.attention_); + } + + rehearser* parent_ = nullptr; + std::uint32_t attention_ = 0; + + }; + + bool is_second_pass() const { return attention_ == 1; } + bool enable_debug() const { return is_second_pass(); } + + iterator begin() { return iterator(this, 0); } + iterator end() { return iterator(this, 2); } + + public: + /** pass number: 0 or 1 **/ + std::uint32_t attention_ = 0; + /** @brief set to true when test starts; false if first pass fails **/ + bool ok_flag_ = true; + }; + + auto rehearser::iterator::operator++() -> iterator& + { + ++attention_; + + if (parent_->ok_flag_ && attention_ == 1) { + /* skip 2nd pass */ + ++attention_; + } + + return *this; + } + + /* use this instead of REQUIRE(expr) in context of a rehearser */ +# define REHEARSE(rehearser, expr) \ + if (rehearser.is_second_pass()) { \ + REQUIRE((expr)); \ + } else { \ + REQUIRE(true); \ + rehearser.ok_flag_ &= (expr); \ + } + + /* note: trivial REQUIRE() call in else branch bc we still want + * catch2 to count assertions when verification 
succeeds + */ +# define REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr) \ + if (catch_flag) { \ + REQUIRE((expr)); \ + } else { \ + REQUIRE(true); \ + ok_flag &= (expr); \ + } + +# define REQUIRE_ORFAIL(ok_flag, catch_flag, expr) \ + REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr); \ + if (!ok_flag) \ + return ok_flag + namespace { struct testcase_tkz { std::string input_; @@ -22,66 +95,73 @@ namespace xo { std::vector s_testcase_v = { - {"<", false, token::leftangle(), true}, - {">", false, token::rightangle(), true}, + /* + * + * expect_throw consume_all + * v v + */ + {"<", false, token::leftangle(), true}, + /* possible prefix of >= */ + {">", false, token::rightangle(), true}, + {"> ", false, token::rightangle(), false}, - {"(", false, token::leftparen(), true}, - {")", false, token::rightparen(), true}, + {"(", false, token::leftparen(), true}, + {")", false, token::rightparen(), true}, - {"[", false, token::leftbracket(), true}, - {"]", false, token::rightbracket(), true}, + {"[", false, token::leftbracket(), true}, + {"]", false, token::rightbracket(), true}, - {"{", false, token::leftbrace(), true}, - {" {", false, token::leftbrace(), true}, + {"{", false, token::leftbrace(), true}, + {" {", false, token::leftbrace(), true}, - {"\t{", false, token::leftbrace(), true}, - {"\n{", false, token::leftbrace(), true}, - {"}", false, token::rightbrace(), true}, + {"\t{", false, token::leftbrace(), true}, + {"\n{", false, token::leftbrace(), true}, + {"}", false, token::rightbrace(), true}, - {"0", false, token::i64_token("0"), true}, - {"1", false, token::i64_token("1"), true}, - {"12", false, token::i64_token("12"), true}, - {"123", false, token::i64_token("123"), true}, + {"0", false, token::i64_token("0"), true}, + {"1", false, token::i64_token("1"), true}, + {"12", false, token::i64_token("12"), true}, + {"123", false, token::i64_token("123"), true}, {"1234", false, token::i64_token("1234"), true}, - {"0 ", false, token::i64_token("0"), false}, - {"1 ", false, 
token::i64_token("1"), false}, - {"12 ", false, token::i64_token("12"), false}, - {"123 ", false, token::i64_token("123"), false}, + {"0 ", false, token::i64_token("0"), false}, + {"1 ", false, token::i64_token("1"), false}, + {"12 ", false, token::i64_token("12"), false}, + {"123 ", false, token::i64_token("123"), false}, {"1234 ", false, token::i64_token("1234"), false}, - {"1<", false, token::i64_token("1"), false}, - {"1>", false, token::i64_token("1"), false}, - {"1(", false, token::i64_token("1"), false}, - {"1)", false, token::i64_token("1"), false}, - {"1[", false, token::i64_token("1"), false}, - {"1]", false, token::i64_token("1"), false}, - {"1{", false, token::i64_token("1"), false}, - {"1}", false, token::i64_token("1"), false}, - {"1;", false, token::i64_token("1"), false}, - {"1:", false, token::i64_token("1"), false}, - {"1,", false, token::i64_token("1"), false}, + {"1<", false, token::i64_token("1"), false}, + {"1>", false, token::i64_token("1"), false}, + {"1(", false, token::i64_token("1"), false}, + {"1)", false, token::i64_token("1"), false}, + {"1[", false, token::i64_token("1"), false}, + {"1]", false, token::i64_token("1"), false}, + {"1{", false, token::i64_token("1"), false}, + {"1}", false, token::i64_token("1"), false}, + {"1;", false, token::i64_token("1"), false}, + {"1:", false, token::i64_token("1"), false}, + {"1,", false, token::i64_token("1"), false}, - {".1", false, token::f64_token(".1"), true}, - {".12", false, token::f64_token(".12"), true}, - {".123", false, token::f64_token(".123"), true}, + {".1", false, token::f64_token(".1"), true}, + {".12", false, token::f64_token(".12"), true}, + {".123", false, token::f64_token(".123"), true}, - {"+.1", false, token::f64_token("+.1"), true}, - {"+.12", false, token::f64_token("+.12"), true}, + {"+.1", false, token::f64_token("+.1"), true}, + {"+.12", false, token::f64_token("+.12"), true}, {"+.123", false, token::f64_token("+.123"), true}, - {"-.1", false, token::f64_token("-.1"), 
true}, - {"-.12", false, token::f64_token("-.12"), true}, + {"-.1", false, token::f64_token("-.1"), true}, + {"-.12", false, token::f64_token("-.12"), true}, {"-.123", false, token::f64_token("-.123"), true}, - {"1.", false, token::f64_token("1."), true}, - {"1.2", false, token::f64_token("1.2"), true}, - {"1.23", false, token::f64_token("1.23"), true}, + {"1.", false, token::f64_token("1."), true}, + {"1.2", false, token::f64_token("1.2"), true}, + {"1.23", false, token::f64_token("1.23"), true}, - {"1e0", false, token::f64_token("1e0"), true}, - {"1e-1", false, token::f64_token("1e-1"), true}, - {"1e1", false, token::f64_token("1e1"), true}, - {"1e+1", false, token::f64_token("1e+1"), true}, + {"1e0", false, token::f64_token("1e0"), true}, + {"1e-1", false, token::f64_token("1e-1"), true}, + {"1e1", false, token::f64_token("1e1"), true}, + {"1e+1", false, token::f64_token("1e+1"), true}, {"\"hello\"", false, token::string_token("hello"), true}, /* tokenizer sees this input: @@ -99,10 +179,20 @@ namespace xo { {"\"tab to the right [\\t], to the right [\\t]\"", false, token::string_token("tab to the right [\t], to the right [\t]"), true}, + {".", false, token::dot(), true}, {":", false, token::colon(), true}, + {",", false, token::comma(), true}, + {"=", false, token::singleassign(), true}, {":=", false, token::assign_token(), true}, + {"->", false, token::yields(), true}, + + {"+", false, token::plus_token(), true}, + {"-", false, token::minus_token(), true}, + {"*", false, token::star_token(), true}, + {"/", false, token::slash_token(), true}, {"symbol", false, token::symbol_token("symbol"), true}, + {"another-symbol", false, token::symbol_token("another-symbol"), true}, {"type", false, token::type(), true}, {"def", false, token::def(), true}, @@ -112,58 +202,59 @@ namespace xo { {"in", false, token::in(), true}, {"end", false, token::end(), true}, - {"*", false, token::star_token(), true}, }; } TEST_CASE("tokenizer", "[tokenizer]") { for (std::size_t i_tc = 0, 
n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { + const testcase_tkz & testcase = s_testcase_v[i_tc]; - INFO(xtag("input", testcase.input_)); - INFO(xtag("i_tc", i_tc)); + rehearser rh; - using tokenizer - = xo::scm::tokenizer; + for (auto _ : rh) { + scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer")); - tokenizer tkz; - tokenizer::span_type - in_span(testcase.input_.c_str(), - testcase.input_.c_str() + testcase.input_.size()); + log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_)); - auto out = tkz.scan(in_span); + using tokenizer + = xo::scm::tokenizer; - auto tk = out.first; + tokenizer tkz(rh.enable_debug()); + tokenizer::span_type + in_span(testcase.input_.c_str(), + testcase.input_.c_str() + testcase.input_.size()); - if (tk.is_invalid()) - tk = tkz.notify_eof(); + auto sr = tkz.scan2(in_span, true /*eof*/); - REQUIRE(tk.tk_type() == testcase.expected_tk_.tk_type()); - if (tk.tk_type() == tokentype::tk_i64) - { - REQUIRE(!tk.text().empty()); - REQUIRE(tk.i64_value() == testcase.expected_tk_.i64_value()); - } else if (tk.tk_type() == tokentype::tk_f64) - { - REQUIRE(!tk.text().empty()); - REQUIRE(tk.f64_value() == testcase.expected_tk_.f64_value()); - } else if(tk.tk_type() == tokentype::tk_string) - { - /* tk.text() can be empty, consider input "" */ - REQUIRE(tk.text() == testcase.expected_tk_.text()); - } else if(tk.tk_type() == tokentype::tk_symbol) - { - REQUIRE(!tk.text().empty()); - REQUIRE(tk.text() == testcase.expected_tk_.text()); - } else { - REQUIRE(tk.text().empty()); + REHEARSE(rh, sr.get_token().tk_type() == testcase.expected_tk_.tk_type()); + if (sr.get_token().tk_type() == tokentype::tk_i64) + { + REHEARSE(rh, !sr.get_token().text().empty()); + REHEARSE(rh, sr.get_token().i64_value() == testcase.expected_tk_.i64_value()); + } else if (sr.get_token().tk_type() == tokentype::tk_f64) + { + REHEARSE(rh, !sr.get_token().text().empty()); + REHEARSE(rh, sr.get_token().f64_value() == testcase.expected_tk_.f64_value()); + } else 
if(sr.get_token().tk_type() == tokentype::tk_string) + { + /* sr.get_token().text() can be empty, consider input "" */ + REHEARSE(rh, sr.get_token().text() == testcase.expected_tk_.text()); + } else if(sr.get_token().tk_type() == tokentype::tk_symbol) + { + REHEARSE(rh, !sr.get_token().text().empty()); + REHEARSE(rh, sr.get_token().text() == testcase.expected_tk_.text()); + } else { + REHEARSE(rh, sr.get_token().text().empty()); + } + + /* must consume all input for tests we're doing here */ + if (testcase.consume_all_) { + REHEARSE(rh, sr.consumed() == in_span); + } else { + REHEARSE(rh, sr.consumed() != in_span); + } } - - /* must consume all input for tests we're doing here */ - if (testcase.consume_all_) - REQUIRE(out.second == in_span); - else - REQUIRE(out.second != in_span); } } @@ -208,56 +299,134 @@ namespace xo { token::symbol_token("y"), token::semicolon(), token::rightbrace() - }} + }}, + {"a.b", + false, + {token::symbol_token("a"), + token::dot(), + token::symbol_token("b") + }}, + {"a,b", + false, + {token::symbol_token("a"), + token::comma(), + token::symbol_token("b") + }}, + {"a:b", + false, + {token::symbol_token("a"), + token::colon(), + token::symbol_token("b") + }}, + {"a;b", + false, + {token::symbol_token("a"), + token::semicolon(), + token::symbol_token("b") + }}, + {"a:=b", + false, + {token::symbol_token("a"), + token::assign_token(), + token::symbol_token("b") + }}, + {"a=b", + false, + {token::symbol_token("a"), + token::singleassign(), + token::symbol_token("b") + }}, + {"p->q", + false, + {token::symbol_token("p"), + token::yields(), + token::symbol_token("q") + }}, + {"a + b", + false, + {token::symbol_token("a"), + token::plus_token(), + token::symbol_token("b") + }}, + {"a - b", + false, + {token::symbol_token("a"), + token::minus_token(), + token::symbol_token("b") + }}, + {"a-b", + false, + {token::symbol_token("a-b"), + }}, + {"(apple)", + false, + {token::leftparen(), + token::symbol_token("apple"), + token::rightparen() + }}, 
+ {"<apple>", + false, + {token::leftangle(), + token::symbol_token("apple"), + token::rightangle() + }}, }; } TEST_CASE("tokenizer2", "[tokenizer]") { + /* this time testing token sequences */ + + using tokenizer = xo::scm::tokenizer; + for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) { const testcase2_tkz & testcase = s_testcase2_v[i_tc]; - INFO(xtag("input", testcase.input_)); - INFO(xtag("i_tc", i_tc)); + rehearser rh; - using tokenizer - = xo::scm::tokenizer; + for (auto _ : rh) { + scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer2")); - tokenizer tkz; - tokenizer::span_type - in_span(testcase.input_.c_str(), - testcase.input_.c_str() + testcase.input_.size()); + log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_)); - for (int i_tk = 0, n_tk = testcase.expected_tk_v_.size(); - i_tk < n_tk; ++i_tk) - { - INFO(xtag("i_tk", i_tk)); + tokenizer tkz(rh.enable_debug()); - auto res = tkz.scan2(in_span, in_span.empty()); - const auto & tk = res.first; + tokenizer::span_type + in_span(testcase.input_.c_str(), + testcase.input_.c_str() + testcase.input_.size()); - if (tk.is_valid()) - REQUIRE(tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type()); - if (tk.tk_type() == tokentype::tk_i64) + for (int i_tk = 0, n_tk = testcase.expected_tk_v_.size(); + i_tk < n_tk; ++i_tk) { - REQUIRE(!tk.text().empty()); - REQUIRE(tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value()); - } else if (tk.tk_type() == tokentype::tk_f64) - { - REQUIRE(!tk.text().empty()); - REQUIRE(tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value()); - } else if(tk.tk_type() == tokentype::tk_string) - { - /* tk.text() can be empty, consider input "" */ - REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text()); - } else if(tk.tk_type() == tokentype::tk_symbol) - { - REQUIRE(!tk.text().empty()); - REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text()); - } else { - REQUIRE(tk.text().empty()); + log && log(xtag("i_tk", i_tk)); + + auto sr = tkz.scan2(in_span, 
in_span.empty()); + const auto & tk = sr.get_token(); + + if (tk.is_valid()) { + REHEARSE(rh, tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type()); + } + if (tk.tk_type() == tokentype::tk_i64) + { + REHEARSE(rh, !tk.text().empty()); + REHEARSE(rh, tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value()); + } else if (tk.tk_type() == tokentype::tk_f64) + { + REHEARSE(rh, !tk.text().empty()); + REHEARSE(rh, tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value()); + } else if(tk.tk_type() == tokentype::tk_string) + { + /* tk.text() can be empty, consider input "" */ + REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text()); + } else if(tk.tk_type() == tokentype::tk_symbol) + { + REHEARSE(rh, !tk.text().empty()); + REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text()); + } else { + REHEARSE(rh, tk.text().empty()); + } + + in_span = in_span.after_prefix(sr.consumed()); } - - in_span = in_span.after_prefix(res.second); } } } /*TEST_CASE(tokenizer2)*/