xo-tokenizer: bugfix: yields token works + 2phase utest

This commit is contained in:
Roland Conybeare 2025-06-22 16:16:23 -05:00
commit 27ef5701ac
8 changed files with 722 additions and 312 deletions

View file

@ -19,9 +19,11 @@ add_definitions(${PROJECT_CXX_FLAGS})
# ----------------------------------------------------------------
add_subdirectory(src/tokenizer)
add_subdirectory(example)
add_subdirectory(utest)
xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
# ----------------------------------------------------------------
# provide find_package() support
xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
# docs targets depend on all the other library/utest targets
#
add_subdirectory(docs)

View file

@ -2,6 +2,7 @@
#pragma once
#include "xo/indentlog/scope.hpp"
#include <ostream>
#include <cstdint>
#include <cassert>
@ -24,6 +25,9 @@ namespace xo {
/** @brief create span for the contiguous memory range [@p lo, @p hi) **/
span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
/** @brief create a null span (i.e. with null @p lo, @p hi pointers) **/
static span make_null() { return span(nullptr, nullptr); }
/** @brief create span for C-style string @p cstr **/
static span from_cstr(const CharT * cstr) {
CharT * lo = cstr;
@ -32,6 +36,35 @@ namespace xo {
return span(lo, hi);
}
/** @brief create span over the characters of std::string @p str
 *
 *  The span refers directly to the storage of @p str (no copy is made),
 *  so it is only usable while @p str is alive and unmodified.
 *  An empty string yields an empty span.
 **/
static span from_string(const std::string& str) {
    /* use data()/size() rather than dereferencing iterators:
     * *str.end() is undefined behavior, and so is *str.begin()
     * when str is empty (trips checked-iterator builds)
     */
    CharT * lo = str.data();
    CharT * hi = lo + str.size();

    return span(lo, hi);
}
/** @brief join two adjacent spans into a single span
 *
 *  A null argument acts as identity: the other span is returned unchanged.
 *  Otherwise @p span1 must end exactly where @p span2 begins;
 *  a mismatch is logged and then trips an assert.
 *  Result covers [span1.lo, span2.hi)
 **/
static span concat(const span & span1, const span & span2) {
    /* null spans contribute nothing */
    if (span1.is_null())
        return span2;
    if (span2.is_null())
        return span1;

    const bool adjacent = (span1.hi() == span2.lo());

    if (!adjacent) {
        /* report the offending endpoints before the assert below fires */
        scope log(XO_DEBUG(true));
        log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo()));
    }

    assert(span1.hi() == span2.lo());

    return span(span1.lo(), span2.hi());
}
///@{
/** @name getters **/
@ -96,6 +129,8 @@ namespace xo {
return span(hi_, hi_);
}
/** @brief true iff both endpoints of this span are null pointers.
 *  Not the same as empty(): a non-null span with lo == hi is empty but not null.
 **/
bool is_null() const {
    return (lo_ == nullptr) && (hi_ == nullptr);
}
/** @brief true iff this span holds no elements, i.e. its endpoints coincide. **/
bool empty() const {
    return hi_ == lo_;
}
/** @brief report the number of elements (of type CharT) in this span. **/

View file

@ -80,7 +80,10 @@ namespace xo {
/** @brief token with type tk_assign (tokenizer produces this for input ":=") **/
static token assign_token() { return token(tokentype::tk_assign); }
/** @brief token with type tk_yields (tokenizer produces this for input "->") **/
static token yields() { return token(tokentype::tk_yields); }
/** @brief token with type tk_plus (operator '+') **/
static token plus_token() { return token(tokentype::tk_plus); }
/** @brief token with type tk_minus (operator '-') **/
static token minus_token() { return token(tokentype::tk_minus); }
/** @brief token with type tk_star (operator '*') **/
static token star_token() { return token(tokentype::tk_star); }
/** @brief token with type tk_slash (operator '/') **/
static token slash_token() { return token(tokentype::tk_slash); }
/** @brief token with type tk_type (keyword 'type') **/
static token type() { return token(tokentype::tk_type); }
/** @brief token with type tk_def (keyword 'def') **/
static token def() { return token(tokentype::tk_def); }
@ -355,5 +358,4 @@ namespace xo {
} /*Namespace scm*/
} /*namespace xo*/
/* end token.hpp */

View file

@ -7,6 +7,7 @@
#include "token.hpp"
#include "span.hpp"
#include "scan_result.hpp"
#include "xo/indentlog/scope.hpp"
#include <cassert>
@ -21,7 +22,7 @@ namespace xo {
* tokenizer_type tkz;
* span_type input = ...;
*
* while !input.empty() {
* while (!input.empty()) {
* auto res = tkz.scan(input);
* const auto & tk = res.first;
*
@ -39,22 +40,27 @@ namespace xo {
* // expect !tkz.has_prefix()
*
* @endcode
*
* See tokentype.hpp for token types
**/
template <typename CharT>
class tokenizer {
public:
using token_type = token<CharT>;
using span_type = span<const CharT>;
using scan_result = std::pair<token_type, span_type>;
using result_type = scan_result<CharT>;
public:
tokenizer() = default;
tokenizer(bool debug_flag = false);
/** recognize the newline character '\n' **/
bool is_newline(CharT ch) const;
/** identifies whitespace chars.
* These are chars that do not belong to any token.
* They are not permitted to appear within
* a symbol or string token.
* Appearance of a whitespace char forces completion of
* Appearance of a whitespace char forces completion of
* preceding token.
**/
bool is_whitespace(CharT ch) const;
@ -77,28 +83,59 @@ namespace xo {
**/
bool has_prefix() const { return !prefix_.empty(); }
/** assemble token from text @p token_text
/** assemble token from text @p token_text.
* @p token_text will often but not always represent a subset of @p input.
* (For example consider multi-line string literals)
* Also the span @p token_text may (in uncommon cases)
* have been copied to separate storage from @p input
*
* @p initial_whitespace Amount of whitespace input being consumed from input.
* @p initial_token_prefix_from_input Amount of non-whitespace input being
* consumed from input. Not counting any stashed-and-already-consumed input
*
* retval.consumed will represent some possibly-empty prefix of @p input
**/
token_type assemble_token(const span_type & token_text) const;
result_type assemble_token(std::size_t initial_whitespace,
std::size_t initial_token_prefix_from_input,
const span_type & token_text,
const span_type & input) const;
/** degenerate version of assemble_token() on reaching end-of-file **/
result_type assemble_final_token(const span_type & token_text) const;
/** scan for next input token, given @p input.
* Note tokenizer can consume input (e.g. whitespace)
* without completing a token
* Note:
* - tokenizer can consume input (e.g. whitespace)
* without completing a token
* - input will remember the extent of the last line of input
* for which parsing has begun, but not completed.
* It's required that at least that portion of the input span
* remain valid across scan(), scan2() calls
*
* @return {parsed token, consumed span}
**/
scan_result scan(const span_type & input);
result_type scan(const span_type & input);
/** When eof is false, same as scan(input).
* When eof is true and scan(input) does not report a token,
* return notify_eof()
**/
scan_result scan2(const span_type & input, bool eof);
result_type scan2(const span_type & input, bool eof);
/** notify end of input, resolve any stored input **/
token_type notify_eof();
/** notify end of input, resolving any ambiguous input stashed in .prefix
**/
result_type notify_eof(const span_type & input);
private:
result_type scan_completion(const span_type & whitespace,
const CharT* token_end,
const span_type & input);
private:
/** true to log tokenizer activity to stdout **/
bool debug_flag_ = false;
/** remember start of current line here **/
span_type current_line_ = span_type::make_null();
/** Accumulate partial token here.
* This will happen if input sent to @ref tokenizer::scan
* ends without a determinate token boundary.
@ -106,6 +143,17 @@ namespace xo {
std::string prefix_;
}; /*tokenizer*/
/* construct tokenizer; debug_flag controls whether tokenizer activity is logged */
template <typename CharT>
tokenizer<CharT>::tokenizer(bool debug_flag)
    : debug_flag_(debug_flag)
{
}
/* true iff ch is the line-feed character '\n' */
template <typename CharT>
bool
tokenizer<CharT>::is_newline(CharT ch) const
{
    const bool is_linefeed = ('\n' == ch);

    return is_linefeed;
}
template <typename CharT>
bool
tokenizer<CharT>::is_whitespace(CharT ch) const {
@ -126,7 +174,10 @@ namespace xo {
case '<':
return true;
case '>':
return true;
/* can't be punctuation
* - appears in tk_yields token: ->
*/
return false;
case '(':
return true;
case ')':
@ -149,7 +200,10 @@ namespace xo {
case '=':
return true;
case '-':
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */
/* can't be punctuation
* - can appear inside f64 token: e.g. 1.23e-9.
* - begins tk_yields token: ->
*/
return false;
case '+':
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */
@ -171,6 +225,10 @@ namespace xo {
template <typename CharT>
bool
tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
/* can't put '-' here, because of the way it appears in numeric literals
* characters here may not appear in symbol names
*/
switch(ch) {
case ':':
/* can begin := */
@ -182,15 +240,19 @@ namespace xo {
template <typename CharT>
auto
tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
tokenizer<CharT>::assemble_token(std::size_t initial_whitespace,
std::size_t initial_token_prefix_from_input,
const span_type & token_text,
const span_type & input) const -> result_type
{
constexpr bool c_debug_flag = true;
/* literal|pretty|streamlined */
log_config::style = function_style::streamlined;
scope log(XO_DEBUG(c_debug_flag));
log && log(xtag("token_text", token_text));
scope log(XO_DEBUG(debug_flag_));
log && log(xtag("token_text", token_text),
xtag("initial_whitespace", initial_whitespace),
xtag("initial_token_prefix_from_input", initial_token_prefix_from_input),
xtag("input", input));
tokentype tk_type = tokentype::tk_invalid;
std::string tk_text;
@ -265,79 +327,89 @@ namespace xo {
/* true if at least one digit encountered */
bool number_flag = false;
/* token will be one of: {i64, f64, dot}: */
for(; ix != token_text.hi(); ++ix) {
if((*ix == '-') || (*ix == '+')) {
/* sign allowed:
* 1. before period and before first digit
* 2. after exponent
*/
if (!period_flag && !number_flag && !sign_flag) {
sign_flag = true;
} else if (exponent_flag && !exponent_digit_flag) {
exponent_sign_flag = true;
} else {
throw std::runtime_error
(tostr("tokenizer::assemble_token",
": improperly placed sign indicator",
xtag("pos", ix - tk_start),
xtag("char", *ix)));
}
} else if(*ix == '.') {
if (period_flag) {
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": duplicate decimal point",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
log && log(xtag("*ix", *ix),
xtag("tk.length", token_text.size()));
if (log && (ix + 1 < tk_end))
log(xtag("*(ix+1)", *(ix + 1)));
period_flag = true;
} else if((*ix == 'e') || (*ix == 'E')) {
if (exponent_flag) {
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": duplicate exponent marker",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
exponent_flag = true;
} else if(isdigit(*ix)) {
if (exponent_flag) {
/* need digit before exponent to recognize as number */
exponent_digit_flag = true;
} else {
number_flag = true;
}
} else {
/* invalid input */
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": unexpected character in numeric constant",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
}
if (number_flag) {
if (period_flag || exponent_flag) {
tk_type = tokentype::tk_f64;
} else {
tk_type = tokentype::tk_i64;
}
} else if (period_flag && !exponent_flag) {
tk_type = tokentype::tk_dot;
if ((*ix == '-') && (ix + 2 == token_text.hi()) && (*(ix + 1) == '>')) {
/* composing exactly '->' */
tk_type = tokentype::tk_yields;
} else {
/* not a valid token */
}
/* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */
for (; ix != token_text.hi(); ++ix) {
if ((*ix == '-') || (*ix == '+')) {
/* sign allowed:
* 1. before period and before first digit
* 2. after exponent
*/
if (!period_flag && !number_flag && !sign_flag) {
sign_flag = true;
} else if (exponent_flag && !exponent_digit_flag) {
exponent_sign_flag = true;
} else {
throw std::runtime_error
(tostr("tokenizer::assemble_token",
": improperly placed sign indicator",
xtag("pos", ix - tk_start),
xtag("char", *ix)));
}
} else if (*ix == '.') {
if (period_flag) {
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": duplicate decimal point",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
log && log(xtag("sign_flag", sign_flag));
log && log(xtag("period_flag", period_flag),
xtag("exponent_flag", exponent_flag),
xtag("exponent_sign_flag", exponent_sign_flag),
xtag("number_flag", number_flag));
log && log(xtag("tk_type", tk_type));
period_flag = true;
} else if ((*ix == 'e') || (*ix == 'E')) {
if (exponent_flag) {
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": duplicate exponent marker",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
exponent_flag = true;
} else if (isdigit(*ix)) {
if (exponent_flag) {
/* need digit before exponent to recognize as number */
exponent_digit_flag = true;
} else {
number_flag = true;
}
} else {
/* invalid input */
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": unexpected character in numeric constant",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
}
if (number_flag) {
if (period_flag || exponent_flag) {
tk_type = tokentype::tk_f64;
} else {
tk_type = tokentype::tk_i64;
}
} else if (period_flag && !exponent_flag) {
tk_type = tokentype::tk_dot;
} else {
/* not a valid token */
}
log && log(xtag("sign_flag", sign_flag));
log && log(xtag("period_flag", period_flag),
xtag("exponent_flag", exponent_flag),
xtag("exponent_sign_flag", exponent_sign_flag),
xtag("number_flag", number_flag));
log && log(xtag("tk_type", tk_type));
}
break;
}
@ -569,7 +641,9 @@ namespace xo {
|| (tk_type == tokentype::tk_f64)
|| (tk_type == tokentype::tk_symbol))
{
/* re-parse in token::i64_value() / token::f64_value() */
/* note: capturing token text here;
* for numeric literals will re-parse in token::i64_value() / token::f64_value()
*/
tk_text = std::string(tk_start, tk_end);
} else if (tk_type == tokentype::tk_string) {
; /* nothing to do here -- desired tk_text already constructed */
@ -603,40 +677,96 @@ namespace xo {
tk_text.clear();
}
return token_type(tk_type, std::move(tk_text));
return result_type(token_type(tk_type, std::move(tk_text)),
input.prefix(initial_whitespace + initial_token_prefix_from_input));
} /*assemble_token*/
template <typename CharT>
auto
tokenizer<CharT>::scan(const span_type & input) -> scan_result
tokenizer<CharT>::assemble_final_token(const span_type & token_text) const -> result_type
{
constexpr bool c_debug_flag = true;
scope log(XO_DEBUG(c_debug_flag));
return assemble_token(0 /*initial_whitespace*/,
0 /*initial_token_prefix_from_input*/,
token_text,
span_type::make_null());
}
/* Finish a scan() call, once the end of the candidate token text is known.
 *
 * whitespace: leading portion of input that was skipped before the token
 * token_end:  one past the last character of the candidate token
 * input:      the span that was passed to scan()
 *
 * When nothing is stashed in .prefix_, assembles a token from the text between
 * the whitespace and token_end. Otherwise reports a partial result over input.
 */
template <typename CharT>
auto
tokenizer<CharT>::scan_completion(const span_type & whitespace,
                                  const CharT* token_end,
                                  const span_type & input) -> result_type
{
    /* candidate token text: everything after the whitespace, up to token_end */
    auto tk_span = input.after_prefix(whitespace).prefix_upto(token_end);

    if (!this->prefix_.empty()) {
        /* whatever we stashed in .prefix_, should be consumed from input.
         * control here implies reached end of input with either
         * - input for which parsing outcome depends on existence of more input,
         *   and presence of eof now resolves
         * - malformed input (that might represent prefix of a valid token. Say "#incl" in C)
         *
         * That means stashed .prefix will represent copied range of characters that
         * ends at the same position as input
         */
        return result_type::make_partial(input);
    }

    return assemble_token(whitespace.size(),
                          tk_span.size() /*initial_token_prefix_from_input*/,
                          tk_span,
                          input);
}
template <typename CharT>
auto
tokenizer<CharT>::scan(const span_type & input) -> result_type
{
scope log(XO_DEBUG(debug_flag_));
log && log(xtag("input", input));
const CharT * ix = input.lo();
/* skip whitespace */
while (is_whitespace(*ix) && (ix != input.hi()))
++ix;
/* skip whitespace + remember beginning of most recent line */
while (is_whitespace(*ix) && (ix != input.hi())) {
if (is_newline(*ix)) {
++ix;
/* look ahead to {end of line, end of input}, whichever comes first */
const CharT * sol = ix;
const CharT * eol = ix;
while ((eol < input.hi()) && (*eol != '\n'))
++eol;
this->current_line_ = span_type(sol, eol);
} else {
++ix;
}
}
if(ix == input.hi()) {
/* no-op */
return {
token_type::invalid(),
input.prefix_upto(ix)
};
return result_type::make_whitespace(input.prefix_upto(ix));
}
// TODO:
// 1. hoist complete_flag up here
// 2. use in each branch
// 3. common check for prefix-capturing after if-cascade below done
/* here: *ix is not whitespace */
auto whitespace = input.prefix_upto(ix);
log && log(xtag("whitespace.size", whitespace.size()));
/* tk_start points to beginning of token
/* tk_start points to known beginning of token
* (after any whitespace)
*
* goal is to leave ix pointing to 1 char past the end of the token
*/
const CharT * tk_start = ix;
@ -654,7 +784,7 @@ namespace xo {
/* need more input to know if/when token complete */
this->prefix_ += std::string(tk_start, input.hi());
log && log(xtag("captured-prefix", this->prefix_));
log && log(xtag("captured-prefix1", this->prefix_));
} else {
CharT ch2 = *ix;
@ -701,9 +831,49 @@ namespace xo {
/* need more input to know if/when token complete */
this->prefix_ += std::string(tk_start, input.hi());
log && log(xtag("captured-prefix", this->prefix_));
log && log(xtag("captured-prefix2", this->prefix_));
}
} else {
/* ix is start of some token */
if (*ix == '-') {
/* this section load-bearing for input '->' scanning from beginning of token */
++ix;
if (ix == input.hi()) {
/* need more input to know if/when token complete -- see captured-prefix5 below */
} else {
CharT ch2 = *ix;
if (ch2 == '>') {
/* include next char and complete token */
++ix;
return scan_completion(whitespace, ix /*token_end*/, input);
}
/* here: -123, -.5e-21 for example */
}
} else if (*ix == '>') {
/* this section load-bearing for input '>=' scanning from beginning of token.
* Need this because '>' necessarily excluded from is_1char_punctuation()
*/
++ix;
if (ix == input.hi()) {
/* need more input to know if/when token complete -- see captured-prefix5 below */
} else {
CharT ch2 = *ix;
if (ch2 != '=') {
/* ignore next char and complete token */
return scan_completion(whitespace, ix /*token_end*/, input);
}
/* here: >= for example */
}
}
/* scan until:
* - whitespace
* - punctuation
@ -715,59 +885,85 @@ namespace xo {
{
break;
}
/* this section load-bearing for input '>' after beginning of a token, e.g. p> */
if ((ix > tk_start) && (*ix == '>'))
break;
/* this section load-bearing for input '->' at the end of another token, e.g. p->q */
if (*ix == '-') {
if (ix + 1 == input.hi()) {
/* need more input to know if/when token complete
*
* apple-banana parses as: {tk_symbol: apple-banana}
* apple-> parses as: {tk_symbol: apple} {tk_yields}
* apple- illegal (may not end symbol with '-')
*/
break;
}
if (*(ix + 1) == '>') {
/* treat '->' as punctuation; complete preceding token */
break;
}
}
}
if (ix == input.hi()) {
/* need more input to know if/when token complete */
this->prefix_ += std::string(tk_start, input.hi());
log && log(xtag("captured-prefix", this->prefix_));
log && log(xtag("captured-prefix5", this->prefix_));
}
}
auto token_span = input.after_prefix(whitespace).prefix_upto(ix);
token tk
= (this->prefix_.empty()
? assemble_token(token_span)
: token_type(tokentype::tk_invalid));
return scan_result
{ tk, input.prefix(whitespace.size() + token_span.size()) };
return scan_completion(whitespace, ix /*token_end*/, input);
} /*scan*/
template <typename CharT>
auto
tokenizer<CharT>::scan2(const span_type & input, bool eof) -> scan_result {
tokenizer<CharT>::scan2(const span_type & input, bool eof) -> result_type {
scope log(XO_DEBUG(debug_flag_));
auto sr = this->scan(input);
if (!sr.first.is_valid() && eof) {
sr.first = this->notify_eof();
/* always consume remainder of input here.
* ambiguous prefix can represent at most one token
*/
sr.second = input;
}
if (sr.is_token() || sr.is_error() || !eof)
return sr;
return sr;
/* control here only if input contains no unambiguous tokens.
* This implies it contains _at most one_ final token.
*/
span_type input2 = input.after_prefix(sr.consumed());
/* need to include src.consumed() in retval */
auto sr2 = this->notify_eof(input2);
return result_type(sr2.get_token(),
span_type::concat(sr.consumed(), sr2.consumed()),
sr2.error());
}
template <typename CharT>
auto
tokenizer<CharT>::notify_eof() -> token_type {
constexpr bool c_debug_flag = true;
tokenizer<CharT>::notify_eof(const span_type & input) -> result_type {
scope log(XO_DEBUG(debug_flag_));
scope log(XO_DEBUG(c_debug_flag));
log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input));
token tk
= (this->prefix_.empty()
? token_type(tokentype::tk_invalid)
: assemble_token(span_type(&prefix_[0],
&prefix_[prefix_.size()])));
if (this->prefix_.empty()) {
/* almost meretricious to include input here,
* when called from scan2() it can only be whitespace
*/
return result_type::make_whitespace(input);
} else {
auto retval = assemble_final_token(span_type::from_string(prefix_));
this->prefix_.clear();
this->prefix_.clear();
return tk;
return retval;
}
} /*notify_eof*/
} /*namespace scm*/
} /*namespace xo*/

View file

@ -15,7 +15,7 @@ namespace xo {
*
* Schematica code examples:
*
* type point :: { xcoord : f64, ycoord: f64 };
* type point :: { xcoord : f64, ycoord : f64 };
* type matrix :: array<double, 2>; // 2-d array
*
* decl hypot(x : f64, y : f64) -> f64;
@ -39,7 +39,7 @@ namespace xo {
* };
*
* def matrixproduct(x : matrix, y : matrix) {
* [i,j : x.row(i) * y.col(j)];
* [i, j : x.row(i) * y.col(j)];
* };
**/
enum class tokentype {
@ -120,7 +120,7 @@ namespace xo {
/** operator '/' **/
tk_slash,
/** keyworkd 'type' **/
/** keyword 'type' **/
tk_type,
/** keyword 'def' **/

View file

@ -12,70 +12,76 @@ namespace xo {
using xo::scm::tokentype;
namespace ut {
struct testcase_i64 {
std::string text_;
bool expect_throw_;
std::int64_t expected_;
};
// also see tokenizer.test.cpp for syntax
std::vector<testcase_i64> s_testcase_v = {
{"", true, 0},
{"0", false, 0},
{"-", true, 0},
{"+", true, 0},
{"-0", false, 0},
{"+0", false, 0},
{"1", false, 1},
{"-1", false, -1},
{"9", false, 9},
{"-9", false, -9},
{"12", false, 12},
{"+12", false, 12},
{"-12", false, -12},
{"99", false, 99},
{"-99", false, -99},
{"123x", true, 0},
};
namespace test2 {
struct testcase_i64 {
std::string text_;
bool expect_throw_;
std::int64_t expected_;
};
TEST_CASE("parse-i64", "[token]") {
for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
INFO(xtag("i_tc", i_tc));
std::vector<testcase_i64> s_testcase_v = {
{"", true, 0},
{"0", false, 0},
{"-", true, 0},
{"+", true, 0},
{"-0", false, 0},
{"+0", false, 0},
{"1", false, 1},
{"-1", false, -1},
{"9", false, 9},
{"-9", false, -9},
{"12", false, 12},
{"+12", false, 12},
{"-12", false, -12},
{"99", false, 99},
{"-99", false, -99},
{"123x", true, 0},
};
auto const & testcase = s_testcase_v[i_tc];
TEST_CASE("parse-i64", "[token]") {
for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
INFO(xtag("i_tc", i_tc));
token tk(tokentype::tk_i64,
testcase.text_);
auto const & testcase = s_testcase_v[i_tc];
REQUIRE(tk.tk_type() == tokentype::tk_i64);
token tk(tokentype::tk_i64,
testcase.text_);
REQUIRE(tk.tk_type() == tokentype::tk_i64);
bool throw_flag = false;
try {
std::int64_t x = tk.i64_value();
REQUIRE(x == testcase.expected_);
} catch (std::exception & ex) {
throw_flag = true;
}
REQUIRE(throw_flag == testcase.expect_throw_);
}
}
}
namespace test3 {
TEST_CASE("error-i64", "[token]") {
token tk(tokentype::tk_i64, "+");
bool throw_flag = false;
try {
std::int64_t x = tk.i64_value();
REQUIRE(x == testcase.expected_);
} catch (std::exception & ex) {
try {
tk.i64_value();
} catch(std::exception & ex) {
throw_flag = true;
}
REQUIRE(throw_flag == testcase.expect_throw_);
REQUIRE(throw_flag);
}
}
TEST_CASE("error-i64", "[token]") {
token tk(tokentype::tk_i64, "+");
bool throw_flag = false;
try {
tk.i64_value();
} catch(std::exception & ex) {
throw_flag = true;
}
REQUIRE(throw_flag);
}
namespace {
namespace test4 {
struct testcase_f64 {
std::string text_;
bool expect_throw_;

View file

@ -12,6 +12,79 @@ namespace xo {
using xo::scm::span;
namespace ut {
/** Two-pass test harness.
 *
 *  Iterate over a rehearser with a range-for loop; the loop body runs once per pass.
 *
 *  First pass (pass 0) - verify test assertions, recording any failure in @ref ok_flag_.
 *  Second pass (pass 1) only if first pass failed.
 *  On second pass, @ref is_second_pass and @ref enable_debug report true,
 *  so the loop body can enable verbose logging
 **/
struct rehearser {
    /* expect at most one iterator to be in use per rehearser instance:
     * advancing an iterator publishes its pass number to the parent rehearser
     */
    struct iterator {
        iterator(rehearser* parent, std::uint32_t attention) : parent_{parent}, attention_{attention} {}

        /** advance to next pass; skips pass 1 when pass 0 left parent's ok_flag_ set **/
        iterator& operator++();
        /** pass number this iterator refers to **/
        std::uint32_t operator*() { return attention_; }

        bool operator==(const iterator& ix2) const {
            return (parent_ == ix2.parent_) && (attention_ == ix2.attention_);
        }
        /** explicit operator!= so range-for also compiles before c++20 **/
        bool operator!=(const iterator& ix2) const {
            return !(*this == ix2);
        }

        rehearser* parent_ = nullptr;
        std::uint32_t attention_ = 0;
    };

    bool is_second_pass() const { return attention_ == 1; }
    bool enable_debug() const { return is_second_pass(); }

    /** iteration starts at pass 0 **/
    iterator begin() { return iterator(this, 0); }
    /** one past the last possible pass (pass 1) **/
    iterator end() { return iterator(this, 2); }

public:
    /** pass number: 0 or 1 while iterating; updated whenever an iterator advances **/
    std::uint32_t attention_ = 0;
    /** @brief set to true when test starts; false if first pass fails **/
    bool ok_flag_ = true;
};

auto rehearser::iterator::operator++() -> iterator&
{
    ++attention_;

    if (parent_->ok_flag_ && attention_ == 1) {
        /* first pass succeeded: skip 2nd pass */
        ++attention_;
    }

    /* publish pass number to parent.
     * without this rehearser::is_second_pass() never reports true,
     * and a failing first pass would go unreported
     */
    parent_->attention_ = attention_;

    return *this;
}
/* use this instead of REQUIRE(expr) in context of a rehearser.
 *
 * - on second pass: forwards to REQUIRE((expr)), so a failure is reported
 * - otherwise: records outcome of expr in rehearser.ok_flag_ without failing
 *
 * expr is evaluated exactly once either way.
 * Body is wrapped in do { } while (0) so the macro behaves as one statement
 * (safe inside an unbraced if/else); invoke with a trailing semicolon.
 */
# define REHEARSE(rehearser, expr)                                  \
    do {                                                            \
        if ((rehearser).is_second_pass()) {                         \
            REQUIRE((expr));                                        \
        } else {                                                    \
            REQUIRE(true);                                          \
            (rehearser).ok_flag_ &= static_cast<bool>(expr);        \
        }                                                           \
    } while (0)

/* note: trivial REQUIRE() call in else branch bc we still want
 * catch2 to count assertions when verification succeeds
 */
/* REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr):
 * - when catch_flag is true: forwards to REQUIRE((expr))
 * - otherwise: folds outcome of expr into ok_flag without failing
 *
 * wrapped in do { } while (0) so the macro behaves as one statement;
 * invoke with a trailing semicolon
 */
# define REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr)               \
    do {                                                            \
        if (catch_flag) {                                           \
            REQUIRE((expr));                                        \
        } else {                                                    \
            REQUIRE(true);                                          \
            (ok_flag) &= static_cast<bool>(expr);                   \
        }                                                           \
    } while (0)

/* REQUIRE_ORFAIL(ok_flag, catch_flag, expr):
 * same as REQUIRE_ORCAPTURE, then returns ok_flag from the enclosing function
 * if ok_flag is false. Only usable in a function whose return type accepts ok_flag
 */
# define REQUIRE_ORFAIL(ok_flag, catch_flag, expr)                  \
    do {                                                            \
        REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr);               \
        if (!(ok_flag))                                             \
            return ok_flag;                                         \
    } while (0)
namespace {
struct testcase_tkz {
std::string input_;
@ -22,66 +95,73 @@ namespace xo {
std::vector<testcase_tkz>
s_testcase_v = {
{"<", false, token::leftangle(), true},
{">", false, token::rightangle(), true},
/*
*
* expect_throw consume_all
* v v
*/
{"<", false, token::leftangle(), true},
/* possible prefix of >= */
{">", false, token::rightangle(), true},
{"> ", false, token::rightangle(), false},
{"(", false, token::leftparen(), true},
{")", false, token::rightparen(), true},
{"(", false, token::leftparen(), true},
{")", false, token::rightparen(), true},
{"[", false, token::leftbracket(), true},
{"]", false, token::rightbracket(), true},
{"[", false, token::leftbracket(), true},
{"]", false, token::rightbracket(), true},
{"{", false, token::leftbrace(), true},
{" {", false, token::leftbrace(), true},
{"{", false, token::leftbrace(), true},
{" {", false, token::leftbrace(), true},
{"\t{", false, token::leftbrace(), true},
{"\n{", false, token::leftbrace(), true},
{"}", false, token::rightbrace(), true},
{"\t{", false, token::leftbrace(), true},
{"\n{", false, token::leftbrace(), true},
{"}", false, token::rightbrace(), true},
{"0", false, token::i64_token("0"), true},
{"1", false, token::i64_token("1"), true},
{"12", false, token::i64_token("12"), true},
{"123", false, token::i64_token("123"), true},
{"0", false, token::i64_token("0"), true},
{"1", false, token::i64_token("1"), true},
{"12", false, token::i64_token("12"), true},
{"123", false, token::i64_token("123"), true},
{"1234", false, token::i64_token("1234"), true},
{"0 ", false, token::i64_token("0"), false},
{"1 ", false, token::i64_token("1"), false},
{"12 ", false, token::i64_token("12"), false},
{"123 ", false, token::i64_token("123"), false},
{"0 ", false, token::i64_token("0"), false},
{"1 ", false, token::i64_token("1"), false},
{"12 ", false, token::i64_token("12"), false},
{"123 ", false, token::i64_token("123"), false},
{"1234 ", false, token::i64_token("1234"), false},
{"1<", false, token::i64_token("1"), false},
{"1>", false, token::i64_token("1"), false},
{"1(", false, token::i64_token("1"), false},
{"1)", false, token::i64_token("1"), false},
{"1[", false, token::i64_token("1"), false},
{"1]", false, token::i64_token("1"), false},
{"1{", false, token::i64_token("1"), false},
{"1}", false, token::i64_token("1"), false},
{"1;", false, token::i64_token("1"), false},
{"1:", false, token::i64_token("1"), false},
{"1,", false, token::i64_token("1"), false},
{"1<", false, token::i64_token("1"), false},
{"1>", false, token::i64_token("1"), false},
{"1(", false, token::i64_token("1"), false},
{"1)", false, token::i64_token("1"), false},
{"1[", false, token::i64_token("1"), false},
{"1]", false, token::i64_token("1"), false},
{"1{", false, token::i64_token("1"), false},
{"1}", false, token::i64_token("1"), false},
{"1;", false, token::i64_token("1"), false},
{"1:", false, token::i64_token("1"), false},
{"1,", false, token::i64_token("1"), false},
{".1", false, token::f64_token(".1"), true},
{".12", false, token::f64_token(".12"), true},
{".123", false, token::f64_token(".123"), true},
{".1", false, token::f64_token(".1"), true},
{".12", false, token::f64_token(".12"), true},
{".123", false, token::f64_token(".123"), true},
{"+.1", false, token::f64_token("+.1"), true},
{"+.12", false, token::f64_token("+.12"), true},
{"+.1", false, token::f64_token("+.1"), true},
{"+.12", false, token::f64_token("+.12"), true},
{"+.123", false, token::f64_token("+.123"), true},
{"-.1", false, token::f64_token("-.1"), true},
{"-.12", false, token::f64_token("-.12"), true},
{"-.1", false, token::f64_token("-.1"), true},
{"-.12", false, token::f64_token("-.12"), true},
{"-.123", false, token::f64_token("-.123"), true},
{"1.", false, token::f64_token("1."), true},
{"1.2", false, token::f64_token("1.2"), true},
{"1.23", false, token::f64_token("1.23"), true},
{"1.", false, token::f64_token("1."), true},
{"1.2", false, token::f64_token("1.2"), true},
{"1.23", false, token::f64_token("1.23"), true},
{"1e0", false, token::f64_token("1e0"), true},
{"1e-1", false, token::f64_token("1e-1"), true},
{"1e1", false, token::f64_token("1e1"), true},
{"1e+1", false, token::f64_token("1e+1"), true},
{"1e0", false, token::f64_token("1e0"), true},
{"1e-1", false, token::f64_token("1e-1"), true},
{"1e1", false, token::f64_token("1e1"), true},
{"1e+1", false, token::f64_token("1e+1"), true},
{"\"hello\"", false, token::string_token("hello"), true},
/* tokenizer sees this input:
@ -99,10 +179,20 @@ namespace xo {
{"\"tab to the right [\\t], to the right [\\t]\"", false,
token::string_token("tab to the right [\t], to the right [\t]"), true},
{".", false, token::dot(), true},
{":", false, token::colon(), true},
{",", false, token::comma(), true},
{"=", false, token::singleassign(), true},
{":=", false, token::assign_token(), true},
{"->", false, token::yields(), true},
{"+", false, token::plus_token(), true},
{"-", false, token::minus_token(), true},
{"*", false, token::star_token(), true},
{"/", false, token::slash_token(), true},
{"symbol", false, token::symbol_token("symbol"), true},
{"another-symbol", false, token::symbol_token("another-symbol"), true},
{"type", false, token::type(), true},
{"def", false, token::def(), true},
@ -112,58 +202,59 @@ namespace xo {
{"in", false, token::in(), true},
{"end", false, token::end(), true},
{"*", false, token::star_token(), true},
};
}
/* Single-token tests.
 *
 * Each entry of s_testcase_v supplies one input string (input_), the token the
 * tokenizer is expected to produce from it (expected_tk_), and whether the
 * tokenizer is expected to consume the whole input (consume_all_).
 *
 * Assertions go through REHEARSE(rh, ..) inside a `for (auto _ : rh)` loop.
 * NOTE(review): rehearser presumably re-runs the body with debug logging
 * enabled (rh.enable_debug()) after a failing pass -- confirm against the
 * rehearser definition.
 */
TEST_CASE("tokenizer", "[tokenizer]") {
    using tokenizer = xo::scm::tokenizer<char>;

    for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
        const testcase_tkz & testcase = s_testcase_v[i_tc];

        INFO(xtag("input", testcase.input_));
        INFO(xtag("i_tc", i_tc));

        rehearser rh;

        for (auto _ : rh) {
            scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer"));
            log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_));

            /* fresh tokenizer on every pass, so no state carries over */
            tokenizer tkz(rh.enable_debug());

            tokenizer::span_type
                in_span(testcase.input_.c_str(),
                        testcase.input_.c_str() + testcase.input_.size());

            /* the entire input is presented in one call, so flag end-of-input */
            auto sr = tkz.scan2(in_span, true /*eof*/);

            /* fetch the token once instead of calling sr.get_token() per check */
            const auto & tk = sr.get_token();

            REHEARSE(rh, tk.tk_type() == testcase.expected_tk_.tk_type());

            if (tk.tk_type() == tokentype::tk_i64) {
                REHEARSE(rh, !tk.text().empty());
                REHEARSE(rh, tk.i64_value() == testcase.expected_tk_.i64_value());
            } else if (tk.tk_type() == tokentype::tk_f64) {
                REHEARSE(rh, !tk.text().empty());
                REHEARSE(rh, tk.f64_value() == testcase.expected_tk_.f64_value());
            } else if (tk.tk_type() == tokentype::tk_string) {
                /* tk.text() can be empty, consider input "" */
                REHEARSE(rh, tk.text() == testcase.expected_tk_.text());
            } else if (tk.tk_type() == tokentype::tk_symbol) {
                REHEARSE(rh, !tk.text().empty());
                REHEARSE(rh, tk.text() == testcase.expected_tk_.text());
            } else {
                /* every other token type is expected to carry no text */
                REHEARSE(rh, tk.text().empty());
            }

            /* must consume all input for tests we're doing here */
            if (testcase.consume_all_) {
                REHEARSE(rh, sr.consumed() == in_span);
            } else {
                REHEARSE(rh, sr.consumed() != in_span);
            }
        }
    }
}
@ -208,56 +299,134 @@ namespace xo {
token::symbol_token("y"),
token::semicolon(),
token::rightbrace()
}}
}},
{"a.b",
false,
{token::symbol_token("a"),
token::dot(),
token::symbol_token("b")
}},
{"a,b",
false,
{token::symbol_token("a"),
token::comma(),
token::symbol_token("b")
}},
{"a:b",
false,
{token::symbol_token("a"),
token::colon(),
token::symbol_token("b")
}},
{"a;b",
false,
{token::symbol_token("a"),
token::semicolon(),
token::symbol_token("b")
}},
{"a:=b",
false,
{token::symbol_token("a"),
token::assign_token(),
token::symbol_token("b")
}},
{"a=b",
false,
{token::symbol_token("a"),
token::singleassign(),
token::symbol_token("b")
}},
{"p->q",
false,
{token::symbol_token("p"),
token::yields(),
token::symbol_token("q")
}},
{"a + b",
false,
{token::symbol_token("a"),
token::plus_token(),
token::symbol_token("b")
}},
{"a - b",
false,
{token::symbol_token("a"),
token::minus_token(),
token::symbol_token("b")
}},
{"a-b",
false,
{token::symbol_token("a-b"),
}},
{"(apple)",
false,
{token::leftparen(),
token::symbol_token("apple"),
token::rightparen()
}},
{"<apple>",
false,
{token::leftangle(),
token::symbol_token("apple"),
token::rightangle()
}},
};
}
/* Token-sequence tests.
 *
 * Each entry of s_testcase2_v supplies one input string (input_) and the
 * ordered list of tokens expected from it (expected_tk_v_). One tokenizer
 * instance scans the input repeatedly; after each scan the consumed prefix
 * is dropped from in_span so the next scan starts at the following token.
 *
 * Assertions go through REHEARSE(rh, ..) inside a `for (auto _ : rh)` loop.
 * NOTE(review): rehearser presumably re-runs the body with debug logging
 * enabled (rh.enable_debug()) after a failing pass -- confirm against the
 * rehearser definition.
 */
TEST_CASE("tokenizer2", "[tokenizer]") {
    /* this time testing token sequences */
    using tokenizer = xo::scm::tokenizer<char>;

    for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) {
        const testcase2_tkz & testcase = s_testcase2_v[i_tc];

        INFO(xtag("input", testcase.input_));
        INFO(xtag("i_tc", i_tc));

        rehearser rh;

        for (auto _ : rh) {
            scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer2"));
            log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_));

            /* tokenizer and input span are rebuilt on every pass, so a repeated
             * pass starts again from the beginning of the input
             */
            tokenizer tkz(rh.enable_debug());

            tokenizer::span_type
                in_span(testcase.input_.c_str(),
                        testcase.input_.c_str() + testcase.input_.size());

            /* std::size_t index: avoids narrowing size() to int */
            for (std::size_t i_tk = 0, n_tk = testcase.expected_tk_v_.size();
                 i_tk < n_tk; ++i_tk)
            {
                log && log(xtag("i_tk", i_tk));

                /* eof flag is set only once no input remains */
                auto sr = tkz.scan2(in_span, in_span.empty());
                const auto & tk = sr.get_token();

                if (tk.is_valid()) {
                    REHEARSE(rh, tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type());
                }

                if (tk.tk_type() == tokentype::tk_i64) {
                    REHEARSE(rh, !tk.text().empty());
                    REHEARSE(rh, tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value());
                } else if (tk.tk_type() == tokentype::tk_f64) {
                    REHEARSE(rh, !tk.text().empty());
                    REHEARSE(rh, tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value());
                } else if (tk.tk_type() == tokentype::tk_string) {
                    /* tk.text() can be empty, consider input "" */
                    REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text());
                } else if (tk.tk_type() == tokentype::tk_symbol) {
                    REHEARSE(rh, !tk.text().empty());
                    REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text());
                } else {
                    /* every other token type is expected to carry no text */
                    REHEARSE(rh, tk.text().empty());
                }

                /* advance past whatever this scan consumed */
                in_span = in_span.after_prefix(sr.consumed());
            }
        }
    }
} /*TEST_CASE(tokenizer2)*/