xo-tokenizer: bugfix: yields token works + 2phase utest
This commit is contained in:
parent
8435734b45
commit
27ef5701ac
8 changed files with 722 additions and 312 deletions
|
|
@ -32,10 +32,10 @@ namespace xo {
|
|||
while (!input.empty()) {
|
||||
/* read one token from input */
|
||||
auto sr = this->tokenizer_.scan2(input, eof);
|
||||
const auto & tk = sr.first;
|
||||
const span_type & used_span = sr.second;
|
||||
const auto & tk = sr.get_token();
|
||||
const span_type & used_span = sr.consumed();
|
||||
|
||||
log && log(xtag("used_span", used_span));
|
||||
log && log(xtag("consumed", used_span));
|
||||
log && log(xtag("input.pre", input));
|
||||
|
||||
input = input.after_prefix(used_span);
|
||||
|
|
|
|||
|
|
@ -19,9 +19,11 @@ add_definitions(${PROJECT_CXX_FLAGS})
|
|||
# ----------------------------------------------------------------
|
||||
|
||||
add_subdirectory(src/tokenizer)
|
||||
add_subdirectory(example)
|
||||
add_subdirectory(utest)
|
||||
xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# provide find_package() support
|
||||
|
||||
xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
|
||||
# docs targets depend on all the other library/utest targets
|
||||
#
|
||||
add_subdirectory(docs)
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
#include <ostream>
|
||||
#include <cstdint>
|
||||
#include <cassert>
|
||||
|
|
@ -24,6 +25,9 @@ namespace xo {
|
|||
/** @brief create span for the contiguous memory range [@p lo, @p hi) **/
|
||||
span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
|
||||
|
||||
/** @brief create a null span (i.e. with null @p lo, @p hi pointers) **/
|
||||
static span make_null() { return span(nullptr, nullptr); }
|
||||
|
||||
/** @brief create span for C-style string @p cstr **/
|
||||
static span from_cstr(const CharT * cstr) {
|
||||
CharT * lo = cstr;
|
||||
|
|
@ -32,6 +36,35 @@ namespace xo {
|
|||
return span(lo, hi);
|
||||
}
|
||||
|
||||
/** @brief create span from std::string @p str **/
|
||||
static span from_string(const std::string& str) {
|
||||
CharT * lo = &(*str.begin());
|
||||
CharT * hi = &(*str.end());
|
||||
|
||||
return span(lo, hi);
|
||||
}
|
||||
|
||||
/** @brief concatenate two contiguous spans */
|
||||
static span concat(const span & span1, const span & span2) {
|
||||
if (span1.is_null())
|
||||
return span2;
|
||||
if (span2.is_null())
|
||||
return span1;
|
||||
|
||||
if (span1.hi() != span2.lo()) {
|
||||
scope log(XO_DEBUG(true));
|
||||
|
||||
log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo()));
|
||||
}
|
||||
|
||||
assert(span1.hi() == span2.lo());
|
||||
|
||||
CharT * lo = span1.lo();
|
||||
CharT * hi = span2.hi();
|
||||
|
||||
return span(lo, hi);
|
||||
}
|
||||
|
||||
///@{
|
||||
|
||||
/** @name getters **/
|
||||
|
|
@ -96,6 +129,8 @@ namespace xo {
|
|||
return span(hi_, hi_);
|
||||
}
|
||||
|
||||
/** @brief true iff this span is null. distinct from empty. **/
|
||||
bool is_null() const { return lo_ == nullptr && hi_ == nullptr; }
|
||||
/** @brief true iff this span is empty (comprises 0 elements). **/
|
||||
bool empty() const { return lo_ == hi_; }
|
||||
/** @brief report the number of elements (of type CharT) in this span. **/
|
||||
|
|
|
|||
|
|
@ -80,7 +80,10 @@ namespace xo {
|
|||
static token assign_token() { return token(tokentype::tk_assign); }
|
||||
static token yields() { return token(tokentype::tk_yields); }
|
||||
|
||||
static token plus_token() { return token(tokentype::tk_plus); }
|
||||
static token minus_token() { return token(tokentype::tk_minus); }
|
||||
static token star_token() { return token(tokentype::tk_star); }
|
||||
static token slash_token() { return token(tokentype::tk_slash); }
|
||||
|
||||
static token type() { return token(tokentype::tk_type); }
|
||||
static token def() { return token(tokentype::tk_def); }
|
||||
|
|
@ -355,5 +358,4 @@ namespace xo {
|
|||
} /*Namespace scm*/
|
||||
} /*namespace xo*/
|
||||
|
||||
|
||||
/* end token.hpp */
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include "token.hpp"
|
||||
#include "span.hpp"
|
||||
#include "scan_result.hpp"
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
#include <cassert>
|
||||
|
||||
|
|
@ -21,7 +22,7 @@ namespace xo {
|
|||
* tokenizer_type tkz;
|
||||
* span_type input = ...;
|
||||
*
|
||||
* while !input.empty() {
|
||||
* while (!input.empty()) {
|
||||
* auto res = tkz.scan(input);
|
||||
* const auto & tk = res.get_token();
|
||||
*
|
||||
|
|
@ -39,22 +40,27 @@ namespace xo {
|
|||
* // expect !tkz.has_prefix()
|
||||
*
|
||||
* @endcode
|
||||
*
|
||||
* See tokentype.hpp for token types
|
||||
**/
|
||||
template <typename CharT>
|
||||
class tokenizer {
|
||||
public:
|
||||
using token_type = token<CharT>;
|
||||
using span_type = span<const CharT>;
|
||||
using scan_result = std::pair<token_type, span_type>;
|
||||
using result_type = scan_result<CharT>;
|
||||
|
||||
public:
|
||||
tokenizer() = default;
|
||||
tokenizer(bool debug_flag = false);
|
||||
|
||||
/** recognize the newline character '\n' **/
|
||||
bool is_newline(CharT ch) const;
|
||||
|
||||
/** identifies whitespace chars.
|
||||
* These are chars that do not belong to any token.
|
||||
* They are not permitted to appear within
|
||||
* a symbol or string token.
|
||||
* Appearance of a whitespace char forces completion of
|
||||
* Appearance of a whitespace char forces completion of
|
||||
* preceding token.
|
||||
**/
|
||||
bool is_whitespace(CharT ch) const;
|
||||
|
|
@ -77,28 +83,59 @@ namespace xo {
|
|||
**/
|
||||
bool has_prefix() const { return !prefix_.empty(); }
|
||||
|
||||
/** assemble token from text @p token_text
|
||||
/** assemble token from text @p token_text.
|
||||
* @p token_text will often but not always represent a subset of @p input.
|
||||
* (For example consider multi-line string literals)
|
||||
* Also the span @p token_text may (in uncommon cases)
|
||||
* have been copied to separate storage from @p input
|
||||
*
|
||||
* @p initial_whitespace Amount of whitespace input being consumed from input.
|
||||
* @p initial_token_prefix_from_input Amount of non-whitespace input being
|
||||
* consumed from input. Not counting any stashed-and-already-consumed input
|
||||
*
|
||||
* retval.consumed will represent some possibly-empty prefix of @p input
|
||||
**/
|
||||
token_type assemble_token(const span_type & token_text) const;
|
||||
result_type assemble_token(std::size_t initial_whitespace,
|
||||
std::size_t initial_token_prefix_from_input,
|
||||
const span_type & token_text,
|
||||
const span_type & input) const;
|
||||
|
||||
/** degenerate version of assemble_token() on reaching end-of-file **/
|
||||
result_type assemble_final_token(const span_type & token_text) const;
|
||||
|
||||
/** scan for next input token, given @p input.
|
||||
* Note tokenizer can consume input (e.g. whitespace)
|
||||
* without completing a token
|
||||
* Note:
|
||||
* - tokenizer can consume input (e.g. whitespace)
|
||||
* without completing a token
|
||||
* - tokenizer will remember the extent of the last line of input
|
||||
* for which parsing has begun, but not completed.
|
||||
* It's required that at least that portion of the input span
|
||||
* remain valid across scan(), scan2() calls
|
||||
*
|
||||
* @return {parsed token, consumed span}
|
||||
**/
|
||||
scan_result scan(const span_type & input);
|
||||
result_type scan(const span_type & input);
|
||||
|
||||
/** When eof is false, same as scan(input).
|
||||
* When eof is true and scan(input) does not report a token,
|
||||
* return notify_eof()
|
||||
**/
|
||||
scan_result scan2(const span_type & input, bool eof);
|
||||
result_type scan2(const span_type & input, bool eof);
|
||||
|
||||
/** notify end of input, resolve any stored input **/
|
||||
token_type notify_eof();
|
||||
/** notify end of input, resolving any ambiguous input stashed in .prefix
|
||||
**/
|
||||
result_type notify_eof(const span_type & input);
|
||||
|
||||
private:
|
||||
result_type scan_completion(const span_type & whitespace,
|
||||
const CharT* token_end,
|
||||
const span_type & input);
|
||||
|
||||
private:
|
||||
/** true to log tokenizer activity to stdout **/
|
||||
bool debug_flag_ = false;
|
||||
/** remember start of current line here **/
|
||||
span_type current_line_ = span_type::make_null();
|
||||
/** Accumulate partial token here.
|
||||
* This will happen if input sent to @ref tokenizer::scan
|
||||
* ends without a determinate token boundary.
|
||||
|
|
@ -106,6 +143,17 @@ namespace xo {
|
|||
std::string prefix_;
|
||||
}; /*tokenizer*/
|
||||
|
||||
template <typename CharT>
|
||||
tokenizer<CharT>::tokenizer(bool debug_flag)
|
||||
: debug_flag_{debug_flag}
|
||||
{}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_newline(CharT ch) const {
|
||||
return (ch == '\n');
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_whitespace(CharT ch) const {
|
||||
|
|
@ -126,7 +174,10 @@ namespace xo {
|
|||
case '<':
|
||||
return true;
|
||||
case '>':
|
||||
return true;
|
||||
/* can't be punctuation
|
||||
* - appears in tk_yields token: ->
|
||||
*/
|
||||
return false;
|
||||
case '(':
|
||||
return true;
|
||||
case ')':
|
||||
|
|
@ -149,7 +200,10 @@ namespace xo {
|
|||
case '=':
|
||||
return true;
|
||||
case '-':
|
||||
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */
|
||||
/* can't be punctuation
|
||||
* - can appear inside f64 token: e.g. 1.23e-9.
|
||||
* - begins tk_yields token: ->
|
||||
*/
|
||||
return false;
|
||||
case '+':
|
||||
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */
|
||||
|
|
@ -171,6 +225,10 @@ namespace xo {
|
|||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
|
||||
/* can't put '-' here, because of the way it appears in numeric literals
|
||||
* characters here may not appear in symbol names
|
||||
*/
|
||||
|
||||
switch(ch) {
|
||||
case ':':
|
||||
/* can begin := */
|
||||
|
|
@ -182,15 +240,19 @@ namespace xo {
|
|||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
|
||||
tokenizer<CharT>::assemble_token(std::size_t initial_whitespace,
|
||||
std::size_t initial_token_prefix_from_input,
|
||||
const span_type & token_text,
|
||||
const span_type & input) const -> result_type
|
||||
{
|
||||
constexpr bool c_debug_flag = true;
|
||||
|
||||
/* literal|pretty|streamlined */
|
||||
log_config::style = function_style::streamlined;
|
||||
|
||||
scope log(XO_DEBUG(c_debug_flag));
|
||||
log && log(xtag("token_text", token_text));
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
log && log(xtag("token_text", token_text),
|
||||
xtag("initial_whitespace", initial_whitespace),
|
||||
xtag("initial_token_prefix_from_input", initial_token_prefix_from_input),
|
||||
xtag("input", input));
|
||||
|
||||
tokentype tk_type = tokentype::tk_invalid;
|
||||
std::string tk_text;
|
||||
|
|
@ -265,79 +327,89 @@ namespace xo {
|
|||
/* true if at least one digit encountered */
|
||||
bool number_flag = false;
|
||||
|
||||
/* token will be one of: {i64, f64, dot}: */
|
||||
for(; ix != token_text.hi(); ++ix) {
|
||||
if((*ix == '-') || (*ix == '+')) {
|
||||
/* sign allowed:
|
||||
* 1. before period and before first digit
|
||||
* 2. after exponent
|
||||
*/
|
||||
if (!period_flag && !number_flag && !sign_flag) {
|
||||
sign_flag = true;
|
||||
} else if (exponent_flag && !exponent_digit_flag) {
|
||||
exponent_sign_flag = true;
|
||||
} else {
|
||||
throw std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": improperly placed sign indicator",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix)));
|
||||
}
|
||||
} else if(*ix == '.') {
|
||||
if (period_flag) {
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": duplicate decimal point",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
log && log(xtag("*ix", *ix),
|
||||
xtag("tk.length", token_text.size()));
|
||||
if (log && (ix + 1 < tk_end))
|
||||
log(xtag("*(ix+1)", *(ix + 1)));
|
||||
|
||||
period_flag = true;
|
||||
} else if((*ix == 'e') || (*ix == 'E')) {
|
||||
if (exponent_flag) {
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": duplicate exponent marker",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
|
||||
exponent_flag = true;
|
||||
} else if(isdigit(*ix)) {
|
||||
if (exponent_flag) {
|
||||
/* need digit before exponent to recognize as number */
|
||||
exponent_digit_flag = true;
|
||||
} else {
|
||||
number_flag = true;
|
||||
}
|
||||
} else {
|
||||
/* invalid input */
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": unexpected character in numeric constant",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
}
|
||||
|
||||
if (number_flag) {
|
||||
if (period_flag || exponent_flag) {
|
||||
tk_type = tokentype::tk_f64;
|
||||
} else {
|
||||
tk_type = tokentype::tk_i64;
|
||||
}
|
||||
} else if (period_flag && !exponent_flag) {
|
||||
tk_type = tokentype::tk_dot;
|
||||
if ((*ix == '-') && (ix + 2 == token_text.hi()) && (*(ix + 1) == '>')) {
|
||||
/* composing exactly '->' */
|
||||
tk_type = tokentype::tk_yields;
|
||||
} else {
|
||||
/* not a valid token */
|
||||
}
|
||||
/* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */
|
||||
for (; ix != token_text.hi(); ++ix) {
|
||||
if ((*ix == '-') || (*ix == '+')) {
|
||||
/* sign allowed:
|
||||
* 1. before period and before first digit
|
||||
* 2. after exponent
|
||||
*/
|
||||
if (!period_flag && !number_flag && !sign_flag) {
|
||||
sign_flag = true;
|
||||
} else if (exponent_flag && !exponent_digit_flag) {
|
||||
exponent_sign_flag = true;
|
||||
} else {
|
||||
throw std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": improperly placed sign indicator",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix)));
|
||||
}
|
||||
} else if (*ix == '.') {
|
||||
if (period_flag) {
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": duplicate decimal point",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
|
||||
log && log(xtag("sign_flag", sign_flag));
|
||||
log && log(xtag("period_flag", period_flag),
|
||||
xtag("exponent_flag", exponent_flag),
|
||||
xtag("exponent_sign_flag", exponent_sign_flag),
|
||||
xtag("number_flag", number_flag));
|
||||
log && log(xtag("tk_type", tk_type));
|
||||
period_flag = true;
|
||||
} else if ((*ix == 'e') || (*ix == 'E')) {
|
||||
if (exponent_flag) {
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": duplicate exponent marker",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
|
||||
exponent_flag = true;
|
||||
} else if (isdigit(*ix)) {
|
||||
if (exponent_flag) {
|
||||
/* need digit before exponent to recognize as number */
|
||||
exponent_digit_flag = true;
|
||||
} else {
|
||||
number_flag = true;
|
||||
}
|
||||
} else {
|
||||
/* invalid input */
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": unexpected character in numeric constant",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
}
|
||||
|
||||
if (number_flag) {
|
||||
if (period_flag || exponent_flag) {
|
||||
tk_type = tokentype::tk_f64;
|
||||
} else {
|
||||
tk_type = tokentype::tk_i64;
|
||||
}
|
||||
} else if (period_flag && !exponent_flag) {
|
||||
tk_type = tokentype::tk_dot;
|
||||
} else {
|
||||
/* not a valid token */
|
||||
}
|
||||
|
||||
log && log(xtag("sign_flag", sign_flag));
|
||||
log && log(xtag("period_flag", period_flag),
|
||||
xtag("exponent_flag", exponent_flag),
|
||||
xtag("exponent_sign_flag", exponent_sign_flag),
|
||||
xtag("number_flag", number_flag));
|
||||
log && log(xtag("tk_type", tk_type));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
|
@ -569,7 +641,9 @@ namespace xo {
|
|||
|| (tk_type == tokentype::tk_f64)
|
||||
|| (tk_type == tokentype::tk_symbol))
|
||||
{
|
||||
/* re-parse in token::i64_value() / token::f64_value() */
|
||||
/* note: capturing token text here;
|
||||
* for numeric literals will re-parse in token::i64_value() / token::f64_value()
|
||||
*/
|
||||
tk_text = std::string(tk_start, tk_end);
|
||||
} else if (tk_type == tokentype::tk_string) {
|
||||
; /* nothing to do here -- desired tk_text already constructed */
|
||||
|
|
@ -603,40 +677,96 @@ namespace xo {
|
|||
tk_text.clear();
|
||||
}
|
||||
|
||||
return token_type(tk_type, std::move(tk_text));
|
||||
return result_type(token_type(tk_type, std::move(tk_text)),
|
||||
input.prefix(initial_whitespace + initial_token_prefix_from_input));
|
||||
} /*assemble_token*/
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::scan(const span_type & input) -> scan_result
|
||||
tokenizer<CharT>::assemble_final_token(const span_type & token_text) const -> result_type
|
||||
{
|
||||
constexpr bool c_debug_flag = true;
|
||||
scope log(XO_DEBUG(c_debug_flag));
|
||||
return assemble_token(0 /*initial_whitespace*/,
|
||||
0 /*initial_token_prefix_from_input*/,
|
||||
token_text,
|
||||
span_type::make_null());
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::scan_completion(const span_type & whitespace,
|
||||
const CharT* token_end,
|
||||
const span_type & input) -> result_type {
|
||||
|
||||
auto token_span = input.after_prefix(whitespace).prefix_upto(token_end);
|
||||
|
||||
if (this->prefix_.empty()) {
|
||||
return assemble_token(whitespace.size(),
|
||||
token_span.size() /*initial_token_prefix_from_input*/,
|
||||
token_span,
|
||||
input);
|
||||
} else {
|
||||
/* whatever we stashed in .prefix_, should be consumed from input.
|
||||
* control here implies reached end of input with either
|
||||
* - input for which parsing outcome depends on existence of more input,
|
||||
* and presence of eof now resolves
|
||||
* - malformed input (that might represent prefix of a valid token. Say "#incl" in C)
|
||||
*
|
||||
* That means stashed .prefix will represent copied range of characters that
|
||||
* ends at the same position as input
|
||||
*/
|
||||
return result_type::make_partial(input);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::scan(const span_type & input) -> result_type
|
||||
{
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
|
||||
log && log(xtag("input", input));
|
||||
|
||||
const CharT * ix = input.lo();
|
||||
|
||||
/* skip whitespace */
|
||||
while (is_whitespace(*ix) && (ix != input.hi()))
|
||||
++ix;
|
||||
/* skip whitespace + remember beginning of most recent line */
|
||||
while (is_whitespace(*ix) && (ix != input.hi())) {
|
||||
|
||||
if (is_newline(*ix)) {
|
||||
++ix;
|
||||
/* look ahead to {end of line, end of input}, whichever comes first */
|
||||
const CharT * sol = ix;
|
||||
const CharT * eol = ix;
|
||||
|
||||
while ((eol < input.hi()) && (*eol != '\n'))
|
||||
++eol;
|
||||
|
||||
this->current_line_ = span_type(sol, eol);
|
||||
} else {
|
||||
++ix;
|
||||
}
|
||||
}
|
||||
|
||||
if(ix == input.hi()) {
|
||||
/* no-op */
|
||||
return {
|
||||
token_type::invalid(),
|
||||
input.prefix_upto(ix)
|
||||
};
|
||||
return result_type::make_whitespace(input.prefix_upto(ix));
|
||||
}
|
||||
|
||||
// TODO:
|
||||
// 1. hoist complete_flag up here
|
||||
// 2. use in each branch
|
||||
// 3. common check for prefix-capturing after if-cascade below done
|
||||
|
||||
/* here: *ix is not whitespace */
|
||||
|
||||
auto whitespace = input.prefix_upto(ix);
|
||||
|
||||
log && log(xtag("whitespace.size", whitespace.size()));
|
||||
|
||||
/* tk_start points to beginning of token
|
||||
/* tk_start points to known beginning of token
|
||||
* (after any whitespace)
|
||||
*
|
||||
* goal is to leave ix pointing to 1 char past the end of the token
|
||||
*/
|
||||
const CharT * tk_start = ix;
|
||||
|
||||
|
|
@ -654,7 +784,7 @@ namespace xo {
|
|||
/* need more input to know if/when token complete */
|
||||
this->prefix_ += std::string(tk_start, input.hi());
|
||||
|
||||
log && log(xtag("captured-prefix", this->prefix_));
|
||||
log && log(xtag("captured-prefix1", this->prefix_));
|
||||
} else {
|
||||
CharT ch2 = *ix;
|
||||
|
||||
|
|
@ -701,9 +831,49 @@ namespace xo {
|
|||
/* need more input to know if/when token complete */
|
||||
this->prefix_ += std::string(tk_start, input.hi());
|
||||
|
||||
log && log(xtag("captured-prefix", this->prefix_));
|
||||
log && log(xtag("captured-prefix2", this->prefix_));
|
||||
}
|
||||
} else {
|
||||
/* ix is start of some token */
|
||||
|
||||
if (*ix == '-') {
|
||||
/* this section load-bearing for input '->' scanning from beginning of token */
|
||||
++ix;
|
||||
|
||||
if (ix == input.hi()) {
|
||||
/* need more input to know if/when token complete -- see captured-prefix5 below */
|
||||
} else {
|
||||
CharT ch2 = *ix;
|
||||
|
||||
if (ch2 == '>') {
|
||||
/* include next char and complete token */
|
||||
++ix;
|
||||
|
||||
return scan_completion(whitespace, ix /*token_end*/, input);
|
||||
}
|
||||
|
||||
/* here: -123, -.5e-21 for example */
|
||||
}
|
||||
} else if (*ix == '>') {
|
||||
/* this section load-bearing for input '>=' scanning from beginning of token.
|
||||
* Need this because '>' necessarily excluded from is_1char_punctuation()
|
||||
*/
|
||||
++ix;
|
||||
|
||||
if (ix == input.hi()) {
|
||||
/* need more input to know if/when token complete -- see captured-prefix5 below */
|
||||
} else {
|
||||
CharT ch2 = *ix;
|
||||
|
||||
if (ch2 != '=') {
|
||||
/* ignore next char and complete token */
|
||||
return scan_completion(whitespace, ix /*token_end*/, input);
|
||||
}
|
||||
|
||||
/* here: >= for example */
|
||||
}
|
||||
}
|
||||
|
||||
/* scan until:
|
||||
* - whitespace
|
||||
* - punctuation
|
||||
|
|
@ -715,59 +885,85 @@ namespace xo {
|
|||
{
|
||||
break;
|
||||
}
|
||||
|
||||
/* this section load-bearing for input '>' after beginning of a token, e.g. p> */
|
||||
if ((ix > tk_start) && (*ix == '>'))
|
||||
break;
|
||||
|
||||
/* this section load-bearing for input '->' at the end of another token, e.g. p->q */
|
||||
if (*ix == '-') {
|
||||
if (ix + 1 == input.hi()) {
|
||||
/* need more input to know if/when token complete
|
||||
*
|
||||
* apple-banana parses as: {tk_symbol: apple-banana}
|
||||
* apple-> parses as: {tk_symbol: apple} {tk_yields}
|
||||
* apple- illegal (may not end symbol with '-')
|
||||
*/
|
||||
break;
|
||||
}
|
||||
|
||||
if (*(ix + 1) == '>') {
|
||||
/* treat '->' as punctuation; complete preceding token */
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ix == input.hi()) {
|
||||
/* need more input to know if/when token complete */
|
||||
this->prefix_ += std::string(tk_start, input.hi());
|
||||
|
||||
log && log(xtag("captured-prefix", this->prefix_));
|
||||
log && log(xtag("captured-prefix5", this->prefix_));
|
||||
}
|
||||
}
|
||||
|
||||
auto token_span = input.after_prefix(whitespace).prefix_upto(ix);
|
||||
|
||||
token tk
|
||||
= (this->prefix_.empty()
|
||||
? assemble_token(token_span)
|
||||
: token_type(tokentype::tk_invalid));
|
||||
|
||||
return scan_result
|
||||
{ tk, input.prefix(whitespace.size() + token_span.size()) };
|
||||
return scan_completion(whitespace, ix /*token_end*/, input);
|
||||
} /*scan*/
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::scan2(const span_type & input, bool eof) -> scan_result {
|
||||
tokenizer<CharT>::scan2(const span_type & input, bool eof) -> result_type {
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
|
||||
auto sr = this->scan(input);
|
||||
|
||||
if (!sr.first.is_valid() && eof) {
|
||||
sr.first = this->notify_eof();
|
||||
/* always consume remainder of input here.
|
||||
* ambiguous prefix can represent at most one token
|
||||
*/
|
||||
sr.second = input;
|
||||
}
|
||||
if (sr.is_token() || sr.is_error() || !eof)
|
||||
return sr;
|
||||
|
||||
return sr;
|
||||
/* control here only if input contains no unambiguous tokens.
|
||||
* This implies it contains _at most one_ final token.
|
||||
*/
|
||||
|
||||
span_type input2 = input.after_prefix(sr.consumed());
|
||||
|
||||
/* need to include sr.consumed() in retval */
|
||||
|
||||
auto sr2 = this->notify_eof(input2);
|
||||
|
||||
return result_type(sr2.get_token(),
|
||||
span_type::concat(sr.consumed(), sr2.consumed()),
|
||||
sr2.error());
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::notify_eof() -> token_type {
|
||||
constexpr bool c_debug_flag = true;
|
||||
tokenizer<CharT>::notify_eof(const span_type & input) -> result_type {
|
||||
scope log(XO_DEBUG(debug_flag_));
|
||||
|
||||
scope log(XO_DEBUG(c_debug_flag));
|
||||
log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input));
|
||||
|
||||
token tk
|
||||
= (this->prefix_.empty()
|
||||
? token_type(tokentype::tk_invalid)
|
||||
: assemble_token(span_type(&prefix_[0],
|
||||
&prefix_[prefix_.size()])));
|
||||
if (this->prefix_.empty()) {
|
||||
/* almost meretricious to include input here,
|
||||
* when called from scan2() it can only be whitespace
|
||||
*/
|
||||
return result_type::make_whitespace(input);
|
||||
} else {
|
||||
auto retval = assemble_final_token(span_type::from_string(prefix_));
|
||||
|
||||
this->prefix_.clear();
|
||||
this->prefix_.clear();
|
||||
|
||||
return tk;
|
||||
return retval;
|
||||
}
|
||||
} /*notify_eof*/
|
||||
} /*namespace scm*/
|
||||
} /*namespace xo*/
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ namespace xo {
|
|||
*
|
||||
* Schematica code examples:
|
||||
*
|
||||
* type point :: { xcoord : f64, ycoord: f64 };
|
||||
* type point :: { xcoord : f64, ycoord : f64 };
|
||||
* type matrix :: array<double, 2>; // 2-d array
|
||||
*
|
||||
* decl hypot(x : f64, y : f64) -> f64;
|
||||
|
|
@ -39,7 +39,7 @@ namespace xo {
|
|||
* };
|
||||
*
|
||||
* def matrixproduct(x : matrix, y : matrix) {
|
||||
* [i,j : x.row(i) * y.col(j)];
|
||||
* [i, j : x.row(i) * y.col(j)];
|
||||
* };
|
||||
**/
|
||||
enum class tokentype {
|
||||
|
|
@ -120,7 +120,7 @@ namespace xo {
|
|||
/** operator '/' **/
|
||||
tk_slash,
|
||||
|
||||
/** keyworkd 'type' **/
|
||||
/** keyword 'type' **/
|
||||
tk_type,
|
||||
|
||||
/** keyword 'def' **/
|
||||
|
|
|
|||
|
|
@ -12,70 +12,76 @@ namespace xo {
|
|||
using xo::scm::tokentype;
|
||||
|
||||
namespace ut {
|
||||
struct testcase_i64 {
|
||||
std::string text_;
|
||||
bool expect_throw_;
|
||||
std::int64_t expected_;
|
||||
};
|
||||
// also see tokenizer.test.cpp for syntax
|
||||
|
||||
std::vector<testcase_i64> s_testcase_v = {
|
||||
{"", true, 0},
|
||||
{"0", false, 0},
|
||||
{"-", true, 0},
|
||||
{"+", true, 0},
|
||||
{"-0", false, 0},
|
||||
{"+0", false, 0},
|
||||
{"1", false, 1},
|
||||
{"-1", false, -1},
|
||||
{"9", false, 9},
|
||||
{"-9", false, -9},
|
||||
{"12", false, 12},
|
||||
{"+12", false, 12},
|
||||
{"-12", false, -12},
|
||||
{"99", false, 99},
|
||||
{"-99", false, -99},
|
||||
{"123x", true, 0},
|
||||
};
|
||||
namespace test2 {
|
||||
struct testcase_i64 {
|
||||
std::string text_;
|
||||
bool expect_throw_;
|
||||
std::int64_t expected_;
|
||||
};
|
||||
|
||||
TEST_CASE("parse-i64", "[token]") {
|
||||
for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
|
||||
INFO(xtag("i_tc", i_tc));
|
||||
std::vector<testcase_i64> s_testcase_v = {
|
||||
{"", true, 0},
|
||||
{"0", false, 0},
|
||||
{"-", true, 0},
|
||||
{"+", true, 0},
|
||||
{"-0", false, 0},
|
||||
{"+0", false, 0},
|
||||
{"1", false, 1},
|
||||
{"-1", false, -1},
|
||||
{"9", false, 9},
|
||||
{"-9", false, -9},
|
||||
{"12", false, 12},
|
||||
{"+12", false, 12},
|
||||
{"-12", false, -12},
|
||||
{"99", false, 99},
|
||||
{"-99", false, -99},
|
||||
{"123x", true, 0},
|
||||
};
|
||||
|
||||
auto const & testcase = s_testcase_v[i_tc];
|
||||
TEST_CASE("parse-i64", "[token]") {
|
||||
for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
|
||||
INFO(xtag("i_tc", i_tc));
|
||||
|
||||
token tk(tokentype::tk_i64,
|
||||
testcase.text_);
|
||||
auto const & testcase = s_testcase_v[i_tc];
|
||||
|
||||
REQUIRE(tk.tk_type() == tokentype::tk_i64);
|
||||
token tk(tokentype::tk_i64,
|
||||
testcase.text_);
|
||||
|
||||
REQUIRE(tk.tk_type() == tokentype::tk_i64);
|
||||
|
||||
bool throw_flag = false;
|
||||
try {
|
||||
std::int64_t x = tk.i64_value();
|
||||
|
||||
REQUIRE(x == testcase.expected_);
|
||||
} catch (std::exception & ex) {
|
||||
throw_flag = true;
|
||||
}
|
||||
|
||||
REQUIRE(throw_flag == testcase.expect_throw_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace test3 {
|
||||
TEST_CASE("error-i64", "[token]") {
|
||||
token tk(tokentype::tk_i64, "+");
|
||||
|
||||
bool throw_flag = false;
|
||||
try {
|
||||
std::int64_t x = tk.i64_value();
|
||||
|
||||
REQUIRE(x == testcase.expected_);
|
||||
} catch (std::exception & ex) {
|
||||
try {
|
||||
tk.i64_value();
|
||||
} catch(std::exception & ex) {
|
||||
throw_flag = true;
|
||||
}
|
||||
|
||||
REQUIRE(throw_flag == testcase.expect_throw_);
|
||||
REQUIRE(throw_flag);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("error-i64", "[token]") {
|
||||
token tk(tokentype::tk_i64, "+");
|
||||
|
||||
bool throw_flag = false;
|
||||
|
||||
try {
|
||||
tk.i64_value();
|
||||
} catch(std::exception & ex) {
|
||||
throw_flag = true;
|
||||
}
|
||||
|
||||
REQUIRE(throw_flag);
|
||||
}
|
||||
|
||||
namespace {
|
||||
namespace test4 {
|
||||
struct testcase_f64 {
|
||||
std::string text_;
|
||||
bool expect_throw_;
|
||||
|
|
|
|||
|
|
@ -12,6 +12,79 @@ namespace xo {
|
|||
using xo::scm::span;
|
||||
|
||||
namespace ut {
|
||||
/** Two-pass test harness.
 *
 *  First pass (pass number 0) - verify test assertions, recording failure in @c ok_flag_.
 *  Second pass (pass number 1) - only if first pass failed (@c ok_flag_ is false).
 *  On second pass, enable_debug() reports true so a test can enable verbose logging.
 *
 *  Iterating a rehearser (begin() .. end()) yields the pass numbers to run:
 *  just 0 when the first pass succeeds, otherwise 0 then 1.
 **/
struct rehearser {
    /** Iterator over pass numbers.
     *  Expect at most one iterator to exist per rehearser instance,
     *  since advancing an iterator updates the pass number stored in its parent.
     **/
    struct iterator {
        iterator(rehearser* parent, std::uint32_t attention) : parent_{parent}, attention_{attention} {}

        /** advance to next pass; skips the second pass while parent's ok_flag_ is still true **/
        iterator& operator++();
        /** pass number this iterator refers to **/
        std::uint32_t operator*() const { return attention_; }

        bool operator==(const iterator& ix2) const {
            return (parent_ == ix2.parent_) && (attention_ == ix2.attention_);
        }
        /** needed for range-for before c++20 (no implicit != from ==) **/
        bool operator!=(const iterator& ix2) const { return !(*this == ix2); }

        /** rehearser being iterated **/
        rehearser* parent_ = nullptr;
        /** pass number: 0, 1, or 2 (= end) **/
        std::uint32_t attention_ = 0;
    };

    bool is_second_pass() const { return attention_ == 1; }
    bool enable_debug() const { return is_second_pass(); }

    iterator begin() { return iterator(this, 0); }
    iterator end() { return iterator(this, 2); }

public:
    /** pass number: 0 or 1 while iterating; 2 once an iterator has reached end() **/
    std::uint32_t attention_ = 0;
    /** @brief set to true when test starts; false if first pass fails **/
    bool ok_flag_ = true;
};

auto rehearser::iterator::operator++() -> iterator&
{
    ++attention_;

    if (parent_->ok_flag_ && attention_ == 1) {
        /* skip 2nd pass */
        ++attention_;
    }

    /* publish pass number to parent, so is_second_pass() / enable_debug()
     * reflect the pass being executed
     */
    parent_->attention_ = attention_;

    return *this;
}
|
||||
|
||||
/* Use this instead of REQUIRE(expr) in context of a rehearser.
 *
 * On the second pass, forwards to REQUIRE((expr)).
 * Otherwise records the outcome by and-ing (expr) into rehearser.ok_flag_.
 *
 * Each macro below expands to a single do { .. } while (0) statement, so it is
 * safe inside an unbraced if/else; invoke with a trailing semicolon.
 */
#  define REHEARSE(rehearser, expr)                     \
    do {                                                \
        if ((rehearser).is_second_pass()) {             \
            REQUIRE((expr));                            \
        } else {                                        \
            REQUIRE(true);                              \
            (rehearser).ok_flag_ &= (expr);             \
        }                                               \
    } while (0)

/* note: trivial REQUIRE() call in else branch bc we still want
 *       catch2 to count assertions when verification succeeds
 */

/* When catch_flag is true, forwards to REQUIRE((expr)).
 * Otherwise and-s (expr) into ok_flag without raising a test failure.
 */
#  define REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr)  \
    do {                                                \
        if (catch_flag) {                               \
            REQUIRE((expr));                            \
        } else {                                        \
            REQUIRE(true);                              \
            (ok_flag) &= (expr);                        \
        }                                               \
    } while (0)

/* Like REQUIRE_ORCAPTURE(), but additionally returns ok_flag from the
 * enclosing function as soon as ok_flag is false.
 */
#  define REQUIRE_ORFAIL(ok_flag, catch_flag, expr)         \
    do {                                                    \
        REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr);       \
        if (!(ok_flag))                                     \
            return ok_flag;                                 \
    } while (0)
|
||||
|
||||
namespace {
|
||||
struct testcase_tkz {
|
||||
std::string input_;
|
||||
|
|
@ -22,66 +95,73 @@ namespace xo {
|
|||
|
||||
std::vector<testcase_tkz>
|
||||
s_testcase_v = {
|
||||
{"<", false, token::leftangle(), true},
|
||||
{">", false, token::rightangle(), true},
|
||||
/*
|
||||
*
|
||||
* expect_throw consume_all
|
||||
* v v
|
||||
*/
|
||||
{"<", false, token::leftangle(), true},
|
||||
/* possible prefix of >= */
|
||||
{">", false, token::rightangle(), true},
|
||||
{"> ", false, token::rightangle(), false},
|
||||
|
||||
{"(", false, token::leftparen(), true},
|
||||
{")", false, token::rightparen(), true},
|
||||
{"(", false, token::leftparen(), true},
|
||||
{")", false, token::rightparen(), true},
|
||||
|
||||
{"[", false, token::leftbracket(), true},
|
||||
{"]", false, token::rightbracket(), true},
|
||||
{"[", false, token::leftbracket(), true},
|
||||
{"]", false, token::rightbracket(), true},
|
||||
|
||||
{"{", false, token::leftbrace(), true},
|
||||
{" {", false, token::leftbrace(), true},
|
||||
{"{", false, token::leftbrace(), true},
|
||||
{" {", false, token::leftbrace(), true},
|
||||
|
||||
{"\t{", false, token::leftbrace(), true},
|
||||
{"\n{", false, token::leftbrace(), true},
|
||||
{"}", false, token::rightbrace(), true},
|
||||
{"\t{", false, token::leftbrace(), true},
|
||||
{"\n{", false, token::leftbrace(), true},
|
||||
{"}", false, token::rightbrace(), true},
|
||||
|
||||
{"0", false, token::i64_token("0"), true},
|
||||
{"1", false, token::i64_token("1"), true},
|
||||
{"12", false, token::i64_token("12"), true},
|
||||
{"123", false, token::i64_token("123"), true},
|
||||
{"0", false, token::i64_token("0"), true},
|
||||
{"1", false, token::i64_token("1"), true},
|
||||
{"12", false, token::i64_token("12"), true},
|
||||
{"123", false, token::i64_token("123"), true},
|
||||
{"1234", false, token::i64_token("1234"), true},
|
||||
|
||||
{"0 ", false, token::i64_token("0"), false},
|
||||
{"1 ", false, token::i64_token("1"), false},
|
||||
{"12 ", false, token::i64_token("12"), false},
|
||||
{"123 ", false, token::i64_token("123"), false},
|
||||
{"0 ", false, token::i64_token("0"), false},
|
||||
{"1 ", false, token::i64_token("1"), false},
|
||||
{"12 ", false, token::i64_token("12"), false},
|
||||
{"123 ", false, token::i64_token("123"), false},
|
||||
{"1234 ", false, token::i64_token("1234"), false},
|
||||
|
||||
{"1<", false, token::i64_token("1"), false},
|
||||
{"1>", false, token::i64_token("1"), false},
|
||||
{"1(", false, token::i64_token("1"), false},
|
||||
{"1)", false, token::i64_token("1"), false},
|
||||
{"1[", false, token::i64_token("1"), false},
|
||||
{"1]", false, token::i64_token("1"), false},
|
||||
{"1{", false, token::i64_token("1"), false},
|
||||
{"1}", false, token::i64_token("1"), false},
|
||||
{"1;", false, token::i64_token("1"), false},
|
||||
{"1:", false, token::i64_token("1"), false},
|
||||
{"1,", false, token::i64_token("1"), false},
|
||||
{"1<", false, token::i64_token("1"), false},
|
||||
{"1>", false, token::i64_token("1"), false},
|
||||
{"1(", false, token::i64_token("1"), false},
|
||||
{"1)", false, token::i64_token("1"), false},
|
||||
{"1[", false, token::i64_token("1"), false},
|
||||
{"1]", false, token::i64_token("1"), false},
|
||||
{"1{", false, token::i64_token("1"), false},
|
||||
{"1}", false, token::i64_token("1"), false},
|
||||
{"1;", false, token::i64_token("1"), false},
|
||||
{"1:", false, token::i64_token("1"), false},
|
||||
{"1,", false, token::i64_token("1"), false},
|
||||
|
||||
{".1", false, token::f64_token(".1"), true},
|
||||
{".12", false, token::f64_token(".12"), true},
|
||||
{".123", false, token::f64_token(".123"), true},
|
||||
{".1", false, token::f64_token(".1"), true},
|
||||
{".12", false, token::f64_token(".12"), true},
|
||||
{".123", false, token::f64_token(".123"), true},
|
||||
|
||||
{"+.1", false, token::f64_token("+.1"), true},
|
||||
{"+.12", false, token::f64_token("+.12"), true},
|
||||
{"+.1", false, token::f64_token("+.1"), true},
|
||||
{"+.12", false, token::f64_token("+.12"), true},
|
||||
{"+.123", false, token::f64_token("+.123"), true},
|
||||
|
||||
{"-.1", false, token::f64_token("-.1"), true},
|
||||
{"-.12", false, token::f64_token("-.12"), true},
|
||||
{"-.1", false, token::f64_token("-.1"), true},
|
||||
{"-.12", false, token::f64_token("-.12"), true},
|
||||
{"-.123", false, token::f64_token("-.123"), true},
|
||||
|
||||
{"1.", false, token::f64_token("1."), true},
|
||||
{"1.2", false, token::f64_token("1.2"), true},
|
||||
{"1.23", false, token::f64_token("1.23"), true},
|
||||
{"1.", false, token::f64_token("1."), true},
|
||||
{"1.2", false, token::f64_token("1.2"), true},
|
||||
{"1.23", false, token::f64_token("1.23"), true},
|
||||
|
||||
{"1e0", false, token::f64_token("1e0"), true},
|
||||
{"1e-1", false, token::f64_token("1e-1"), true},
|
||||
{"1e1", false, token::f64_token("1e1"), true},
|
||||
{"1e+1", false, token::f64_token("1e+1"), true},
|
||||
{"1e0", false, token::f64_token("1e0"), true},
|
||||
{"1e-1", false, token::f64_token("1e-1"), true},
|
||||
{"1e1", false, token::f64_token("1e1"), true},
|
||||
{"1e+1", false, token::f64_token("1e+1"), true},
|
||||
|
||||
{"\"hello\"", false, token::string_token("hello"), true},
|
||||
/* tokenizer sees this input:
|
||||
|
|
@ -99,10 +179,20 @@ namespace xo {
|
|||
{"\"tab to the right [\\t], to the right [\\t]\"", false,
|
||||
token::string_token("tab to the right [\t], to the right [\t]"), true},
|
||||
|
||||
{".", false, token::dot(), true},
|
||||
{":", false, token::colon(), true},
|
||||
{",", false, token::comma(), true},
|
||||
{"=", false, token::singleassign(), true},
|
||||
{":=", false, token::assign_token(), true},
|
||||
{"->", false, token::yields(), true},
|
||||
|
||||
{"+", false, token::plus_token(), true},
|
||||
{"-", false, token::minus_token(), true},
|
||||
{"*", false, token::star_token(), true},
|
||||
{"/", false, token::slash_token(), true},
|
||||
|
||||
{"symbol", false, token::symbol_token("symbol"), true},
|
||||
{"another-symbol", false, token::symbol_token("another-symbol"), true},
|
||||
|
||||
{"type", false, token::type(), true},
|
||||
{"def", false, token::def(), true},
|
||||
|
|
@ -112,58 +202,59 @@ namespace xo {
|
|||
{"in", false, token::in(), true},
|
||||
{"end", false, token::end(), true},
|
||||
|
||||
{"*", false, token::star_token(), true},
|
||||
};
|
||||
}
|
||||
|
||||
TEST_CASE("tokenizer", "[tokenizer]") {
|
||||
for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
|
||||
|
||||
const testcase_tkz & testcase = s_testcase_v[i_tc];
|
||||
|
||||
INFO(xtag("input", testcase.input_));
|
||||
INFO(xtag("i_tc", i_tc));
|
||||
rehearser rh;
|
||||
|
||||
using tokenizer
|
||||
= xo::scm::tokenizer<char>;
|
||||
for (auto _ : rh) {
|
||||
scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer"));
|
||||
|
||||
tokenizer tkz;
|
||||
tokenizer::span_type
|
||||
in_span(testcase.input_.c_str(),
|
||||
testcase.input_.c_str() + testcase.input_.size());
|
||||
log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_));
|
||||
|
||||
auto out = tkz.scan(in_span);
|
||||
using tokenizer
|
||||
= xo::scm::tokenizer<char>;
|
||||
|
||||
auto tk = out.first;
|
||||
tokenizer tkz(rh.enable_debug());
|
||||
tokenizer::span_type
|
||||
in_span(testcase.input_.c_str(),
|
||||
testcase.input_.c_str() + testcase.input_.size());
|
||||
|
||||
if (tk.is_invalid())
|
||||
tk = tkz.notify_eof();
|
||||
auto sr = tkz.scan2(in_span, true /*eof*/);
|
||||
|
||||
REQUIRE(tk.tk_type() == testcase.expected_tk_.tk_type());
|
||||
if (tk.tk_type() == tokentype::tk_i64)
|
||||
{
|
||||
REQUIRE(!tk.text().empty());
|
||||
REQUIRE(tk.i64_value() == testcase.expected_tk_.i64_value());
|
||||
} else if (tk.tk_type() == tokentype::tk_f64)
|
||||
{
|
||||
REQUIRE(!tk.text().empty());
|
||||
REQUIRE(tk.f64_value() == testcase.expected_tk_.f64_value());
|
||||
} else if(tk.tk_type() == tokentype::tk_string)
|
||||
{
|
||||
/* tk.text() can be empty, consider input "" */
|
||||
REQUIRE(tk.text() == testcase.expected_tk_.text());
|
||||
} else if(tk.tk_type() == tokentype::tk_symbol)
|
||||
{
|
||||
REQUIRE(!tk.text().empty());
|
||||
REQUIRE(tk.text() == testcase.expected_tk_.text());
|
||||
} else {
|
||||
REQUIRE(tk.text().empty());
|
||||
REHEARSE(rh, sr.get_token().tk_type() == testcase.expected_tk_.tk_type());
|
||||
if (sr.get_token().tk_type() == tokentype::tk_i64)
|
||||
{
|
||||
REHEARSE(rh, !sr.get_token().text().empty());
|
||||
REHEARSE(rh, sr.get_token().i64_value() == testcase.expected_tk_.i64_value());
|
||||
} else if (sr.get_token().tk_type() == tokentype::tk_f64)
|
||||
{
|
||||
REHEARSE(rh, !sr.get_token().text().empty());
|
||||
REHEARSE(rh, sr.get_token().f64_value() == testcase.expected_tk_.f64_value());
|
||||
} else if(sr.get_token().tk_type() == tokentype::tk_string)
|
||||
{
|
||||
/* sr.get_token().text() can be empty, consider input "" */
|
||||
REHEARSE(rh, sr.get_token().text() == testcase.expected_tk_.text());
|
||||
} else if(sr.get_token().tk_type() == tokentype::tk_symbol)
|
||||
{
|
||||
REHEARSE(rh, !sr.get_token().text().empty());
|
||||
REHEARSE(rh, sr.get_token().text() == testcase.expected_tk_.text());
|
||||
} else {
|
||||
REHEARSE(rh, sr.get_token().text().empty());
|
||||
}
|
||||
|
||||
/* must consume all input for tests we're doing here */
|
||||
if (testcase.consume_all_) {
|
||||
REHEARSE(rh, sr.consumed() == in_span);
|
||||
} else {
|
||||
REHEARSE(rh, sr.consumed() != in_span);
|
||||
}
|
||||
}
|
||||
|
||||
/* must consume all input for tests we're doing here */
|
||||
if (testcase.consume_all_)
|
||||
REQUIRE(out.second == in_span);
|
||||
else
|
||||
REQUIRE(out.second != in_span);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -208,56 +299,134 @@ namespace xo {
|
|||
token::symbol_token("y"),
|
||||
token::semicolon(),
|
||||
token::rightbrace()
|
||||
}}
|
||||
}},
|
||||
{"a.b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::dot(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"a,b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::comma(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"a:b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::colon(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"a;b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::semicolon(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"a:=b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::assign_token(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"a=b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::singleassign(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"p->q",
|
||||
false,
|
||||
{token::symbol_token("p"),
|
||||
token::yields(),
|
||||
token::symbol_token("q")
|
||||
}},
|
||||
{"a + b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::plus_token(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"a - b",
|
||||
false,
|
||||
{token::symbol_token("a"),
|
||||
token::minus_token(),
|
||||
token::symbol_token("b")
|
||||
}},
|
||||
{"a-b",
|
||||
false,
|
||||
{token::symbol_token("a-b"),
|
||||
}},
|
||||
{"(apple)",
|
||||
false,
|
||||
{token::leftparen(),
|
||||
token::symbol_token("apple"),
|
||||
token::rightparen()
|
||||
}},
|
||||
{"<apple>",
|
||||
false,
|
||||
{token::leftangle(),
|
||||
token::symbol_token("apple"),
|
||||
token::rightangle()
|
||||
}},
|
||||
};
|
||||
}
|
||||
|
||||
TEST_CASE("tokenizer2", "[tokenizer]") {
|
||||
/* this time testing token sequences */
|
||||
|
||||
using tokenizer = xo::scm::tokenizer<char>;
|
||||
|
||||
for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) {
|
||||
const testcase2_tkz & testcase = s_testcase2_v[i_tc];
|
||||
|
||||
INFO(xtag("input", testcase.input_));
|
||||
INFO(xtag("i_tc", i_tc));
|
||||
rehearser rh;
|
||||
|
||||
using tokenizer
|
||||
= xo::scm::tokenizer<char>;
|
||||
for (auto _ : rh) {
|
||||
scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer2"));
|
||||
|
||||
tokenizer tkz;
|
||||
tokenizer::span_type
|
||||
in_span(testcase.input_.c_str(),
|
||||
testcase.input_.c_str() + testcase.input_.size());
|
||||
log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_));
|
||||
|
||||
for (int i_tk = 0, n_tk = testcase.expected_tk_v_.size();
|
||||
i_tk < n_tk; ++i_tk)
|
||||
{
|
||||
INFO(xtag("i_tk", i_tk));
|
||||
tokenizer tkz(rh.enable_debug());
|
||||
|
||||
auto res = tkz.scan2(in_span, in_span.empty());
|
||||
const auto & tk = res.first;
|
||||
tokenizer::span_type
|
||||
in_span(testcase.input_.c_str(),
|
||||
testcase.input_.c_str() + testcase.input_.size());
|
||||
|
||||
if (tk.is_valid())
|
||||
REQUIRE(tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type());
|
||||
if (tk.tk_type() == tokentype::tk_i64)
|
||||
for (int i_tk = 0, n_tk = testcase.expected_tk_v_.size();
|
||||
i_tk < n_tk; ++i_tk)
|
||||
{
|
||||
REQUIRE(!tk.text().empty());
|
||||
REQUIRE(tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value());
|
||||
} else if (tk.tk_type() == tokentype::tk_f64)
|
||||
{
|
||||
REQUIRE(!tk.text().empty());
|
||||
REQUIRE(tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value());
|
||||
} else if(tk.tk_type() == tokentype::tk_string)
|
||||
{
|
||||
/* tk.text() can be empty, consider input "" */
|
||||
REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text());
|
||||
} else if(tk.tk_type() == tokentype::tk_symbol)
|
||||
{
|
||||
REQUIRE(!tk.text().empty());
|
||||
REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text());
|
||||
} else {
|
||||
REQUIRE(tk.text().empty());
|
||||
log && log(xtag("i_tk", i_tk));
|
||||
|
||||
auto sr = tkz.scan2(in_span, in_span.empty());
|
||||
const auto & tk = sr.get_token();
|
||||
|
||||
if (tk.is_valid()) {
|
||||
REHEARSE(rh, tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type());
|
||||
}
|
||||
if (tk.tk_type() == tokentype::tk_i64)
|
||||
{
|
||||
REHEARSE(rh, !tk.text().empty());
|
||||
REHEARSE(rh, tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value());
|
||||
} else if (tk.tk_type() == tokentype::tk_f64)
|
||||
{
|
||||
REHEARSE(rh, !tk.text().empty());
|
||||
REHEARSE(rh, tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value());
|
||||
} else if(tk.tk_type() == tokentype::tk_string)
|
||||
{
|
||||
/* tk.text() can be empty, consider input "" */
|
||||
REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text());
|
||||
} else if(tk.tk_type() == tokentype::tk_symbol)
|
||||
{
|
||||
REHEARSE(rh, !tk.text().empty());
|
||||
REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text());
|
||||
} else {
|
||||
REHEARSE(rh, tk.text().empty());
|
||||
}
|
||||
|
||||
in_span = in_span.after_prefix(sr.consumed());
|
||||
}
|
||||
|
||||
in_span = in_span.after_prefix(res.second);
|
||||
}
|
||||
}
|
||||
} /*TEST_CASE(tokenizer2)*/
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue