xo-tokenizer: = and := tokens
This commit is contained in:
parent
fa335ee523
commit
bff6b7ce9b
3 changed files with 144 additions and 12 deletions
|
|
@ -77,7 +77,7 @@ namespace xo {
|
|||
/** factory: token with type tokentype::tk_doublecolon (presumably the "::" lexeme -- TODO confirm) **/
static token doublecolon() { return token(tokentype::tk_doublecolon); }
|
||||
/** factory: token with type tokentype::tk_semicolon (the token expected for ';' in the tokenizer tests) **/
static token semicolon() { return token(tokentype::tk_semicolon); }
|
||||
/** factory: token with type tokentype::tk_singleassign (produced by the tokenizer for '=') **/
static token singleassign() { return token(tokentype::tk_singleassign); }
|
||||
static token assign() { return token(tokentype::tk_assign); }
|
||||
/** factory: token with type tokentype::tk_assign (produced by the tokenizer for ":=") **/
static token assign_token() { return token(tokentype::tk_assign); }
|
||||
/** factory: token with type tokentype::tk_yields **/
static token yields() { return token(tokentype::tk_yields); }
|
||||
|
||||
/** factory: token with type tokentype::tk_type (the token expected for input "type" in the tokenizer tests) **/
static token type() { return token(tokentype::tk_type); }
|
||||
|
|
|
|||
|
|
@ -61,10 +61,16 @@ namespace xo {
|
|||
|
||||
/** identifies punctuation chars.
|
||||
* These are chars that are not permitted to appear within
|
||||
* a string/symbol token. Instead they force completion of
|
||||
* a symbol token. Instead they force completion of
|
||||
* a preceding token, and start a new token with themselves
|
||||
**/
|
||||
bool is_punctuation(CharT ch) const;
|
||||
bool is_1char_punctuation(CharT ch) const;
|
||||
|
||||
/** more-relaxed version of is_1char_punctuation.
|
||||
* Chars that are not permitted to appear within a symbol token,
|
||||
* but may form a token when combined with the next character
|
||||
**/
|
||||
bool is_2char_punctuation(CharT ch) const;
|
||||
|
||||
/** true if tokenizer contains stored prefix of
|
||||
* possibly-incomplete token
|
||||
|
|
@ -115,7 +121,7 @@ namespace xo {
|
|||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_punctuation(CharT ch) const {
|
||||
tokenizer<CharT>::is_1char_punctuation(CharT ch) const {
|
||||
switch(ch) {
|
||||
case '<':
|
||||
return true;
|
||||
|
|
@ -138,23 +144,36 @@ namespace xo {
|
|||
case ';':
|
||||
return true;
|
||||
case ':':
|
||||
return true;
|
||||
/* can't be 1char punctuation -- can begin assignment token */
|
||||
return false;
|
||||
case '=':
|
||||
return true;
|
||||
case '-':
|
||||
/* can't be punctuation -- can appear inside f64 token */
|
||||
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */
|
||||
return false;
|
||||
case '+':
|
||||
/* can't be punctuation -- can appear inside f64 token */
|
||||
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */
|
||||
return false;
|
||||
case '.':
|
||||
/* can't be punctuation -- can appear inside f64 token */
|
||||
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23 */
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
|
||||
switch(ch) {
|
||||
case ':':
|
||||
/* can begin := */
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
|
||||
|
|
@ -483,9 +502,19 @@ namespace xo {
|
|||
++ix;
|
||||
break;
|
||||
case ':':
|
||||
tk_type = tokentype::tk_colon;
|
||||
++ix;
|
||||
{
|
||||
log && log("colon or assignment token");
|
||||
|
||||
if (*(ix + 1) == '=') {
|
||||
tk_type = tokentype::tk_assign;
|
||||
++ix;
|
||||
++ix;
|
||||
} else {
|
||||
tk_type = tokentype::tk_colon;
|
||||
++ix;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case '=':
|
||||
tk_type = tokentype::tk_singleassign;
|
||||
++ix;
|
||||
|
|
@ -575,9 +604,33 @@ namespace xo {
|
|||
*/
|
||||
const CharT * tk_start = ix;
|
||||
|
||||
if (is_punctuation(*ix)) {
|
||||
if (is_1char_punctuation(*ix)) {
|
||||
/* 1-character token */
|
||||
++ix;
|
||||
} else if (is_2char_punctuation(*ix)) {
|
||||
CharT ch1 = *ix;
|
||||
|
||||
++ix;
|
||||
|
||||
if (ix == input.hi()) {
|
||||
/* need more input to know if/when token complete */
|
||||
this->prefix_ += std::string(tk_start, input.hi());
|
||||
|
||||
log && log(xtag("captured-prefix", this->prefix_));
|
||||
} else {
|
||||
CharT ch2 = *ix;
|
||||
|
||||
if (((ch2 >= '0') && (ch2 <= '9'))
|
||||
|| ((ch2 >= 'A') && (ch2 <= 'Z'))
|
||||
|| ((ch2 >= 'a') && (ch2 <= 'z')))
|
||||
{
|
||||
/* treat as 1 char punctuation */
|
||||
;
|
||||
} else {
|
||||
/* include next char */
|
||||
++ix;
|
||||
}
|
||||
}
|
||||
} else if (*ix == '"') {
|
||||
bool complete_flag = false;
|
||||
|
||||
|
|
@ -618,8 +671,12 @@ namespace xo {
|
|||
* - punctuation
|
||||
*/
|
||||
for (; ix != input.hi(); ++ix) {
|
||||
if (is_whitespace(*ix) || is_punctuation(*ix))
|
||||
if (is_whitespace(*ix)
|
||||
|| is_1char_punctuation(*ix)
|
||||
|| is_2char_punctuation(*ix))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ix == input.hi()) {
|
||||
|
|
|
|||
|
|
@ -99,6 +99,9 @@ namespace xo {
|
|||
{"\"tab to the right [\\t], to the right [\\t]\"", false,
|
||||
token::string_token("tab to the right [\t], to the right [\t]"), true},
|
||||
|
||||
{":", false, token::colon(), true},
|
||||
{":=", false, token::assign_token(), true},
|
||||
|
||||
{"symbol", false, token::symbol_token("symbol"), true},
|
||||
|
||||
{"type", false, token::type(), true},
|
||||
|
|
@ -162,6 +165,78 @@ namespace xo {
|
|||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
struct testcase2_tkz {
|
||||
std::string input_;
|
||||
bool expect_throw_;
|
||||
std::vector<token> expected_tk_v_;
|
||||
};
|
||||
|
||||
std::vector<testcase2_tkz>
|
||||
s_testcase2_v = {
|
||||
{"def foo : f64 = 3.141;",
|
||||
false,
|
||||
{token::def(),
|
||||
token::symbol_token("foo"),
|
||||
token::colon(),
|
||||
token::symbol_token("f64"),
|
||||
token::singleassign(),
|
||||
token::f64_token("3.141"),
|
||||
token::semicolon()
|
||||
}}
|
||||
};
|
||||
}
|
||||
|
||||
/** Drive each input in s_testcase2_v through a fresh tokenizer,
 *  one scan2() call per expected token, and compare every token
 *  produced against the expected sequence.
 *
 *  After each call the input span is advanced with
 *  after_prefix(res.second), so the next call resumes from there.
 **/
TEST_CASE("tokenizer2", "[tokenizer]") {
    for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) {
        const testcase2_tkz & testcase = s_testcase2_v[i_tc];

        INFO(xtag("input", testcase.input_));
        INFO(xtag("i_tc", i_tc));

        using tokenizer
            = xo::scm::tokenizer<char>;

        tokenizer tkz;
        tokenizer::span_type
            in_span(testcase.input_.c_str(),
                    testcase.input_.c_str() + testcase.input_.size());

        /* std::size_t index: matches the outer loop and avoids
         * narrowing vector::size() into an int
         */
        for (std::size_t i_tk = 0, n_tk = testcase.expected_tk_v_.size();
             i_tk < n_tk; ++i_tk)
        {
            INFO(xtag("i_tk", i_tk));

            const token & expected_tk = testcase.expected_tk_v_[i_tk];

            auto res = tkz.scan2(in_span, in_span.empty());
            const auto & tk = res.first;

            /* token type is compared only when the scanned token is valid;
             * braces make explicit that this guard covers just this check
             */
            if (tk.is_valid()) {
                REQUIRE(tk.tk_type() == expected_tk.tk_type());
            }

            /* payload checks, selected by the scanned token's type */
            if (tk.tk_type() == tokentype::tk_i64) {
                REQUIRE(!tk.text().empty());
                REQUIRE(tk.i64_value() == expected_tk.i64_value());
            } else if (tk.tk_type() == tokentype::tk_f64) {
                REQUIRE(!tk.text().empty());
                REQUIRE(tk.f64_value() == expected_tk.f64_value());
            } else if (tk.tk_type() == tokentype::tk_string) {
                /* tk.text() can be empty, consider input "" */
                REQUIRE(tk.text() == expected_tk.text());
            } else if (tk.tk_type() == tokentype::tk_symbol) {
                REQUIRE(!tk.text().empty());
                REQUIRE(tk.text() == expected_tk.text());
            } else {
                /* all remaining token types are expected to carry no text */
                REQUIRE(tk.text().empty());
            }

            /* advance the span by what scan2 reported in res.second */
            in_span = in_span.after_prefix(res.second);
        }
    }
} /*TEST_CASE(tokenizer2)*/
|
||||
|
||||
} /*namespace ut*/
|
||||
} /*namespace xo*/
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue