From bff6b7ce9bf752e12069d9f59cd359c466bf194a Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sat, 24 Aug 2024 12:30:22 -0400 Subject: [PATCH] xo-tokenizer: = and := tokens --- include/xo/tokenizer/token.hpp | 2 +- include/xo/tokenizer/tokenizer.hpp | 79 +++++++++++++++++++++++++----- utest/tokenizer.test.cpp | 75 ++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+), 12 deletions(-) diff --git a/include/xo/tokenizer/token.hpp b/include/xo/tokenizer/token.hpp index 84b1a1b8..9944cb3d 100644 --- a/include/xo/tokenizer/token.hpp +++ b/include/xo/tokenizer/token.hpp @@ -77,7 +77,7 @@ namespace xo { static token doublecolon() { return token(tokentype::tk_doublecolon); } static token semicolon() { return token(tokentype::tk_semicolon); } static token singleassign() { return token(tokentype::tk_singleassign); } - static token assign() { return token(tokentype::tk_assign); } + static token assign_token() { return token(tokentype::tk_assign); } static token yields() { return token(tokentype::tk_yields); } static token type() { return token(tokentype::tk_type); } diff --git a/include/xo/tokenizer/tokenizer.hpp b/include/xo/tokenizer/tokenizer.hpp index 11ee5aca..bb67eb13 100644 --- a/include/xo/tokenizer/tokenizer.hpp +++ b/include/xo/tokenizer/tokenizer.hpp @@ -61,10 +61,16 @@ namespace xo { /** identifies punctuation chars. * These are chars that are not permitted to appear within - * a string/symbol token. Instead they force completion of + * a symbol token. Instead they force completion of * a preceding token, and start a new token with themselves **/ - bool is_punctuation(CharT ch) const; + bool is_1char_punctuation(CharT ch) const; + + /** more-relaxed version of is_1char_punctuation. 
+ * Chars that are not permitted to appear within a symbol token, + * but may form a token when combined with the next character + **/ + bool is_2char_punctuation(CharT ch) const; /** true if tokenizer contains stored prefix of * possibly-incomplete token @@ -115,7 +121,7 @@ namespace xo { template bool - tokenizer::is_punctuation(CharT ch) const { + tokenizer::is_1char_punctuation(CharT ch) const { switch(ch) { case '<': return true; @@ -138,23 +144,36 @@ namespace xo { case ';': return true; case ':': - return true; + /* can't be 1char punctuation -- can begin assignment token */ + return false; case '=': return true; case '-': - /* can't be punctuation -- can appear inside f64 token */ + /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */ return false; case '+': - /* can't be punctuation -- can appear inside f64 token */ + /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */ return false; case '.': - /* can't be punctuation -- can appear inside f64 token */ + /* can't be punctuation -- can appear inside f64 token: e.g. 
1.23 */ return false; } return false; } + template + bool + tokenizer::is_2char_punctuation(CharT ch) const { + switch(ch) { + case ':': + /* can begin := */ + return true; + } + + return false; + } + template auto tokenizer::assemble_token(const span_type & token_text) const -> token_type @@ -483,9 +502,19 @@ namespace xo { ++ix; break; case ':': - tk_type = tokentype::tk_colon; - ++ix; + { + log && log("colon or assignment token"); + + if (*(ix + 1) == '=') { + tk_type = tokentype::tk_assign; + ++ix; + ++ix; + } else { + tk_type = tokentype::tk_colon; + ++ix; + } break; + } case '=': tk_type = tokentype::tk_singleassign; ++ix; @@ -575,9 +604,33 @@ namespace xo { */ const CharT * tk_start = ix; - if (is_punctuation(*ix)) { + if (is_1char_punctuation(*ix)) { /* 1-character token */ ++ix; + } else if (is_2char_punctuation(*ix)) { + CharT ch1 = *ix; + + ++ix; + + if (ix == input.hi()) { + /* need more input to know if/when token complete */ + this->prefix_ += std::string(tk_start, input.hi()); + + log && log(xtag("captured-prefix", this->prefix_)); + } else { + CharT ch2 = *ix; + + if (((ch2 >= '0') && (ch2 <= '9')) + || ((ch2 >= 'A') && (ch2 <= 'Z')) + || ((ch2 >= 'a') && (ch2 <= 'z'))) + { + /* treat as 1 char punctuation */ + ; + } else { + /* include next char */ + ++ix; + } + } } else if (*ix == '"') { bool complete_flag = false; @@ -618,8 +671,12 @@ namespace xo { * - punctuation */ for (; ix != input.hi(); ++ix) { - if (is_whitespace(*ix) || is_punctuation(*ix)) + if (is_whitespace(*ix) + || is_1char_punctuation(*ix) + || is_2char_punctuation(*ix)) + { break; + } } if (ix == input.hi()) { diff --git a/utest/tokenizer.test.cpp b/utest/tokenizer.test.cpp index b5d8303a..8a821b96 100644 --- a/utest/tokenizer.test.cpp +++ b/utest/tokenizer.test.cpp @@ -99,6 +99,9 @@ namespace xo { {"\"tab to the right [\\t], to the right [\\t]\"", false, token::string_token("tab to the right [\t], to the right [\t]"), true}, + {":", false, token::colon(), true}, + {":=", false, 
token::assign_token(), true}, + {"symbol", false, token::symbol_token("symbol"), true}, {"type", false, token::type(), true}, @@ -162,6 +165,78 @@ namespace xo { } } + namespace { + struct testcase2_tkz { + std::string input_; + bool expect_throw_; + std::vector expected_tk_v_; + }; + + std::vector + s_testcase2_v = { + {"def foo : f64 = 3.141;", + false, + {token::def(), + token::symbol_token("foo"), + token::colon(), + token::symbol_token("f64"), + token::singleassign(), + token::f64_token("3.141"), + token::semicolon() + }} + }; + } + + TEST_CASE("tokenizer2", "[tokenizer]") { + for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) { + const testcase2_tkz & testcase = s_testcase2_v[i_tc]; + + INFO(xtag("input", testcase.input_)); + INFO(xtag("i_tc", i_tc)); + + using tokenizer + = xo::scm::tokenizer; + + tokenizer tkz; + tokenizer::span_type + in_span(testcase.input_.c_str(), + testcase.input_.c_str() + testcase.input_.size()); + + for (int i_tk = 0, n_tk = testcase.expected_tk_v_.size(); + i_tk < n_tk; ++i_tk) + { + INFO(xtag("i_tk", i_tk)); + + auto res = tkz.scan2(in_span, in_span.empty()); + const auto & tk = res.first; + + if (tk.is_valid()) + REQUIRE(tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type()); + if (tk.tk_type() == tokentype::tk_i64) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value()); + } else if (tk.tk_type() == tokentype::tk_f64) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value()); + } else if(tk.tk_type() == tokentype::tk_string) + { + /* tk.text() can be empty, consider input "" */ + REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text()); + } else if(tk.tk_type() == tokentype::tk_symbol) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text()); + } else { + REQUIRE(tk.text().empty()); + } + + in_span = in_span.after_prefix(res.second); + } + } + } 
/*TEST_CASE(tokenizer2)*/ + } /*namespace ut*/ } /*namespace xo*/