From bff6b7ce9bf752e12069d9f59cd359c466bf194a Mon Sep 17 00:00:00 2001
From: Roland Conybeare <rconybeare@gmail.com>
Date: Sat, 24 Aug 2024 12:30:22 -0400
Subject: [PATCH] xo-tokenizer: = and := tokens

---
 include/xo/tokenizer/token.hpp     |  2 +-
 include/xo/tokenizer/tokenizer.hpp | 79 +++++++++++++++++++++++++-----
 utest/tokenizer.test.cpp           | 75 ++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+), 12 deletions(-)
diff --git a/include/xo/tokenizer/token.hpp b/include/xo/tokenizer/token.hpp
index 84b1a1b8..9944cb3d 100644
--- a/include/xo/tokenizer/token.hpp
+++ b/include/xo/tokenizer/token.hpp
@@ -77,7 +77,7 @@ namespace xo {
             static token doublecolon() { return token(tokentype::tk_doublecolon); }
             static token semicolon() { return token(tokentype::tk_semicolon); }
             static token singleassign() { return token(tokentype::tk_singleassign); }
-            static token assign() { return token(tokentype::tk_assign); }
+            static token assign_token() { return token(tokentype::tk_assign); }
             static token yields() { return token(tokentype::tk_yields); }
 
             static token type() { return token(tokentype::tk_type); }
diff --git a/include/xo/tokenizer/tokenizer.hpp b/include/xo/tokenizer/tokenizer.hpp
index 11ee5aca..bb67eb13 100644
--- a/include/xo/tokenizer/tokenizer.hpp
+++ b/include/xo/tokenizer/tokenizer.hpp
@@ -61,10 +61,16 @@ namespace xo {
 
             /** identifies punctuation chars.
              *  These are chars that are not permitted to appear within
-             *  a string/symbol token.  Instead they force completion of
+             *  a symbol token.  Instead they force completion of
              *  a preceding token,  and start a new token with themselves
              **/
-            bool is_punctuation(CharT ch) const;
+            bool is_1char_punctuation(CharT ch) const;
+
+            /** more-relaxed version of is_1char_punctuation.
+             *  Chars that are not permitted to appear within a symbol token,
+             *  but may form a token when combined with the next character.
+             **/
+            bool is_2char_punctuation(CharT ch) const;
 
             /** true if tokenizer contains stored prefix of
              *  possibly-incomplete token
@@ -115,7 +121,7 @@ namespace xo {
 
         template <typename CharT>
         bool
-        tokenizer<CharT>::is_punctuation(CharT ch) const {
+        tokenizer<CharT>::is_1char_punctuation(CharT ch) const {
             switch(ch) {
             case '<':
                 return true;
@@ -138,23 +144,36 @@ namespace xo {
             case ';':
                 return true;
             case ':':
-                return true;
+                /* can't be 1char punctuation -- can begin assignment token */
+                return false;
             case '=':
                 return true;
             case '-':
-                /* can't be punctuation -- can appear inside f64 token */
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */
                 return false;
             case '+':
-                /* can't be punctuation -- can appear inside f64 token */
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */
                 return false;
             case '.':
-                /* can't be punctuation -- can appear inside f64 token */
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23 */
                 return false;
             }
 
             return false;
         }
 
+        /** true if ch may begin a 2-character punctuation token */
+        template <typename CharT>
+        bool
+        tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
+            /* ':' is the only such char: it can begin := */
+            if (ch == ':') {
+                return true;
+            }
+
+            return false;
+        }
+
         template <typename CharT>
         auto
         tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
@@ -483,9 +502,19 @@ namespace xo {
                 ++ix;
                 break;
             case ':':
-                tk_type = tokentype::tk_colon;
-                ++ix;
+            {
+                log && log("colon or assignment token");
+
+                if (*(ix + 1) == '=') {
+                    tk_type = tokentype::tk_assign;
+                    ++ix;
+                    ++ix;
+                } else {
+                     tk_type = tokentype::tk_colon;
+                     ++ix;
+                }
                 break;
+            }
             case '=':
                 tk_type = tokentype::tk_singleassign;
                 ++ix;
@@ -575,9 +604,33 @@ namespace xo {
              */
             const CharT * tk_start = ix;
 
-            if (is_punctuation(*ix)) {
+            if (is_1char_punctuation(*ix)) {
                 /* 1-character token */
                 ++ix;
+            } else if (is_2char_punctuation(*ix)) {
+                CharT ch1 = *ix;
+
+                ++ix;
+
+                if (ix == input.hi()) {
+                    /* need more input to know if/when token complete */
+                    this->prefix_ += std::string(tk_start, input.hi());
+
+                    log && log(xtag("captured-prefix", this->prefix_));
+                } else {
+                    CharT ch2 = *ix;
+
+                    if (((ch2 >= '0') && (ch2 <= '9'))
+                        || ((ch2 >= 'A') && (ch2 <= 'Z'))
+                        || ((ch2 >= 'a') && (ch2 <= 'z')))
+                    {
+                        /* treat as 1 char punctuation */
+                        ;
+                    } else {
+                        /* include next char */
+                        ++ix;
+                    }
+                }
             } else if (*ix == '"') {
                 bool complete_flag = false;
 
@@ -618,8 +671,12 @@ namespace xo {
                  * - punctuation
                  */
                 for (; ix != input.hi(); ++ix) {
-                    if (is_whitespace(*ix) || is_punctuation(*ix))
+                    if (is_whitespace(*ix)
+                        || is_1char_punctuation(*ix)
+                        || is_2char_punctuation(*ix))
+                    {
                         break;
+                    }
                 }
 
                 if (ix == input.hi()) {
diff --git a/utest/tokenizer.test.cpp b/utest/tokenizer.test.cpp
index b5d8303a..8a821b96 100644
--- a/utest/tokenizer.test.cpp
+++ b/utest/tokenizer.test.cpp
@@ -99,6 +99,9 @@ namespace xo {
                 {"\"tab to the right [\\t], to the right [\\t]\"", false,
                  token::string_token("tab to the right [\t], to the right [\t]"), true},
 
+                {":", false, token::colon(), true},
+                {":=", false, token::assign_token(), true},
+
                 {"symbol", false, token::symbol_token("symbol"), true},
 
                 {"type", false, token::type(), true},
@@ -162,6 +165,78 @@ namespace xo {
             }
         }
 
+        namespace {
+            struct testcase2_tkz {                  /* one multi-token test case */
+                std::string input_;                 /* text scanned by the tokenizer */
+                bool expect_throw_;                 /* NOTE(review): TEST_CASE tokenizer2 never reads this -- confirm intent */
+                std::vector<token> expected_tk_v_;  /* expected tokens, in scan order */
+            };
+
+            std::vector<testcase2_tkz>
+            s_testcase2_v = {                       /* cases iterated by TEST_CASE tokenizer2 */
+                {"def foo : f64 = 3.141;",          /* ':' and '=' as separate tokens (not ":=") */
+                 false,
+                 {token::def(),
+                  token::symbol_token("foo"),
+                  token::colon(),
+                  token::symbol_token("f64"),
+                  token::singleassign(),
+                  token::f64_token("3.141"),
+                  token::semicolon()
+                 }}
+            };
+        }
+
+        /** for each case in s_testcase2_v: scan input_ and compare successive tokens with expected_tk_v_ **/
+        TEST_CASE("tokenizer2", "[tokenizer]") {
+            for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) {
+                const testcase2_tkz & testcase = s_testcase2_v[i_tc];
+
+                INFO(xtag("input", testcase.input_));
+                INFO(xtag("i_tc", i_tc));
+
+                using tokenizer = xo::scm::tokenizer<char>;
+
+                tokenizer tkz;
+                tokenizer::span_type
+                    in_span(testcase.input_.c_str(),
+                            testcase.input_.c_str() + testcase.input_.size());
+                /* one scan2() call per expected token; size_t index avoids narrowing vector::size() */
+                for (std::size_t i_tk = 0, n_tk = testcase.expected_tk_v_.size();
+                     i_tk < n_tk; ++i_tk)
+                {
+                    INFO(xtag("i_tk", i_tk));
+                    const auto & expected_tk = testcase.expected_tk_v_[i_tk];
+                    auto res = tkz.scan2(in_span, in_span.empty());
+                    const auto & tk = res.first;
+                    /* an invalid token is a failure here -- never skip the type check */
+                    REQUIRE(tk.is_valid());
+                    REQUIRE(tk.tk_type() == expected_tk.tk_type());
+                    if (tk.tk_type() == tokentype::tk_i64)
+                    {
+                        REQUIRE(!tk.text().empty());
+                        REQUIRE(tk.i64_value() == expected_tk.i64_value());
+                    } else if (tk.tk_type() == tokentype::tk_f64)
+                    {
+                        REQUIRE(!tk.text().empty());
+                        REQUIRE(tk.f64_value() == expected_tk.f64_value());
+                    } else if(tk.tk_type() == tokentype::tk_string)
+                    {
+                        /* tk.text() can be empty, consider input "" */
+                        REQUIRE(tk.text() == expected_tk.text());
+                    } else if(tk.tk_type() == tokentype::tk_symbol)
+                    {
+                        REQUIRE(!tk.text().empty());
+                        REQUIRE(tk.text() == expected_tk.text());
+                    } else {
+                        REQUIRE(tk.text().empty());
+                    }
+
+                    in_span = in_span.after_prefix(res.second);
+                }
+            }
+        } /*TEST_CASE(tokenizer2)*/
+
     } /*namespace ut*/
 } /*namespace xo*/