xo-tokenizer: = and := tokens

2024-08-24 12:30:22 -04:00 · 2024-08-24 12:30:22 -04:00 · bff6b7ce9b
commit bff6b7ce9b
parent fa335ee523
3 changed files with 144 additions and 12 deletions
--- a/include/xo/tokenizer/token.hpp
+++ b/include/xo/tokenizer/token.hpp
@ -77,7 +77,7 @@ namespace xo {
            static token doublecolon() { return token(tokentype::tk_doublecolon); }
            static token semicolon() { return token(tokentype::tk_semicolon); }
            static token singleassign() { return token(tokentype::tk_singleassign); }
-            static token assign() { return token(tokentype::tk_assign); }
+            static token assign_token() { return token(tokentype::tk_assign); }
            static token yields() { return token(tokentype::tk_yields); }

            static token type() { return token(tokentype::tk_type); }
--- a/include/xo/tokenizer/tokenizer.hpp
+++ b/include/xo/tokenizer/tokenizer.hpp
@ -61,10 +61,16 @@ namespace xo {

            /** identifies punctuation chars.
             *  These are chars that are not permitted to appear within
-             *  a string/symbol token.  Instead they force completion of
+             *  a symbol token.  Instead they force completion of
             *  a preceding token,  and start a new token with themselves
             **/
-            bool is_punctuation(CharT ch) const;
+            bool is_1char_punctuation(CharT ch) const;
+
+            /** more-relaxed version of is_1char_punctuation.
+             *  Chars that are not permitted to appear within a symbol token,
+             *  but may form a token when combined with the next character
+             **/
+            bool is_2char_punctuation(CharT ch) const;

            /** true if tokenizer contains stored prefix of
             *  possibly-incomplete token
@ -115,7 +121,7 @@ namespace xo {

        template <typename CharT>
        bool
-        tokenizer<CharT>::is_punctuation(CharT ch) const {
+        tokenizer<CharT>::is_1char_punctuation(CharT ch) const {
            switch(ch) {
            case '<':
                return true;
@ -138,23 +144,36 @@ namespace xo {
            case ';':
                return true;
            case ':':
-                return true;
+                /* can't be 1char punctuation -- can begin assignment token */
+                return false;
            case '=':
                return true;
            case '-':
-                /* can't be punctuation -- can appear inside f64 token */
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */
                return false;
            case '+':
-                /* can't be punctuation -- can appear inside f64 token */
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */
                return false;
            case '.':
-                /* can't be punctuation -- can appear inside f64 token */
+                /* can't be punctuation -- can appear inside f64 token: e.g. 1.23 */
                return false;
            }

            return false;
        }

+        template <typename CharT>
+        bool
+        tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
+            switch(ch) {
+            case ':':
+                /* ':' is either a complete colon token, or the first char of the 2-char assignment token := */
+                return true;
+            }
+
+            return false; /* every other char: cannot begin a 2-char punctuation token */
+        }
+
        template <typename CharT>
        auto
        tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
@ -483,9 +502,19 @@ namespace xo {
                ++ix;
                break;
            case ':':
-                tk_type = tokentype::tk_colon;
-                ++ix;
+            {
+                log && log("colon or assignment token");
+
+                if (*(ix + 1) == '=') {
+                    tk_type = tokentype::tk_assign;
+                    ++ix;
+                    ++ix;
+                } else {
+                     tk_type = tokentype::tk_colon;
+                     ++ix;
+                }
                break;
+            }
            case '=':
                tk_type = tokentype::tk_singleassign;
                ++ix;
@ -575,9 +604,33 @@ namespace xo {
             */
            const CharT * tk_start = ix;

-            if (is_punctuation(*ix)) {
+            if (is_1char_punctuation(*ix)) {
                /* 1-character token */
                ++ix;
+            } else if (is_2char_punctuation(*ix)) {
+                CharT ch1 = *ix;
+
+                ++ix;
+
+                if (ix == input.hi()) {
+                    /* need more input to know if/when token complete */
+                    this->prefix_ += std::string(tk_start, input.hi());
+
+                    log && log(xtag("captured-prefix", this->prefix_));
+                } else {
+                    CharT ch2 = *ix;
+
+                    if (((ch2 >= '0') && (ch2 <= '9'))
+                        || ((ch2 >= 'A') && (ch2 <= 'Z'))
+                        || ((ch2 >= 'a') && (ch2 <= 'z')))
+                    {
+                        /* treat as 1 char punctuation */
+                        ;
+                    } else {
+                        /* include next char */
+                        ++ix;
+                    }
+                }
            } else if (*ix == '"') {
                bool complete_flag = false;

@ -618,8 +671,12 @@ namespace xo {
                 * - punctuation
                 */
                for (; ix != input.hi(); ++ix) {
-                    if (is_whitespace(*ix) || is_punctuation(*ix))
+                    if (is_whitespace(*ix)
+                        || is_1char_punctuation(*ix)
+                        || is_2char_punctuation(*ix))
+                    {
                        break;
+                    }
                }

                if (ix == input.hi()) {