+ xo-tokenizer2 xo-reader2 xo-expression2 xo-interpreter2

2nd gen schematika interpreter using fomo
2026-01-10 12:39:09 -05:00 · 2026-01-10 12:39:09 -05:00 · f7bd3b0db3
commit f7bd3b0db3
parent 95bd3c7fae
41 changed files with 3566 additions and 9 deletions
--- a/xo-tokenizer2/src/tokenizer2/CMakeLists.txt
+++ b/xo-tokenizer2/src/tokenizer2/CMakeLists.txt
@ -0,0 +1,15 @@
+# tokenizer2/CMakeLists.txt
+#
+# Build description for the xo_tokenizer2 library.
+
+# name of the library target described by this file
+set(SELF_LIB xo_tokenizer2)
+# source files compiled into ${SELF_LIB}
+set(SELF_SRCS
+    Tokenizer.cpp
+    TokenizerError.cpp
+    TkInputState.cpp
+    scan_result.cpp
+    Token.cpp
+    tokentype.cpp)
+
+# NOTE(review): xo_add_shared_library4() and xo_dependency() are project-provided
+# macros not defined in this file; presumably the arguments are
+# (target, export set, version, so-version, sources) -- confirm against the macro definitions.
+xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS})
+# declares indentlog as a dependency of ${SELF_LIB}
+xo_dependency(${SELF_LIB} indentlog)
+
+# end CMakeLists.txt
--- a/xo-tokenizer2/src/tokenizer2/TkInputState.cpp
+++ b/xo-tokenizer2/src/tokenizer2/TkInputState.cpp
@ -0,0 +1,151 @@
+/** @file TkInputState.cpp
+ *
+ *  @author Roland Conybeare, Jun 2025
+ **/
+
+#include "TkInputState.hpp"
+
+namespace xo {
+    namespace scm {
+        using CharT = char;
+
+        /** True iff @p ch is the line-feed character '\n'. **/
+        bool
+        TkInputState::is_newline(CharT ch) {
+            const bool at_line_feed = ('\n' == ch);
+
+            return at_line_feed;
+        }
+
+        /** True iff @p ch is one of: space, tab, line feed, carriage return. **/
+        bool
+        TkInputState::is_whitespace(CharT ch) {
+            return ((ch == ' ')
+                    || (ch == '\t')
+                    || (ch == '\n')
+                    || (ch == '\r'));
+        }
+
+        /** Build a new input state on the same line, positioned @p n chars
+         *  before the current position (clamped at position 0),
+         *  with whitespace count 0. Does not modify this state.
+         **/
+        TkInputState
+        TkInputState::rewind(std::size_t n) const
+        {
+            /* never back up past the start of the line */
+            const auto rewound_pos = (n <= current_pos_) ? current_pos_ - n : 0;
+
+            return TkInputState(this->current_line_,
+                                rewound_pos,
+                                0 /*whitespace*/);
+        }
+
+        /** Move current position forward by @p z chars. No bounds check is done here. **/
+        void
+        TkInputState::advance(size_t z)
+        {
+            scope log(XO_DEBUG(debug_flag_));
+
+            current_pos_ += z;
+
+            if (log) {
+                log(xtag("z", z), xtag("current_pos", current_pos_));
+            }
+        }
+
+        /** Set current position so that it refers to @p pos.
+         *  @p pos must lie within [lo, hi] of the current line (checked by assert).
+         **/
+        void
+        TkInputState::advance_until(const CharT * pos)
+        {
+            scope log(XO_DEBUG(debug_flag_));
+
+            const CharT * line_start = current_line_.lo();
+
+            assert(line_start <= pos && pos <= current_line_.hi());
+
+            /* position is stored as an offset from the start of the line */
+            current_pos_ = pos - line_start;
+
+            if (log) {
+                log(xtag("current_pos", current_pos_));
+            }
+        }
+
+        /** Return the span for the current line, then reset this state
+         *  via discard_current_line().
+         **/
+        auto
+        TkInputState::consume_current_line() -> span_type
+        {
+            /* copy first: discard_current_line() overwrites current_line_ */
+            span_type consumed = current_line_;
+
+            discard_current_line();
+
+            return consumed;
+        }
+
+        /** Forget the current line: null span, position 0, whitespace count 0. **/
+        void
+        TkInputState::discard_current_line()
+        {
+            whitespace_ = 0;
+            current_pos_ = 0;
+            current_line_ = span_type::make_null();
+        }
+
+        /** Remember the first line of @p input as the current line.
+         *
+         *  The captured line runs from input.lo() up to and including the
+         *  first '\n'; if @p input contains no '\n' the line runs to input.hi().
+         *
+         *  @param input     text to examine; only its first line is captured
+         *  @param eof_flag  when true, accept a final line with no trailing '\n'
+         *  @return pair {input_error::ok, captured line} on success
+         *          (also when input.lo() matches the line already stashed);
+         *          {input_error::incomplete, empty prefix of input} when no
+         *          '\n' is present and @p eof_flag is false. In that case
+         *          this state is left unchanged.
+         **/
+        auto
+        TkInputState::capture_current_line(const span_type & input,
+                                           bool eof_flag)
+            -> std::pair<input_error, span_type>
+        {
+            // see also discard_current_line()
+            // note: must capture entirety of first line,
+            //       for example including leading whitespace.
+            //       See discussion in tokenizer scan() method
+
+            scope log(XO_DEBUG(debug_flag_));
+
+            /* look ahead to {end of line, end of input}, whichever comes first */
+            const CharT * sol = input.lo();
+            const CharT * eol = sol;
+            const CharT * const input_end = input.hi();
+
+            if (sol == current_line_.lo()) {
+                log && log("short-circuit - current line already stashed");
+
+                /* nothing to do here */
+                return std::make_pair(input_error::ok, current_line_);
+            }
+
+            while ((eol < input_end) && (*eol != '\n'))
+                ++eol;
+
+            /* the scan stops either on '\n' or at end of input.
+             * must not dereference eol in the latter case:
+             * it points one past the last input char.
+             */
+            if (eol < input_end) {
+                /* include \n at end-of-line */
+                ++eol;
+            } else if (!eof_flag) {
+                /* caller expected to provide complete line of input. complain and ignore */
+                return std::make_pair(input_error::incomplete,
+                                      input.prefix(0ul));
+            }
+
+            this->current_line_ = span_type(sol, eol);
+            this->current_pos_ = 0;
+            this->whitespace_ = 0;
+
+            log && log(xtag("current_line", print::printspan(current_line_)),
+                       xtag("current_pos", current_pos_));
+
+            return std::make_pair(input_error::ok,
+                                  span_type(sol, eol));
+        }
+
+        /** Advance past whitespace starting at the current position,
+         *  stopping at the first non-whitespace char or at end of current line.
+         *
+         *  Side effects: whitespace_ is set to the number of chars skipped;
+         *  tk_start_ and current_pos_ are both set to the offset of the
+         *  stopping point within the current line.
+         *
+         *  @return pointer to the stopping point
+         **/
+        const CharT *
+        TkInputState::skip_leading_whitespace()
+        {
+            scope log(XO_DEBUG(debug_flag_));
+
+            const CharT * ix = current_line_.lo() + current_pos_;
+            const CharT * const line_end = current_line_.hi();
+
+            this->whitespace_ = 0;
+
+            /* bounds test comes first: *ix must not be read when ix has
+             * reached the end of the line
+             */
+            while ((ix != line_end) && is_whitespace(*ix)) {
+                ++ix;
+
+                ++(this->whitespace_);
+            }
+
+            this->tk_start_ = ix - current_line_.lo();
+            this->current_pos_ = ix - current_line_.lo();
+
+            return ix;
+        }
+
+
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end TkInputState.cpp */
--- a/xo-tokenizer2/src/tokenizer2/Token.cpp
+++ b/xo-tokenizer2/src/tokenizer2/Token.cpp
@ -0,0 +1,259 @@
+/** @file Token.cpp
+ *
+ *  author: Roland Conybeare
+ **/
+
+#include "Token.hpp"
+#include "xo/indentlog/print/tag.hpp"
+
+namespace xo {
+    namespace scm {
+
+        /** Interpret this token as a boolean literal.
+         *
+         *  @return true for text "true", false for text "false"
+         *  @throws std::runtime_error if token type is not tk_bool,
+         *          or if text is neither "true" nor "false"
+         **/
+        bool
+        Token::bool_value() const
+        {
+            if (tk_type_ != tokentype::tk_bool) {
+                throw std::runtime_error
+                    (tostr("token::bool_value",
+                           ": token with type tk found where tk_bool expected",
+                           xtag("tk", tk_type_)));
+            }
+
+            const bool is_true = (text_ == "true");
+
+            if (is_true || (text_ == "false")) {
+                return is_true;
+            }
+
+            throw std::runtime_error
+                (tostr("token::bool_value",
+                       ": unexpected input string tk_bool token",
+                       xtag("text", text_)));
+        }
+
+        /** Interpret this token as a signed 64-bit integer literal.
+         *
+         *  Accepted text: optional '+' or '-', followed by one or more
+         *  decimal digits.
+         *
+         *  @return the parsed value
+         *  @throws std::runtime_error if token type is not tk_i64, text is
+         *          empty, no digit follows the sign, a non-digit char
+         *          appears, or the value does not fit in std::int64_t
+         **/
+        std::int64_t
+        Token::i64_value() const
+        {
+            if (tk_type_ != tokentype::tk_i64) {
+                throw (std::runtime_error
+                       (tostr("token::i64_value",
+                              ": token with type tk found where tk_i64 expected",
+                              xtag("tk", tk_type_))));
+            }
+
+            if (text_.empty()) {
+                throw (std::runtime_error
+                       (tostr("token::i64_value",
+                              ": unexpected empty input string for tk_i64 token")));
+            }
+
+            auto ix = text_.begin();
+            const auto end_ix = text_.end();
+
+            bool negative = false;
+
+            if (*ix == '+') {
+                ++ix;
+            } else if (*ix == '-') {
+                negative = true;
+                ++ix;
+            }
+
+            if (ix == end_ix) {
+                throw (std::runtime_error
+                       (tostr("token::i64_value",
+                              ": input text found where at least one digit expected",
+                              xtag("text", text_))));
+            }
+
+            /* accumulate the magnitude in 64 unsigned bits, so every value
+             * representable in std::int64_t (including the most negative one)
+             * can be built up without signed overflow
+             */
+            constexpr std::uint64_t c_i64_max = (~std::uint64_t{0}) >> 1;
+            /* largest permitted magnitude: 2^63 - 1 if positive, 2^63 if negative */
+            const std::uint64_t limit = c_i64_max + (negative ? 1u : 0u);
+
+            std::uint64_t magnitude = 0;
+
+            for (; ix != end_ix; ++ix) {
+                const char ch = *ix;
+
+                if ((ch < '0') || (ch > '9')) {
+                    throw (std::runtime_error
+                           (tostr("token::i64_value",
+                                  ": unexpected char ch in integer token",
+                                  xtag("ch", ch))));
+                }
+
+                const std::uint64_t digit = static_cast<std::uint64_t>(ch - '0');
+
+                /* equivalent to: (10 * magnitude + digit > limit), without overflow */
+                if (magnitude > (limit - digit) / 10) {
+                    throw (std::runtime_error
+                           (tostr("token::i64_value",
+                                  ": integer literal does not fit in 64-bit signed integer",
+                                  xtag("text", text_))));
+                }
+
+                magnitude = (10 * magnitude) + digit;
+            }
+
+            if (!negative)
+                return static_cast<std::int64_t>(magnitude);
+
+            if (magnitude == 0)
+                return 0;
+
+            /* negate without overflowing when magnitude is 2^63 */
+            return -static_cast<std::int64_t>(magnitude - 1) - 1;
+        } /*i64_value*/
+
+        /** Interpret this token as a floating-point literal.
+         *
+         *  Text is read as: optional '+' or '-', then a mantissa made of
+         *  decimal digits with at most one '.', then optionally an exponent
+         *  marker 'e' or 'E' followed by an optional sign and at least one digit.
+         *
+         *  @return sign * mantissa * 10^(exp_sign * exponent - rh_digits)
+         *  @throws std::runtime_error if token type is not tk_f64, text is
+         *          empty, nothing follows the leading sign, the mantissa has
+         *          more than one '.', an unexpected char appears, or no digit
+         *          follows the exponent marker
+         *
+         *  NOTE(review): mantissa and exponent accumulate in fixed-width
+         *  integers with no overflow check; very long digit sequences are
+         *  not handled here.
+         **/
+        double
+        Token::f64_value() const
+        {
+            if (tk_type_ != tokentype::tk_f64) {
+                throw (std::runtime_error
+                       (tostr("token::f64_value",
+                              ": token with type tk found where tk_f64 expected",
+                              xtag("tk", tk_type_))));
+            }
+
+            if (text_.empty()) {
+                throw (std::runtime_error
+                       (tostr("token::f64_value",
+                              ": unexpected empty input string for tk_f64 token")));
+            }
+
+            int sign = 1;
+            /* integer representing denormalized unsigned mantissa
+             * (mantissa scaled by smallest power of 10 sufficient to make
+             *  it an integer)
+             */
+            std::int64_t mantissa = 0;
+            /* counts #of digits to the right of decimal point '.' */
+            int rh_digits = 0;
+            /* sign of exponent */
+            int exp_sign = 1;
+            /* value of exponent = integer to the right of 'e' or 'E' */
+            int exponent = 0;
+
+            /* floating-point value will represent
+             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
+             */
+            {
+                auto ix = text_.begin();
+                auto end_ix = text_.end();
+
+                char ch = *ix;
+
+                if (ch == '+') {
+                    ++ix;
+                } else if (ch == '-') {
+                    sign = -1;
+                    ++ix;
+                }
+
+                if (ix == end_ix) {
+                    throw (std::runtime_error
+                           (tostr("token::f64_value",
+                                  ": input text found where at least one digit expected",
+                                  xtag("text", text_))));
+                }
+
+                /* true iff decimal point '.' present in mantissa */
+                bool have_decimal_point = false;
+                /* counts number of digits in mantissa
+                 * (both before and after, but not including, any decimal point)
+                 */
+                int m_digits = 0;
+                /* digits to the left of decimal point */
+                int lh_digits = 0;
+
+                /* loop over mantissa digits */
+                for (; ix != end_ix; ++ix) {
+                    char ch = *ix;
+
+                    if (ch == '.') {
+                        if (have_decimal_point) {
+                            throw (std::runtime_error
+                                   (tostr("token::f64_value",
+                                          ": input text found where at most one decimal point expected",
+                                          xtag("text", text_))));
+                        }
+
+                        have_decimal_point = true;
+                        lh_digits = m_digits;
+                    } else if ((ch >= '0') && (ch <= '9')) {
+                        mantissa *= 10;
+                        mantissa += (ch - '0');
+                        ++m_digits;
+                    } else if (ch == 'e' || ch == 'E') {
+                        break; // done with mantissa
+                    } else {
+                        /* report under this function's own name */
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": unexpected char ch in floating-point token",
+                                      xtag("ch", ch))));
+                    }
+                }
+
+                if (have_decimal_point)
+                    rh_digits = m_digits - lh_digits;
+
+                if (ix != end_ix) {
+                    /* continue to read exponent */
+
+                    /* skip e|E */
+                    ++ix;
+
+                    /* optional exponent sign */
+                    if (ix != end_ix) {
+                        char ch = *ix;
+
+                        if (ch == '+') {
+                            ++ix; /*skip*/
+                        } else if (ch == '-') {
+                            exp_sign = -1;
+                            ++ix;
+                        }
+                    }
+
+                    /* need at least one exponent digit,
+                     * whether or not an exponent sign was present
+                     */
+                    if (ix == end_ix) {
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": on input text, expect at least one digit following exponent marker e|E",
+                                      xtag("text", text_))));
+                    }
+
+                    for (; ix != end_ix; ++ix) {
+                        char ch = *ix;
+
+                        if ((ch >= '0') && (ch <= '9')) {
+                            exponent *= 10;
+                            exponent += (ch - '0');
+                        } else {
+                            throw (std::runtime_error
+                                   (tostr("token::f64_value",
+                                          "; on input text, expect only digits following"
+                                          " (possibly signed) exponenct marker",
+                                          xtag("text", text_))));
+                        }
+                    }
+                }
+            }
+
+            /* floating-point value will represent
+             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
+             */
+
+            double mantissa_f64 = sign * mantissa;
+
+#ifdef OBSOLETE_DEBUG
+            std::cerr << xtag("text", text_)
+                      << xtag("rh_digits", rh_digits)
+                      << xtag("mantissa_f64", mantissa_f64)
+                      << xtag("exp_sign", exp_sign)
+                      << xtag("exponent", exponent)
+                      << std::endl;
+#endif
+
+            double retval = (mantissa_f64
+                             * detail::pow10((exp_sign * exponent)
+                                             - rh_digits));
+
+            return retval;
+        } /*f64_value*/
+
+        /** Write a description of this token to @p os, in the form
+         *  <token ...type... [...text...]>; the text tag is included
+         *  only when has_variable_text() is true.
+         **/
+        void
+        Token::print(std::ostream & os) const
+        {
+            os << "<token" << xtag("type", tk_type_);
+
+            if (has_variable_text()) {
+                os << xtag("text", text_);
+            }
+
+            os << ">";
+        } /*print*/
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end Token.cpp */
--- a/xo-tokenizer2/src/tokenizer2/Tokenizer.cpp
+++ b/xo-tokenizer2/src/tokenizer2/Tokenizer.cpp
@ -0,0 +1,836 @@
+/** @file Tokenizer.cpp
+ *
+ *  @author Roland Conybeare, Jul 2024
+ **/
+
+#include "Tokenizer.hpp"
+
+namespace xo {
+    namespace scm {
+        /** Construct a tokenizer; @p debug_flag is passed on to
+         *  initialize the input_state_ member.
+         **/
+        Tokenizer::Tokenizer(bool debug_flag)
+            : input_state_{debug_flag}
+        {}
+
+        /** Drop the line currently held by the input state
+         *  (delegates to input_state_.discard_current_line()).
+         **/
+        void
+        Tokenizer::discard_current_line()
+        {
+            input_state_.discard_current_line();
+        }
+
+        /** True iff @p ch is one of:  ( ) [ ] { } , ;
+         *
+         *  Chars deliberately reported as *not* 1-char punctuation:
+         *    <  can begin lessequal token
+         *    >  can begin greatequal token, and appears in tk_yields token
+         *    :  can begin assignment token
+         *    =  can begin comparison token '=='
+         *    !  can begin comparison token '!='
+         *    -  can appear inside f64 token (e.g. 1.23e-9); begins tk_yields token ->
+         *    +  can appear inside f64 token (e.g. 1.23e+4)
+         *    *  allowed in symbol
+         *    /  for symmetry with +,-
+         *    .  can appear inside f64 token (e.g. 1.23)
+         **/
+        bool
+        Tokenizer::is_1char_punctuation(CharT ch)
+        {
+            switch(ch) {
+            case '(': case ')':
+            case '[': case ']':
+            case '{': case '}':
+            case ',':
+            case ';':
+                return true;
+            default:
+                return false;
+            }
+        }
+
+        /** True iff @p ch can begin a two-character punctuation token:
+         *    <  (can begin <=)
+         *    >  (can begin >=)
+         *    :  (can begin :=)
+         *    =  (can begin ==)
+         *    !  (can begin !=)
+         *
+         *  '-' is intentionally absent, because of the way it appears in
+         *  numeric literals. Characters accepted here may not appear in
+         *  symbol names.
+         **/
+        bool
+        Tokenizer::is_2char_punctuation(CharT ch)
+        {
+            return ((ch == '<')
+                    || (ch == '>')
+                    || (ch == ':')
+                    || (ch == '=')
+                    || (ch == '!'));
+        }
+
+        auto
+        Tokenizer::assemble_token(std::size_t initial_whitespace,
+                                  const span_type & token_text,
+                                  input_state_type * p_input_state) -> result_type
+        {
+            /* literal|pretty|streamlined */
+            log_config::style = function_style::streamlined;
+
+            scope log(XO_DEBUG(p_input_state->debug_flag()));
+            log && log(xtag("token_text", token_text),
+                       xtag("initial_whitespace", initial_whitespace),
+                       xtag("input_state", *p_input_state));
+
+            tokentype tk_type = tokentype::tk_invalid;
+            std::string tk_text;
+
+            const CharT * tk_start = token_text.lo();
+            const CharT * tk_end = token_text.hi();
+
+            const CharT * ix = tk_start;
+
+            /* switch here applies to the first character in a token */
+            switch (*ix) {
+            case '-':
+            case '+':
+                if (token_text.size() == 1) {
+                    /* standalone '+' or '-' */
+                    if (*ix == '+')
+                        tk_type = tokentype::tk_plus;
+                    else if(*ix == '-')
+                        tk_type = tokentype::tk_minus;
+                }
+
+                /** fall through to numeric literal code below **/
+                [[fallthrough]];
+            case '.':
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                /* examples of valid floating-point numbers:
+                 *   .0
+                 *   1e0
+                 *   1e
+                 *   0.
+                 *   +1e0
+                 *   -1e0
+                 *   +1E+2
+                 *   -1E+2
+                 *   -0.123e-10
+                 * non-examples:
+                 *   .
+                 *   -
+                 *   +
+                 *   e0
+                 *   .e0
+                 *   -.e-0
+                 *   +.e+0
+                 *
+                 * in particular: to be recognized as a number,
+                 * must contain at least one digit
+                 */
+
+                log && log("possible number-token");
+
+                /* true if initial sign -/+ encountered */
+                bool sign_flag = false;
+                /* true if '.' encountered */
+                bool period_flag = false;
+                /* true if 'e' | 'E' encountered.
+                 */
+                bool exponent_flag = false;
+                /* true when sign '-' | '+' precedes exponenct digits */
+                bool exponent_sign_flag = false;
+                /* true when at least one digit follows exponent marker */
+                bool exponent_digit_flag = false;
+                /* true if at least one digit encountered */
+                bool number_flag = false;
+
+                log && log(xtag("*ix", *ix),
+                           xtag("tk.length", token_text.size()));
+                if (log && (ix + 1 < tk_end))
+                    log(xtag("*(ix+1)", *(ix + 1)));
+
+                if ((*ix == '-') && (ix + 2 == token_text.hi()) && (*(ix + 1) == '>')) {
+                    /* composing exactly '->' */
+                    tk_type = tokentype::tk_yields;
+                } else {
+                    /* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */
+                    for (; ix != token_text.hi(); ++ix) {
+                        if ((*ix == '-') || (*ix == '+')) {
+                            /* sign allowed:
+                             * 1. before period and before first digit
+                             * 2. after exponent
+                             */
+                            if (!period_flag && !number_flag && !sign_flag) {
+                                sign_flag = true;
+                            } else if (exponent_flag && !exponent_digit_flag) {
+                                exponent_sign_flag = true;
+                            } else {
+                                return result_type::make_error_consume_current_line
+                                    (__FUNCTION__ /*src_function*/,
+                                     "improperly placed sign indicator",
+                                     (ix - tk_start),
+                                     *p_input_state);
+                            }
+                        } else if (*ix == '.') {
+                            if (period_flag) {
+                                return result_type::make_error_consume_current_line
+                                    (__FUNCTION__ /*src_function*/,
+                                     "duplicate decimal point in numeric literal",
+                                     (ix - tk_start),
+                                     *p_input_state);
+                            }
+
+                            period_flag = true;
+                        } else if ((*ix == 'e') || (*ix == 'E')) {
+                            if (exponent_flag) {
+                                return result_type::make_error_consume_current_line
+                                    (__FUNCTION__ /*src_function*/,
+                                     "duplicate exponent marker in numeric literal",
+                                     (ix - tk_start),
+                                     *p_input_state);
+                            }
+
+                            exponent_flag = true;
+                        } else if (isdigit(*ix)) {
+                            if (exponent_flag) {
+                                /* need digit before exponent to recognize as number */
+                                exponent_digit_flag = true;
+                            } else {
+                                number_flag = true;
+                            }
+                        } else {
+                            return result_type::make_error_consume_current_line
+                                (__FUNCTION__ /*src_function*/,
+                                 "unexpected character in numeric constant" /*error_description*/,
+                                 (ix - tk_start),
+                                 *p_input_state);
+                        }
+                    }
+
+                    if (number_flag) {
+                        if (period_flag || exponent_flag) {
+                            tk_type = tokentype::tk_f64;
+                        } else {
+                            tk_type = tokentype::tk_i64;
+                        }
+                    } else if (period_flag && !exponent_flag) {
+                        tk_type = tokentype::tk_dot;
+                    } else {
+                        /* not a valid token */
+                    }
+
+                    log && log(xtag("sign_flag", sign_flag));
+                    log && log(xtag("period_flag", period_flag),
+                               xtag("exponent_flag", exponent_flag),
+                               xtag("exponent_sign_flag", exponent_sign_flag),
+                               xtag("number_flag", number_flag));
+                    log && log(xtag("tk_type", tk_type));
+                }
+
+                break;
+            }
+            case '*':
+                if (token_text.size() == 1) {
+                    /* standalone '*' */
+                    tk_type = tokentype::tk_star;
+                    ++ix;
+                } else {
+                    /* '*' isn't punctuation -- but may allow appearance in a longer token
+                     *
+                     * thinking that x*y is a symbol with an embedded '*' character;
+                     * in particular want to support kebab-case symbols like 'foo-config'
+                     */
+                }
+                break;
+            case '/':
+                if (token_text.size() == 1) {
+                    /* standalone '/' */
+                    tk_type = tokentype::tk_slash;
+                    ++ix;
+                }
+                break;
+            case '=':
+                log && log("singleassign or cmpeq token");
+
+                if (*(ix + 1) == '=') {
+                    tk_type = tokentype::tk_cmpeq;
+                    ++ix;
+                    ++ix;
+                } else {
+                    /* standalone '=' */
+                    tk_type = tokentype::tk_singleassign;
+                    ++ix;
+                }
+                break;
+            case '!':
+                if (*(ix + 1) == '=') {
+                    tk_type = tokentype::tk_cmpne;
+                    ++ix;
+                    ++ix;
+                } else {
+                    /* standlone '!' */
+
+                    // TODO
+                }
+                break;
+            case '"':
+            {
+                log && log("recognize string-token");
+
+                tk_type = tokentype::tk_string;
+
+                tk_text.reserve(token_text.hi() - token_text.lo());
+
+                ++ix; /*skip initial " char*/
+
+                /* true on final " */
+                bool endofstring = false;
+
+                for (; ix != token_text.hi(); ++ix) {
+                    log && log(xtag("*ix", *ix));
+
+                    switch(*ix) {
+                    case '"':
+                        endofstring = true;
+
+                        /* skip final " char, don't capture */
+                        ++ix;
+
+                        break;
+                    case '\\':
+                        /* skip escape char, don't capture */
+                        ++ix;
+
+                        if (ix == token_text.hi()) {
+                            return result_type::make_error_consume_current_line
+                                (__FUNCTION__ /*src_function*/,
+                                 "expecting key following escape character \\",
+                                 (ix - tk_start),
+                                 *p_input_state);
+                        }
+
+                        switch(*ix) {
+                        case '\\':
+                            log && log(xtag("*ix", *ix), xtag("escaped", "t"));
+                            tk_text.push_back(*ix);
+                            break;
+                        case 'n':
+                            log && log(xtag("*ix", *ix), xtag("newline", "t"));
+                            tk_text.push_back('\n');
+                            break;
+                        case 't':
+                            log && log(xtag("*ix", *ix), xtag("tab", "t"));
+                            tk_text.push_back('\t');
+                            break;
+                        case 'r':
+                            log && log(xtag("*ix", *ix), xtag("cr", "t"));
+                            tk_text.push_back('\r');
+                            break;
+                        case '"':
+                            log && log(xtag("*ix", *ix), xtag("quote", "t"));
+                            tk_text.push_back('"');
+                            break;
+                        default:
+                            return result_type::make_error_consume_current_line
+                                (__FUNCTION__ /*src_function*/,
+                                 "expecting one of n|r|\"|\\ following escape \\",
+                                 (ix - tk_start),
+                                 *p_input_state);
+                        }
+                        break;
+                    default:
+                        tk_text.push_back(*ix);
+                        break;
+                    }
+
+                    if (endofstring)
+                        break;
+                }
+
+                if (!endofstring) {
+                    return result_type::make_error_consume_current_line
+                        (__FUNCTION__ /*src_function*/,
+                         "missing terminating '\"' to complete literal string",
+                         (ix - tk_start),
+                         *p_input_state);
+                }
+
+                log && log(tostr("tokenizer::assemble_token",
+                                 xtag("tk_text", tk_text)));
+
+                break;
+            }
+            case 'a': case 'A':
+            case 'b': case 'B':
+            case 'c': case 'C':
+            case 'd': case 'D':
+            case 'e': case 'E':
+            case 'f': case 'F':
+            case 'g': case 'G':
+            case 'h': case 'H':
+            case 'i': case 'I':
+            case 'j': case 'J':
+            case 'k': case 'K':
+            case 'l': case 'L':
+            case 'm': case 'M':
+            case 'n': case 'N':
+            case 'o': case 'O':
+            case 'p': case 'P':
+            case 'q': case 'Q':
+            case 'r': case 'R':
+            case 's': case 'S':
+            case 't': case 'T':
+            case 'u': case 'U':
+            case 'v': case 'V':
+            case 'w': case 'W':
+            case 'x': case 'X':
+            case 'y': case 'Y':
+            case 'z': case 'Z':
+            {
+                /* symbol/identifier must begin with a letter?
+                 * we want to accept some other chars too.
+                 * specifically want to allow identifiers:
+                 *   this-is-the-way
+                 *   this+is+also+the+way
+                 *   how/much/is/that/doggy
+                 *   put*an*asterisk*in*that
+                 *   something%special%
+                 *
+                 * like pure lisp,  we don't allow:
+                 * - identifier beginning with digit
+                 * - period .
+                 *
+                 * unlike pure lisp,  we don't allow anywhere in a symbol:
+                 * - colon     :
+                 * - semicolon ;
+                 * - comma     ,
+                 *
+                 * also we don't allow symbols to begin with special chars
+                 */
+
+                tk_type = tokentype::tk_symbol;
+                break;
+            }
+            case '<':
+            {
+                log && log("leftangle or lessequal token");
+
+                if (*(ix + 1) == '=') {
+                    tk_type = tokentype::tk_lessequal;
+                    ++ix;
+                    ++ix;
+                } else {
+                    tk_type = tokentype::tk_leftangle;
+                    ++ix;
+                }
+                break;
+            }
+            case '>':
+            {
+                log && log("rightangle or greatequal token");
+
+                if (*(ix + 1) == '=') {
+                    tk_type = tokentype::tk_greatequal;
+                    ++ix;
+                    ++ix;
+                } else {
+                    tk_type = tokentype::tk_rightangle;
+                    ++ix;
+                }
+                break;
+            }
+            case '(':
+                tk_type = tokentype::tk_leftparen;
+                ++ix;
+                break;
+            case ')':
+                tk_type = tokentype::tk_rightparen;
+                ++ix;
+                break;
+            case '[':
+                tk_type = tokentype::tk_leftbracket;
+                ++ix;
+                break;
+            case ']':
+                tk_type = tokentype::tk_rightbracket;
+                ++ix;
+                break;
+            case '{':
+                tk_type = tokentype::tk_leftbrace;
+                ++ix;
+                break;
+            case '}':
+                tk_type = tokentype::tk_rightbrace;
+                ++ix;
+                break;
+            case ',':
+                tk_type = tokentype::tk_comma;
+                ++ix;
+                break;
+            case ';':
+                tk_type = tokentype::tk_semicolon;
+                ++ix;
+                break;
+            case ':':
+            {
+                log && log("colon or assignment token");
+
+                if (*(ix + 1) == '=') {
+                    tk_type = tokentype::tk_assign;
+                    ++ix;
+                    ++ix;
+                } else {
+                     tk_type = tokentype::tk_colon;
+                     ++ix;
+                }
+                break;
+            }
+            default:
+                break;
+            }
+
+            if (tk_type == tokentype::tk_invalid) {
+                return result_type::make_error_consume_current_line
+                    (__FUNCTION__ /*src_function*/,
+                     "illegal input character",
+                     (ix - tk_start),
+                     *p_input_state);
+            }
+
+            if ((tk_type == tokentype::tk_i64)
+                || (tk_type == tokentype::tk_f64)
+                || (tk_type == tokentype::tk_symbol))
+            {
+                /* note: capturing token text here;
+                 *       for numeric literals will re-parse in token::i64_value() / token::f64_value()
+                 */
+                tk_text = std::string(tk_start, tk_end);
+            } else if (tk_type == tokentype::tk_string) {
+                ; /* nothing to do here -- desired tk_text already constructed */
+            }
+
+            if (tk_type == tokentype::tk_symbol) {
+                /* check for keywords */
+
+                bool keep_text = false;
+
+                if ((tk_text == "true") || (tk_text == "false")) {
+                    tk_type = tokentype::tk_bool;
+                    keep_text = true;
+                } else if (tk_text == "type") {
+                    tk_type = tokentype::tk_type;
+                } else if (tk_text == "def") {
+                    tk_type = tokentype::tk_def;
+                } else if (tk_text == "lambda") {
+                    tk_type = tokentype::tk_lambda;
+                } else if (tk_text == "if") {
+                    tk_type = tokentype::tk_if;
+                } else if (tk_text == "then") {
+                    tk_type = tokentype::tk_then;
+                } else if (tk_text == "else") {
+                    tk_type = tokentype::tk_else;
+                } else if (tk_text == "let") {
+                    tk_type = tokentype::tk_let;
+                } else if (tk_text == "in") {
+                    tk_type = tokentype::tk_in;
+                } else if (tk_text == "end") {
+                    tk_type = tokentype::tk_end;
+                } else {
+                    /* keep as symbol */
+                    keep_text = true;
+                }
+
+                if (!keep_text)
+                    tk_text.clear();
+            }
+
+            /* input.prefix(0):
+             * require caller preserves current input line until it's entirely exhausted
+             */
+            return result_type(token_type(tk_type, std::move(tk_text)),
+                               p_input_state->current_line().prefix(0));
+        } /*assemble_token*/
+
+        /** Assemble a token from @p token_text with no preceding whitespace.
+         *
+         *  Thin wrapper: forwards to assemble_token() passing 0 for the
+         *  initial-whitespace argument.
+         *
+         *  @param token_text     span holding the text of the token to assemble
+         *  @param p_input_state  input state forwarded unchanged to assemble_token()
+         *  @return whatever assemble_token() returns for these arguments
+         **/
+        auto
+        Tokenizer::assemble_final_token(const span_type & token_text,
+                                        input_state_type * p_input_state) -> result_type
+        {
+            return assemble_token(0 /*initial_whitespace*/,
+                                  token_text,
+                                  p_input_state);
+        }
+
+        /** Locate the next token in @p input and hand it to assemble_token().
+         *
+         *  Captures @p input as the current line, skips leading whitespace,
+         *  then determines the token extent [tk_start, ix) before advancing
+         *  input_state_ past it.
+         *
+         *  @param input     span of input text to tokenize
+         *  @param eof_flag  forwarded to input_state_.capture_current_line()
+         *  @return
+         *   - make_whitespace(..) consuming the current line, when nothing but
+         *     whitespace remains
+         *   - make_error_consume_current_line(..) for a string literal that
+         *     contains a raw newline/CR, or that is not terminated
+         *   - otherwise the result of assemble_token() for the located span
+         **/
+        auto
+        Tokenizer::scan(const span_type & input,
+                        bool eof_flag) -> result_type
+        {
+            scope log(XO_DEBUG(input_state_.debug_flag()));
+
+            log && log(xtag("input", input));
+
+            /* - Always at beginning of token when scan() invoked
+             * - scan will not report any portion of line as consumed until it has
+             *   emitted all tokens in that line.
+             *   rationale: caller is allowed to discard storage that
+             *   scan() reports as consumed. But will be holding that line
+             *   until all tokens have been read.
+             * - this means caller will typically call scan()
+             *   with the same input span multiple times
+             */
+
+            /* automagically no-ops when the same input presented twice */
+            this->input_state_.capture_current_line(input, eof_flag);
+
+            const CharT * ix = this->input_state_.skip_leading_whitespace();
+
+            if(ix == input.hi()) {
+                log && log("end input -> consume current line");
+
+                /* entirety of current line has been tokenized
+                 *  -> caller may consume it
+                 */
+                return result_type::make_whitespace(this->input_state_.consume_current_line());
+            }
+
+            /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */
+
+            // TODO:
+            // 1. hoist complete_flag up here
+            // 2. use in each branch
+            // 3. common check for prefix-capturing after if-cascade below done
+
+            /* here: *ix is not whitespace */
+
+            auto whitespace_z = input_state_.whitespace();
+
+            log && log(xtag("whitespace_z", whitespace_z));
+
+            /* tk_start points to known beginning of token
+             * (after any whitespace)
+             *
+             * goal is to leave ix pointing to 1 char past the end of the token
+             */
+            const CharT * tk_start = ix;
+
+            if (is_1char_punctuation(*ix)) {
+                /* 1-character token */
+                ++ix;
+            } else if (is_2char_punctuation(*ix)) {
+                ++ix;
+
+                /* only look at the following char if there is one;
+                 * when input is exhausted the token is the single char
+                 */
+                if (ix != input.hi()) {
+                    const CharT ch2 = *ix;
+
+                    const bool ch2_alnum = (((ch2 >= '0') && (ch2 <= '9'))
+                                            || ((ch2 >= 'A') && (ch2 <= 'Z'))
+                                            || ((ch2 >= 'a') && (ch2 <= 'z')));
+
+                    /* alphanumeric follower: treat as 1 char punctuation.
+                     * anything else: include next char.
+                     *
+                     * NOTE(review): 'anything else' also covers whitespace and
+                     * other punctuation -- confirm that is intended
+                     */
+                    if (!ch2_alnum)
+                        ++ix;
+                }
+            } else if (*ix == '"') {
+                bool complete_flag = false;
+
+                /* 1. embedded space/tab allowed in string literal.
+                 * 2. embedded newline/cr not allowed.
+                 */
+
+                /* true when the preceding char was an unescaped backslash.
+                 * tracking escape state (instead of just the previous char)
+                 * lets an escaped backslash immediately before the closing
+                 * quote terminate the literal, e.g. "a\\"
+                 */
+                bool escaped = false;
+
+                ++ix;
+
+                for (; ix != input.hi(); ++ix) {
+                    const CharT ch = *ix;
+
+                    if ((ch == '\n') || (ch == '\r')) {
+                        log && log ("string literal with naked newline or CR");
+
+                        return result_type::make_error_consume_current_line
+                            (__FUNCTION__ /*src_function*/,
+                             "must use \\n or \\r to encode newline/cr in string literal",
+                             (ix - tk_start),
+                             this->input_state_);
+                    }
+
+                    if (escaped) {
+                        /* this char is consumed by the preceding backslash */
+                        escaped = false;
+                    } else if (ch == '\\') {
+                        escaped = true;
+                    } else if (ch == '"') {
+                        /* unescaped " char ends literal */
+                        ++ix;  /* include terminating " for assemble_token */
+                        complete_flag = true;
+                        break;
+                    }
+                }
+
+                if (!complete_flag) {
+                    log && log("unterminated string literal");
+
+                    return result_type::make_error_consume_current_line
+                               (__FUNCTION__ /*src_function*/,
+                                "unterminated string literal",
+                                (ix - tk_start),
+                                this->input_state_);
+                }
+            } else {
+                /* ix is start of some token */
+
+                if (*ix == '-') {
+                    /* this section load-bearing for input '->' scanning from beginning of token */
+                    ++ix;
+
+                    if (ix == input.hi()) {
+                        /* input exhausted right after '-':
+                         * scan loop below does not iterate, token is the single char
+                         */
+                    } else {
+                        CharT ch2 = *ix;
+
+                        if (ch2 == '>') {
+                            /* include next char and complete token */
+                            ++ix;
+
+                            log && log("complete '->' token");
+
+                            this->input_state_.advance_until(ix);
+
+                            return assemble_token(whitespace_z,
+                                                  span_type(tk_start, ix) /*token*/,
+                                                  &(this->input_state_));
+                        }
+
+                        /* here: -123, -.5e-21 for example */
+                    }
+                } else if (*ix == '>') {
+                    /* this section load-bearing for input '>=' scanning from beginning of token.
+                     * Need this because '>' necessarily excluded from is_1char_punctuation()
+                     */
+                    ++ix;
+
+                    if (ix == input.hi()) {
+                        /* input exhausted right after '>':
+                         * scan loop below does not iterate, token is the single char
+                         */
+                    } else {
+                        if (*ix == '=') {
+                            /* include next char and complete token */
+                            ++ix;
+
+                            log && log("complete '>=' token");
+                        } else {
+                            /* ignore next char and complete token */
+                            log && log("complete '>' token");
+                        }
+
+                        this->input_state_.advance_until(ix);
+
+                        return assemble_token(whitespace_z,
+                                              span_type(tk_start, ix) /*token*/,
+                                              &(this->input_state_));
+                    }
+                }
+
+                /* scan until:
+                 * - whitespace
+                 * - punctuation
+                 */
+                for (; ix != input.hi(); ++ix) {
+                    if (input_state_type::is_whitespace(*ix)
+                        || is_1char_punctuation(*ix)
+                        || is_2char_punctuation(*ix))
+                    {
+                        break;
+                    }
+
+                    /* this section load-bearing for input '>' after beginning of a token, e.g. p> */
+                    if ((ix > tk_start) && (*ix == '>'))
+                        break;
+
+                    /* this section load-bearing for input '->' at the end of another token, e.g. p->q */
+                    if (*ix == '-') {
+                        if (ix + 1 == input.hi()) {
+                            /* need more input to know if/when token complete
+                             *
+                             *   apple-banana   parses as: {tk_symbol: apple-banana}
+                             *   apple->        parses as: {tk_symbol: apple} {tk_yields}
+                             *   apple-         illegal (may not end symbol with '-')
+                             */
+                            break;
+                        }
+
+                        if (*(ix + 1) == '>') {
+                            /* treat '->' as punctuation;  complete preceding token */
+                            break;
+                        }
+                    }
+                }
+            }
+
+            log && log("assemble token z", xtag("token_z", ix - tk_start));
+
+            assert(tk_start < ix);
+
+            this->input_state_.advance_until(ix);
+
+            return assemble_token(whitespace_z,
+                                  span_type(tk_start, ix) /*token*/,
+                                  &(this->input_state_));
+        } /*scan*/
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end Tokenizer.cpp */
--- a/xo-tokenizer2/src/tokenizer2/TokenizerError.cpp
+++ b/xo-tokenizer2/src/tokenizer2/TokenizerError.cpp
@ -0,0 +1,60 @@
+/** @file TokenizerError.cpp
+ *
+ *  @author Roland Conybeare, Jun 2025
+ **/
+
+#include "TokenizerError.hpp"
+
+namespace xo {
+    namespace scm {
+
+        void
+        TokenizerError::print(std::ostream & os) const
+        {
+            os << "<tokenizer-error"
+               << xtag("src-function", src_function_)
+               << xtag("message", error_description_)
+               << xtag("input", input_state_.current_line())
+               << xtag("whitespace", input_state_.whitespace())
+               << xtag("error-pos", error_pos_)
+               << ">";
+        }
+
+        void
+        TokenizerError::report(std::ostream & os) const
+        {
+            using namespace std;
+
+            if (!error_description_.empty()) {
+                const char * prefix = "input: ";
+                /* input_state.tk_start:    position of first character in token
+                 * input_state.current_pos: position of first character following preceding token.
+                 * error_pos:               position (relative to start) at which failure detected
+                 */
+                const size_t tk_start = input_state_.tk_start();
+                const size_t tk_indent = (strlen(prefix) + tk_start);
+                const size_t error_pos = 1 + tk_start + error_pos_;
+
+                os << "token col: " << tk_start << ", error col: " << error_pos << "\n";
+                os << prefix;
+                for (const char *p = input_state_.current_line().lo(),
+                         *e = input_state_.current_line().hi(); p < e; ++p)
+                {
+                    os << *p;
+                }
+                //os << endl;
+                os << std::setw(tk_indent) << " ";
+
+                for (size_t i = 0; i < error_pos_; ++i) {
+                    os << '_';
+                }
+                os << '^' << endl;
+
+                os << error_description_ << endl;
+            }
+        }
+
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end TokenizerError.cpp */
--- a/xo-tokenizer2/src/tokenizer2/scan_result.cpp
+++ b/xo-tokenizer2/src/tokenizer2/scan_result.cpp
@ -0,0 +1,43 @@
+/** @file scan_result.cpp
+ *
+ *  @author Roland Conybeare, 2025
+ **/
+
+#include "scan_result.hpp"
+
+namespace xo {
+    namespace scm {
+        /** Build a result that carries no token, only consumed input.
+         *
+         *  @param whitespace_input  span reported as consumed
+         *  @return scan_result holding token_type::invalid() and @p whitespace_input
+         **/
+        scan_result
+        scan_result::make_whitespace(const span_type& whitespace_input)
+        {
+            return scan_result(token_type::invalid(), whitespace_input /*consumed*/);
+        }
+
+        /** Build a result that carries no token, reporting @p prefix_input as consumed.
+         *
+         *  Constructs the result exactly as make_whitespace() does
+         *  (token_type::invalid() plus a consumed span).
+         *
+         *  @param prefix_input  span reported as consumed
+         *  @return scan_result holding token_type::invalid() and @p prefix_input
+         **/
+        scan_result
+        scan_result::make_partial(const span_type& prefix_input)
+        {
+            return scan_result(token_type::invalid(), prefix_input /*consumed*/);
+        }
+
+        scan_result
+        scan_result::make_error_consume_current_line(const char * error_src,
+                                                     std::string error_msg,
+                                                     size_t error_pos,
+                                                     input_state_type & input_state_ref)
+        {
+            /* report+consume entire input line */
+
+            /* copy before altered by .consume_current_line() */
+            input_state_type input_state_copy = input_state_ref;
+
+            return scan_result(token_type::invalid(),
+                               input_state_ref.consume_current_line(),
+                               error_type(error_src,
+                                          error_msg,
+                                          input_state_copy,
+                                          error_pos));
+        }
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end scan_result.cpp */
--- a/xo-tokenizer2/src/tokenizer2/tokentype.cpp
+++ b/xo-tokenizer2/src/tokenizer2/tokentype.cpp
@ -0,0 +1,74 @@
+/* file tokentype.cpp
+ *
+ * author: Roland Conybeare
+ */
+
+#include "tokentype.hpp"
+
+namespace xo {
+    namespace scm {
+        /** Return a printable name for @p tk_type.
+         *
+         *  Each regular enumerator maps to STRINGIFY(<enumerator>)
+         *  (STRINGIFY is defined outside this file; presumably it yields the
+         *  enumerator's spelling -- confirm).  tk_invalid and n_tokentype map
+         *  to "?tokentype"; any value not matched by the switch maps to "???".
+         **/
+        char const *
+        tokentype_descr(tokentype tk_type)
+        {
+#define TOKENTYPE_DESCR_CASE(name) case tokentype::name: return STRINGIFY(name)
+
+            switch(tk_type) {
+                /* sentinels */
+            case tokentype::tk_invalid:
+            case tokentype::n_tokentype:
+                return "?tokentype";
+
+                /* literals + symbols */
+                TOKENTYPE_DESCR_CASE(tk_bool);
+                TOKENTYPE_DESCR_CASE(tk_i64);
+                TOKENTYPE_DESCR_CASE(tk_f64);
+                TOKENTYPE_DESCR_CASE(tk_string);
+                TOKENTYPE_DESCR_CASE(tk_symbol);
+
+                /* brackets */
+                TOKENTYPE_DESCR_CASE(tk_leftparen);
+                TOKENTYPE_DESCR_CASE(tk_rightparen);
+                TOKENTYPE_DESCR_CASE(tk_leftbracket);
+                TOKENTYPE_DESCR_CASE(tk_rightbracket);
+                TOKENTYPE_DESCR_CASE(tk_leftbrace);
+                TOKENTYPE_DESCR_CASE(tk_rightbrace);
+                TOKENTYPE_DESCR_CASE(tk_leftangle);
+                TOKENTYPE_DESCR_CASE(tk_rightangle);
+
+                /* punctuation + operators */
+                TOKENTYPE_DESCR_CASE(tk_lessequal);
+                TOKENTYPE_DESCR_CASE(tk_greatequal);
+                TOKENTYPE_DESCR_CASE(tk_dot);
+                TOKENTYPE_DESCR_CASE(tk_comma);
+                TOKENTYPE_DESCR_CASE(tk_colon);
+                TOKENTYPE_DESCR_CASE(tk_doublecolon);
+                TOKENTYPE_DESCR_CASE(tk_semicolon);
+                TOKENTYPE_DESCR_CASE(tk_singleassign);
+                TOKENTYPE_DESCR_CASE(tk_assign);
+                TOKENTYPE_DESCR_CASE(tk_yields);
+                TOKENTYPE_DESCR_CASE(tk_plus);
+                TOKENTYPE_DESCR_CASE(tk_minus);
+                TOKENTYPE_DESCR_CASE(tk_star);
+                TOKENTYPE_DESCR_CASE(tk_slash);
+                TOKENTYPE_DESCR_CASE(tk_cmpeq);
+                TOKENTYPE_DESCR_CASE(tk_cmpne);
+
+                /* keywords */
+                TOKENTYPE_DESCR_CASE(tk_type);
+                TOKENTYPE_DESCR_CASE(tk_def);
+                TOKENTYPE_DESCR_CASE(tk_lambda);
+                TOKENTYPE_DESCR_CASE(tk_if);
+                TOKENTYPE_DESCR_CASE(tk_then);
+                TOKENTYPE_DESCR_CASE(tk_else);
+                TOKENTYPE_DESCR_CASE(tk_let);
+                TOKENTYPE_DESCR_CASE(tk_in);
+                TOKENTYPE_DESCR_CASE(tk_end);
+            }
+
+#undef TOKENTYPE_DESCR_CASE
+
+            /* value outside the enumerators handled above */
+            return "???";
+        } /*tokentype_descr*/
+    } /*namespace scm*/
+} /*namespace xo*/
+
+/* end tokentype.cpp */