+ xo-tokenizer2 xo-reader2 xo-expression2 xo-interpreter2

2nd gen schematika interpreter using fomo
2026-01-10 12:39:09 -05:00 · 2026-01-10 12:39:09 -05:00 · b9921d4108
commit b9921d4108
parent 7abe73c903
23 changed files with 3293 additions and 1 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,37 @@
 # xo-tokenizer2/CMakeLists.txt
 #
 # Top-level build script for the xo_tokenizer2 project:
 # bootstraps the shared xo cmake macros, then adds the library and
 # example subdirectories and exports the project's cmake config.
 cmake_minimum_required(VERSION 3.10)
 project(xo_tokenizer2 VERSION 1.0)
 enable_language(CXX)
 include(GNUInstallDirs)
 # establishes CMAKE_MODULE_PATH (when not already supplied) and includes xo_macros/xo_cxx
 include(cmake/xo-bootstrap-macros.cmake)
 # presumably provided by the xo macros included just above -- TODO confirm
 xo_cxx_toplevel_options3()
 # ----------------------------------------------------------------
 # c++ settings
 # one-time project-specific c++ flags. usually empty
 set(PROJECT_CXX_FLAGS "")
 # NOTE(review): add_definitions() is directory-scoped, so any flags placed in
 # PROJECT_CXX_FLAGS apply to every target declared below this point
 add_definitions(${PROJECT_CXX_FLAGS})
 # ----------------------------------------------------------------
 # output targets
 add_subdirectory(src/tokenizer2)
 add_subdirectory(example)
 # unit tests are not wired into the build yet
 #add_subdirectory(utest)
 xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
 # the example repl executable is only declared (see example/tokenrepl/CMakeLists.txt)
 # when XO_ENABLE_EXAMPLES is set, so only install it under the same condition
 if (XO_ENABLE_EXAMPLES)
    install(TARGETS xo_tokenizer2_repl DESTINATION bin/xo/example/tokenizer2)
 endif()
 # ----------------------------------------------------------------
 # docs targets depends on all the other library/utest targets
 #
 #add_subdirectory(docs)
 # end CMakeLists.txt
--- a/README.md
+++ b/README.md
@ -1 +0,0 @@
 # xo-tokenizer2
--- a/cmake/xo-bootstrap-macros.cmake
+++ b/cmake/xo-bootstrap-macros.cmake
@ -0,0 +1,33 @@
 # ----------------------------------------------------------------
 # for example:
 #   $ PREFIX=/usr/local   # for example
 #   $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build
 #
 # will get
 #   CMAKE_MODULE_PATH
 # from xo-cmake-config --cmake-module-path
 #
 # and expect .cmake macros in
 #   CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake
 # ----------------------------------------------------------------
 # Locate the xo-cmake-config helper.
 # note: the REQUIRED keyword is only honored by cmake >= 3.18, so the
 #       result is checked explicitly below at the point where it is needed.
 find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED)
 if (NOT XO_SUBMODULE_BUILD)
    # CMAKE_MODULE_PATH not supplied (or left at the placeholder value "prefix"):
    # ask xo-cmake-config where the xo cmake macros were installed
    if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL "prefix"))
        if (NOT XO_CMAKE_CONFIG_EXECUTABLE)
            # FATAL_ERROR (not FATAL) is the message() mode that stops configuration
            message(FATAL_ERROR "could not find xo-cmake-config executable")
        endif()
        # default to typical install location for xo-project-macros.
        # strip trailing whitespace so a newline in the tool's output
        # cannot end up embedded in the module path
        execute_process(COMMAND "${XO_CMAKE_CONFIG_EXECUTABLE}" --cmake-module-path
                        OUTPUT_VARIABLE CMAKE_MODULE_PATH
                        OUTPUT_STRIP_TRAILING_WHITESPACE)
        message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}")
    endif()
 endif()
 # needs to have been installed somewhere on CMAKE_MODULE_PATH,
 # (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX)
 #
 include(xo_macros/xo_cxx)
 xo_cxx_bootstrap_message()
--- a/cmake/xo_tokenizer2Config.cmake.in
+++ b/cmake/xo_tokenizer2Config.cmake.in
@ -0,0 +1,12 @@
@PACKAGE_INIT@
 # Package-config template: @PROJECT_NAME@ placeholders are substituted
 # when this file is configured.
 include(CMakeFindDependencyMacro)
 # note: changes to find_dependency() calls here
 #       must coordinate with xo_dependency() calls
 #       in CMakeLists.txt
 #
 # no find_dependency() call is active at present
 #find_dependency(xo_flatstring)
 # pull in the exported targets file that sits next to this config file
 include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
 check_required_components("@PROJECT_NAME@")
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@ -0,0 +1 @@
 # xo-tokenizer2/example/CMakeLists.txt
 # each example program is built from its own subdirectory
 add_subdirectory(tokenrepl)
--- a/example/tokenrepl/CMakeLists.txt
+++ b/example/tokenrepl/CMakeLists.txt
@ -0,0 +1,15 @@
 # xo-tokenizer2/example/tokenrepl/CMakeLists.txt
 #
 # Declares the xo_tokenizer2_repl example executable.
 # Nothing is declared unless XO_ENABLE_EXAMPLES is set.
 set(SELF_EXE xo_tokenizer2_repl)
 set(SELF_SRCS tokenrepl.cpp)
 if (XO_ENABLE_EXAMPLES)
    # xo_add_executable / xo_self_dependency / xo_external_target_dependency
    # are not defined in this project -- presumably supplied by the shared
    # xo cmake macros; TODO confirm
    xo_add_executable(${SELF_EXE} ${SELF_SRCS})
    xo_self_dependency(${SELF_EXE} xo_tokenizer2)
    xo_external_target_dependency(${SELF_EXE} replxx replxx::replxx)
    find_package(Threads REQUIRED)   # replxx needs this
    target_link_libraries(${SELF_EXE} PUBLIC Threads::Threads)
 endif()
 # end CMakeLists.txt
--- a/example/tokenrepl/tokenrepl.cpp
+++ b/example/tokenrepl/tokenrepl.cpp
@ -0,0 +1,128 @@
 /** @file tokenrepl.cpp **/
 #include <xo/tokenizer2/Tokenizer.hpp>
 #include <xo/tokenizer2/Token.hpp>
 #include <xo/tokenizer2/tokentype.hpp>
 #include <xo/tokenizer2/span.hpp>
 #include <xo/indentlog/log_config.hpp>
 #include <replxx.hxx>
 #include <iostream>
 #include <unistd.h> // for isatty
 /** Read one line of input through replxx.
  *
  *  presumably replxx assumes input is a tty
  *
  *  @p interactive        when true, display a prompt
  *  @p parser_stack_size  selects the prompt: "> " when <= 1, otherwise ". "
  *  @p rx                 replxx instance used to read (and remember) input
  *  @p input              on success: the line read, with '\n' appended.
  *                        on end-of-input: cleared
  *
  *  @return true if a line was read; false on end-of-input
  *          (rx.input() returned nullptr)
  **/
 bool replxx_getline(bool interactive,
                     std::size_t parser_stack_size,
                     replxx::Replxx & rx,
                     std::string& input)
 {
    char const * prompt = "";
    if (interactive)
        prompt = (parser_stack_size <= 1) ? "> " : ". ";

    const char * input_cstr = rx.input(prompt);

    if (input_cstr == nullptr) {
        /* end of input: must not record the (stale) previous line in history,
         * nor keep growing it with extra newlines
         */
        input.clear();
        return false;
    }

    input = input_cstr;

    /* blank lines are not worth remembering */
    if (!input.empty())
        rx.history_add(input);

    // we want tokenizer to see newline, it's syntax
    input.push_back('\n');

    return true;
 }
 #ifdef OBSOLETE
 /** Plain-iostream line reader; only compiled when OBSOLETE is defined.
  *
  *  When @p interactive is true, writes the prompt "> " to @p out and
  *  flushes it.  Then reads one line from @p in into @p input.
  *
  *  @return true if the read succeeded, false otherwise (e.g. end of input)
  **/
 bool repl_getline(bool interactive,
                   std::istream & in,
                   std::ostream & out,
                   std::string & input)
 {
    if (interactive)
        out << "> " << std::flush;

    std::getline(in, input);

    /* same truth value as converting the stream to bool */
    return !in.fail();
 }
 #endif
 int
 main() {
    using xo::scm::Tokenizer;
    using xo::scm::span;
    using xo::scm::operator<<;
    using replxx::Replxx;
    using namespace std;
    using span_type = span<const char>;
    xo::log_config::min_log_level = xo::log_level::severe;
    bool interactive = isatty(STDIN_FILENO);
    Replxx rx;
    rx.set_max_history_size(1000);
    rx.history_load("repl_history.txt");
    Tokenizer tkz(xo::log_config::min_log_level <= xo::log_level::info);
    string input_str;
    size_t line_no = 1;
    constexpr std::size_t c_maxlines = 25;
    while (
        //repl_getline(interactive, cin, cout, input_str)  // once upon a time
        replxx_getline(interactive, 0 /*parser_stack_size*/, rx, input_str))
    {
        span_type input = span_type::from_string(input_str);
        //cout << "input: " << input << endl;
        // reminder: input may contain multiple tokens
        while (!input.empty()) {
            auto [tk, consumed, error] = tkz.scan(input, false /*!eof*/);
            if (tk.is_valid()) {
                cout << tk << endl;
            } else if (error.is_error()) {
                cout << "tokenizer error: " << endl;
                error.report(cout);
                break;
            }
            input = input.after_prefix(consumed);
        }
        /* here: input.empty() or error encountered */
        ++line_no;
        if (line_no > c_maxlines) {
            cout << "always exit after " << c_maxlines << " lines of input" << endl;
            break;
        }
    }
 }
 /** end tokenrepl.cpp */
--- a/include/xo/tokenizer2/.gitkeep
+++ b/include/xo/tokenizer2/.gitkeep
--- a/include/xo/tokenizer2/TkInputState.hpp
+++ b/include/xo/tokenizer2/TkInputState.hpp
@ -0,0 +1,230 @@
 /* @file TkInputState.hpp
 *
 * author: Roland Conybeare, Jun 2025
 */
 #pragma once
 #include "span.hpp"
 namespace xo {
    namespace scm {
        /** enum to report outcome of @ref TkInputState::capture_current_line **/
        enum class input_error {
            /** normal return, input line successfully identified and captured **/
            ok = 0,
            /** incomplete input; should not have been submitted
             *  to @ref TkInputState::capture_current_line.
             *  note: submit last line of input with eof_flag=true
             **/
            incomplete,
            /** NOTE(review): presumably a count-of-enumerators sentinel rather
             *  than a real outcome -- confirm
             **/
            N
        };
        /** @class TkInputState
         *  @brief Track detailed input position for use in error messages
         *
         *  input characters fall into two categories:
         *  - consumed: memory can be reclaimed/recycled
         *  - buffered: memory will be retained unaltered until consumed
         *
         *  remarks:
         *  - always in one of two states:
         *    - empty
         *    - contains exactly one line of input
         *  - also record current input position.
         *    Use this for example to identify where tokenizer rejected input.
         *  - .current_pos advances by one token
         *
         *  - buffered characters always form a single contiguous range.
         *  - TkInputState does not own any storage; storage is owned elsewhere
         *
         *  @text
         *
         *    <------------------.current_line------------------>
         *                                   >  <-- .whitespace
         *    cccccccccccccccccccccccccccccccc__TTTTTTTTxxxxxxxxx
         *    ^                                 ^                ^
         *    .current_line.lo                  |                .current_line.hi
         *                           .current_pos
         *
         *    <----prev_line----> <----current_line---->
         *                                   >  <--whitespace
         *    ppppppppppppppppppp cccccccccccc__TTTTTTTT
         *    ^
         *
         *  @endtext
         **/
        class TkInputState {
        public:
            /** @defgroup input-state-type-traits input-state type traits **/
            ///@{
            /** character type; fixed to char **/
            using CharT = char;
            /** type representing a contiguous span of tokenizer input characters **/
            using span_type = span<const CharT>;
            ///@}
        public:
            /** @defgroup input-state-ctors input_state constructors **/
            ///@{
            /** Create empty input state with all members at their defaults **/
            TkInputState() = default;
            /** Create empty input state, with logging controlled by @p debug_flag **/
            explicit TkInputState(bool debug_flag) : debug_flag_{debug_flag} {}
            /** Create instance with supplied @p current_line, @p current_pos, @p whitespace.
             *  Introduced for unit tests, not used in tokenizer.
             *  Members not listed (token start, debug flag) keep their defaults.
             **/
            explicit TkInputState(const span<const CharT>& current_line,
                                  size_t current_pos,
                                  size_t whitespace) : current_line_{current_line},
                                                       current_pos_{current_pos},
                                                       whitespace_{whitespace} {}
            ///@}
            /** @defgroup input-state-static-methods input_state static methods **/
            ///@{
            /** recognize the newline character '\n' **/
            static bool is_newline(CharT ch);
            /** identifies whitespace chars.
             *  These are chars that do not belong to any token.
             *  They are not permitted to appear within
             *  a symbol or string token.
             *  Appearance of a whitespace char forces completion of
             *  preceding token.
             **/
            static bool is_whitespace(CharT ch);
            ///@}
            /** @defgroup input-state-access-methods **/
            ///@{
            /* GCC's -Wchanges-meaning diagnostic is silenced for this accessor
             * (the suppression is skipped when __APPLE__ is defined)
             */
 #pragma GCC diagnostic push
 #ifndef __APPLE__
 #pragma GCC diagnostic ignored "-Wchanges-meaning"
 #endif
            /** span covering the remembered input line **/
            const span_type & current_line() const { return current_line_; }
 #pragma GCC diagnostic pop
            /** start of last token within current line **/
            size_t tk_start() const { return tk_start_; }
            /** input position within current line **/
            size_t current_pos() const { return current_pos_; }
            /** number of whitespace chars recorded by most recent whitespace skip **/
            size_t whitespace() const { return whitespace_; }
            /** true if input activity should be logged **/
            bool debug_flag() const { return debug_flag_; }
            ///@}
            /** @defgroup input-state-general-methods **/
            ///@{
            /** Input state less @p n chars.
             *  Use to recover input state before a complete but error-triggering token
             **/
            TkInputState rewind(std::size_t n) const;
            /** Capture prefix of @p input up to first newline.
             *  Set read position to start of line.
             *
             *  Alters:
             *    .current_line
             *    .current_pos
             *
             * Return pair comprising error code and input span representing first line
             * (including trailing newline) from @p input.
             **/
            std::pair<input_error, span_type> capture_current_line(const span_type & input,
                                                                   bool eof_flag);
            /** atomically return current line while discarding it from input state
             *
             *  Alters
             *    .current_line
             *    .current_pos
             *    .whitespace
             **/
            span_type consume_current_line();
            /** Reset input state for start of next line.
             *  Expression parser may use this to discard remainder of input line
             *  after a parsing error.
             *
             * Alters:
             *   .current_line
             *   .current_pos
             *   .whitespace
             **/
            void discard_current_line();
            /** Advance input position by @p z
             *
             *  Alters:
             *   .current_pos
             **/
            void advance(size_t z);
            /** Advance .current_pos to pos.
             *  Require: pos in @ref current_line_
             **/
            void advance_until(const CharT * pos);
            /** Skip prefix of input, starting at current read position,
             *  comprising only whitespace.
             *
             *  Presume input position is at end of token;
             *  on return @ref whitespace_ counts number of whitespace characters
             *  skipped.
             *
             *  Return pointer to first non-whitespace character after @ref current_pos_
             *  or @ref current_line_.hi if reached end of buffered line.
             *
             *  Alters:
             *    .whitespace
             **/
            const CharT * skip_leading_whitespace();
            ///@}
        private:
            /** @defgroup input-state-instance-vars input_state instance variables **/
            ///@{
            /** remember current input line.  Used only to report errors **/
            span<const CharT> current_line_ = span<const CharT>();
            /** start of last token within @ref current_line_ **/
            size_t tk_start_ = 0;
            /** input position within @ref current_line_ **/
            size_t current_pos_ = 0;
            /** number of whitespace chars since end of preceding token,
             *  or last newline, whichever is less
             **/
            size_t whitespace_ = 0;
            /** true to log input activity */
            bool debug_flag_ = false;
            ///@}
        }; /*TkInputState*/
        /** Print a tagged, human-readable rendering of input state @p x on @p os:
         *  token start, current position, text of current line (unquoted),
         *  and whitespace count.
         **/
        inline std::ostream &
        operator<<(std::ostream & os, const TkInputState & x)
        {
            using xo::print::unq;

            const auto & line = x.current_line();

            os << "<input_state";
            os << xtag("tk", x.tk_start());
            os << xtag("pos", x.current_pos());
            os << xtag("line", unq(std::string_view(line.lo(), line.hi())));
            os << xtag("whitespace", x.whitespace());
            os << ">";

            return os;
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end TkInputState.hpp */
--- a/include/xo/tokenizer2/Token.hpp
+++ b/include/xo/tokenizer2/Token.hpp
@ -0,0 +1,226 @@
 /* file Token.hpp
 *
 * author: Roland Conybeare, Jul 2024
 */
 #pragma once
 #include "tokentype.hpp"
 #include "xo/indentlog/print/tag.hpp"
 #include <stdexcept>
 #include <ostream>
 #include <string>
 #include <cstdint>
 namespace xo {
    namespace scm {
        namespace detail {
            /** Compute a * b^p by repeated squaring.
             *
             *  Invariant: the value (a * b^p) is the same at the top of every
             *  iteration.  For p <= 0 the loop does not run and @p a is
             *  returned unchanged.
             **/
            constexpr double
            pow_aux(double a, double b, int p) {
                for (; p > 0; ) {
                    if ((p & 1) != 0) {
                        /* odd exponent: a * b^(2q+1) = (a * b) * b^(2q) */
                        a = a * b;
                        --p;
                    } else {
                        /* even exponent: a * b^(2q) = a * (b^2)^q */
                        b *= b;
                        p >>= 1;
                    }
                }

                /* a * b^0 = a */
                return a;
            }

            /** Compute 10^p.  Negative @p p yields the reciprocal of 10^(-p) **/
            constexpr double
            pow10(int p) {
                return ((p >= 0)
                        ? pow_aux(1.0, 10.0, p)
                        : 1.0 / pow_aux(1.0, 10.0, -p));
            }
        }
        /** @class Token
         *  @brief Represent a Schematika lexical token
         *
         *  A token pairs a category (@ref tokentype) with the input text
         *  it was built from.  The text is only supplied by the factory
         *  methods that take a @c txt argument; all other factory methods
         *  leave it empty.
         **/
        class Token {
        public:
            /** @defgroup token-ctors token constructors **/
            ///@{
            /** default ctor creates token with type @c tk_invalid **/
            Token() = default;
            /** create token with type @c tk_type and input text @c text **/
            Token(tokentype tk_type, const std::string & text = "")
                : tk_type_{tk_type}, text_{text} {}
            /** create invalid token (same as null ctor, but explicit) **/
            static Token invalid() { return Token(); }
            /** Create token representing a boolean literal from text @p txt
             *  @p txt must be @c true or @c false
             **/
            static Token bool_token(const std::string & txt) {
                return Token(tokentype::tk_bool, txt);
            }
            /** Create token representing 64-bit signed integer literal parsed from decimal @p txt.
             *  The string @p txt must be a decimal integer literal, since @ref i64_value re-parses @p txt.
             **/
            static Token i64_token(const std::string & txt) {
                return Token(tokentype::tk_i64, txt);
            }
            /** create token representing 64-bit floating-point literal parsed from decimal @p txt
             *  The string @p txt must be a decimal floating-point literal, since @ref f64_value re-parses @p txt.
             **/
            static Token f64_token(const std::string & txt) {
                return Token(tokentype::tk_f64, txt);
            }
            /** create token representing literal string parsed from @p txt **/
            static Token string_token(const std::string & txt) {
                return Token(tokentype::tk_string, txt);
            }
            /** create token representing a symbol parsed from @p txt.
             *  Note that not all strings are valid symbol names.
             **/
            static Token symbol_token(const std::string & txt) {
                return Token(tokentype::tk_symbol, txt);
            }
            /** token representing left angle bracket @c "<" **/
            static Token leftangle() { return Token(tokentype::tk_leftangle); }
            /** token representing right angle bracket @c ">" **/
            static Token rightangle() { return Token(tokentype::tk_rightangle); }
            /** token representing left parenthesis @c "(" **/
            static Token leftparen() { return Token(tokentype::tk_leftparen); }
            /** Token representing right parenthesis @c ")" **/
            static Token rightparen() { return Token(tokentype::tk_rightparen); }
            /** token representing left bracket @c "[" **/
            static Token leftbracket() { return Token(tokentype::tk_leftbracket); }
            /** token representing right bracket @c "]" **/
            static Token rightbracket() { return Token(tokentype::tk_rightbracket); }
            /** token representing left brace @c "{" **/
            static Token leftbrace() { return Token(tokentype::tk_leftbrace); }
            /** token representing right brace @c "}" **/
            static Token rightbrace() { return Token(tokentype::tk_rightbrace); }
            /** token representing period @c "." **/
            static Token dot() { return Token(tokentype::tk_dot); }
            /** token representing comma @c "," **/
            static Token comma() { return Token(tokentype::tk_comma); }
            /** token representing colon @c ":" **/
            static Token colon() { return Token(tokentype::tk_colon); }
            /** token representing double-colon @c "::" **/
            static Token doublecolon() { return Token(tokentype::tk_doublecolon); }
            /** token representing semicolon @c ";" **/
            static Token semicolon() { return Token(tokentype::tk_semicolon); }
            /** token representing single-assignment @c "=" **/
            static Token singleassign() { return Token(tokentype::tk_singleassign); }
            /** token representing unrestricted assignment @c ":=" **/
            static Token assign_token() { return Token(tokentype::tk_assign); }
            /** token representing indirection @c "->" **/
            static Token yields() { return Token(tokentype::tk_yields); }
            /** token for @c "+" **/
            static Token plus_token() { return Token(tokentype::tk_plus); }
            /** token for @c "-" **/
            static Token minus_token() { return Token(tokentype::tk_minus); }
            /** token for @c "*" **/
            static Token star_token() { return Token(tokentype::tk_star); }
            /** token for @c "/" **/
            static Token slash_token() { return Token(tokentype::tk_slash); }
            /** token representing keyword @c type **/
            static Token type() { return Token(tokentype::tk_type); }
            /** token representing keyword @c def **/
            static Token def() { return Token(tokentype::tk_def); }
            /** token representing keyword @c lambda **/
            static Token lambda() { return Token(tokentype::tk_lambda); }
            /** token representing keyword @c if **/
            static Token if_token() { return Token(tokentype::tk_if); }
            /** token representing keyword @c else **/
            static Token else_token() { return Token(tokentype::tk_else); }
            /** token representing keyword @c let **/
            static Token let() { return Token(tokentype::tk_let); }
            /** token representing keyword @c in **/
            static Token in() { return Token(tokentype::tk_in); }
            /** token representing keyword @c end **/
            static Token end() { return Token(tokentype::tk_end); }
            ///@}
            /** @defgroup token-access-methods **/
            ///@{
            /** category of this token **/
            tokentype tk_type() const { return tk_type_; }
            /** input text stored with this token; empty unless supplied at construction **/
            const std::string & text() const { return text_; }
            ///@}
            /** @defgroup token-general-methods **/
            ///@{
            /** true if token understood to represent valid input
             *  i.e. any token type except @c tk_invalid
             **/
            bool is_valid() const { return tk_type_ != tokentype::tk_invalid; }
            /** true for sentinel token with type tk_invalid **/
            bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; }
            /** true for tokens with variable text.  false for those with fixed textual representation.
             *  NOTE(review): @c tk_bool is not counted here even though
             *  @ref bool_token stores its text -- confirm this is intended
             **/
            bool has_variable_text() const { return (tk_type_ == tokentype::tk_i64
                                                     || tk_type_ == tokentype::tk_f64
                                                     || tk_type_ == tokentype::tk_string
                                                     || tk_type_ == tokentype::tk_symbol); }
            /** expect input matching @c true or @c false **/
            bool bool_value() const;
            /** expect input matching @c [+|-][0-9][0-9]* **/
            std::int64_t i64_value() const;
            /** expect input matching @c [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]* **/
            double f64_value() const;
            /** print human-readable token representation on stream @p os **/
            void print(std::ostream & os) const;
            ///@}
        private:
            /** @defgroup token-instance-vars **/
            ///@{
            /** category for this token **/
            tokentype tk_type_ = tokentype::tk_invalid;
            /** characters comprising this token.
             *  only provided for certain token types:
             *
             *    tk_bool
             *    tk_i64
             *    tk_f64
             *    tk_string
             *    tk_symbol
             **/
            std::string text_;
            ///@}
        };
        /** Stream insertion for @ref Token.
         *  Delegates to Token::print, then returns @p os to allow chaining.
         **/
        inline std::ostream &
        operator<< (std::ostream & os, const Token & tk)
        {
            return (tk.print(os), os);
        }
    } /*namespace scm*/
 /* Register the token class with the xo pretty-printer.
  * The class declared in this header is the non-template xo::scm::Token;
  * there is no xo::scm::token<CharT> template here.
  *
  * NOTE(review): the guard tests a lowercase macro name (ppdetail_atomic);
  * presumably something defines it when this registration is unwanted -- confirm
  */
 #ifndef ppdetail_atomic
    namespace print {
        PPDETAIL_ATOMIC(xo::scm::Token);
    }
 #endif
 } /*namespace xo*/
 /* end Token.hpp */
--- a/include/xo/tokenizer2/Tokenizer.hpp
+++ b/include/xo/tokenizer2/Tokenizer.hpp
@ -0,0 +1,167 @@
 /* file Tokenizer.hpp
 *
 * author: Roland Conybeare, Jul 2024
 */
 #pragma once
 #include "Token.hpp"
 #include "TkInputState.hpp"
 #include "span.hpp"
 #include "scan_result.hpp"
 #include "xo/indentlog/scope.hpp"
 #include "xo/indentlog/print/ppdetail_atomic.hpp"
 #include <cassert>
 namespace xo {
    namespace scm {
        /** @class Tokenizer
         *  @brief Parse a Schematika character stream into lexical tokens
         *
         *  Use:
         *
         *  @code
         *    // see xo-tokenizer2/example/tokenrepl/tokenrepl.cpp
         *    // for exact working code
         *
         *    using span_type = Tokenizer::span_type;
         *
         *    Tokenizer tkz;
         *    span_type input = ...;
         *
         *    while (!input.empty()) {
         *        auto [tk, consumed, error] = tkz.scan(input, false);  // eof_flag=false
         *
         *        if (tk.is_valid()) {
         *            // do something with tk
         *        } else if (error.is_error()) {
         *            error.report(cout);
         *            break;
         *        }
         *
         *        input = input.after_prefix(consumed);
         *    }
         *
         *  @endcode
         *
         *  NOTE(review): an earlier form of this example finished with a
         *  notify_eof() call; no such method is declared on this class.
         *  Presumably the final token is obtained by calling scan() with
         *  eof_flag=true -- confirm.
         *
         * See tokentype.hpp for token types
         **/
        class Tokenizer {
        public:
            /** character type; fixed to char **/
            using CharT = char;
            using token_type = Token;
            using error_type = TokenizerError;
            /** contiguous span of input characters **/
            using span_type = span<const CharT>;
            using input_state_type = TkInputState;
            /** value returned by @ref scan and the assemble_* methods **/
            using result_type = scan_result;
        public:
            /** @defgroup tokenizer-ctors tokenizer constructors **/
            ///@{
            /** create tokenizer; @p debug_flag presumably enables logging -- see implementation **/
            Tokenizer(bool debug_flag = false);
            ///@}
            /** @defgroup tokenizer-access-methods tokenizer access methods **/
            ///@{
            /* GCC's -Wchanges-meaning diagnostic is silenced for this accessor
             * (the suppression is skipped when __APPLE__ is defined)
             */
 #pragma GCC diagnostic push
 #ifndef __APPLE__
 #pragma GCC diagnostic ignored "-Wchanges-meaning"
 #endif
            /** input position tracking, kept for error reporting **/
            const TkInputState & input_state() const { return input_state_; }
 #pragma GCC diagnostic pop
            ///@}
            /** @defgroup tokenizer-general-methods tokenizer methods **/
            ///@{
            /** identifies punctuation chars.
             *  These are chars that are not permitted to appear within
             *  a symbol token.  Instead they force completion of
             *  a preceding token,  and start a new token with themselves
             **/
            static bool is_1char_punctuation(CharT ch);
            /** more-relaxed version of is_1char_punctuation.
             *  Chars that are not permitted to appear within a symbol token,
             *  but may form token combined with next character
             **/
            static bool is_2char_punctuation(CharT ch);
            /** assemble token from text @p token_text.
             *  @p initial_whitespace   Amount of whitespace input being consumed from input.
             *  @p token_text subset of input_line representing a single token.
             *  @p p_input_state input state containing input_line.  On exit current line cleared
             *                   if error
             *
             *  retval.consumed will represent some possibly-empty prefix of @p input
             **/
            static scan_result assemble_token(std::size_t initial_whitespace,
                                              const span_type & token_text,
                                              TkInputState * p_input_state);
            /** degenerate version of assemble_token() on reaching end-of-file **/
            static scan_result assemble_final_token(const span_type & token_text,
                                                    TkInputState * p_input_state);
            /** true if tokenizer contains stored prefix of
             *  possibly-incomplete token
             **/
            bool has_prefix() const { return !prefix_.empty(); }
            /** scan for next input token,  given @p input.
             *  Note:
             *  - tokenizer can consume input (e.g. whitespace)
             *    without completing a token
             *  - input will remember the extent of the last line of input
             *    for which parsing has begun, but not completed.
             *    It's required that at least that portion of the input span
             *    remain valid across scan() calls
             *
             *  @p eof_flag  note: submit last line of input with eof_flag=true
             *
             *  @return {parsed token, consumed span, error}
             **/
            scan_result scan(const span_type & input,
                             bool eof_flag);
            /** discard current line after error.  Just cleans up error-reporting state **/
            void discard_current_line();
            ///@}
        private:
            /** @defgroup tokenizer-instance-vars tokenizer instance variables **/
            ///@{
            /** track input state (line#,pos,..) for error messages.
             *  There's an ordering problem here:
             *  1. input_state_.skip_leading_whitespace() advances
             *     current line automagically when it sees \n
             *  2. need to capture value of @ref input_state_ _before_ newline
             *  3. but need newline to end token
             *  Also recall input_state_type needed for reporting errors.
             **/
            input_state_type input_state_;
            /** Accumulate partial token here.
             *  This will happen if input sent to @ref Tokenizer::scan
             *  ends without whitespace such that last available token's
             *  extent is not determined
             **/
            std::string prefix_;
            ///@}
        }; /*Tokenizer*/
    } /*namespace scm*/
 } /*namespace xo*/
 /* end Tokenizer.hpp */
--- a/include/xo/tokenizer2/TokenizerError.hpp
+++ b/include/xo/tokenizer2/TokenizerError.hpp
@ -0,0 +1,114 @@
 /* file TokenizerError.hpp
 *
 * author: Roland Conybeare, Jun 2025
 */
 #pragma once
 #include "TkInputState.hpp"
 #include "tokentype.hpp"
 #include "span.hpp"
 #include <iomanip>
 namespace xo {
    namespace scm {
        /** @class TokenizerError
         *  @brief represent a lexing error, with context
         *
         *  Characters are plain @c char (see @ref CharT); this class is not a template.
         *
         *  A default-constructed instance is a not-an-error sentinel.
         *  Whether an instance counts as an error is decided solely by
         *  whether its error description is non-empty (see @ref is_error).
         **/
        class TokenizerError {
        public:
            using CharT = char;
            using span_type = span<const CharT>;
        public:
            /** @defgroup tokenizer-error-ctors **/
            ///@{
            /** Default ctor represents a not-an-error sentinel object **/
            TokenizerError() = default;
            /** Constructor to capture parsing error context
             *
             *  @p src_function       name of the function reporting the error.
             *                        Pointer is stored as-is (not copied).
             *  @p error_description  error message. Must be non-empty,
             *                        otherwise the new object reads as not-an-error.
             *  @p input_state        input state associated with the error; copied.
             *                        Its current_pos() is what @ref tk_start reports.
             *  @p error_pos          error location relative to token start
             **/
            TokenizerError(const char * src_function,
                           std::string error_description,
                           const TkInputState & input_state,
                           size_t error_pos)
                : src_function_{src_function},
                  error_description_{std::move(error_description)},
                  input_state_{input_state},
                  error_pos_{error_pos}
                {
                    scope log(XO_DEBUG(input_state.debug_flag()));
                    log && log(xtag("input_state.current_pos", input_state.current_pos()),
                               xtag("error_pos", error_pos));
                }
            ///@}
            /** @defgroup tokenizer-error-access-methods **/
            ///@{
            /** name of function that reported this error; null for sentinel **/
            const char * src_function() const { return src_function_; }
            /** error message; empty for sentinel **/
            const std::string & error_description() const { return error_description_; }
            /* -Wchanges-meaning only exists in GCC >= 13.
             * Naming it on any other compiler (clang, AppleClang, older GCC)
             * draws an unknown-warning-option diagnostic, so suppress it
             * only where it is actually known.
             */
 #pragma GCC diagnostic push
 #if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 13)
 #pragma GCC diagnostic ignored "-Wchanges-meaning"
 #endif
            /** input state captured when this error was constructed **/
            const TkInputState & input_state() const { return input_state_; }
 #pragma GCC diagnostic pop
            /** current position of the captured input state **/
            size_t tk_start() const { return input_state_.current_pos(); }
            /** whitespace count of the captured input state **/
            size_t whitespace() const { return input_state_.whitespace(); }
            /** error position, as supplied to constructor **/
            size_t error_pos() const { return error_pos_; }
            ///@}
            /** @defgroup tokenizer-error-general-methods **/
            ///@{
            /** true, except for a sentinel error object **/
            bool is_error() const { return !error_description_.empty(); }
            /** false except for object in sentinel state **/
            bool is_not_an_error() const { return error_description_.empty(); }
            /** Print representation to stream @p os. Intended for tokenizer diagnostics.
             *  For Schematika errors prefer @ref report
             **/
            void print(std::ostream & os) const;
            /** Print human-oriented error report on @p os. **/
            void report(std::ostream & os) const;
            ///@}
        private:
            /** @defgroup tokenizer-error-vars **/
            ///@{
            /** source location (in tokenizer) at which error identified **/
            char const * src_function_ = nullptr;
            /** static error description. Empty iff this is the not-an-error sentinel **/
            std::string error_description_;
            /** input state associated with this error.
             *  Sufficient to precisely locate it with context.
             **/
            TkInputState input_state_;
            /** position of error, relative to token start (see @ref tk_start) **/
            size_t error_pos_ = 0;
            ///@}
        }; /*TokenizerError*/
        /** Stream insertion for @ref TokenizerError.
         *  Delegates to @ref TokenizerError::print, then yields @p os
         *  so insertions can be chained.
         **/
        inline std::ostream &
        operator<< (std::ostream & os, const TokenizerError & tkerr)
        {
            tkerr.print(os);

            return os;
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end TokenizerError.hpp */
--- a/include/xo/tokenizer2/buffer.hpp
+++ b/include/xo/tokenizer2/buffer.hpp
@ -0,0 +1,328 @@
 /** @file buffer.hpp **/
 #pragma once
 #include "span.hpp"
 #include <cassert>
 #include <cstdint>
 #include <new>
 #include <string.h>   // explicit_bzero (glibc / BSD extension)
 #include <type_traits>
 #include <utility>
 namespace xo {
    namespace scm {
        /**
         * @class buffer buffer.hpp
         *
         * @brief Container for a (possibly owned) FIFO queue of chars
         *
         * @tparam CharT  buffer element type.
         *                Owned storage requires a trivially destructible type.
         *
         * @code
         *  .buf
         *
         *    +------------------------------------------+
         *    |  |  ...  |  | X|  ... | X|  |    ...  |  |
         *    +------------------------------------------+
         *     ^             ^            ^               ^
         *     0             .lo          .hi             .buf_z
         *
         *                   <-contents-><----avail----->
         * @endcode
         *
         * Buffer does not support wrapped content:
         * content that has not been consumed always occupies contiguous memory.
         *
         * Example:
         * @code
         * // 1.
         *   buffer<char> buf(64*1024);
         *   buf.empty() -> true
         *   buf.buf_z() -> 65536
         *   buf.lo_pos() -> 0
         *   buf.hi_pos() -> 0
         *   buf.contents() -> empty span
         *   buf.avail() -> span entire buffer memory
         *
         *   // write to (a prefix of) buf.avail()
         *   ::strncpy(buf.buf(), "hello, world\n", 13);
         *   buf.produce(span_type(buf.buf(), buf.buf() + 13));
         *
         *   buf.lo_pos() -> 0
         *   buf.hi_pos() -> 13
         *   buf.contents() -> "hello, world\n";
         *
         *
         *   // examine stored content (does not change buffer state)
         *   auto span = buf.contents();
         *   cerr << string_view(span.lo(), span.hi());  // "hello, world\n"
         *
         *   // consume (a prefix of) stored content
         *   buf.consume(span.prefix(7));
         *
         *   buf.lo_pos() -> 7
         *   buf.hi_pos() -> 13
         *   buf.contents() -> "world\n"
         *
         *   // consuming all remaining content resets to original state
         *   buf.consume(buf.contents());
         *
         *   buf.empty() -> true
         *   buf.hi_pos() -> 0     // not 13!
         *
         * // 2.
         *   buffer<char> buf;
         *   buf.empty() -> true
         *   buf.buf_z() -> 0
         *   buf.lo_pos() -> 0
         *   buf.hi_pos() -> 0
         *   buf.contents() -> empty span
         *   buf.avail() -> empty span
         *
         *   // allocate memory separately from ctor
         *   buf.alloc(64*1024);
         * @endcode
         **/
        template <typename CharT>
        class buffer {
        public:
            /** @brief typealias for span of CharT **/
            using span_type = span<CharT>;
            /** @brief typealias for buffer size (counts CharT's, not bytes) **/
            using size_type = std::uint64_t;
        public:
            /** @brief create empty buffer.
                Does not allocate any storage;  @see alloc
            **/
            buffer() = default;
            /** @brief create empty buffer,  and possibly allocate storage.
                @param buf_z    Buffer size.  allocate storage (owned by this buffer) if >0.
                @param align_z  Align to this value,  e.g. 8 to align storage on an 8-byte boundary.
                                Must be a power of two.  Values below @c alignof(CharT)
                                are raised to @c alignof(CharT).
            **/
            buffer(size_type buf_z,
                   size_type align_z = sizeof(char))
                : is_owner_{true},
                  buf_{allocate(buf_z, align_z)},
                  buf_z_{buf_z},
                  align_z_{effective_align(align_z)},
                  lo_pos_{0},
                  hi_pos_{0}
                {}
            /** @brief buffer is not copyable **/
            buffer(buffer const & x) = delete;
            /** @brief move constructor.  Takes over representation (and any owned storage) of @p x.
                @post @p x is in a valid, empty state with no storage.
            **/
            buffer(buffer && x) noexcept
                : is_owner_{x.is_owner_},
                  buf_{x.buf_},
                  buf_z_{x.buf_z_},
                  align_z_{x.align_z_},
                  lo_pos_{x.lo_pos_},
                  hi_pos_{x.hi_pos_}
                {
                    x.forget();
                }
            /** @brief destructor.  Release storage if owned **/
            ~buffer() { this->reset(); }
            /** @name Access methods **/
            ///@{
            /** @brief start of buffer memory **/
            CharT * buf() const { return buf_; }
            /** @brief buffer size (number of characters) **/
            size_type buf_z() const { return buf_z_; }
            /** @brief current start position within buffer **/
            size_type lo_pos() const { return lo_pos_; }
            /** @brief current end position within buffer **/
            size_type hi_pos() const { return hi_pos_; }
            ///@}
            /** @brief readonly access to a single buffer element.
                Relative to start of buffer (ignores current consume position).
                No bounds checking.
            **/
            CharT const & operator[](size_type i) const { return buf_[i]; }
            /** @brief return span for current buffer contents **/
            span_type contents() const { return span_type(buf_ + lo_pos_,
                                                          buf_ + hi_pos_); }
            /** @brief returns span for writable buffer memory (unused suffix following produce position) **/
            span_type avail() const { return span_type(buf_ + hi_pos_,
                                                       buf_ + buf_z_); }
            /** @brief @c true iff buffer is empty **/
            bool empty() const { return lo_pos_ == hi_pos_; }
            /**
               @brief update buffer produce position, after (independently) writing contents of span to it
               @pre left endpoint of @p span equals buffer produce position (@c .hi_pos)
               @pre right endpoint of @p span within bounds of buffer memory range
               @post right endpoint of @p span equals buffer produce position.
            **/
            void produce(span_type const & span) {
                assert(span.lo() == buf_ + hi_pos_);
                /* second precondition: must not run past end of buffer memory */
                assert(span.size() <= buf_z_ - hi_pos_);
                hi_pos_ += span.size();
            }
            /**
               @brief update buffer consume position,  when done with contents of span
               @pre left endpoint of @p span equals buffer consume position (@c .lo_pos)
               @pre right endpoint of @p span does not exceed buffer produce position (@c .hi_pos)
               @post Either
               buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0, or
               buffer is non-empty, right endpoint of @p span equals new buffer consume position.
            **/
            void consume(span_type const & span) {
                if (span.size()) {
                    assert(span.lo() == buf_ + lo_pos_);
                    /* cannot consume more than has been produced */
                    assert(span.size() <= hi_pos_ - lo_pos_);
                    lo_pos_ += span.size();
                } else {
                    /* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos,
                     * we don't want to blow up when called with an empty span -- argument
                     * may represent some pre-reset location in buffer
                     */
                }
                /* all content consumed: rewind so the whole buffer is available again */
                if (lo_pos_ == hi_pos_) {
                    lo_pos_ = 0;
                    hi_pos_ = 0;
                }
            }
            /**
               @brief allocate buffer with desired amount of memory
               Any existing state (including owned storage) is discarded first.
               @param buf_z     desired buffer size
               @param align_z   alignment;  buffer memory will be aligned on this byte-boundary.
                                Must be a power of two.  Values below @c alignof(CharT)
                                are raised to @c alignof(CharT).
            **/
            void alloc(size_type buf_z, size_type align_z = sizeof(char)) {
                /* properly reset (+ discard) any existing state */
                this->reset();
                /* if allocation throws, buffer stays in the reset (empty, unowned) state */
                buf_ = allocate(buf_z, align_z);
                is_owner_ = true;
                buf_z_ = buf_z;
                align_z_ = effective_align(align_z);
                lo_pos_ = 0;
                hi_pos_ = 0;
            }
            /**
               @brief attach buffer to (unowned)  range of @p buf_z elements starting at @p buf[0]
               Buffer is not responsible for managing storage.
               @post
               1. buffer is empty
               @post
               2. buffer read position = buffer write position = 0
            **/
            void setbuf(CharT * buf, size_type buf_z) {
                /* properly reset (+ discard) any existing state */
                this->reset();
                is_owner_ = false;
                lo_pos_ = 0;
                hi_pos_ = 0;
                buf_ = buf;
                buf_z_ = buf_z;
            }
            /**
               @brief revert buffer to empty state and possibly zero it
               @param zero_buffer_flag   Zero buffer contents iff this is true
               @post
               1. buffer is empty
               @post
               2. buffer read position = buffer write position = 0
            **/
            void clear2empty(bool zero_buffer_flag) {
                if (buf_ && zero_buffer_flag)
                    explicit_bzero(buf_, buf_z_ * sizeof(CharT));
                lo_pos_ = 0;
                hi_pos_ = 0;
            }
            /**
               @brief swap representation with another buffer instance.
            **/
            void swap (buffer & x) {
                std::swap(is_owner_, x.is_owner_);
                std::swap(buf_, x.buf_);
                std::swap(buf_z_, x.buf_z_);
                std::swap(align_z_, x.align_z_);
                std::swap(lo_pos_, x.lo_pos_);
                std::swap(hi_pos_, x.hi_pos_);
            }
            /**
               @brief reset buffer to an empty state and recover owned storage
            **/
            void reset() {
                if (is_owner_ && buf_) {
                    /* storage came from aligned operator new[] (see allocate()),
                     * so it must go back through the matching aligned operator delete[].
                     * Casts only strip cv-qualifiers so this compiles for const CharT.
                     */
                    ::operator delete[](const_cast<void *>(static_cast<const volatile void *>(buf_)),
                                        std::align_val_t(align_z_));
                }
                this->forget();
            }
            /**
               @brief move-assignment operator.
               Releases any storage currently owned by this buffer,
               then takes over representation of @p x.  Self-assignment is a no-op.
               @param x   right-hand-side to move from.
               @post
               @p x is in a valid, empty state with no storage.
            **/
            buffer & operator= (buffer && x) {
                if (this != &x) {
                    /* release our own storage first, otherwise it would leak */
                    this->reset();
                    is_owner_ = x.is_owner_;
                    buf_ = x.buf_;
                    buf_z_ = x.buf_z_;
                    align_z_ = x.align_z_;
                    lo_pos_ = x.lo_pos_;
                    hi_pos_ = x.hi_pos_;
                    x.forget();
                }
                return *this;
            }
            /** @brief buffer is not copy-assignable */
            buffer & operator= (buffer const & x) = delete;
        private:
            /** @brief alignment actually used for owned storage: at least @c alignof(CharT) **/
            static size_type effective_align(size_type align_z) {
                return (align_z < alignof(CharT)) ? size_type(alignof(CharT)) : align_z;
            }
            /** @brief allocate aligned storage for @p buf_z elements.  Returns nullptr when @p buf_z is 0.
                Storage must be released with aligned @c operator @c delete[] (see @ref reset).
            **/
            static CharT * allocate(size_type buf_z, size_type align_z) {
                /* reset() frees storage without running destructors (and assumes no array cookie) */
                static_assert(std::is_trivially_destructible_v<CharT>,
                              "buffer<CharT>: owned storage requires trivially destructible CharT");
                if (buf_z == 0)
                    return nullptr;
                size_type const align = effective_align(align_z);
                /* std::align_val_t requires a power of two */
                assert((align & (align - 1)) == 0);
                return new (std::align_val_t(align)) CharT [buf_z];
            }
            /** @brief drop representation without releasing storage.  Leaves buffer empty + unowned. **/
            void forget() noexcept {
                is_owner_ = false;
                buf_ = nullptr;
                buf_z_ = 0;
                align_z_ = alignof(CharT);
                lo_pos_ = 0;
                hi_pos_ = 0;
            }
        private:
            /** @brief true iff buffer is responsible for freeing storage at @c buf_ **/
            bool is_owner_ = false;
            /** @brief buffer contents.  buffer memory comprises @c buf_[0] to @c buf_[buf_z_ - 1] **/
            CharT * buf_ = nullptr;
            /** @brief buffer size (in units of CharT) **/
            size_type buf_z_ = 0;
            /** @brief alignment used when allocating owned storage; needed to release it correctly **/
            size_type align_z_ = alignof(CharT);
            /** @brief buffer read (consume) position
                @invariant
                0 <= lo_pos_ <= hi_pos_ <= buf_z_
            **/
            size_type lo_pos_ = 0;
            /** @brief buffer write (produce) position
                @invariant
                0 <= lo_pos_ <= hi_pos_ <= buf_z_
            **/
            size_type hi_pos_ = 0;
        };
        /** @brief Free-function @c swap for @c buffer<CharT>.
            Exchanges the representations of @p lhs and @p rhs
            by forwarding to the member @c buffer::swap.
        **/
        template <typename CharT>
        inline void
        swap(buffer<CharT> & lhs, buffer<CharT> & rhs)
        {
            lhs.swap(rhs);
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end buffer.hpp */
--- a/include/xo/tokenizer2/scan_result.hpp
+++ b/include/xo/tokenizer2/scan_result.hpp
@ -0,0 +1,81 @@
 /* file scan_result.hpp
 *
 * author: Roland Conybeare, Jun 2025
 */
 #pragma once
 #include "Token.hpp"
 #include "TokenizerError.hpp"
 #include "TkInputState.hpp"
 namespace xo {
    namespace scm {
        /** @class scan_result
         *  @brief Represent result of parsing one input token.
         *
         * @code
         *  Possible outcomes fall into several categories
         *  (with T: @c token_.is_valid(), E: @cerror_.is_error())
         *
         *  | T     | E     | description                         |
         *  |-------+-------+-------------------------------------|
         *  | false | false | end of input, including end of line |
         *  | true  | false | parsed token in T                   |
         *  | false | true  | parse error in E                    |
         *
         * @endcode
         **/
        class scan_result {
        public:
            using CharT = char;
            using token_type = Token;
            using span_type = span<const CharT>;
            using error_type = TokenizerError;
            using input_state_type = TkInputState;
        public:
            scan_result(const Token & token,
                        const span_type & consumed,
                        const TokenizerError & error = TokenizerError())
                : token_{token}, consumed_{consumed}, error_{error} {}
            static scan_result make_whitespace(const span_type & prefix_input);
            static scan_result make_partial(const span_type & prefix_input);
            /**
             *  @p error_src can be __FUNCTION__ from site where error generated.
             *  @p error_msg error message
             *  @p error_pos error position, relative to start of token
             *  @p input_state_ref input state object;
             *  copied into scan_result, and leaving input_state_ref.current_line cleared
             **/
            static scan_result make_error_consume_current_line(const char * error_src,
                                                               std::string error_msg,
                                                               size_t error_pos,
                                                               input_state_type & input_state_ref);
            bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); }
            bool is_token() const { return token_.is_valid(); }
            bool is_error() const { return error_.is_error(); }
            const Token & get_token() const { return token_; }
            const span_type & consumed() const { return consumed_; }
            const TokenizerError & error() const { return error_; }
        public:
            /** Successfully parsed token, whenever tk_type != tokentype::tk_invalid.
             *  Will be tokentype::tk_invalid in normal cause of events for valid input,
             *  when consuming whitespace
             **/
            token_type token_;
            /** input span represented by .token, on success. Otherwise not defined **/
            span_type consumed_;
            /** error description, whenever .error_.is_error() is true **/
            TokenizerError error_;
        };
    } /*namespace scm*/
 } /*namespace xo*/
 /* end scan_result.hpp */
--- a/include/xo/tokenizer2/span.hpp
+++ b/include/xo/tokenizer2/span.hpp
@ -0,0 +1,291 @@
 /** @file span.hpp **/
 #pragma once
 #include "xo/indentlog/scope.hpp"
 #include "xo/indentlog/print/ppdetail_atomic.hpp"
 #include <ostream>
 #include <cstdint>
 #include <cassert>
 namespace xo {
    namespace scm {
        /** @class span span.hpp
         *
         *  @brief A contiguous range of characters,  without ownership.
         *
         *  Represents the half-open memory range [lo, hi).
         *  A @e null span has both endpoints equal to nullptr;
         *  an @e empty span has lo == hi (so a null span is also empty).
         *
         *  @tparam CharT type for elements referred to by this span.
         **/
        template <typename CharT>
        class span {
        public:
            /** @defgroup span-type-traits span type traits **/
            ///@{
            /** typealias for span size (in units of CharT) **/
            using size_type = std::uint64_t;
            ///@}
        public:
            /** @defgroup span-ctors span constructors **/
            ///@{
            /** null span **/
            span() : lo_{nullptr}, hi_{nullptr} {}
            /** Create span for the contiguous memory range [@p lo, @p hi) **/
            span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
            /** implicit conversion from span<CharU>,
             *  available when CharU* converts to CharT*
             *  (for example span<char> -> span<const char>)
             **/
            template<typename CharU>
            span(const span<CharU> & other,
                 std::enable_if_t<std::is_convertible_v<CharU*, CharT*>
                 && !std::is_same_v<CharU, CharT>> * = nullptr)
                : lo_{other.lo()}, hi_{other.hi()} {}
            /** copy ctor (declared explicitly to avoid ambiguity with template ctor) **/
            span(const span & other) = default;
            span & operator=(const span & other) = default;
            /** Create a null span (i.e. with null @p lo, @p hi pointers)
             *  A null span can be concatenated with any other span
             *  without triggering matching-endpoint asserts.
             **/
            static span make_null() { return span(static_cast<CharT*>(nullptr), static_cast<CharT*>(nullptr)); }
            /** @brief create span for C-style string @p cstr
             *
             *  Span excludes the terminating null character.
             *  A null @p cstr yields a null span.
             *  Only usable when a @c const @c CharT* converts to @c CharT*,
             *  i.e. for const-qualified character spans.
             **/
            static span from_cstr(const CharT * cstr) {
                CharT * lo = cstr;
                CharT * hi = cstr ? cstr + strlen(cstr) : nullptr;
                return span(lo, hi);
            }
            /** @brief create span from std::string @p str
             *
             *  Span refers to storage owned by @p str; it is only valid
             *  while @p str is alive and unmodified.
             **/
            static span from_string(const std::string& str) {
                /* use data()/size(): dereferencing str.end() is undefined behavior */
                CharT * lo = str.data();
                CharT * hi = str.data() + str.size();
                return span(lo, hi);
            }
            /** @brief concatenate two contiguous spans
             *
             *  If either argument is null, returns the other one.
             *  Otherwise requires @c span1.hi() == @c span2.lo()
             *  (logged, then enforced by assert).
             */
            static span concat(const span & span1, const span & span2) {
                if (span1.is_null())
                    return span2;
                if (span2.is_null())
                    return span1;
                if (span1.hi() != span2.lo()) {
                    scope log(XO_DEBUG(true));
                    log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo()));
                }
                assert(span1.hi() == span2.lo());
                CharT * lo = span1.lo();
                CharT * hi = span2.hi();
                return span(lo, hi);
            }
            ///@}
            /** @defgroup span-access-methods **/
            ///@{
            CharT * lo() const { return lo_; } /* get member span::lo_ */
            CharT * hi() const { return hi_; } /* get member span::hi_ */
            ///@}
            /** @defgroup span-general-methods **/
            ///@{
            /** @brief strip prefix until first occurrence of '\n', including the newline.
             *  If span contains no newline, span becomes empty (lo == hi).
             **/
            void discard_until_newline() {
                /* iterate with CharT* (not const CharT*) so that assigning to lo_
                 * also compiles for spans over non-const characters
                 */
                for (CharT * p = lo_; p < hi_; ++p) {
                    if (*p == '\n') {
                        lo_ = p + 1;
                        return;
                    }
                }
                lo_ = hi_;
            }
            /** Create new span over supplied type,
             *  with identical (possibly misaligned) endpoints.
             *
             *  @warning
             *  1. New span uses exactly the same memory addresses.
             *     Endpoint pointers may not be aligned.
             *  2. Implementation assumes code compiled with
             *     @code -fno-strict-aliasing @endcode enabled.
             *
             *  @tparam OtherT element type for new span
             **/
            template <typename OtherT>
            span<OtherT>
            cast() const { return span<OtherT>(reinterpret_cast<OtherT *>(lo_),
                                               reinterpret_cast<OtherT *>(hi_)); }
            /** @brief create span including the first @p z members of this span.
             *  @pre @p z <= size().  Not checked.
             **/
            span prefix(size_type z) const { return span(lo_, lo_ + z); }
            /** @brief create span representing prefix up to (but not including) @p *p
             *
             *  @p p is clamped to this span: a position before @c lo()
             *  yields an empty prefix, a position after @c hi() yields
             *  the whole span.
             **/
            span prefix_upto(CharT * p) const {
                if (p < lo_)
                    return span(lo_, lo_);
                if (p <= hi_)
                    return span(lo_, p);
                else
                    return span(lo_, hi_);
            }
            /** @brief create span with first @p z members of this span removed.
             *  If @p z exceeds size(), result is empty.
             **/
            span after_prefix(size_type z) const {
                /* compare sizes, not pointers: lo_ + z may overflow for large z */
                if (z > size())
                    z = size();
                return span(lo_ + z, hi_);
            }
            /** @brief create span with @p prefix of this span removed
             *  @throws std::runtime_error if @p prefix is non-null
             *          and does not start where this span starts
             **/
            span after_prefix(const span & prefix) const {
                if (!prefix.is_null() && (prefix.lo() != lo_)) {
                    throw std::runtime_error
                        ("after_prefix: expected prefix of this span");
                }
                return after_prefix(prefix.size());
            }
            /** Create span starting with position @p p.
             *  Does boundary checking; will return empty span if @p p is outside @c [lo_,hi_]
             **/
            span suffix_from(CharT * p) const {
                if ((lo_ <= p) && (p <= hi_))
                    return span(p, hi_);
                else
                    return span(hi_, hi_);
            }
            /** true iff this span is null.  distinct from empty. **/
            bool is_null() const { return lo_ == nullptr && hi_ == nullptr; }
            /** true iff this span is empty (comprises 0 elements). **/
            bool empty() const { return lo_ == hi_; }
            /** report the number of elements (of type CharT) in this span. **/
            size_type size() const { return hi_ - lo_; }
            /** increase extent of this span to include @p x.
             *
             *  Same rules as @ref concat:
             *  - if this span is null, it becomes @p x
             *  - if @p x is null, this span is unchanged
             *  - otherwise requires @c hi() == @c x.lo() (enforced by assert)
             **/
            span & operator+=(const span & x) {
                if (this->is_null()) {
                    /* null span concatenates with anything */
                    *this = x;
                } else if (hi_ == x.lo_) {
                    hi_ = x.hi_;
                } else if (!x.is_null()) {
                    assert(false);
                }
                return *this;
            }
            /** print representation for this span on stream @p os **/
            void print(std::ostream & os) const {
                os << "<span"
                   << xtag("addr", (void*)lo_)
                   << xtag("size", size())
                   << " :text " << xo::print::quot(std::string_view(lo_, hi_))
                   << ">";
            }
            ///@}
        private:
            /** @defgroup span-instance-vars **/
            ///@{
            /** start of span.
                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
            **/
            CharT * lo_ = nullptr;
            /** @brief end of span.
                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
            **/
            CharT * hi_ = nullptr;
            ///@}
        }; /*span*/
        /** @defgroup span-operators **/
        ///@{
        /** Equality for spans.
         *  Spans compare equal only when they denote the same memory range,
         *  i.e. both @c lo() and @c hi() match.  Contents are not compared.
         **/
        template <typename CharT>
        inline bool
        operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
            if (lhs.lo() != rhs.lo())
                return false;

            return lhs.hi() == rhs.hi();
        }
        /** Inequality for spans.
         *  Spans differ when at least one of their endpoints differs.
         *  Contents are not compared.
         **/
        template <typename CharT>
        inline bool
        operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
            const bool same_lo = (lhs.lo() == rhs.lo());
            const bool same_hi = (lhs.hi() == rhs.hi());

            return !(same_lo && same_hi);
        }
        /** Stream insertion for spans; intended for diagnostics.
         *  Delegates to @c span::print, then yields @p os for chaining.
         **/
        template <typename CharT>
        inline std::ostream &
        operator<<(std::ostream & os, const span<CharT> & x)
        {
            x.print(os);

            return os;
        }
        ///@}
    } /*namespace scm*/
    namespace print {
        /** Wrapper that marks a span for printing as raw text
         *  (see the operator<< overload for printspan_impl)
         *  instead of the diagnostic form produced by span::print.
         **/
        template <typename CharT>
        class printspan_impl {
        public:
            /** wrap (a copy of) span @p x **/
            printspan_impl(xo::scm::span<CharT> x)
                : span_{x}
            {}

            /** span to print **/
            xo::scm::span<CharT> span_;
        };
        /** Stream-manipulator style helper:
         *  @code os << printspan(s) @endcode
         *  writes the characters of span @p span to @c os.
         **/
        template <typename CharT>
        printspan_impl<CharT>
        printspan(const xo::scm::span<CharT>& span)
        {
            printspan_impl<CharT> wrapper(span);

            return wrapper;
        }
        /** Write each element of the wrapped span to @p os, in order,
         *  using formatted insertion for a single @c CharT.
         **/
        template <typename CharT>
        inline std::ostream &
        operator<< (std::ostream & os, const printspan_impl<CharT> & x)
        {
            const CharT * cursor = x.span_.lo();

            while (cursor < x.span_.hi()) {
                os << *cursor;
                ++cursor;
            }

            return os;
        }
 #ifndef ppdetail_atomic
        /* NOTE(review): PPDETAIL_ATOMIC_BODY presumably comes from
         * xo/indentlog/print/ppdetail_atomic.hpp and declares these types
         * as atomic (single-line) for the pretty-printer -- confirm.
         */
        template <typename CharT>
        PPDETAIL_ATOMIC_BODY(printspan_impl<CharT>);

        template <typename CharT>
        PPDETAIL_ATOMIC_BODY(xo::scm::span<CharT>);
 #endif
    }
 } /*namespace xo*/
--- a/include/xo/tokenizer2/tokentype.hpp
+++ b/include/xo/tokenizer2/tokentype.hpp
@ -0,0 +1,192 @@
 /** @file tokentype.hpp
 *
 *  author: Roland Conybeare, Jul 2024
 **/
 #pragma once
 #include "xo/indentlog/print/tag.hpp" // for STRINGIFY
 #include "xo/indentlog/print/ppdetail_atomic.hpp"
 #include <ostream>
 namespace xo {
    namespace scm {
        /** @enum tokentype
         *  Enum to identify different Schematika input token types
         *
         *  Schematika code examples:
         *
         *  @code
         *    type point :: { xcoord : f64, ycoord : f64 };
         *    type matrix :: array<double, 2>;  // 2-d array
         *
         *    decl hypot(x : f64, y : f64) -> f64;
         *
         *    def hypot(x : f64, y : f64) {
         *      let
         *        x2 = (x * x);
         *        y2 = (y * y);
         *        hypot2 = (x2 + y2);
         *      in
         *        sqrt(hypot2);
         *    };
         *
         *    def someconst 4;
         *
         *    def foo(v : vec<i32>) {
         *      def (pi : f64) = 3.1415926;
         *      def (h : (f64,f64) -> f64) = hypot;
         *
         *      h = hypot3;
         *    };
         *
         *    def matrixproduct(x : matrix, y : matrix) {
         *      [i, j : x.row(i) * y.col(j)];
         *    };
         *  @endcode
         *
         *  NOTE(review): the example uses @c decl, but there is no
         *  dedicated token type for it below -- presumably it is
         *  scanned as @c tk_symbol; confirm.
         *
         *  Enumerators after @c tk_invalid are numbered consecutively
         *  from 0, so @c n_tokentype equals the number of valid token types.
         **/
        enum class tokentype {
            /** sentinel value **/
            tk_invalid = -1,
            /** a boolean constant **/
            tk_bool,
            /** an integer constant (signed 64-bit integer) **/
            tk_i64,
            /** a 64-bit floating-point constant **/
            tk_f64,
            /** a string literal **/
            tk_string,
            /** a symbol **/
            tk_symbol,
            /** left-hand parenthesis @c '(' **/
            tk_leftparen,
            /** right-hand parenthesis @c ')' **/
            tk_rightparen,
            /** left-hand bracket @c '[' **/
            tk_leftbracket,
            /** right-hand bracket @c ']' **/
            tk_rightbracket,
            /** left-hand brace @c '{' **/
            tk_leftbrace,
            /** right-hand brace @c '}' **/
            tk_rightbrace,
            /** left-hand angle bracket @c '<' **/
            tk_leftangle,
            /** right-hand angle bracket @c '>' **/
            tk_rightangle,
            /** less-or-equal @c '<=' **/
            tk_lessequal,
            /** greater-or-equal @c '>=' **/
            tk_greatequal,
            /** dot @c '.' **/
            tk_dot,
            /** comma @c ',' **/
            tk_comma,
            /** colon @c ':' **/
            tk_colon,
            /** double-colon @c '::' **/
            tk_doublecolon,
            /** semi-colon @c ';' **/
            tk_semicolon,
            /** single equals sign @c '=' **/
            tk_singleassign,
            /** assignment @c ':=' **/
            tk_assign,
            /** indirection @c '->' **/
            tk_yields,
            /* note: operators not treated as punctuation
             *  'do-always' is a legal variable name,
             *  as is 'maybe*2', 'maybe+1', 'path/to/foo'
             */
            /** operator @c '+' **/
            tk_plus,
            /** operator @c '-' **/
            tk_minus,
            /** operator @c '*' **/
            tk_star,
            /** operator @c '/' **/
            tk_slash,
            /** operator @c '==' **/
            tk_cmpeq,
            /** operator @c '!=' **/
            tk_cmpne,
            /** keyword @c 'type' **/
            tk_type,
            /** keyword @c 'def' **/
            tk_def,
            /** keyword @c 'lambda' **/
            tk_lambda,
            /** keyword @c 'if' **/
            tk_if,
            /** keyword @c 'then' **/
            tk_then,
            /** keyword @c 'else' **/
            tk_else,
            /** keyword @c 'let' **/
            tk_let,
            /** keyword @c 'in' **/
            tk_in,
            /** keyword @c 'end' **/
            tk_end,
            /** counts number of entries (not itself a token type) **/
            n_tokentype
        }; /*tokentype*/
        /** Describe enum value @p tk_type as a C string.
         *  For example @c tokentype_descr(tokentype::tk_if) -> @c "if"
         *
         *  Declared here; defined out-of-line.
         **/
        const char * tokentype_descr(tokentype tk_type);
        /** Stream insertion for @ref tokentype:
         *  writes @c tokentype_descr(tk_type) to @p os, and yields @p os.
         **/
        inline std::ostream &
        operator<< (std::ostream & os, tokentype tk_type)
        {
            return os << tokentype_descr(tk_type);
        }
    } /*namespace scm*/
 #ifndef ppdetail_atomic
    namespace print {
        /* NOTE(review): PPDETAIL_ATOMIC presumably comes from
         * xo/indentlog/print/ppdetail_atomic.hpp and declares tokentype
         * as atomic (single-line) for the pretty-printer -- confirm.
         */
        PPDETAIL_ATOMIC(xo::scm::tokentype);
    } /*namespace print*/
 #endif
 } /*namespace xo*/
 /* end tokentype.hpp */
--- a/src/tokenizer2/CMakeLists.txt
+++ b/src/tokenizer2/CMakeLists.txt
@ -0,0 +1,15 @@
 # tokenizer2/CMakeLists.txt
 # name of the library built from this directory
 set(SELF_LIB xo_tokenizer2)
 # translation units compiled into ${SELF_LIB}
 set(SELF_SRCS
     Tokenizer.cpp
     TokenizerError.cpp
     TkInputState.cpp
     scan_result.cpp
     Token.cpp
     tokentype.cpp
 )
 # xo_add_shared_library4() and xo_dependency() are project-provided commands
 # (not defined in this file).
 # NOTE(review): the literal '1' is presumably the shared-library so-version;
 # confirm against the xo macro definitions.
 xo_add_shared_library4(
     ${SELF_LIB}
     ${PROJECT_NAME}Targets
     ${PROJECT_VERSION}
     1
     ${SELF_SRCS}
 )
 xo_dependency(${SELF_LIB} indentlog)
 # end CMakeLists.txt
--- a/src/tokenizer2/TkInputState.cpp
+++ b/src/tokenizer2/TkInputState.cpp
@ -0,0 +1,151 @@
 /** @file TkInputState.cpp
 *
 *  @author Roland Conybeare, Jun 2025
 **/
 #include "TkInputState.hpp"
 namespace xo {
    namespace scm {
        using CharT = char;
        /** True iff @p ch is the line-feed character @c '\n'. **/
        bool
        TkInputState::is_newline(CharT ch) {
            return ('\n' == ch);
        }
        /** True iff @p ch is one of: space, horizontal tab,
         *  line feed, carriage return.
         **/
        bool
        TkInputState::is_whitespace(CharT ch) {
            return ((ch == ' ')
                    || (ch == '\t')
                    || (ch == '\n')
                    || (ch == '\r'));
        }
        /** Build a new input state over the same current line,
         *  positioned @p n characters before the current position.
         *  Position is clamped to 0 when @p n exceeds the current position.
         *  Third constructor argument (whitespace) is 0 in the result.
         **/
        TkInputState
        TkInputState::rewind(std::size_t n) const
        {
            /* clamp: never step back past start of line */
            const auto rewound_pos = (current_pos_ < n) ? 0 : current_pos_ - n;

            return TkInputState(current_line_, rewound_pos, 0 /*whitespace*/);
        }
        /** Move current position forward by @p z characters.
         *  No bounds check is performed against the current line.
         **/
        void
        TkInputState::advance(size_t z)
        {
            scope log(XO_DEBUG(debug_flag_));
            current_pos_ += z;
            if (log)
                log(xtag("z", z), xtag("current_pos", current_pos_));
        }
        void
        TkInputState::advance_until(const CharT * pos)
        {
            scope log(XO_DEBUG(debug_flag_));
            assert(current_line_.lo() <= pos && pos <= current_line_.hi());
            this->current_pos_ = pos - current_line_.lo();
            log && log(xtag("current_pos", current_pos_));
        }
        /** Return the span for the current line,
         *  then reset this state via @c discard_current_line().
         **/
        auto
        TkInputState::consume_current_line() -> span_type
        {
            /* copy first: discard_current_line() overwrites current_line_ */
            span_type consumed(current_line_);
            discard_current_line();
            return consumed;
        }
        void
        TkInputState::discard_current_line()
        {
            this->current_line_ = span_type::make_null();
            this->current_pos_ = 0;
            this->whitespace_ = 0;
        }
        /** Stash the first line of @p input as this state's current line.
         *
         *  The captured line runs from @c input.lo() up to and including the
         *  first @c '\n'; when no newline is present and @p eof_flag is true,
         *  it runs to @c input.hi().
         *
         *  @param input     unconsumed input text
         *  @param eof_flag  true if caller asserts no further input follows
         *  @return (input_error::ok, captured line) on success, including the
         *          short-circuit case where @p input begins at the line
         *          already stashed;
         *          (input_error::incomplete, empty prefix of @p input) when
         *          @p input holds no newline and @p eof_flag is false.
         *          In the incomplete case this state is left unchanged.
         **/
        auto
        TkInputState::capture_current_line(const span_type & input,
                                           bool eof_flag)
            -> std::pair<input_error, span_type>
        {
            // see also discard_current_line()
            // note: must capture entirety of first line,
            //       for example including leading whitespace.
            //       See discussion in tokenizer scan() method
            scope log(XO_DEBUG(debug_flag_));
            /* look ahead to {end of line, end of input}, whichever comes first */
            const CharT * sol = input.lo();
            const CharT * eol = sol;
            if (sol == current_line_.lo()) {
                log && log("short-circuit - current line already stashed");
                /* nothing to do here */
                return std::make_pair(input_error::ok, current_line_);
            }
            while ((eol < input.hi()) && (*eol != '\n'))
                ++eol;
            /* loop stops either at end of input or on a newline.
             * Must not dereference eol when it equals input.hi():
             * that position is one past the last valid character.
             */
            const bool found_newline = (eol < input.hi());
            if (found_newline) {
                /* include \n at end-of-line */
                ++eol;
            } else if (!eof_flag) {
                /* caller expected to provide complete line of input. complain and ignore */
                return std::make_pair(input_error::incomplete,
                                      input.prefix(0ul));
            }
            this->current_line_ = span_type(sol, eol);
            this->current_pos_ = 0;
            this->whitespace_ = 0;
            log && log(xtag("current_line", print::printspan(current_line_)),
                       xtag("current_pos", current_pos_));
            return std::make_pair(input_error::ok,
                                  span_type(sol, eol));
        }
        /** Advance past whitespace starting at the current position.
         *
         *  Resets the whitespace counter, counts each whitespace character
         *  skipped, then records the stopping offset as both token start and
         *  current position.
         *
         *  @return pointer to first non-whitespace character at or after the
         *          current position, or end of current line if none.
         **/
        const CharT *
        TkInputState::skip_leading_whitespace()
        {
            scope log(XO_DEBUG(debug_flag_));
            const CharT * ix = current_line_.lo() + current_pos_;
            const CharT * const line_hi = current_line_.hi();
            this->whitespace_ = 0;
            /* skip whitespace + remember beginning of most recent line.
             * Bounds test must come first: *ix is not readable at line_hi
             * (and the line may be the null span after discard_current_line())
             */
            while ((ix < line_hi) && is_whitespace(*ix)) {
                ++ix;
                ++(this->whitespace_);
            }
            this->tk_start_ = ix - current_line_.lo();
            this->current_pos_ = ix - current_line_.lo();
            return ix;
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end TkInputState.cpp */
--- a/src/tokenizer2/Token.cpp
+++ b/src/tokenizer2/Token.cpp
@ -0,0 +1,259 @@
 /** @file Token.cpp
 *
 *  author: Roland Conybeare
 **/
 #include "Token.hpp"
 #include "xo/indentlog/print/tag.hpp"
 namespace xo {
    namespace scm {
        /** Interpret this token as a boolean literal.
         *
         *  @return true for text "true", false for text "false"
         *  @throws std::runtime_error if token type is not tk_bool,
         *          or if text is neither "true" nor "false"
         **/
        bool
        Token::bool_value() const
        {
            if (tk_type_ == tokentype::tk_bool) {
                if (text_ == "true")
                    return true;
                if (text_ == "false")
                    return false;

                /* tk_bool token, but text isn't a recognized literal */
                throw (std::runtime_error
                       (tostr("token::bool_value",
                              ": unexpected input string tk_bool token",
                              xtag("text", text_))));
            }

            /* wrong token type */
            throw (std::runtime_error
                   (tostr("token::bool_value",
                          ": token with type tk found where tk_bool expected",
                          xtag("tk", tk_type_))));
        }
        /** Parse this token's text as a signed decimal integer.
         *
         *  Accepted form: optional leading '+' or '-', followed by one or
         *  more decimal digits.
         *
         *  @return parsed value
         *  @throws std::runtime_error if token type is not tk_i64,
         *          if text is empty, has no digits, contains a non-digit,
         *          or if the magnitude exceeds INT64_MAX.
         **/
        std::int64_t
        Token::i64_value() const
        {
            if (tk_type_ != tokentype::tk_i64) {
                throw (std::runtime_error
                       (tostr("token::i64_value",
                              ": token with type tk found where tk_i64 expected",
                              xtag("tk", tk_type_))));
            }
            if (text_.empty()) {
                throw (std::runtime_error
                       (tostr("token::i64_value",
                              ": unexpected empty input string for tk_i64 token")));
            }
            /* accumulate in 64 bits: return type is std::int64_t,
             * so a plain int accumulator would overflow on large literals
             */
            std::int64_t sign = 1;
            std::int64_t value = 0;
            {
                auto ix = text_.begin();
                auto end_ix = text_.end();
                char first_ch = *ix;
                if (first_ch == '+') {
                    ++ix;
                } else if (first_ch == '-') {
                    sign = -1;
                    ++ix;
                }
                if (ix == end_ix) {
                    throw (std::runtime_error
                           (tostr("token::i64_value",
                                  ": input text found where at least one digit expected",
                                  xtag("text", text_))));
                }
                for (; ix != end_ix; ++ix) {
                    char ch = *ix;
                    if ((ch < '0') || (ch > '9')) {
                        throw (std::runtime_error
                               (tostr("token::i64_value",
                                      ": unexpected char ch in integer token",
                                      xtag("ch", ch))));
                    }
                    const std::int64_t digit = (ch - '0');
                    /* refuse to overflow: signed overflow is undefined behavior.
                     * (10*value + digit <= INT64_MAX) iff (value <= (INT64_MAX - digit) / 10)
                     */
                    if (value > (INT64_MAX - digit) / 10) {
                        throw (std::runtime_error
                               (tostr("token::i64_value",
                                      ": integer literal does not fit in 64-bit signed integer",
                                      xtag("text", text_))));
                    }
                    value = (10 * value) + digit;
                }
            }
            return sign * value;
        } /*i64_value*/
        /** Parse this token's text as a decimal floating-point literal.
         *
         *  Accepted form: optional sign, mantissa digits with at most one
         *  decimal point, then optionally an exponent marker e|E followed by
         *  an optional sign and at least one digit.
         *
         *  Result is computed as
         *    sign * mantissa * 10^(exp_sign*exponent - rh_digits)
         *  using @c detail::pow10.
         *
         *  NOTE(review): mantissa is accumulated in a std::int64_t and the
         *  exponent in an int, without overflow checks.
         *
         *  @return parsed value
         *  @throws std::runtime_error if token type is not tk_f64, or if text
         *          is empty or malformed.
         **/
        double
        Token::f64_value() const
        {
            if (tk_type_ != tokentype::tk_f64) {
                throw (std::runtime_error
                       (tostr("token::f64_value",
                              ": token with type tk found where tk_f64 expected",
                              xtag("tk", tk_type_))));
            }
            if (text_.empty()) {
                throw (std::runtime_error
                       (tostr("token::f64_value",
                              ": unexpected empty input string for tk_f64 token")));
            }
            int sign = 1;
            /* integer representing denormalized unsigned mantissa
             * (mantissa scaled by smallest power of 10 sufficient to make
             *  it an integer)
             */
            std::int64_t mantissa = 0;
            /* counts #of digits to the right of decimal point '.' */
            int rh_digits = 0;
            /* sign of exponent */
            int exp_sign = 1;
            /* value of exponent = integer to the right of 'e' or 'E' */
            int exponent = 0;
            /* floating-point value will represent
             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
             */
            {
                auto ix = text_.begin();
                auto end_ix = text_.end();
                char first_ch = *ix;
                if (first_ch == '+') {
                    ++ix;
                } else if (first_ch == '-') {
                    sign = -1;
                    ++ix;
                }
                if (ix == end_ix) {
                    throw (std::runtime_error
                           (tostr("token::f64_value",
                                  ": input text found where at least one digit expected",
                                  xtag("text", text_))));
                }
                /* true iff decimal point '.' present in mantissa */
                bool have_decimal_point = false;
                /* counts number of digits in mantissa
                 * (both before and after, but not including, any decimal point)
                 */
                int m_digits = 0;
                /* digits to the left of decimal point */
                int lh_digits = 0;
                /* loop over mantissa digits */
                for (; ix != end_ix; ++ix) {
                    char ch = *ix;
                    if (ch == '.') {
                        if (have_decimal_point) {
                            throw (std::runtime_error
                                   (tostr("token::f64_value",
                                          ": input text found where at most one decimal point expected",
                                          xtag("text", text_))));
                        }
                        have_decimal_point = true;
                        lh_digits = m_digits;
                    } else if ((ch >= '0') && (ch <= '9')) {
                        mantissa *= 10;
                        mantissa += (ch - '0');
                        ++m_digits;
                    } else if (ch == 'e' || ch == 'E') {
                        break; // done with mantissa
                    } else {
                        /* report against this function and token kind */
                        throw (std::runtime_error
                               (tostr("token::f64_value",
                                      ": unexpected char ch in floating-point token",
                                      xtag("ch", ch))));
                    }
                }
                if (have_decimal_point)
                    rh_digits = m_digits - lh_digits;
                if (ix != end_ix) {
                    /* continue to read exponent */
                    /* skip e|E */
                    ++ix;
                    /* optional exponent sign */
                    if (ix != end_ix) {
                        char sign_ch = *ix;
                        if (sign_ch == '+') {
                            ++ix; /*skip*/
                        } else if (sign_ch == '-') {
                            exp_sign = -1;
                            ++ix;
                        }
                    }
                    /* need a digit after marker, whether or not a sign was given
                     * (rejects both "1e" and "1e+")
                     */
                    if (ix == end_ix) {
                        throw (std::runtime_error
                               (tostr("token::f64_value",
                                      ": on input text, expect at least one digit following exponent marker e|E",
                                      xtag("text", text_))));
                    }
                    for (; ix != end_ix; ++ix) {
                        char ch = *ix;
                        if ((ch >= '0') && (ch <= '9')) {
                            exponent *= 10;
                            exponent += (ch - '0');
                        } else {
                            throw (std::runtime_error
                                   (tostr("token::f64_value",
                                          "; on input text, expect only digits following"
                                          " (possibly signed) exponenct marker",
                                          xtag("text", text_))));
                        }
                    }
                }
            }
            /* floating-point value will represent
             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
             */
            double mantissa_f64 = sign * mantissa;
 #ifdef OBSOLETE_DEBUG
            std::cerr << xtag("text", text_)
                      << xtag("rh_digits", rh_digits)
                      << xtag("mantissa_f64", mantissa_f64)
                      << xtag("exp_sign", exp_sign)
                      << xtag("exponent", exponent)
                      << std::endl;
 #endif
            double retval = (mantissa_f64
                             * detail::pow10((exp_sign * exponent)
                                             - rh_digits));
            return retval;
        } /*f64_value*/
        /** Write a human-readable rendering of this token to @p os.
         *  The token's text is included only when
         *  @c has_variable_text() is true.
         **/
        void
        Token::print(std::ostream & os) const
        {
            os << "<token" << xtag("type", tk_type_);

            if (has_variable_text()) {
                os << xtag("text", text_);
            }

            os << ">";
        } /*print*/
    } /*namespace scm*/
 } /*namespace xo*/
 /* end Token.cpp */
--- a/src/tokenizer2/Tokenizer.cpp
+++ b/src/tokenizer2/Tokenizer.cpp
@ -0,0 +1,836 @@
 /** @file Tokenizer.cpp
 *
 *  @author Roland Conybeare, Jul 2024
 **/
 #include "Tokenizer.hpp"
 namespace xo {
    namespace scm {
        /** Construct a tokenizer.
         *  @param debug_flag  forwarded to the tokenizer's input state
         **/
        Tokenizer::Tokenizer(bool debug_flag)
            : input_state_{debug_flag}
        {}
        void
        Tokenizer::discard_current_line()
        {
            this->input_state_.discard_current_line();
        }
        /** True iff @p ch always forms a complete token by itself.
         *
         *  These are: ( ) [ ] { } , ;
         *
         *  Characters deliberately excluded:
         *  - '<' '>' ':' '=' '!'  can begin a two-character token
         *                         (<= >= := == !=); '>' also ends '->'
         *  - '-' '+' '.'          can appear inside a numeric literal
         *                         (1.23e-9, 1.23e+4); '-' also begins '->'
         *  - '*' '/'              allowed inside symbols
         **/
        bool
        Tokenizer::is_1char_punctuation(CharT ch)
        {
            switch(ch) {
            case '(': case ')':
            case '[': case ']':
            case '{': case '}':
            case ',':
            case ';':
                return true;
            default:
                return false;
            }
        }
        /** True iff @p ch can begin a two-character punctuation token:
         *    '<' (<=), '>' (>=), ':' (:=), '=' (==), '!' (!=)
         *
         *  '-' is intentionally absent, because of the way it appears in
         *  numeric literals. Characters accepted here may not appear in
         *  symbol names.
         **/
        bool
        Tokenizer::is_2char_punctuation(CharT ch)
        {
            return ((ch == '<')
                    || (ch == '>')
                    || (ch == ':')
                    || (ch == '=')
                    || (ch == '!'));
        }
        auto
        Tokenizer::assemble_token(std::size_t initial_whitespace,
                                  const span_type & token_text,
                                  input_state_type * p_input_state) -> result_type
        {
            /* literal|pretty|streamlined */
            log_config::style = function_style::streamlined;
            scope log(XO_DEBUG(p_input_state->debug_flag()));
            log && log(xtag("token_text", token_text),
                       xtag("initial_whitespace", initial_whitespace),
                       xtag("input_state", *p_input_state));
            tokentype tk_type = tokentype::tk_invalid;
            std::string tk_text;
            const CharT * tk_start = token_text.lo();
            const CharT * tk_end = token_text.hi();
            const CharT * ix = tk_start;
            /* switch here applies to the first character in a token */
            switch (*ix) {
            case '-':
            case '+':
                if (token_text.size() == 1) {
                    /* standalone '+' or '-' */
                    if (*ix == '+')
                        tk_type = tokentype::tk_plus;
                    else if(*ix == '-')
                        tk_type = tokentype::tk_minus;
                }
                /** fall through to numeric literal code below **/
                [[fallthrough]];
            case '.':
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                /* examples of valid floating-point numbers:
                 *   .0
                 *   1e0
                 *   1e
                 *   0.
                 *   +1e0
                 *   -1e0
                 *   +1E+2
                 *   -1E+2
                 *   -0.123e-10
                 * non-examples:
                 *   .
                 *   -
                 *   +
                 *   e0
                 *   .e0
                 *   -.e-0
                 *   +.e+0
                 *
                 * in particular: to be recognized as a number,
                 * must contain at least one digit
                 */
                log && log("possible number-token");
                /* true if initial sign -/+ encountered */
                bool sign_flag = false;
                /* true if '.' encountered */
                bool period_flag = false;
                /* true if 'e' | 'E' encountered.
                 */
                bool exponent_flag = false;
                /* true when sign '-' | '+' precedes exponenct digits */
                bool exponent_sign_flag = false;
                /* true when at least one digit follows exponent marker */
                bool exponent_digit_flag = false;
                /* true if at least one digit encountered */
                bool number_flag = false;
                log && log(xtag("*ix", *ix),
                           xtag("tk.length", token_text.size()));
                if (log && (ix + 1 < tk_end))
                    log(xtag("*(ix+1)", *(ix + 1)));
                if ((*ix == '-') && (ix + 2 == token_text.hi()) && (*(ix + 1) == '>')) {
                    /* composing exactly '->' */
                    tk_type = tokentype::tk_yields;
                } else {
                    /* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */
                    for (; ix != token_text.hi(); ++ix) {
                        if ((*ix == '-') || (*ix == '+')) {
                            /* sign allowed:
                             * 1. before period and before first digit
                             * 2. after exponent
                             */
                            if (!period_flag && !number_flag && !sign_flag) {
                                sign_flag = true;
                            } else if (exponent_flag && !exponent_digit_flag) {
                                exponent_sign_flag = true;
                            } else {
                                return result_type::make_error_consume_current_line
                                    (__FUNCTION__ /*src_function*/,
                                     "improperly placed sign indicator",
                                     (ix - tk_start),
                                     *p_input_state);
                            }
                        } else if (*ix == '.') {
                            if (period_flag) {
                                return result_type::make_error_consume_current_line
                                    (__FUNCTION__ /*src_function*/,
                                     "duplicate decimal point in numeric literal",
                                     (ix - tk_start),
                                     *p_input_state);
                            }
                            period_flag = true;
                        } else if ((*ix == 'e') || (*ix == 'E')) {
                            if (exponent_flag) {
                                return result_type::make_error_consume_current_line
                                    (__FUNCTION__ /*src_function*/,
                                     "duplicate exponent marker in numeric literal",
                                     (ix - tk_start),
                                     *p_input_state);
                            }
                            exponent_flag = true;
                        } else if (isdigit(*ix)) {
                            if (exponent_flag) {
                                /* need digit before exponent to recognize as number */
                                exponent_digit_flag = true;
                            } else {
                                number_flag = true;
                            }
                        } else {
                            return result_type::make_error_consume_current_line
                                (__FUNCTION__ /*src_function*/,
                                 "unexpected character in numeric constant" /*error_description*/,
                                 (ix - tk_start),
                                 *p_input_state);
                        }
                    }
                    if (number_flag) {
                        if (period_flag || exponent_flag) {
                            tk_type = tokentype::tk_f64;
                        } else {
                            tk_type = tokentype::tk_i64;
                        }
                    } else if (period_flag && !exponent_flag) {
                        tk_type = tokentype::tk_dot;
                    } else {
                        /* not a valid token */
                    }
                    log && log(xtag("sign_flag", sign_flag));
                    log && log(xtag("period_flag", period_flag),
                               xtag("exponent_flag", exponent_flag),
                               xtag("exponent_sign_flag", exponent_sign_flag),
                               xtag("number_flag", number_flag));
                    log && log(xtag("tk_type", tk_type));
                }
                break;
            }
            case '*':
                if (token_text.size() == 1) {
                    /* standalone '*' */
                    tk_type = tokentype::tk_star;
                    ++ix;
                } else {
                    /* '*' isn't punctuation -- but may allow appearance in a longer token
                     *
                     * thinking that x*y is a symbol with an embedded '*' character;
                     * in particular want to support kebab-case symbols like 'foo-config'
                     */
                }
                break;
            case '/':
                if (token_text.size() == 1) {
                    /* standalone '/' */
                    tk_type = tokentype::tk_slash;
                    ++ix;
                }
                break;
            case '=':
                log && log("singleassign or cmpeq token");
                if (*(ix + 1) == '=') {
                    tk_type = tokentype::tk_cmpeq;
                    ++ix;
                    ++ix;
                } else {
                    /* standalone '=' */
                    tk_type = tokentype::tk_singleassign;
                    ++ix;
                }
                break;
            case '!':
                if (*(ix + 1) == '=') {
                    tk_type = tokentype::tk_cmpne;
                    ++ix;
                    ++ix;
                } else {
                    /* standlone '!' */
                    // TODO
                }
                break;
            case '"':
            {
                log && log("recognize string-token");
                tk_type = tokentype::tk_string;
                tk_text.reserve(token_text.hi() - token_text.lo());
                ++ix; /*skip initial " char*/
                /* true on final " */
                bool endofstring = false;
                for (; ix != token_text.hi(); ++ix) {
                    log && log(xtag("*ix", *ix));
                    switch(*ix) {
                    case '"':
                        endofstring = true;
                        /* skip final " char, don't capture */
                        ++ix;
                        break;
                    case '\\':
                        /* skip escape char, don't capture */
                        ++ix;
                        if (ix == token_text.hi()) {
                            return result_type::make_error_consume_current_line
                                (__FUNCTION__ /*src_function*/,
                                 "expecting key following escape character \\",
                                 (ix - tk_start),
                                 *p_input_state);
                        }
                        switch(*ix) {
                        case '\\':
                            log && log(xtag("*ix", *ix), xtag("escaped", "t"));
                            tk_text.push_back(*ix);
                            break;
                        case 'n':
                            log && log(xtag("*ix", *ix), xtag("newline", "t"));
                            tk_text.push_back('\n');
                            break;
                        case 't':
                            log && log(xtag("*ix", *ix), xtag("tab", "t"));
                            tk_text.push_back('\t');
                            break;
                        case 'r':
                            log && log(xtag("*ix", *ix), xtag("cr", "t"));
                            tk_text.push_back('\r');
                            break;
                        case '"':
                            log && log(xtag("*ix", *ix), xtag("quote", "t"));
                            tk_text.push_back('"');
                            break;
                        default:
                            return result_type::make_error_consume_current_line
                                (__FUNCTION__ /*src_function*/,
                                 "expecting one of n|r|\"|\\ following escape \\",
                                 (ix - tk_start),
                                 *p_input_state);
                        }
                        break;
                    default:
                        tk_text.push_back(*ix);
                        break;
                    }
                    if (endofstring)
                        break;
                }
                if (!endofstring) {
                    return result_type::make_error_consume_current_line
                        (__FUNCTION__ /*src_function*/,
                         "missing terminating '\"' to complete literal string",
                         (ix - tk_start),
                         *p_input_state);
                }
                log && log(tostr("tokenizer::assemble_token",
                                 xtag("tk_text", tk_text)));
                break;
            }
            case 'a': case 'A':
            case 'b': case 'B':
            case 'c': case 'C':
            case 'd': case 'D':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'g': case 'G':
            case 'h': case 'H':
            case 'i': case 'I':
            case 'j': case 'J':
            case 'k': case 'K':
            case 'l': case 'L':
            case 'm': case 'M':
            case 'n': case 'N':
            case 'o': case 'O':
            case 'p': case 'P':
            case 'q': case 'Q':
            case 'r': case 'R':
            case 's': case 'S':
            case 't': case 'T':
            case 'u': case 'U':
            case 'v': case 'V':
            case 'w': case 'W':
            case 'x': case 'X':
            case 'y': case 'Y':
            case 'z': case 'Z':
            {
                /* symbol/identifier must begin with a letter?
                 * we want to accept some other chars too.
                 * specifically want to allow identifiers:
                 *   this-is-the-way
                 *   this+is+also+the+way
                 *   how/much/is/that/doggy
                 *   put*an*asterisk*in*that
                 *   something%special%
                 *
                 * like pure lisp,  we don't allow:
                 * - identifier beginning with digit
                 * - period .
                 *
                 * unlike pure lisp,  we don't allow anywhere in a symbol:
                 * - colon     :
                 * - semicolon ;
                 * - comma     ,
                 *
                 * also we don't allow symbols to begin with special chars
                 */
                tk_type = tokentype::tk_symbol;
                break;
            }
            case '<':
            {
                log && log("leftangle or lessequal token");
                if (*(ix + 1) == '=') {
                    tk_type = tokentype::tk_lessequal;
                    ++ix;
                    ++ix;
                } else {
                    tk_type = tokentype::tk_leftangle;
                    ++ix;
                }
                break;
            }
            case '>':
            {
                log && log("rightangle or greatequal token");
                if (*(ix + 1) == '=') {
                    tk_type = tokentype::tk_greatequal;
                    ++ix;
                    ++ix;
                } else {
                    tk_type = tokentype::tk_rightangle;
                    ++ix;
                }
                break;
            }
            case '(':
                tk_type = tokentype::tk_leftparen;
                ++ix;
                break;
            case ')':
                tk_type = tokentype::tk_rightparen;
                ++ix;
                break;
            case '[':
                tk_type = tokentype::tk_leftbracket;
                ++ix;
                break;
            case ']':
                tk_type = tokentype::tk_rightbracket;
                ++ix;
                break;
            case '{':
                tk_type = tokentype::tk_leftbrace;
                ++ix;
                break;
            case '}':
                tk_type = tokentype::tk_rightbrace;
                ++ix;
                break;
            case ',':
                tk_type = tokentype::tk_comma;
                ++ix;
                break;
            case ';':
                tk_type = tokentype::tk_semicolon;
                ++ix;
                break;
            case ':':
            {
                log && log("colon or assignment token");
                if (*(ix + 1) == '=') {
                    tk_type = tokentype::tk_assign;
                    ++ix;
                    ++ix;
                } else {
                     tk_type = tokentype::tk_colon;
                     ++ix;
                }
                break;
            }
            default:
                break;
            }
            if (tk_type == tokentype::tk_invalid) {
                return result_type::make_error_consume_current_line
                    (__FUNCTION__ /*src_function*/,
                     "illegal input character",
                     (ix - tk_start),
                     *p_input_state);
            }
            if ((tk_type == tokentype::tk_i64)
                || (tk_type == tokentype::tk_f64)
                || (tk_type == tokentype::tk_symbol))
            {
                /* note: capturing token text here;
                 *       for numeric literals will re-parse in token::i64_value() / token::f64_value()
                 */
                tk_text = std::string(tk_start, tk_end);
            } else if (tk_type == tokentype::tk_string) {
                ; /* nothing to do here -- desired tk_text already constructed */
            }
            if (tk_type == tokentype::tk_symbol) {
                /* check for keywords */
                bool keep_text = false;
                if ((tk_text == "true") || (tk_text == "false")) {
                    tk_type = tokentype::tk_bool;
                    keep_text = true;
                } else if (tk_text == "type") {
                    tk_type = tokentype::tk_type;
                } else if (tk_text == "def") {
                    tk_type = tokentype::tk_def;
                } else if (tk_text == "lambda") {
                    tk_type = tokentype::tk_lambda;
                } else if (tk_text == "if") {
                    tk_type = tokentype::tk_if;
                } else if (tk_text == "then") {
                    tk_type = tokentype::tk_then;
                } else if (tk_text == "else") {
                    tk_type = tokentype::tk_else;
                } else if (tk_text == "let") {
                    tk_type = tokentype::tk_let;
                } else if (tk_text == "in") {
                    tk_type = tokentype::tk_in;
                } else if (tk_text == "end") {
                    tk_type = tokentype::tk_end;
                } else {
                    /* keep as symbol */
                    keep_text = true;
                }
                if (!keep_text)
                    tk_text.clear();
            }
            /* input.prefix(0):
             * require caller preserves current input line until it's entirely exhausted
             */
            return result_type(token_type(tk_type, std::move(tk_text)),
                               p_input_state->current_line().prefix(0));
        } /*assemble_token*/
        /** Assemble a token from @p token_text, reporting zero leading whitespace.
         *
         *  Forwards to assemble_token() with initial_whitespace = 0;
         *  @p token_text and @p p_input_state are passed through unchanged.
         **/
        auto
        Tokenizer::assemble_final_token(const span_type & token_text,
                                        input_state_type * p_input_state) -> result_type
        {
            /* first argument: initial_whitespace -- always 0 on this path */
            return assemble_token(0, token_text, p_input_state);
        }
        /** Scan the next token out of @p input.
         *
         *  Steps:
         *  1. capture @p input (with @p eof_flag) as the current line in input_state_
         *  2. skip leading whitespace
         *  3. locate the end of the next token, leaving a local cursor 1 char past it
         *  4. advance input_state_ to that cursor and delegate to assemble_token()
         *
         *  Result:
         *  - result_type::make_whitespace(..), consuming the current line,
         *    when nothing but whitespace remains
         *  - an error result (consuming the current line) for a string literal that
         *    contains a raw newline/CR, or that has no closing quote in @p input
         *  - otherwise whatever assemble_token() returns for the token span
         **/
        auto
        Tokenizer::scan(const span_type & input,
                        bool eof_flag) -> result_type
        {
            scope log(XO_DEBUG(input_state_.debug_flag()));
            log && log(xtag("input", input));

            /* - Always at beginning of token when scan() invoked
             * - scan will not report any portion of line as consumed until it has
             *   emitted all tokens in that line.
             *   rationale: caller is allowed to discard storage that
             *   scan() reports as consumed. But will be holding that line
             *   until all tokens have been read.
             * - this means caller will typically call scan()
             *   with the same input span multiple times
             */

            /* automagically no-ops when the same input presented twice */
            this->input_state_.capture_current_line(input, eof_flag);

            const CharT * ix = this->input_state_.skip_leading_whitespace();

            if(ix == input.hi()) {
                log && log("end input -> consume current line");
                /* entirety of current line has been tokenized
                 *  -> caller may consume it
                 */
                return result_type::make_whitespace(this->input_state_.consume_current_line());
            }

            /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */

            // TODO:
            // 1. hoist complete_flag up here
            // 2. use in each branch
            // 3. common check for prefix-capturing after if-cascade below done

            /* here: *ix is not whitespace */
            auto whitespace_z = input_state_.whitespace();
            log && log(xtag("whitespace_z", whitespace_z));

            /* tk_start points to known beginning of token
             * (after any whitespace)
             *
             * goal is to leave ix pointing to 1 char past the end of the token
             */
            const CharT * tk_start = ix;

            if (is_1char_punctuation(*ix)) {
                /* 1-character token */
                ++ix;
            } else if (is_2char_punctuation(*ix)) {
                ++ix;

                /* Only look at the following char when there is one.
                 * If the punctuation char is the last char of input,
                 * the token completes as 1 char; dereferencing ix here
                 * would read at/after input.hi().
                 */
                if (ix != input.hi()) {
                    CharT ch2 = *ix;

                    if (((ch2 >= '0') && (ch2 <= '9'))
                        || ((ch2 >= 'A') && (ch2 <= 'Z'))
                        || ((ch2 >= 'a') && (ch2 <= 'z')))
                    {
                        /* treat as 1 char punctuation */
                        ;
                    } else {
                        /* include next char.
                         * NOTE(review): any non-alphanumeric follower is absorbed
                         * into the token span here (not just '=' or ':') --
                         * confirm this is what assemble_token() expects.
                         */
                        ++ix;
                    }
                }
            } else if (*ix == '"') {
                bool complete_flag = false;

                /* 1. embedded space/tab allowed in string literal.
                 * 2. embedded newline/cr not allowed.
                 * 3. backslash escapes the character that follows it;
                 *    in particular \" does not end the literal,
                 *    and \\ does not escape a following ".
                 */

                /* true iff preceding char was an unescaped backslash */
                bool escape_flag = false;

                ++ix;  /* skip opening " */

                for (; ix != input.hi(); ++ix) {
                    const CharT ch = *ix;

                    if ((ch == '\n') || (ch == '\r')) {
                        log && log ("string literal with naked newline or CR");

                        return result_type::make_error_consume_current_line
                            (__FUNCTION__ /*src_function*/,
                             "must use \\n or \\r to encode newline/cr in string literal",
                             (ix - tk_start),
                             this->input_state_);
                    }

                    if (escape_flag) {
                        /* ch is escaped -> cannot end literal, cannot start an escape */
                        escape_flag = false;
                    } else if (ch == '\\') {
                        escape_flag = true;
                    } else if (ch == '"') {
                        /* unescaped " ends literal */
                        ++ix;  /* include terminating " for assemble_token */
                        complete_flag = true;
                        break;
                    }
                }

                if (!complete_flag) {
                    log && log("unterminated string literal");

                    return result_type::make_error_consume_current_line
                               (__FUNCTION__ /*src_function*/,
                                "unterminated string literal",
                                (ix - tk_start),
                                this->input_state_);
                }
            } else {
                /* ix is start of some token */

                if (*ix == '-') {
                    /* this section load-bearing for input '->' scanning from beginning of token */

                    ++ix;

                    if (ix == input.hi()) {
                        /* no more input: scan loop below is skipped, token is the lone '-' */
                    } else {
                        CharT ch2 = *ix;

                        if (ch2 == '>') {
                            /* include next char and complete token */
                            ++ix;

                            log && log("complete '->' token");

                            this->input_state_.advance_until(ix);

                            return assemble_token(whitespace_z,
                                                  span_type(tk_start, ix) /*token*/,
                                                  &(this->input_state_));
                        }

                        /* here: -123, -.5e-21 for example */
                    }
                } else if (*ix == '>') {
                    /* this section load-bearing for input '>=' scanning from beginning of token.
                     * Need this because '>' necessarily excluded from is_1char_punctuation()
                     */

                    ++ix;

                    if (ix == input.hi()) {
                        /* no more input: scan loop below is skipped, token is the lone '>' */
                    } else {
                        CharT ch2 = *ix;

                        if (ch2 != '=') {
                            /* next char not part of this token: complete token as '>' */
                            log && log("complete '>' token");

                            this->input_state_.advance_until(ix);

                            return assemble_token(whitespace_z,
                                                  span_type(tk_start, ix) /*token*/,
                                                  &(this->input_state_));
                        }

                        /* here: >= for example */
                    }
                }

                /* scan until:
                 * - whitespace
                 * - punctuation
                 */
                for (; ix != input.hi(); ++ix) {
                    if (input_state_type::is_whitespace(*ix)
                        || is_1char_punctuation(*ix)
                        || is_2char_punctuation(*ix))
                    {
                        break;
                    }

                    /* this section load-bearing for input '>' after beginning of a token, e.g. p> */
                    if ((ix > tk_start) && (*ix == '>'))
                        break;

                    /* this section load-bearing for input '->' at the end of another token, e.g. p->q */
                    if (*ix == '-') {
                        if (ix + 1 == input.hi()) {
                            /* '-' is last char of input
                             *
                             *   apple-banana   parses as: {tk_symbol: apple-banana}
                             *   apple->        parses as: {tk_symbol: apple} {tk_yields}
                             *   apple-         illegal (may not end symbol with '-')
                             */
                            break;
                        }

                        if (*(ix + 1) == '>') {
                            /* treat '->' as punctuation;  complete preceding token */
                            break;
                        }
                    }
                }
            }

            log && log("assemble token z", xtag("token_z", ix - tk_start));

            /* every branch above consumed at least one char */
            assert(tk_start < ix);

            this->input_state_.advance_until(ix);

            return assemble_token(whitespace_z,
                                  span_type(tk_start, ix) /*token*/,
                                  &(this->input_state_));
        } /*scan*/
    } /*namespace scm*/
 } /*namespace xo*/
 /* end Tokenizer.cpp */
--- a/src/tokenizer2/TokenizerError.cpp
+++ b/src/tokenizer2/TokenizerError.cpp
@ -0,0 +1,60 @@
 /** @file TokenizerError.cpp
 *
 *  @author Roland Conybeare, Jun 2025
 **/
 #include "TokenizerError.hpp"
 namespace xo {
    namespace scm {
        void
        TokenizerError::print(std::ostream & os) const
        {
            os << "<tokenizer-error"
               << xtag("src-function", src_function_)
               << xtag("message", error_description_)
               << xtag("input", input_state_.current_line())
               << xtag("whitespace", input_state_.whitespace())
               << xtag("error-pos", error_pos_)
               << ">";
        }
        void
        TokenizerError::report(std::ostream & os) const
        {
            using namespace std;
            if (!error_description_.empty()) {
                const char * prefix = "input: ";
                /* input_state.tk_start:    position of first character in token
                 * input_state.current_pos: position of first character following preceding token.
                 * error_pos:               position (relative to start) at which failure detected
                 */
                const size_t tk_start = input_state_.tk_start();
                const size_t tk_indent = (strlen(prefix) + tk_start);
                const size_t error_pos = 1 + tk_start + error_pos_;
                os << "token col: " << tk_start << ", error col: " << error_pos << "\n";
                os << prefix;
                for (const char *p = input_state_.current_line().lo(),
                         *e = input_state_.current_line().hi(); p < e; ++p)
                {
                    os << *p;
                }
                //os << endl;
                os << std::setw(tk_indent) << " ";
                for (size_t i = 0; i < error_pos_; ++i) {
                    os << '_';
                }
                os << '^' << endl;
                os << error_description_ << endl;
            }
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end TokenizerError.cpp */
--- a/src/tokenizer2/scan_result.cpp
+++ b/src/tokenizer2/scan_result.cpp
@ -0,0 +1,43 @@
 /** @file scan_result.cpp
 *
 *  @author Roland Conybeare, 2025
 **/
 #include "scan_result.hpp"
 namespace xo {
    namespace scm {
        /** Build a result that carries no token.
         *
         *  The token slot holds token_type::invalid();
         *  @p whitespace_input is recorded as the consumed span.
         **/
        scan_result
        scan_result::make_whitespace(const span_type& whitespace_input)
        {
            scan_result retval(token_type::invalid(),
                               whitespace_input /*consumed*/);

            return retval;
        }
        /** Build a result for a partial token.
         *
         *  The token slot holds token_type::invalid();
         *  @p prefix_input is recorded as the consumed span.
         **/
        scan_result
        scan_result::make_partial(const span_type& prefix_input)
        {
            scan_result retval(token_type::invalid(),
                               prefix_input /*consumed*/);

            return retval;
        }
        /** Build an error result and consume the whole current input line.
         *
         *  @param error_src        name of the function reporting the error
         *  @param error_msg        description of the error
         *  @param error_pos        position at which the error was detected
         *  @param input_state_ref  tokenizer input state; modified by this call
         *                          via consume_current_line()
         *
         *  The error keeps a copy of the input state taken *before* the line
         *  is consumed. The returned result carries token_type::invalid()
         *  and the span returned by consume_current_line().
         **/
        scan_result
        scan_result::make_error_consume_current_line(const char * error_src,
                                                     std::string error_msg,
                                                     size_t error_pos,
                                                     input_state_type & input_state_ref)
        {
            /* 1. snapshot input state while it still refers to the offending line */
            input_state_type snapshot = input_state_ref;

            /* 2. give up the entire line */
            const auto consumed = input_state_ref.consume_current_line();

            /* 3. package snapshot with the error details */
            return scan_result(token_type::invalid(),
                               consumed,
                               error_type(error_src, error_msg, snapshot, error_pos));
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end scan_result.cpp */
--- a/src/tokenizer2/tokentype.cpp
+++ b/src/tokenizer2/tokentype.cpp
@ -0,0 +1,74 @@
 /* file tokentype.cpp
 *
 * author: Roland Conybeare
 */
 #include "tokentype.hpp"
 namespace xo {
    namespace scm {
        /** Return a static string describing @p tk_type.
         *
         *  For each ordinary enumerator the result is STRINGIFY(<enumerator>)
         *  (STRINGIFY is defined outside this file; presumably it yields the
         *  enumerator's spelling, e.g. "tk_bool").
         *  tk_invalid and n_tokentype map to "?tokentype";
         *  any value not handled by the switch maps to "???".
         **/
        char const *
        tokentype_descr(tokentype tk_type)
        {
            /* one switch arm per enumerator: return the enumerator, stringified */
 #define TOKENTYPE_ARM(enumerator) \
            case tokentype::enumerator: return STRINGIFY(enumerator)

            switch(tk_type) {
                /* literals + symbols */
                TOKENTYPE_ARM(tk_bool);
                TOKENTYPE_ARM(tk_i64);
                TOKENTYPE_ARM(tk_f64);
                TOKENTYPE_ARM(tk_string);
                TOKENTYPE_ARM(tk_symbol);

                /* bracketing */
                TOKENTYPE_ARM(tk_leftparen);
                TOKENTYPE_ARM(tk_rightparen);
                TOKENTYPE_ARM(tk_leftbracket);
                TOKENTYPE_ARM(tk_rightbracket);
                TOKENTYPE_ARM(tk_leftbrace);
                TOKENTYPE_ARM(tk_rightbrace);
                TOKENTYPE_ARM(tk_leftangle);
                TOKENTYPE_ARM(tk_rightangle);
                TOKENTYPE_ARM(tk_lessequal);
                TOKENTYPE_ARM(tk_greatequal);

                /* punctuation + operators */
                TOKENTYPE_ARM(tk_dot);
                TOKENTYPE_ARM(tk_comma);
                TOKENTYPE_ARM(tk_colon);
                TOKENTYPE_ARM(tk_doublecolon);
                TOKENTYPE_ARM(tk_semicolon);
                TOKENTYPE_ARM(tk_singleassign);
                TOKENTYPE_ARM(tk_assign);
                TOKENTYPE_ARM(tk_yields);
                TOKENTYPE_ARM(tk_plus);
                TOKENTYPE_ARM(tk_minus);
                TOKENTYPE_ARM(tk_star);
                TOKENTYPE_ARM(tk_slash);
                TOKENTYPE_ARM(tk_cmpeq);
                TOKENTYPE_ARM(tk_cmpne);

                /* keywords */
                TOKENTYPE_ARM(tk_type);
                TOKENTYPE_ARM(tk_def);
                TOKENTYPE_ARM(tk_lambda);
                TOKENTYPE_ARM(tk_if);
                TOKENTYPE_ARM(tk_then);
                TOKENTYPE_ARM(tk_else);
                TOKENTYPE_ARM(tk_let);
                TOKENTYPE_ARM(tk_in);
                TOKENTYPE_ARM(tk_end);

                /* sentinels: no meaningful description */
                case tokentype::tk_invalid:
                case tokentype::n_tokentype:
                    return "?tokentype";
            }

 #undef TOKENTYPE_ARM

            /* reached only for a value not handled by the switch above */
            return "???";
        } /*tokentype_descr*/
    } /*namespace scm*/
 } /*namespace xo*/
 /* end tokentype.cpp */