Add 'xo-tokenizer/' from commit '830c6ebe55'

git-subtree-dir: xo-tokenizer git-subtree-mainline: e8f65f88cf git-subtree-split: 830c6ebe55
2025-05-11 01:36:13 -05:00 · 2025-05-11 01:36:13 -05:00 · 6395a7a285
commit 6395a7a285
parent e8f65f88cf 830c6ebe55
17 changed files with 2547 additions and 0 deletions
--- a/xo-tokenizer/.gitignore
+++ b/xo-tokenizer/.gitignore
@ -0,0 +1,8 @@
 # emacs workspace config
 .projectile
 # clangd working space (see emacs+lsp)
 .cache
 # typical cmake build directory (source-tree-nephew)
 .build*
 # symlink to builddir/compile_commands.json;  should be set manually in dev sandbox
 compile_commands.json
--- a/xo-tokenizer/CMakeLists.txt
+++ b/xo-tokenizer/CMakeLists.txt
@ -0,0 +1,27 @@
 # xo-tokenizer/CMakeLists.txt
 #
 # Top-level build script for the xo_tokenizer project:
 #  1. bootstrap the shared xo cmake macros,
 #  2. apply project-wide c++ compiler flags,
 #  3. descend into the library + unit test directories,
 #  4. export a cmake package config so that find_package() can locate this project.
 cmake_minimum_required(VERSION 3.10)
 project(xo_tokenizer VERSION 0.1)
 include(GNUInstallDirs)
 include(cmake/xo-bootstrap-macros.cmake)
 xo_cxx_toplevel_options3()
 # ----------------------------------------------------------------
 # c++ settings
 #
 # PROJECT_CXX_FLAGS holds extra compiler flags (not -D preprocessor definitions),
 # for example:
 #   set(PROJECT_CXX_FLAGS "-fconcepts-diagnostics-depth=2")
 set(PROJECT_CXX_FLAGS "")
 # add_compile_options() is the command intended for arbitrary compiler flags;
 # add_definitions() is meant for -D definitions only.
 # Expands to zero arguments (a no-op) while PROJECT_CXX_FLAGS is empty.
 add_compile_options(${PROJECT_CXX_FLAGS})
 # ----------------------------------------------------------------
 add_subdirectory(src/tokenizer)
 add_subdirectory(utest)
 # ----------------------------------------------------------------
 # provide find_package() support
 xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
--- a/xo-tokenizer/README.md
+++ b/xo-tokenizer/README.md
@ -0,0 +1,56 @@
 # schematica tokenizer library
 ## Getting Started
 ### build + install `xo-cmake` dependency
 - [github/Rconybea/xo-cmake](https://github.com/Rconybea/xo-cmake)
 Installs a few cmake ingredients,  along with a build assistant `xo-build` for XO projects such as this one.
 ### build + install other required XO dependencies
 ```
 $ xo-build --clone --configure --build --install xo-indentlog
 $ xo-build --clone --configure --build --install xo-refcnt
 $ xo-build --clone --configure --build --install xo-subsys
 $ xo-build --clone --configure --build --install xo-reflectutil
 ```
 Note: can use `-n` to dry-run here
 ### copy `xo-tokenizer` repository locally
 ```
 $ xo-build --clone xo-tokenizer
 ```
 or equivalently
 ```
 $ git clone git@github.com:Rconybea/xo-tokenizer.git
 ```
 ### build + install `xo-tokenizer`
 ```
 $ xo-build --configure --build --install xo-tokenizer
 ```
 or equivalently:
 ```
 $ PREFIX=/usr/local  # or wherever you prefer
 $ cmake -DCMAKE_INSTALL_PREFIX=${PREFIX} -S xo-tokenizer -B xo-tokenizer/.build
 $ cmake --build xo-tokenizer/.build
 $ cmake --install xo-tokenizer/.build
 ```
 ### build for unit test coverage
 ```
 $ cmake -DCMAKE_BUILD_TYPE=coverage -DCMAKE_INSTALL_PREFIX=$PREFIX -S xo-tokenizer -B xo-tokenizer/.build-ccov
 $ cmake --build xo-tokenizer/.build-ccov
 ```
 ### LSP support
 ```
 $ cd xo-tokenizer
 $ ln -s .build/compile_commands.json  # lsp will look for compile_commands.json in the root of the source tree
 ```
--- a/xo-tokenizer/cmake/xo-bootstrap-macros.cmake
+++ b/xo-tokenizer/cmake/xo-bootstrap-macros.cmake
@ -0,0 +1,35 @@
 # ----------------------------------------------------------------
 # for example:
 #   $ PREFIX=/usr/local   # for example
 #   $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build
 #
 # will get
 #   CMAKE_MODULE_PATH
 # from xo-cmake-config --cmake-module-path
 #
 # and expect .cmake macros in
 #   CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake
 # ----------------------------------------------------------------
 # Locate the xo-cmake-config helper; it reports where the xo cmake macros are installed.
 find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED)
 # The REQUIRED keyword of find_program() was added in cmake 3.18,
 # so keep an explicit check for older cmake versions.
 # A failed find_program() stores "<VAR>-NOTFOUND", which if() treats as false.
 if (NOT XO_CMAKE_CONFIG_EXECUTABLE)
     message(FATAL_ERROR "could not find xo-cmake-config executable")
 endif()
 message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}")
 if (NOT XO_SUBMODULE_BUILD)
     # "prefix" is the placeholder value shown in the usage example at the top of this file
     if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL "prefix"))
         # default to typical install location for xo-project-macros
         execute_process(
             COMMAND "${XO_CMAKE_CONFIG_EXECUTABLE}" --cmake-module-path
             OUTPUT_VARIABLE CMAKE_MODULE_PATH
             RESULT_VARIABLE XO_CMAKE_CONFIG_RESULT
             # drop any trailing newline so the value is usable as a directory path
             OUTPUT_STRIP_TRAILING_WHITESPACE)
         if (NOT "${XO_CMAKE_CONFIG_RESULT}" STREQUAL "0")
             message(FATAL_ERROR
                 "${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path failed: ${XO_CMAKE_CONFIG_RESULT}")
         endif()
         message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}")
     endif()
 endif()
 # needs to have been installed somewhere on CMAKE_MODULE_PATH,
 # (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX)
 #
 include(xo_macros/xo_cxx)
 xo_cxx_bootstrap_message()
--- a/xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in
+++ b/xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in
@ -0,0 +1,8 @@
# Package config template for this project; the placeholders delimited by at-signs
# are substituted when the file is configured.
@PACKAGE_INIT@
 # find_dependency() forwards to find_package() for each dependency listed below
 include(CMakeFindDependencyMacro)
 # NOTE(review): refcnt and subsys are disabled here -- confirm they are not needed by consumers
 #find_dependency(refcnt)
 find_dependency(indentlog)
 #find_dependency(subsys)
 # pull in the exported targets file that sits next to this config file
 include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
 check_required_components("@PROJECT_NAME@")
--- a/xo-tokenizer/include/xo/tokenizer/buffer.hpp
+++ b/xo-tokenizer/include/xo/tokenizer/buffer.hpp
@ -0,0 +1,324 @@
 /** @file buffer.hpp **/
 #pragma once
 #include "span.hpp"
 #include <utility>
 #include <cstdint>
 #include <cassert>
 #include <new>
 namespace xo {
    namespace scm {
        /**
         * @class buffer buffer.hpp
         *
         * @brief Container for a (possibly owned) FIFO queue of chars
         *
         * @tparam CharT.  buffer element type.
         *
         * @code
         *  .buf
         *
         *    +------------------------------------------+
         *    |  |  ...  |  | X|  ... | X|  |    ...  |  |
         *    +------------------------------------------+
         *     ^             ^            ^               ^
         *     0             .lo          .hi             .buf_z
         *
         *                   <-contents-><----avail----->
         * @endcode
         *
         * Buffer does not support wrapped content:
         * content that has not been consumed always occupies contiguous memory.
         *
         * Example:
         * @code
         * // 1.
         *   buffer<char> buf(64*1024);
         *   buf.empty() -> true
         *   buf.buf_z() -> 65536
         *   buf.lo_pos() -> 0
         *   buf.hi_pos() -> 0
         *   buf.contents() -> empty span
         *   buf.avail() -> span entire buffer memory
         *
         *   // write to (a prefix of) buf.avail()
         *   ::strncpy(buf.buf(), "hello, world\n", 13);
         *   buf.produce(span_type(buf.buf(), buf.buf() + 13));
         *
         *   buf.lo_pos() -> 0
         *   buf.hi_pos() -> 13
         *   buf.contents() -> "hello, world\n";
         *
         *
         *   // examine stored content (does not change buffer state)
         *   auto span = buf.contents();
         *   cerr << string_view(span.lo(), span.hi());  // "hello, world\n"
         *
         *   // consume (a prefix of) stored content
         *   buf.consume(span.prefix(7));
         *
         *   buf.lo_pos() -> 7
         *   buf.hi_pos() -> 13
         *   buf.contents() -> "world\n"
         *
         *   // consuming all remaining content resets to original state
         *   buf.consume(buf.contents());
         *
         *   buf.empty() -> true
         *   buf.hi_pos() -> 0     // not 13!
         *
         * // 2.
         *   buffer<char> buf;
         *   buf.empty() -> true
         *   buf.buf_z() -> 0
         *   buf.lo_pos() -> 0
         *   buf.hi_pos() -> 0
         *   buf.contents() -> empty span
         *   buf.avail() -> empty span
         *
         *   // allocate memory separately from ctor
         *   buf.alloc(64*1024);
         * @endcode
         **/
        template <typename CharT>
        class buffer {
        public:
            /** @brief typealias for span of CharT **/
            using span_type = span<CharT>;
            /** @brief typealias for buffer size (counts CharT's, not bytes) **/
            using size_type = std::uint64_t;
        public:
            /** @brief create empty buffer.
                Does not allocate any storage;  @see alloc
            **/
            buffer() = default;
            /** @brief create empty buffer,  and possibly allocate storage.
                @param buf_z    Buffer size.  allocate storage (owned by this buffer) if >0.
                @param align_z  Align to this value,  e.g. 8 to align storage on an 8-byte boundary
            **/
            buffer(size_type buf_z, size_type align_z = sizeof(char))
                : is_owner_{true},
                  buf_{buf_z ? (new (std::align_val_t(align_z)) CharT [buf_z]) : nullptr},
                  buf_z_{buf_z},
                  lo_pos_{0},
                  hi_pos_{0}
                {}
            /** @brief buffer is not copyable **/
            buffer(buffer const & x) = delete;
            /** @brief move constructor.
                Takes over the representation of @p x (including ownership of storage, if any).
                @post @p x is in a valid, empty state with no storage
            **/
            buffer(buffer && x) noexcept {
                /* members of *this start from their default (empty) initializers,
                 * so swapping leaves x empty
                 */
                this->swap(x);
            }
            /** @brief destructor.  Release storage if owned **/
            ~buffer() { this->reset(); }
            /** @name Access methods **/
            ///@{
            /** @brief start of buffer memory **/
            CharT * buf() const { return buf_; }
            /** @brief buffer size (number of characters) **/
            size_type buf_z() const { return buf_z_; }
            /** @brief current start position within buffer **/
            size_type lo_pos() const { return lo_pos_; }
            /** @brief current end position within buffer **/
            size_type hi_pos() const { return hi_pos_; }
            ///@}
            /** @brief readonly access to a single buffer element.
                Relative to start of buffer (ignores current consume position).
                No bounds checking.
            **/
            CharT const & operator[](size_type i) const { return buf_[i]; }
            /** @brief return span for current buffer contents **/
            span_type contents() const { return span_type(buf_ + lo_pos_, buf_ + hi_pos_); }
            /** @brief returns span for writable buffer memory (unused suffix following produce position) **/
            span_type avail() const { return span_type(buf_ + hi_pos_, buf_ + buf_z_); }
            /** @brief @c true iff buffer is empty **/
            bool empty() const { return lo_pos_ == hi_pos_; }
            /**
               @brief update buffer produce position, after (independently) writing contents of span to it
               @pre left endpoint of @p span equals buffer produce position (@c .hi_pos)
               @pre right endpoint of @p span within bounds of buffer memory range
               @post right endpoint of @p span equals buffer produce position.
            **/
            void produce(span_type const & span) {
                assert(span.lo() == buf_ + hi_pos_);
                /* enforce the documented precondition on the right endpoint */
                assert(hi_pos_ + span.size() <= buf_z_);
                hi_pos_ += span.size();
            }
            /**
               @brief update buffer consume position,  when done with contents of span
               @pre left endpoint of @p span equals buffer consume position (@c .lo_pos)
               @pre right endpoint of @p span within bounds of buffer memory range
               @post Either
               buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0.
               buffer is non-empty, right endpoint of @p span equals new buffer consume position.
            **/
            void consume(span_type const & span) {
                if (span.size()) {
                    assert(span.lo() == buf_ + lo_pos_);
                    lo_pos_ += span.size();
                    /* consume position must never overtake produce position */
                    assert(lo_pos_ <= hi_pos_);
                } else {
                    /* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos,
                     * we don't want to blow up when called with an empty span -- argument
                     * may represent some pre-reset location in buffer
                     */
                }
                if (lo_pos_ == hi_pos_) {
                    /* buffer drained: rewind so the whole of buffer memory is available again */
                    lo_pos_ = 0;
                    hi_pos_ = 0;
                }
            }
            /**
               @brief allocate buffer with desired amount of memory
               Any previous state (and owned storage) is discarded first.
               @param buf_z     desired buffer size
               @param align_z   alignment;  buffer memory will be aligned on this byte-boundary.
            **/
            void alloc(size_type buf_z, size_type align_z = sizeof(char)) {
                /* properly reset (+ discard) any existing state */
                this->reset();
                is_owner_ = true;
                if (buf_z)
                    buf_ = new (std::align_val_t(align_z)) CharT [buf_z];
                buf_z_ = buf_z;
                lo_pos_ = 0;
                hi_pos_ = 0;
            }
            /**
               @brief attach buffer to (unowned)  range of @p buf_z elements starting at @p buf[0]
               Buffer is not responsible for managing storage.
               @post
               1. buffer is empty
               @post
               2. buffer read position = buffer write position = 0
            **/
            void setbuf(CharT * buf, size_type buf_z) {
                /* properly reset (+ discard) any existing state */
                this->reset();
                is_owner_ = false;
                lo_pos_ = 0;
                hi_pos_ = 0;
                buf_ = buf;
                buf_z_ = buf_z;
            }
            /**
               @brief revert buffer to empty state and possibly zero it
               Storage (owned or not) stays attached.
               @param zero_buffer_flag   Zero buffer contents iff this is true
               @post
               1. buffer is empty
               @post
               2. buffer read position = buffer write position = 0
            **/
            void clear2empty(bool zero_buffer_flag) {
                /* NOTE(review): explicit_bzero is not declared by any header included here -- confirm */
                if (buf_ && zero_buffer_flag)
                    explicit_bzero(buf_, buf_z_ * sizeof(CharT));
                lo_pos_ = 0;
                hi_pos_ = 0;
            }
            /**
               @brief swap representation with another buffer instance.
            **/
            void swap (buffer & x) {
                std::swap(is_owner_, x.is_owner_);
                std::swap(buf_, x.buf_);
                std::swap(buf_z_, x.buf_z_);
                std::swap(lo_pos_, x.lo_pos_);
                std::swap(hi_pos_, x.hi_pos_);
            }
            /**
               @brief reset buffer to an empty state and recover owned storage
               @post buffer has no storage attached;  all positions are 0
            **/
            void reset() {
                /* NOTE(review): storage comes from the aligned form of operator new[]
                 * but is released here with a plain delete[] expression -- confirm this
                 * pairing is acceptable on every target platform
                 */
                if (is_owner_ && buf_)
                    delete [] buf_;
                is_owner_ = false;
                buf_ = nullptr;
                buf_z_ = 0;
                lo_pos_ = 0;
                hi_pos_ = 0;
            }
            /**
               @brief move-assignment operator.
               Storage owned by this buffer before the assignment is released.
               Self-assignment is a no-op.
               @param x   right-hand-side to move from.
               @post
               @p x is in a valid, empty state with no storage
            **/
            buffer & operator= (buffer && x) {
                if (this != &x) {
                    /* release anything we own before adopting x's representation;
                     * otherwise previously-owned storage would leak
                     */
                    this->reset();
                    is_owner_ = x.is_owner_;
                    buf_ = x.buf_;
                    buf_z_ = x.buf_z_;
                    lo_pos_ = x.lo_pos_;
                    hi_pos_ = x.hi_pos_;
                    x.is_owner_ = false;
                    x.lo_pos_ = 0;
                    x.hi_pos_ = 0;
                    x.buf_ = nullptr;
                    x.buf_z_ = 0;
                }
                return *this;
            }
            /** @brief buffer is not assignable */
            buffer & operator= (buffer & x) = delete;
        private:
            /** @brief true iff buffer is responsible for freeing storage at @c buf_ **/
            bool is_owner_ = false;
            /** @brief buffer contents.  buffer memory comprises @c buf_[0] to @c buf_[buf_z_ - 1] **/
            CharT * buf_ = nullptr;
            /** @brief buffer size (in units of CharT) **/
            size_type buf_z_ = 0;
            /** @brief buffer read (consume) position
                @invariant
                0 <= lo_pos_ <= hi_pos_ <= buf_z_
            **/
            size_type lo_pos_ = 0;
            /** @brief buffer write (produce) position
                @invariant
                0 <= lo_pos_ <= hi_pos_ <= buf_z_
            **/
            size_type hi_pos_ = 0;
        };
        /** @brief Non-member @c swap for @c buffer<CharT>.
         *  Exchanges the representations of @p lhs and @p rhs by
         *  delegating to the member function @c buffer::swap.
         **/
        template <typename CharT>
        inline void
        swap(buffer<CharT> & lhs, buffer<CharT> & rhs)
        {
            /* exchange is symmetric, so either operand may drive it */
            rhs.swap(lhs);
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end buffer.hpp */
--- a/xo-tokenizer/include/xo/tokenizer/span.hpp
+++ b/xo-tokenizer/include/xo/tokenizer/span.hpp
@ -0,0 +1,160 @@
 /** @file span.hpp **/
 #pragma once
 #include "xo/indentlog/print/tag.hpp"
 #include <ostream>
 #include <stdexcept>
 #include <string_view>
 #include <cassert>
 #include <cstdint>
 #include <cstring>
 namespace xo {
    namespace scm {
        /** @class span span.hpp
         *
         *  @brief Represents a contiguous memory range,  without ownership.
         *
         *  @tparam CharT type for elements referred to by this span.
         **/
        template <typename CharT>
        class span {
        public:
            /** @brief typealias for span size (in units of CharT) **/
            using size_type = std::uint64_t;
        public:
            /** @brief create span for the contiguous memory range [@p lo, @p hi) **/
            span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
            /** @brief create span for C-style string @p cstr.
             *  Span excludes the terminating null.  A null @p cstr gives a span
             *  with both endpoints null.
             *  Only usable when @c CharT is a const char type
             *  (the pointer conversions below do not compile otherwise).
             **/
            static span from_cstr(const CharT * cstr) {
                CharT * lo = cstr;
                CharT * hi = cstr ? cstr + std::strlen(cstr) : nullptr;
                return span(lo, hi);
            }
            ///@{
            /** @name getters **/
            CharT * lo() const { return lo_; } /* get member span::lo_ */
            CharT * hi() const { return hi_; } /* get member span::hi_ */
            ///@}
            /** @brief create new span over supplied type,
             *  with identical (possibly misaligned) endpoints.
             *
             *  @warning
             *  1. New span uses exactly the same memory addresses.
             *     Endpoint pointers may not be aligned.
             *  2. Implementation assumes code compiled with
             *     @code -fno-strict-aliasing @endcode enabled.
             *
             *  @tparam OtherT element type for new span
             **/
            template <typename OtherT>
            span<OtherT>
            cast() const { return span<OtherT>(reinterpret_cast<OtherT *>(lo_),
                                               reinterpret_cast<OtherT *>(hi_)); }
            /** @brief create span including the first @p z members of this span.
             *  @pre @p z does not exceed @c size();  not checked.
             **/
            span prefix(size_type z) const { return span(lo_, lo_ + z); }
            /** @brief create span representing prefix up to (but not including) @p *p.
             *  @p p is clamped to this span:  a position before @c lo gives an empty
             *  prefix,  a position after @c hi gives the whole span.
             **/
            span prefix_upto(CharT * p) const {
                if (p < lo_)
                    return span(lo_, lo_);
                if (p > hi_)
                    return span(lo_, hi_);
                return span(lo_, p);
            }
            /** @brief create span with first @p z members of this span removed.
             *  If @p z exceeds @c size(),  result is the empty span at @c hi.
             **/
            span after_prefix(size_type z) const {
                /* compare counts instead of pointers, so that we never form
                 * a pointer beyond the end of the span
                 */
                if (z > size())
                    z = size();
                return span(lo_ + z, hi_);
            }
            /** @brief create span with @p prefix of this span removed.
             *  @throws std::runtime_error if @p prefix does not start where this span starts
             **/
            span after_prefix(const span & prefix) const {
                assert(prefix.lo() == lo_);
                if (prefix.lo() != lo_) {
                    throw std::runtime_error
                        ("after_prefix: expected prefix of this span");
                }
                return after_prefix(prefix.size());
            }
            /** @brief create span starting with position p.
             *  If @p p lies outside this span,  result is the empty span at @c hi.
             **/
            span suffix_from(CharT * p) const {
                if ((lo_ <= p) && (p <= hi_))
                    return span(p, hi_);
                else
                    return span(hi_, hi_);
            }
            /** @brief true iff this span is empty (comprises 0 elements). **/
            bool empty() const { return lo_ == hi_; }
            /** @brief report the number of elements (of type CharT) in this span. **/
            size_type size() const { return hi_ - lo_; }
            /** @brief extend this span to include adjacent span @p x.
             *  @pre @p x begins exactly where this span ends (asserted).
             *  When the precondition fails and asserts are disabled, this span is unchanged.
             **/
            span & operator+=(const span & x) {
                if (hi_ == x.lo_) {
                    hi_ = x.hi_;
                } else {
                    assert(false);
                }
                return *this;
            }
            /** print representation for this span on stream @p os **/
            void print(std::ostream & os) const {
                os << "<span"
                   << xtag("addr", (void*)lo_)
                   << xtag("size", size())
                   /* (pointer, length) form of string_view; does not need c++20 */
                   << " :text " << xo::print::quot(std::string_view(lo_, size()))
                   << ">";
            }
        private:
            ///@{
            /** @brief start of span
                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
            **/
            CharT * lo_ = nullptr;
            /** @brief end of span
                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
            **/
            CharT * hi_ = nullptr;
            ///@}
        }; /*span*/
        /** @brief two spans are equal iff both endpoints are identical addresses
         *  (the memory they refer to is not compared)
         **/
        template <typename CharT>
        inline bool
        operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
            if (lhs.lo() != rhs.lo())
                return false;
            return lhs.hi() == rhs.hi();
        }
        /** @brief two spans differ iff at least one endpoint address differs **/
        template <typename CharT>
        inline bool
        operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
            if (lhs.lo() != rhs.lo())
                return true;
            return lhs.hi() != rhs.hi();
        }
        /** @brief stream inserter for spans;  forwards to @c span::print **/
        template <typename CharT>
        inline std::ostream &
        operator<<(std::ostream & os, const span<CharT> & x)
        {
            x.print(os);
            return os;
        }
    } /*namespace scm*/
 } /*namespace xo*/
--- a/xo-tokenizer/include/xo/tokenizer/token.hpp
+++ b/xo-tokenizer/include/xo/tokenizer/token.hpp
@ -0,0 +1,359 @@
 /* file token.hpp
 *
 * author: Roland Conybeare, Jul 2024
 */
 #pragma once
 #include "tokentype.hpp"
 #include "xo/indentlog/print/tag.hpp"
 #include <stdexcept>
 #include <ostream>
 #include <string>
 #include <cstdint>
 namespace xo {
    namespace scm {
        namespace detail {
            /** @brief compute a * b^p for p >= 0, by repeated squaring.
             *  A non-positive @p p returns @p a unchanged.
             **/
            constexpr double
            pow_aux(double a, double b, int p) {
                while (p > 0) {
                    const bool odd = ((p % 2) == 1);
                    if (odd) {
                        /* a * b^p = a * b^(2q + 1) = (a.b) * b^(2q) */
                        a *= b;
                        --p;
                        continue;
                    }
                    /* a * b^p = a * b^(2q) = a * (b^2)^q */
                    b = b * b;
                    p /= 2;
                }
                /* a * b^0 = a */
                return a;
            }
            /** @brief compute 10^p;  negative @p p gives the reciprocal of 10^(-p) **/
            constexpr double
            pow10(int p) {
                return ((p >= 0)
                        ? pow_aux(1.0, 10.0, p)
                        : (1.0 / pow_aux(1.0, 10.0, -p)));
            }
        }
        template <typename CharT>
        class token {
        public:
            token() = default;
            token(tokentype tk_type, const std::string & text = "")
                : tk_type_{tk_type}, text_{text} {}
            static token invalid() { return token(); }
            static token i64_token(const std::string & txt) {
                return token(tokentype::tk_i64, txt);
            }
            static token f64_token(const std::string & txt) {
                return token(tokentype::tk_f64, txt);
            }
            static token string_token(const std::string & txt) {
                return token(tokentype::tk_string, txt);
            }
            static token symbol_token(const std::string & txt) {
                return token(tokentype::tk_symbol, txt);
            }
            static token leftangle() { return token(tokentype::tk_leftangle); }
            static token rightangle() { return token(tokentype::tk_rightangle); }
            static token leftparen() { return token(tokentype::tk_leftparen); }
            static token rightparen() { return token(tokentype::tk_rightparen); }
            static token leftbracket() { return token(tokentype::tk_leftbracket); }
            static token rightbracket() { return token(tokentype::tk_rightbracket); }
            static token leftbrace() { return token(tokentype::tk_leftbrace); }
            static token rightbrace() { return token(tokentype::tk_rightbrace); }
            static token dot() { return token(tokentype::tk_dot); }
            static token comma() { return token(tokentype::tk_comma); }
            static token colon() { return token(tokentype::tk_colon); }
            static token doublecolon() { return token(tokentype::tk_doublecolon); }
            static token semicolon() { return token(tokentype::tk_semicolon); }
            static token singleassign() { return token(tokentype::tk_singleassign); }
            static token assign_token() { return token(tokentype::tk_assign); }
            static token yields() { return token(tokentype::tk_yields); }
            static token star_token() { return token(tokentype::tk_star); }
            static token type() { return token(tokentype::tk_type); }
            static token def() { return token(tokentype::tk_def); }
            static token lambda() { return token(tokentype::tk_lambda); }
            static token if_token() { return token(tokentype::tk_if); }
            static token let() { return token(tokentype::tk_let); }
            static token in() { return token(tokentype::tk_in); }
            static token end() { return token(tokentype::tk_end); }
            tokentype tk_type() const { return tk_type_; }
            const std::string & text() const { return text_; }
            bool is_valid() const { return tk_type_ != tokentype::tk_invalid; }
            bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; }
            /** expect input matching
             *    [+|-][0-9][0-9]*
             **/
            std::int64_t i64_value() const;
            /** expect input matching
             *    [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]*
             **/
            double f64_value() const;
            /** print human-readable token representation on stream @p os **/
            void print(std::ostream & os) const;
        private:
            /** category for this token **/
            tokentype tk_type_ = tokentype::tk_invalid;
            /** characters comprising this token.
             *  only provided for certain token types:
             *
             *    tk_i64
             *    tk_f64
             *    tk_string
             *    tk_symbol
             **/
            std::string text_;
        }; /*token*/
        /** @brief decode the text of a @c tk_i64 token as a signed 64-bit integer.
         *
         *  Accepts an optional leading '+' or '-' followed by one or more decimal digits.
         *
         *  @return decoded value
         *  @throws std::runtime_error if the token is not @c tk_i64,  its text is empty,
         *          no digit follows the sign,  or a non-digit char appears.
         *
         *  @note magnitudes that do not fit in 64 bits are not detected.
         **/
        template <typename CharT>
        std::int64_t
        token<CharT>::i64_value() const {
            if (tk_type_ != tokentype::tk_i64) {
                throw (std::runtime_error
                       (tostr("token::i64_value",
                              ": token with type tk found where tk_i64 expected",
                              xtag("tk", tk_type_))));
            }
            if (text_.empty()) {
                throw (std::runtime_error
                       (tostr("token::i64_value",
                              ": unexpected empty input string for tk_i64 token")));
            }
            /* +1 or -1 */
            std::int64_t sign = 1;
            /* accumulate in the return type: a plain int accumulator
             * overflows for values beyond the range of int
             */
            std::int64_t value = 0;
            {
                auto ix = text_.begin();
                auto end_ix = text_.end();
                /* safe to dereference: text_ is known to be non-empty here */
                CharT lead = *ix;
                if (lead == '+') {
                    ++ix;
                } else if (lead == '-') {
                    sign = -1;
                    ++ix;
                }
                if (ix == end_ix) {
                    throw (std::runtime_error
                           (tostr("token::i64_value",
                                  ": input text found where at least one digit expected",
                                  xtag("text", text_))));
                }
                for (; ix != end_ix; ++ix) {
                    CharT ch = *ix;
                    if ((ch >= '0') && (ch <= '9')) {
                        /* apply the sign per digit, so that the most negative
                         * 64-bit value can be reached without overflowing
                         */
                        value *= 10;
                        value += sign * (ch - '0');
                    } else {
                        throw (std::runtime_error
                               (tostr("token::i64_value",
                                      ": unexpected char ch in integer token",
                                      xtag("ch", ch))));
                    }
                }
            }
            return value;
        } /*i64_value*/
        /** @brief decode the text of a @c tk_f64 token as a double.
         *
         *  Accepts an optional sign,  mantissa digits with at most one decimal point,
         *  and an optional exponent:  'e' or 'E',  an optional sign,  then one or more digits.
         *
         *  @return sign * mantissa * 10^(exp_sign * exponent - rh_digits)
         *  @throws std::runtime_error if the token is not @c tk_f64,  its text is empty,
         *          nothing follows the leading sign,  a second decimal point appears,
         *          an unexpected char appears,  or the exponent has no digits.
         *
         *  @note the mantissa is accumulated in a 64-bit integer;
         *        overflow from very long digit strings is not detected.
         **/
        template <typename CharT>
        double
        token<CharT>::f64_value() const {
            if (tk_type_ != tokentype::tk_f64) {
                throw (std::runtime_error
                       (tostr("token::f64_value",
                              ": token with type tk found where tk_f64 expected",
                              xtag("tk", tk_type_))));
            }
            if (text_.empty()) {
                throw (std::runtime_error
                       (tostr("token::f64_value",
                              ": unexpected empty input string for tk_f64 token")));
            }
            int sign = 1;
            /* integer representing denormalized unsigned mantissa
             * (mantissa scaled by smallest power of 10 sufficient to make
             *  it an integer)
             */
            std::int64_t mantissa = 0;
            /* counts #of digits to the right of decimal point '.' */
            int rh_digits = 0;
            /* sign of exponent */
            int exp_sign = 1;
            /* value of exponent = integer to the right of 'e' or 'E' */
            int exponent = 0;
            /* floating-point value will represent
             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
             */
            {
                auto ix = text_.begin();
                auto end_ix = text_.end();
                /* safe to dereference: text_ is known to be non-empty here */
                CharT lead = *ix;
                if (lead == '+') {
                    ++ix;
                } else if (lead == '-') {
                    sign = -1;
                    ++ix;
                }
                if (ix == end_ix) {
                    throw (std::runtime_error
                           (tostr("token::f64_value",
                                  ": input text found where at least one digit expected",
                                  xtag("text", text_))));
                }
                /* true iff decimal point '.' present in mantissa */
                bool have_decimal_point = false;
                /* counts number of digits in mantissa
                 * (both before and after, but not including, any decimal point)
                 */
                int m_digits = 0;
                /* digits to the left of decimal point */
                int lh_digits = 0;
                /* loop over mantissa digits */
                for (; ix != end_ix; ++ix) {
                    CharT ch = *ix;
                    if (ch == '.') {
                        if (have_decimal_point) {
                            throw (std::runtime_error
                                   (tostr("token::f64_value",
                                          ": input text found where at most one decimal point expected",
                                          xtag("text", text_))));
                        }
                        have_decimal_point = true;
                        lh_digits = m_digits;
                    } else if ((ch >= '0') && (ch <= '9')) {
                        mantissa *= 10;
                        mantissa += (ch - '0');
                        ++m_digits;
                    } else if (ch == 'e' || ch == 'E') {
                        break; // done with mantissa
                    } else {
                        /* report against this function, not i64_value */
                        throw (std::runtime_error
                               (tostr("token::f64_value",
                                      ": unexpected char ch in floating-point token",
                                      xtag("ch", ch))));
                    }
                }
                if (have_decimal_point)
                    rh_digits = m_digits - lh_digits;
                if (ix != end_ix) {
                    /* continue to read exponent */
                    /* skip e|E */
                    ++ix;
                    if (ix == end_ix) {
                        throw (std::runtime_error
                               (tostr("token::f64_value",
                                      ": on input text, expect at least one digit following exponent marker e|E",
                                      xtag("text", text_))));
                    }
                    CharT exp_lead = *ix;
                    if (exp_lead == '+') {
                        ++ix; /*skip*/
                    } else if (exp_lead == '-') {
                        exp_sign = -1;
                        ++ix;
                    }
                    /* a bare sign after e|E (e.g. "1e+") still has no exponent digits */
                    if (ix == end_ix) {
                        throw (std::runtime_error
                               (tostr("token::f64_value",
                                      ": on input text, expect at least one digit following exponent marker e|E",
                                      xtag("text", text_))));
                    }
                    for (; ix != end_ix; ++ix) {
                        CharT ch = *ix;
                        if ((ch >= '0') && (ch <= '9')) {
                            exponent *= 10;
                            exponent += (ch - '0');
                        } else {
                            throw (std::runtime_error
                                   (tostr("token::f64_value",
                                          "; on input text, expect only digits following"
                                          " (possibly signed) exponenct marker",
                                          xtag("text", text_))));
                        }
                    }
                }
            }
            /* floating-point value will represent
             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
             *
             * convert mantissa to double before applying the sign,
             * so that input such as "-0.0" yields negative zero
             */
            double mantissa_f64 = sign * static_cast<double>(mantissa);
 #ifdef OBSOLETE_DEBUG
            std::cerr << xtag("text", text_)
                      << xtag("rh_digits", rh_digits)
                      << xtag("mantissa_f64", mantissa_f64)
                      << xtag("exp_sign", exp_sign)
                      << xtag("exponent", exponent)
                      << std::endl;
 #endif
            double retval = (mantissa_f64
                             * detail::pow10((exp_sign * exponent)
                                             - rh_digits));
            return retval;
        } /*f64_value*/
        /** @brief write this token to @p os in the form @c <token ...> ,
         *  with the token's type and text as tagged fields.
         **/
        template <typename CharT>
        void
        token<CharT>::print(std::ostream & os) const
        {
            os << "<token";
            os << xtag("type", tk_type_);
            os << xtag("text", text_);
            os << ">";
        } /*print*/
        /** @brief stream inserter for tokens;  forwards to @c token::print **/
        template <typename CharT>
        inline std::ostream &
        operator<< (std::ostream & os, const token<CharT> & tk) {
            tk.print(os);
            return os;
        }
    } /*Namespace scm*/
 } /*namespace xo*/
 /* end token.hpp */
--- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp
+++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp
@ -0,0 +1,775 @@
 /* file tokenizer.hpp
 *
 * author: Roland Conybeare, Jul 2024
 */
 #pragma once
 #include "token.hpp"
 #include "span.hpp"
 #include "xo/indentlog/scope.hpp"
 #include <cassert>
 namespace xo {
    namespace scm {
        /** @class tokenizer
         *  @brief Break a span of characters into schematica tokens.
         *
         *  Typical driver loop:
         *  @code
         *    using tokenizer_type = tokenizer<char>;
         *    using span_type = tokenizer_type::span_type;
         *
         *    tokenizer_type tkz;
         *    span_type input = ...;
         *
         *    while (!input.empty()) {
         *        auto res = tkz.scan(input);
         *        const auto & tk = res.first;
         *
         *        // do something with tk if tk.is_valid()
         *
         *        input = input.after_prefix(res.second);
         *    }
         *
         *    if (endofinput) {
         *        auto tk = tkz.notify_eof();
         *
         *        // do something with tk if tk.is_valid()
         *    }
         *
         *    // expect !tkz.has_prefix()
         *
         *  @endcode
         **/
        template <typename CharT>
        class tokenizer {
        public:
            /** token representation produced by this tokenizer **/
            using token_type = token<CharT>;
            /** read-only view over a contiguous run of input characters **/
            using span_type = span<const CharT>;
            /** result of a scan: {token, span of input consumed} **/
            using scan_result = std::pair<token_type, span_type>;

            tokenizer() = default;

            /** True iff @p ch is a whitespace char.
             *
             *  Whitespace never belongs to a token; it may not appear inside
             *  a symbol or string token, and its appearance forces completion
             *  of any preceding token.
             **/
            bool is_whitespace(CharT ch) const;

            /** True iff @p ch is a punctuation char.
             *
             *  Such chars may not appear inside a symbol token: they force
             *  completion of a preceding token, and begin a new token
             *  with themselves.
             **/
            bool is_1char_punctuation(CharT ch) const;

            /** More relaxed version of is_1char_punctuation.
             *
             *  True for chars that may not appear inside a symbol token,
             *  but that may combine with the next char to form a token.
             **/
            bool is_2char_punctuation(CharT ch) const;

            /** True iff tokenizer holds the stored prefix of a
             *  possibly-incomplete token.
             **/
            bool has_prefix() const { return !prefix_.empty(); }

            /** Assemble a token from the text @p token_text **/
            token_type assemble_token(const span_type & token_text) const;

            /** Scan for the next token in @p input.
             *
             *  Note the tokenizer can consume input (e.g. whitespace)
             *  without completing a token.
             *
             *  @return {parsed token, consumed span}
             **/
            scan_result scan(const span_type & input);

            /** With @p eof false: same as scan(input).
             *  With @p eof true: if scan(input) does not report a token,
             *  report the result of notify_eof() instead.
             **/
            scan_result scan2(const span_type & input, bool eof);

            /** Notify end of input; resolves any stored input **/
            token_type notify_eof();

        private:
            /** Partial token accumulates here.
             *
             *  This happens when input sent to @ref tokenizer::scan
             *  ends without a determinate token boundary.
             *
             *  NOTE(review): stored as std::string whatever CharT is.
             **/
            std::string prefix_;
        }; /*tokenizer*/
        /** Report whether @p ch is one of: space, tab, newline, carriage return. **/
        template <typename CharT>
        bool
        tokenizer<CharT>::is_whitespace(CharT ch) const {
            const bool blank = ((ch == ' ') || (ch == '\t'));
            const bool eol = ((ch == '\n') || (ch == '\r'));

            return (blank || eol);
        }
        /** Report whether @p ch always forms a token by itself.
         *
         *  Yes for:  < > ( ) [ ] { } , ; =
         *
         *  Deliberately no for:
         *  - ':'      can begin assignment token
         *  - '-' '+'  can appear inside f64 token, e.g. 1.23e-9, 1.23e+4
         *  - '.'      can appear inside f64 token, e.g. 1.23
         *  - '*'      allowed in symbol
         *  - '/'      for symmetry with + and -
         **/
        template <typename CharT>
        bool
        tokenizer<CharT>::is_1char_punctuation(CharT ch) const {
            switch(ch) {
            case '<': case '>':
            case '(': case ')':
            case '[': case ']':
            case '{': case '}':
            case ',': case ';':
            case '=':
                return true;
            default:
                /* includes : - + * / . (see above) */
                return false;
            }
        }
        /** Report whether @p ch can open a two-character punctuation token.
         *  Only ':' qualifies (it can begin ":=").
         **/
        template <typename CharT>
        bool
        tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
            return (ch == ':');
        }
        /** Classify the complete token text @p token_text and build a token from it.
         *
         *  The first character selects the token family:
         *  - sign, '.', or digit : i64 / f64 literal, lone '.', or lone '+' / '-'
         *  - '*' or '/'          : operator (only when token is exactly one char)
         *  - '"'                 : string literal; escapes \\ \n \t \r \" are decoded
         *  - letter              : symbol, or keyword (type def lambda if let in end)
         *  - other punctuation   : the matching punctuation token
         *
         *  Token text is retained for i64, f64, symbol and string tokens;
         *  for string tokens it holds the decoded contents without the quotes.
         *
         *  @param token_text  text of exactly one token, no surrounding whitespace
         *  @return assembled token
         *  @throws std::runtime_error if @p token_text is empty or is not a
         *          well-formed token
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
        {
            /* NOTE(review): debug logging is unconditionally enabled here,
             * and the global log style is overwritten on every call.
             */
            constexpr bool c_debug_flag = true;

            /* literal|pretty|streamlined */
            log_config::style = function_style::streamlined;

            scope log(XO_DEBUG(c_debug_flag));
            log && log(xtag("token_text", token_text));

            tokentype tk_type = tokentype::tk_invalid;
            std::string tk_text;

            const CharT * tk_start = token_text.lo();
            const CharT * tk_end = token_text.hi();

            const CharT * ix = tk_start;

            /* nothing to classify; also protects the dereference in switch below */
            if (tk_start == tk_end) {
                throw std::runtime_error(tostr("tokenizer::assemble_token",
                                               ": empty token text"));
            }

            /* switch here applies to the first character in a token */
            switch (*ix) {
            case '-':
            case '+':
                if (token_text.size() == 1) {
                    /* standalone '+' or '-' */
                    if (*ix == '+')
                        tk_type = tokentype::tk_plus;
                    else if(*ix == '-')
                        tk_type = tokentype::tk_minus;
                }

                /* fall through to numeric literal code below.
                 * A lone sign sets none of the number flags there,
                 * so tk_plus / tk_minus chosen above is preserved.
                 */
                [[fallthrough]];
            case '.':
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                /* examples of valid floating-point numbers:
                 *   .0
                 *   1e0
                 *   1e
                 *   0.
                 *   +1e0
                 *   -1e0
                 *   +1E+2
                 *   -1E+2
                 *   -0.123e-10
                 * non-examples:
                 *   .
                 *   -
                 *   +
                 *   e0
                 *   .e0
                 *   -.e-0
                 *   +.e+0
                 *
                 * in particular: to be recognized as a number,
                 * must contain at least one digit
                 */

                log && log("possible number-token");

                /* true if initial sign -/+ encountered */
                bool sign_flag = false;
                /* true if '.' encountered */
                bool period_flag = false;
                /* true if 'e' | 'E' encountered.
                 */
                bool exponent_flag = false;
                /* true when sign '-' | '+' precedes exponent digits */
                bool exponent_sign_flag = false;
                /* true when at least one digit follows exponent marker */
                bool exponent_digit_flag = false;
                /* true if at least one digit encountered before exponent marker */
                bool number_flag = false;

                /* token will be one of: {i64, f64, dot}: */

                for(; ix != tk_end; ++ix) {
                    if((*ix == '-') || (*ix == '+')) {
                        /* sign allowed:
                         * 1. before period and before first digit
                         * 2. once, directly after exponent marker
                         */
                        if (!period_flag && !number_flag && !sign_flag) {
                            sign_flag = true;
                        } else if (exponent_flag
                                   && !exponent_digit_flag
                                   && !exponent_sign_flag)
                        {
                            /* !exponent_sign_flag rejects input like 1e+-5 */
                            exponent_sign_flag = true;
                        } else {
                            throw std::runtime_error
                                (tostr("tokenizer::assemble_token",
                                       ": improperly placed sign indicator",
                                       xtag("pos", ix - tk_start),
                                       xtag("char", *ix)));
                        }
                    } else if(*ix == '.') {
                        if (period_flag) {
                            throw (std::runtime_error
                                   (tostr("tokenizer::assemble_token",
                                          ": duplicate decimal point",
                                          xtag("pos", ix - tk_start),
                                          xtag("char", *ix))));
                        }

                        if (exponent_flag) {
                            /* rejects input like 1e5.3 */
                            throw (std::runtime_error
                                   (tostr("tokenizer::assemble_token",
                                          ": decimal point not allowed in exponent",
                                          xtag("pos", ix - tk_start),
                                          xtag("char", *ix))));
                        }

                        period_flag = true;
                    } else if((*ix == 'e') || (*ix == 'E')) {
                        if (exponent_flag) {
                            throw (std::runtime_error
                                   (tostr("tokenizer::assemble_token",
                                          ": duplicate exponent marker",
                                          xtag("pos", ix - tk_start),
                                          xtag("char", *ix))));
                        }

                        exponent_flag = true;
                    } else if((*ix >= '0') && (*ix <= '9')) {
                        /* explicit range test instead of isdigit():
                         * well-defined for negative CharT values, locale-independent
                         */
                        if (exponent_flag) {
                            /* digit belongs to exponent */
                            exponent_digit_flag = true;
                        } else {
                            /* need digit before exponent to recognize as number */
                            number_flag = true;
                        }
                    } else {
                        /* invalid input */
                        throw (std::runtime_error
                               (tostr("tokenizer::assemble_token",
                                      ": unexpected character in numeric constant",
                                      xtag("pos", ix - tk_start),
                                      xtag("char", *ix))));
                    }
                }

                if (number_flag) {
                    if (period_flag || exponent_flag) {
                        tk_type = tokentype::tk_f64;
                    } else {
                        tk_type = tokentype::tk_i64;
                    }
                } else if (period_flag && !exponent_flag && !sign_flag) {
                    /* exactly "." -- a signed period like "+." is not a dot token */
                    tk_type = tokentype::tk_dot;
                } else {
                    /* not a valid number;
                     * tk_type still holds tk_plus/tk_minus for a lone sign,
                     * otherwise tk_invalid (reported below)
                     */
                }

                log && log(xtag("sign_flag", sign_flag));
                log && log(xtag("period_flag", period_flag),
                           xtag("exponent_flag", exponent_flag),
                           xtag("exponent_sign_flag", exponent_sign_flag),
                           xtag("number_flag", number_flag));
                log && log(xtag("tk_type", tk_type));

                break;
            }

            case '*':
                if (token_text.size() == 1) {
                    /* standalone '*' */
                    tk_type = tokentype::tk_star;
                    ++ix;
                } else {
                    /* '*' isn't punctuation -- but may allow appearance in a longer token
                     *
                     * thinking that x*y is a symbol with an embedded '*' character;
                     * in particular want to support kebab-case symbols like 'foo-config'
                     *
                     * (a longer token that *begins* with '*' is rejected below)
                     */
                }
                break;

            case '/':
                if (token_text.size() == 1) {
                    /* standalone '/' */
                    tk_type = tokentype::tk_slash;
                    ++ix;
                }
                break;

            case '"':
            {
                log && log("recognize string-token");

                tk_type = tokentype::tk_string;

                tk_text.reserve(tk_end - tk_start);

                /* true once the closing (unescaped) " char has been seen */
                bool terminated = false;

                ++ix; /*skip initial " char*/

                for (; ix != tk_end; ++ix) {
                    log && log(xtag("*ix", *ix));

                    switch(*ix) {
                    case '"':
                        terminated = true;
                        /* skip final " char, don't capture */
                        ++ix;
                        break;
                    case '\\':
                        /* skip escape char, don't capture */
                        ++ix;

                        if (ix == tk_end) {
                            throw std::runtime_error
                                (tostr("tokenizer::assemble_token",
                                       ": malformed string literal",
                                       xtag("input", std::string_view(token_text.lo(),
                                                                      token_text.hi()))));
                        }

                        switch(*ix) {
                        case '\\':
                            log && log(xtag("*ix", *ix), xtag("escaped", "t"));
                            tk_text.push_back(*ix);
                            break;
                        case 'n':
                            log && log(xtag("*ix", *ix), xtag("newline", "t"));
                            tk_text.push_back('\n');
                            break;
                        case 't':
                            log && log(xtag("*ix", *ix), xtag("tab", "t"));
                            tk_text.push_back('\t');
                            break;
                        case 'r':
                            log && log(xtag("*ix", *ix), xtag("cr", "t"));
                            tk_text.push_back('\r');
                            break;
                        case '"':
                            log && log(xtag("*ix", *ix), xtag("quote", "t"));
                            tk_text.push_back('"');
                            break;
                        default:
                            throw std::runtime_error
                                (tostr("tokenizer::assemble_token",
                                       ": unexpected \\-escaped char",
                                       xtag("char", *ix)));
                        }
                        break;
                    default:
                        tk_text.push_back(*ix);
                        break;
                    }

                    if (terminated)
                        break;
                }

                /* reject both:
                 * - literal that never reached its closing " char
                 * - text trailing after the closing " char
                 */
                if (!terminated || (ix != tk_end)) {
                    throw std::runtime_error
                        (tostr("tokenizer::assemble_token",
                               ": expected \" to end string literal",
                               xtag("input", std::string_view(token_text.lo(),
                                                              token_text.hi()))));
                }

                log && log(tostr("tokenizer::assemble_token",
                                 xtag("tk_text", tk_text)));

                break;
            }

            case 'a': case 'A':
            case 'b': case 'B':
            case 'c': case 'C':
            case 'd': case 'D':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'g': case 'G':
            case 'h': case 'H':
            case 'i': case 'I':
            case 'j': case 'J':
            case 'k': case 'K':
            case 'l': case 'L':
            case 'm': case 'M':
            case 'n': case 'N':
            case 'o': case 'O':
            case 'p': case 'P':
            case 'q': case 'Q':
            case 'r': case 'R':
            case 's': case 'S':
            case 't': case 'T':
            case 'u': case 'U':
            case 'v': case 'V':
            case 'w': case 'W':
            case 'x': case 'X':
            case 'y': case 'Y':
            case 'z': case 'Z':
            {
                /* symbol/identifier must begin with a letter?
                 * we want to accept some other chars too.
                 * specifically want to allow identifiers:
                 *   this-is-the-way
                 *   this+is+also+the+way
                 *   how/much/is/that/doggy
                 *   put*an*asterisk*in*that
                 *   something%special%
                 *
                 * like pure lisp,  we don't allow:
                 * - identifier beginning with digit
                 * - period .
                 *
                 * unlike pure lisp,  we don't allow anywhere in a symbol:
                 * - colon     :
                 * - semicolon ;
                 * - comma     ,
                 *
                 * also we don't allow symbols to begin with special chars
                 */

                tk_type = tokentype::tk_symbol;
                break;
            }

            case '<':
                tk_type = tokentype::tk_leftangle;
                ++ix;
                break;

            case '>':
                tk_type = tokentype::tk_rightangle;
                ++ix;
                break;

            case '(':
                tk_type = tokentype::tk_leftparen;
                ++ix;
                break;

            case ')':
                tk_type = tokentype::tk_rightparen;
                ++ix;
                break;

            case '[':
                tk_type = tokentype::tk_leftbracket;
                ++ix;
                break;

            case ']':
                tk_type = tokentype::tk_rightbracket;
                ++ix;
                break;

            case '{':
                tk_type = tokentype::tk_leftbrace;
                ++ix;
                break;

            case '}':
                tk_type = tokentype::tk_rightbrace;
                ++ix;
                break;

            case ',':
                tk_type = tokentype::tk_comma;
                ++ix;
                break;

            case ';':
                tk_type = tokentype::tk_semicolon;
                ++ix;
                break;

            case ':':
            {
                log && log("colon or assignment token");

                /* only look at the following char when it belongs to this token */
                if (((ix + 1) != tk_end) && (*(ix + 1) == '=')) {
                    tk_type = tokentype::tk_assign;
                    ++ix;
                    ++ix;
                } else {
                    tk_type = tokentype::tk_colon;
                    ++ix;
                }
                break;
            }

            case '=':
                tk_type = tokentype::tk_singleassign;
                ++ix;
                break;

            default:
                break;
            }

            if (tk_type == tokentype::tk_invalid) {
                /* ix may sit at tk_end here (numeric path consumed the whole token);
                 * report the first char of the token in that case
                 * rather than reading past the end of the span
                 */
                const CharT * culprit = ((ix != tk_end) ? ix : tk_start);

                throw std::runtime_error(tostr("tokenizer::assemble_token",
                                               ": unexpected input x",
                                               xtag("x", *culprit)));
            }

            if ((tk_type == tokentype::tk_i64)
                || (tk_type == tokentype::tk_f64)
                || (tk_type == tokentype::tk_symbol))
            {
                /* re-parse in token::i64_value() / token::f64_value() */
                tk_text = std::string(tk_start, tk_end);
            } else if (tk_type == tokentype::tk_string) {
                ; /* nothing to do here -- desired tk_text already constructed */
            }

            if (tk_type == tokentype::tk_symbol) {
                /* check for keywords */

                bool keep_text = false;

                if (tk_text == "type") {
                    tk_type = tokentype::tk_type;
                } else if (tk_text == "def") {
                    tk_type = tokentype::tk_def;
                } else if (tk_text == "lambda") {
                    tk_type = tokentype::tk_lambda;
                } else if (tk_text == "if") {
                    tk_type = tokentype::tk_if;
                } else if (tk_text == "let") {
                    tk_type = tokentype::tk_let;
                } else if (tk_text == "in") {
                    tk_type = tokentype::tk_in;
                } else if (tk_text == "end") {
                    tk_type = tokentype::tk_end;
                } else {
                    /* keep as symbol */
                    keep_text = true;
                }

                /* keyword tokens are identified by type alone */
                if (!keep_text)
                    tk_text.clear();
            }

            return token_type(tk_type, std::move(tk_text));
        } /*assemble_token*/
        /** Scan @p input for the next token.
         *
         *  Skips leading whitespace, finds the extent of the next token, then
         *  either assembles it (token boundary seen within @p input) or stores
         *  it in prefix_ (input ended before the boundary could be determined).
         *
         *  NOTE(review): stored prefix_ text is not joined with text from later
         *  scan() calls; once prefix_ is non-empty, scan() reports tk_invalid
         *  until notify_eof() clears it.
         *
         *  @param input  characters to scan
         *  @return {token, consumed span}.  Token is invalid when only whitespace
         *          was consumed, or when the token is incomplete and was stored.
         *  @throws std::runtime_error on raw newline/cr inside a string literal,
         *          or when assemble_token() rejects the token text
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::scan(const span_type & input) -> scan_result
        {
            constexpr bool c_debug_flag = true;
            scope log(XO_DEBUG(c_debug_flag));

            log && log(xtag("input", input));

            const CharT * ix = input.lo();

            /* skip whitespace.
             * bound is tested first, so *ix is never read at input.hi()
             */
            while ((ix != input.hi()) && is_whitespace(*ix))
                ++ix;

            if(ix == input.hi()) {
                /* no-op: input was empty or all whitespace */
                return {
                    token_type::invalid(),
                    input.prefix_upto(ix)
                };
            }

            /* here: *ix is not whitespace */

            auto whitespace = input.prefix_upto(ix);

            log && log(xtag("whitespace.size", whitespace.size()));

            /* tk_start points to beginning of token
             * (after any whitespace)
             */
            const CharT * tk_start = ix;

            if (is_1char_punctuation(*ix)) {
                /* 1-character token */
                ++ix;
            } else if (is_2char_punctuation(*ix)) {
                ++ix;

                if (ix == input.hi()) {
                    /* need more input to know if/when token complete */
                    this->prefix_ += std::string(tk_start, input.hi());

                    log && log(xtag("captured-prefix", this->prefix_));
                } else {
                    CharT ch2 = *ix;

                    if ((ch2 == '=') || (ch2 == ':')) {
                        /* next char can belong to same token (":=" or "::").
                         * NOTE(review): assemble_token reports "::" as tk_colon.
                         */
                        ++ix;
                    } else {
                        /* treat as 1 char punctuation.
                         * next char (letter, digit, '(', '"', space, ..)
                         * is left for the following scan
                         */
                        ;
                    }
                }
            } else if (*ix == '"') {
                bool complete_flag = false;

                /* 1. embedded space/tab allowed in string literal.
                 * 2. embedded newline/cr not allowed.
                 */

                /* true when preceding char was an unescaped backslash.
                 * Tracking this (instead of just the previous char) lets
                 * "a\\" terminate at the quote following an escaped backslash
                 */
                bool escaped = false;

                ++ix;

                for (; ix != input.hi(); ++ix) {
                    const CharT ch = *ix;

                    if ((ch == '\n') || (ch == '\r')) {
                        throw std::runtime_error
                            (tostr("tokenizer::scan",
                                   ": must use \\n or \\r to encode newline/cr in"
                                   " string literal"));
                    }

                    if (escaped) {
                        /* char is consumed by the escape, whatever it is */
                        escaped = false;
                    } else if (ch == '\\') {
                        escaped = true;
                    } else if (ch == '"') {
                        /* unescaped " char ends literal */
                        ++ix;  /* include terminating " for assemble_token */
                        complete_flag = true;
                        break;
                    }
                }

                if (!complete_flag) {
                    /* need more input to know if/when token complete */
                    this->prefix_ += std::string(tk_start, input.hi());

                    log && log(xtag("captured-prefix", this->prefix_));
                }
            } else {
                /* scan until:
                 * - whitespace
                 * - punctuation
                 */
                for (; ix != input.hi(); ++ix) {
                    if (is_whitespace(*ix)
                        || is_1char_punctuation(*ix)
                        || is_2char_punctuation(*ix))
                    {
                        break;
                    }
                }

                if (ix == input.hi()) {
                    /* need more input to know if/when token complete */
                    this->prefix_ += std::string(tk_start, input.hi());

                    log && log(xtag("captured-prefix", this->prefix_));
                }
            }

            auto token_span = input.after_prefix(whitespace).prefix_upto(ix);

            /* with a stored prefix the token is not yet known to be complete */
            token_type tk
                = (this->prefix_.empty()
                   ? assemble_token(token_span)
                   : token_type(tokentype::tk_invalid));

            return scan_result
                { tk, input.prefix(whitespace.size() + token_span.size()) };
        } /*scan*/
        /** Scan @p input like scan(); additionally, when @p eof is true and
         *  scan() yields no valid token, resolve stored input via notify_eof()
         *  and report all of @p input as consumed.
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::scan2(const span_type & input, bool eof) -> scan_result {
            scan_result result = this->scan(input);

            /* ordinary case: more input may follow, or a token was found */
            if (!eof || result.first.is_valid())
                return result;

            result.first = this->notify_eof();

            /* always consume remainder of input here.
             * ambiguous prefix can represent at most one token
             */
            result.second = input;

            return result;
        }
        /** Notify tokenizer that input has ended; resolve any stored prefix.
         *
         *  The stored prefix is always discarded, including when
         *  assemble_token() throws, so the tokenizer is never left holding
         *  text that has already been rejected.
         *
         *  @return token assembled from the stored prefix;
         *          invalid token when nothing was stored
         *  @throws std::runtime_error (from assemble_token) when the stored
         *          prefix is not a well-formed token
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::notify_eof() -> token_type {
            constexpr bool c_debug_flag = true;
            scope log(XO_DEBUG(c_debug_flag));

            if (this->prefix_.empty())
                return token_type(tokentype::tk_invalid);

            /* move stored text into a local before assembling:
             * prefix_ is then empty whether or not assemble_token() throws
             */
            std::string pending;
            pending.swap(this->prefix_);

            return assemble_token(span_type(&pending[0],
                                            &pending[0] + pending.size()));
        } /*notify_eof*/
    } /*namespace scm*/
 } /*namespace xo*/
 /* end tokenizer.hpp */
--- a/xo-tokenizer/include/xo/tokenizer/tokentype.hpp
+++ b/xo-tokenizer/include/xo/tokenizer/tokentype.hpp
@ -0,0 +1,158 @@
 /** @file tokentype.hpp
 *
 *  author: Roland Conybeare, Jul 2024
 **/
 #pragma once
 #include "xo/indentlog/print/tag.hpp" // for STRINGIFY
 #include <ostream>
 namespace xo {
    namespace scm {
        /** @enum tokentype
         *  @brief enum to identify different schematica input token types
         *
         *  Schematica code examples:
         *
         *    type point :: { xcoord : f64, ycoord: f64 };
         *    type matrix :: array<double, 2>;  // 2-d array
         *
         *    decl hypot(x : f64, y : f64) -> f64;
         *
         *    def hypot(x : f64, y : f64) {
         *      let
         *        x2 = (x * x);
         *        y2 = (y * y);
         *        hypot2 = (x2 + y2);
         *      in
         *        sqrt(hypot2);
         *    };
         *
         *    def someconst 4;
         *
         *    def foo(v : vec<i32>) {
         *      def (pi : f64) = 3.1415926;
         *      def (h : (f64,f64) -> f64) = hypot;
         *
         *      h = hypot3;
         *    };
         *
         *    def matrixproduct(x : matrix, y : matrix) {
         *      [i,j : x.row(i) * y.col(j)];
         *    };
         **/
        enum class tokentype {
            /** sentinel value; the only enumerator with a negative value **/
            tk_invalid = -1,
            /** an integer constant (signed 64-bit integer) **/
            tk_i64,
            /** a 64-bit floating-point constant **/
            tk_f64,
            /** a string literal **/
            tk_string,
            /** a symbol **/
            tk_symbol,
            /** left-hand parenthesis '(' **/
            tk_leftparen,
            /** right-hand parenthesis ')' **/
            tk_rightparen,
            /** left-hand bracket '[' **/
            tk_leftbracket,
            /** right-hand bracket ']' **/
            tk_rightbracket,
            /** left-hand brace '{' **/
            tk_leftbrace,
            /** right-hand brace '}' **/
            tk_rightbrace,
            /** left-hand angle bracket '<' **/
            tk_leftangle,
            /** right-hand angle bracket '>' **/
            tk_rightangle,
            /** dot '.' **/
            tk_dot,
            /** comma ',' **/
            tk_comma,
            /** colon ':' **/
            tk_colon,
            /** double-colon '::' **/
            tk_doublecolon,
            /** semi-colon ';' **/
            tk_semicolon,
            /** single assignment '=' **/
            tk_singleassign,
            /** assignment ':=' **/
            tk_assign,
            /** yields '->' **/
            tk_yields,
            /** note: operators are not treated as punctuation;
             *  'do-always' is a legal variable name,
             *  as are 'maybe*2', 'maybe+1', 'path/to/foo'
             **/
            /** operator '+' **/
            tk_plus,
            /** operator '-' **/
            tk_minus,
            /** operator '*' **/
            tk_star,
            /** operator '/' **/
            tk_slash,
            /** keyword 'type' **/
            tk_type,
            /** keyword 'def' **/
            tk_def,
            /** keyword 'lambda' **/
            tk_lambda,
            /** keyword 'if' **/
            tk_if,
            /** keyword 'let' **/
            tk_let,
            /** keyword 'in' **/
            tk_in,
            /** keyword 'end' **/
            tk_end,
            /** not a token type: comes last, counts the entries tk_i64 .. tk_end **/
            n_tokentype /* comes last, counts #of entries */
        }; /*tokentype*/
        /** Printable description of @p tk_type, as a C string.
         *  Declared here; the definition lives in a separate translation unit.
         **/
        extern char const *
        tokentype_descr(tokentype tk_type);
        /** Stream-insertion for token types: writes tokentype_descr(@p tk_type)
         *  on @p os and returns the stream.
         **/
        inline std::ostream &
        operator<< (std::ostream & os, tokentype tk_type) {
            return os << tokentype_descr(tk_type);
        }
    } /*namespace scm*/
 } /*namespace xo*/
 /* end tokentype.hpp */
--- a/xo-tokenizer/src/tokenizer/CMakeLists.txt
+++ b/xo-tokenizer/src/tokenizer/CMakeLists.txt
@ -0,0 +1,14 @@
 # tokenizer/CMakeLists.txt
 #
 # Builds the xo_tokenizer shared library from the sources listed below.
 # The xo_* commands are not defined in this file
 # (presumably supplied by the xo-cmake macros included from the top-level
 # CMakeLists.txt -- TODO confirm).

 # library target name
 set(SELF_LIB xo_tokenizer)
 # translation units compiled into the library
 set(SELF_SRCS
    tokentype.cpp
    token.cpp)

 # NOTE(review): arguments look like: target, export-set name, version,
 # a literal 1 (meaning not visible here), sources -- confirm against
 # the macro definition
 xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS})
 # only the indentlog dependency is active; the others are commented out
 #xo_dependency(${SELF_LIB} refcnt)
 xo_dependency(${SELF_LIB} indentlog)
 #xo_dependency(${SELF_LIB} subsys)
 #xo_boost_dependency(${SELF_LIB})
 # end CMakeLists.txt
--- a/xo-tokenizer/src/tokenizer/token.cpp
+++ b/xo-tokenizer/src/tokenizer/token.cpp
@ -0,0 +1,9 @@
 /** @file token.cpp
 *
 *  author: Roland Conybeare
 **/
 #include "token.hpp"
 #include "xo/indentlog/print/tag.hpp"
 /** end token.cpp **/
--- a/xo-tokenizer/src/tokenizer/tokentype.cpp
+++ b/xo-tokenizer/src/tokenizer/tokentype.cpp
@ -0,0 +1,67 @@
 /* file tokentype.cpp
 *
 * author: Roland Conybeare
 */
 #include "tokentype.hpp"
 namespace xo {
    namespace scm {
        /** Map token type @p tk_type to a fixed descriptive string.
         *
         *  Ordinary token types yield STRINGIFY(<enumerator>); STRINGIFY is not
         *  defined in this file (presumably it produces the enumerator's
         *  spelling, e.g. "tk_i64" -- confirm against its definition).
         *  The sentinels tk_invalid and n_tokentype yield "?tokentype".
         *  Any value not handled by the switch yields "???".
         *
         *  @param tk_type  token type to describe
         *  @return pointer to a string literal
         **/
        char const *
        tokentype_descr(tokentype tk_type)
        {
            /* one switch arm per describable enumerator */
 #define TOKENTYPE_DESCR_CASE(tkname) case tokentype::tkname: return STRINGIFY(tkname)

            switch (tk_type) {
            case tokentype::tk_invalid:
            case tokentype::n_tokentype:
                /* sentinel values share one placeholder description */
                return "?tokentype";

            /* literals + symbols */
            TOKENTYPE_DESCR_CASE(tk_i64);
            TOKENTYPE_DESCR_CASE(tk_f64);
            TOKENTYPE_DESCR_CASE(tk_string);
            TOKENTYPE_DESCR_CASE(tk_symbol);

            /* brackets */
            TOKENTYPE_DESCR_CASE(tk_leftparen);
            TOKENTYPE_DESCR_CASE(tk_rightparen);
            TOKENTYPE_DESCR_CASE(tk_leftbracket);
            TOKENTYPE_DESCR_CASE(tk_rightbracket);
            TOKENTYPE_DESCR_CASE(tk_leftbrace);
            TOKENTYPE_DESCR_CASE(tk_rightbrace);
            TOKENTYPE_DESCR_CASE(tk_leftangle);
            TOKENTYPE_DESCR_CASE(tk_rightangle);

            /* punctuation */
            TOKENTYPE_DESCR_CASE(tk_dot);
            TOKENTYPE_DESCR_CASE(tk_comma);
            TOKENTYPE_DESCR_CASE(tk_colon);
            TOKENTYPE_DESCR_CASE(tk_doublecolon);
            TOKENTYPE_DESCR_CASE(tk_semicolon);
            TOKENTYPE_DESCR_CASE(tk_singleassign);
            TOKENTYPE_DESCR_CASE(tk_assign);
            TOKENTYPE_DESCR_CASE(tk_yields);

            /* operators */
            TOKENTYPE_DESCR_CASE(tk_plus);
            TOKENTYPE_DESCR_CASE(tk_minus);
            TOKENTYPE_DESCR_CASE(tk_star);
            TOKENTYPE_DESCR_CASE(tk_slash);

            /* keywords */
            TOKENTYPE_DESCR_CASE(tk_type);
            TOKENTYPE_DESCR_CASE(tk_def);
            TOKENTYPE_DESCR_CASE(tk_lambda);
            TOKENTYPE_DESCR_CASE(tk_if);
            TOKENTYPE_DESCR_CASE(tk_let);
            TOKENTYPE_DESCR_CASE(tk_in);
            TOKENTYPE_DESCR_CASE(tk_end);
            }

 #undef TOKENTYPE_DESCR_CASE

            /* reached only for a value that no switch arm above handles */
            return "???";
        } /*tokentype_descr*/
    } /*namespace scm*/
 } /*namespace xo*/
 /* end tokentype.cpp */
--- a/xo-tokenizer/utest/CMakeLists.txt
+++ b/xo-tokenizer/utest/CMakeLists.txt
@ -0,0 +1,13 @@
 # build unittest tokenizer/utest
 #
 # The xo_* commands used here are project macros that are not defined in this file.
 # name of the unit-test executable
 set(SELF_EXECUTABLE_NAME utest.tokenizer)
 # test sources; tokenizer_utest_main.cpp defines CATCH_CONFIG_MAIN before including
 # catch2/catch.hpp, the other two files hold the TEST_CASEs
 set(SELF_SOURCE_FILES
    tokenizer_utest_main.cpp
    tokenizer.test.cpp
    token.test.cpp)
 xo_add_utest_executable(${SELF_EXECUTABLE_NAME} ${SELF_SOURCE_FILES})
 # the library under test (xo_tokenizer is the SELF_LIB name set in src/tokenizer/CMakeLists.txt)
 xo_self_dependency(${SELF_EXECUTABLE_NAME} xo_tokenizer)
 # Catch2 test framework: names package Catch2 and target Catch2::Catch2
 xo_external_target_dependency(${SELF_EXECUTABLE_NAME} Catch2 Catch2::Catch2)
 # end CMakeLists.txt
--- a/xo-tokenizer/utest/token.test.cpp
+++ b/xo-tokenizer/utest/token.test.cpp
@ -0,0 +1,260 @@
 /* file token.test.cpp
 *
 * author: Roland Conybeare
 */
 #include "xo/tokenizer/token.hpp"
 #include <catch2/catch.hpp>
 #include <memory>
 namespace xo {
    using token = xo::scm::token<char>;
    using xo::scm::tokentype;
    namespace ut {
        namespace {
            /** One scenario for the "parse-i64" test: text for a tk_i64 token,
             *  plus the expected outcome of calling token::i64_value() on it.
             **/
            struct testcase_i64 {
                /** text stored in the token under test **/
                std::string text_;
                /** true if i64_value() is expected to throw a std::exception **/
                bool expect_throw_;
                /** expected result of i64_value(); checked only when nothing is thrown **/
                std::int64_t expected_;
            };

            /* Kept in an unnamed namespace (internal linkage) and given its own name,
             * so it cannot be confused with, or collide with, the s_testcase_v tables
             * used by the other table-driven tests.
             */
            std::vector<testcase_i64> s_testcase_i64_v = {
                /* malformed: empty text, or sign without digits */
                {"", true, 0},
                {"0", false, 0},
                {"-", true, 0},
                {"+", true, 0},
                /* signed zero */
                {"-0", false, 0},
                {"+0", false, 0},
                /* one and two digit values, with and without sign */
                {"1", false, 1},
                {"-1", false, -1},
                {"9", false, 9},
                {"-9", false, -9},
                {"12", false, 12},
                {"+12", false, 12},
                {"-12", false, -12},
                {"99", false, 99},
                {"-99", false, -99},
                /* malformed: trailing non-digit */
                {"123x", true, 0},
            };
        } /*namespace*/

        /** Table-driven check of token::i64_value().
         *
         *  For each entry: build a tk_i64 token from the text, then verify that
         *  i64_value() either returns the expected value or throws a
         *  std::exception, as the entry prescribes.
         **/
        TEST_CASE("parse-i64", "[token]") {
            for (std::size_t i_tc = 0, n_tc = s_testcase_i64_v.size(); i_tc < n_tc; ++i_tc) {
                auto const & testcase = s_testcase_i64_v[i_tc];

                /* report both index and offending text when an assertion fails */
                INFO(tostr(xtag("i_tc", i_tc),
                           xtag("text", testcase.text_)));

                token tk(tokentype::tk_i64,
                         testcase.text_);

                REQUIRE(tk.tk_type() == tokentype::tk_i64);

                bool throw_flag = false;

                try {
                    std::int64_t x = tk.i64_value();

                    REQUIRE(x == testcase.expected_);
                } catch (std::exception const &) {
                    /* only the fact that a std::exception was raised matters here */
                    throw_flag = true;
                }

                REQUIRE(throw_flag == testcase.expect_throw_);
            }
        }
        /** A tk_i64 token whose text is just "+" has no digits:
         *  i64_value() must raise a std::exception.
         **/
        TEST_CASE("error-i64", "[token]") {
            token tk(tokentype::tk_i64, "+");

            REQUIRE_THROWS_AS(tk.i64_value(), std::exception);
        }
        namespace {
            /** One scenario for the "parse-f64" test: text for a tk_f64 token,
             *  plus the expected outcome of calling token::f64_value() on it.
             **/
            struct testcase_f64 {
                /** text stored in the token under test **/
                std::string text_;
                /** true if f64_value() is expected to throw a std::exception **/
                bool expect_throw_;
                /** expected result of f64_value(); checked only when nothing is thrown **/
                double expected_;
            };
            /** Scenarios consumed by the "parse-f64" test.
             *  NOTE(review): a few entries repeat earlier ones ("0.0", "01.23", "12.3");
             *  harmless, but redundant.
             **/
            std::vector<testcase_f64> s_testcase_v = {
                /* integer-looking text, plus malformed inputs (empty, bare sign, trailing 'x') */
                {"",     true, 0},
                {"0",    false, 0},
                {"-",    true, 0},
                {"+",    true, 0},
                {"-0",   false, 0},
                {"+0",   false, 0},
                {"1",    false, 1},
                {"-1",   false, -1},
                {"9",    false, 9},
                {"-9",   false, -9},
                {"12",   false, 12},
                {"+12",  false, 12},
                {"-12",  false, -12},
                {"99",   false, 99},
                {"-99",  false, -99},
                {"123x", true, 0},
                /* unsigned fractions, one more digit each time */
                {"0.0",  false, 0.0},
                {"0.1",  false, 0.1},
                {"0.12", false, 0.12},
                {"0.123", false, 0.123},
                {"0.1234", false, 0.1234},
                {"0.12345", false, 0.12345},
                {"0.123456", false, 0.123456},
                {"0.1234567", false, 0.1234567},
                {"0.12345678", false, 0.12345678},
                {"0.123456789", false, 0.123456789},
                /* same fractions with explicit '+' sign */
                {"+0.0",  false, 0.0},
                {"+0.1",  false, 0.1},
                {"+0.12", false, 0.12},
                {"+0.123", false, 0.123},
                {"+0.1234", false, 0.1234},
                {"+0.12345", false, 0.12345},
                {"+0.123456", false, 0.123456},
                {"+0.1234567", false, 0.1234567},
                {"+0.12345678", false, 0.12345678},
                {"+0.123456789", false, 0.123456789},
                /* exponent e0: value unchanged */
                {"+0.0e0",  false, 0.0},
                {"+0.1e0",  false, 0.1},
                {"+0.12e0", false, 0.12},
                {"+0.123e0", false, 0.123},
                {"+0.1234e0", false, 0.1234},
                {"+0.12345e0", false, 0.12345},
                {"+0.123456e0", false, 0.123456},
                {"+0.1234567e0", false, 0.1234567},
                {"+0.12345678e0", false, 0.12345678},
                {"+0.123456789e0", false, 0.123456789},
                /* exponent e1; expected values are written with a leading 0 (01.2 is 1.2) */
                {"+0.0e1",  false, 00.},
                {"+0.1e1",  false, 01.},
                {"+0.12e1", false, 01.2},
                {"+0.123e1", false, 01.23},
                {"+0.1234e1", false, 01.234},
                {"+0.12345e1", false, 01.2345},
                {"+0.123456e1", false, 01.23456},
                {"+0.1234567e1", false, 01.234567},
                {"+0.12345678e1", false, 01.2345678},
                {"+0.123456789e1", false, 01.23456789},
                /* same as previous group, with uppercase exponent marker 'E' */
                {"+0.0E1",  false, 00.},
                {"+0.1E1",  false, 01.},
                {"+0.12E1", false, 01.2},
                {"+0.123E1", false, 01.23},
                {"+0.1234E1", false, 01.234},
                {"+0.12345E1", false, 01.2345},
                {"+0.123456E1", false, 01.23456},
                {"+0.1234567E1", false, 01.234567},
                {"+0.12345678E1", false, 01.2345678},
                {"+0.123456789E1", false, 01.23456789},
                /* exponent e9 */
                {"+0.0e9",  false, 0.0},
                {"+0.1e9",  false, 0.1e9},
                {"+0.12e9", false, 0.12e9},
                {"+0.123e9", false, 0.123e9},
                {"+0.1234e9", false, 0.1234e9},
                {"+0.12345e9", false, 0.12345e9},
                {"+0.123456e9", false, 0.123456e9},
                {"+0.1234567e9", false, 0.1234567e9},
                {"+0.12345678e9", false, 0.12345678e9},
                {"+0.123456789e9", false, 0.123456789e9},
                /* negative fractions */
                {"-0.0",  false, -0.0},
                {"-0.1",  false, -0.1},
                {"-0.12", false, -0.12},
                {"-0.123", false, -0.123},
                {"-0.1234", false, -0.1234},
                {"-0.12345", false, -0.12345},
                {"-0.123456", false, -0.123456},
                {"-0.1234567", false, -0.1234567},
                {"-0.12345678", false, -0.12345678},
                {"-0.123456789", false, -0.123456789},
                /* leading zero in the text; includes a trailing '.' with no fraction digits */
                {"00.",  false, 0.0},
                {"01.",  false, 1.0},
                {"01.2", false, 1.2},
                {"01.23", false, 1.23},
                {"01.234", false, 1.234},
                {"01.2345", false, 1.2345},
                {"01.23456", false, 1.23456},
                {"01.234567", false, 1.234567},
                {"01.2345678", false, 1.2345678},
                {"01.23456789", false, 1.23456789},
                /* integer part growing to two digits */
                {"0.0",  false, 0.0},
                {"1.2",  false, 1.2},
                {"12.", false, 12.0},
                {"12.3", false, 12.3},
                {"12.34", false, 12.34},
                {"12.345", false, 12.345},
                {"12.3456", false, 12.3456},
                {"12.34567", false, 12.34567},
                {"12.345678", false, 12.345678},
                {"12.3456789", false, 12.3456789},
                /* integer part growing to three digits */
                {"01.23",  false, 1.23},
                {"12.3",  false, 12.3},
                {"123.", false, 123.0},
                {"123.4", false, 123.4},
                {"123.45", false, 123.45},
                {"123.456", false, 123.456},
                {"123.4567", false, 123.4567},
                {"123.45678", false, 123.45678},
                {"123.456789", false, 123.456789},
            };
            /** Table-driven check of token::f64_value().
             *
             *  For each entry: build a tk_f64 token from the text, then verify that
             *  f64_value() either returns (approximately) the expected value or
             *  throws a std::exception, as the entry prescribes.
             **/
            TEST_CASE("parse-f64", "[token]") {
                /* position of the current entry, reported on failure */
                std::size_t i_tc = 0;

                for (auto const & tc : s_testcase_v) {
                    INFO(tostr(xtag("i_tc", i_tc),
                               xtag("text", tc.text_)));

                    token tk(tokentype::tk_f64, tc.text_);

                    REQUIRE(tk.tk_type() == tokentype::tk_f64);

                    bool did_throw = false;
                    std::string what_msg;

                    try {
                        double parsed = tk.f64_value();

                        /* relative tolerance of 1e-15 */
                        REQUIRE(parsed == Approx(tc.expected_).epsilon(1.0e-15));
                    } catch (std::exception & ex) {
                        did_throw = true;
                        what_msg = ex.what();
                    }

                    /* exception text (empty if nothing was thrown) helps diagnose a mismatch */
                    INFO(xtag("ex_msg", what_msg));
                    REQUIRE(did_throw == tc.expect_throw_);

                    ++i_tc;
                }
            }
        } /*namespace*/
    } /*namespace ut*/
 } /*namespace xo*/
 /* end token.test.cpp */
--- a/xo-tokenizer/utest/tokenizer.test.cpp
+++ b/xo-tokenizer/utest/tokenizer.test.cpp
@ -0,0 +1,268 @@
 /* file tokenizer.test.cpp
 *
 * author: Roland Conybeare
 */
 #include "xo/tokenizer/tokenizer.hpp"
 #include <catch2/catch.hpp>
 namespace xo {
    using xo::scm::tokentype;
    using token = xo::scm::token<char>;
    using xo::scm::span;
    namespace ut {
        namespace {
            /** One scenario for the "tokenizer" test: a single input string,
             *  and the first token the tokenizer is expected to produce from it.
             **/
            struct testcase_tkz {
                /** text handed to tokenizer::scan() **/
                std::string input_;
                /** NOTE(review): not read by the "tokenizer" test; false in every entry below **/
                bool expect_throw_;
                /** expected token: type always compared; value compared for i64/f64,
                 *  text compared for string/symbol
                 **/
                token expected_tk_;
                /** true if the span returned by scan() is expected to equal the whole input span **/
                bool consume_all_;
            };
            std::vector<testcase_tkz>
            s_testcase_v = {
                /* brackets, optionally preceded by whitespace */
                {"<", false, token::leftangle(), true},
                {">", false, token::rightangle(), true},
                {"(", false, token::leftparen(), true},
                {")", false, token::rightparen(), true},
                {"[", false, token::leftbracket(), true},
                {"]", false, token::rightbracket(), true},
                {"{", false, token::leftbrace(), true},
                {" {", false, token::leftbrace(), true},
                {"\t{", false, token::leftbrace(), true},
                {"\n{", false, token::leftbrace(), true},
                {"}", false, token::rightbrace(), true},
                /* integers that run to end of input */
                {"0",  false, token::i64_token("0"), true},
                {"1",  false, token::i64_token("1"), true},
                {"12",  false, token::i64_token("12"), true},
                {"123",  false, token::i64_token("123"), true},
                {"1234",  false, token::i64_token("1234"), true},
                /* integers followed by a space: input not expected to be fully consumed */
                {"0 ", false, token::i64_token("0"), false},
                {"1 ", false, token::i64_token("1"), false},
                {"12 ", false, token::i64_token("12"), false},
                {"123 ", false, token::i64_token("123"), false},
                {"1234 ", false, token::i64_token("1234"), false},
                /* integers followed by punctuation: input not expected to be fully consumed */
                {"1<", false, token::i64_token("1"), false},
                {"1>", false, token::i64_token("1"), false},
                {"1(", false, token::i64_token("1"), false},
                {"1)", false, token::i64_token("1"), false},
                {"1[", false, token::i64_token("1"), false},
                {"1]", false, token::i64_token("1"), false},
                {"1{", false, token::i64_token("1"), false},
                {"1}", false, token::i64_token("1"), false},
                {"1;", false, token::i64_token("1"), false},
                {"1:", false, token::i64_token("1"), false},
                {"1,", false, token::i64_token("1"), false},
                /* floating point: leading '.', optional sign, trailing '.', exponents */
                {".1", false, token::f64_token(".1"), true},
                {".12", false, token::f64_token(".12"), true},
                {".123", false, token::f64_token(".123"), true},
                {"+.1", false, token::f64_token("+.1"), true},
                {"+.12", false, token::f64_token("+.12"), true},
                {"+.123", false, token::f64_token("+.123"), true},
                {"-.1", false, token::f64_token("-.1"), true},
                {"-.12", false, token::f64_token("-.12"), true},
                {"-.123", false, token::f64_token("-.123"), true},
                {"1.", false, token::f64_token("1."), true},
                {"1.2", false, token::f64_token("1.2"), true},
                {"1.23", false, token::f64_token("1.23"), true},
                {"1e0", false, token::f64_token("1e0"), true},
                {"1e-1", false, token::f64_token("1e-1"), true},
                {"1e1", false, token::f64_token("1e1"), true},
                {"1e+1", false, token::f64_token("1e+1"), true},
                /* string literals; expected text has the quotes removed and escapes resolved */
                {"\"hello\"", false, token::string_token("hello"), true},
                /* tokenizer sees this input:
                 *   "\"hi\", she said"
                 */
                {"\"\\\"hi\\\", she said\"", false, token::string_token("\"hi\", she said"), true},
                /* tokenizer sees this input:
                 *   "look ma, newline ->\n<- "
                 */
                {"\"look ma, newline ->\\n<- \"", false,
                 token::string_token("look ma, newline ->\n<- "), true},
                /* tokenizer sees this input:
                 *   "tab to the right [\t], to the right [\t]"
                 */
                {"\"tab to the right [\\t], to the right [\\t]\"", false,
                 token::string_token("tab to the right [\t], to the right [\t]"), true},
                /* punctuation */
                {":", false, token::colon(), true},
                {":=", false, token::assign_token(), true},
                /* symbol, then keywords */
                {"symbol", false, token::symbol_token("symbol"), true},
                {"type", false, token::type(), true},
                {"def", false, token::def(), true},
                {"lambda", false, token::lambda(), true},
                {"if", false, token::if_token(), true},
                {"let", false, token::let(), true},
                {"in", false, token::in(), true},
                {"end", false, token::end(), true},
                /* operator */
                {"*", false, token::star_token(), true},
            };
        }
        /** Table-driven check of the first token scanned from each input.
         *
         *  For each entry: scan the input with a fresh tokenizer; if scan()
         *  yields an invalid token, take the token from notify_eof() instead.
         *  Then compare type, value or text with the expected token, and check
         *  whether the span returned by scan() equals the whole input span.
         **/
        TEST_CASE("tokenizer", "[tokenizer]") {
            using tokenizer = xo::scm::tokenizer<char>;

            for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
                const testcase_tkz & tc = s_testcase_v[i_tc];
                const token & expected = tc.expected_tk_;

                INFO(xtag("input", tc.input_));
                INFO(xtag("i_tc", i_tc));

                tokenizer tkz;
                tokenizer::span_type
                    in_span(tc.input_.c_str(),
                            tc.input_.c_str() + tc.input_.size());

                auto out = tkz.scan(in_span);
                auto tk = out.first;

                /* no token from scan(): ask the tokenizer what it has at end of input */
                if (tk.is_invalid()) {
                    tk = tkz.notify_eof();
                }

                REQUIRE(tk.tk_type() == expected.tk_type());

                switch (tk.tk_type()) {
                case tokentype::tk_i64:
                    REQUIRE(!tk.text().empty());
                    REQUIRE(tk.i64_value() == expected.i64_value());
                    break;
                case tokentype::tk_f64:
                    REQUIRE(!tk.text().empty());
                    REQUIRE(tk.f64_value() == expected.f64_value());
                    break;
                case tokentype::tk_string:
                    /* tk.text() can be empty, consider input "" */
                    REQUIRE(tk.text() == expected.text());
                    break;
                case tokentype::tk_symbol:
                    REQUIRE(!tk.text().empty());
                    REQUIRE(tk.text() == expected.text());
                    break;
                default:
                    /* every other token type is expected to carry no text */
                    REQUIRE(tk.text().empty());
                    break;
                }

                /* entry states whether scan() should report the entire input span */
                if (tc.consume_all_) {
                    REQUIRE(out.second == in_span);
                } else {
                    REQUIRE(out.second != in_span);
                }
            }
        }
        namespace {
            /** One scenario for the "tokenizer2" test: an input string holding
             *  several tokens, and the token sequence expected from it.
             **/
            struct testcase2_tkz {
                /** text scanned token-by-token with tokenizer::scan2() **/
                std::string input_;
                /** NOTE(review): not read by the "tokenizer2" test; false in every entry below **/
                bool expect_throw_;
                /** expected tokens, in input order; one scan2() call is made per element **/
                std::vector<token> expected_tk_v_;
            };
            std::vector<testcase2_tkz>
            s_testcase2_v = {
                /* typed definition with a floating-point initializer */
                {"def foo : f64 = 3.141;",
                 false,
                 {token::def(),
                  token::symbol_token("foo"),
                  token::colon(),
                  token::symbol_token("f64"),
                  token::singleassign(),
                  token::f64_token("3.141"),
                  token::semicolon()
                 }},
                /* lambda definition with a nested definition in its body */
                {"def foo = lambda (x : f64) { def y = x * x; y; }",
                 false,
                 {token::def(),
                  token::symbol_token("foo"),
                  token::singleassign(),
                  token::lambda(),
                  token::leftparen(),
                  token::symbol_token("x"),
                  token::colon(),
                  token::symbol_token("f64"),
                  token::rightparen(),
                  token::leftbrace(),
                  token::def(),
                  token::symbol_token("y"),
                  token::singleassign(),
                  token::symbol_token("x"),
                  token::star_token(),
                  token::symbol_token("x"),
                  token::semicolon(),
                  token::symbol_token("y"),
                  token::semicolon(),
                  token::rightbrace()
                 }}
            };
        }
        /** Table-driven check of multi-token inputs.
         *
         *  For each entry: call scan2() once per expected token, compare the
         *  result with that expected token, then advance the input span past
         *  the prefix reported by scan2().
         **/
        TEST_CASE("tokenizer2", "[tokenizer]") {
            using tokenizer = xo::scm::tokenizer<char>;

            for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) {
                const testcase2_tkz & tc = s_testcase2_v[i_tc];

                INFO(xtag("input", tc.input_));
                INFO(xtag("i_tc", i_tc));

                tokenizer tkz;
                tokenizer::span_type
                    in_span(tc.input_.c_str(),
                            tc.input_.c_str() + tc.input_.size());

                for (int i_tk = 0, n_tk = tc.expected_tk_v_.size();
                     i_tk < n_tk; ++i_tk)
                {
                    INFO(xtag("i_tk", i_tk));

                    const token & expected = tc.expected_tk_v_[i_tk];

                    /* second argument is true only once the remaining input is empty */
                    auto res = tkz.scan2(in_span, in_span.empty());
                    const auto & tk = res.first;

                    /* NOTE(review): the type check is skipped when scan2() yields an
                     * invalid token, so such a step only has to satisfy the
                     * empty-text check below -- confirm this leniency is intended.
                     */
                    if (tk.is_valid()) {
                        REQUIRE(tk.tk_type() == expected.tk_type());
                    }

                    switch (tk.tk_type()) {
                    case tokentype::tk_i64:
                        REQUIRE(!tk.text().empty());
                        REQUIRE(tk.i64_value() == expected.i64_value());
                        break;
                    case tokentype::tk_f64:
                        REQUIRE(!tk.text().empty());
                        REQUIRE(tk.f64_value() == expected.f64_value());
                        break;
                    case tokentype::tk_string:
                        /* tk.text() can be empty, consider input "" */
                        REQUIRE(tk.text() == expected.text());
                        break;
                    case tokentype::tk_symbol:
                        REQUIRE(!tk.text().empty());
                        REQUIRE(tk.text() == expected.text());
                        break;
                    default:
                        /* every other token type is expected to carry no text */
                        REQUIRE(tk.text().empty());
                        break;
                    }

                    /* continue with whatever follows the prefix scan2() reported */
                    in_span = in_span.after_prefix(res.second);
                }
            }
        } /*TEST_CASE(tokenizer2)*/
    } /*namespace ut*/
 } /*namespace xo*/
 /* end tokenizer.test.cpp */
--- a/xo-tokenizer/utest/tokenizer_utest_main.cpp
+++ b/xo-tokenizer/utest/tokenizer_utest_main.cpp
@ -0,0 +1,6 @@
 /* file tokenizer_utest_main.cpp */
 #define CATCH_CONFIG_MAIN
 #include "catch2/catch.hpp"
 /* end tokenizer_utest_main.cpp */