xo-tokenizer: build + utest + reasonable implementation

2024-07-22 12:30:46 +10:00 · 2024-07-22 12:30:46 +10:00 · 9dc37e84e6
commit 9dc37e84e6
15 changed files with 2154 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -0,0 +1,27 @@
+# xo-tokenizer/CMakeLists.txt
+#
+# Top-level build script for the tokenizer project.
+# Loads the shared xo cmake macros, descends into the library and unit test
+# directories, then exports a cmake package config for find_package().
+
+cmake_minimum_required(VERSION 3.10)
+
+project(tokenizer VERSION 0.1)
+
+include(GNUInstallDirs)
+# provides the xo_* macros invoked below
+include(cmake/xo-bootstrap-macros.cmake)
+
+xo_cxx_toplevel_options3()
+
+# ----------------------------------------------------------------
+# c++ settings
+
+# Extra compiler options for every target in this directory tree,
+# for example "-fconcepts-diagnostics-depth=2".  Empty by default.
+set(PROJECT_CXX_FLAGS "")
+# These are compiler flags rather than preprocessor definitions,
+# so pass them to add_compile_options() instead of add_definitions().
+# Expands to zero arguments (a no-op) while PROJECT_CXX_FLAGS is empty.
+add_compile_options(${PROJECT_CXX_FLAGS})
+
+# ----------------------------------------------------------------
+
+add_subdirectory(src/tokenizer)
+add_subdirectory(utest)
+
+# ----------------------------------------------------------------
+# provide find_package() support
+
+xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
--- a/cmake/tokenizerConfig.cmake.in
+++ b/cmake/tokenizerConfig.cmake.in
@ -0,0 +1,8 @@
+# Template for the package config file that find_package() consumers load.
+# Placeholders delimited by at-signs are substituted when the template is
+# configured (presumably by xo_export_cmake_config() in the top-level
+# CMakeLists.txt -- TODO confirm).
+@PACKAGE_INIT@
+
+# find_dependency() forwards the QUIET/REQUIRED settings of the enclosing
+# find_package() call to each dependency lookup.
+include(CMakeFindDependencyMacro)
+#find_dependency(refcnt)
+find_dependency(indentlog)
+#find_dependency(subsys)
+# imported targets exported by this project
+include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
+# marks the package as not found if a requested component is unavailable
+check_required_components("@PROJECT_NAME@")
--- a/cmake/xo-bootstrap-macros.cmake
+++ b/cmake/xo-bootstrap-macros.cmake
@ -0,0 +1,35 @@
+# ----------------------------------------------------------------
+# for example:
+#   $ PREFIX=/usr/local   # for example
+#   $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build
+#
+# will get
+#   CMAKE_MODULE_PATH
+# from xo-cmake-config --cmake-module-path
+#
+# and expect .cmake macros in
+#   CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake
+# ----------------------------------------------------------------
+
+# Locate the xo-cmake-config helper; it reports where the shared xo cmake
+# macros have been installed.
+find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED)
+
+# Explicit check in addition to REQUIRED: that keyword is only honored by
+# cmake >= 3.18.  On failure find_program() leaves the variable set to
+# "XO_CMAKE_CONFIG_EXECUTABLE-NOTFOUND", which if() evaluates as false.
+if (NOT XO_CMAKE_CONFIG_EXECUTABLE)
+    message(FATAL_ERROR "could not find xo-cmake-config executable")
+endif()
+
+message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}")
+
+if (NOT XO_SUBMODULE_BUILD)
+    # "prefix" is quoted so that if() compares against the literal string
+    # instead of dereferencing a variable that happens to have that name.
+    if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL "prefix"))
+        # default to typical install location for xo-project-macros
+        execute_process(COMMAND "${XO_CMAKE_CONFIG_EXECUTABLE}" --cmake-module-path
+                        RESULT_VARIABLE xo_cmake_config_status
+                        OUTPUT_VARIABLE CMAKE_MODULE_PATH
+                        OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+        # fail here, with a specific message, rather than later in include()
+        if (NOT "${xo_cmake_config_status}" STREQUAL "0")
+            message(FATAL_ERROR
+                "${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path failed: ${xo_cmake_config_status}")
+        endif()
+
+        message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}")
+    endif()
+endif()
+
+# needs to have been installed somewhere on CMAKE_MODULE_PATH,
+# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX)
+#
+include(xo_macros/xo_cxx)
+
+xo_cxx_bootstrap_message()
--- a/include/xo/tokenizer/buffer.hpp
+++ b/include/xo/tokenizer/buffer.hpp
@ -0,0 +1,324 @@
+/** @file buffer.hpp **/
+
+#pragma once
+
+#include "span.hpp"
+#include <utility>
+#include <cstdint>
+#include <cassert>
+#include <new>
+
+namespace xo {
+    namespace tok {
+        /**
+         * @class buffer buffer.hpp
+         *
+         * @brief Container for a (possibly owned) FIFO queue of chars
+         *
+         * @tparam CharT.  buffer element type.
+         *
+         * @code
+         *  .buf
+         *
+         *    +------------------------------------------+
+         *    |  |  ...  |  | X|  ... | X|  |    ...  |  |
+         *    +------------------------------------------+
+         *     ^             ^            ^               ^
+         *     0             .lo          .hi             .buf_z
+         *
+         *                   <-contents-><----avail----->
+         * @endcode
+         *
+         * Buffer does not support wrapped content:
+         * content that has not been consumed always occupies contiguous memory.
+         *
+         * Example:
+         * @code
+         * // 1.
+         *   buffer<char> buf(64*1024);
+         *   buf.empty() -> true
+         *   buf.buf_z() -> 65536
+         *   buf.lo_pos() -> 0
+         *   buf.hi_pos() -> 0
+         *   buf.contents() -> empty span
+         *   buf.avail() -> span entire buffer memory
+         *
+         *   // write to (a prefix of) buf.avail()
+         *   ::strncpy(buf.buf(), "hello, world\n", 13);
+         *   buf.produce(span_type(buf.buf(), buf.buf() + 13));
+         *
+         *   buf.lo_pos() -> 0
+         *   buf.hi_pos() -> 13
+         *   buf.contents() -> "hello, world\n";
+         *
+         *
+         *   // examine stored content (does not change buffer state)
+         *   auto span = buf.contents();
+         *   cerr << string_view(span.lo(), span.hi());  // "hello, world\n"
+         *
+         *   // consume (a prefix of) stored content
+         *   buf.consume(span.prefix(7));
+         *
+         *   buf.lo_pos() -> 7
+         *   buf.hi_pos() -> 13
+         *   buf.contents() -> "world\n"
+         *
+         *   // consuming all remaining content resets to original state
+         *   buf.consume(buf.contents());
+         *
+         *   buf.empty() -> true
+         *   buf.hi_pos() -> 0     // not 13!
+         *
+         * // 2.
+         *   buffer<char> buf;
+         *   buf.empty() -> true
+         *   buf.buf_z() -> 0
+         *   buf.lo_pos() -> 0
+         *   buf.hi_pos() -> 0
+         *   buf.contents() -> empty span
+         *   buf.avail() -> empty span
+         *
+         *   // allocate memory separately from ctor
+         *   buf.alloc(64*1024);
+         * @endcode
+         **/
+        template <typename CharT>
+        class buffer {
+        public:
+            /** @brief typealias for span of CharT **/
+            using span_type = span<CharT>;
+            /** @brief typealias for buffer size (counts CharT's, not bytes) **/
+            using size_type = std::uint64_t;
+
+        public:
+            /** @brief create empty buffer.
+
+                Does not allocate any storage;  @see alloc
+            **/
+            buffer() = default;
+            /** @brief create empty buffer,  and possibly allocate storage.
+
+                @param buf_z    Buffer size.  allocate storage (owned by this buffer) if >0.
+                @param align_z  Align to this value,  e.g. 8 to align storage on an 8-byte boundary
+            **/
+            buffer(size_type buf_z, size_type align_z = sizeof(char))
+                : is_owner_{true},
+                  buf_{buf_z ? (new (std::align_val_t(align_z)) CharT [buf_z]) : nullptr},
+                  buf_z_{buf_z},
+                  align_z_{align_z},
+                  lo_pos_{0},
+                  hi_pos_{0}
+                {}
+            /** @brief buffer is not copyable **/
+            buffer(buffer const & x) = delete;
+            /** @brief move constructor.  Takes over representation of @p x,
+                leaving @p x empty and without storage.
+            **/
+            buffer(buffer && x) noexcept
+                : is_owner_{x.is_owner_},
+                  buf_{x.buf_},
+                  buf_z_{x.buf_z_},
+                  align_z_{x.align_z_},
+                  lo_pos_{x.lo_pos_},
+                  hi_pos_{x.hi_pos_}
+                {
+                    x.forget();
+                }
+            /** @brief destructor.  Release storage if owned **/
+            ~buffer() { this->reset(); }
+
+            /** @name Access methods **/
+            ///@{
+
+            /** @brief start of buffer memory **/
+            CharT * buf() const { return buf_; }
+            /** @brief buffer size (number of characters) **/
+            size_type buf_z() const { return buf_z_; }
+            /** @brief current start position within buffer **/
+            size_type lo_pos() const { return lo_pos_; }
+            /** @brief current end position within buffer **/
+            size_type hi_pos() const { return hi_pos_; }
+
+            ///@}
+
+            /** @brief readonly access to a single buffer element.
+
+                Relative to start of buffer (ignores current consume position).
+                Index is not bounds-checked.
+            **/
+            CharT const & operator[](size_type i) const { return buf_[i]; }
+
+            /** @brief return span for current buffer contents **/
+            span_type contents() const { return span_type(buf_ + lo_pos_, buf_ + hi_pos_); }
+            /** @brief return span for writable buffer memory (unused suffix following produce position) **/
+            span_type avail() const { return span_type(buf_ + hi_pos_, buf_ + buf_z_); }
+
+            /** @brief @c true iff buffer is empty **/
+            bool empty() const { return lo_pos_ == hi_pos_; }
+
+
+            /**
+               @brief update buffer produce position, after (independently) writing contents of span to it
+
+               @pre left endpoint of @p span equals buffer produce position (@c .hi_pos)
+               @pre right endpoint of @p span within bounds of buffer memory range
+               @post right endpoint of @p span equals buffer produce position.
+            **/
+            void produce(span_type const & span) {
+                assert(span.lo() == buf_ + hi_pos_);
+                /* second precondition: must not produce past end of buffer memory */
+                assert(hi_pos_ + span.size() <= buf_z_);
+
+                hi_pos_ += span.size();
+            }
+
+            /**
+               @brief update buffer consume position,  when done with contents of span
+
+               @pre left endpoint of @p span equals buffer consume position (@c .lo_pos)
+               @pre right endpoint of @p span does not exceed buffer produce position (@c .hi_pos)
+               @post Either
+               buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0.
+               buffer is non-empty, right endpoint of @p span equals new buffer consume position.
+            **/
+            void consume(span_type const & span) {
+                if (span.size()) {
+                    assert(span.lo() == buf_ + lo_pos_);
+                    /* consuming more than was produced would leave lo_pos_ > hi_pos_ */
+                    assert(lo_pos_ + span.size() <= hi_pos_);
+
+                    lo_pos_ += span.size();
+                } else {
+                    /* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos,
+                     * we don't want to blow up when called with an empty span -- argument
+                     * may represent some pre-reset location in buffer
+                     */
+                }
+
+                if (lo_pos_ == hi_pos_) {
+                    lo_pos_ = 0;
+                    hi_pos_ = 0;
+                }
+            }
+
+            /**
+               @brief allocate buffer with desired amount of memory
+
+               Any previously held storage is released first.
+
+               @param buf_z     desired buffer size
+               @param align_z   alignment;  buffer memory will be aligned on this byte-boundary.
+            **/
+            void alloc(size_type buf_z, size_type align_z = sizeof(char)) {
+                /* properly reset (+ discard) any existing state */
+                this->reset();
+
+                is_owner_ = true;
+                /* remember alignment: needed to release storage correctly */
+                align_z_ = align_z;
+                if (buf_z)
+                    buf_ = new (std::align_val_t(align_z)) CharT [buf_z];
+                buf_z_ = buf_z;
+                lo_pos_ = 0;
+                hi_pos_ = 0;
+            }
+
+            /**
+               @brief attach buffer to (unowned)  range of @p buf_z elements starting at @p buf[0]
+
+               Buffer is not responsible for managing storage.
+
+               @post
+               1. buffer is empty
+               @post
+               2. buffer read position = buffer write position = 0
+            **/
+            void setbuf(CharT * buf, size_type buf_z) {
+                /* properly reset (+ discard) any existing state */
+                this->reset();
+
+                is_owner_ = false;
+                lo_pos_ = 0;
+                hi_pos_ = 0;
+                buf_ = buf;
+                buf_z_ = buf_z;
+            }
+
+            /**
+               @brief revert buffer to empty state and possibly zero it
+
+               @param zero_buffer_flag   Zero buffer contents iff this is true
+
+               @post
+               1. buffer is empty
+               @post
+               2. buffer read position = buffer write position = 0
+            **/
+            void clear2empty(bool zero_buffer_flag) {
+                if (buf_ && zero_buffer_flag)
+                    explicit_bzero(buf_, buf_z_ * sizeof(CharT));
+
+                lo_pos_ = 0;
+                hi_pos_ = 0;
+            }
+
+            /**
+               @brief swap representation with another buffer instance.
+            **/
+            void swap (buffer & x) {
+                std::swap(is_owner_, x.is_owner_);
+                std::swap(buf_, x.buf_);
+                std::swap(buf_z_, x.buf_z_);
+                std::swap(align_z_, x.align_z_);
+                std::swap(lo_pos_, x.lo_pos_);
+                std::swap(hi_pos_, x.hi_pos_);
+            }
+
+            /**
+               @brief reset buffer to an empty state and recover owned storage
+            **/
+            void reset() {
+                if (is_owner_ && buf_) {
+                    /* storage came from the aligned form of operator new[],
+                     * so it must go back through the matching aligned
+                     * operator delete[].
+                     * NOTE: assumes CharT is trivially destructible
+                     * (no element destructors are run here)
+                     */
+                    ::operator delete[](buf_, std::align_val_t(align_z_));
+                }
+
+                this->forget();
+            }
+
+            /**
+               @brief move-assignment operator.
+               @param x   right-hand-side to move from.
+
+               Storage previously owned by this buffer is released.
+               Self-assignment is a no-op.
+
+               @post
+               @p x is in a valid, empty state, with no storage attached.
+            **/
+            buffer & operator= (buffer && x) {
+                if (this != &x) {
+                    /* release whatever we currently own, to avoid leaking it */
+                    this->reset();
+
+                    is_owner_ = x.is_owner_;
+                    buf_ = x.buf_;
+                    buf_z_ = x.buf_z_;
+                    align_z_ = x.align_z_;
+                    lo_pos_ = x.lo_pos_;
+                    hi_pos_ = x.hi_pos_;
+
+                    x.forget();
+                }
+
+                return *this;
+            }
+
+            /** @brief buffer is not assignable */
+            buffer & operator= (buffer & x) = delete;
+
+        private:
+            /** @brief drop all state without releasing storage **/
+            void forget() noexcept {
+                is_owner_ = false;
+                buf_ = nullptr;
+                buf_z_ = 0;
+                align_z_ = sizeof(char);
+                lo_pos_ = 0;
+                hi_pos_ = 0;
+            }
+
+        private:
+            /** @brief true iff buffer is responsible for freeing storage at @c buf_ **/
+            bool is_owner_ = false;
+            /** @brief buffer contents.  buffer memory comprises @c buf_[0] up to (not including) @c buf_[buf_z_] **/
+            CharT * buf_ = nullptr;
+            /** @brief buffer size (in units of CharT) **/
+            size_type buf_z_ = 0;
+            /** @brief alignment requested when owned storage was allocated **/
+            size_type align_z_ = sizeof(char);
+
+            /** @brief buffer read (consume) position
+
+                @invariant
+                0 <= lo_pos_ <= hi_pos_ <= buf_z_
+            **/
+            size_type lo_pos_ = 0;
+            /** @brief buffer write (produce) position
+
+                @invariant
+                0 <= lo_pos_ <= hi_pos_ <= buf_z_
+            **/
+            size_type hi_pos_ = 0;
+        };
+
+        /** @brief Non-member @c swap,  found by argument-dependent lookup,
+         *  so that @c buffer<CharT> is swappable.
+         *  Exchanges the complete representation of the two buffers.
+         **/
+        template <typename CharT>
+        inline void
+        swap(buffer<CharT> & lhs, buffer<CharT> & rhs)
+        {
+            /* exchange is symmetric */
+            rhs.swap(lhs);
+        }
+    } /*namespace tok*/
+} /*namespace xo*/
+
+/* end buffer.hpp */
--- a/include/xo/tokenizer/span.hpp
+++ b/include/xo/tokenizer/span.hpp
@ -0,0 +1,141 @@
+/** @file span.hpp **/
+
+#pragma once
+
+#include "xo/indentlog/print/tag.hpp"
+#include <ostream>
+#include <stdexcept>
+#include <string_view>
+#include <cstdint>
+#include <cassert>
+
+namespace xo {
+    namespace tok {
+        /** @class span span.hpp
+         *
+         *  @brief Represents a contiguous memory range,  without ownership.
+         *
+         *  A span is the half-open range [lo, hi).  It never allocates or
+         *  frees memory;  the caller keeps the referenced memory alive.
+         *
+         *  @tparam CharT type for elements referred to by this span.
+         **/
+        template <typename CharT>
+        class span {
+        public:
+            /** @brief typealias for span size (in units of CharT) **/
+            using size_type = std::uint64_t;
+
+        public:
+            /** @brief create empty span (both endpoints null) **/
+            span() = default;
+            /** @brief create span for the contiguous memory range [@p lo, @p hi) **/
+            span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
+
+            ///@{
+
+            /** @name getters **/
+
+            CharT * lo() const { return lo_; } /* get member span::lo_ */
+            CharT * hi() const { return hi_; } /* get member span::hi_ */
+
+            ///@}
+
+            /** @brief create new span over supplied type,
+             *  with identical (possibly misaligned) endpoints.
+             *
+             *  @warning
+             *  1. New span uses exactly the same memory addresses.
+             *     Endpoint pointers may not be aligned.
+             *  2. Implementation assumes code compiled with
+             *     @code -fno-strict-aliasing @endcode enabled.
+             *
+             *  @tparam OtherT element type for new span
+             **/
+            template <typename OtherT>
+            span<OtherT>
+            cast() const { return span<OtherT>(reinterpret_cast<OtherT *>(lo_),
+                                               reinterpret_cast<OtherT *>(hi_)); }
+
+            /** @brief create span including the first @p z members of this span.
+             *
+             *  @p z is clamped to the size of this span (as in @ref after_prefix),
+             *  so the result never extends past @c hi().
+             **/
+            span prefix(size_type z) const {
+                if (z > size())
+                    z = size();
+
+                return span(lo_, lo_ + z);
+            }
+
+            /** @brief create span representing prefix up to (but not including) @p *p
+             *
+             *  @p p is clamped to this span:  a position before @c lo() gives
+             *  an empty span,  a position after @c hi() gives the whole span.
+             **/
+            span prefix(CharT * p) const {
+                if (p < lo_)
+                    return span(lo_, lo_);
+                else if (p <= hi_)
+                    return span(lo_, p);
+                else
+                    return span(lo_, hi_);
+            }
+
+            /** @brief create span with first @p z members of this span removed.
+             *
+             *  @p z is clamped to the size of this span.
+             **/
+            span after_prefix(size_type z) const {
+                if (z > size())
+                    z = size();
+
+                return span(lo_ + z, hi_);
+            }
+
+            /** @brief create span with @p prefix of this span removed
+             *
+             *  @pre @p prefix starts at the same address as this span.
+             *  @throws std::runtime_error if precondition violated
+             *          (in builds with assertions enabled the assert fires first)
+             **/
+            span after_prefix(const span & prefix) const {
+                assert(prefix.lo() == lo_);
+                if (prefix.lo() != lo_) {
+                    throw std::runtime_error
+                        ("after_prefix: expected prefix of this span");
+                }
+
+                return after_prefix(prefix.size());
+            }
+
+            /** @brief create span starting with position p
+             *
+             *  Returns an empty span at @c hi() when @p p lies outside this span.
+             **/
+            span suffix_from(CharT * p) const {
+                if ((lo_ <= p) && (p <= hi_))
+                    return span(p, hi_);
+                else
+                    return span(hi_, hi_);
+            }
+
+            /** @brief true iff this span is empty (comprises 0 elements). **/
+            bool empty() const { return lo_ == hi_; }
+            /** @brief report the number of elements (of type CharT) in this span. **/
+            size_type size() const { return hi_ - lo_; }
+
+            /** print representation for this span on stream @p os **/
+            void print(std::ostream & os) const {
+                /* (pointer, length) form of string_view:
+                 * the (pointer, pointer) form requires c++20
+                 */
+                os << "<span"
+                   << xtag("size", size())
+                   << " :text " << xo::print::quot(std::string_view(lo_, size()))
+                   << ">";
+            }
+
+        private:
+            ///@{
+
+            /** @brief start of span
+                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
+            **/
+            CharT * lo_ = nullptr;
+            /** @brief end of span
+                Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
+            **/
+            CharT * hi_ = nullptr;
+
+            ///@}
+        }; /*span*/
+
+        /** @brief spans compare equal iff both endpoints coincide
+         *  (identity of the memory range,  not of the referenced contents)
+         **/
+        template <typename CharT>
+        inline bool
+        operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
+            if (lhs.lo() != rhs.lo())
+                return false;
+
+            return lhs.hi() == rhs.hi();
+        }
+
+        /** @brief spans differ iff at least one endpoint differs **/
+        template <typename CharT>
+        inline bool
+        operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
+            const bool same_lo = (lhs.lo() == rhs.lo());
+            const bool same_hi = (lhs.hi() == rhs.hi());
+
+            return !(same_lo && same_hi);
+        }
+
+        /** @brief stream inserter for spans;  delegates to @c span::print **/
+        template <typename CharT>
+        inline std::ostream &
+        operator<<(std::ostream & out, const span<CharT> & sp)
+        {
+            sp.print(out);
+
+            return out;
+        }
+    } /*namespace tok*/
+} /*namespace xo*/
--- a/include/xo/tokenizer/token.hpp
+++ b/include/xo/tokenizer/token.hpp
@ -0,0 +1,334 @@
+/* file token.hpp
+ *
+ * author: Roland Conybeare, Jul 2024
+ */
+
+#pragma once
+
+#include "tokentype.hpp"
+#include "xo/indentlog/print/tag.hpp"
+#include <stdexcept>
+#include <string>
+#include <cstdint>
+
+namespace xo {
+    namespace tok {
+        namespace detail {
+            /** @brief compute @p a * @p b ^ @p p by repeated squaring.
+             *
+             *  Returns @p a unchanged when @p p <= 0.
+             **/
+            constexpr double
+            pow_aux(double a, double b, int p) {
+                for (; p > 0; ) {
+                    if (p % 2 != 1) {
+                        /* even power: a * b^(2q) = a * (b^2)^q */
+                        b = b * b;
+                        p /= 2;
+                    } else {
+                        /* odd power: a * b^(2q + 1) = (a * b) * b^(2q) */
+                        a *= b;
+                        p -= 1;
+                    }
+                }
+
+                /* nothing left to multiply in: a * b^0 = a */
+                return a;
+            }
+
+            /** @brief compute 10 ^ @p p for integer @p p of either sign **/
+            constexpr double
+            pow10(int p) {
+                return ((p < 0)
+                        ? (1.0 / pow_aux(1.0, 10.0, -p))
+                        : pow_aux(1.0, 10.0, p));
+            }
+        }
+
+        /** @brief a single lexical token:  a category (@c tokentype)
+         *  plus,  for some categories,  the characters it was read from.
+         *
+         *  @tparam CharT character type used when decoding token text.
+         **/
+        template <typename CharT>
+        class token {
+        public:
+            /** create an invalid token (type @c tk_invalid,  empty text) **/
+            token() = default;
+            /** create token with category @p tk_type and (optional) source text @p text **/
+            token(tokentype tk_type, const std::string & text = "")
+                : tk_type_(tk_type), text_(text) {}
+
+            /** @name factories for tokens that carry text **/
+            ///@{
+            static token invalid() { return token{}; }
+            static token i64_token(const std::string & txt) {
+                return token{tokentype::tk_i64, txt};
+            }
+            static token f64_token(const std::string & txt) {
+                return token{tokentype::tk_f64, txt};
+            }
+            static token string_token(const std::string & txt) {
+                return token{tokentype::tk_string, txt};
+            }
+            static token symbol_token(const std::string & txt) {
+                return token{tokentype::tk_symbol, txt};
+            }
+            ///@}
+
+            /** @name factories for punctuation / operator tokens **/
+            ///@{
+            static token leftangle() { return token{tokentype::tk_leftangle}; }
+            static token rightangle() { return token{tokentype::tk_rightangle}; }
+            static token leftparen() { return token{tokentype::tk_leftparen}; }
+            static token rightparen() { return token{tokentype::tk_rightparen}; }
+            static token leftbracket() { return token{tokentype::tk_leftbracket}; }
+            static token rightbracket() { return token{tokentype::tk_rightbracket}; }
+            static token leftbrace() { return token{tokentype::tk_leftbrace}; }
+            static token rightbrace() { return token{tokentype::tk_rightbrace}; }
+            static token dot() { return token{tokentype::tk_dot}; }
+            static token comma() { return token{tokentype::tk_comma}; }
+            static token colon() { return token{tokentype::tk_colon}; }
+            static token doublecolon() { return token{tokentype::tk_doublecolon}; }
+            static token semicolon() { return token{tokentype::tk_semicolon}; }
+            static token singleassign() { return token{tokentype::tk_singleassign}; }
+            static token assign() { return token{tokentype::tk_assign}; }
+            static token yields() { return token{tokentype::tk_yields}; }
+            ///@}
+
+            /** @name factories for keyword tokens **/
+            ///@{
+            static token type() { return token{tokentype::tk_type}; }
+            static token def() { return token{tokentype::tk_def}; }
+            static token lambda() { return token{tokentype::tk_lambda}; }
+            static token if_token() { return token{tokentype::tk_if}; }
+            static token let() { return token{tokentype::tk_let}; }
+            static token in() { return token{tokentype::tk_in}; }
+            ///@}
+
+            /** category for this token **/
+            tokentype tk_type() const { return tk_type_; }
+            /** characters comprising this token;  empty for categories that do not carry text **/
+            const std::string & text() const { return text_; }
+
+            /** true iff this token has any category other than @c tk_invalid **/
+            bool is_valid() const { return !(tk_type_ == tokentype::tk_invalid); }
+            /** true iff this token has category @c tk_invalid **/
+            bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; }
+
+            /** decode text as a signed integer.
+             *  expect input matching
+             *    [+|-][0-9][0-9]*
+             **/
+            std::int64_t i64_value() const;
+            /** decode text as a floating-point number.
+             *  expect input matching
+             *    [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]*
+             **/
+            double f64_value() const;
+
+        private:
+            /** category for this token **/
+            tokentype tk_type_ = tokentype::tk_invalid;
+
+            /** characters comprising this token.
+             *  only provided for certain token types:
+             *
+             *    tk_i64
+             *    tk_f64
+             *    tk_string
+             *    tk_symbol
+             **/
+            std::string text_;
+        }; /*token*/
+
+        /** @brief decode token text as a signed 64-bit integer.
+         *
+         *  Accepts an optional leading '+' or '-' followed by one or more
+         *  decimal digits.
+         *
+         *  @return decoded value.
+         *  @throws std::runtime_error if token type is not @c tk_i64,
+         *          if text is empty,  if no digit follows the sign,
+         *          or if text contains a non-digit character.
+         *
+         *  @note magnitudes that do not fit in 64 bits are not detected.
+         **/
+        template <typename CharT>
+        std::int64_t
+        token<CharT>::i64_value() const {
+            if (tk_type_ != tokentype::tk_i64) {
+                throw (std::runtime_error
+                       (tostr("token::i64_value",
+                              ": token with type tk found where tk_i64 expected",
+                              xtag("tk", tk_type_))));
+            }
+
+            if (text_.empty()) {
+                throw (std::runtime_error
+                       (tostr("token::i64_value",
+                              ": unexpected empty input string for tk_i64 token")));
+            }
+
+            /* accumulate in 64 bits,  matching the return type:
+             * a plain int would overflow for values beyond the 32-bit range
+             */
+            std::int64_t sign = 1;
+            std::int64_t value = 0;
+            {
+                auto ix = text_.begin();
+                auto end_ix = text_.end();
+
+                /* safe to dereference: text_ verified non-empty above */
+                CharT sign_ch = *ix;
+
+                if (sign_ch == '+') {
+                    ++ix;
+                } else if (sign_ch == '-') {
+                    sign = -1;
+                    ++ix;
+                }
+
+                /* a bare sign is not a number */
+                if (ix == end_ix) {
+                    throw (std::runtime_error
+                           (tostr("token::i64_value",
+                                  ": input text found where at least one digit expected",
+                                  xtag("text", text_))));
+                }
+
+                for (; ix != end_ix; ++ix) {
+                    CharT ch = *ix;
+
+                    if ((ch >= '0') && (ch <= '9')) {
+                        value *= 10;
+                        value += (ch - '0');
+                    } else {
+                        throw (std::runtime_error
+                               (tostr("token::i64_value",
+                                      ": unexpected char ch in integer token",
+                                      xtag("ch", ch))));
+                    }
+                }
+            }
+
+            return sign * value;
+        } /*i64_value*/
+
+        /** @brief decode token text as a double-precision floating-point value.
+         *
+         *  Accepts an optional sign,  mantissa digits with at most one
+         *  decimal point,  and an optional exponent:  'e' or 'E',
+         *  optional sign,  then one or more digits.
+         *
+         *  The mantissa digits are collected into an integer and then scaled:
+         *  result = sign * mantissa * 10^(exp_sign*exponent - rh_digits)
+         *
+         *  @return decoded value.
+         *  @throws std::runtime_error if token type is not @c tk_f64,
+         *          if text is empty,  if nothing follows the leading sign,
+         *          on a second decimal point,  on an unexpected character,
+         *          or if an exponent marker is not followed by a digit.
+         *
+         *  @note mantissa/exponent overflow is not detected.
+         **/
+        template <typename CharT>
+        double
+        token<CharT>::f64_value() const {
+            if (tk_type_ != tokentype::tk_f64) {
+                throw (std::runtime_error
+                       (tostr("token::f64_value",
+                              ": token with type tk found where tk_f64 expected",
+                              xtag("tk", tk_type_))));
+            }
+
+            if (text_.empty()) {
+                throw (std::runtime_error
+                       (tostr("token::f64_value",
+                              ": unexpected empty input string for tk_f64 token")));
+            }
+
+            int sign = 1;
+            /* integer representing denormalized unsigned mantissa
+             * (mantissa scaled by smallest power of 10 sufficient to make
+             *  it an integer)
+             */
+            std::int64_t mantissa = 0;
+            /* counts #of digits to the right of decimal point '.' */
+            int rh_digits = 0;
+            /* sign of exponent */
+            int exp_sign = 1;
+            /* value of exponent = integer to the right of 'e' or 'E' */
+            int exponent = 0;
+
+            /* floating-point value will represent
+             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
+             */
+            {
+                auto ix = text_.begin();
+                auto end_ix = text_.end();
+
+                /* safe to dereference: text_ verified non-empty above */
+                CharT ch = *ix;
+
+                if (ch == '+') {
+                    ++ix;
+                } else if (ch == '-') {
+                    sign = -1;
+                    ++ix;
+                }
+
+                if (ix == end_ix) {
+                    throw (std::runtime_error
+                           (tostr("token::f64_value",
+                                  ": input text found where at least one digit expected",
+                                  xtag("text", text_))));
+                }
+
+                /* true iff decimal point '.' present in mantissa */
+                bool have_decimal_point = false;
+                /* counts number of digits in mantissa
+                 * (both before and after, but not including, any decimal point)
+                 */
+                int m_digits = 0;
+                /* digits to the left of decimal point */
+                int lh_digits = 0;
+
+                /* loop over mantissa digits */
+                for (; ix != end_ix; ++ix) {
+                    CharT ch = *ix;
+
+                    if (ch == '.') {
+                        if (have_decimal_point) {
+                            throw (std::runtime_error
+                                   (tostr("token::f64_value",
+                                          ": input text found where at most one decimal point expected",
+                                          xtag("text", text_))));
+                        }
+
+                        have_decimal_point = true;
+                        lh_digits = m_digits;
+                    } else if ((ch >= '0') && (ch <= '9')) {
+                        mantissa *= 10;
+                        mantissa += (ch - '0');
+                        ++m_digits;
+                    } else if (ch == 'e' || ch == 'E') {
+                        break; // done with mantissa
+                    } else {
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": unexpected char ch in floating-point token",
+                                      xtag("ch", ch))));
+                    }
+                }
+
+                if (have_decimal_point)
+                    rh_digits = m_digits - lh_digits;
+
+                if (ix != end_ix) {
+                    /* continue to read exponent */
+
+                    /* skip e|E */
+                    ++ix;
+
+                    if (ix == end_ix) {
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": on input text, expect at least one digit following exponent marker e|E",
+                                      xtag("text", text_))));
+                    }
+
+                    CharT ch = *ix;
+
+                    if (ch == '+') {
+                        ++ix; /*skip*/
+                    } else if (ch == '-') {
+                        exp_sign = -1;
+                        ++ix;
+                    }
+
+                    /* a signed exponent still needs at least one digit:
+                     * reject input such as "1e+" instead of reading it as "1e0"
+                     */
+                    if (ix == end_ix) {
+                        throw (std::runtime_error
+                               (tostr("token::f64_value",
+                                      ": on input text, expect at least one digit following exponent marker e|E",
+                                      xtag("text", text_))));
+                    }
+
+                    for (; ix != end_ix; ++ix) {
+                        CharT ch = *ix;
+
+                        if ((ch >= '0') && (ch <= '9')) {
+                            exponent *= 10;
+                            exponent += (ch - '0');
+                        } else {
+                            throw (std::runtime_error
+                                   (tostr("token::f64_value",
+                                          "; on input text, expect only digits following"
+                                          " (possibly signed) exponenct marker",
+                                          xtag("text", text_))));
+                        }
+                    }
+                }
+            }
+
+            /* floating-point value will represent
+             *   sign * mantissa * 10^(exp_sign*exponent - rh_digits)
+             */
+
+            double mantissa_f64 = sign * mantissa;
+
+#ifdef OBSOLETE_DEBUG
+            std::cerr << xtag("text", text_)
+                      << xtag("rh_digits", rh_digits)
+                      << xtag("mantissa_f64", mantissa_f64)
+                      << xtag("exp_sign", exp_sign)
+                      << xtag("exponent", exponent)
+                      << std::endl;
+#endif
+
+            double retval = (mantissa_f64
+                             * detail::pow10((exp_sign * exponent)
+                                             - rh_digits));
+
+            return retval;
+        } /*f64_value*/
+    } /*Namespace tok*/
+} /*namespace xo*/
+
+
+/* end token.hpp */
--- a/include/xo/tokenizer/tokenizer.hpp
+++ b/include/xo/tokenizer/tokenizer.hpp
@ -0,0 +1,625 @@
+/* file tokenizer.hpp
+ *
+ * author: Roland Conybeare, Jul 2024
+ */
+
+#pragma once
+
+#include "token.hpp"
+#include "span.hpp"
+#include "xo/indentlog/scope.hpp"
+#include <cassert>
+
+namespace xo {
+    namespace tok {
+        /** Incremental tokenizer: splits a stream of input text into tokens.
+         *
+         *  Use:
+         *  @code
+         *    using tokenizer_type = tokenizer<char>;
+         *    using span_type = tokenizer_type::span_type;
+         *
+         *    tokenizer_type tkz;
+         *    span_type input = ...;
+         *
+         *    while (!input.empty()) {
+         *        auto res = tkz.scan(input);
+         *        const auto & tk = res.first;
+         *
+         *        // do something with tk if tk.is_valid()
+         *
+         *        input = input.after_prefix(res.second);
+         *    }
+         *
+         *    if (endofinput) {
+         *        auto tk = tkz.notify_eof();
+         *
+         *        // do something with tk if tk.is_valid()
+         *    }
+         *
+         *    // expect !tkz.has_prefix()
+         *
+         *  @endcode
+         **/
+        template <typename CharT>
+        class tokenizer {
+        public:
+            using token_type = token<CharT>;
+            using span_type = span<const CharT>;
+            /** result of @ref tokenizer::scan :
+             *  .first  = token recognized (invalid if none completed);
+             *  .second = prefix of input consumed by the scan
+             **/
+            using scan_result = std::pair<token_type, span_type>;
+
+        public:
+            tokenizer() = default;
+
+            /** identifies whitespace chars (space, tab, newline, cr).
+             *  These are chars that do not belong to any token.
+             *  Outside a string literal, a whitespace char forces
+             *  completion of the preceding token.
+             *  (space/tab may appear inside a string literal;
+             *  see @ref tokenizer::scan)
+             **/
+            bool is_whitespace(CharT ch) const;
+
+            /** identifies punctuation chars.
+             *  These are chars that are not permitted to appear within
+             *  a symbol or number token.  Instead they force completion of
+             *  a preceding token,  and start a new token with themselves.
+             *  (they may still appear inside a string literal)
+             **/
+            bool is_punctuation(CharT ch) const;
+
+            /** true if tokenizer contains stored prefix of
+             *  possibly-incomplete token
+             **/
+            bool has_prefix() const { return !prefix_.empty(); }
+
+            /** assemble token from text @p token_text
+             **/
+            token_type assemble_token(const span_type & token_text) const;
+
+            /** scan for next input token,  given @p input **/
+            scan_result scan(const span_type & input);
+
+            /** notify end of input,  resolve any stored input **/
+            token_type notify_eof();
+
+        private:
+            /** Accumulate partial token here.
+             *  This will happen if input sent to @ref tokenizer::scan
+             *  ends without a determinate token boundary.
+             **/
+            std::string prefix_;
+        }; /*tokenizer*/
+
+        /** true iff @p ch is one of: space, tab, newline, carriage return **/
+        template <typename CharT>
+        bool
+        tokenizer<CharT>::is_whitespace(CharT ch) const {
+            return ((ch == ' ')
+                    || (ch == '\t')
+                    || (ch == '\n')
+                    || (ch == '\r'));
+        }
+
+        /** true iff @p ch is a single-character punctuation token:
+         *  one of  < > ( ) [ ] { } , ; : =
+         **/
+        template <typename CharT>
+        bool
+        tokenizer<CharT>::is_punctuation(CharT ch) const {
+            switch(ch) {
+            case '<': case '>':
+            case '(': case ')':
+            case '[': case ']':
+            case '{': case '}':
+            case ',':
+            case ';':
+            case ':':
+            case '=':
+                return true;
+            case '-':
+            case '+':
+            case '.':
+                /* listed explicitly: these can't be punctuation,
+                 * because they can appear inside f64 token
+                 */
+                return false;
+            }
+
+            /* everything else: not punctuation */
+            return false;
+        }
+
+        /** Classify @p token_text and build the corresponding token.
+         *
+         *  The first character selects the token family:
+         *  - sign, period or digit: numeric token (tk_i64 / tk_f64), or tk_dot
+         *  - double-quote: string literal; backslash escapes for
+         *    backslash, n, t, r and double-quote are decoded
+         *  - letter: symbol
+         *  - one of  < > ( ) [ ] { } , ; : =  : single-character token
+         *
+         *  Token text is a verbatim copy of @p token_text for i64/f64/symbol
+         *  tokens;  the decoded literal (without enclosing quotes) for string
+         *  tokens;  empty for all other tokens.
+         *
+         *  @param token_text  text for exactly one token, no surrounding whitespace
+         *  @return assembled token
+         *  @throw std::runtime_error if @p token_text is empty or malformed
+         **/
+        template <typename CharT>
+        auto
+        tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
+        {
+            constexpr bool c_debug_flag = true;
+
+            /* literal|pretty|streamlined */
+            /* NOTE(review): this assigns process-wide logging config on every call;
+             * consider moving to application setup
+             */
+            log_config::style = function_style::streamlined;
+
+            scope log(XO_DEBUG(c_debug_flag));
+            log && log(xtag("token_text", token_text));
+
+            tokentype tk_type = tokentype::tk_invalid;
+            std::string tk_text;
+
+            const CharT * tk_start = token_text.lo();
+            const CharT * tk_end = token_text.hi();
+
+            /* guard: code below inspects the first character */
+            if (tk_start == tk_end) {
+                throw std::runtime_error(tostr("tokenizer::assemble_token",
+                                               ": unexpected empty input"));
+            }
+
+            const CharT * ix = tk_start;
+
+            /* switch here applies to the first character in a token */
+            switch (*ix) {
+            case '-':
+            case '+':
+            case '.':
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            {
+                /* examples of valid floating-point numbers:
+                 *   .0
+                 *   1e0
+                 *   1e
+                 *   0.
+                 *   +1e0
+                 *   -1e0
+                 *   +1E+2
+                 *   -1E+2
+                 *   -0.123e-10
+                 * non-examples:
+                 *   .
+                 *   -
+                 *   +
+                 *   e0
+                 *   .e0
+                 *   -.e-0
+                 *   +.e+0
+                 *
+                 * in particular: to be recognized as a number,
+                 * must contain at least one digit
+                 *
+                 * NOTE(review): "1e" is classified tk_f64 here, but
+                 * token::f64_value() appears to reject an exponent marker
+                 * with nothing after it -- confirm intended behavior.
+                 */
+
+                log && log("possible number-token");
+
+                /* true if initial sign -/+ encountered */
+                bool sign_flag = false;
+                /* true if '.' encountered */
+                bool period_flag = false;
+                /* true if 'e' | 'E' encountered.
+                 */
+                bool exponent_flag = false;
+                /* true when sign '-' | '+' precedes exponent digits */
+                bool exponent_sign_flag = false;
+                /* true when at least one digit follows exponent marker */
+                bool exponent_digit_flag = false;
+                /* true if at least one digit encountered before exponent marker */
+                bool number_flag = false;
+
+                /* token will be one of: {i64, f64, dot}: */
+                for(; ix != tk_end; ++ix) {
+                    if((*ix == '-') || (*ix == '+')) {
+                        /* sign allowed:
+                         * 1. before period and before first digit
+                         * 2. after exponent marker, before exponent digits
+                         * in each position at most one sign is permitted
+                         */
+                        if (!period_flag && !number_flag && !sign_flag) {
+                            sign_flag = true;
+                        } else if (exponent_flag
+                                   && !exponent_digit_flag
+                                   && !exponent_sign_flag) {
+                            exponent_sign_flag = true;
+                        } else {
+                            throw std::runtime_error
+                                (tostr("tokenizer::assemble_token",
+                                       ": improperly placed sign indicator",
+                                       xtag("pos", ix - tk_start),
+                                       xtag("char", *ix)));
+                        }
+                    } else if(*ix == '.') {
+                        if (period_flag) {
+                            throw (std::runtime_error
+                                   (tostr("tokenizer::assemble_token",
+                                          ": duplicate decimal point",
+                                          xtag("pos", ix - tk_start),
+                                          xtag("char", *ix))));
+                        }
+
+                        period_flag = true;
+                    } else if((*ix == 'e') || (*ix == 'E')) {
+                        if (exponent_flag) {
+                            throw (std::runtime_error
+                                   (tostr("tokenizer::assemble_token",
+                                          ": duplicate exponent marker",
+                                          xtag("pos", ix - tk_start),
+                                          xtag("char", *ix))));
+                        }
+
+                        exponent_flag = true;
+                    } else if((*ix >= '0') && (*ix <= '9')) {
+                        /* explicit range test instead of isdigit():
+                         * isdigit() is undefined for negative char values
+                         */
+                        if (exponent_flag) {
+                            /* digit belongs to the exponent */
+                            exponent_digit_flag = true;
+                        } else {
+                            /* digit before exponent: enough to recognize as number */
+                            number_flag = true;
+                        }
+                    } else {
+                        /* invalid input */
+                        throw (std::runtime_error
+                               (tostr("tokenizer::assemble_token",
+                                      ": unexpected character in numeric constant",
+                                      xtag("pos", ix - tk_start),
+                                      xtag("char", *ix))));
+                    }
+                }
+
+                if (number_flag) {
+                    if (period_flag || exponent_flag) {
+                        tk_type = tokentype::tk_f64;
+                    } else {
+                        tk_type = tokentype::tk_i64;
+                    }
+                } else if (period_flag && !exponent_flag) {
+                    tk_type = tokentype::tk_dot;
+                } else {
+                    /* not a valid token */
+                }
+
+                log && log(xtag("sign_flag", sign_flag));
+                log && log(xtag("period_flag", period_flag),
+                           xtag("exponent_flag", exponent_flag),
+                           xtag("exponent_sign_flag", exponent_sign_flag),
+                           xtag("number_flag", number_flag));
+                log && log(xtag("tk_type", tk_type));
+
+                break;
+            }
+            case '"':
+            {
+                log && log("recognize string-token");
+
+                tk_type = tokentype::tk_string;
+
+                tk_text.reserve(tk_end - tk_start);
+
+                ++ix; /*skip initial " char*/
+
+                /* set once the closing (unescaped) " char has been consumed */
+                bool endofstring = false;
+
+                for (; ix != tk_end; ++ix) {
+                    log && log(xtag("*ix", *ix));
+
+                    switch(*ix) {
+                    case '"':
+                        endofstring = true;
+
+                        /* skip final " char, don't capture */
+                        ++ix;
+
+                        break;
+                    case '\\':
+                        /* skip escape char, don't capture */
+                        ++ix;
+
+                        if (ix == tk_end) {
+                            throw std::runtime_error
+                                (tostr("tokenizer::assemble_token",
+                                       ": malformed string literal",
+                                       xtag("input", std::string_view(token_text.lo(),
+                                                                      token_text.hi()))));
+                        }
+
+                        switch(*ix) {
+                        case '\\':
+                            log && log(xtag("*ix", *ix), xtag("escaped", "t"));
+                            tk_text.push_back(*ix);
+                            break;
+                        case 'n':
+                            log && log(xtag("*ix", *ix), xtag("newline", "t"));
+                            tk_text.push_back('\n');
+                            break;
+                        case 't':
+                            log && log(xtag("*ix", *ix), xtag("tab", "t"));
+                            tk_text.push_back('\t');
+                            break;
+                        case 'r':
+                            log && log(xtag("*ix", *ix), xtag("cr", "t"));
+                            tk_text.push_back('\r');
+                            break;
+                        case '"':
+                            log && log(xtag("*ix", *ix), xtag("quote", "t"));
+                            tk_text.push_back('"');
+                            break;
+                        default:
+                            throw std::runtime_error
+                                (tostr("tokenizer::assemble_token",
+                                       ": unexpected \\-escaped char",
+                                       xtag("char", *ix)));
+                        }
+                        break;
+                    default:
+                        tk_text.push_back(*ix);
+                        break;
+                    }
+
+                    if (endofstring)
+                        break;
+                }
+
+                /* reject both:
+                 * - literal that ran out of input before closing "
+                 * - text remaining after the closing "
+                 */
+                if (!endofstring || (ix != tk_end)) {
+                    throw std::runtime_error
+                        (tostr("tokenizer::assemble_token",
+                               ": expected \" to end string literal",
+                               xtag("input", std::string_view(token_text.lo(),
+                                                              token_text.hi()))));
+                }
+
+                log && log(tostr("tokenizer::assemble_token",
+                                 xtag("tk_text", tk_text)));
+
+                break;
+            }
+            case 'a': case 'A':
+            case 'b': case 'B':
+            case 'c': case 'C':
+            case 'd': case 'D':
+            case 'e': case 'E':
+            case 'f': case 'F':
+            case 'g': case 'G':
+            case 'h': case 'H':
+            case 'i': case 'I':
+            case 'j': case 'J':
+            case 'k': case 'K':
+            case 'l': case 'L':
+            case 'm': case 'M':
+            case 'n': case 'N':
+            case 'o': case 'O':
+            case 'p': case 'P':
+            case 'q': case 'Q':
+            case 'r': case 'R':
+            case 's': case 'S':
+            case 't': case 'T':
+            case 'u': case 'U':
+            case 'v': case 'V':
+            case 'w': case 'W':
+            case 'x': case 'X':
+            case 'y': case 'Y':
+            case 'z': case 'Z':
+            {
+                /* symbol/identifier must begin with a letter?
+                 * we want to accept some other chars too.
+                 * specifically want to allow identifiers:
+                 *   this-is-the-way
+                 *   this+is+also+the+way
+                 *   how/much/is/that/doggy
+                 *   put*an*asterisk*in*that
+                 *   something%special%
+                 *
+                 * like pure lisp,  we don't allow:
+                 * - identifier beginning with digit
+                 * - period .
+                 *
+                 * unlike pure lisp,  we don't allow anywhere in a symbol:
+                 * - colon     :
+                 * - semicolon ;
+                 * - comma     ,
+                 *
+                 * also we don't allow symbols to begin with special chars
+                 *
+                 * note: characters after the first are not inspected here;
+                 * the whole of token_text becomes the symbol text.
+                 */
+
+                tk_type = tokentype::tk_symbol;
+                break;
+            }
+            case '<':
+                tk_type = tokentype::tk_leftangle;
+                ++ix;
+                break;
+            case '>':
+                tk_type = tokentype::tk_rightangle;
+                ++ix;
+                break;
+            case '(':
+                tk_type = tokentype::tk_leftparen;
+                ++ix;
+                break;
+            case ')':
+                tk_type = tokentype::tk_rightparen;
+                ++ix;
+                break;
+            case '[':
+                tk_type = tokentype::tk_leftbracket;
+                ++ix;
+                break;
+            case ']':
+                tk_type = tokentype::tk_rightbracket;
+                ++ix;
+                break;
+            case '{':
+                tk_type = tokentype::tk_leftbrace;
+                ++ix;
+                break;
+            case '}':
+                tk_type = tokentype::tk_rightbrace;
+                ++ix;
+                break;
+            case ',':
+                tk_type = tokentype::tk_comma;
+                ++ix;
+                break;
+            case ';':
+                tk_type = tokentype::tk_semicolon;
+                ++ix;
+                break;
+            case ':':
+                tk_type = tokentype::tk_colon;
+                ++ix;
+                break;
+            case '=':
+                tk_type = tokentype::tk_singleassign;
+                ++ix;
+                break;
+            default:
+                break;
+            }
+
+            if (tk_type == tokentype::tk_invalid) {
+                /* report the whole token text: ix may equal tk_end here
+                 * (for example input "-"), so *ix must not be read
+                 */
+                throw std::runtime_error(tostr("tokenizer::assemble_token",
+                                               ": unexpected input x",
+                                               xtag("x", std::string(tk_start, tk_end))));
+            }
+
+            if ((tk_type == tokentype::tk_i64)
+                || (tk_type == tokentype::tk_f64)
+                || (tk_type == tokentype::tk_symbol))
+            {
+                /* re-parse in token::i64_value() / token::f64_value() */
+                tk_text = std::string(tk_start, tk_end);
+            } else if (tk_type == tokentype::tk_string) {
+                ; /* nothing to do here -- desired tk_text already constructed */
+            }
+
+            return token_type(tk_type, std::move(tk_text));
+        } /*assemble_token*/
+
+        /** Scan @p input for the next token.
+         *
+         *  Skips leading whitespace, then delimits one token:
+         *  - a punctuation char is a 1-character token
+         *  - a string literal runs to its closing unescaped double-quote
+         *  - anything else runs until whitespace or punctuation
+         *
+         *  If @p input ends before the token boundary is known,  the partial
+         *  text is appended to the stored prefix and an invalid token is
+         *  returned.  While a stored prefix exists,  scan always returns an
+         *  invalid token;  the stored text is resolved by
+         *  @ref tokenizer::notify_eof.
+         *
+         *  @param input  text to scan; may be empty
+         *  @return pair of: token (invalid if none completed),
+         *          and the prefix of @p input consumed by this call
+         *  @throw std::runtime_error on raw newline/cr inside a string literal,
+         *         or if the delimited text is not a well-formed token
+         **/
+        template <typename CharT>
+        auto
+        tokenizer<CharT>::scan(const span_type & input) -> scan_result
+        {
+            constexpr bool c_debug_flag = true;
+            scope log(XO_DEBUG(c_debug_flag));
+
+            log && log(xtag("input", input));
+
+            const CharT * ix = input.lo();
+
+            /* skip whitespace.
+             * bounds test must come first: *ix is not readable at input.hi()
+             */
+            while ((ix != input.hi()) && is_whitespace(*ix))
+                ++ix;
+
+            if(ix == input.hi()) {
+                /* no-op */
+                return {
+                    token_type::invalid(),
+                    input.prefix(ix)
+                };
+            }
+
+            /* here: *ix is not whitespace */
+
+            auto whitespace = input.prefix(ix);
+
+            log && log(xtag("whitespace.size", whitespace.size()));
+
+            /* tk_start points to beginning of token
+             * (after any whitespace)
+             */
+            const CharT * tk_start = ix;
+
+            if (is_punctuation(*ix)) {
+                /* 1-character token */
+                ++ix;
+            } else if (*ix == '"') {
+                bool complete_flag = false;
+
+                /* 1. embedded space/tab allowed in string literal.
+                 * 2. embedded newline/cr not allowed.
+                 */
+
+                /* true when the preceding char was an unescaped backslash.
+                 * tracking this (instead of just the previous char) lets
+                 * an escaped backslash before the closing quote,
+                 * as in "\\" ,  terminate the literal correctly.
+                 */
+                bool escaped = false;
+
+                ++ix;
+
+                for (; ix != input.hi(); ++ix) {
+                    if ((*ix == '\n') || (*ix == '\r')) {
+                        throw std::runtime_error
+                            (tostr("tokenizer::scan",
+                                   ": must use \\n or \\r to encode newline/cr in"
+                                   " string literal"));
+                    }
+
+                    if (escaped) {
+                        /* this char is consumed by the preceding backslash */
+                        escaped = false;
+                    } else if (*ix == '\\') {
+                        escaped = true;
+                    } else if (*ix == '"') {
+                        /* unescaped " char ends literal */
+                        ++ix;  /* include terminating " for assemble_token */
+                        complete_flag = true;
+                        break;
+                    }
+                }
+
+                if (!complete_flag) {
+                    /* need more input to know if/when token complete */
+                    this->prefix_ += std::string(tk_start, input.hi());
+
+                    log && log(xtag("captured-prefix", this->prefix_));
+                }
+            } else {
+                /* scan until:
+                 * - whitespace
+                 * - punctuation
+                 */
+                for (; ix != input.hi(); ++ix) {
+                    if (is_whitespace(*ix) || is_punctuation(*ix))
+                        break;
+                }
+
+                if (ix == input.hi()) {
+                    /* need more input to know if/when token complete */
+                    this->prefix_ += std::string(tk_start, input.hi());
+
+                    log && log(xtag("captured-prefix", this->prefix_));
+                }
+            }
+
+            auto token_span = input.after_prefix(whitespace).prefix(ix);
+
+            /* only assemble when nothing is pending in prefix_ */
+            token tk
+                = (this->prefix_.empty()
+                   ? assemble_token(token_span)
+                   : token_type(tokentype::tk_invalid));
+
+            return scan_result
+                { tk, input.prefix(whitespace.size() + token_span.size()) };
+        } /*scan*/
+
+        /** Notify end of input.
+         *
+         *  Assembles a token from any text stored by earlier
+         *  @ref tokenizer::scan calls.  The stored prefix is always empty
+         *  afterwards,  including when assembly throws.
+         *
+         *  @return assembled token; invalid token if nothing was stored
+         *  @throw std::runtime_error if stored text is not a well-formed token
+         **/
+        template <typename CharT>
+        auto
+        tokenizer<CharT>::notify_eof() -> token_type {
+            constexpr bool c_debug_flag = true;
+
+            scope log(XO_DEBUG(c_debug_flag));
+
+            if (this->prefix_.empty())
+                return token_type(tokentype::tk_invalid);
+
+            /* take the stored text out of the tokenizer before assembling,
+             * so that a throw from assemble_token() cannot leave stale
+             * text behind in prefix_
+             */
+            std::string pending;
+            pending.swap(this->prefix_);
+
+            return assemble_token(span_type(pending.data(),
+                                            pending.data() + pending.size()));
+        } /*notify_eof*/
+    } /*namespace tok*/
+} /*namespace xo*/
+
+/* end tokenizer.hpp */
--- a/include/xo/tokenizer/tokentype.hpp
+++ b/include/xo/tokenizer/tokentype.hpp
@ -0,0 +1,142 @@
+/** @file tokentype.hpp
+ *
+ *  author: Roland Conybeare, Jul 2024
+ **/
+
+#pragma once
+
+#include "xo/indentlog/print/tag.hpp" // for STRINGIFY
+#include <ostream>
+
+namespace xo {
+    namespace tok {
+        /** @enum tokentype
+         *  @brief enum to identify different schematica input token types
+         *
+         *  Schematica code examples:
+         *
+         *    type point :: { xcoord : f64, ycoord: f64 };
+         *    type matrix :: array<double, 2>;  // 2-d array
+         *
+         *    decl hypot(x : f64, y : f64) -> f64;
+         *
+         *    def hypot(x : f64, y : f64) {
+         *      let
+         *        x2 = (x * x);
+         *        y2 = (y * y);
+         *        hypot2 = (x2 + y2);
+         *      in
+         *        sqrt(hypot2);
+         *    };
+         *
+         *    def someconst 4;
+         *
+         *    def foo(v : vec<i32>) {
+         *      def (pi : f64) = 3.1415926;
+         *      def (h : (f64,f64) -> f64) = hypot;
+         *
+         *      h = hypot3;
+         *    };
+         *
+         *    def matrixproduct(x : matrix, y : matrix) {
+         *      [i,j : x.row(i) * y.col(j)];
+         *    };
+         **/
+        enum class tokentype {
+            /** sentinel value; also marks "no token" results **/
+            tk_invalid = -1,
+
+            /** an integer constant (signed 64-bit integer) **/
+            tk_i64,
+
+            /** a 64-bit floating-point constant **/
+            tk_f64,
+
+            /** a string literal **/
+            tk_string,
+
+            /** a symbol **/
+            tk_symbol,
+
+            /** left-hand parenthesis '(' **/
+            tk_leftparen,
+
+            /** right-hand parenthesis ')' **/
+            tk_rightparen,
+
+            /** left-hand bracket '[' **/
+            tk_leftbracket,
+
+            /** right-hand bracket ']' **/
+            tk_rightbracket,
+
+            /** left-hand brace '{' **/
+            tk_leftbrace,
+
+            /** right-hand brace '}' **/
+            tk_rightbrace,
+
+            /** left-hand angle bracket '<' **/
+            tk_leftangle,
+
+            /** right-hand angle bracket '>' **/
+            tk_rightangle,
+
+            /** dot '.' **/
+            tk_dot,
+
+            /** comma ',' **/
+            tk_comma,
+
+            /** colon ':' **/
+            tk_colon,
+
+            /** double-colon '::' **/
+            tk_doublecolon,
+
+            /** semi-colon ';' **/
+            tk_semicolon,
+
+            /** '=' **/
+            tk_singleassign,
+
+            /** ':=' **/
+            tk_assign,
+
+            /** '->' **/
+            tk_yields,
+
+            /** keyword 'type' **/
+            tk_type,
+
+            /** keyword 'def' **/
+            tk_def,
+
+            /** keyword 'lambda' **/
+            tk_lambda,
+
+            /** keyword 'if' **/
+            tk_if,
+
+            /** keyword 'let' **/
+            tk_let,
+
+            /** keyword 'in' **/
+            tk_in,
+
+            /** not a token type: must come last.
+             *  Since tk_invalid is -1 and tk_i64 is 0,  this equals the
+             *  number of valid token types.
+             **/
+            n_tokentype
+        }; /*tokentype*/
+
+        /** Describe @p tk_type as a c-string  (defined in tokentype.cpp).
+         *
+         *  Each valid token type maps to STRINGIFY() of its enumerator --
+         *  presumably the enumerator's name, e.g. "tk_i64"; confirm against
+         *  the STRINGIFY definition.
+         *  tk_invalid and n_tokentype map to "?tokentype";
+         *  any other value maps to "???".
+         **/
+        extern char const *
+        tokentype_descr(tokentype tk_type);
+
+        /** print description of @p tk_type  (see tokentype_descr())  on stream @p os **/
+        inline std::ostream &
+        operator<< (std::ostream & os, tokentype tk_type) {
+            return os << tokentype_descr(tk_type);
+        }
+    } /*namespace tok*/
+} /*namespace xo*/
+
+
+/* end tokentype.hpp */
--- a/src/tokenizer/CMakeLists.txt
+++ b/src/tokenizer/CMakeLists.txt
@ -0,0 +1,14 @@
+# tokenizer/CMakeLists.txt
+#
+# Declares the `tokenizer` library target and its dependencies,
+# using xo_* project macros that are defined outside this file.
+
+# name of the library target
+set(SELF_LIB tokenizer)
+# translation units compiled into the library
+set(SELF_SRCS
+    tokentype.cpp
+    token.cpp)
+
+# presumably arguments are: target, export-set name, project version,
+# soversion, source files -- TODO confirm against the xo macro definition
+xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS})
+#xo_dependency(${SELF_LIB} refcnt)
+# indentlog supplies the scope/xtag logging helpers used by the tokenizer headers
+xo_dependency(${SELF_LIB} indentlog)
+#xo_dependency(${SELF_LIB} subsys)
+#xo_boost_dependency(${SELF_LIB})
+
+# end CMakeLists.txt
--- a/src/tokenizer/token.cpp
+++ b/src/tokenizer/token.cpp
@ -0,0 +1,9 @@
+/** @file token.cpp
+ *
+ *  author: Roland Conybeare
+ **/
+
+#include "token.hpp"
+#include "xo/indentlog/print/tag.hpp"
+
+/** end token.cpp **/
--- a/src/tokenizer/tokentype.cpp
+++ b/src/tokenizer/tokentype.cpp
@ -0,0 +1,56 @@
+/* file tokentype.cpp
+ *
+ * author: Roland Conybeare
+ */
+
+#include "tokentype.hpp"
+
+namespace xo {
+    namespace tok {
+        /* Describe tk_type as a c-string.
+         * Valid token types map to STRINGIFY() of their enumerator;
+         * the two sentinels map to "?tokentype";  anything else to "???".
+         */
+        char const *
+        tokentype_descr(tokentype tk_type)
+        {
+            /* one switch label per enumerator, returning its STRINGIFY() text */
+#define TOKENTYPE_DESCR_CASE(x) case tokentype::x: return STRINGIFY(x)
+
+            switch(tk_type) {
+            /* sentinels: not real token types */
+            case tokentype::tk_invalid:
+            case tokentype::n_tokentype:
+                return "?tokentype";
+
+                /* literals + symbols */
+                TOKENTYPE_DESCR_CASE(tk_i64);
+                TOKENTYPE_DESCR_CASE(tk_f64);
+                TOKENTYPE_DESCR_CASE(tk_string);
+                TOKENTYPE_DESCR_CASE(tk_symbol);
+
+                /* brackets */
+                TOKENTYPE_DESCR_CASE(tk_leftparen);
+                TOKENTYPE_DESCR_CASE(tk_rightparen);
+                TOKENTYPE_DESCR_CASE(tk_leftbracket);
+                TOKENTYPE_DESCR_CASE(tk_rightbracket);
+                TOKENTYPE_DESCR_CASE(tk_leftbrace);
+                TOKENTYPE_DESCR_CASE(tk_rightbrace);
+                TOKENTYPE_DESCR_CASE(tk_leftangle);
+                TOKENTYPE_DESCR_CASE(tk_rightangle);
+
+                /* punctuation + operators */
+                TOKENTYPE_DESCR_CASE(tk_dot);
+                TOKENTYPE_DESCR_CASE(tk_comma);
+                TOKENTYPE_DESCR_CASE(tk_colon);
+                TOKENTYPE_DESCR_CASE(tk_doublecolon);
+                TOKENTYPE_DESCR_CASE(tk_semicolon);
+                TOKENTYPE_DESCR_CASE(tk_singleassign);
+                TOKENTYPE_DESCR_CASE(tk_assign);
+                TOKENTYPE_DESCR_CASE(tk_yields);
+
+                /* keywords */
+                TOKENTYPE_DESCR_CASE(tk_type);
+                TOKENTYPE_DESCR_CASE(tk_def);
+                TOKENTYPE_DESCR_CASE(tk_lambda);
+                TOKENTYPE_DESCR_CASE(tk_if);
+                TOKENTYPE_DESCR_CASE(tk_let);
+                TOKENTYPE_DESCR_CASE(tk_in);
+            }
+
+#undef TOKENTYPE_DESCR_CASE
+
+            /* value outside the enumerator list */
+            return "???";
+        } /*tokentype_descr*/
+    } /*namespace tok*/
+} /*namespace xo*/
+
+
+/* end tokentype.cpp */
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@ -0,0 +1,13 @@
+# build unittest tokenizer/utest
+#
+# Declares the unit-test executable for the tokenizer library,
+# using xo_* project macros that are defined outside this file.
+
+# name of the test executable
+set(SELF_EXECUTABLE_NAME utest.tokenizer)
+# test sources: test-runner main + one file per component under test
+set(SELF_SOURCE_FILES
+    tokenizer_utest_main.cpp
+    tokenizer.test.cpp
+    token.test.cpp)
+
+xo_add_utest_executable(${SELF_EXECUTABLE_NAME} ${SELF_SOURCE_FILES})
+# presumably links against the tokenizer library built in this same project -- TODO confirm
+xo_self_dependency(${SELF_EXECUTABLE_NAME} tokenizer)
+# presumably locates the Catch2 package and links its Catch2::Catch2 target -- TODO confirm
+xo_external_target_dependency(${SELF_EXECUTABLE_NAME} Catch2 Catch2::Catch2)
+
+# end CMakeLists.txt
--- a/utest/token.test.cpp
+++ b/utest/token.test.cpp
@ -0,0 +1,260 @@
+/* file token.test.cpp
+ *
+ * author: Roland Conybeare
+ */
+
+#include "xo/tokenizer/token.hpp"
+#include <catch2/catch.hpp>
+#include <memory>
+
+namespace xo {
+    using token = xo::tok::token<char>;
+    using xo::tok::tokentype;
+
+    namespace ut {
+        /* Test data + test for token::i64_value().
+         *
+         * Kept in the unnamed namespace (internal linkage) and given an
+         * i64-specific table name, so that it cannot clash with the other
+         * per-file tables named s_testcase_v.
+         */
+        namespace {
+            /* One row of the i64 parsing table. */
+            struct testcase_i64 {
+                /* text used to construct a tk_i64 token */
+                std::string text_;
+                /* true if i64_value() is expected to throw a std::exception */
+                bool expect_throw_;
+                /* value i64_value() must return when it does not throw */
+                std::int64_t expected_;
+            };
+
+            std::vector<testcase_i64> s_testcase_i64_v = {
+                /* empty text / sign without digits: must throw */
+                {"", true, 0},
+                {"0", false, 0},
+                {"-", true, 0},
+                {"+", true, 0},
+                /* signed zero */
+                {"-0", false, 0},
+                {"+0", false, 0},
+                /* one digit */
+                {"1", false, 1},
+                {"-1", false, -1},
+                {"9", false, 9},
+                {"-9", false, -9},
+                /* two digits, with and without sign */
+                {"12", false, 12},
+                {"+12", false, 12},
+                {"-12", false, -12},
+                {"99", false, 99},
+                {"-99", false, -99},
+                /* trailing non-digit: must throw */
+                {"123x", true, 0},
+            };
+
+            /* For each table row: build a tk_i64 token from the row's text,
+             * then verify that i64_value() either returns the expected value
+             * or throws a std::exception, as the row prescribes.
+             */
+            TEST_CASE("parse-i64", "[token]") {
+                for (std::size_t i_tc = 0, n_tc = s_testcase_i64_v.size(); i_tc < n_tc; ++i_tc) {
+                    auto const & testcase = s_testcase_i64_v[i_tc];
+
+                    /* identify the failing row in Catch2 output */
+                    INFO(xtag("i_tc", i_tc));
+                    INFO(xtag("text", testcase.text_));
+
+                    token tk(tokentype::tk_i64,
+                             testcase.text_);
+
+                    REQUIRE(tk.tk_type() == tokentype::tk_i64);
+
+                    bool throw_flag = false;
+                    try {
+                        std::int64_t x = tk.i64_value();
+
+                        REQUIRE(x == testcase.expected_);
+                    } catch (std::exception const &) {
+                        throw_flag = true;
+                    }
+
+                    REQUIRE(throw_flag == testcase.expect_throw_);
+                }
+            }
+        } /*namespace*/
+
+        /* A tk_i64 token whose text is just a sign character has no digits:
+         * asking for its integer value must raise a std::exception.
+         */
+        TEST_CASE("error-i64", "[token]") {
+            token sign_only(tokentype::tk_i64, "+");
+
+            REQUIRE_THROWS_AS(sign_only.i64_value(), std::exception);
+        }
+
+        namespace {
+            /* One row of the f64 parsing table consumed by the
+             * "parse-f64" test case.
+             */
+            struct testcase_f64 {
+                /* text used to construct a tk_f64 token */
+                std::string text_;
+                /* true if f64_value() is expected to throw a std::exception */
+                bool expect_throw_;
+                /* value f64_value() must (approximately) return when it does not throw */
+                double expected_;
+            };
+
+            /* Rows for the "parse-f64" test case: {text, expect_throw, expected}.
+             * Rows are laid out in runs of five; the comments below mark where
+             * each family of inputs begins.
+             */
+            std::vector<testcase_f64> s_testcase_v = {
+                /* integer-looking text, plus malformed inputs that must throw */
+                {"",     true, 0},
+                {"0",    false, 0},
+                {"-",    true, 0},
+                {"+",    true, 0},
+                {"-0",   false, 0},
+
+                {"+0",   false, 0},
+                {"1",    false, 1},
+                {"-1",   false, -1},
+                {"9",    false, 9},
+                {"-9",   false, -9},
+
+                {"12",   false, 12},
+                {"+12",  false, 12},
+                {"-12",  false, -12},
+                {"99",   false, 99},
+                {"-99",  false, -99},
+
+                {"123x", true, 0},
+                /* unsigned fractions, 1..9 fractional digits */
+                {"0.0",  false, 0.0},
+                {"0.1",  false, 0.1},
+                {"0.12", false, 0.12},
+                {"0.123", false, 0.123},
+
+                {"0.1234", false, 0.1234},
+                {"0.12345", false, 0.12345},
+                {"0.123456", false, 0.123456},
+                {"0.1234567", false, 0.1234567},
+                {"0.12345678", false, 0.12345678},
+
+                {"0.123456789", false, 0.123456789},
+                /* same fractions with explicit '+' sign */
+                {"+0.0",  false, 0.0},
+                {"+0.1",  false, 0.1},
+                {"+0.12", false, 0.12},
+                {"+0.123", false, 0.123},
+
+                {"+0.1234", false, 0.1234},
+                {"+0.12345", false, 0.12345},
+                {"+0.123456", false, 0.123456},
+                {"+0.1234567", false, 0.1234567},
+                {"+0.12345678", false, 0.12345678},
+
+                {"+0.123456789", false, 0.123456789},
+                /* exponent e0 */
+                {"+0.0e0",  false, 0.0},
+                {"+0.1e0",  false, 0.1},
+                {"+0.12e0", false, 0.12},
+                {"+0.123e0", false, 0.123},
+
+                {"+0.1234e0", false, 0.1234},
+                {"+0.12345e0", false, 0.12345},
+                {"+0.123456e0", false, 0.123456},
+                {"+0.1234567e0", false, 0.1234567},
+                {"+0.12345678e0", false, 0.12345678},
+
+                {"+0.123456789e0", false, 0.123456789},
+                /* exponent e1 (lowercase 'e') */
+                {"+0.0e1",  false, 00.},
+                {"+0.1e1",  false, 01.},
+                {"+0.12e1", false, 01.2},
+                {"+0.123e1", false, 01.23},
+
+                {"+0.1234e1", false, 01.234},
+                {"+0.12345e1", false, 01.2345},
+                {"+0.123456e1", false, 01.23456},
+                {"+0.1234567e1", false, 01.234567},
+                {"+0.12345678e1", false, 01.2345678},
+
+                {"+0.123456789e1", false, 01.23456789},
+                /* exponent E1 (uppercase 'E') */
+                {"+0.0E1",  false, 00.},
+                {"+0.1E1",  false, 01.},
+                {"+0.12E1", false, 01.2},
+                {"+0.123E1", false, 01.23},
+
+                {"+0.1234E1", false, 01.234},
+                {"+0.12345E1", false, 01.2345},
+                {"+0.123456E1", false, 01.23456},
+                {"+0.1234567E1", false, 01.234567},
+                {"+0.12345678E1", false, 01.2345678},
+
+                {"+0.123456789E1", false, 01.23456789},
+                /* exponent e9 */
+                {"+0.0e9",  false, 0.0},
+                {"+0.1e9",  false, 0.1e9},
+                {"+0.12e9", false, 0.12e9},
+                {"+0.123e9", false, 0.123e9},
+
+                {"+0.1234e9", false, 0.1234e9},
+                {"+0.12345e9", false, 0.12345e9},
+                {"+0.123456e9", false, 0.123456e9},
+                {"+0.1234567e9", false, 0.1234567e9},
+                {"+0.12345678e9", false, 0.12345678e9},
+
+                {"+0.123456789e9", false, 0.123456789e9},
+                /* negative fractions */
+                {"-0.0",  false, -0.0},
+                {"-0.1",  false, -0.1},
+                {"-0.12", false, -0.12},
+                {"-0.123", false, -0.123},
+
+                {"-0.1234", false, -0.1234},
+                {"-0.12345", false, -0.12345},
+                {"-0.123456", false, -0.123456},
+                {"-0.1234567", false, -0.1234567},
+                {"-0.12345678", false, -0.12345678},
+
+                {"-0.123456789", false, -0.123456789},
+                /* leading zero before the integer part; also a bare trailing '.' */
+                {"00.",  false, 0.0},
+                {"01.",  false, 1.0},
+                {"01.2", false, 1.2},
+                {"01.23", false, 1.23},
+
+                {"01.234", false, 1.234},
+                {"01.2345", false, 1.2345},
+                {"01.23456", false, 1.23456},
+                {"01.234567", false, 1.234567},
+                {"01.2345678", false, 1.2345678},
+
+                {"01.23456789", false, 1.23456789},
+                /* up to two integer digits (note: "0.0" also appears earlier in this table) */
+                {"0.0",  false, 0.0},
+                {"1.2",  false, 1.2},
+                {"12.", false, 12.0},
+                {"12.3", false, 12.3},
+
+                {"12.34", false, 12.34},
+                {"12.345", false, 12.345},
+                {"12.3456", false, 12.3456},
+                {"12.34567", false, 12.34567},
+                {"12.345678", false, 12.345678},
+
+                {"12.3456789", false, 12.3456789},
+                /* up to three integer digits (note: "01.23" and "12.3" also appear earlier) */
+                {"01.23",  false, 1.23},
+                {"12.3",  false, 12.3},
+                {"123.", false, 123.0},
+                {"123.4", false, 123.4},
+
+                {"123.45", false, 123.45},
+                {"123.456", false, 123.456},
+                {"123.4567", false, 123.4567},
+                {"123.45678", false, 123.45678},
+                {"123.456789", false, 123.456789},
+            };
+
+            /* For each row of the f64 table: build a tk_f64 token from the
+             * row's text, then verify that f64_value() either returns a value
+             * approximately equal to the expected one or throws a
+             * std::exception, as the row prescribes.
+             */
+            TEST_CASE("parse-f64", "[token]") {
+                /* row number, reported in Catch2 output on failure */
+                std::size_t i_tc = 0;
+
+                for (auto const & testcase : s_testcase_v) {
+                    INFO(tostr(xtag("i_tc", i_tc),
+                               xtag("text", testcase.text_)
+                             ));
+
+                    token tk(tokentype::tk_f64, testcase.text_);
+
+                    REQUIRE(tk.tk_type() == tokentype::tk_f64);
+
+                    std::string ex_msg;
+                    bool did_throw = false;
+
+                    try {
+                        double const parsed = tk.f64_value();
+
+                        REQUIRE(parsed == Approx(testcase.expected_).epsilon(1.0e-15));
+                    } catch (std::exception const & ex) {
+                        /* remember the message so it shows up if the check below fails */
+                        did_throw = true;
+                        ex_msg = ex.what();
+                    }
+
+                    INFO(xtag("ex_msg", ex_msg));
+
+                    REQUIRE(did_throw == testcase.expect_throw_);
+
+                    ++i_tc;
+                }
+            }
+        } /*namespace*/
+    } /*namespace ut*/
+} /*namespace xo*/
+
+/* end token.test.cpp */
--- a/utest/tokenizer.test.cpp
+++ b/utest/tokenizer.test.cpp
@ -0,0 +1,160 @@
+/* file tokenizer.test.cpp
+ *
+ * author: Roland Conybeare
+ */
+
+#include "tokenizer.hpp"
+#include <catch2/catch.hpp>
+
+namespace xo {
+    using xo::tok::tokentype;
+    using token = xo::tok::token<char>;
+    using xo::tok::span;
+
+    namespace ut {
+        /* Test data for the "tokenizer" test case (internal linkage). */
+        namespace {
+            /* One row of the tokenizer table. */
+            struct testcase_tkz {
+                /* characters presented to tokenizer::scan() */
+                std::string input_;
+                /* NOTE(review): not read by the "tokenizer" test case in this
+                 * file; every row below sets it to false
+                 */
+                bool expect_throw_;
+                /* token the scan is expected to produce (type, and value/text where relevant) */
+                token expected_tk_;
+                /* true: span returned by scan() must equal the whole input span;
+                 * false: it must differ from it
+                 */
+                bool consume_all_;
+            };
+
+            std::vector<testcase_tkz>
+            s_testcase_v = {
+                /* single-character delimiters */
+                {"<", false, token::leftangle(), true},
+                {">", false, token::rightangle(), true},
+
+                {"(", false, token::leftparen(), true},
+                {")", false, token::rightparen(), true},
+
+                {"[", false, token::leftbracket(), true},
+                {"]", false, token::rightbracket(), true},
+
+                {"{", false, token::leftbrace(), true},
+                /* delimiter preceded by whitespace (space, tab, newline) */
+                {" {", false, token::leftbrace(), true},
+
+                {"\t{", false, token::leftbrace(), true},
+                {"\n{", false, token::leftbrace(), true},
+                {"}", false, token::rightbrace(), true},
+
+                /* integers filling the whole input */
+                {"0",  false, token::i64_token("0"), true},
+                {"1",  false, token::i64_token("1"), true},
+                {"12",  false, token::i64_token("12"), true},
+                {"123",  false, token::i64_token("123"), true},
+                {"1234",  false, token::i64_token("1234"), true},
+
+                /* integers followed by a space: consume_all_ is false */
+                {"0 ", false, token::i64_token("0"), false},
+                {"1 ", false, token::i64_token("1"), false},
+                {"12 ", false, token::i64_token("12"), false},
+                {"123 ", false, token::i64_token("123"), false},
+                {"1234 ", false, token::i64_token("1234"), false},
+
+                /* integer followed by a punctuation character: consume_all_ is false */
+                {"1<", false, token::i64_token("1"), false},
+                {"1>", false, token::i64_token("1"), false},
+                {"1(", false, token::i64_token("1"), false},
+                {"1)", false, token::i64_token("1"), false},
+                {"1[", false, token::i64_token("1"), false},
+                {"1]", false, token::i64_token("1"), false},
+                {"1{", false, token::i64_token("1"), false},
+                {"1}", false, token::i64_token("1"), false},
+                {"1;", false, token::i64_token("1"), false},
+                {"1:", false, token::i64_token("1"), false},
+                {"1,", false, token::i64_token("1"), false},
+
+                /* floating point: leading '.', optionally signed */
+                {".1", false, token::f64_token(".1"), true},
+                {".12", false, token::f64_token(".12"), true},
+                {".123", false, token::f64_token(".123"), true},
+
+                {"+.1", false, token::f64_token("+.1"), true},
+                {"+.12", false, token::f64_token("+.12"), true},
+                {"+.123", false, token::f64_token("+.123"), true},
+
+                {"-.1", false, token::f64_token("-.1"), true},
+                {"-.12", false, token::f64_token("-.12"), true},
+                {"-.123", false, token::f64_token("-.123"), true},
+
+                /* floating point: integer part followed by '.' */
+                {"1.", false, token::f64_token("1."), true},
+                {"1.2", false, token::f64_token("1.2"), true},
+                {"1.23", false, token::f64_token("1.23"), true},
+
+                /* floating point: exponent without '.' */
+                {"1e0", false, token::f64_token("1e0"), true},
+                {"1e-1", false, token::f64_token("1e-1"), true},
+                {"1e1", false, token::f64_token("1e1"), true},
+                {"1e+1", false, token::f64_token("1e+1"), true},
+
+                /* string literals; expected token text has quotes removed and escapes resolved */
+                {"\"hello\"", false, token::string_token("hello"), true},
+                /* tokenizer sees this input:
+                 *   "\"hi\", she said"
+                 */
+                {"\"\\\"hi\\\", she said\"", false, token::string_token("\"hi\", she said"), true},
+                /* tokenizer sees this input:
+                 *   "look ma, newline ->\n<- "
+                 */
+                {"\"look ma, newline ->\\n<- \"", false,
+                 token::string_token("look ma, newline ->\n<- "), true},
+                /* tokenizer sees this input:
+                 *   "tab to the right [\t], to the right [\t]"
+                 */
+                {"\"tab to the right [\\t], to the right [\\t]\"", false,
+                 token::string_token("tab to the right [\t], to the right [\t]"), true},
+
+                /* bare symbol */
+                {"symbol", false, token::symbol_token("symbol"), true},
+            };
+        }
+
+        /* Table-driven check of tokenizer::scan() over the rows of s_testcase_v.
+         *
+         * For every row: scan the row's input once with a fresh tokenizer; if
+         * the returned token is invalid, fetch the token reported by
+         * notify_eof() instead. The resulting token must match the expected
+         * token's type and, depending on that type, its numeric value or text.
+         * Finally the span returned by scan() is compared with the input span
+         * as directed by the row's consume_all_ flag.
+         */
+        TEST_CASE("tokenizer", "[tokenizer]") {
+            using tokenizer
+                = xo::tok::tokenizer<char>;
+
+            /* row number, reported in Catch2 output on failure */
+            std::size_t i_tc = 0;
+
+            for (const testcase_tkz & testcase : s_testcase_v) {
+                INFO(xtag("input", testcase.input_));
+                INFO(xtag("i_tc", i_tc));
+
+                tokenizer tkz;
+                tokenizer::span_type
+                    in_span(testcase.input_.c_str(),
+                            testcase.input_.c_str() + testcase.input_.size());
+
+                auto out = tkz.scan(in_span);
+                auto tk = out.first;
+
+                /* no complete token from scan(): ask the tokenizer what it has at end of input */
+                if (tk.is_invalid()) {
+                    tk = tkz.notify_eof();
+                }
+
+                REQUIRE(tk.tk_type() == testcase.expected_tk_.tk_type());
+
+                switch (tk.tk_type()) {
+                case tokentype::tk_i64:
+                    REQUIRE(!tk.text().empty());
+                    REQUIRE(tk.i64_value() == testcase.expected_tk_.i64_value());
+                    break;
+                case tokentype::tk_f64:
+                    REQUIRE(!tk.text().empty());
+                    REQUIRE(tk.f64_value() == testcase.expected_tk_.f64_value());
+                    break;
+                case tokentype::tk_string:
+                    /* tk.text() can be empty, consider input "" */
+                    REQUIRE(tk.text() == testcase.expected_tk_.text());
+                    break;
+                case tokentype::tk_symbol:
+                    REQUIRE(!tk.text().empty());
+                    REQUIRE(tk.text() == testcase.expected_tk_.text());
+                    break;
+                default:
+                    /* every other token type is expected to carry no text */
+                    REQUIRE(tk.text().empty());
+                    break;
+                }
+
+                /* consume_all_ says whether the span reported by scan() must
+                 * equal the entire input span, or must differ from it
+                 */
+                if (testcase.consume_all_) {
+                    REQUIRE(out.second == in_span);
+                } else {
+                    REQUIRE(out.second != in_span);
+                }
+
+                ++i_tc;
+            }
+        }
+
+    } /*namespace ut*/
+} /*namespace xo*/
+
+/* end tokenizer.test.cpp */
--- a/utest/tokenizer_utest_main.cpp
+++ b/utest/tokenizer_utest_main.cpp
@ -0,0 +1,6 @@
+/* file tokenizer_utest_main.cpp */
+
+/* Defining CATCH_CONFIG_MAIN before including the Catch2 header asks Catch2
+ * to supply main() for the test executable from this translation unit.
+ * The define must precede the include to take effect.
+ */
+#define CATCH_CONFIG_MAIN
+#include "catch2/catch.hpp"
+
+/* end tokenizer_utest_main.cpp */