xo-tokenizer2/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp

/* file tokenizer.hpp
 *
 * author: Roland Conybeare, Jul 2024
 */

#pragma once

#include "token.hpp"
#include "input_state.hpp"
#include "span.hpp"
#include "scan_result.hpp"
#include "xo/indentlog/scope.hpp"
#include "xo/indentlog/print/ppdetail_atomic.hpp"
#include <cassert>

namespace xo {
    namespace scm {
        /** @class tokenizer
         *  @brief Parse a Schematika character stream into lexical tokens
         *
         *  Use:
         *
         *  @code
         *    // see xo-tokenizer/example/tokenrepl/tokenrepl.cpp
         *    // for exact working code
         *
         *    using tokenizer_type = tokenizer<char>;
         *    using span_type = tokenizer_type::span_type;
         *
         *    tokenizer_type tkz;
         *    span_type input = ...;
         *
         *    while (!input.empty()) {
         *        auto [tk, consumed, error] = tkz.scan(input);
         *
         *        if (tk.is_valid()) {
         *            // do something with tk
         *        } else if (error.is_error()) {
         *            error.report(cout);
         *            break;
         *        }
         *
         *        input = tkz.consume(consumed, input);
         *    }
         *
         *    if endofinput {
         *        auto [tk, consumed, error] = tkz.notify_eof();
         *
         *        // do something with (final) tk if tk.is_valid()
         *    }
         *
         *  @endcode
         *
         * See tokentype.hpp for token types
         **/
        template <typename CharT>
        class tokenizer {
        public:
            /* convenience aliases, all parameterized on the character type CharT */
            using token_type = token<CharT>;
            using error_type = tokenizer_error<CharT>;
            /* read-only view of a contiguous run of input characters */
            using span_type = span<const CharT>;
            using input_state_type = input_state<CharT>;
            /* value returned by scan() / assemble_token() */
            using result_type = scan_result<CharT>;

        public:
            /** @defgroup tokenizer-ctors tokenizer constructors **/
            ///@{

            /** create tokenizer with empty state.
             *  @p debug_flag is handed to the embedded input state;
             *  assemble_token() and scan() consult it to enable logging.
             **/
            tokenizer(bool debug_flag = false);

            ///@}

            /** @defgroup tokenizer-access-methods tokenizer access methods **/
            ///@{

            /* The accessor below has the same name as the input_state<> template
             * that appears in its own return type.  gcc reports that
             * with -Wchanges-meaning, so the warning is silenced just for this
             * declaration.  (presumably __APPLE__ toolchains don't know this
             * warning option -- TODO confirm)
             */
#pragma GCC diagnostic push
#ifndef __APPLE__
#pragma GCC diagnostic ignored "-Wchanges-meaning"
#endif
            /** read-only access to input-tracking state **/
            const input_state<CharT> & input_state() const { return input_state_; }
#pragma GCC diagnostic pop

            ///@}

            /** @defgroup tokenizer-general-methods tokenizer methods **/
            ///@{

            /** identifies punctuation chars.
             *  These are chars that are not permitted to appear within
             *  a symbol token.  Instead they force completion of
             *  a preceding token,  and start a new token with themselves
             **/
            static bool is_1char_punctuation(CharT ch);

            /** more-relaxed version of is_1char_punctuation.
             *  Chars that are not permitted to appear within a symbol token,
             *  but may form a token combined with the next character
             **/
            static bool is_2char_punctuation(CharT ch);

            /** assemble token from text @p token_text.
             *  @p initial_whitespace   Amount of whitespace input being consumed from input.
             *  @p token_text subset of input_line representing a single token.
             *  @p input_state input state containing input_line
             *
             *  retval.consumed will represent some possibly-empty prefix of @p input
             **/
            static result_type assemble_token(std::size_t initial_whitespace,
                                              const span_type & token_text,
                                              input_state_type & input_state);

            /** degenerate version of assemble_token() on reaching end-of-file.
             *
             *  NOTE(review): @p input_state is const here, while assemble_token()
             *  takes a non-const reference; the definition forwards one to the other,
             *  which looks like it cannot compile once this member is instantiated
             *  -- confirm and reconcile the two signatures.
             **/
            static result_type assemble_final_token(const span_type & token_text,
                                                    const input_state_type & input_state);

            /** true if tokenizer contains stored prefix of
             *  possibly-incomplete token
             **/
            bool has_prefix() const { return !prefix_.empty(); }

            /** scan for next input token,  given @p input.
             *  Note:
             *  - tokenizer can consume input (e.g. whitespace)
             *    without completing a token
             *  - input will remember the extent of the last line of input
             *    for which parsing has begun, but not completed.
             *    It's required that at least that portion of the input span
             *    remain valid across scan() calls
             *
             *  @p input     input text to scan
             *  @p eof_flag  passed along to input_state_.capture_current_line();
             *               presumably true when no input follows @p input -- TODO confirm
             *
             *  @return scan result (see scan_result.hpp); the usage example in the
             *          class comment unpacks it as {token, consumed span, error}
             **/
            result_type scan(const span_type & input,
                             bool eof_flag);

            /** discard current line after error.  Just cleans up error-reporting state **/
            void discard_current_line();

            ///@}

        private:
            /** @defgroup tokenizer-instance-vars tokenizer instance variables **/
            ///@{

            /** track input state (line#,pos,..) for error messages.
             *  There's an ordering problem here:
             *  1. input_state_.skip_leading_whitespace() advances current line automagically
             *     when it sees \n
             *  2. need to capture value of @ref input_state_ _before_ newline
             *  3. but need newline to end token
             *  Also recall input_state_type needed for reporting errors.
             **/
            input_state_type input_state_;
            /** Accumulate partial token here.
             *  This will happen if input sent to @ref tokenizer::scan
             *  ends without whitespace such that last available token's extent is not determined
             **/
            std::string prefix_;

            ///@}
        }; /*tokenizer*/

        template <typename CharT>
        tokenizer<CharT>::tokenizer(bool debug_flag)
            : input_state_{debug_flag}
        {}

        template <typename CharT>
        bool
        tokenizer<CharT>::is_1char_punctuation(CharT ch) {
            /* Only brackets, comma and semicolon always stand alone as a token.
             *
             * Deliberately excluded:
             *   <  >   may begin <= or >= ; '>' also appears in the yields token ->
             *   :      may begin the assignment token :=
             *   =  !   may begin the comparison tokens == and !=
             *   -      appears inside f64 literals (1.23e-9) and begins ->
             *   +      appears inside f64 literals (1.23e+4)
             *   .      appears inside f64 literals (1.23)
             *   *      allowed in symbols
             *   /      treated like + and - for symmetry
             */
            switch (ch) {
            case '(': case ')':
            case '[': case ']':
            case '{': case '}':
            case ',':
            case ';':
                return true;
            default:
                return false;
            }
        }

        template <typename CharT>
        bool
        tokenizer<CharT>::is_2char_punctuation(CharT ch) {
            /* Characters listed here may not appear in symbol names.
             * '-' is intentionally absent, because of the way it appears
             * in numeric literals.
             *
             *   <  can begin <=
             *   >  can begin >=
             *   :  can begin :=
             *   =  can begin ==
             *   !  can begin !=
             */
            const bool starts_two_char_token = ((ch == '<')
                                                || (ch == '>')
                                                || (ch == ':')
                                                || (ch == '=')
                                                || (ch == '!'));

            return starts_two_char_token;
        }

        /** Classify @p token_text and build the corresponding token.
         *
         *  Dispatches on the first character of @p token_text:
         *  - sign, '.', digit   : tk_plus, tk_minus, tk_yields ("->"), tk_dot, tk_i64 or tk_f64
         *  - '*', '/'           : tk_star, tk_slash (only when standalone)
         *  - '=' '!' '<' '>' ':': one- or two-character operator tokens
         *  - '"'                : tk_string; escape sequences are decoded into the token text
         *  - letter             : tk_symbol, then remapped to tk_bool or a keyword token
         *  - bracket, ',', ';'  : the matching one-character token
         *  Anything else (including an empty @p token_text) is reported as an error.
         *
         *  This function never reads outside [token_text.lo(), token_text.hi()).
         *
         *  @param initial_whitespace  amount of whitespace preceding the token; only logged here
         *  @param token_text          span holding the text of exactly one candidate token
         *  @param input_state_ref     input state: supplies the debug flag and the current line,
         *                             and is handed to the error factory
         *  @return on success, a result holding the token together with an empty prefix
         *          of the current input line; on failure, whatever
         *          result_type::make_error_consume_current_line() produces
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::assemble_token(std::size_t initial_whitespace,
                                         const span_type & token_text,
                                         input_state_type & input_state_ref) -> result_type
        {
            /* literal|pretty|streamlined */
            /* NOTE(review): this assigns global logging configuration on every call,
             * whether or not debugging is enabled -- confirm that is intended.
             */
            log_config::style = function_style::streamlined;

            scope log(XO_DEBUG(input_state_ref.debug_flag()));
            log && log(xtag("token_text", token_text),
                       xtag("initial_whitespace", initial_whitespace),
                       xtag("input_state", input_state_ref));

            tokentype tk_type = tokentype::tk_invalid;
            std::string tk_text;

            const CharT * tk_start = token_text.lo();
            const CharT * tk_end = token_text.hi();

            /* ix is the scan cursor.  After the switch it only matters for
             * the position reported alongside an error.
             */
            const CharT * ix = tk_start;

            /* true iff the character following *ix belongs to this token
             * and equals expected.  Never looks past tk_end.
             */
            const auto next_char_is = [&ix, tk_end](CharT expected) -> bool {
                return ((tk_end - ix) > 1) && (*(ix + 1) == expected);
            };

            /* decimal digit test that is well-defined for every CharT value
             * (isdigit() is undefined for negative char values)
             */
            const auto is_decimal_digit = [](CharT ch) -> bool {
                return (ch >= '0') && (ch <= '9');
            };

            /* An empty token has no first character.  Dispatch on NUL in that case:
             * it reaches the default branch and is reported as an error below.
             */
            const CharT first_ch = (tk_start == tk_end) ? CharT() : *ix;

            /* switch here applies to the first character in a token */
            switch (first_ch) {
            case '-':
            case '+':
                if (token_text.size() == 1) {
                    /* standalone '+' or '-' */
                    if (*ix == '+')
                        tk_type = tokentype::tk_plus;
                    else if(*ix == '-')
                        tk_type = tokentype::tk_minus;
                }

                /** fall through to numeric literal code below.
                 *  For a standalone sign that code finds no digit and no period,
                 *  so it leaves tk_type as assigned above
                 **/
                [[fallthrough]];
            case '.':
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
            {
                /* examples of valid floating-point numbers:
                 *   .0
                 *   1e0
                 *   1e
                 *   0.
                 *   +1e0
                 *   -1e0
                 *   +1E+2
                 *   -1E+2
                 *   -0.123e-10
                 * non-examples:
                 *   .
                 *   -
                 *   +
                 *   e0
                 *   .e0
                 *   -.e-0
                 *   +.e+0
                 *
                 * in particular: to be recognized as a number,
                 * must contain at least one digit
                 */

                log && log("possible number-token");

                /* true if initial sign -/+ encountered */
                bool sign_flag = false;
                /* true if '.' encountered */
                bool period_flag = false;
                /* true if 'e' | 'E' encountered.
                 */
                bool exponent_flag = false;
                /* true when sign '-' | '+' precedes exponent digits (only logged) */
                bool exponent_sign_flag = false;
                /* true when at least one digit follows exponent marker */
                bool exponent_digit_flag = false;
                /* true if at least one digit encountered before any exponent marker */
                bool number_flag = false;

                log && log(xtag("*ix", *ix),
                           xtag("tk.length", token_text.size()));
                if (log && (ix + 1 < tk_end))
                    log(xtag("*(ix+1)", *(ix + 1)));

                if ((*ix == '-') && (ix + 2 == tk_end) && (*(ix + 1) == '>')) {
                    /* composing exactly '->' */
                    tk_type = tokentype::tk_yields;
                } else {
                    /* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */
                    for (; ix != tk_end; ++ix) {
                        if ((*ix == '-') || (*ix == '+')) {
                            /* sign allowed:
                             * 1. before period and before first digit
                             * 2. after exponent
                             */
                            if (!period_flag && !number_flag && !sign_flag) {
                                sign_flag = true;
                            } else if (exponent_flag && !exponent_digit_flag) {
                                exponent_sign_flag = true;
                            } else {
                                return result_type::make_error_consume_current_line
                                    (__FUNCTION__ /*src_function*/,
                                     "improperly placed sign indicator",
                                     (ix - tk_start),
                                     input_state_ref);
                            }
                        } else if (*ix == '.') {
                            if (period_flag) {
                                return result_type::make_error_consume_current_line
                                    (__FUNCTION__ /*src_function*/,
                                     "duplicate decimal point in numeric literal",
                                     (ix - tk_start),
                                     input_state_ref);
                            }

                            period_flag = true;
                        } else if ((*ix == 'e') || (*ix == 'E')) {
                            if (exponent_flag) {
                                return result_type::make_error_consume_current_line
                                    (__FUNCTION__ /*src_function*/,
                                     "duplicate exponent marker in numeric literal",
                                     (ix - tk_start),
                                     input_state_ref);
                            }

                            exponent_flag = true;
                        } else if (is_decimal_digit(*ix)) {
                            if (exponent_flag) {
                                /* digit belongs to the exponent; does not by itself
                                 * make the token a number
                                 */
                                exponent_digit_flag = true;
                            } else {
                                number_flag = true;
                            }
                        } else {
                            return result_type::make_error_consume_current_line
                                (__FUNCTION__ /*src_function*/,
                                 "unexpected character in numeric constant" /*error_description*/,
                                 (ix - tk_start),
                                 input_state_ref);
                        }
                    }

                    if (number_flag) {
                        if (period_flag || exponent_flag) {
                            tk_type = tokentype::tk_f64;
                        } else {
                            tk_type = tokentype::tk_i64;
                        }
                    } else if (period_flag && !exponent_flag) {
                        /* NOTE(review): a signed period such as "-." also lands here
                         * and becomes tk_dot -- confirm whether that is intended.
                         */
                        tk_type = tokentype::tk_dot;
                    } else {
                        /* not a valid number; tk_type keeps whatever it had
                         * (tk_plus/tk_minus for a standalone sign, else tk_invalid)
                         */
                    }

                    log && log(xtag("sign_flag", sign_flag));
                    log && log(xtag("period_flag", period_flag),
                               xtag("exponent_flag", exponent_flag),
                               xtag("exponent_sign_flag", exponent_sign_flag),
                               xtag("number_flag", number_flag));
                    log && log(xtag("tk_type", tk_type));
                }

                break;
            }
            case '*':
                if (token_text.size() == 1) {
                    /* standalone '*' */
                    tk_type = tokentype::tk_star;
                    ++ix;
                } else {
                    /* '*' isn't punctuation -- but may allow appearance in a longer token
                     *
                     * thinking that x*y is a symbol with an embedded '*' character;
                     * in particular want to support kebab-case symbols like 'foo-config'
                     *
                     * a longer token that *begins* with '*' stays invalid here
                     */
                }
                break;
            case '/':
                if (token_text.size() == 1) {
                    /* standalone '/' */
                    tk_type = tokentype::tk_slash;
                    ++ix;
                }
                break;
            case '=':
                log && log("singleassign or cmpeq token");

                if (next_char_is('=')) {
                    tk_type = tokentype::tk_cmpeq;
                    ix += 2;
                } else {
                    /* standalone '=' */
                    tk_type = tokentype::tk_singleassign;
                    ++ix;
                }
                break;
            case '!':
                if (next_char_is('=')) {
                    tk_type = tokentype::tk_cmpne;
                    ix += 2;
                } else {
                    /* standalone '!' -- not supported yet, reported as error below */

                    // TODO
                }
                break;
            case '"':
            {
                log && log("recognize string-token");

                tk_type = tokentype::tk_string;

                /* decoded text can't be longer than the raw token */
                tk_text.reserve(tk_end - tk_start);

                ++ix; /*skip initial " char*/

                /* true on final " */
                bool endofstring = false;

                for (; ix != tk_end; ++ix) {
                    log && log(xtag("*ix", *ix));

                    switch(*ix) {
                    case '"':
                        endofstring = true;

                        /* skip final " char, don't capture */
                        ++ix;

                        break;
                    case '\\':
                        /* skip escape char, don't capture */
                        ++ix;

                        if (ix == tk_end) {
                            return result_type::make_error_consume_current_line
                                (__FUNCTION__ /*src_function*/,
                                 "expecting key following escape character \\",
                                 (ix - tk_start),
                                 input_state_ref);
                        }

                        /* translate the character following the backslash */
                        switch(*ix) {
                        case '\\':
                            log && log(xtag("*ix", *ix), xtag("escaped", "t"));
                            tk_text.push_back(*ix);
                            break;
                        case 'n':
                            log && log(xtag("*ix", *ix), xtag("newline", "t"));
                            tk_text.push_back('\n');
                            break;
                        case 't':
                            log && log(xtag("*ix", *ix), xtag("tab", "t"));
                            tk_text.push_back('\t');
                            break;
                        case 'r':
                            log && log(xtag("*ix", *ix), xtag("cr", "t"));
                            tk_text.push_back('\r');
                            break;
                        case '"':
                            log && log(xtag("*ix", *ix), xtag("quote", "t"));
                            tk_text.push_back('"');
                            break;
                        default:
                            /* NOTE(review): message does not mention 't',
                             * which is accepted above
                             */
                            return result_type::make_error_consume_current_line
                                (__FUNCTION__ /*src_function*/,
                                 "expecting one of n|r|\"|\\ following escape \\",
                                 (ix - tk_start),
                                 input_state_ref);
                        }
                        break;
                    default:
                        tk_text.push_back(*ix);
                        break;
                    }

                    /* leave before the loop increment: ix already stepped past final " */
                    if (endofstring)
                        break;
                }

                if (!endofstring) {
                    return result_type::make_error_consume_current_line
                        (__FUNCTION__ /*src_function*/,
                         "missing terminating '\"' to complete literal string",
                         (ix - tk_start),
                         input_state_ref);
                }

                log && log(tostr("tokenizer::assemble_token",
                                 xtag("tk_text", tk_text)));

                break;
            }
            case 'a': case 'A':
            case 'b': case 'B':
            case 'c': case 'C':
            case 'd': case 'D':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'g': case 'G':
            case 'h': case 'H':
            case 'i': case 'I':
            case 'j': case 'J':
            case 'k': case 'K':
            case 'l': case 'L':
            case 'm': case 'M':
            case 'n': case 'N':
            case 'o': case 'O':
            case 'p': case 'P':
            case 'q': case 'Q':
            case 'r': case 'R':
            case 's': case 'S':
            case 't': case 'T':
            case 'u': case 'U':
            case 'v': case 'V':
            case 'w': case 'W':
            case 'x': case 'X':
            case 'y': case 'Y':
            case 'z': case 'Z':
            {
                /* symbol/identifier must begin with a letter?
                 * we want to accept some other chars too.
                 * specifically want to allow identifiers:
                 *   this-is-the-way
                 *   this+is+also+the+way
                 *   how/much/is/that/doggy
                 *   put*an*asterisk*in*that
                 *   something%special%
                 *
                 * like pure lisp,  we don't allow:
                 * - identifier beginning with digit
                 * - period .
                 *
                 * unlike pure lisp,  we don't allow anywhere in a symbol:
                 * - colon     :
                 * - semicolon ;
                 * - comma     ,
                 *
                 * also we don't allow symbols to begin with special chars
                 *
                 * only the first character is examined here
                 */

                tk_type = tokentype::tk_symbol;
                break;
            }
            case '<':
            {
                log && log("leftangle or lessequal token");

                if (next_char_is('=')) {
                    tk_type = tokentype::tk_lessequal;
                    ix += 2;
                } else {
                    tk_type = tokentype::tk_leftangle;
                    ++ix;
                }
                break;
            }
            case '>':
            {
                log && log("rightangle or greatequal token");

                if (next_char_is('=')) {
                    tk_type = tokentype::tk_greatequal;
                    ix += 2;
                } else {
                    tk_type = tokentype::tk_rightangle;
                    ++ix;
                }
                break;
            }
            case '(':
                tk_type = tokentype::tk_leftparen;
                ++ix;
                break;
            case ')':
                tk_type = tokentype::tk_rightparen;
                ++ix;
                break;
            case '[':
                tk_type = tokentype::tk_leftbracket;
                ++ix;
                break;
            case ']':
                tk_type = tokentype::tk_rightbracket;
                ++ix;
                break;
            case '{':
                tk_type = tokentype::tk_leftbrace;
                ++ix;
                break;
            case '}':
                tk_type = tokentype::tk_rightbrace;
                ++ix;
                break;
            case ',':
                tk_type = tokentype::tk_comma;
                ++ix;
                break;
            case ';':
                tk_type = tokentype::tk_semicolon;
                ++ix;
                break;
            case ':':
            {
                log && log("colon or assignment token");

                if (next_char_is('=')) {
                    tk_type = tokentype::tk_assign;
                    ix += 2;
                } else {
                    tk_type = tokentype::tk_colon;
                    ++ix;
                }
                break;
            }
            default:
                break;
            }

            if (tk_type == tokentype::tk_invalid) {
                return result_type::make_error_consume_current_line
                    (__FUNCTION__ /*src_function*/,
                     "illegal input character",
                     (ix - tk_start),
                     input_state_ref);
            }

            if ((tk_type == tokentype::tk_i64)
                || (tk_type == tokentype::tk_f64)
                || (tk_type == tokentype::tk_symbol))
            {
                /* note: capturing token text here;
                 *       for numeric literals will re-parse in token::i64_value() / token::f64_value()
                 */
                tk_text = std::string(tk_start, tk_end);
            } else if (tk_type == tokentype::tk_string) {
                ; /* nothing to do here -- desired tk_text already constructed */
            }

            if (tk_type == tokentype::tk_symbol) {
                /* check for keywords */

                /* keywords are identified by token type alone, so their text is dropped;
                 * booleans and ordinary symbols keep theirs
                 */
                bool keep_text = false;

                if ((tk_text == "true") || (tk_text == "false")) {
                    tk_type = tokentype::tk_bool;
                    keep_text = true;
                } else if (tk_text == "type") {
                    tk_type = tokentype::tk_type;
                } else if (tk_text == "def") {
                    tk_type = tokentype::tk_def;
                } else if (tk_text == "lambda") {
                    tk_type = tokentype::tk_lambda;
                } else if (tk_text == "if") {
                    tk_type = tokentype::tk_if;
                } else if (tk_text == "then") {
                    tk_type = tokentype::tk_then;
                } else if (tk_text == "else") {
                    tk_type = tokentype::tk_else;
                } else if (tk_text == "let") {
                    tk_type = tokentype::tk_let;
                } else if (tk_text == "in") {
                    tk_type = tokentype::tk_in;
                } else if (tk_text == "end") {
                    tk_type = tokentype::tk_end;
                } else {
                    /* keep as symbol */
                    keep_text = true;
                }

                if (!keep_text)
                    tk_text.clear();
            }

            /* input.prefix(0):
             * require caller preserves current input line until it's entirely exhausted
             */
            return result_type(token_type(tk_type, std::move(tk_text)),
                               input_state_ref.current_line().prefix(0));
        } /*assemble_token*/

        /* TODO: input_state_ as argument ? */
        /* End-of-input variant of assemble_token(): same classification,
         * with zero reported as the amount of preceding whitespace.
         *
         * NOTE(review): input_state is const here but assemble_token() takes
         * a non-const reference -- looks like this cannot compile once
         * instantiated; confirm and reconcile the signatures.
         */
        template <typename CharT>
        auto
        tokenizer<CharT>::assemble_final_token(const span_type & token_text,
                                               const input_state_type & input_state) -> result_type
        {
            constexpr std::size_t no_initial_whitespace = 0;

            return assemble_token(no_initial_whitespace, token_text, input_state);
        }

        template <typename CharT>
        auto
        tokenizer<CharT>::scan(const span_type & input,
                               bool eof_flag) -> result_type
        {
            scope log(XO_DEBUG(input_state_.debug_flag()));

            log && log(xtag("input", input));

            /* - Always at beginning of token when scan() invoked
             * - scan will not report any portion of line as consumed until it has
             *   emitted all tokens in that line.
             *   rationale: caller is allowed to discard storage that
             *   scan() reports as consumed. But will be holding that line
             *   until all tokens have been read.
             * - this means caller will typically call scan()
             *   with the same input span multiple times
             */

            /* automagically no-ops when the same input presented twice */
            this->input_state_.capture_current_line(input, eof_flag);

            const CharT * ix = this->input_state_.skip_leading_whitespace();

            if(ix == input.hi()) {
                log && log("end input -> consume current line");

                /* entirety of current line has been tokenized
                 *  -> caller may consume it
                 */
                return result_type::make_whitespace(this->input_state_.consume_current_line());
            }

            /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */

            // TODO:
            // 1. hoist complete_flag up here
            // 2. use in each branch
            // 3. common check for prefix-capturing after if-cascade below done

            /* here: *ix is not whitespace */

            auto whitespace_z = input_state_.whitespace();

            log && log(xtag("whitespace_z", whitespace_z));

            /* tk_start points to known beginning of token
             * (after any whitespace)
             *
             * goal is to leave ix pointing to 1 char past the end of the token
             */
            const CharT * tk_start = ix;

            if (is_1char_punctuation(*ix)) {
                /* 1-character token */
                ++ix;
            } else if (is_2char_punctuation(*ix)) {
                CharT ch1 = *ix;

                (void)ch1;

                ++ix;

#ifdef OBSOLETE // no longer a thing. either input ends in whitespace, or ends translation unit
                if (ix == input.hi()) {
                    /* need more input to know if/when token complete */
                    this->prefix_ += std::string(tk_start, input.hi());

                    log && log(xtag("captured-prefix1", this->prefix_));
                } else
#endif
                    {
                    CharT ch2 = *ix;

                    if (((ch2 >= '0') && (ch2 <= '9'))
                        || ((ch2 >= 'A') && (ch2 <= 'Z'))
                        || ((ch2 >= 'a') && (ch2 <= 'z')))
                    {
                        /* treat as 1 char punctuation */
                        ;
                    } else {
                        /* include next char */
                        ++ix;
                    }
                }
            } else if (*ix == '"') {
                bool complete_flag = false;

                /* 1. embedded space/tab allowed in string literal.
                 * 2. embedded newline/cr not allowed.
                 */
                CharT prev_ch = '"';

                ++ix;

                for (; ix != input.hi(); ++ix) {
                    /* looking for unescaped " char to end literal */
                    if (*ix == '"') {
                        if (prev_ch != '\\') {
                            ++ix;  /* include terminating " for assemble_token */
                            complete_flag = true;
                            break;
                        }
                    } else if ((*ix == '\n') || (*ix == '\r')) {
                        log && log ("string literal with naked newline or CR");

                        return result_type::make_error_consume_current_line
                            (__FUNCTION__ /*src_function*/,
                             "must use \\n or \\r to encode newline/cr in string literal",
                             (ix - tk_start),
                             this->input_state_);
                    }

                    prev_ch = *ix;
                }

                if (!complete_flag) {
                    log && log("unterminated string literal");

                    return result_type::make_error_consume_current_line
                               (__FUNCTION__ /*src_function*/,
                                "unterminated string literal",
                                (ix - tk_start),
                                this->input_state_);
                }
            } else {
                /* ix is start of some token */

                if (*ix == '-') {
                    /* this section load-bearing for input '->' scanning from beginning of token */
                    ++ix;

                    if (ix == input.hi()) {
                        /* need more input to know if/when token complete -- see captured-prefix5 below */
                    } else {
                        CharT ch2 = *ix;

                        if (ch2 == '>') {
                            /* include next char and complete token */
                            ++ix;

                            log && log("complete '->' token");

                            this->input_state_.advance_until(ix);

                            return assemble_token(whitespace_z,
                                                  span_type(tk_start, ix) /*token*/,
                                                  input_state_);
                        }

                        /* here: -123, -.5e-21 for example */
                    }
                } else if (*ix == '>') {
                    /* this section load-bearing for input '>=' scanning from beginning of token.
                     * Need this because '>' necessarily excluded from is_1char_punctuation()
                     */
                    ++ix;

                    if (ix == input.hi()) {
                        /* need more input to know if/when token complete -- see captured-prefix5 below */
                    } else {
                        CharT ch2 = *ix;

                        if (ch2 != '=') {
                            log && log("complete '>=' token");

                            this->input_state_.advance_until(ix);

                            /* ignore next char and complete token */
                            return assemble_token(whitespace_z,
                                                  span_type(tk_start, ix) /*token*/,
                                                  this->input_state_);
                        }

                        /* here: >= for example */
                    }
                }

                /* scan until:
                 * - whitespace
                 * - punctuation
                 */
                for (; ix != input.hi(); ++ix) {
                    if (input_state_type::is_whitespace(*ix)
                        || is_1char_punctuation(*ix)
                        || is_2char_punctuation(*ix))
                    {
                        break;
                    }

                    /* this section load-bearing for input '>' after beginning of a token, e.g. p> */
                    if ((ix > tk_start) && (*ix == '>'))
                        break;

                    /* this section load-bearing for input '->' at the end of another token, e.g. p->q */
                    if (*ix == '-') {
                        if (ix + 1 == input.hi()) {
                            /* need more input to know if/when token complete
                             *
                             *   apple-banana   parses as: {tk_symbol: apple-banana}
                             *   apple->        parses as: {tk_symbol: apple} {tk_yields}
                             *   apple-         illegal (may not end symbol with '-')
                             */
                            break;
                        }

                        if (*(ix + 1) == '>') {
                            /* treat '->' as punctuation;  complete preceding token */
                            break;
                        }
                    }
                }

#ifdef OBSOLETE
                if (ix == input.hi()) {
                    /* need more input to know if/when token complete */
                    this->prefix_ += std::string(tk_start, input.hi());

                    log && log(xtag("captured-prefix5", this->prefix_));
                }
#endif
            }

            log && log("assemble token z", xtag("token_z", ix - tk_start));

            assert(tk_start < ix);

            this->input_state_.advance_until(ix);

            return assemble_token(whitespace_z,
                                  span_type(tk_start, ix) /*token*/,
                                  this->input_state_);
        } /*scan*/

#ifdef OBSOLETE
        /** Scan @p input via scan(); when that produces neither a token
         *  nor an error and @p eof is set, finish up with notify_eof()
         *  on the input that follows what scan() consumed.
         *
         *  @param input  candidate input span
         *  @param eof    true if caller asserts no more input follows @p input
         *  @return result of scan() when it is a token or an error, or when
         *          @p eof is false. Otherwise the token and error from
         *          notify_eof(), with a consumed span that concatenates
         *          the spans consumed by scan() and by notify_eof().
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::scan2(const span_type & input, bool eof) -> result_type {
            scope log(XO_DEBUG(input_state_.debug_flag()));

            auto first_pass = this->scan(input);

            const bool want_eof_pass
                = (!first_pass.is_token() && !first_pass.is_error() && eof);

            if (!want_eof_pass) {
                return first_pass;
            }

            /* here: input held no unambiguous token,
             * so it can hold _at most one_ final token
             */

            span_type remaining = input.after_prefix(first_pass.consumed());

            auto eof_pass = this->notify_eof(remaining);

            /* consumed span in return value must also cover first_pass.consumed() */
            span_type all_consumed = span_type::concat(first_pass.consumed(),
                                                       eof_pass.consumed());

            return result_type(eof_pass.get_token(),
                               all_consumed,
                               eof_pass.error());
        }
#endif

#ifdef OBSOLETE
        /** Record that @p consumed has been dealt with, and compute what is
         *  left of @p input.
         *
         *  @param consumed  span reported as consumed; only its size is passed
         *                   on to input_state_
         *  @param input     span that @p consumed is taken from
         *  @return @p input with @p consumed stripped from the front
         *          (as computed by span::after_prefix())
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::consume(const span_type & consumed,
                                  const span_type & input) -> span_type
        {
            /* bring input-state bookkeeping up to date first.. */
            const auto n_consumed = consumed.size();

            input_state_.consume(n_consumed);

            /* ..then hand back the not-yet-consumed remainder */
            span_type remainder = input.after_prefix(consumed);

            return remainder;
        }
#endif

        /** Discard the current input line.
         *
         *  Pure delegation: forwards to discard_current_line() on this
         *  tokenizer's input_state_ member; no other tokenizer state is
         *  touched here.
         **/
        template <typename CharT>
        void
        tokenizer<CharT>::discard_current_line()
        {
            input_state_.discard_current_line();
        }

#ifdef OBSOLETE
        /** End-of-input notification.
         *
         *  Logs the pending prefix_ together with @p input (when debug logging
         *  is enabled), then reports @p input as whitespace.
         *
         *  @param input  remaining input at end-of-file
         *  @return whitespace-only scan result built from @p input
         **/
        template <typename CharT>
        auto
        tokenizer<CharT>::notify_eof(const span_type & input) -> result_type {
            scope log(XO_DEBUG(input_state_.debug_flag()));

            log && log(xtag("prefix_", prefix_),
                       xtag("prefix_.size", prefix_.size()),
                       xtag("input", input));

            /* passing input along is close to pointless:
             * when we get here from scan2(), input can only be whitespace
             */
            return result_type::make_whitespace(input);
        } /*notify_eof*/
#endif
    } /*namespace scm*/
} /*namespace xo*/

/* end tokenizer.hpp */