xo-umbrella2/include/xo/tokenizer/tokenizer.hpp

775 lines
26 KiB
C++

/* file tokenizer.hpp
*
* author: Roland Conybeare, Jul 2024
*/
#pragma once
#include "token.hpp"
#include "span.hpp"
#include "xo/indentlog/scope.hpp"
#include <cassert>
namespace xo {
namespace scm {
/**
* Use:
* @code
* using tokenizer_type = tokenizer<char>;
* using span_type = tokenizer_type::span_type;
*
* tokenizer_type tkz;
* span_type input = ...;
*
* while !input.empty() {
* auto res = tkz.scan(input);
* const auto & tk = res.first;
*
* // do something with tk if tk.is_valid()
*
* input = input.after_prefix(res.second);
* }
*
* if endofinput {
* auto tk = tzk.notify_eof()
*
* // do something with tk if tk.is_valid()
* }
*
* // expect !tkz.has_prefix()
*
* @endcode
**/
template <typename CharT>
class tokenizer {
public:
using token_type = token<CharT>;
using span_type = span<const CharT>;
using scan_result = std::pair<token_type, span_type>;
public:
tokenizer() = default;
/** identifies whitespace chars.
* These are chars that do not belong to any token.
* They are not permitted to appear within
* a symbol or string token.
* Appearance of a whitespace char forces completion of
* preceding token.
**/
bool is_whitespace(CharT ch) const;
/** identifies punctuation chars.
* These are chars that are not permitted to appear within
* a symbol token. Instead they force completion of
* a preceding token, and start a new token with themselves
**/
bool is_1char_punctuation(CharT ch) const;
/** more-relazed version of is_1char_punctuation.
* Chars that are not permitted to appear within a symbol token,
* but may form token combined with next character
**/
bool is_2char_punctuation(CharT ch) const;
/** true if tokenizer contains stored prefix of
* possibly-incomplete token
**/
bool has_prefix() const { return !prefix_.empty(); }
/** assemble token from text @p token_text
**/
token_type assemble_token(const span_type & token_text) const;
/** scan for next input token, given @p input.
* Note tokenizer can consume input (e.g. whitespace)
* without completing a token
*
* @return {parsed token, consumed span}
**/
scan_result scan(const span_type & input);
/** When eof is false, same as scan(input).
* When eof is true and scan(input) does not report a token,
* return notify_eof()
**/
scan_result scan2(const span_type & input, bool eof);
/** notify end of input, resolve any stored input **/
token_type notify_eof();
private:
/** Accumulate partial token here.
* This will happen if input sent to @ref tokenizer::scan
* ends without a determinate token boundary.
**/
std::string prefix_;
}; /*tokenizer*/
template <typename CharT>
bool
tokenizer<CharT>::is_whitespace(CharT ch) const {
switch(ch) {
case ' ': return true;
case '\t': return true;
case '\n': return true;
case '\r': return true;
}
return false;
}
template <typename CharT>
bool
tokenizer<CharT>::is_1char_punctuation(CharT ch) const {
switch(ch) {
case '<':
return true;
case '>':
return true;
case '(':
return true;
case ')':
return true;
case '[':
return true;
case ']':
return true;
case '{':
return true;
case '}':
return true;
case ',':
return true;
case ';':
return true;
case ':':
/* can't be 1char punctuation -- can begin assignment token */
return false;
case '=':
return true;
case '-':
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */
return false;
case '+':
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */
return false;
case '*':
/* not punctuation -- allowed in symbol */
return false;
case '/':
/* not punctuation -- for symmetry with +,- */
return false;
case '.':
/* can't be punctuation -- can appear inside f64 token: e.g. 1.23 */
return false;
}
return false;
}
template <typename CharT>
bool
tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
switch(ch) {
case ':':
/* can begin := */
return true;
}
return false;
}
template <typename CharT>
auto
tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
{
constexpr bool c_debug_flag = true;
/* literal|pretty|streamlined */
log_config::style = function_style::streamlined;
scope log(XO_DEBUG(c_debug_flag));
log && log(xtag("token_text", token_text));
tokentype tk_type = tokentype::tk_invalid;
std::string tk_text;
const CharT * tk_start = token_text.lo();
const CharT * tk_end = token_text.hi();
const CharT * ix = tk_start;
/* switch here applies to the first character in a token */
switch (*ix) {
case '-':
case '+':
if (token_text.size() == 1) {
/* standalone '+' or '-' */
if (*ix == '+')
tk_type = tokentype::tk_plus;
else if(*ix == '-')
tk_type = tokentype::tk_minus;
}
/** fall through to numeric literal code below **/
;
case '.':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
{
/* examples of valid floating-point numbers:
* .0
* 1e0
* 1e
* 0.
* +1e0
* -1e0
* +1E+2
* -1E+2
* -0.123e-10
* non-examples:
* .
* -
* +
* e0
* .e0
* -.e-0
* +.e+0
*
* in particular: to be recognized as a number,
* must contain at least one digit
*/
log && log("possible number-token");
/* true if initial sign -/+ encountered */
bool sign_flag = false;
/* true if '.' encountered */
bool period_flag = false;
/* true if 'e' | 'E' encountered.
*/
bool exponent_flag = false;
/* true when sign '-' | '+' precedes exponenct digits */
bool exponent_sign_flag = false;
/* true when at least one digit follows exponent marker */
bool exponent_digit_flag = false;
/* true if at least one digit encountered */
bool number_flag = false;
/* token will be one of: {i64, f64, dot}: */
for(; ix != token_text.hi(); ++ix) {
if((*ix == '-') || (*ix == '+')) {
/* sign allowed:
* 1. before period and before first digit
* 2. after exponent
*/
if (!period_flag && !number_flag && !sign_flag) {
sign_flag = true;
} else if (exponent_flag && !exponent_digit_flag) {
exponent_sign_flag = true;
} else {
throw std::runtime_error
(tostr("tokenizer::assemble_token",
": improperly placed sign indicator",
xtag("pos", ix - tk_start),
xtag("char", *ix)));
}
} else if(*ix == '.') {
if (period_flag) {
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": duplicate decimal point",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
period_flag = true;
} else if((*ix == 'e') || (*ix == 'E')) {
if (exponent_flag) {
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": duplicate exponent marker",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
exponent_flag = true;
} else if(isdigit(*ix)) {
if (exponent_flag) {
/* need digit before exponent to recognize as number */
exponent_digit_flag = true;
} else {
number_flag = true;
}
} else {
/* invalid input */
throw (std::runtime_error
(tostr("tokenizer::assemble_token",
": unexpected character in numeric constant",
xtag("pos", ix - tk_start),
xtag("char", *ix))));
}
}
if (number_flag) {
if (period_flag || exponent_flag) {
tk_type = tokentype::tk_f64;
} else {
tk_type = tokentype::tk_i64;
}
} else if (period_flag && !exponent_flag) {
tk_type = tokentype::tk_dot;
} else {
/* not a valid token */
}
log && log(xtag("sign_flag", sign_flag));
log && log(xtag("period_flag", period_flag),
xtag("exponent_flag", exponent_flag),
xtag("exponent_sign_flag", exponent_sign_flag),
xtag("number_flag", number_flag));
log && log(xtag("tk_type", tk_type));
break;
}
case '*':
if (token_text.size() == 1) {
/* standalone '*' */
tk_type = tokentype::tk_star;
++ix;
} else {
/* '*' isn't punctuation -- but may allow appearance in a longer token
*
* thinking that x*y is a symbol with an embedded '*' character;
* in particular want to support kebab-case symbols like 'foo-config'
*/
}
break;
case '/':
if (token_text.size() == 1) {
/* standalone '/' */
tk_type = tokentype::tk_slash;
++ix;
}
break;
case '"':
{
log && log("recognize string-token");
tk_type = tokentype::tk_string;
tk_text.reserve(token_text.hi() - token_text.lo());
++ix; /*skip initial " char*/
for (; ix != token_text.hi(); ++ix) {
log && log(xtag("*ix", *ix));
bool endofstring = false;
switch(*ix) {
case '"':
endofstring = true;
/* skip final " char, don't capture */
++ix;
break;
case '\\':
/* skip escape char, don't capture */
++ix;
if (ix == token_text.hi()) {
throw std::runtime_error
(tostr("tokenizer::assemble_token",
": malformed string literal",
xtag("input", std::string_view(token_text.lo(),
token_text.hi()))));
}
switch(*ix) {
case '\\':
log && log(xtag("*ix", *ix), xtag("escaped", "t"));
tk_text.push_back(*ix);
break;
case 'n':
log && log(xtag("*ix", *ix), xtag("newline", "t"));
tk_text.push_back('\n');
break;
case 't':
log && log(xtag("*ix", *ix), xtag("tab", "t"));
tk_text.push_back('\t');
break;
case 'r':
log && log(xtag("*ix", *ix), xtag("cr", "t"));
tk_text.push_back('\r');
break;
case '"':
log && log(xtag("*ix", *ix), xtag("quote", "t"));
tk_text.push_back('"');
break;
default:
throw std::runtime_error
(tostr("tokenizer::assemble_token",
": unexpected \\-escaped char",
xtag("char", *ix)));
}
break;
default:
tk_text.push_back(*ix);
break;
}
if (endofstring)
break;
}
if (ix != token_text.hi()) {
throw std::runtime_error
(tostr("tokenizer::assemble_token",
": expected \" to end string literal",
xtag("input", std::string_view(token_text.lo(),
token_text.hi()))));
}
log && log(tostr("tokenizer::assemble_token",
xtag("tk_text", tk_text)));
break;
}
case 'a': case 'A':
case 'b': case 'B':
case 'c': case 'C':
case 'd': case 'D':
case 'e': case 'E':
case 'f': case 'F':
case 'g': case 'G':
case 'h': case 'H':
case 'i': case 'I':
case 'j': case 'J':
case 'k': case 'K':
case 'l': case 'L':
case 'm': case 'M':
case 'n': case 'N':
case 'o': case 'O':
case 'p': case 'P':
case 'q': case 'Q':
case 'r': case 'R':
case 's': case 'S':
case 't': case 'T':
case 'u': case 'U':
case 'v': case 'V':
case 'w': case 'W':
case 'x': case 'X':
case 'y': case 'Y':
case 'z': case 'Z':
{
/* symbol/identifier must begin with a letter?
* we want to accept some other chars too.
* specifically want to allow identifiers:
* this-is-the-way
* this+is+also+the+way
* how/much/is/that/doggy
* put*an*asterisk*in*that
* something%special%
*
* like pure lisp, we don't allow:
* - identifier beginning with digit
* - period .
*
* unlike pure lisp, we don't allow anywhere in a symbol:
* - colon :
* - semicolon ;
* - comma ,
*
* also we don't allow symbols to begin with special chars
*/
tk_type = tokentype::tk_symbol;
break;
}
case '<':
tk_type = tokentype::tk_leftangle;
++ix;
break;
case '>':
tk_type = tokentype::tk_rightangle;
++ix;
break;
case '(':
tk_type = tokentype::tk_leftparen;
++ix;
break;
case ')':
tk_type = tokentype::tk_rightparen;
++ix;
break;
case '[':
tk_type = tokentype::tk_leftbracket;
++ix;
break;
case ']':
tk_type = tokentype::tk_rightbracket;
++ix;
break;
case '{':
tk_type = tokentype::tk_leftbrace;
++ix;
break;
case '}':
tk_type = tokentype::tk_rightbrace;
++ix;
break;
case ',':
tk_type = tokentype::tk_comma;
++ix;
break;
case ';':
tk_type = tokentype::tk_semicolon;
++ix;
break;
case ':':
{
log && log("colon or assignment token");
if (*(ix + 1) == '=') {
tk_type = tokentype::tk_assign;
++ix;
++ix;
} else {
tk_type = tokentype::tk_colon;
++ix;
}
break;
}
case '=':
tk_type = tokentype::tk_singleassign;
++ix;
break;
default:
break;
}
if (tk_type == tokentype::tk_invalid) {
throw std::runtime_error(tostr("tokenizer::assemble_token",
": unexpected input x",
xtag("x", *ix)));
}
if ((tk_type == tokentype::tk_i64)
|| (tk_type == tokentype::tk_f64)
|| (tk_type == tokentype::tk_symbol))
{
/* re-parse in token::i64_value() / token::f64_value() */
tk_text = std::string(tk_start, tk_end);
} else if (tk_type == tokentype::tk_string) {
; /* nothing to do here -- desired tk_text already constructed */
}
if (tk_type == tokentype::tk_symbol) {
/* check for keywords */
bool keep_text = false;
if (tk_text == "type") {
tk_type = tokentype::tk_type;
} else if (tk_text == "def") {
tk_type = tokentype::tk_def;
} else if (tk_text == "lambda") {
tk_type = tokentype::tk_lambda;
} else if (tk_text == "if") {
tk_type = tokentype::tk_if;
} else if (tk_text == "let") {
tk_type = tokentype::tk_let;
} else if (tk_text == "in") {
tk_type = tokentype::tk_in;
} else if (tk_text == "end") {
tk_type = tokentype::tk_end;
} else {
/* keep as symbol */
keep_text = true;
}
if (!keep_text)
tk_text.clear();
}
return token_type(tk_type, std::move(tk_text));
} /*assemble_token*/
template <typename CharT>
auto
tokenizer<CharT>::scan(const span_type & input) -> scan_result
{
constexpr bool c_debug_flag = true;
scope log(XO_DEBUG(c_debug_flag));
log && log(xtag("input", input));
const CharT * ix = input.lo();
/* skip whitespace */
while (is_whitespace(*ix) && (ix != input.hi()))
++ix;
if(ix == input.hi()) {
/* no-op */
return {
token_type::invalid(),
input.prefix_upto(ix)
};
}
/* here: *ix is not whitespace */
auto whitespace = input.prefix_upto(ix);
log && log(xtag("whitespace.size", whitespace.size()));
/* tk_start points to beginning of token
* (after any whitespace)
*/
const CharT * tk_start = ix;
if (is_1char_punctuation(*ix)) {
/* 1-character token */
++ix;
} else if (is_2char_punctuation(*ix)) {
CharT ch1 = *ix;
(void)ch1;
++ix;
if (ix == input.hi()) {
/* need more input to know if/when token complete */
this->prefix_ += std::string(tk_start, input.hi());
log && log(xtag("captured-prefix", this->prefix_));
} else {
CharT ch2 = *ix;
if (((ch2 >= '0') && (ch2 <= '9'))
|| ((ch2 >= 'A') && (ch2 <= 'Z'))
|| ((ch2 >= 'a') && (ch2 <= 'z')))
{
/* treat as 1 char punctuation */
;
} else {
/* include next char */
++ix;
}
}
} else if (*ix == '"') {
bool complete_flag = false;
/* 1. embedded space/tab allowed in string literal.
* 2. embedded newline/cr not allowed.
*/
CharT prev_ch = '"';
++ix;
for (; ix != input.hi(); ++ix) {
/* looking for unescaped " char to end literal */
if (*ix == '"') {
if (prev_ch != '\\') {
++ix; /* include terminating " for assemble_token */
complete_flag = true;
break;
}
} else if ((*ix == '\n') || (*ix == '\r')) {
throw std::runtime_error
(tostr("tokenizer::scan",
": must use \\n or \\r to encode newline/cr in"
" string literal"));
}
prev_ch = *ix;
}
if (!complete_flag) {
/* need more input to know if/when token complete */
this->prefix_ += std::string(tk_start, input.hi());
log && log(xtag("captured-prefix", this->prefix_));
}
} else {
/* scan until:
* - whitespace
* - punctuation
*/
for (; ix != input.hi(); ++ix) {
if (is_whitespace(*ix)
|| is_1char_punctuation(*ix)
|| is_2char_punctuation(*ix))
{
break;
}
}
if (ix == input.hi()) {
/* need more input to know if/when token complete */
this->prefix_ += std::string(tk_start, input.hi());
log && log(xtag("captured-prefix", this->prefix_));
}
}
auto token_span = input.after_prefix(whitespace).prefix_upto(ix);
token tk
= (this->prefix_.empty()
? assemble_token(token_span)
: token_type(tokentype::tk_invalid));
return scan_result
{ tk, input.prefix(whitespace.size() + token_span.size()) };
} /*scan*/
template <typename CharT>
auto
tokenizer<CharT>::scan2(const span_type & input, bool eof) -> scan_result {
auto sr = this->scan(input);
if (!sr.first.is_valid() && eof) {
sr.first = this->notify_eof();
/* always consume remainder of input here.
* ambiguous prefix can represent at most one token
*/
sr.second = input;
}
return sr;
}
template <typename CharT>
auto
tokenizer<CharT>::notify_eof() -> token_type {
constexpr bool c_debug_flag = true;
scope log(XO_DEBUG(c_debug_flag));
token tk
= (this->prefix_.empty()
? token_type(tokentype::tk_invalid)
: assemble_token(span_type(&prefix_[0],
&prefix_[prefix_.size()])));
this->prefix_.clear();
return tk;
} /*notify_eof*/
} /*namespace scm*/
} /*namespace xo*/
/* end tokenizer.hpp */