xo-tokenizer: build + utest + reasonable implementation

This commit is contained in:
Roland Conybeare 2024-07-22 12:30:46 +10:00
commit 9dc37e84e6
15 changed files with 2154 additions and 0 deletions

27
CMakeLists.txt Normal file
View file

@ -0,0 +1,27 @@
# xo-tokenizer/CMakeLists.txt
#
# Top-level build description for the tokenizer project:
# pulls in the shared xo cmake macros, then builds the library
# (src/tokenizer) and its unit tests (utest).
cmake_minimum_required(VERSION 3.10)
project(tokenizer VERSION 0.1)
# standard cmake module; defines the CMAKE_INSTALL_<dir> variables
include(GNUInstallDirs)
# project-local bootstrap script.
# NOTE(review): presumably this is what makes the xo_* macros used below
# available -- confirm against cmake/xo-bootstrap-macros.cmake
include(cmake/xo-bootstrap-macros.cmake)
# macro supplied by the xo macro package; its effect is not visible from this file
xo_cxx_toplevel_options3()
# ----------------------------------------------------------------
# c++ settings
# extra compiler flags for this project; empty by default.
# The commented-out line below is an alternative value kept for reference.
set(PROJECT_CXX_FLAGS "")
#set(PROJECT_CXX_FLAGS "-fconcepts-diagnostics-depth=2")
# NOTE(review): add_definitions() is documented for -D preprocessor definitions;
# add_compile_options() is the documented command for general compiler flags.
add_definitions(${PROJECT_CXX_FLAGS})
# ----------------------------------------------------------------
# library sources, then unit tests
add_subdirectory(src/tokenizer)
add_subdirectory(utest)
# ----------------------------------------------------------------
# provide find_package() support
# arguments: package name, package version, name of the exported target set
xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)

View file

@ -0,0 +1,8 @@
# Package-config template: @NAME@ placeholders are substituted when the
# config file is generated (configure_package_config_file() convention).
@PACKAGE_INIT@
# find_dependency() forwards to find_package() on behalf of the consuming project
include(CMakeFindDependencyMacro)
# packages a consumer must also find; commented-out entries are currently unused
#find_dependency(refcnt)
find_dependency(indentlog)
#find_dependency(subsys)
# exported targets file is expected next to this config file
include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
# reports failure if the consumer requested components that were not found
check_required_components("@PROJECT_NAME@")

View file

@ -0,0 +1,35 @@
# ----------------------------------------------------------------
# for example:
# $ PREFIX=/usr/local # for example
# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build
#
# will get
# CMAKE_MODULE_PATH
# from xo-cmake-config --cmake-module-path
#
# and expect .cmake macros in
# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake
# ----------------------------------------------------------------
# Locate the xo-cmake-config helper; it reports where the xo cmake macros live.
find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED)
# REQUIRED is only honoured by cmake >= 3.18, so keep an explicit check for
# older versions.  On failure find_program() sets the variable to
# <VAR>-NOTFOUND, which evaluates as false in if().
if (NOT XO_CMAKE_CONFIG_EXECUTABLE)
    # FATAL_ERROR (not FATAL) is the message() mode that stops configuration
    message(FATAL_ERROR "could not find xo-cmake-config executable")
endif()
message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}")

if (NOT XO_SUBMODULE_BUILD)
    # the literal value 'prefix' is the placeholder used in the example
    # command line at the top of this file; treat it like "not set"
    if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix))
        # default to typical install location for xo-project-macros.
        # Strip the trailing newline a command-line tool normally prints,
        # otherwise it becomes part of the module path.
        execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path
                        OUTPUT_VARIABLE CMAKE_MODULE_PATH
                        OUTPUT_STRIP_TRAILING_WHITESPACE)
        message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}")
    endif()
endif()

# needs to have been installed somewhere on CMAKE_MODULE_PATH,
# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX)
#
include(xo_macros/xo_cxx)
xo_cxx_bootstrap_message()

View file

@ -0,0 +1,324 @@
/** @file buffer.hpp **/
#pragma once
#include "span.hpp"
#include <utility>
#include <cstdint>
#include <cassert>
#include <new>
namespace xo {
namespace tok {
/**
* @class buffer buffer.hpp
*
* @brief Container for a (possibly owned) FIFO queue of chars
*
* @tparam CharT. buffer element type.
*
* @code
* .buf
*
* +------------------------------------------+
* | | ... | | X| ... | X| | ... | |
* +------------------------------------------+
* ^ ^ ^ ^
* 0 .lo .hi .buf_z
*
* <-contents-><----avail----->
* @endcode
*
* Buffer does not support wrapped content:
* content that has not been consumed always occupies contiguous memory.
*
* Example:
* @code
* // 1.
* buffer<char> buf(64*1024);
* buf.empty() -> true
* buf.buf_z() -> 65536
* buf.lo_pos() -> 0
* buf.hi_pos() -> 65536
* buf.contents() -> empty span
* buf.avail() -> span entire buffer memory
*
* // write to (a prefix of) buf.avail()
* ::strncpy(buf.buf(), "hello, world\n", 13);
* buf.produce(span_type(buf.buf(), buf.buf() + 13));
*
* buf.lo_pos() -> 0
* buf.hi_pos() -> 13
* buf.contents() -> "hello, world\n";
*
*
* // examine stored content (does not change buffer state)
* auto span = buf.contents();
* cerr << string_view(span.lo(), span.hi()); // "hello, world\n"
*
* // consume (a prefix of) stored content
* buf.consume(span.prefix(7);
*
* buf.lo_pos() -> 7
* buf.hi_pos() -> 13
* buf.contents() -> "world\n"
*
* // consuming all remain content resets to original state
* buf.consume(buf.contents());
*
* buf.empty() -> true
* buf.hi_pos() -> 0 // not 13!
*
* // 2.
* buffer<char> buf;
* buf.empty() -> true
* buf.buf_z() -> 0
* buf.lo_pos() -> 0
* buf.hi_pos() -> 0
* buf.contents() -> empty span
* buf.avail() -> empty span
*
* // allocate memory separately from ctor
* buf.alloc(64*1024);
* @endcode
**/
template <typename CharT>
class buffer {
public:
/** @brief typealias for span of CharT **/
using span_type = span<CharT>;
/** @brief typealias for buffer size (counts CharT's, not bytes) **/
using size_type = std::uint64_t;
public:
/** @brief create empty buffer.
Does not allocate any storage; @see alloc
**/
buffer() = default;
/** @brief create empty buffer, and possibly allocate storage.
@param buf_z Buffer size. allocate storage (owned by this buffer) if >0.
@param align_z Align to this value, e.g. 8 to align storage on an 8-byte boundary
**/
buffer(size_type buf_z, size_type align_z = sizeof(char))
: is_owner_{true},
buf_{buf_z ? (new (std::align_val_t(align_z)) CharT [buf_z]) : nullptr},
buf_z_{buf_z},
lo_pos_{0},
hi_pos_{0}
{}
/** @brief buffer is not copyable **/
buffer(buffer const & x) = delete;
/** @brief destructor. Release storage if owned **/
~buffer() { this->reset(); }
/** @name Access methods **/
///@{
/** @brief start of buffer memory **/
CharT * buf() const { return buf_; }
/** @brief buffer size (number of characters) **/
size_type buf_z() const { return buf_z_; }
/** @brief current start position within buffer **/
size_type lo_pos() const { return lo_pos_; }
/** @brief current end position within buffer **/
size_type hi_pos() const { return hi_pos_; }
///@}
/** @brief readonly access to a single buffer element.
Relative to start of buffer (ignores current consume position)
**/
CharT const & operator[](size_type i) const { return buf_[i]; }
/** @brief return span for current buffer contents **/
span_type contents() const { return span_type(buf_ + lo_pos_, buf_ + hi_pos_); }
/** @brief returns span for writable buffer contents (unused prefix following produce position **/
span_type avail() const { return span_type(buf_ + hi_pos_, buf_ + buf_z_); }
/** @brief @c true iff buffer is empty **/
bool empty() const { return lo_pos_ == hi_pos_; }
/**
@brief update buffer produce position, after (independently) writing contents of span to it
@pre left endpoint of @p span equals buffer produce position (@c .hi_pos)
@pre right endpoint of @p span within bounds of buffer memory range
@post right endpoint of @p span equals buffer produce position.
**/
void produce(span_type const & span) {
assert(span.lo() == buf_ + hi_pos_);
hi_pos_ += span.size();
}
/**
@brief update buffer consume position, when done with contents of span
@pre left endpoint of @p span equals buffer consume position (@c .lo_pos)
@pre right endpoint of @p span within bounds of buffer memory range
@post Either
buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0.
buffer is non-empty, right endpoint of @p span equals new buffer consume position.
**/
void consume(span_type const & span) {
if (span.size()) {
assert(span.lo() == buf_ + lo_pos_);
lo_pos_ += span.size();
} else {
/* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos,
* we don't want to blow up when called with an empty span -- argument
* may represent some pre-reset location in buffer
*/
}
if (lo_pos_ == hi_pos_) {
lo_pos_ = 0;
hi_pos_ = 0;
}
}
/**
@brief allocate buffer with desired amount of memory
@param buf_z desired buffer size
@param align_z alignment; buffer memory will be aligned on this byte-boundary.
**/
void alloc(size_type buf_z, size_type align_z = sizeof(char)) {
/* properly reset (+ discard) any existing state */
this->reset();
is_owner_ = true;
if (buf_z)
buf_ = new (std::align_val_t(align_z)) CharT [buf_z];
buf_z_ = buf_z;
lo_pos_ = 0;
hi_pos_ = 0;
}
/**
@brief attach buffer to (unowned) range of @p buf_z bytes starting at @p buf[0]
Buffer is not responsible for managing storage.
@post
1. buffer is empty
@post
2. buffer read position = buffer write position = 0
**/
void setbuf(CharT * buf, size_type buf_z) {
/* properly reset (+ discard) any existing state */
this->reset();
is_owner_ = false;
lo_pos_ = 0;
hi_pos_ = 0;
buf_ = buf;
buf_z_ = buf_z;
}
/**
@brief revert buffer to empty state and possibly zero it
@param zero_buffer_flag Zero buffer contents iff this is true
@post
1. buffer is empty
@post
2. buffer read position = buffer write position = 0
**/
void clear2empty(bool zero_buffer_flag) {
if (buf_ && zero_buffer_flag)
explicit_bzero(buf_, buf_z_ * sizeof(CharT));
lo_pos_ = 0;
hi_pos_ = 0;
}
/**
@brief swap representation with another buffer instance.
**/
void swap (buffer & x) {
std::swap(is_owner_, x.is_owner_);
std::swap(buf_, x.buf_);
std::swap(buf_z_, x.buf_z_);
std::swap(lo_pos_, x.lo_pos_);
std::swap(hi_pos_, x.hi_pos_);
}
/**
@brief reset buffer to an empty state and recover owned storage
**/
void reset() {
if (is_owner_ && buf_)
delete [] buf_;
is_owner_ = false;
buf_ = nullptr;
buf_z_ = 0;
lo_pos_ = 0;
hi_pos_ = 0;
}
/**
@brief move-assignment operator.
@param x right-hand-side to move from.
@post
@p x is in a valid, empty,
**/
buffer & operator= (buffer && x) {
is_owner_ = x.is_owner_;
buf_ = x.buf_;
buf_z_ = x.buf_z_;
lo_pos_ = x.lo_pos_;
hi_pos_ = x.hi_pos_;
x.is_owner_ = false;
x.lo_pos_ = 0;
x.hi_pos_ = 0;
x.buf_ = nullptr;
x.buf_z_ = 0;
return *this;
}
/** @brief buffer is not assignable */
buffer & operator= (buffer & x) = delete;
private:
/** @brief true iff buffer is responsible for freeing storage at @c buf_ **/
bool is_owner_ = false;
/** @brief buffer contents. buffer memory comprises @c buf_[0] to @c buf_[buf_z_] **/
CharT * buf_ = nullptr;
/** @brief buffer size (in units of CharT) **/
size_type buf_z_ = 0;
/** @brief buffer read (consume) position
@invariant
0 <= lo_pos_ <= hi_pos_ < buf_z_
**/
size_type lo_pos_ = 0;
/** @brief buffer write (produce) position
@invariant
0 <= hi_pos_ < hi_pos_ < buf_z_
**/
size_type hi_pos_ = 0;
};
/** @brief Non-member @c swap for @c buffer<CharT>.
 *
 *  Lets generic code that makes an unqualified call to @c swap exchange
 *  two buffers; forwards to @c buffer::swap on @p lhs.
 **/
template <typename CharT>
inline void swap(buffer<CharT> & lhs,
                 buffer<CharT> & rhs)
{
    lhs.swap(rhs);
}
} /*namespace tok*/
} /*namespace xo*/
/* end buffer.hpp */

View file

@ -0,0 +1,141 @@
/** @file span.hpp **/
#pragma once
#include <ostream>
#include <cstdint>
#include <cassert>
namespace xo {
namespace tok {
/** @class span span.hpp
 *
 * @brief Represents a contiguous memory range, without ownership.
 *
 * @tparam CharT type for elements referred to by this span.
 **/
template <typename CharT>
class span {
public:
    /** @brief typealias for span size (in units of CharT) **/
    using size_type = std::uint64_t;

public:
    /** @brief create empty span (both endpoints null) **/
    span() = default;
    /** @brief create span for the contiguous memory range [@p lo, @p hi) **/
    span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}

    ///@{
    /** @name getters **/
    CharT * lo() const { return lo_; } /* get member span::lo_ */
    CharT * hi() const { return hi_; } /* get member span::hi_ */
    ///@}

    /** @brief create new span over supplied type,
     *         with identical (possibly misaligned) endpoints.
     *
     *  @warning
     *  1. New span uses exactly the same memory addresses.
     *     Endpoint pointers may not be aligned.
     *  2. Implementation assumes code compiled with
     *     @code -fno-strict-aliasing @endcode enabled.
     *
     *  @tparam OtherT  element type for new span
     **/
    template <typename OtherT>
    span<OtherT>
    cast() const { return span<OtherT>(reinterpret_cast<OtherT *>(lo_),
                                       reinterpret_cast<OtherT *>(hi_)); }

    /** @brief create span including the first @p z members of this span.
     *  If @p z exceeds the size of this span, result is this whole span
     *  (same clamping as @ref after_prefix).
     **/
    span prefix(size_type z) const {
        if (z > size())
            z = size();

        return span(lo_, lo_ + z);
    }

    /** @brief create span representing prefix up to (but not including) @p *p.
     *  @p p is clamped to this span: a position before the start gives an
     *  empty span, a position past the end gives this whole span.
     **/
    span prefix(CharT * p) const {
        if (p < lo_)
            return span(lo_, lo_);
        if (p > hi_)
            return span(lo_, hi_);

        return span(lo_, p);
    }

    /** @brief create span with first @p z members of this span removed.
     *  If @p z exceeds the size of this span, result is empty.
     **/
    span after_prefix(size_type z) const {
        if (z > size())
            z = size();

        return span(lo_ + z, hi_);
    }

    /** @brief create span with @p prefix of this span removed.
     *  @pre @p prefix starts at the same address as this span
     *  @throws std::runtime_error if precondition is violated
     *          (after the @c assert, when asserts are compiled out)
     **/
    span after_prefix(const span & prefix) const {
        assert(prefix.lo() == lo_);

        if (prefix.lo() != lo_) {
            throw std::runtime_error
                ("after_prefix: expected prefix of this span");
        }

        return after_prefix(prefix.size());
    }

    /** @brief create span starting with position p.
     *  Result is empty (positioned at end of this span) if @p p is outside this span.
     **/
    span suffix_from(CharT * p) const {
        if ((lo_ <= p) && (p <= hi_))
            return span(p, hi_);
        else
            return span(hi_, hi_);
    }

    /** @brief true iff this span is empty (comprises 0 elements). **/
    bool empty() const { return lo_ == hi_; }
    /** @brief report the number of elements (of type CharT) in this span. **/
    size_type size() const { return hi_ - lo_; }

    /** print representation for this span on stream @p os.
     *
     *  NOTE(review): relies on @c xtag and @c xo::print::quot, which are not
     *  declared by anything this header includes directly; the including
     *  translation unit must provide them first.
     **/
    void print(std::ostream & os) const {
        os << "<span"
           << xtag("size", size())
           << " :text " << xo::print::quot(std::string_view(lo_, hi_))
           << ">";
    }

private:
    ///@{
    /** @brief start of span
        Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
    **/
    CharT * lo_ = nullptr;
    /** @brief end of span
        Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
    **/
    CharT * hi_ = nullptr;
    ///@}
}; /*span*/
/** @brief two spans are equal iff they refer to the same memory range
 *  (both endpoints identical); element values are not compared.
 **/
template <typename CharT>
inline bool
operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
    if (lhs.lo() != rhs.lo())
        return false;

    return (lhs.hi() == rhs.hi());
}
/** @brief two spans differ iff at least one endpoint differs;
 *  element values are not compared.
 **/
template <typename CharT>
inline bool
operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
    const bool same_lo = (lhs.lo() == rhs.lo());
    const bool same_hi = (lhs.hi() == rhs.hi());

    return !(same_lo && same_hi);
}
/** @brief stream inserter for spans; output is whatever @c span::print writes.
 *  @return @p os, to allow chaining
 **/
template <typename CharT>
inline std::ostream &
operator<<(std::ostream & os, const span<CharT> & x)
{
    x.print(os);

    return os;
}
} /*namespace tok*/
} /*namespace xo*/

View file

@ -0,0 +1,334 @@
/* file token.hpp
*
* author: Roland Conybeare, Jul 2024
*/
#pragma once
#include "tokentype.hpp"
#include "xo/indentlog/print/tag.hpp"
#include <stdexcept>
#include <string>
#include <cstdint>
namespace xo {
namespace tok {
namespace detail {
    /** @brief compute @p a * @p b ^ @p p by repeated squaring.
     *
     *  Returns @p a unchanged when @p p <= 0.
     *
     *  Loop invariant: (acc * base^k) equals the requested value.
     **/
    constexpr double
    pow_aux(double a, double b, int p) {
        double acc = a;
        double base = b;
        int k = p;

        while (k > 0) {
            if ((k & 1) != 0) {
                /* odd exponent: peel off one factor of base */
                acc *= base;
            }

            /* halve the exponent (rounding down drops the factor peeled off above) */
            k >>= 1;

            /* acc * base^(2k) = acc * (base^2)^k; skip the final, unused squaring */
            if (k > 0)
                base *= base;
        }

        /* acc * base^0 = acc */
        return acc;
    }

    /** @brief compute 10 ^ @p p; negative @p p yields the reciprocal of 10 ^ -p **/
    constexpr double
    pow10(int p) {
        return ((p >= 0)
                ? pow_aux(1.0, 10.0, p)
                : (1.0 / pow_aux(1.0, 10.0, -p)));
    }
}
template <typename CharT>
class token {
public:
token() = default;
token(tokentype tk_type, const std::string & text = "")
: tk_type_{tk_type}, text_{text} {}
static token invalid() { return token(); }
static token i64_token(const std::string & txt) {
return token(tokentype::tk_i64, txt);
}
static token f64_token(const std::string & txt) {
return token(tokentype::tk_f64, txt);
}
static token string_token(const std::string & txt) {
return token(tokentype::tk_string, txt);
}
static token symbol_token(const std::string & txt) {
return token(tokentype::tk_symbol, txt);
}
static token leftangle() { return token(tokentype::tk_leftangle); }
static token rightangle() { return token(tokentype::tk_rightangle); }
static token leftparen() { return token(tokentype::tk_leftparen); }
static token rightparen() { return token(tokentype::tk_rightparen); }
static token leftbracket() { return token(tokentype::tk_leftbracket); }
static token rightbracket() { return token(tokentype::tk_rightbracket); }
static token leftbrace() { return token(tokentype::tk_leftbrace); }
static token rightbrace() { return token(tokentype::tk_rightbrace); }
static token dot() { return token(tokentype::tk_dot); }
static token comma() { return token(tokentype::tk_comma); }
static token colon() { return token(tokentype::tk_colon); }
static token doublecolon() { return token(tokentype::tk_doublecolon); }
static token semicolon() { return token(tokentype::tk_semicolon); }
static token singleassign() { return token(tokentype::tk_singleassign); }
static token assign() { return token(tokentype::tk_assign); }
static token yields() { return token(tokentype::tk_yields); }
static token type() { return token(tokentype::tk_type); }
static token def() { return token(tokentype::tk_def); }
static token lambda() { return token(tokentype::tk_lambda); }
static token if_token() { return token(tokentype::tk_if); }
static token let() { return token(tokentype::tk_let); }
static token in() { return token(tokentype::tk_in); }
tokentype tk_type() const { return tk_type_; }
const std::string & text() const { return text_; }
bool is_valid() const { return tk_type_ != tokentype::tk_invalid; }
bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; }
/** expect input matching
* [+|-][0-9][0-9]*
**/
std::int64_t i64_value() const;
/** expect input matching
* [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]*
**/
double f64_value() const;
private:
/** category for this token **/
tokentype tk_type_ = tokentype::tk_invalid;
/** characters comprising this token.
* only provided for certain token types:
*
* tk_i64
* tk_f64
* tk_string
* tk_symbol
**/
std::string text_;
}; /*token*/
/** @brief convert text of a @c tk_i64 token to its integer value.
 *
 *  Accepts an optional sign followed by one or more decimal digits.
 *  The full range of std::int64_t is supported.
 *
 *  @return value represented by token text
 *  @throws std::runtime_error if token is not @c tk_i64, text is empty,
 *          text contains no digits, text contains a non-digit character,
 *          or value does not fit in std::int64_t
 **/
template <typename CharT>
std::int64_t
token<CharT>::i64_value() const {
    if (tk_type_ != tokentype::tk_i64) {
        throw (std::runtime_error
               (tostr("token::i64_value",
                      ": token with type tk found where tk_i64 expected",
                      xtag("tk", tk_type_))));
    }

    if (text_.empty()) {
        throw (std::runtime_error
               (tostr("token::i64_value",
                      ": unexpected empty input string for tk_i64 token")));
    }

    auto ix = text_.begin();
    const auto end_ix = text_.end();

    /* optional leading sign */
    bool negative = false;
    {
        CharT sign_ch = *ix;

        if (sign_ch == '+') {
            ++ix;
        } else if (sign_ch == '-') {
            negative = true;
            ++ix;
        }
    }

    if (ix == end_ix) {
        throw (std::runtime_error
               (tostr("token::i64_value",
                      ": input text found where at least one digit expected",
                      xtag("text", text_))));
    }

    /* accumulate as a non-positive number: the negative range of int64 is one
     * larger than the positive range, so this way the most negative value
     * can be represented while parsing
     */
    std::int64_t value = 0;

    for (; ix != end_ix; ++ix) {
        CharT ch = *ix;

        if ((ch < '0') || (ch > '9')) {
            throw (std::runtime_error
                   (tostr("token::i64_value",
                          ": unexpected char ch in integer token",
                          xtag("ch", ch))));
        }

        const std::int64_t digit = (ch - '0');

        /* (value * 10 - digit) would fall below INT64_MIN */
        if (value < (INT64_MIN + digit) / 10) {
            throw (std::runtime_error
                   (tostr("token::i64_value",
                          ": integer token out of range for 64-bit signed integer",
                          xtag("text", text_))));
        }

        value = (value * 10) - digit;
    }

    if (negative)
        return value;

    /* magnitude 2^63 is only representable with a negative sign */
    if (value == INT64_MIN) {
        throw (std::runtime_error
               (tostr("token::i64_value",
                      ": integer token out of range for 64-bit signed integer",
                      xtag("text", text_))));
    }

    return -value;
} /*i64_value*/
/** @brief convert text of a @c tk_f64 token to its floating-point value.
 *
 *  Accepts: optional sign, mantissa digits with at most one decimal point,
 *  then optionally an exponent marker (e|E) followed by an optional sign
 *  and at least one digit.
 *
 *  Value is computed as
 *    sign * mantissa * 10^(exp_sign * exponent - rh_digits + dropped_lh_digits)
 *  where mantissa is an integer holding (at most 18) significant digits.
 *  Result is not guaranteed to be the correctly-rounded nearest double.
 *
 *  @return value represented by token text
 *  @throws std::runtime_error if token is not @c tk_f64, text is empty,
 *          or text does not follow the format above
 **/
template <typename CharT>
double
token<CharT>::f64_value() const {
    if (tk_type_ != tokentype::tk_f64) {
        throw (std::runtime_error
               (tostr("token::f64_value",
                      ": token with type tk found where tk_f64 expected",
                      xtag("tk", tk_type_))));
    }

    if (text_.empty()) {
        throw (std::runtime_error
               (tostr("token::f64_value",
                      ": unexpected empty input string for tk_f64 token")));
    }

    /* 10^18 < 2^63, so a mantissa with up to 18 significant digits
     * cannot overflow std::int64_t.  A double carries fewer significant
     * decimal digits than that anyway.
     */
    constexpr int c_max_mantissa_digits = 18;
    /* stop growing exponent here, so it cannot overflow int.
     * Already far outside the range of double.
     */
    constexpr int c_max_exponent = 100000;

    int sign = 1;
    /* integer representing denormalized unsigned mantissa
     * (mantissa scaled by smallest power of 10 sufficient to make
     *  it an integer)
     */
    std::int64_t mantissa = 0;
    /* counts digits to the right of decimal point '.' that were stored in mantissa */
    int rh_digits = 0;
    /* counts digits to the left of decimal point that did not fit in mantissa;
     * each one scales the result by another factor of 10
     */
    int dropped_lh_digits = 0;
    /* sign of exponent */
    int exp_sign = 1;
    /* value of exponent = integer to the right of 'e' or 'E' */
    int exponent = 0;

    {
        auto ix = text_.begin();
        auto end_ix = text_.end();

        /* optional leading sign */
        {
            CharT sign_ch = *ix;

            if (sign_ch == '+') {
                ++ix;
            } else if (sign_ch == '-') {
                sign = -1;
                ++ix;
            }
        }

        if (ix == end_ix) {
            throw (std::runtime_error
                   (tostr("token::f64_value",
                          ": input text found where at least one digit expected",
                          xtag("text", text_))));
        }

        /* true iff decimal point '.' present in mantissa */
        bool have_decimal_point = false;
        /* counts significant digits stored in mantissa (leading zeros don't count) */
        int sig_digits = 0;

        /* loop over mantissa digits */
        for (; ix != end_ix; ++ix) {
            CharT ch = *ix;

            if (ch == '.') {
                if (have_decimal_point) {
                    throw (std::runtime_error
                           (tostr("token::f64_value",
                                  ": input text found where at most one decimal point expected",
                                  xtag("text", text_))));
                }

                have_decimal_point = true;
            } else if ((ch >= '0') && (ch <= '9')) {
                if (sig_digits < c_max_mantissa_digits) {
                    mantissa = (mantissa * 10) + (ch - '0');

                    if (mantissa != 0)
                        ++sig_digits;
                    if (have_decimal_point)
                        ++rh_digits;
                } else if (!have_decimal_point) {
                    /* digit doesn't fit: drop it, but keep its power of 10 */
                    ++dropped_lh_digits;
                } else {
                    /* digit right of decimal point, beyond stored precision: ignore */
                }
            } else if (ch == 'e' || ch == 'E') {
                break; // done with mantissa
            } else {
                throw (std::runtime_error
                       (tostr("token::f64_value",
                              ": unexpected char ch in floating-point token",
                              xtag("ch", ch))));
            }
        }

        if (ix != end_ix) {
            /* continue to read exponent */

            /* skip e|E */
            ++ix;

            /* optional exponent sign */
            if (ix != end_ix) {
                CharT exp_sign_ch = *ix;

                if (exp_sign_ch == '+') {
                    ++ix; /*skip*/
                } else if (exp_sign_ch == '-') {
                    exp_sign = -1;
                    ++ix;
                }
            }

            /* applies both to "1e" and to "1e+" */
            if (ix == end_ix) {
                throw (std::runtime_error
                       (tostr("token::f64_value",
                              ": on input text, expect at least one digit following exponent marker e|E",
                              xtag("text", text_))));
            }

            for (; ix != end_ix; ++ix) {
                CharT ch = *ix;

                if ((ch >= '0') && (ch <= '9')) {
                    if (exponent < c_max_exponent)
                        exponent = (exponent * 10) + (ch - '0');
                } else {
                    throw (std::runtime_error
                           (tostr("token::f64_value",
                                  "; on input text, expect only digits following"
                                  " (possibly signed) exponenct marker",
                                  xtag("text", text_))));
                }
            }
        }
    }

    /* zero mantissa: result is zero whatever the exponent.
     * (avoids computing 0 * infinity for huge exponents)
     */
    if (mantissa == 0)
        return 0.0;

    /* floating-point value will represent
     *   sign * mantissa * 10^(exp_sign*exponent - rh_digits + dropped_lh_digits)
     */
    double mantissa_f64 = sign * mantissa;

#ifdef OBSOLETE_DEBUG
    std::cerr << xtag("text", text_)
              << xtag("rh_digits", rh_digits)
              << xtag("mantissa_f64", mantissa_f64)
              << xtag("exp_sign", exp_sign)
              << xtag("exponent", exponent)
              << std::endl;
#endif

    double retval = (mantissa_f64
                     * detail::pow10((exp_sign * exponent)
                                     - rh_digits
                                     + dropped_lh_digits));

    return retval;
} /*f64_value*/
} /*Namespace tok*/
} /*namespace xo*/
/* end token.hpp */

View file

@ -0,0 +1,625 @@
/* file tokenizer.hpp
*
* author: Roland Conybeare, Jul 2024
*/
#pragma once
#include "token.hpp"
#include "span.hpp"
#include "xo/indentlog/scope.hpp"
#include <cassert>
namespace xo {
namespace tok {
/**
 * @brief Splits character input into tokens.
 *
 * Use:
 * @code
 *   using tokenizer_type = tokenizer<char>;
 *   using span_type = tokenizer_type::span_type;
 *
 *   tokenizer_type tkz;
 *   span_type input = ...;
 *
 *   while (!input.empty()) {
 *       auto res = tkz.scan(input);
 *       const auto & tk = res.first;
 *
 *       // do something with tk if tk.is_valid()
 *
 *       input = input.after_prefix(res.second);
 *   }
 *
 *   if (endofinput) {
 *       auto tk = tkz.notify_eof();
 *
 *       // do something with tk if tk.is_valid()
 *   }
 *
 *   // expect !tkz.has_prefix()
 *
 * @endcode
 **/
template <typename CharT>
class tokenizer {
public:
    using token_type = token<CharT>;
    using span_type = span<const CharT>;
    /** result of @ref scan : (token, span of input consumed) **/
    using scan_result = std::pair<token_type, span_type>;

public:
    tokenizer() = default;

    /** identifies whitespace chars.
     *  These are chars that do not belong to any token.
     *  They are not permitted to appear within
     *  a symbol or string token.
     *  Appearance of a whitespace char forces completion of
     *  preceding token.
     **/
    bool is_whitespace(CharT ch) const;

    /** identifies punctuation chars.
     *  These are chars that are not permitted to appear within
     *  a string/symbol token.  Instead they force completion of
     *  a preceding token, and start a new token with themselves
     **/
    bool is_punctuation(CharT ch) const;

    /** true if tokenizer contains stored prefix of
     *  possibly-incomplete token
     **/
    bool has_prefix() const { return !prefix_.empty(); }

    /** assemble token from text @p token_text
     **/
    token_type assemble_token(const span_type & token_text) const;

    /** scan for next input token, given @p input **/
    scan_result scan(const span_type & input);

    /** notify end of input, resolve any stored input **/
    token_type notify_eof();

private:
    /** Accumulate partial token here.
     *  This will happen if input sent to @ref tokenizer::scan
     *  ends without a determinate token boundary.
     **/
    std::string prefix_;
}; /*tokenizer*/
/** @brief true iff @p ch is one of: space, tab, newline, carriage return **/
template <typename CharT>
bool
tokenizer<CharT>::is_whitespace(CharT ch) const {
    return ((ch == ' ')
            || (ch == '\t')
            || (ch == '\n')
            || (ch == '\r'));
}
/** @brief true iff @p ch is one of the punctuation chars
 *    < > ( ) [ ] { } , ; : =
 *
 *  The chars - + . are deliberately excluded:
 *  they can appear inside an f64 token.
 **/
template <typename CharT>
bool
tokenizer<CharT>::is_punctuation(CharT ch) const {
    switch(ch) {
    case '<': case '>':
    case '(': case ')':
    case '[': case ']':
    case '{': case '}':
    case ',':
    case ';':
    case ':':
    case '=':
        return true;
    default:
        /* includes '-' '+' '.' : can't be punctuation -- can appear inside f64 token */
        break;
    }

    return false;
}
/** @brief build a token from the complete text of exactly one token.
 *
 *  The first character of @p token_text selects the token category:
 *  - sign, period or digit: number (@c tk_i64 / @c tk_f64), or @c tk_dot
 *  - double-quote: string literal; escapes \\ \n \t \r \" are decoded
 *  - letter: symbol
 *  - punctuation character: the corresponding single-character token
 *
 *  @param token_text  text of one token: no surrounding whitespace;
 *                     string literals include both quote characters
 *  @return assembled token.  Text is stored for i64/f64/symbol tokens (verbatim)
 *          and string tokens (decoded, without quotes).
 *  @throws std::runtime_error if @p token_text is empty or is not a well-formed token
 *
 *  NOTE: assigns @c log_config::style on every call, and logs with debug flag enabled.
 **/
template <typename CharT>
auto
tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
{
    constexpr bool c_debug_flag = true;

    /* literal|pretty|streamlined */
    log_config::style = function_style::streamlined;

    scope log(XO_DEBUG(c_debug_flag));

    log && log(xtag("token_text", token_text));

    tokentype tk_type = tokentype::tk_invalid;
    std::string tk_text;

    const CharT * tk_start = token_text.lo();
    const CharT * tk_end = token_text.hi();

    const CharT * ix = tk_start;

    /* nothing to dispatch on; must not dereference ix */
    if (tk_start == tk_end) {
        throw std::runtime_error(tostr("tokenizer::assemble_token",
                                       ": unexpected empty token text"));
    }

    /* switch here applies to the first character in a token */
    switch (*ix) {
    case '-':
    case '+':
    case '.':
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
    {
        /* examples of valid floating-point numbers:
         *   .0
         *   1e0
         *   1e
         *   0.
         *   +1e0
         *   -1e0
         *   +1E+2
         *   -1E+2
         *   -0.123e-10
         * non-examples:
         *   .
         *   -
         *   +
         *   e0
         *   .e0
         *   -.e-0
         *   +.e+0
         *
         * in particular: to be recognized as a number,
         * must contain at least one digit
         */

        log && log("possible number-token");

        /* true if initial sign -/+ encountered */
        bool sign_flag = false;
        /* true if '.' encountered */
        bool period_flag = false;
        /* true if 'e' | 'E' encountered.
         */
        bool exponent_flag = false;
        /* true when sign '-' | '+' precedes exponent digits */
        bool exponent_sign_flag = false;
        /* true when at least one digit follows exponent marker */
        bool exponent_digit_flag = false;
        /* true if at least one digit encountered */
        bool number_flag = false;

        /* token will be one of: {i64, f64, dot}: */

        for(; ix != tk_end; ++ix) {
            if((*ix == '-') || (*ix == '+')) {
                /* sign allowed:
                 * 1. before period and before first digit
                 * 2. after exponent (at most once, before exponent digits)
                 */
                if (!period_flag && !number_flag && !sign_flag) {
                    sign_flag = true;
                } else if (exponent_flag && !exponent_digit_flag && !exponent_sign_flag) {
                    exponent_sign_flag = true;
                } else {
                    throw std::runtime_error
                        (tostr("tokenizer::assemble_token",
                               ": improperly placed sign indicator",
                               xtag("pos", ix - tk_start),
                               xtag("char", *ix)));
                }
            } else if(*ix == '.') {
                if (period_flag) {
                    throw (std::runtime_error
                           (tostr("tokenizer::assemble_token",
                                  ": duplicate decimal point",
                                  xtag("pos", ix - tk_start),
                                  xtag("char", *ix))));
                }

                period_flag = true;
            } else if((*ix == 'e') || (*ix == 'E')) {
                if (exponent_flag) {
                    throw (std::runtime_error
                           (tostr("tokenizer::assemble_token",
                                  ": duplicate exponent marker",
                                  xtag("pos", ix - tk_start),
                                  xtag("char", *ix))));
                }

                exponent_flag = true;
            } else if((*ix >= '0') && (*ix <= '9')) {
                /* explicit range test: well-defined for any CharT value,
                 * unlike isdigit() on a negative char
                 */
                if (exponent_flag) {
                    /* need digit before exponent to recognize as number */
                    exponent_digit_flag = true;
                } else {
                    number_flag = true;
                }
            } else {
                /* invalid input */
                throw (std::runtime_error
                       (tostr("tokenizer::assemble_token",
                              ": unexpected character in numeric constant",
                              xtag("pos", ix - tk_start),
                              xtag("char", *ix))));
            }
        }

        if (number_flag) {
            if (period_flag || exponent_flag) {
                tk_type = tokentype::tk_f64;
            } else {
                tk_type = tokentype::tk_i64;
            }
        } else if (period_flag && !exponent_flag) {
            tk_type = tokentype::tk_dot;
        } else {
            /* not a valid token */
        }

        log && log(xtag("sign_flag", sign_flag));
        log && log(xtag("period_flag", period_flag),
                   xtag("exponent_flag", exponent_flag),
                   xtag("exponent_sign_flag", exponent_sign_flag),
                   xtag("number_flag", number_flag));
        log && log(xtag("tk_type", tk_type));

        break;
    }
    case '"':
    {
        log && log("recognize string-token");

        tk_type = tokentype::tk_string;

        tk_text.reserve(tk_end - tk_start);

        /* true once the closing (unescaped) " char has been seen */
        bool endofstring = false;

        ++ix; /*skip initial " char*/

        for (; ix != tk_end; ++ix) {
            log && log(xtag("*ix", *ix));

            switch(*ix) {
            case '"':
                endofstring = true;
                /* skip final " char, don't capture */
                ++ix;
                break;
            case '\\':
                /* skip escape char, don't capture */
                ++ix;

                if (ix == tk_end) {
                    throw std::runtime_error
                        (tostr("tokenizer::assemble_token",
                               ": malformed string literal",
                               xtag("input", std::string_view(token_text.lo(),
                                                              token_text.hi()))));
                }

                switch(*ix) {
                case '\\':
                    log && log(xtag("*ix", *ix), xtag("escaped", "t"));
                    tk_text.push_back(*ix);
                    break;
                case 'n':
                    log && log(xtag("*ix", *ix), xtag("newline", "t"));
                    tk_text.push_back('\n');
                    break;
                case 't':
                    log && log(xtag("*ix", *ix), xtag("tab", "t"));
                    tk_text.push_back('\t');
                    break;
                case 'r':
                    log && log(xtag("*ix", *ix), xtag("cr", "t"));
                    tk_text.push_back('\r');
                    break;
                case '"':
                    log && log(xtag("*ix", *ix), xtag("quote", "t"));
                    tk_text.push_back('"');
                    break;
                default:
                    throw std::runtime_error
                        (tostr("tokenizer::assemble_token",
                               ": unexpected \\-escaped char",
                               xtag("char", *ix)));
                }
                break;
            default:
                tk_text.push_back(*ix);
                break;
            }

            /* leave loop without the loop's own ++ix: already stepped past final " */
            if (endofstring)
                break;
        }

        /* two ways to fail:
         * 1. ran out of text without seeing the closing " char
         * 2. closing " char is followed by more text
         */
        if (!endofstring || (ix != tk_end)) {
            throw std::runtime_error
                (tostr("tokenizer::assemble_token",
                       ": expected \" to end string literal",
                       xtag("input", std::string_view(token_text.lo(),
                                                      token_text.hi()))));
        }

        log && log(tostr("tokenizer::assemble_token",
                         xtag("tk_text", tk_text)));

        break;
    }

    case 'a': case 'A':
    case 'b': case 'B':
    case 'c': case 'C':
    case 'd': case 'D':
    case 'e': case 'E':
    case 'f': case 'F':
    case 'g': case 'G':
    case 'h': case 'H':
    case 'i': case 'I':
    case 'j': case 'J':
    case 'k': case 'K':
    case 'l': case 'L':
    case 'm': case 'M':
    case 'n': case 'N':
    case 'o': case 'O':
    case 'p': case 'P':
    case 'q': case 'Q':
    case 'r': case 'R':
    case 's': case 'S':
    case 't': case 'T':
    case 'u': case 'U':
    case 'v': case 'V':
    case 'w': case 'W':
    case 'x': case 'X':
    case 'y': case 'Y':
    case 'z': case 'Z':
    {
        /* symbol/identifier must begin with a letter?
         * we want to accept some other chars too.
         * specifically want to allow identifiers:
         *   this-is-the-way
         *   this+is+also+the+way
         *   how/much/is/that/doggy
         *   put*an*asterisk*in*that
         *   something%special%
         *
         * like pure lisp, we don't allow:
         * - identifier beginning with digit
         * - period .
         *
         * unlike pure lisp, we don't allow anywhere in a symbol:
         * - colon :
         * - semicolon ;
         * - comma ,
         *
         * also we don't allow symbols to begin with special chars
         */
        tk_type = tokentype::tk_symbol;
        break;
    }

    case '<':
        tk_type = tokentype::tk_leftangle;
        ++ix;
        break;

    case '>':
        tk_type = tokentype::tk_rightangle;
        ++ix;
        break;

    case '(':
        tk_type = tokentype::tk_leftparen;
        ++ix;
        break;

    case ')':
        tk_type = tokentype::tk_rightparen;
        ++ix;
        break;

    case '[':
        tk_type = tokentype::tk_leftbracket;
        ++ix;
        break;

    case ']':
        tk_type = tokentype::tk_rightbracket;
        ++ix;
        break;

    case '{':
        tk_type = tokentype::tk_leftbrace;
        ++ix;
        break;

    case '}':
        tk_type = tokentype::tk_rightbrace;
        ++ix;
        break;

    case ',':
        tk_type = tokentype::tk_comma;
        ++ix;
        break;

    case ';':
        tk_type = tokentype::tk_semicolon;
        ++ix;
        break;

    case ':':
        tk_type = tokentype::tk_colon;
        ++ix;
        break;

    case '=':
        tk_type = tokentype::tk_singleassign;
        ++ix;
        break;

    default:
        break;
    }

    if (tk_type == tokentype::tk_invalid) {
        /* report the whole token text: ix may already be at end of text here
         * (number branch), so it must not be dereferenced
         */
        throw std::runtime_error(tostr("tokenizer::assemble_token",
                                       ": unexpected input x",
                                       xtag("x", std::string(tk_start, tk_end))));
    }

    if ((tk_type == tokentype::tk_i64)
        || (tk_type == tokentype::tk_f64)
        || (tk_type == tokentype::tk_symbol))
    {
        /* re-parse in token::i64_value() / token::f64_value() */
        tk_text = std::string(tk_start, tk_end);
    } else if (tk_type == tokentype::tk_string) {
        ; /* nothing to do here -- desired tk_text already constructed */
    }

    return token_type(tk_type, std::move(tk_text));
} /*assemble_token*/
/** @brief scan for the next token in @p input.
 *
 *  Leading whitespace is skipped.  A token ends at whitespace or punctuation;
 *  a punctuation char is a token by itself; a string literal ends at its
 *  closing unescaped double-quote.
 *
 *  If @p input ends before the end of a token can be determined,
 *  the partial token is stored (see @c has_prefix) and an invalid token
 *  is returned.  A following call continues the stored token with the
 *  beginning of its input; @c notify_eof resolves it if no more input arrives.
 *
 *  @param input  characters to scan
 *  @return pair (token, consumed):
 *          token is invalid if no complete token was recognized;
 *          consumed is the prefix of @p input used up by this call.
 *          When a stored token turns out to end exactly where @p input
 *          begins, consumed is empty while token is valid.
 *  @throws std::runtime_error if a string literal contains a raw newline/cr,
 *          or if the token text is rejected by @c assemble_token
 **/
template <typename CharT>
auto
tokenizer<CharT>::scan(const span_type & input) -> scan_result
{
    constexpr bool c_debug_flag = true;

    scope log(XO_DEBUG(c_debug_flag));

    log && log(xtag("input", input));

    const CharT * const input_end = input.hi();
    const CharT * ix = input.lo();

    /* true iff a previous call stored the beginning of a token;
     * in that case input continues that token
     */
    const bool resuming = !this->prefix_.empty();

    if (!resuming) {
        /* skip whitespace.  bounds check comes first: must not read *input_end */
        while ((ix != input_end) && is_whitespace(*ix))
            ++ix;

        if (ix == input_end) {
            /* no-op */
            return {
                token_type::invalid(),
                input.prefix(ix)
            };
        }

        /* here: *ix is not whitespace */

        log && log(xtag("whitespace.size", input.prefix(ix).size()));
    }

    /* tk_start points to the part of current token that belongs to input
     * (after any whitespace)
     */
    const CharT * tk_start = ix;

    /* true once end of token has been located within input */
    bool complete_flag = false;

    /* token is a string literal iff its first char is " */
    const bool string_flag = (resuming
                              ? (this->prefix_.front() == '"')
                              : (*ix == '"'));

    if (!resuming && is_punctuation(*ix)) {
        /* 1-character token */
        ++ix;
        complete_flag = true;
    } else if (string_flag) {
        /* 1. embedded space/tab allowed in string literal.
         * 2. embedded newline/cr not allowed.
         */

        /* true iff the next char is escaped by a preceding (unescaped) backslash.
         * A plain "previous char is backslash" test would get \\" wrong.
         */
        bool escaped = false;

        if (resuming) {
            /* replay stored text (after opening " char) to recover escape state */
            for (std::string::size_type i = 1; i < this->prefix_.size(); ++i)
                escaped = (!escaped && (this->prefix_[i] == '\\'));
        } else {
            ++ix; /* skip opening " char */
        }

        for (; ix != input_end; ++ix) {
            if ((*ix == '\n') || (*ix == '\r')) {
                throw std::runtime_error
                    (tostr("tokenizer::scan",
                           ": must use \\n or \\r to encode newline/cr in"
                           " string literal"));
            }

            if (escaped) {
                escaped = false;
            } else if (*ix == '\\') {
                escaped = true;
            } else if (*ix == '"') {
                /* unescaped " char ends literal */
                ++ix; /* include terminating " for assemble_token */
                complete_flag = true;
                break;
            }
        }
    } else {
        /* scan until:
         * - whitespace
         * - punctuation
         */
        for (; ix != input_end; ++ix) {
            if (is_whitespace(*ix) || is_punctuation(*ix)) {
                complete_flag = true;
                break;
            }
        }
    }

    if (!complete_flag) {
        /* need more input to know if/when token complete.
         * here: ix == input_end, so all of input is consumed
         */
        this->prefix_ += std::string(tk_start, input_end);

        log && log(xtag("captured-prefix", this->prefix_));

        return scan_result
            { token_type(tokentype::tk_invalid), input.prefix(ix) };
    }

    if (resuming) {
        /* token = stored text + beginning of input.
         * Take stored text out first, so nothing stays behind
         * if assemble_token throws.
         */
        std::string text;
        text.swap(this->prefix_);
        text += std::string(tk_start, ix);

        const CharT * text_lo = &text[0];

        token_type tk = assemble_token(span_type(text_lo, text_lo + text.size()));

        return scan_result { tk, input.prefix(ix) };
    }

    token_type tk = assemble_token(span_type(tk_start, ix));

    return scan_result { tk, input.prefix(ix) };
} /*scan*/
/** @brief tell the tokenizer that no further input will arrive
 *
 * Any text held in @c prefix_ is assembled into a token and the buffer is
 * emptied. With nothing buffered the result is an invalid token.
 *
 * @return token built from the buffered text, or a tk_invalid token
 **/
template <typename CharT>
auto
tokenizer<CharT>::notify_eof() -> token_type {
    constexpr bool c_debug_flag = true;

    scope log(XO_DEBUG(c_debug_flag));

    if (this->prefix_.empty()) {
        /* nothing pending, so nothing to flush */
        return token_type(tokentype::tk_invalid);
    }

    /* buffered text is now known to be a complete token */
    token_type pending
        = assemble_token(span_type(&prefix_[0],
                                   &prefix_[prefix_.size()]));

    this->prefix_.clear();

    return pending;
} /*notify_eof*/
} /*namespace tok*/
} /*namespace xo*/
/* end tokenizer.hpp */

View file

@ -0,0 +1,142 @@
/** @file tokentype.hpp
*
* author: Roland Conybeare, Jul 2024
**/
#pragma once
#include "xo/indentlog/print/tag.hpp" // for STRINGIFY
#include <ostream>
namespace xo {
namespace tok {
/** @enum tokentype
* @brief enum to identify different schematica input token types
*
* Enumerators after the @c tk_invalid sentinel take consecutive values
* starting at 0, so @c n_tokentype equals the number of valid token types.
*
* Schematica code examples:
*
* type point :: { xcoord : f64, ycoord: f64 };
* type matrix :: array<double, 2>; // 2-d array
*
* decl hypot(x : f64, y : f64) -> f64;
*
* def hypot(x : f64, y : f64) {
* let
* x2 = (x * x);
* y2 = (y * y);
* hypot2 = (x2 + y2);
* in
* sqrt(hypot2);
* };
*
* def someconst 4;
*
* def foo(v : vec<i32>) {
* def (pi : f64) = 3.1415926;
* def (h : (f64,f64) -> f64) = hypot;
*
* h = hypot3;
* };
*
* def matrixproduct(x : matrix, y : matrix) {
* [i,j : x.row(i) * y.col(j)];
* };
**/
enum class tokentype {
/** sentinel value; -1 so that the first valid token type is 0 **/
tk_invalid = -1,
/** an integer constant (signed 64-bit integer) **/
tk_i64,
/** a 64-bit floating-point constant **/
tk_f64,
/** a string literal **/
tk_string,
/** a symbol **/
tk_symbol,
/** left-hand parenthesis '(' **/
tk_leftparen,
/** right-hand parenthesis ')' **/
tk_rightparen,
/** left-hand bracket '[' **/
tk_leftbracket,
/** right-hand bracket ']' **/
tk_rightbracket,
/** left-hand brace '{' **/
tk_leftbrace,
/** right-hand brace '}' **/
tk_rightbrace,
/** left-hand angle bracket '<' **/
tk_leftangle,
/** right-hand angle bracket '>' **/
tk_rightangle,
/** dot '.' **/
tk_dot,
/** comma ',' **/
tk_comma,
/** colon ':' **/
tk_colon,
/** double-colon '::' **/
tk_doublecolon,
/** semi-colon ';' **/
tk_semicolon,
/** '=' **/
tk_singleassign,
/** ':=' **/
tk_assign,
/** '->' **/
tk_yields,
/** keyword 'type' **/
tk_type,
/** keyword 'def' **/
tk_def,
/** keyword 'lambda' **/
tk_lambda,
/** keyword 'if' **/
tk_if,
/** keyword 'let' **/
tk_let,
/** keyword 'in' **/
tk_in,
/** not a token type: must come last, counts the number of valid entries **/
n_tokentype /* comes last, counts #of entries */
}; /*tokentype*/
/** @brief printable name for a token type
*
* Declaration only; the definition lives outside this header.
*
* @param tk_type token type to describe
* @return C string naming @p tk_type (for example "tk_i64");
*         "?tokentype" for tk_invalid and n_tokentype
**/
extern char const *
tokentype_descr(tokentype tk_type);
/** @brief print the description of @p tk_type (see tokentype_descr()) on @p os
 *
 * @param os       destination stream
 * @param tk_type  token type to print
 * @return @p os, to allow chaining
 **/
inline std::ostream &
operator<< (std::ostream & os, tokentype tk_type) {
    return os << tokentype_descr(tk_type);
}
} /*namespace tok*/
} /*namespace xo*/
/* end tokentype.hpp */

View file

@ -0,0 +1,14 @@
# tokenizer/CMakeLists.txt
#
# Builds the tokenizer shared library from the non-template sources.

# name of the library target
set(SELF_LIB tokenizer)
# translation units compiled into the library
set(SELF_SRCS
tokentype.cpp
token.cpp)
# NOTE(review): xo_add_shared_library4() is defined by the xo cmake macros, not here.
# Arguments look like: target, export-set name, version, so-version, sources -- confirm.
xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS})
# indentlog is the only dependency currently enabled; the others are kept commented out
#xo_dependency(${SELF_LIB} refcnt)
xo_dependency(${SELF_LIB} indentlog)
#xo_dependency(${SELF_LIB} subsys)
#xo_boost_dependency(${SELF_LIB})
# end CMakeLists.txt

9
src/tokenizer/token.cpp Normal file
View file

@ -0,0 +1,9 @@
/** @file token.cpp
*
* author: Roland Conybeare
**/
#include "token.hpp"
#include "xo/indentlog/print/tag.hpp"
/** end token.cpp **/

View file

@ -0,0 +1,56 @@
/* file tokentype.cpp
*
* author: Roland Conybeare
*/
#include "tokentype.hpp"
namespace xo {
namespace tok {
/* Map a tokentype to a printable name.
 *
 * Each valid enumerator maps to its own spelling (via STRINGIFY);
 * the tk_invalid sentinel and the n_tokentype counter map to "?tokentype".
 * A value matching no enumerator falls out of the switch and yields "???".
 */
char const *
tokentype_descr(tokentype tk_type)
{
    /* TOKENTYPE_NAME(tk_foo) expands to: case tokentype::tk_foo: return "tk_foo" */
#define TOKENTYPE_NAME(name) case tokentype::name: return STRINGIFY(name)

    switch (tk_type) {
    /* not real token types */
    case tokentype::tk_invalid:
    case tokentype::n_tokentype:
        return "?tokentype";

    /* literals + symbols */
    TOKENTYPE_NAME(tk_i64);
    TOKENTYPE_NAME(tk_f64);
    TOKENTYPE_NAME(tk_string);
    TOKENTYPE_NAME(tk_symbol);

    /* paired delimiters */
    TOKENTYPE_NAME(tk_leftparen);
    TOKENTYPE_NAME(tk_rightparen);
    TOKENTYPE_NAME(tk_leftbracket);
    TOKENTYPE_NAME(tk_rightbracket);
    TOKENTYPE_NAME(tk_leftbrace);
    TOKENTYPE_NAME(tk_rightbrace);
    TOKENTYPE_NAME(tk_leftangle);
    TOKENTYPE_NAME(tk_rightangle);

    /* other punctuation / operators */
    TOKENTYPE_NAME(tk_dot);
    TOKENTYPE_NAME(tk_comma);
    TOKENTYPE_NAME(tk_colon);
    TOKENTYPE_NAME(tk_doublecolon);
    TOKENTYPE_NAME(tk_semicolon);
    TOKENTYPE_NAME(tk_singleassign);
    TOKENTYPE_NAME(tk_assign);
    TOKENTYPE_NAME(tk_yields);

    /* keywords */
    TOKENTYPE_NAME(tk_type);
    TOKENTYPE_NAME(tk_def);
    TOKENTYPE_NAME(tk_lambda);
    TOKENTYPE_NAME(tk_if);
    TOKENTYPE_NAME(tk_let);
    TOKENTYPE_NAME(tk_in);
    }

#undef TOKENTYPE_NAME

    /* reached only for a value that matches no enumerator */
    return "???";
} /*tokentype_descr*/
} /*namespace tok*/
} /*namespace xo*/
/* end tokentype.cpp */

13
utest/CMakeLists.txt Normal file
View file

@ -0,0 +1,13 @@
# build unittest tokenizer/utest
#
# One executable holding all tokenizer unit tests.

# name of the unit-test executable
set(SELF_EXECUTABLE_NAME utest.tokenizer)
# tokenizer_utest_main.cpp defines CATCH_CONFIG_MAIN, i.e. supplies main();
# the *.test.cpp files contribute the TEST_CASEs
set(SELF_SOURCE_FILES
tokenizer_utest_main.cpp
tokenizer.test.cpp
token.test.cpp)
# NOTE(review): the xo_* helpers below come from the xo cmake macros; semantics assumed from their names
xo_add_utest_executable(${SELF_EXECUTABLE_NAME} ${SELF_SOURCE_FILES})
# link against the tokenizer library built by this project
xo_self_dependency(${SELF_EXECUTABLE_NAME} tokenizer)
# unit-test framework
xo_external_target_dependency(${SELF_EXECUTABLE_NAME} Catch2 Catch2::Catch2)
# end CMakeLists.txt

260
utest/token.test.cpp Normal file
View file

@ -0,0 +1,260 @@
/* file token.test.cpp
*
* author: Roland Conybeare
*/
#include "xo/tokenizer/token.hpp"
#include <catch2/catch.hpp>
#include <memory>
namespace xo {
using token = xo::tok::token<char>;
using xo::tok::tokentype;
namespace ut {
/** one row of the parse-i64 table: token text plus the outcome expected from token::i64_value() **/
struct testcase_i64 {
/** text used to construct a tk_i64 token **/
std::string text_;
/** true iff i64_value() is expected to throw a std::exception **/
bool expect_throw_;
/** value i64_value() must return; checked whenever i64_value() returns normally **/
std::int64_t expected_;
};
/** table driving the parse-i64 test. columns: text_, expect_throw_, expected_ **/
std::vector<testcase_i64> s_testcase_v = {
/* empty text: expected to throw */
{"", true, 0},
{"0", false, 0},
/* a bare sign with no digits: expected to throw */
{"-", true, 0},
{"+", true, 0},
/* optional leading sign followed by digits */
{"-0", false, 0},
{"+0", false, 0},
{"1", false, 1},
{"-1", false, -1},
{"9", false, 9},
{"-9", false, -9},
{"12", false, 12},
{"+12", false, 12},
{"-12", false, -12},
{"99", false, 99},
{"-99", false, -99},
/* trailing non-digit: expected to throw */
{"123x", true, 0},
};
/* For every row of s_testcase_v: build a tk_i64 token from the row's text,
 * then verify that i64_value() either returns the expected value or throws,
 * as the row dictates.
 */
TEST_CASE("parse-i64", "[token]") {
    std::size_t const n_tc = s_testcase_v.size();

    for (std::size_t i_tc = 0; i_tc != n_tc; ++i_tc) {
        INFO(xtag("i_tc", i_tc));

        testcase_i64 const & tc = s_testcase_v[i_tc];

        token tk(tokentype::tk_i64, tc.text_);

        REQUIRE(tk.tk_type() == tokentype::tk_i64);

        /* records whether the conversion raised a std::exception */
        bool did_throw = false;

        try {
            std::int64_t parsed = tk.i64_value();

            REQUIRE(parsed == tc.expected_);
        } catch (std::exception const &) {
            did_throw = true;
        }

        REQUIRE(did_throw == tc.expect_throw_);
    }
}
/* A lone "+" is not an integer: i64_value() must throw a std::exception. */
TEST_CASE("error-i64", "[token]") {
    token tk(tokentype::tk_i64, "+");

    /* true iff the conversion raised a std::exception */
    bool const did_throw = [&tk]() -> bool {
        try {
            tk.i64_value();
        } catch (std::exception const &) {
            return true;
        }
        return false;
    }();

    REQUIRE(did_throw);
}
namespace {
/** one row of the parse-f64 table: token text plus the outcome expected from token::f64_value() **/
struct testcase_f64 {
/** text used to construct a tk_f64 token **/
std::string text_;
/** true iff f64_value() is expected to throw a std::exception **/
bool expect_throw_;
/** value f64_value() must return (compared approximately); checked whenever f64_value() returns normally **/
double expected_;
};
/** table driving the parse-f64 test. columns: text_, expect_throw_, expected_
*
* Expected values written like 01.23 are ordinary decimal floating-point
* literals (the leading zero does not make them octal).
* A few rows repeat earlier ones; the repeats are harmless.
**/
std::vector<testcase_f64> s_testcase_v = {
/* integer-looking text, same rows as the i64 table */
{"", true, 0},
{"0", false, 0},
{"-", true, 0},
{"+", true, 0},
{"-0", false, 0},
{"+0", false, 0},
{"1", false, 1},
{"-1", false, -1},
{"9", false, 9},
{"-9", false, -9},
{"12", false, 12},
{"+12", false, 12},
{"-12", false, -12},
{"99", false, 99},
{"-99", false, -99},
{"123x", true, 0},
/* unsigned, 1..9 fraction digits */
{"0.0", false, 0.0},
{"0.1", false, 0.1},
{"0.12", false, 0.12},
{"0.123", false, 0.123},
{"0.1234", false, 0.1234},
{"0.12345", false, 0.12345},
{"0.123456", false, 0.123456},
{"0.1234567", false, 0.1234567},
{"0.12345678", false, 0.12345678},
{"0.123456789", false, 0.123456789},
/* explicit '+' sign */
{"+0.0", false, 0.0},
{"+0.1", false, 0.1},
{"+0.12", false, 0.12},
{"+0.123", false, 0.123},
{"+0.1234", false, 0.1234},
{"+0.12345", false, 0.12345},
{"+0.123456", false, 0.123456},
{"+0.1234567", false, 0.1234567},
{"+0.12345678", false, 0.12345678},
{"+0.123456789", false, 0.123456789},
/* exponent e0: value unchanged */
{"+0.0e0", false, 0.0},
{"+0.1e0", false, 0.1},
{"+0.12e0", false, 0.12},
{"+0.123e0", false, 0.123},
{"+0.1234e0", false, 0.1234},
{"+0.12345e0", false, 0.12345},
{"+0.123456e0", false, 0.123456},
{"+0.1234567e0", false, 0.1234567},
{"+0.12345678e0", false, 0.12345678},
{"+0.123456789e0", false, 0.123456789},
/* exponent e1: decimal point shifts one place */
{"+0.0e1", false, 00.},
{"+0.1e1", false, 01.},
{"+0.12e1", false, 01.2},
{"+0.123e1", false, 01.23},
{"+0.1234e1", false, 01.234},
{"+0.12345e1", false, 01.2345},
{"+0.123456e1", false, 01.23456},
{"+0.1234567e1", false, 01.234567},
{"+0.12345678e1", false, 01.2345678},
{"+0.123456789e1", false, 01.23456789},
/* upper-case exponent marker 'E' */
{"+0.0E1", false, 00.},
{"+0.1E1", false, 01.},
{"+0.12E1", false, 01.2},
{"+0.123E1", false, 01.23},
{"+0.1234E1", false, 01.234},
{"+0.12345E1", false, 01.2345},
{"+0.123456E1", false, 01.23456},
{"+0.1234567E1", false, 01.234567},
{"+0.12345678E1", false, 01.2345678},
{"+0.123456789E1", false, 01.23456789},
/* larger exponent e9 */
{"+0.0e9", false, 0.0},
{"+0.1e9", false, 0.1e9},
{"+0.12e9", false, 0.12e9},
{"+0.123e9", false, 0.123e9},
{"+0.1234e9", false, 0.1234e9},
{"+0.12345e9", false, 0.12345e9},
{"+0.123456e9", false, 0.123456e9},
{"+0.1234567e9", false, 0.1234567e9},
{"+0.12345678e9", false, 0.12345678e9},
{"+0.123456789e9", false, 0.123456789e9},
/* explicit '-' sign */
{"-0.0", false, -0.0},
{"-0.1", false, -0.1},
{"-0.12", false, -0.12},
{"-0.123", false, -0.123},
{"-0.1234", false, -0.1234},
{"-0.12345", false, -0.12345},
{"-0.123456", false, -0.123456},
{"-0.1234567", false, -0.1234567},
{"-0.12345678", false, -0.12345678},
{"-0.123456789", false, -0.123456789},
/* leading zero in the integer part; trailing '.' with no fraction digits */
{"00.", false, 0.0},
{"01.", false, 1.0},
{"01.2", false, 1.2},
{"01.23", false, 1.23},
{"01.234", false, 1.234},
{"01.2345", false, 1.2345},
{"01.23456", false, 1.23456},
{"01.234567", false, 1.234567},
{"01.2345678", false, 1.2345678},
{"01.23456789", false, 1.23456789},
/* two-digit integer part */
{"0.0", false, 0.0},
{"1.2", false, 1.2},
{"12.", false, 12.0},
{"12.3", false, 12.3},
{"12.34", false, 12.34},
{"12.345", false, 12.345},
{"12.3456", false, 12.3456},
{"12.34567", false, 12.34567},
{"12.345678", false, 12.345678},
{"12.3456789", false, 12.3456789},
/* three-digit integer part */
{"01.23", false, 1.23},
{"12.3", false, 12.3},
{"123.", false, 123.0},
{"123.4", false, 123.4},
{"123.45", false, 123.45},
{"123.456", false, 123.456},
{"123.4567", false, 123.4567},
{"123.45678", false, 123.45678},
{"123.456789", false, 123.456789},
};
/* For every row of s_testcase_v: build a tk_f64 token from the row's text,
 * then verify that f64_value() either returns (approximately) the expected
 * value or throws, as the row dictates.
 */
TEST_CASE("parse-f64", "[token]") {
    std::size_t const n_tc = s_testcase_v.size();

    for (std::size_t i_tc = 0; i_tc != n_tc; ++i_tc) {
        testcase_f64 const & tc = s_testcase_v[i_tc];

        INFO(tostr(xtag("i_tc", i_tc),
                   xtag("text", tc.text_)
                 ));

        token tk(tokentype::tk_f64, tc.text_);

        REQUIRE(tk.tk_type() == tokentype::tk_f64);

        /* outcome of the conversion attempt; ex_msg stays empty unless it throws */
        std::string ex_msg;
        bool did_throw = false;

        try {
            double parsed = tk.f64_value();

            REQUIRE(parsed == Approx(tc.expected_).epsilon(1.0e-15));
        } catch (std::exception const & ex) {
            ex_msg = ex.what();
            did_throw = true;
        }

        /* include the exception text in the report if the next check fails */
        INFO(xtag("ex_msg", ex_msg));
        REQUIRE(did_throw == tc.expect_throw_);
    }
}
} /*namespace*/
} /*namespace ut*/
} /*namespace xo*/
/* end token.test.cpp */

160
utest/tokenizer.test.cpp Normal file
View file

@ -0,0 +1,160 @@
/* file tokenizer.test.cpp
*
* author: Roland Conybeare
*/
#include "tokenizer.hpp"
#include <catch2/catch.hpp>
namespace xo {
using xo::tok::tokentype;
using token = xo::tok::token<char>;
using xo::tok::span;
namespace ut {
namespace {
/** one row of the tokenizer test table: input text plus the token expected from it **/
struct testcase_tkz {
/** text handed to tokenizer::scan() **/
std::string input_;
/** NOTE(review): not consulted by the "tokenizer" TEST_CASE; every row sets it to false **/
bool expect_throw_;
/** token the tokenizer is expected to produce (type, and text/value where applicable) **/
token expected_tk_;
/** true iff the span returned by scan() is expected to equal the whole input **/
bool consume_all_;
};
/** table driving the "tokenizer" test.
* columns: input_, expect_throw_, expected_tk_, consume_all_
**/
std::vector<testcase_tkz>
s_testcase_v = {
/* 1-character punctuation tokens, some preceded by whitespace */
{"<", false, token::leftangle(), true},
{">", false, token::rightangle(), true},
{"(", false, token::leftparen(), true},
{")", false, token::rightparen(), true},
{"[", false, token::leftbracket(), true},
{"]", false, token::rightbracket(), true},
{"{", false, token::leftbrace(), true},
{" {", false, token::leftbrace(), true},
{"\t{", false, token::leftbrace(), true},
{"\n{", false, token::leftbrace(), true},
{"}", false, token::rightbrace(), true},
/* integers running to end of input */
{"0", false, token::i64_token("0"), true},
{"1", false, token::i64_token("1"), true},
{"12", false, token::i64_token("12"), true},
{"123", false, token::i64_token("123"), true},
{"1234", false, token::i64_token("1234"), true},
/* integers ended by whitespace: the trailing space is not consumed */
{"0 ", false, token::i64_token("0"), false},
{"1 ", false, token::i64_token("1"), false},
{"12 ", false, token::i64_token("12"), false},
{"123 ", false, token::i64_token("123"), false},
{"1234 ", false, token::i64_token("1234"), false},
/* integers ended by punctuation: the punctuation is not consumed */
{"1<", false, token::i64_token("1"), false},
{"1>", false, token::i64_token("1"), false},
{"1(", false, token::i64_token("1"), false},
{"1)", false, token::i64_token("1"), false},
{"1[", false, token::i64_token("1"), false},
{"1]", false, token::i64_token("1"), false},
{"1{", false, token::i64_token("1"), false},
{"1}", false, token::i64_token("1"), false},
{"1;", false, token::i64_token("1"), false},
{"1:", false, token::i64_token("1"), false},
{"1,", false, token::i64_token("1"), false},
/* floating-point: leading '.', optional sign, trailing '.', exponents */
{".1", false, token::f64_token(".1"), true},
{".12", false, token::f64_token(".12"), true},
{".123", false, token::f64_token(".123"), true},
{"+.1", false, token::f64_token("+.1"), true},
{"+.12", false, token::f64_token("+.12"), true},
{"+.123", false, token::f64_token("+.123"), true},
{"-.1", false, token::f64_token("-.1"), true},
{"-.12", false, token::f64_token("-.12"), true},
{"-.123", false, token::f64_token("-.123"), true},
{"1.", false, token::f64_token("1."), true},
{"1.2", false, token::f64_token("1.2"), true},
{"1.23", false, token::f64_token("1.23"), true},
{"1e0", false, token::f64_token("1e0"), true},
{"1e-1", false, token::f64_token("1e-1"), true},
{"1e1", false, token::f64_token("1e1"), true},
{"1e+1", false, token::f64_token("1e+1"), true},
/* string literals: expected token text excludes the surrounding quotes */
{"\"hello\"", false, token::string_token("hello"), true},
/* tokenizer sees this input:
* "\"hi\", she said"
*/
{"\"\\\"hi\\\", she said\"", false, token::string_token("\"hi\", she said"), true},
/* tokenizer sees this input:
* "look ma, newline ->\n<- "
*/
{"\"look ma, newline ->\\n<- \"", false,
token::string_token("look ma, newline ->\n<- "), true},
/* tokenizer sees this input:
* "tab to the right [\t], to the right [\t]"
*/
{"\"tab to the right [\\t], to the right [\\t]\"", false,
token::string_token("tab to the right [\t], to the right [\t]"), true},
/* symbols */
{"symbol", false, token::symbol_token("symbol"), true},
};
}
/* For every row of s_testcase_v: scan the row's input with a fresh tokenizer
 * and compare the resulting token, and the amount of input reported as
 * consumed, against the row's expectations.
 */
TEST_CASE("tokenizer", "[tokenizer]") {
    using tokenizer = xo::tok::tokenizer<char>;

    std::size_t const n_tc = s_testcase_v.size();

    for (std::size_t i_tc = 0; i_tc != n_tc; ++i_tc) {
        testcase_tkz const & tc = s_testcase_v[i_tc];

        INFO(xtag("input", tc.input_));
        INFO(xtag("i_tc", i_tc));

        tokenizer tkz;

        tokenizer::span_type
            in_span(tc.input_.c_str(),
                    tc.input_.c_str() + tc.input_.size());

        auto out = tkz.scan(in_span);
        auto tk = out.first;

        /* scan() reports an invalid token when the token ran into the end of
         * input; signalling eof flushes whatever the tokenizer buffered
         */
        if (tk.is_invalid())
            tk = tkz.notify_eof();

        REQUIRE(tk.tk_type() == tc.expected_tk_.tk_type());

        switch (tk.tk_type()) {
        case tokentype::tk_i64:
            REQUIRE(!tk.text().empty());
            REQUIRE(tk.i64_value() == tc.expected_tk_.i64_value());
            break;
        case tokentype::tk_f64:
            REQUIRE(!tk.text().empty());
            REQUIRE(tk.f64_value() == tc.expected_tk_.f64_value());
            break;
        case tokentype::tk_string:
            /* tk.text() can be empty, consider input "" */
            REQUIRE(tk.text() == tc.expected_tk_.text());
            break;
        case tokentype::tk_symbol:
            REQUIRE(!tk.text().empty());
            REQUIRE(tk.text() == tc.expected_tk_.text());
            break;
        default:
            /* remaining token types carry no text */
            REQUIRE(tk.text().empty());
            break;
        }

        /* span reported by scan() covers the whole input only when the row says so */
        if (tc.consume_all_) {
            REQUIRE(out.second == in_span);
        } else {
            REQUIRE(out.second != in_span);
        }
    }
}
} /*namespace ut*/
} /*namespace xo*/
/* end tokenizer.test.cpp */

View file

@ -0,0 +1,6 @@
/* file tokenizer_utest_main.cpp */
#define CATCH_CONFIG_MAIN
#include "catch2/catch.hpp"
/* end tokenizer_utest_main.cpp */