xo-tokenizer: build + utest + reasonable implementation
This commit is contained in:
commit
9dc37e84e6
15 changed files with 2154 additions and 0 deletions
27
CMakeLists.txt
Normal file
27
CMakeLists.txt
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# xo-tokenizer/CMakeLists.txt
#
# Top-level build description for the tokenizer library and its unit tests.

cmake_minimum_required(VERSION 3.10)

project(tokenizer VERSION 0.1)

# standard install-directory variables, then the xo bootstrap macros
include(GNUInstallDirs)
include(cmake/xo-bootstrap-macros.cmake)

xo_cxx_toplevel_options3()

# ----------------------------------------------------------------
# c++ settings
#
# PROJECT_CXX_FLAGS is empty by default; swap in the commented-out line
# to pass extra compiler flags to every target in this project.

set(PROJECT_CXX_FLAGS "")
#set(PROJECT_CXX_FLAGS "-fconcepts-diagnostics-depth=2")
add_definitions(${PROJECT_CXX_FLAGS})

# ----------------------------------------------------------------
# sub-projects

add_subdirectory(src/tokenizer)
add_subdirectory(utest)

# ----------------------------------------------------------------
# provide find_package() support

xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
|
||||
8
cmake/tokenizerConfig.cmake.in
Normal file
8
cmake/tokenizerConfig.cmake.in
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
@PACKAGE_INIT@

# Package-config template: resolve dependencies first, then load the
# exported targets for this project.
include(CMakeFindDependencyMacro)

#find_dependency(refcnt)
find_dependency(indentlog)
#find_dependency(subsys)

include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")

check_required_components("@PROJECT_NAME@")
|
||||
35
cmake/xo-bootstrap-macros.cmake
Normal file
35
cmake/xo-bootstrap-macros.cmake
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# ----------------------------------------------------------------
|
||||
# for example:
|
||||
# $ PREFIX=/usr/local # for example
|
||||
# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build
|
||||
#
|
||||
# will get
|
||||
# CMAKE_MODULE_PATH
|
||||
# from xo-cmake-config --cmake-module-path
|
||||
#
|
||||
# and expect .cmake macros in
|
||||
# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake
|
||||
# ----------------------------------------------------------------
|
||||
|
||||
# Locate the xo-cmake-config helper; it is asked below for the cmake module path.
# REQUIRED is only understood by cmake >= 3.18, so the result is also checked explicitly.
find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED)

# on failure find_program() stores "<VAR>-NOTFOUND", which evaluates to false here
if (NOT XO_CMAKE_CONFIG_EXECUTABLE)
    message(FATAL_ERROR "could not find xo-cmake-config executable")
endif()

message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}")

if (NOT XO_SUBMODULE_BUILD)
    if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix))
        # default to typical install location for xo-project-macros.
        # strip the trailing newline so the path can be used verbatim by include()
        execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path
                        OUTPUT_VARIABLE CMAKE_MODULE_PATH
                        OUTPUT_STRIP_TRAILING_WHITESPACE)
        message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}")
    endif()
endif()

# needs to have been installed somewhere on CMAKE_MODULE_PATH,
# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX)
#
include(xo_macros/xo_cxx)

xo_cxx_bootstrap_message()
|
||||
324
include/xo/tokenizer/buffer.hpp
Normal file
324
include/xo/tokenizer/buffer.hpp
Normal file
|
|
@ -0,0 +1,324 @@
|
|||
/** @file buffer.hpp **/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "span.hpp"
|
||||
#include <utility>
|
||||
#include <cstdint>
|
||||
#include <cassert>
|
||||
#include <new>
|
||||
|
||||
namespace xo {
|
||||
namespace tok {
|
||||
/**
|
||||
* @class buffer buffer.hpp
|
||||
*
|
||||
* @brief Container for a (possibly owned) FIFO queue of chars
|
||||
*
|
||||
* @tparam CharT. buffer element type.
|
||||
*
|
||||
* @code
|
||||
* .buf
|
||||
*
|
||||
* +------------------------------------------+
|
||||
* | | ... | | X| ... | X| | ... | |
|
||||
* +------------------------------------------+
|
||||
* ^ ^ ^ ^
|
||||
* 0 .lo .hi .buf_z
|
||||
*
|
||||
* <-contents-><----avail----->
|
||||
* @endcode
|
||||
*
|
||||
* Buffer does not support wrapped content:
|
||||
* content that has not been consumed always occupies contiguous memory.
|
||||
*
|
||||
* Example:
|
||||
* @code
|
||||
* // 1.
|
||||
* buffer<char> buf(64*1024);
|
||||
* buf.empty() -> true
|
||||
* buf.buf_z() -> 65536
|
||||
* buf.lo_pos() -> 0
|
||||
 *   buf.hi_pos()   -> 0
|
||||
* buf.contents() -> empty span
|
||||
* buf.avail() -> span entire buffer memory
|
||||
*
|
||||
* // write to (a prefix of) buf.avail()
|
||||
* ::strncpy(buf.buf(), "hello, world\n", 13);
|
||||
* buf.produce(span_type(buf.buf(), buf.buf() + 13));
|
||||
*
|
||||
* buf.lo_pos() -> 0
|
||||
* buf.hi_pos() -> 13
|
||||
* buf.contents() -> "hello, world\n";
|
||||
*
|
||||
*
|
||||
* // examine stored content (does not change buffer state)
|
||||
* auto span = buf.contents();
|
||||
* cerr << string_view(span.lo(), span.hi()); // "hello, world\n"
|
||||
*
|
||||
* // consume (a prefix of) stored content
|
||||
 *   buf.consume(span.prefix(7));
|
||||
*
|
||||
* buf.lo_pos() -> 7
|
||||
* buf.hi_pos() -> 13
|
||||
* buf.contents() -> "world\n"
|
||||
*
|
||||
 *   // consuming all remaining content resets to original state
|
||||
* buf.consume(buf.contents());
|
||||
*
|
||||
* buf.empty() -> true
|
||||
* buf.hi_pos() -> 0 // not 13!
|
||||
*
|
||||
* // 2.
|
||||
* buffer<char> buf;
|
||||
* buf.empty() -> true
|
||||
* buf.buf_z() -> 0
|
||||
* buf.lo_pos() -> 0
|
||||
* buf.hi_pos() -> 0
|
||||
* buf.contents() -> empty span
|
||||
* buf.avail() -> empty span
|
||||
*
|
||||
* // allocate memory separately from ctor
|
||||
* buf.alloc(64*1024);
|
||||
* @endcode
|
||||
**/
|
||||
template <typename CharT>
|
||||
class buffer {
|
||||
public:
|
||||
/** @brief typealias for span of CharT **/
|
||||
using span_type = span<CharT>;
|
||||
/** @brief typealias for buffer size (counts CharT's, not bytes) **/
|
||||
using size_type = std::uint64_t;
|
||||
|
||||
public:
|
||||
/** @brief create empty buffer.
|
||||
|
||||
Does not allocate any storage; @see alloc
|
||||
**/
|
||||
buffer() = default;
|
||||
/** @brief create empty buffer, and possibly allocate storage.
|
||||
|
||||
@param buf_z Buffer size. allocate storage (owned by this buffer) if >0.
|
||||
@param align_z Align to this value, e.g. 8 to align storage on an 8-byte boundary
|
||||
**/
|
||||
buffer(size_type buf_z, size_type align_z = sizeof(char))
|
||||
: is_owner_{true},
|
||||
buf_{buf_z ? (new (std::align_val_t(align_z)) CharT [buf_z]) : nullptr},
|
||||
buf_z_{buf_z},
|
||||
lo_pos_{0},
|
||||
hi_pos_{0}
|
||||
{}
|
||||
/** @brief buffer is not copyable **/
|
||||
buffer(buffer const & x) = delete;
|
||||
/** @brief destructor. Release storage if owned **/
|
||||
~buffer() { this->reset(); }
|
||||
|
||||
/** @name Access methods **/
|
||||
///@{
|
||||
|
||||
/** @brief start of buffer memory **/
|
||||
CharT * buf() const { return buf_; }
|
||||
/** @brief buffer size (number of characters) **/
|
||||
size_type buf_z() const { return buf_z_; }
|
||||
/** @brief current start position within buffer **/
|
||||
size_type lo_pos() const { return lo_pos_; }
|
||||
/** @brief current end position within buffer **/
|
||||
size_type hi_pos() const { return hi_pos_; }
|
||||
|
||||
///@}
|
||||
|
||||
/** @brief readonly access to a single buffer element.
|
||||
|
||||
Relative to start of buffer (ignores current consume position)
|
||||
**/
|
||||
CharT const & operator[](size_type i) const { return buf_[i]; }
|
||||
|
||||
/** @brief return span for current buffer contents **/
|
||||
span_type contents() const { return span_type(buf_ + lo_pos_, buf_ + hi_pos_); }
|
||||
/** @brief returns span for writable buffer contents (unused prefix following produce position **/
|
||||
span_type avail() const { return span_type(buf_ + hi_pos_, buf_ + buf_z_); }
|
||||
|
||||
/** @brief @c true iff buffer is empty **/
|
||||
bool empty() const { return lo_pos_ == hi_pos_; }
|
||||
|
||||
|
||||
/**
|
||||
@brief update buffer produce position, after (independently) writing contents of span to it
|
||||
|
||||
@pre left endpoint of @p span equals buffer produce position (@c .hi_pos)
|
||||
@pre right endpoint of @p span within bounds of buffer memory range
|
||||
@post right endpoint of @p span equals buffer produce position.
|
||||
**/
|
||||
void produce(span_type const & span) {
|
||||
assert(span.lo() == buf_ + hi_pos_);
|
||||
|
||||
hi_pos_ += span.size();
|
||||
}
|
||||
|
||||
/**
|
||||
@brief update buffer consume position, when done with contents of span
|
||||
|
||||
@pre left endpoint of @p span equals buffer consume position (@c .lo_pos)
|
||||
@pre right endpoint of @p span within bounds of buffer memory range
|
||||
@post Either
|
||||
buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0.
|
||||
buffer is non-empty, right endpoint of @p span equals new buffer consume position.
|
||||
**/
|
||||
void consume(span_type const & span) {
|
||||
if (span.size()) {
|
||||
assert(span.lo() == buf_ + lo_pos_);
|
||||
|
||||
lo_pos_ += span.size();
|
||||
} else {
|
||||
/* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos,
|
||||
* we don't want to blow up when called with an empty span -- argument
|
||||
* may represent some pre-reset location in buffer
|
||||
*/
|
||||
}
|
||||
|
||||
if (lo_pos_ == hi_pos_) {
|
||||
lo_pos_ = 0;
|
||||
hi_pos_ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@brief allocate buffer with desired amount of memory
|
||||
|
||||
@param buf_z desired buffer size
|
||||
@param align_z alignment; buffer memory will be aligned on this byte-boundary.
|
||||
**/
|
||||
void alloc(size_type buf_z, size_type align_z = sizeof(char)) {
|
||||
/* properly reset (+ discard) any existing state */
|
||||
this->reset();
|
||||
|
||||
is_owner_ = true;
|
||||
if (buf_z)
|
||||
buf_ = new (std::align_val_t(align_z)) CharT [buf_z];
|
||||
buf_z_ = buf_z;
|
||||
lo_pos_ = 0;
|
||||
hi_pos_ = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@brief attach buffer to (unowned) range of @p buf_z bytes starting at @p buf[0]
|
||||
|
||||
Buffer is not responsible for managing storage.
|
||||
|
||||
@post
|
||||
1. buffer is empty
|
||||
@post
|
||||
2. buffer read position = buffer write position = 0
|
||||
**/
|
||||
void setbuf(CharT * buf, size_type buf_z) {
|
||||
/* properly reset (+ discard) any existing state */
|
||||
this->reset();
|
||||
|
||||
is_owner_ = false;
|
||||
lo_pos_ = 0;
|
||||
hi_pos_ = 0;
|
||||
buf_ = buf;
|
||||
buf_z_ = buf_z;
|
||||
}
|
||||
|
||||
/**
|
||||
@brief revert buffer to empty state and possibly zero it
|
||||
|
||||
@param zero_buffer_flag Zero buffer contents iff this is true
|
||||
|
||||
@post
|
||||
1. buffer is empty
|
||||
@post
|
||||
2. buffer read position = buffer write position = 0
|
||||
**/
|
||||
void clear2empty(bool zero_buffer_flag) {
|
||||
if (buf_ && zero_buffer_flag)
|
||||
explicit_bzero(buf_, buf_z_ * sizeof(CharT));
|
||||
|
||||
lo_pos_ = 0;
|
||||
hi_pos_ = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@brief swap representation with another buffer instance.
|
||||
**/
|
||||
void swap (buffer & x) {
|
||||
std::swap(is_owner_, x.is_owner_);
|
||||
std::swap(buf_, x.buf_);
|
||||
std::swap(buf_z_, x.buf_z_);
|
||||
std::swap(lo_pos_, x.lo_pos_);
|
||||
std::swap(hi_pos_, x.hi_pos_);
|
||||
}
|
||||
|
||||
/**
|
||||
@brief reset buffer to an empty state and recover owned storage
|
||||
**/
|
||||
void reset() {
|
||||
if (is_owner_ && buf_)
|
||||
delete [] buf_;
|
||||
|
||||
is_owner_ = false;
|
||||
buf_ = nullptr;
|
||||
buf_z_ = 0;
|
||||
lo_pos_ = 0;
|
||||
hi_pos_ = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@brief move-assignment operator.
|
||||
@param x right-hand-side to move from.
|
||||
|
||||
@post
|
||||
@p x is in a valid, empty,
|
||||
**/
|
||||
buffer & operator= (buffer && x) {
|
||||
is_owner_ = x.is_owner_;
|
||||
buf_ = x.buf_;
|
||||
buf_z_ = x.buf_z_;
|
||||
lo_pos_ = x.lo_pos_;
|
||||
hi_pos_ = x.hi_pos_;
|
||||
|
||||
x.is_owner_ = false;
|
||||
x.lo_pos_ = 0;
|
||||
x.hi_pos_ = 0;
|
||||
x.buf_ = nullptr;
|
||||
x.buf_z_ = 0;
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
/** @brief buffer is not assignable */
|
||||
buffer & operator= (buffer & x) = delete;
|
||||
|
||||
private:
|
||||
/** @brief true iff buffer is responsible for freeing storage at @c buf_ **/
|
||||
bool is_owner_ = false;
|
||||
/** @brief buffer contents. buffer memory comprises @c buf_[0] to @c buf_[buf_z_] **/
|
||||
CharT * buf_ = nullptr;
|
||||
/** @brief buffer size (in units of CharT) **/
|
||||
size_type buf_z_ = 0;
|
||||
|
||||
/** @brief buffer read (consume) position
|
||||
|
||||
@invariant
|
||||
0 <= lo_pos_ <= hi_pos_ < buf_z_
|
||||
**/
|
||||
size_type lo_pos_ = 0;
|
||||
/** @brief buffer write (produce) position
|
||||
|
||||
@invariant
|
||||
0 <= hi_pos_ < hi_pos_ < buf_z_
|
||||
**/
|
||||
size_type hi_pos_ = 0;
|
||||
};
|
||||
|
||||
/** @brief Overload for @c swap, so that @c buffer<CharT> swappable **/
|
||||
template <typename CharT>
|
||||
inline void
|
||||
swap(buffer<CharT> & lhs, buffer<CharT> & rhs) {
|
||||
lhs.swap(rhs);
|
||||
}
|
||||
} /*namespace tok*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end buffer.hpp */
|
||||
141
include/xo/tokenizer/span.hpp
Normal file
141
include/xo/tokenizer/span.hpp
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
/** @file span.hpp **/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <ostream>
#include <stdexcept>
#include <string_view>
#include <cstdint>
#include <cassert>
|
||||
|
||||
namespace xo {
|
||||
namespace tok {
|
||||
/** @class span tokenizer/span.hpp
|
||||
*
|
||||
* @brief Represents a contiguous memory range, without ownership.
|
||||
*
|
||||
* @tparam CharT type for elements referred to by this span.
|
||||
**/
|
||||
template <typename CharT>
|
||||
class span {
|
||||
public:
|
||||
/** @brief typealias for span size (in units of CharT) **/
|
||||
using size_type = std::uint64_t;
|
||||
|
||||
public:
|
||||
/** @brief create span for the contiguous memory range [@p lo, @p hi) **/
|
||||
span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
|
||||
|
||||
///@{
|
||||
|
||||
/** @name getters **/
|
||||
|
||||
CharT * lo() const { return lo_; } /* get member span::lo_ */
|
||||
CharT * hi() const { return hi_; } /* get member span::hi_ */
|
||||
|
||||
///@}
|
||||
|
||||
/** @brief create new span over supplied type,
|
||||
* with identical (possibly misaligned) endpoints.
|
||||
*
|
||||
* @warning
|
||||
* 1. New span uses exactly the same memory addresses.
|
||||
* Endpoint pointers may not be aligned.
|
||||
* 2. Implementation assumes code compiled with
|
||||
* @code -fno-strict-aliasing @endcode enabled.
|
||||
*
|
||||
* @tparam OtherT element type for new span
|
||||
**/
|
||||
template <typename OtherT>
|
||||
span<OtherT>
|
||||
cast() const { return span<OtherT>(reinterpret_cast<OtherT *>(lo_),
|
||||
reinterpret_cast<OtherT *>(hi_)); }
|
||||
|
||||
/** @brief create span including the first @p z members of this span. **/
|
||||
span prefix(size_type z) const { return span(lo_, lo_ + z); }
|
||||
|
||||
/** @brief create span representing prefix up to (but not including) @p *p
|
||||
**/
|
||||
span prefix(CharT * p) const {
|
||||
if (p <= hi_)
|
||||
return span(lo_, p);
|
||||
else
|
||||
return span(lo_, hi_);
|
||||
}
|
||||
|
||||
/** @brief create span with first @p z members of this span removed **/
|
||||
span after_prefix(size_type z) const {
|
||||
if (z > hi_ - lo_)
|
||||
z = hi_ - lo_;
|
||||
|
||||
return span(lo_ + z, hi_);
|
||||
}
|
||||
|
||||
/** @brief create span with @p prefix of this span removed **/
|
||||
span after_prefix(const span & prefix) const {
|
||||
assert(prefix.lo() == lo_);
|
||||
if (prefix.lo() != lo_) {
|
||||
throw std::runtime_error
|
||||
("after_prefix: expected prefix of this span");
|
||||
}
|
||||
|
||||
return after_prefix(prefix.size());
|
||||
}
|
||||
|
||||
/** @brief create span starting with position p **/
|
||||
span suffix_from(CharT * p) const {
|
||||
if ((lo_ <= p) && (p <= hi_))
|
||||
return span(p, hi_);
|
||||
else
|
||||
return span(hi_, hi_);
|
||||
}
|
||||
|
||||
/** @brief true iff this span is empty (comprises 0 elements). **/
|
||||
bool empty() const { return lo_ == hi_; }
|
||||
/** @brief report the number of elements (of type CharT) in this span. **/
|
||||
size_type size() const { return hi_ - lo_; }
|
||||
|
||||
/** print representation for this span on stream @p os **/
|
||||
void print(std::ostream & os) const {
|
||||
os << "<span"
|
||||
<< xtag("size", size())
|
||||
<< " :text " << xo::print::quot(std::string_view(lo_, hi_))
|
||||
<< ">";
|
||||
}
|
||||
|
||||
private:
|
||||
///@{
|
||||
|
||||
/** @brief start of span
|
||||
Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
|
||||
**/
|
||||
CharT * lo_ = nullptr;
|
||||
/** @brief end of span
|
||||
Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
|
||||
**/
|
||||
CharT * hi_ = nullptr;
|
||||
|
||||
///@}
|
||||
}; /*span*/
|
||||
|
||||
template <typename CharT>
|
||||
inline bool
|
||||
operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
|
||||
return ((lhs.lo() == rhs.lo())
|
||||
&& (lhs.hi() == rhs.hi()));
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
inline bool
|
||||
operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
|
||||
return ((lhs.lo() != rhs.lo())
|
||||
|| (lhs.hi() != rhs.hi()));
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
inline std::ostream &
|
||||
operator<<(std::ostream & os,
|
||||
const span<CharT> & x) {
|
||||
x.print(os);
|
||||
return os;
|
||||
}
|
||||
} /*namespace tok*/
|
||||
} /*namespace xo*/
|
||||
334
include/xo/tokenizer/token.hpp
Normal file
334
include/xo/tokenizer/token.hpp
Normal file
|
|
@ -0,0 +1,334 @@
|
|||
/* file token.hpp
|
||||
*
|
||||
* author: Roland Conybeare, Jul 2024
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tokentype.hpp"
|
||||
#include "xo/indentlog/print/tag.hpp"
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
|
||||
namespace xo {
|
||||
namespace tok {
|
||||
namespace detail {
    /** compute a * b^p by repeated squaring.
     *
     *  @param a  running product (pass 1.0 to get plain b^p)
     *  @param b  base
     *  @param p  exponent; values <= 0 leave @p a unchanged
     *  @return a * b^p
     **/
    constexpr double
    pow_aux(double a, double b, int p) {
        for (int n = p; n > 0; ) {
            if (n & 1) {
                /* odd exponent: peel off one factor of b.
                 *   a * b^n = (a.b) * b^(n-1)
                 */
                a *= b;
            }

            n >>= 1;

            if (n > 0) {
                /* remaining exponent applies to the squared base.
                 *   a * b^(2q) = a * (b^2)^q
                 */
                b *= b;
            }
        }

        /* a * b^0 = a */
        return a;
    }

    /** compute 10^p for any (positive, zero or negative) integer @p p **/
    constexpr double
    pow10(int p) {
        return (p < 0
                ? 1.0 / pow_aux(1.0, 10.0, -p)
                : pow_aux(1.0, 10.0, p));
    }
}
|
||||
|
||||
template <typename CharT>
|
||||
class token {
|
||||
public:
|
||||
token() = default;
|
||||
token(tokentype tk_type, const std::string & text = "")
|
||||
: tk_type_{tk_type}, text_{text} {}
|
||||
|
||||
static token invalid() { return token(); }
|
||||
static token i64_token(const std::string & txt) {
|
||||
return token(tokentype::tk_i64, txt);
|
||||
}
|
||||
static token f64_token(const std::string & txt) {
|
||||
return token(tokentype::tk_f64, txt);
|
||||
}
|
||||
static token string_token(const std::string & txt) {
|
||||
return token(tokentype::tk_string, txt);
|
||||
}
|
||||
static token symbol_token(const std::string & txt) {
|
||||
return token(tokentype::tk_symbol, txt);
|
||||
}
|
||||
static token leftangle() { return token(tokentype::tk_leftangle); }
|
||||
static token rightangle() { return token(tokentype::tk_rightangle); }
|
||||
static token leftparen() { return token(tokentype::tk_leftparen); }
|
||||
static token rightparen() { return token(tokentype::tk_rightparen); }
|
||||
static token leftbracket() { return token(tokentype::tk_leftbracket); }
|
||||
static token rightbracket() { return token(tokentype::tk_rightbracket); }
|
||||
static token leftbrace() { return token(tokentype::tk_leftbrace); }
|
||||
static token rightbrace() { return token(tokentype::tk_rightbrace); }
|
||||
static token dot() { return token(tokentype::tk_dot); }
|
||||
static token comma() { return token(tokentype::tk_comma); }
|
||||
static token colon() { return token(tokentype::tk_colon); }
|
||||
static token doublecolon() { return token(tokentype::tk_doublecolon); }
|
||||
static token semicolon() { return token(tokentype::tk_semicolon); }
|
||||
static token singleassign() { return token(tokentype::tk_singleassign); }
|
||||
static token assign() { return token(tokentype::tk_assign); }
|
||||
static token yields() { return token(tokentype::tk_yields); }
|
||||
|
||||
static token type() { return token(tokentype::tk_type); }
|
||||
static token def() { return token(tokentype::tk_def); }
|
||||
static token lambda() { return token(tokentype::tk_lambda); }
|
||||
static token if_token() { return token(tokentype::tk_if); }
|
||||
static token let() { return token(tokentype::tk_let); }
|
||||
static token in() { return token(tokentype::tk_in); }
|
||||
|
||||
tokentype tk_type() const { return tk_type_; }
|
||||
const std::string & text() const { return text_; }
|
||||
|
||||
bool is_valid() const { return tk_type_ != tokentype::tk_invalid; }
|
||||
bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; }
|
||||
|
||||
/** expect input matching
|
||||
* [+|-][0-9][0-9]*
|
||||
**/
|
||||
std::int64_t i64_value() const;
|
||||
/** expect input matching
|
||||
* [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]*
|
||||
**/
|
||||
double f64_value() const;
|
||||
|
||||
private:
|
||||
/** category for this token **/
|
||||
tokentype tk_type_ = tokentype::tk_invalid;
|
||||
|
||||
/** characters comprising this token.
|
||||
* only provided for certain token types:
|
||||
*
|
||||
* tk_i64
|
||||
* tk_f64
|
||||
* tk_string
|
||||
* tk_symbol
|
||||
**/
|
||||
std::string text_;
|
||||
}; /*token*/
|
||||
|
||||
template <typename CharT>
|
||||
std::int64_t
|
||||
token<CharT>::i64_value() const {
|
||||
if (tk_type_ != tokentype::tk_i64) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::i64_value",
|
||||
": token with type tk found where tk_i64 expected",
|
||||
xtag("tk", tk_type_))));
|
||||
}
|
||||
|
||||
if (text_.empty()) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::i64_value",
|
||||
": unexpected empty input string for tk_i64 token")));
|
||||
}
|
||||
|
||||
int sign = 1;
|
||||
int value = 0;
|
||||
{
|
||||
auto ix = text_.begin();
|
||||
auto end_ix = text_.end();
|
||||
|
||||
CharT ch = *ix;
|
||||
|
||||
if (ch == '+') {
|
||||
++ix;
|
||||
} else if (ch == '-') {
|
||||
sign = -1;
|
||||
++ix;
|
||||
}
|
||||
|
||||
if (ix == end_ix) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::i64_value",
|
||||
": input text found where at least one digit expected",
|
||||
xtag("text", text_))));
|
||||
}
|
||||
|
||||
for (; ix != end_ix; ++ix) {
|
||||
CharT ch = *ix;
|
||||
|
||||
if ((ch >= '0') && (ch <= '9')) {
|
||||
value *= 10;
|
||||
value += (ch - '0');
|
||||
} else {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::i64_value",
|
||||
": unexpected char ch in integer token",
|
||||
xtag("ch", ch))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sign * value;
|
||||
} /*i64_value*/
|
||||
|
||||
template <typename CharT>
|
||||
double
|
||||
token<CharT>::f64_value() const {
|
||||
if (tk_type_ != tokentype::tk_f64) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::f64_value",
|
||||
": token with type tk found where tk_f64 expected",
|
||||
xtag("tk", tk_type_))));
|
||||
}
|
||||
|
||||
if (text_.empty()) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::f64_value",
|
||||
": unexpected empty input string for tk_f64 token")));
|
||||
}
|
||||
|
||||
int sign = 1;
|
||||
/* integer representing denormalized unsigned mantissa
|
||||
* (mantissa scaled by smallest power of 10 sufficient to make
|
||||
* it an integer)
|
||||
*/
|
||||
std::int64_t mantissa = 0;
|
||||
/* counts #of digits to the right of decimal point '.' */
|
||||
int rh_digits = 0;
|
||||
/* sign of exponent */
|
||||
int exp_sign = 1;
|
||||
/* value of exponenct = integer to the right of 'e' or 'E' */
|
||||
int exponent = 0;
|
||||
|
||||
/* floating-point value will represent
|
||||
* sign * mantissa * 10^(sign*exponent - rh_digits)
|
||||
*/
|
||||
{
|
||||
auto ix = text_.begin();
|
||||
auto end_ix = text_.end();
|
||||
|
||||
CharT ch = *ix;
|
||||
|
||||
if (ch == '+') {
|
||||
++ix;
|
||||
} else if (ch == '-') {
|
||||
sign = -1;
|
||||
++ix;
|
||||
}
|
||||
|
||||
if (ix == end_ix) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::f64_value",
|
||||
": input text found where at least one digit expected",
|
||||
xtag("text", text_))));
|
||||
}
|
||||
|
||||
/* true iff decimal point '.' present in mantissa */
|
||||
bool have_decimal_point = false;
|
||||
/* true iff exponent prefix 'e' or 'E' present */
|
||||
//bool have_exponent = false;
|
||||
/* counts number of digits in mantissa
|
||||
* (both before and after, but not including, any decimal point
|
||||
*/
|
||||
int m_digits = 0;
|
||||
/* digits to the left of decimal point */
|
||||
int lh_digits = 0;
|
||||
|
||||
/* loop over mantissa digits */
|
||||
for (; ix != end_ix; ++ix) {
|
||||
CharT ch = *ix;
|
||||
|
||||
if (ch == '.') {
|
||||
if (have_decimal_point) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::f64_value",
|
||||
": input text found where at most one decimal point expected",
|
||||
xtag("text", text_))));
|
||||
}
|
||||
|
||||
have_decimal_point = true;
|
||||
lh_digits = m_digits;
|
||||
} else if ((ch >= '0') && (ch <= '9')) {
|
||||
mantissa *= 10;
|
||||
mantissa += (ch - '0');
|
||||
++m_digits;
|
||||
} else if (ch == 'e' || ch == 'E') {
|
||||
//have_exponent = true;
|
||||
break; // done with mantissa
|
||||
} else {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::i64_value",
|
||||
": unexpected char ch in integer token",
|
||||
xtag("ch", ch))));
|
||||
}
|
||||
}
|
||||
|
||||
if (have_decimal_point)
|
||||
rh_digits = m_digits - lh_digits;
|
||||
|
||||
if (ix != end_ix) {
|
||||
/* continue to read exponent */
|
||||
|
||||
/* skip e|E */
|
||||
++ix;
|
||||
|
||||
if (ix == end_ix) {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::f64_value",
|
||||
": on input text, expect at least one digit following exponent marker e|E",
|
||||
xtag("text", text_))));
|
||||
}
|
||||
|
||||
CharT ch = *ix;
|
||||
|
||||
if (ch == '+') {
|
||||
++ix; /*skip*/
|
||||
} else if (ch == '-') {
|
||||
exp_sign = -1;
|
||||
++ix;
|
||||
}
|
||||
|
||||
for (; ix != end_ix; ++ix) {
|
||||
CharT ch = *ix;
|
||||
|
||||
if ((ch >= '0') && (ch <= '9')) {
|
||||
exponent *= 10;
|
||||
exponent += (ch - '0');
|
||||
} else {
|
||||
throw (std::runtime_error
|
||||
(tostr("token::f64_value",
|
||||
"; on input text, expect only digits following"
|
||||
" (possibly signed) exponenct marker",
|
||||
xtag("text", text_))));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* floating-point value will represent
|
||||
* sign * mantissa * 10^(sign*exponent - rh_digits)
|
||||
*/
|
||||
|
||||
double mantissa_f64 = sign * mantissa;
|
||||
|
||||
#ifdef OBSOLETE_DEBUG
|
||||
std::cerr << xtag("text", text_)
|
||||
<< xtag("rh_digits", rh_digits)
|
||||
<< xtag("mantissa_f64", mantissa_f64)
|
||||
<< xtag("exp_sign", exp_sign)
|
||||
<< xtag("exponent", exponent)
|
||||
<< std::endl;
|
||||
#endif
|
||||
|
||||
double retval = (mantissa_f64
|
||||
* detail::pow10((exp_sign * exponent)
|
||||
- rh_digits));
|
||||
|
||||
return retval;
|
||||
} /*f64_value*/
|
||||
} /*Namespace tok*/
|
||||
} /*namespace xo*/
|
||||
|
||||
|
||||
/* end token.hpp */
|
||||
625
include/xo/tokenizer/tokenizer.hpp
Normal file
625
include/xo/tokenizer/tokenizer.hpp
Normal file
|
|
@ -0,0 +1,625 @@
|
|||
/* file tokenizer.hpp
|
||||
*
|
||||
* author: Roland Conybeare, Jul 2024
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "token.hpp"
|
||||
#include "span.hpp"
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
#include <cassert>
|
||||
|
||||
namespace xo {
|
||||
namespace tok {
|
||||
/**
|
||||
* Use:
|
||||
* @code
|
||||
* using tokenizer_type = tokenizer<char>;
|
||||
* using span_type = tokenizer_type::span_type;
|
||||
*
|
||||
* tokenizer_type tkz;
|
||||
* span_type input = ...;
|
||||
*
|
||||
* while !input.empty() {
|
||||
 *       auto res = tkz.scan(input);
|
||||
* const auto & tk = res.first;
|
||||
*
|
||||
* // do something with tk if tk.is_valid()
|
||||
*
|
||||
* input = input.after_prefix(res.second);
|
||||
* }
|
||||
*
|
||||
* if endofinput {
|
||||
 *       auto tk = tkz.notify_eof();
|
||||
*
|
||||
* // do something with tk if tk.is_valid()
|
||||
* }
|
||||
*
|
||||
* // expect !tkz.has_prefix()
|
||||
*
|
||||
* @endcode
|
||||
**/
|
||||
template <typename CharT>
|
||||
class tokenizer {
|
||||
public:
|
||||
using token_type = token<CharT>;
|
||||
using span_type = span<const CharT>;
|
||||
using scan_result = std::pair<token_type, span_type>;
|
||||
|
||||
public:
|
||||
tokenizer() = default;
|
||||
|
||||
/** identifies whitespace chars.
|
||||
* These are chars that do not belong to any token.
|
||||
* They are not permitted to appear within
|
||||
* a symbol or string token.
|
||||
* Appearance of a whitespace char forces completion of
|
||||
* preceding token.
|
||||
**/
|
||||
bool is_whitespace(CharT ch) const;
|
||||
|
||||
/** identifies punctuation chars.
|
||||
* These are chars that are not permitted to appear within
|
||||
* a string/symbol token. Instead they force completion of
|
||||
* a preceding token, and start a new token with themselves
|
||||
**/
|
||||
bool is_punctuation(CharT ch) const;
|
||||
|
||||
/** true if tokenizer contains stored prefix of
|
||||
* possibly-incomplete token
|
||||
**/
|
||||
bool has_prefix() const { !prefix_.empty(); }
|
||||
|
||||
/** assemble token from text @p token_text
|
||||
**/
|
||||
token_type assemble_token(const span_type & token_text) const;
|
||||
|
||||
/** scan for next input token, given @p input **/
|
||||
scan_result scan(const span_type & input);
|
||||
|
||||
/** notify end of input, resolve any stored input **/
|
||||
token_type notify_eof();
|
||||
|
||||
private:
|
||||
/** Accumulate partial token here.
|
||||
* This will happen if input sent to @ref tokenizer::scan
|
||||
* ends without a determinate token boundary.
|
||||
**/
|
||||
std::string prefix_;
|
||||
}; /*tokenizer*/
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_whitespace(CharT ch) const {
|
||||
switch(ch) {
|
||||
case ' ': return true;
|
||||
case '\t': return true;
|
||||
case '\n': return true;
|
||||
case '\r': return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
bool
|
||||
tokenizer<CharT>::is_punctuation(CharT ch) const {
|
||||
switch(ch) {
|
||||
case '<':
|
||||
return true;
|
||||
case '>':
|
||||
return true;
|
||||
case '(':
|
||||
return true;
|
||||
case ')':
|
||||
return true;
|
||||
case '[':
|
||||
return true;
|
||||
case ']':
|
||||
return true;
|
||||
case '{':
|
||||
return true;
|
||||
case '}':
|
||||
return true;
|
||||
case ',':
|
||||
return true;
|
||||
case ';':
|
||||
return true;
|
||||
case ':':
|
||||
return true;
|
||||
case '=':
|
||||
return true;
|
||||
case '-':
|
||||
/* can't be punctuation -- can appear inside f64 token */
|
||||
return false;
|
||||
case '+':
|
||||
/* can't be punctuation -- can appear inside f64 token */
|
||||
return false;
|
||||
case '.':
|
||||
/* can't be punctuation -- can appear inside f64 token */
|
||||
return false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::assemble_token(const span_type & token_text) const -> token_type
|
||||
{
|
||||
constexpr bool c_debug_flag = true;
|
||||
|
||||
/* literal|pretty|streamlined */
|
||||
log_config::style = function_style::streamlined;
|
||||
|
||||
scope log(XO_DEBUG(c_debug_flag));
|
||||
log && log(xtag("token_text", token_text));
|
||||
|
||||
tokentype tk_type = tokentype::tk_invalid;
|
||||
std::string tk_text;
|
||||
|
||||
const CharT * tk_start = token_text.lo();
|
||||
const CharT * tk_end = token_text.hi();
|
||||
|
||||
const CharT * ix = tk_start;
|
||||
|
||||
/* switch here applies to the first character in a token */
|
||||
switch (*ix) {
|
||||
case '-':
|
||||
case '+':
|
||||
case '.':
|
||||
case '0':
|
||||
case '1':
|
||||
case '2':
|
||||
case '3':
|
||||
case '4':
|
||||
case '5':
|
||||
case '6':
|
||||
case '7':
|
||||
case '8':
|
||||
case '9':
|
||||
{
|
||||
/* examples of valid floating-point numbers:
|
||||
* .0
|
||||
* 1e0
|
||||
* 1e
|
||||
* 0.
|
||||
* +1e0
|
||||
* -1e0
|
||||
* +1E+2
|
||||
* -1E+2
|
||||
* -0.123e-10
|
||||
* non-examples:
|
||||
* .
|
||||
* -
|
||||
* +
|
||||
* e0
|
||||
* .e0
|
||||
* -.e-0
|
||||
* +.e+0
|
||||
*
|
||||
* in particular: to be recognized as a number,
|
||||
* must contain at least one digit
|
||||
*/
|
||||
|
||||
log && log("possible number-token");
|
||||
|
||||
/* true if initial sign -/+ encountered */
|
||||
bool sign_flag = false;
|
||||
/* true if '.' encountered */
|
||||
bool period_flag = false;
|
||||
/* true if 'e' | 'E' encountered.
|
||||
*/
|
||||
bool exponent_flag = false;
|
||||
/* true when sign '-' | '+' precedes exponenct digits */
|
||||
bool exponent_sign_flag = false;
|
||||
/* true when at least one digit follows exponent marker */
|
||||
bool exponent_digit_flag = false;
|
||||
/* true if at least one digit encountered */
|
||||
bool number_flag = false;
|
||||
|
||||
/* token will be one of: {i64, f64, dot}: */
|
||||
for(; ix != token_text.hi(); ++ix) {
|
||||
if((*ix == '-') || (*ix == '+')) {
|
||||
/* sign allowed:
|
||||
* 1. before period and before first digit
|
||||
* 2. after exponent
|
||||
*/
|
||||
if (!period_flag && !number_flag && !sign_flag) {
|
||||
sign_flag = true;
|
||||
} else if (exponent_flag && !exponent_digit_flag) {
|
||||
exponent_sign_flag = true;
|
||||
} else {
|
||||
throw std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": improperly placed sign indicator",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix)));
|
||||
}
|
||||
} else if(*ix == '.') {
|
||||
if (period_flag) {
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": duplicate decimal point",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
|
||||
period_flag = true;
|
||||
} else if((*ix == 'e') || (*ix == 'E')) {
|
||||
if (exponent_flag) {
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": duplicate exponent marker",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
|
||||
exponent_flag = true;
|
||||
} else if(isdigit(*ix)) {
|
||||
if (exponent_flag) {
|
||||
/* need digit before exponent to recognize as number */
|
||||
exponent_digit_flag = true;
|
||||
} else {
|
||||
number_flag = true;
|
||||
}
|
||||
} else {
|
||||
/* invalid input */
|
||||
throw (std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": unexpected character in numeric constant",
|
||||
xtag("pos", ix - tk_start),
|
||||
xtag("char", *ix))));
|
||||
}
|
||||
}
|
||||
|
||||
if (number_flag) {
|
||||
if (period_flag || exponent_flag) {
|
||||
tk_type = tokentype::tk_f64;
|
||||
} else {
|
||||
tk_type = tokentype::tk_i64;
|
||||
}
|
||||
} else if (period_flag && !exponent_flag) {
|
||||
tk_type = tokentype::tk_dot;
|
||||
} else {
|
||||
/* not a valid token */
|
||||
}
|
||||
|
||||
log && log(xtag("sign_flag", sign_flag));
|
||||
log && log(xtag("period_flag", period_flag),
|
||||
xtag("exponent_flag", exponent_flag),
|
||||
xtag("exponent_sign_flag", exponent_sign_flag),
|
||||
xtag("number_flag", number_flag));
|
||||
log && log(xtag("tk_type", tk_type));
|
||||
|
||||
break;
|
||||
}
|
||||
case '"':
|
||||
{
|
||||
log && log("recognize string-token");
|
||||
|
||||
tk_type = tokentype::tk_string;
|
||||
|
||||
tk_text.reserve(token_text.hi() - token_text.lo());
|
||||
|
||||
++ix; /*skip initial " char*/
|
||||
|
||||
for (; ix != token_text.hi(); ++ix) {
|
||||
log && log(xtag("*ix", *ix));
|
||||
|
||||
bool endofstring = false;
|
||||
|
||||
switch(*ix) {
|
||||
case '"':
|
||||
endofstring = true;
|
||||
|
||||
/* skip final " char, don't capture */
|
||||
++ix;
|
||||
|
||||
break;
|
||||
case '\\':
|
||||
/* skip escape char, don't capture */
|
||||
++ix;
|
||||
|
||||
if (ix == token_text.hi()) {
|
||||
throw std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": malformed string literal",
|
||||
xtag("input", std::string_view(token_text.lo(),
|
||||
token_text.hi()))));
|
||||
}
|
||||
|
||||
switch(*ix) {
|
||||
case '\\':
|
||||
log && log(xtag("*ix", *ix), xtag("escaped", "t"));
|
||||
tk_text.push_back(*ix);
|
||||
break;
|
||||
case 'n':
|
||||
log && log(xtag("*ix", *ix), xtag("newline", "t"));
|
||||
tk_text.push_back('\n');
|
||||
break;
|
||||
case 't':
|
||||
log && log(xtag("*ix", *ix), xtag("tab", "t"));
|
||||
tk_text.push_back('\t');
|
||||
break;
|
||||
case 'r':
|
||||
log && log(xtag("*ix", *ix), xtag("cr", "t"));
|
||||
tk_text.push_back('\r');
|
||||
break;
|
||||
case '"':
|
||||
log && log(xtag("*ix", *ix), xtag("quote", "t"));
|
||||
tk_text.push_back('"');
|
||||
break;
|
||||
default:
|
||||
throw std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": unexpected \\-escaped char",
|
||||
xtag("char", *ix)));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
tk_text.push_back(*ix);
|
||||
break;
|
||||
}
|
||||
|
||||
if (endofstring)
|
||||
break;
|
||||
}
|
||||
|
||||
if (ix != token_text.hi()) {
|
||||
throw std::runtime_error
|
||||
(tostr("tokenizer::assemble_token",
|
||||
": expected \" to end string literal",
|
||||
xtag("input", std::string_view(token_text.lo(),
|
||||
token_text.hi()))));
|
||||
}
|
||||
|
||||
log && log(tostr("tokenizer::assemble_token",
|
||||
xtag("tk_text", tk_text)));
|
||||
|
||||
break;
|
||||
}
|
||||
case 'a': case 'A':
|
||||
case 'b': case 'B':
|
||||
case 'c': case 'C':
|
||||
case 'd': case 'D':
|
||||
case 'e': case 'E':
|
||||
case 'f': case 'F':
|
||||
case 'g': case 'G':
|
||||
case 'h': case 'H':
|
||||
case 'i': case 'I':
|
||||
case 'j': case 'J':
|
||||
case 'k': case 'K':
|
||||
case 'l': case 'L':
|
||||
case 'm': case 'M':
|
||||
case 'n': case 'N':
|
||||
case 'o': case 'O':
|
||||
case 'p': case 'P':
|
||||
case 'q': case 'Q':
|
||||
case 'r': case 'R':
|
||||
case 's': case 'S':
|
||||
case 't': case 'T':
|
||||
case 'u': case 'U':
|
||||
case 'v': case 'V':
|
||||
case 'w': case 'W':
|
||||
case 'x': case 'X':
|
||||
case 'y': case 'Y':
|
||||
case 'z': case 'Z':
|
||||
{
|
||||
/* symbol/identifier must begin with a letter?
|
||||
* we want to accept some other chars too.
|
||||
* specifically want to allow identifiers:
|
||||
* this-is-the-way
|
||||
* this+is+also+the+way
|
||||
* how/much/is/that/doggy
|
||||
* put*an*asterisk*in*that
|
||||
* something%special%
|
||||
*
|
||||
* like pure lisp, we don't allow:
|
||||
* - identifier beginning with digit
|
||||
* - period .
|
||||
*
|
||||
* unlike pure lisp, we don't allow anywhere in a symbol:
|
||||
* - colon :
|
||||
* - semicolon ;
|
||||
* - comma ,
|
||||
*
|
||||
* also we don't allow symbols to begin with special chars
|
||||
*/
|
||||
|
||||
tk_type = tokentype::tk_symbol;
|
||||
break;
|
||||
}
|
||||
case '<':
|
||||
tk_type = tokentype::tk_leftangle;
|
||||
++ix;
|
||||
break;
|
||||
case '>':
|
||||
tk_type = tokentype::tk_rightangle;
|
||||
++ix;
|
||||
break;
|
||||
case '(':
|
||||
tk_type = tokentype::tk_leftparen;
|
||||
++ix;
|
||||
break;
|
||||
case ')':
|
||||
tk_type = tokentype::tk_rightparen;
|
||||
++ix;
|
||||
break;
|
||||
case '[':
|
||||
tk_type = tokentype::tk_leftbracket;
|
||||
++ix;
|
||||
break;
|
||||
case ']':
|
||||
tk_type = tokentype::tk_rightbracket;
|
||||
++ix;
|
||||
break;
|
||||
case '{':
|
||||
tk_type = tokentype::tk_leftbrace;
|
||||
++ix;
|
||||
break;
|
||||
case '}':
|
||||
tk_type = tokentype::tk_rightbrace;
|
||||
++ix;
|
||||
break;
|
||||
case ',':
|
||||
tk_type = tokentype::tk_comma;
|
||||
++ix;
|
||||
break;
|
||||
case ';':
|
||||
tk_type = tokentype::tk_semicolon;
|
||||
++ix;
|
||||
break;
|
||||
case ':':
|
||||
tk_type = tokentype::tk_colon;
|
||||
++ix;
|
||||
break;
|
||||
case '=':
|
||||
tk_type = tokentype::tk_singleassign;
|
||||
++ix;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
if (tk_type == tokentype::tk_invalid) {
|
||||
throw std::runtime_error(tostr("tokenizer::assemble_token",
|
||||
": unexpected input x",
|
||||
xtag("x", *ix)));
|
||||
}
|
||||
|
||||
if ((tk_type == tokentype::tk_i64)
|
||||
|| (tk_type == tokentype::tk_f64)
|
||||
|| (tk_type == tokentype::tk_symbol))
|
||||
{
|
||||
/* re-parse in token::i64_value() / token::f64_value() */
|
||||
tk_text = std::string(tk_start, tk_end);
|
||||
} else if (tk_type == tokentype::tk_string) {
|
||||
; /* nothing to do here -- desired tk_text already constructed */
|
||||
}
|
||||
|
||||
return token_type(tk_type, std::move(tk_text));
|
||||
} /*assemble_token*/
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::scan(const span_type & input) -> scan_result
|
||||
{
|
||||
constexpr bool c_debug_flag = true;
|
||||
scope log(XO_DEBUG(c_debug_flag));
|
||||
|
||||
log && log(xtag("input", input));
|
||||
|
||||
const CharT * ix = input.lo();
|
||||
|
||||
/* skip whitespace */
|
||||
while (is_whitespace(*ix) && (ix != input.hi()))
|
||||
++ix;
|
||||
|
||||
if(ix == input.hi()) {
|
||||
/* no-op */
|
||||
return {
|
||||
token_type::invalid(),
|
||||
input.prefix(ix)
|
||||
};
|
||||
}
|
||||
|
||||
/* here: *ix is not whitespace */
|
||||
|
||||
auto whitespace = input.prefix(ix);
|
||||
|
||||
log && log(xtag("whitespace.size", whitespace.size()));
|
||||
|
||||
/* tk_start points to beginning of token
|
||||
* (after any whitespace)
|
||||
*/
|
||||
const CharT * tk_start = ix;
|
||||
|
||||
if (is_punctuation(*ix)) {
|
||||
/* 1-character token */
|
||||
++ix;
|
||||
} else if (*ix == '"') {
|
||||
bool complete_flag = false;
|
||||
|
||||
/* 1. embedded space/tab allowed in string literal.
|
||||
* 2. embedded newline/cr not allowed.
|
||||
*/
|
||||
CharT prev_ch = '"';
|
||||
|
||||
++ix;
|
||||
|
||||
for (; ix != input.hi(); ++ix) {
|
||||
/* looking for unescaped " char to end literal */
|
||||
if (*ix == '"') {
|
||||
if (prev_ch != '\\') {
|
||||
++ix; /* include terminating " for assemble_token */
|
||||
complete_flag = true;
|
||||
break;
|
||||
}
|
||||
} else if ((*ix == '\n') || (*ix == '\r')) {
|
||||
throw std::runtime_error
|
||||
(tostr("tokenizer::scan",
|
||||
": must use \\n or \\r to encode newline/cr in"
|
||||
" string literal"));
|
||||
}
|
||||
|
||||
prev_ch = *ix;
|
||||
}
|
||||
|
||||
if (!complete_flag) {
|
||||
/* need more input to know if/when tokken complete */
|
||||
this->prefix_ += std::string(tk_start, input.hi());
|
||||
|
||||
log && log(xtag("captured-prefix", this->prefix_));
|
||||
}
|
||||
} else {
|
||||
/* scan until:
|
||||
* - whitespace
|
||||
* - punctuation
|
||||
*/
|
||||
for (; ix != input.hi(); ++ix) {
|
||||
if (is_whitespace(*ix) || is_punctuation(*ix))
|
||||
break;
|
||||
}
|
||||
|
||||
if (ix == input.hi()) {
|
||||
/* need more input to know if/when token complete */
|
||||
this->prefix_ += std::string(tk_start, input.hi());
|
||||
|
||||
log && log(xtag("captured-prefix", this->prefix_));
|
||||
}
|
||||
}
|
||||
|
||||
auto token_span = input.after_prefix(whitespace).prefix(ix);
|
||||
|
||||
token tk
|
||||
= (this->prefix_.empty()
|
||||
? assemble_token(token_span)
|
||||
: token_type(tokentype::tk_invalid));
|
||||
|
||||
return scan_result
|
||||
{ tk, input.prefix(whitespace.size() + token_span.size()) };
|
||||
} /*scan*/
|
||||
|
||||
template <typename CharT>
|
||||
auto
|
||||
tokenizer<CharT>::notify_eof() -> token_type {
|
||||
constexpr bool c_debug_flag = true;
|
||||
|
||||
scope log(XO_DEBUG(c_debug_flag));
|
||||
|
||||
token tk
|
||||
= (this->prefix_.empty()
|
||||
? token_type(tokentype::tk_invalid)
|
||||
: assemble_token(span_type(&prefix_[0],
|
||||
&prefix_[prefix_.size()])));
|
||||
|
||||
this->prefix_.clear();
|
||||
|
||||
return tk;
|
||||
} /*notify_eof*/
|
||||
} /*namespace tok*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end tokenizer.hpp */
|
||||
142
include/xo/tokenizer/tokentype.hpp
Normal file
142
include/xo/tokenizer/tokentype.hpp
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
/** @file tokentype.hpp
|
||||
*
|
||||
* author: Roland Conybeare, Jul 2024
|
||||
**/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "xo/indentlog/print/tag.hpp" // for STRINGIFY
|
||||
#include <ostream>
|
||||
|
||||
namespace xo {
|
||||
namespace tok {
|
||||
/** @enum tokentype
 *  @brief identifies the kind of a schematica input token
 *
 *  Schematica code examples:
 *
 *    type point :: { xcoord : f64, ycoord: f64 };
 *    type matrix :: array<double, 2>;  // 2-d array
 *
 *    decl hypot(x : f64, y : f64) -> f64;
 *
 *    def hypot(x : f64, y : f64) {
 *      let
 *        x2 = (x * x);
 *        y2 = (y * y);
 *        hypot2 = (x2 + y2);
 *      in
 *        sqrt(hypot2);
 *    };
 *
 *    def someconst 4;
 *
 *    def foo(v : vec<i32>) {
 *      def (pi : f64) = 3.1415926;
 *      def (h : (f64,f64) -> f64) = hypot;
 *
 *      h = hypot3;
 *    };
 *
 *    def matrixproduct(x : matrix, y : matrix) {
 *      [i,j : x.row(i) * y.col(j)];
 *    };
 **/
enum class tokentype {
    /** sentinel: not a valid token **/
    tk_invalid = -1,

    /* ----- literals + symbols ----- */

    /** integer constant (signed 64-bit) **/
    tk_i64,
    /** floating-point constant (64-bit) **/
    tk_f64,
    /** string literal **/
    tk_string,
    /** symbol **/
    tk_symbol,

    /* ----- bracketing ----- */

    /** '(' left-hand parenthesis **/
    tk_leftparen,
    /** ')' right-hand parenthesis **/
    tk_rightparen,
    /** '[' left-hand bracket **/
    tk_leftbracket,
    /** ']' right-hand bracket **/
    tk_rightbracket,
    /** '{' left-hand brace **/
    tk_leftbrace,
    /** '}' right-hand brace **/
    tk_rightbrace,
    /** '<' left-hand angle bracket **/
    tk_leftangle,
    /** '>' right-hand angle bracket **/
    tk_rightangle,

    /* ----- separators + operators ----- */

    /** '.' dot **/
    tk_dot,
    /** ',' comma **/
    tk_comma,
    /** ':' colon **/
    tk_colon,
    /** '::' double-colon **/
    tk_doublecolon,
    /** ';' semi-colon **/
    tk_semicolon,
    /** '=' **/
    tk_singleassign,
    /** ':=' **/
    tk_assign,
    /** '->' **/
    tk_yields,

    /* ----- keywords ----- */

    /** keyword 'type' **/
    tk_type,
    /** keyword 'def' **/
    tk_def,
    /** keyword 'lambda' **/
    tk_lambda,
    /** keyword 'if' **/
    tk_if,
    /** keyword 'let' **/
    tk_let,
    /** keyword 'in' **/
    tk_in,

    /** not a token type: must stay last, counts the entries above
     *  (excluding tk_invalid)
     **/
    n_tokentype
}; /*tokentype*/
|
||||
|
||||
extern char const *
|
||||
tokentype_descr(tokentype tk_type);
|
||||
|
||||
inline std::ostream &
|
||||
operator<< (std::ostream & os, tokentype tk_type) {
|
||||
os << tokentype_descr(tk_type);
|
||||
return os;
|
||||
}
|
||||
} /*namespace tok*/
|
||||
} /*namespace xo*/
|
||||
|
||||
|
||||
/* end tokentype.hpp */
|
||||
14
src/tokenizer/CMakeLists.txt
Normal file
14
src/tokenizer/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
# tokenizer/CMakeLists.txt

# library name + the sources compiled into it
set(SELF_LIB tokenizer)
set(SELF_SRCS tokentype.cpp token.cpp)

xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS})

# dependencies (commented-out entries are currently disabled)
#xo_dependency(${SELF_LIB} refcnt)
xo_dependency(${SELF_LIB} indentlog)
#xo_dependency(${SELF_LIB} subsys)
#xo_boost_dependency(${SELF_LIB})

# end CMakeLists.txt
|
||||
9
src/tokenizer/token.cpp
Normal file
9
src/tokenizer/token.cpp
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
/** @file token.cpp
|
||||
*
|
||||
* author: Roland Conybeare
|
||||
**/
|
||||
|
||||
#include "token.hpp"
|
||||
#include "xo/indentlog/print/tag.hpp"
|
||||
|
||||
/** end token.cpp **/
|
||||
56
src/tokenizer/tokentype.cpp
Normal file
56
src/tokenizer/tokentype.cpp
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
/* file tokentype.cpp
|
||||
*
|
||||
* author: Roland Conybeare
|
||||
*/
|
||||
|
||||
#include "tokentype.hpp"
|
||||
|
||||
namespace xo {
|
||||
namespace tok {
|
||||
char const *
|
||||
tokentype_descr(tokentype tk_type)
|
||||
{
|
||||
#define CASE(x) case tokentype::x: return STRINGIFY(x)
|
||||
|
||||
switch(tk_type) {
|
||||
CASE(tk_i64);
|
||||
CASE(tk_f64);
|
||||
CASE(tk_string);
|
||||
CASE(tk_symbol);
|
||||
CASE(tk_leftparen);
|
||||
CASE(tk_rightparen);
|
||||
CASE(tk_leftbracket);
|
||||
CASE(tk_rightbracket);
|
||||
CASE(tk_leftbrace);
|
||||
CASE(tk_rightbrace);
|
||||
CASE(tk_leftangle);
|
||||
CASE(tk_rightangle);
|
||||
CASE(tk_dot);
|
||||
CASE(tk_comma);
|
||||
CASE(tk_colon);
|
||||
CASE(tk_doublecolon);
|
||||
CASE(tk_semicolon);
|
||||
CASE(tk_singleassign);
|
||||
CASE(tk_assign);
|
||||
CASE(tk_yields);
|
||||
CASE(tk_type);
|
||||
CASE(tk_def);
|
||||
CASE(tk_lambda);
|
||||
CASE(tk_if);
|
||||
CASE(tk_let);
|
||||
CASE(tk_in);
|
||||
|
||||
case tokentype::tk_invalid:
|
||||
case tokentype::n_tokentype:
|
||||
return "?tokentype";
|
||||
}
|
||||
|
||||
#undef CASE
|
||||
|
||||
return "???";
|
||||
} /*tokentype_descr*/
|
||||
} /*namespace tok*/
|
||||
} /*namespace xo*/
|
||||
|
||||
|
||||
/* end tokentype.cpp */
|
||||
13
utest/CMakeLists.txt
Normal file
13
utest/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# build unittest tokenizer/utest

# test executable + the sources compiled into it
set(SELF_EXECUTABLE_NAME utest.tokenizer)
set(SELF_SOURCE_FILES tokenizer_utest_main.cpp tokenizer.test.cpp token.test.cpp)

xo_add_utest_executable(${SELF_EXECUTABLE_NAME} ${SELF_SOURCE_FILES})

# link against the tokenizer library from this project, plus Catch2
xo_self_dependency(${SELF_EXECUTABLE_NAME} tokenizer)
xo_external_target_dependency(${SELF_EXECUTABLE_NAME} Catch2 Catch2::Catch2)

# end CMakeLists.txt
|
||||
260
utest/token.test.cpp
Normal file
260
utest/token.test.cpp
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
/* file token.test.cpp
|
||||
*
|
||||
* author: Roland Conybeare
|
||||
*/
|
||||
|
||||
#include "xo/tokenizer/token.hpp"
|
||||
#include <catch2/catch.hpp>
|
||||
#include <memory>
|
||||
|
||||
namespace xo {
|
||||
using token = xo::tok::token<char>;
|
||||
using xo::tok::tokentype;
|
||||
|
||||
namespace ut {
|
||||
/** One table entry for the i64-parsing test **/
struct testcase_i64 {
    /** token text to parse **/
    std::string text_;
    /** true if parsing text_ is expected to throw **/
    bool expect_throw_;
    /** expected parsed value (only meaningful when no throw expected) **/
    std::int64_t expected_;
};

/** i64-parsing test table: {text, expect_throw, expected} **/
std::vector<testcase_i64> s_testcase_v = {
    /* empty / sign-only text */
    {"",     true,  0},
    {"0",    false, 0},
    {"-",    true,  0},
    {"+",    true,  0},
    {"-0",   false, 0},
    {"+0",   false, 0},

    /* single digit */
    {"1",    false, 1},
    {"-1",   false, -1},
    {"9",    false, 9},
    {"-9",   false, -9},

    /* two digits */
    {"12",   false, 12},
    {"+12",  false, 12},
    {"-12",  false, -12},
    {"99",   false, 99},
    {"-99",  false, -99},

    /* trailing non-digit */
    {"123x", true,  0},
};
|
||||
|
||||
/* For each table entry: build a tk_i64 token from the text, then verify that
 * i64_value() either throws a std::exception (when expected)
 * or yields the expected value.
 */
TEST_CASE("parse-i64", "[token]") {
    for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
        INFO(xtag("i_tc", i_tc));

        auto const & testcase = s_testcase_v[i_tc];

        token tk(tokentype::tk_i64, testcase.text_);

        REQUIRE(tk.tk_type() == tokentype::tk_i64);

        if (testcase.expect_throw_) {
            REQUIRE_THROWS_AS(tk.i64_value(), std::exception);
        } else {
            REQUIRE(tk.i64_value() == testcase.expected_);
        }
    }
}
|
||||
|
||||
/* A tk_i64 token holding only a sign has no value:
 * i64_value() must throw a std::exception.
 */
TEST_CASE("error-i64", "[token]") {
    token tk(tokentype::tk_i64, "+");

    REQUIRE_THROWS_AS(tk.i64_value(), std::exception);
}
|
||||
|
||||
namespace {
|
||||
/** One table entry for the f64-parsing test **/
struct testcase_f64 {
    /** token text to parse **/
    std::string text_;
    /** true if parsing text_ is expected to throw **/
    bool expect_throw_;
    /** expected parsed value (only meaningful when no throw expected) **/
    double expected_;
};

/** f64-parsing test table: {text, expect_throw, expected} **/
std::vector<testcase_f64> s_testcase_v = {
    /* integer-looking text */
    {"", true, 0},
    {"0", false, 0},
    {"-", true, 0},
    {"+", true, 0},
    {"-0", false, 0},
    {"+0", false, 0},
    {"1", false, 1},
    {"-1", false, -1},
    {"9", false, 9},
    {"-9", false, -9},
    {"12", false, 12},
    {"+12", false, 12},
    {"-12", false, -12},
    {"99", false, 99},
    {"-99", false, -99},
    {"123x", true, 0},

    /* unsigned fraction, 1..9 fractional digits */
    {"0.0", false, 0.0},
    {"0.1", false, 0.1},
    {"0.12", false, 0.12},
    {"0.123", false, 0.123},
    {"0.1234", false, 0.1234},
    {"0.12345", false, 0.12345},
    {"0.123456", false, 0.123456},
    {"0.1234567", false, 0.1234567},
    {"0.12345678", false, 0.12345678},
    {"0.123456789", false, 0.123456789},

    /* explicit + sign */
    {"+0.0", false, 0.0},
    {"+0.1", false, 0.1},
    {"+0.12", false, 0.12},
    {"+0.123", false, 0.123},
    {"+0.1234", false, 0.1234},
    {"+0.12345", false, 0.12345},
    {"+0.123456", false, 0.123456},
    {"+0.1234567", false, 0.1234567},
    {"+0.12345678", false, 0.12345678},
    {"+0.123456789", false, 0.123456789},

    /* exponent e0 */
    {"+0.0e0", false, 0.0},
    {"+0.1e0", false, 0.1},
    {"+0.12e0", false, 0.12},
    {"+0.123e0", false, 0.123},
    {"+0.1234e0", false, 0.1234},
    {"+0.12345e0", false, 0.12345},
    {"+0.123456e0", false, 0.123456},
    {"+0.1234567e0", false, 0.1234567},
    {"+0.12345678e0", false, 0.12345678},
    {"+0.123456789e0", false, 0.123456789},

    /* exponent e1 */
    {"+0.0e1", false, 0.0},
    {"+0.1e1", false, 1.0},
    {"+0.12e1", false, 1.2},
    {"+0.123e1", false, 1.23},
    {"+0.1234e1", false, 1.234},
    {"+0.12345e1", false, 1.2345},
    {"+0.123456e1", false, 1.23456},
    {"+0.1234567e1", false, 1.234567},
    {"+0.12345678e1", false, 1.2345678},
    {"+0.123456789e1", false, 1.23456789},

    /* exponent E1 (upper-case marker) */
    {"+0.0E1", false, 0.0},
    {"+0.1E1", false, 1.0},
    {"+0.12E1", false, 1.2},
    {"+0.123E1", false, 1.23},
    {"+0.1234E1", false, 1.234},
    {"+0.12345E1", false, 1.2345},
    {"+0.123456E1", false, 1.23456},
    {"+0.1234567E1", false, 1.234567},
    {"+0.12345678E1", false, 1.2345678},
    {"+0.123456789E1", false, 1.23456789},

    /* exponent e9 */
    {"+0.0e9", false, 0.0},
    {"+0.1e9", false, 0.1e9},
    {"+0.12e9", false, 0.12e9},
    {"+0.123e9", false, 0.123e9},
    {"+0.1234e9", false, 0.1234e9},
    {"+0.12345e9", false, 0.12345e9},
    {"+0.123456e9", false, 0.123456e9},
    {"+0.1234567e9", false, 0.1234567e9},
    {"+0.12345678e9", false, 0.12345678e9},
    {"+0.123456789e9", false, 0.123456789e9},

    /* explicit - sign */
    {"-0.0", false, -0.0},
    {"-0.1", false, -0.1},
    {"-0.12", false, -0.12},
    {"-0.123", false, -0.123},
    {"-0.1234", false, -0.1234},
    {"-0.12345", false, -0.12345},
    {"-0.123456", false, -0.123456},
    {"-0.1234567", false, -0.1234567},
    {"-0.12345678", false, -0.12345678},
    {"-0.123456789", false, -0.123456789},

    /* leading zero, one integer digit */
    {"00.", false, 0.0},
    {"01.", false, 1.0},
    {"01.2", false, 1.2},
    {"01.23", false, 1.23},
    {"01.234", false, 1.234},
    {"01.2345", false, 1.2345},
    {"01.23456", false, 1.23456},
    {"01.234567", false, 1.234567},
    {"01.2345678", false, 1.2345678},
    {"01.23456789", false, 1.23456789},

    /* two integer digits */
    {"0.0", false, 0.0},
    {"1.2", false, 1.2},
    {"12.", false, 12.0},
    {"12.3", false, 12.3},
    {"12.34", false, 12.34},
    {"12.345", false, 12.345},
    {"12.3456", false, 12.3456},
    {"12.34567", false, 12.34567},
    {"12.345678", false, 12.345678},
    {"12.3456789", false, 12.3456789},

    /* three integer digits */
    {"01.23", false, 1.23},
    {"12.3", false, 12.3},
    {"123.", false, 123.0},
    {"123.4", false, 123.4},
    {"123.45", false, 123.45},
    {"123.456", false, 123.456},
    {"123.4567", false, 123.4567},
    {"123.45678", false, 123.45678},
    {"123.456789", false, 123.456789},
};
|
||||
|
||||
/* For each table entry: build a tk_f64 token from the text, then verify that
 * f64_value() either throws a std::exception (when expected)
 * or yields the expected value to within relative tolerance 1e-15.
 */
TEST_CASE("parse-f64", "[token]") {
    std::size_t i_tc = 0;

    for (auto const & testcase : s_testcase_v) {
        INFO(tostr(xtag("i_tc", i_tc),
                   xtag("text", testcase.text_)
                 ));

        token tk(tokentype::tk_f64, testcase.text_);

        REQUIRE(tk.tk_type() == tokentype::tk_f64);

        /* what() text of any exception from f64_value(); empty if none */
        std::string ex_msg;
        bool did_throw = false;

        try {
            const double parsed = tk.f64_value();

            REQUIRE(parsed == Approx(testcase.expected_).epsilon(1.0e-15));
        } catch (const std::exception & ex) {
            did_throw = true;
            ex_msg = ex.what();
        }

        INFO(xtag("ex_msg", ex_msg));

        REQUIRE(did_throw == testcase.expect_throw_);

        ++i_tc;
    }
}
|
||||
} /*namespace*/
|
||||
} /*namespace ut*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end token.test.cpp */
|
||||
160
utest/tokenizer.test.cpp
Normal file
160
utest/tokenizer.test.cpp
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
/* file tokenizer.test.cpp
|
||||
*
|
||||
* author: Roland Conybeare
|
||||
*/
|
||||
|
||||
#include "tokenizer.hpp"
|
||||
#include <catch2/catch.hpp>
|
||||
|
||||
namespace xo {
|
||||
using xo::tok::tokentype;
|
||||
using token = xo::tok::token<char>;
|
||||
using xo::tok::span;
|
||||
|
||||
namespace ut {
|
||||
namespace {
|
||||
struct testcase_tkz {
|
||||
std::string input_;
|
||||
bool expect_throw_;
|
||||
token expected_tk_;
|
||||
bool consume_all_;
|
||||
};
|
||||
|
||||
std::vector<testcase_tkz>
|
||||
s_testcase_v = {
|
||||
{"<", false, token::leftangle(), true},
|
||||
{">", false, token::rightangle(), true},
|
||||
|
||||
{"(", false, token::leftparen(), true},
|
||||
{")", false, token::rightparen(), true},
|
||||
|
||||
{"[", false, token::leftbracket(), true},
|
||||
{"]", false, token::rightbracket(), true},
|
||||
|
||||
{"{", false, token::leftbrace(), true},
|
||||
{" {", false, token::leftbrace(), true},
|
||||
|
||||
{"\t{", false, token::leftbrace(), true},
|
||||
{"\n{", false, token::leftbrace(), true},
|
||||
{"}", false, token::rightbrace(), true},
|
||||
|
||||
{"0", false, token::i64_token("0"), true},
|
||||
{"1", false, token::i64_token("1"), true},
|
||||
{"12", false, token::i64_token("12"), true},
|
||||
{"123", false, token::i64_token("123"), true},
|
||||
{"1234", false, token::i64_token("1234"), true},
|
||||
|
||||
{"0 ", false, token::i64_token("0"), false},
|
||||
{"1 ", false, token::i64_token("1"), false},
|
||||
{"12 ", false, token::i64_token("12"), false},
|
||||
{"123 ", false, token::i64_token("123"), false},
|
||||
{"1234 ", false, token::i64_token("1234"), false},
|
||||
|
||||
{"1<", false, token::i64_token("1"), false},
|
||||
{"1>", false, token::i64_token("1"), false},
|
||||
{"1(", false, token::i64_token("1"), false},
|
||||
{"1)", false, token::i64_token("1"), false},
|
||||
{"1[", false, token::i64_token("1"), false},
|
||||
{"1]", false, token::i64_token("1"), false},
|
||||
{"1{", false, token::i64_token("1"), false},
|
||||
{"1}", false, token::i64_token("1"), false},
|
||||
{"1;", false, token::i64_token("1"), false},
|
||||
{"1:", false, token::i64_token("1"), false},
|
||||
{"1,", false, token::i64_token("1"), false},
|
||||
|
||||
{".1", false, token::f64_token(".1"), true},
|
||||
{".12", false, token::f64_token(".12"), true},
|
||||
{".123", false, token::f64_token(".123"), true},
|
||||
|
||||
{"+.1", false, token::f64_token("+.1"), true},
|
||||
{"+.12", false, token::f64_token("+.12"), true},
|
||||
{"+.123", false, token::f64_token("+.123"), true},
|
||||
|
||||
{"-.1", false, token::f64_token("-.1"), true},
|
||||
{"-.12", false, token::f64_token("-.12"), true},
|
||||
{"-.123", false, token::f64_token("-.123"), true},
|
||||
|
||||
{"1.", false, token::f64_token("1."), true},
|
||||
{"1.2", false, token::f64_token("1.2"), true},
|
||||
{"1.23", false, token::f64_token("1.23"), true},
|
||||
|
||||
{"1e0", false, token::f64_token("1e0"), true},
|
||||
{"1e-1", false, token::f64_token("1e-1"), true},
|
||||
{"1e1", false, token::f64_token("1e1"), true},
|
||||
{"1e+1", false, token::f64_token("1e+1"), true},
|
||||
|
||||
{"\"hello\"", false, token::string_token("hello"), true},
|
||||
/* tokenizer sees this input:
|
||||
* "\"hi\", she said"
|
||||
*/
|
||||
{"\"\\\"hi\\\", she said\"", false, token::string_token("\"hi\", she said"), true},
|
||||
/* tokenizer sees this input:
|
||||
* "look ma, newline ->\n<- "
|
||||
*/
|
||||
{"\"look ma, newline ->\\n<- \"", false,
|
||||
token::string_token("look ma, newline ->\n<- "), true},
|
||||
/* tokenizer sees this input:
|
||||
* "tab to the right [\t], to the right [\t]"
|
||||
*/
|
||||
{"\"tab to the right [\\t], to the right [\\t]\"", false,
|
||||
token::string_token("tab to the right [\t], to the right [\t]"), true},
|
||||
|
||||
{"symbol", false, token::symbol_token("symbol"), true},
|
||||
};
|
||||
}
|
||||
|
||||
/* For each table entry: run a fresh tokenizer over the input (scan(), then
 * notify_eof() if scan() alone could not complete a token), then verify
 * token type, token payload, and whether scan() consumed the whole input.
 *
 * Entries with expect_throw_ set must instead throw while tokenizing.
 */
TEST_CASE("tokenizer", "[tokenizer]") {
    using tokenizer = xo::tok::tokenizer<char>;

    for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) {
        const testcase_tkz & testcase = s_testcase_v[i_tc];

        INFO(xtag("input", testcase.input_));
        INFO(xtag("i_tc", i_tc));

        tokenizer tkz;
        tokenizer::span_type
            in_span(testcase.input_.c_str(),
                    testcase.input_.c_str() + testcase.input_.size());

        if (testcase.expect_throw_) {
            /* honor expect_throw_: the scan/notify_eof sequence must throw */
            auto tokenize = [&tkz, &in_span]() {
                auto attempt = tkz.scan(in_span);

                if (attempt.first.is_invalid())
                    tkz.notify_eof();
            };

            REQUIRE_THROWS(tokenize());
            continue;
        }

        auto out = tkz.scan(in_span);

        auto tk = out.first;

        /* token ran up to end of input: scan() stored it as a prefix */
        if (tk.is_invalid())
            tk = tkz.notify_eof();

        REQUIRE(tk.tk_type() == testcase.expected_tk_.tk_type());

        if (tk.tk_type() == tokentype::tk_i64) {
            REQUIRE(!tk.text().empty());
            REQUIRE(tk.i64_value() == testcase.expected_tk_.i64_value());
        } else if (tk.tk_type() == tokentype::tk_f64) {
            REQUIRE(!tk.text().empty());
            REQUIRE(tk.f64_value() == testcase.expected_tk_.f64_value());
        } else if (tk.tk_type() == tokentype::tk_string) {
            /* tk.text() can be empty, consider input "" */
            REQUIRE(tk.text() == testcase.expected_tk_.text());
        } else if (tk.tk_type() == tokentype::tk_symbol) {
            REQUIRE(!tk.text().empty());
            REQUIRE(tk.text() == testcase.expected_tk_.text());
        } else {
            REQUIRE(tk.text().empty());
        }

        /* consumed span must match the whole input iff consume_all_ is set */
        if (testcase.consume_all_)
            REQUIRE(out.second == in_span);
        else
            REQUIRE(out.second != in_span);
    }
}
|
||||
|
||||
} /*namespace ut*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end tokenizer.test.cpp */
|
||||
6
utest/tokenizer_utest_main.cpp
Normal file
6
utest/tokenizer_utest_main.cpp
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
/* file tokenizer_utest_main.cpp */
|
||||
|
||||
#define CATCH_CONFIG_MAIN
|
||||
#include "catch2/catch.hpp"
|
||||
|
||||
/* end tokenizer_utest_main.cpp */
|
||||
Loading…
Add table
Add a link
Reference in a new issue