diff --git a/xo-tokenizer/.gitignore b/xo-tokenizer/.gitignore new file mode 100644 index 00000000..3d3a7826 --- /dev/null +++ b/xo-tokenizer/.gitignore @@ -0,0 +1,8 @@ +# emacs workspace config +.projectile +# clangd working space (see emacs+lsp) +.cache +# typical cmake build directory (source-tree-nephew) +.build* +# symlink to builddir/compile_commands.json; should be set manually in dev sandbox +compile_commands.json diff --git a/xo-tokenizer/CMakeLists.txt b/xo-tokenizer/CMakeLists.txt new file mode 100644 index 00000000..147e16c7 --- /dev/null +++ b/xo-tokenizer/CMakeLists.txt @@ -0,0 +1,27 @@ +# xo-tokenizer/CMakeLists.txt + +cmake_minimum_required(VERSION 3.10) + +project(xo_tokenizer VERSION 0.1) + +include(GNUInstallDirs) +include(cmake/xo-bootstrap-macros.cmake) + +xo_cxx_toplevel_options3() + +# ---------------------------------------------------------------- +# c++ settings + +set(PROJECT_CXX_FLAGS "") +#set(PROJECT_CXX_FLAGS "-fconcepts-diagnostics-depth=2") +add_definitions(${PROJECT_CXX_FLAGS}) + +# ---------------------------------------------------------------- + +add_subdirectory(src/tokenizer) +add_subdirectory(utest) + +# ---------------------------------------------------------------- +# provide find_package() support + +xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets) diff --git a/xo-tokenizer/README.md b/xo-tokenizer/README.md new file mode 100644 index 00000000..3f0befba --- /dev/null +++ b/xo-tokenizer/README.md @@ -0,0 +1,56 @@ +# schematica tokenizer library + +## Getting Started + +### build + install `xo-cmake` dependency + +- [github/Rconybea/xo-cmake](https://github.com/Rconybea/xo-cmake) + +Installs a few cmake ingredients, along with a build assistant `xo-build` for XO projects such as this one.
+ +### build + install other required XO dependencies +``` +$ xo-build --clone --configure --build --install xo-indentlog +$ xo-build --clone --configure --build --install xo-refcnt +$ xo-build --clone --configure --build --install xo-subsys +$ xo-build --clone --configure --build --install xo-reflectutil +``` + +Note: can use `-n` to dry-run here + +### copy `xo-tokenizer` repository locally +``` +$ xo-build --clone xo-tokenizer +``` + +or equivalently +``` +$ git clone git@github.com:Rconybea/xo-tokenizer.git +``` + +### build + install `xo-tokenizer` + +``` +$ xo-build --configure --build --install xo-tokenizer +``` + +or equivalently: + +``` +$ PREFIX=/usr/local # or wherever you prefer +$ cmake -DCMAKE_INSTALL_PREFIX=${PREFIX} -S xo-tokenizer -B xo-tokenizer/.build +$ cmake --build xo-tokenizer/.build +$ cmake --install xo-tokenizer/.build +``` + +### build for unit test coverage +``` +$ cmake -DCMAKE_BUILD_TYPE=coverage -DCMAKE_INSTALL_PREFIX=$PREFIX -S xo-tokenizer -B xo-tokenizer/.build-ccov +$ cmake --build xo-tokenizer/.build-ccov +``` + +### LSP support +``` +$ cd xo-tokenizer +$ ln -s .build/compile_commands.json # lsp will look for compile_commands.json in the root of the source tree +``` diff --git a/xo-tokenizer/cmake/xo-bootstrap-macros.cmake b/xo-tokenizer/cmake/xo-bootstrap-macros.cmake new file mode 100644 index 00000000..aba31169 --- /dev/null +++ b/xo-tokenizer/cmake/xo-bootstrap-macros.cmake @@ -0,0 +1,35 @@ +# ---------------------------------------------------------------- +# for example: +# $ PREFIX=/usr/local # for example +# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build +# +# will get +# CMAKE_MODULE_PATH +# from xo-cmake-config --cmake-module-path +# +# and expect .cmake macros in +# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake +# ---------------------------------------------------------------- + +find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED) + +if ("${XO_CMAKE_CONFIG_EXECUTABLE}" STREQUAL
"XO_CMAKE_CONFIG_EXECUTABLE-NOTFOUND") + # find_program() reports failure as <VAR>-NOTFOUND; FATAL_ERROR is the message() mode that aborts configuration + message(FATAL_ERROR "could not find xo-cmake-config executable") +endif() + +message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}") + +if (NOT XO_SUBMODULE_BUILD) + if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix)) + # default to typical install location for xo-project-macros + execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path OUTPUT_VARIABLE CMAKE_MODULE_PATH) + message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") + endif() +endif() + +# needs to have been installed somewhere on CMAKE_MODULE_PATH, +# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX) +# +include(xo_macros/xo_cxx) + +xo_cxx_bootstrap_message() diff --git a/xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in b/xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in new file mode 100644 index 00000000..f13d9e2b --- /dev/null +++ b/xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in @@ -0,0 +1,8 @@ +@PACKAGE_INIT@ + +include(CMakeFindDependencyMacro) +#find_dependency(refcnt) +find_dependency(indentlog) +#find_dependency(subsys) +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") +check_required_components("@PROJECT_NAME@") diff --git a/xo-tokenizer/include/xo/tokenizer/buffer.hpp b/xo-tokenizer/include/xo/tokenizer/buffer.hpp new file mode 100644 index 00000000..bc3621f2 --- /dev/null +++ b/xo-tokenizer/include/xo/tokenizer/buffer.hpp @@ -0,0 +1,324 @@ +/** @file buffer.hpp **/ + +#pragma once + +#include "span.hpp" +#include +#include +#include +#include + +namespace xo { + namespace scm { + /** + * @class buffer buffer.hpp + * + * @brief Container for a (possibly owned) FIFO queue of chars + * + * @tparam CharT. buffer element type. + * + * @code + * .buf + * + * +------------------------------------------+ + * | | ... | | X| ... | X| | ...
| | + * +------------------------------------------+ + * ^ ^ ^ ^ + * 0 .lo .hi .buf_z + * + * <-contents-><----avail-----> + * @endcode + * + * Buffer does not support wrapped content: + * content that has not been consumed always occupies contiguous memory. + * + * Example: + * @code + * // 1. + * buffer buf(64*1024); + * buf.empty() -> true + * buf.buf_z() -> 65536 + * buf.lo_pos() -> 0 + * buf.hi_pos() -> 0 + * buf.contents() -> empty span + * buf.avail() -> span entire buffer memory + * + * // write to (a prefix of) buf.avail() + * ::strncpy(buf.buf(), "hello, world\n", 13); + * buf.produce(span_type(buf.buf(), buf.buf() + 13)); + * + * buf.lo_pos() -> 0 + * buf.hi_pos() -> 13 + * buf.contents() -> "hello, world\n"; + * + * + * // examine stored content (does not change buffer state) + * auto span = buf.contents(); + * cerr << string_view(span.lo(), span.hi()); // "hello, world\n" + * + * // consume (a prefix of) stored content + * buf.consume(span.prefix(7)); + * + * buf.lo_pos() -> 7 + * buf.hi_pos() -> 13 + * buf.contents() -> "world\n" + * + * // consuming all remaining content resets to original state + * buf.consume(buf.contents()); + * + * buf.empty() -> true + * buf.hi_pos() -> 0 // not 13! + * + * // 2. + * buffer buf; + * buf.empty() -> true + * buf.buf_z() -> 0 + * buf.lo_pos() -> 0 + * buf.hi_pos() -> 0 + * buf.contents() -> empty span + * buf.avail() -> empty span + * + * // allocate memory separately from ctor + * buf.alloc(64*1024); + * @endcode + **/ + template + class buffer { + public: + /** @brief typealias for span of CharT **/ + using span_type = span; + /** @brief typealias for buffer size (counts CharT's, not bytes) **/ + using size_type = std::uint64_t; + + public: + /** @brief create empty buffer. + + Does not allocate any storage; @see alloc + **/ + buffer() = default; + /** @brief create empty buffer, and possibly allocate storage. + + @param buf_z Buffer size. allocate storage (owned by this buffer) if >0.
+ @param align_z Align to this value, e.g. 8 to align storage on an 8-byte boundary. + Must be a power of two. + + Storage is raw (uninitialized) memory obtained from the aligned allocation function, + so @c CharT is expected to be a trivial, character-like type. + **/ + buffer(size_type buf_z, size_type align_z = sizeof(char)) + : is_owner_{true}, + align_z_{align_z}, + buf_{buf_z ? alloc_storage(buf_z, align_z) : nullptr}, + buf_z_{buf_z}, + lo_pos_{0}, + hi_pos_{0} + {} + /** @brief buffer is not copyable **/ + buffer(buffer const & x) = delete; + /** @brief destructor. Release storage if owned **/ + ~buffer() { this->reset(); } + + /** @name Access methods **/ + ///@{ + + /** @brief start of buffer memory **/ + CharT * buf() const { return buf_; } + /** @brief buffer size (number of characters) **/ + size_type buf_z() const { return buf_z_; } + /** @brief current start position within buffer **/ + size_type lo_pos() const { return lo_pos_; } + /** @brief current end position within buffer **/ + size_type hi_pos() const { return hi_pos_; } + + ///@} + + /** @brief readonly access to a single buffer element. + + Relative to start of buffer (ignores current consume position) + **/ + CharT const & operator[](size_type i) const { return buf_[i]; } + + /** @brief return span for current buffer contents **/ + span_type contents() const { return span_type(buf_ + lo_pos_, buf_ + hi_pos_); } + /** @brief returns span for writable buffer contents (unused space following produce position) **/ + span_type avail() const { return span_type(buf_ + hi_pos_, buf_ + buf_z_); } + + /** @brief @c true iff buffer is empty **/ + bool empty() const { return lo_pos_ == hi_pos_; } + + + /** + @brief update buffer produce position, after (independently) writing contents of span to it + + @pre left endpoint of @p span equals buffer produce position (@c .hi_pos) + @pre right endpoint of @p span within bounds of buffer memory range + @post right endpoint of @p span equals buffer produce position.
+ **/ + void produce(span_type const & span) { + assert(span.lo() == buf_ + hi_pos_); + /* documented precondition: produced range must stay inside buffer memory */ + assert(span.hi() <= buf_ + buf_z_); + + hi_pos_ += span.size(); + } + + /** + @brief update buffer consume position, when done with contents of span + + @pre left endpoint of @p span equals buffer consume position (@c .lo_pos) + @pre right endpoint of @p span within bounds of buffer memory range + @post Either + buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0. + buffer is non-empty, right endpoint of @p span equals new buffer consume position. + **/ + void consume(span_type const & span) { + if (span.size()) { + assert(span.lo() == buf_ + lo_pos_); + + lo_pos_ += span.size(); + } else { + /* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos, + * we don't want to blow up when called with an empty span -- argument + * may represent some pre-reset location in buffer + */ + } + + if (lo_pos_ == hi_pos_) { + lo_pos_ = 0; + hi_pos_ = 0; + } + } + + /** + @brief allocate buffer with desired amount of memory + + Any storage previously owned by this buffer is released first. + + @param buf_z desired buffer size + @param align_z alignment; buffer memory will be aligned on this byte-boundary. + **/ + void alloc(size_type buf_z, size_type align_z = sizeof(char)) { + /* properly reset (+ discard) any existing state */ + this->reset(); + + is_owner_ = true; + /* remember alignment: it is needed again to release the storage */ + align_z_ = align_z; + if (buf_z) + buf_ = alloc_storage(buf_z, align_z); + buf_z_ = buf_z; + lo_pos_ = 0; + hi_pos_ = 0; + } + + /** + @brief attach buffer to (unowned) range of @p buf_z bytes starting at @p buf[0] + + Buffer is not responsible for managing storage. + + @post + 1. buffer is empty + @post + 2. 
buffer read position = buffer write position = 0 + **/ + void setbuf(CharT * buf, size_type buf_z) { + /* properly reset (+ discard) any existing state */ + this->reset(); + + is_owner_ = false; + lo_pos_ = 0; + hi_pos_ = 0; + buf_ = buf; + buf_z_ = buf_z; + } + + /** + @brief revert buffer to empty state and possibly zero it + + @param zero_buffer_flag Zero buffer contents iff this is true + + @post + 1.
buffer is empty + @post + 2. buffer read position = buffer write position = 0 + **/ + void clear2empty(bool zero_buffer_flag) { + if (buf_ && zero_buffer_flag) + explicit_bzero(buf_, buf_z_ * sizeof(CharT)); + + lo_pos_ = 0; + hi_pos_ = 0; + } + + /** + @brief swap representation with another buffer instance. + **/ + void swap (buffer & x) { + std::swap(is_owner_, x.is_owner_); + std::swap(align_z_, x.align_z_); + std::swap(buf_, x.buf_); + std::swap(buf_z_, x.buf_z_); + std::swap(lo_pos_, x.lo_pos_); + std::swap(hi_pos_, x.hi_pos_); + } + + /** + @brief reset buffer to an empty state and recover owned storage + **/ + void reset() { + if (is_owner_ && buf_) + free_storage(buf_, align_z_); + + is_owner_ = false; + align_z_ = sizeof(char); + buf_ = nullptr; + buf_z_ = 0; + lo_pos_ = 0; + hi_pos_ = 0; + } + + /** + @brief move-assignment operator. + + Storage owned by this buffer before the assignment is released. + Self-assignment is a no-op. + + @param x right-hand-side to move from. + + @post + @p x is in a valid, empty, non-owning state + **/ + buffer & operator= (buffer && x) { + if (this != &x) { + /* release our own storage first, otherwise it would leak */ + this->reset(); + /* *this is now empty + non-owning, so swapping leaves x in exactly that state */ + this->swap(x); + } + + return *this; + } + + /** @brief buffer is not assignable */ + buffer & operator= (buffer & x) = delete; + + private: + /** @brief obtain uninitialized storage for @p z elements, aligned on an @p align_z byte boundary **/ + static CharT * alloc_storage(size_type z, size_type align_z) { + return static_cast<CharT *>(::operator new[](z * sizeof(CharT), std::align_val_t(align_z))); + } + + /** @brief release storage obtained from @c alloc_storage using the same @p align_z. + + Aligned storage must go back through the aligned form of operator delete[]; + a plain @c delete[] expression on it is undefined behavior. 
+ **/ + static void free_storage(CharT * p, size_type align_z) { + ::operator delete[](p, std::align_val_t(align_z)); + } + + private: + /** @brief true iff buffer is responsible for freeing storage at @c buf_ **/ + bool is_owner_ = false; + /** @brief alignment (in bytes) requested for owned storage; needed again when releasing it **/ + size_type align_z_ = sizeof(char); + /** @brief buffer contents.
buffer memory comprises @c buf_[0] up to (but not including) @c buf_[buf_z_] **/ + CharT * buf_ = nullptr; + /** @brief buffer size (in units of CharT) **/ + size_type buf_z_ = 0; + + /** @brief buffer read (consume) position + + @invariant + 0 <= lo_pos_ <= hi_pos_ <= buf_z_ + **/ + size_type lo_pos_ = 0; + /** @brief buffer write (produce) position + + @invariant + 0 <= lo_pos_ <= hi_pos_ <= buf_z_ + **/ + size_type hi_pos_ = 0; + }; + + /** @brief Overload for @c swap, so that @c buffer is swappable **/ + template + inline void + swap(buffer & lhs, buffer & rhs) { + lhs.swap(rhs); + } + } /*namespace scm*/ +} /*namespace xo*/ + +/* end buffer.hpp */ diff --git a/xo-tokenizer/include/xo/tokenizer/span.hpp b/xo-tokenizer/include/xo/tokenizer/span.hpp new file mode 100644 index 00000000..22695ec5 --- /dev/null +++ b/xo-tokenizer/include/xo/tokenizer/span.hpp @@ -0,0 +1,160 @@ +/** @file span.hpp **/ + +#pragma once + +#include +#include +#include + +namespace xo { + namespace scm { + /** @class span span.hpp + * + * @brief Represents a contiguous memory range, without ownership. + * + * @tparam CharT type for elements referred to by this span. + **/ + template + class span { + public: + /** @brief typealias for span size (in units of CharT) **/ + using size_type = std::uint64_t; + + public: + /** @brief create span for the contiguous memory range [@p lo, @p hi) **/ + span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} + + /** @brief create span for C-style string @p cstr **/ + static span from_cstr(const CharT * cstr) { + CharT * lo = cstr; + CharT * hi = cstr ? cstr + strlen(cstr) : nullptr; + + return span(lo, hi); + } + + ///@{ + + /** @name getters **/ + + CharT * lo() const { return lo_; } /* get member span::lo_ */ + CharT * hi() const { return hi_; } /* get member span::hi_ */ + + ///@} + + /** @brief create new span over supplied type, + * with identical (possibly misaligned) endpoints. + * + * @warning + * 1. New span uses exactly the same memory addresses.
+ * Endpoint pointers may not be aligned. + * 2. Implementation assumes code compiled with + * @code -fno-strict-aliasing @endcode enabled. + * + * @tparam OtherT element type for new span + **/ + template + span + cast() const { return span(reinterpret_cast(lo_), + reinterpret_cast(hi_)); } + + /** @brief create span including the first @p z members of this span. **/ + span prefix(size_type z) const { return span(lo_, lo_ + z); } + + /** @brief create span representing prefix up to (but not including) @p *p + **/ + span prefix_upto(CharT * p) const { + if (p <= hi_) + return span(lo_, p); + else + return span(lo_, hi_); + } + + /** @brief create span with first @p z members of this span removed **/ + span after_prefix(size_type z) const { + if (lo_ + z > hi_) + z = hi_ - lo_; + + return span(lo_ + z, hi_); + } + + /** @brief create span with @p prefix of this span removed **/ + span after_prefix(const span & prefix) const { + assert(prefix.lo() == lo_); + if (prefix.lo() != lo_) { + throw std::runtime_error + ("after_prefix: expected prefix of this span"); + } + + return after_prefix(prefix.size()); + } + + /** @brief create span starting with position p **/ + span suffix_from(CharT * p) const { + if ((lo_ <= p) && (p <= hi_)) + return span(p, hi_); + else + return span(hi_, hi_); + } + + /** @brief true iff this span is empty (comprises 0 elements). **/ + bool empty() const { return lo_ == hi_; } + /** @brief report the number of elements (of type CharT) in this span. 
**/ + size_type size() const { return hi_ - lo_; } + + span & operator+=(const span & x) { + if (hi_ == x.lo_) { + hi_ = x.hi_; + } else { + assert(false); + } + + return *this; + } + + /** print representation for this span on stream @p os **/ + void print(std::ostream & os) const { + os << ""; + } + + private: + ///@{ + + /** @brief start of span + Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) + **/ + CharT * lo_ = nullptr; + /** @brief end of span + Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) + **/ + CharT * hi_ = nullptr; + + ///@} + }; /*span*/ + + template + inline bool + operator==(const span & lhs, const span & rhs) { + return ((lhs.lo() == rhs.lo()) + && (lhs.hi() == rhs.hi())); + } + + template + inline bool + operator!=(const span & lhs, const span & rhs) { + return ((lhs.lo() != rhs.lo()) + || (lhs.hi() != rhs.hi())); + } + + template + inline std::ostream & + operator<<(std::ostream & os, + const span & x) { + x.print(os); + return os; + } + } /*namespace scm*/ +} /*namespace xo*/ diff --git a/xo-tokenizer/include/xo/tokenizer/token.hpp b/xo-tokenizer/include/xo/tokenizer/token.hpp new file mode 100644 index 00000000..988b4976 --- /dev/null +++ b/xo-tokenizer/include/xo/tokenizer/token.hpp @@ -0,0 +1,359 @@ +/* file token.hpp + * + * author: Roland Conybeare, Jul 2024 + */ + +#pragma once + +#include "tokentype.hpp" +#include "xo/indentlog/print/tag.hpp" +#include +#include +#include +#include + +namespace xo { + namespace scm { + namespace detail { + /* compute a * b^p, p >= 0 */ + constexpr double + pow_aux(double a, double b, int p) { + while (p > 0) { + if (p % 2 == 1) { + /* a * b^p = a * b^(2q + 1) = (a*b) * b^(2q) */ + a *= b; + p -= 1; + } else { + /* a * b^p = a * b^(2q) = a * (b^2)^q */ + b = b * b; + p /= 2; + } + } + + /* a * b^0 = a */ + return a; + } + + constexpr double + pow10(int p) { + if (p >= 0) + return pow_aux(1.0, 10.0, p); + else + return 1.0 / pow_aux(1.0, 10.0,
-p); + } + } + + template + class token { + public: + token() = default; + token(tokentype tk_type, const std::string & text = "") + : tk_type_{tk_type}, text_{text} {} + + static token invalid() { return token(); } + static token i64_token(const std::string & txt) { + return token(tokentype::tk_i64, txt); + } + static token f64_token(const std::string & txt) { + return token(tokentype::tk_f64, txt); + } + static token string_token(const std::string & txt) { + return token(tokentype::tk_string, txt); + } + static token symbol_token(const std::string & txt) { + return token(tokentype::tk_symbol, txt); + } + static token leftangle() { return token(tokentype::tk_leftangle); } + static token rightangle() { return token(tokentype::tk_rightangle); } + static token leftparen() { return token(tokentype::tk_leftparen); } + static token rightparen() { return token(tokentype::tk_rightparen); } + static token leftbracket() { return token(tokentype::tk_leftbracket); } + static token rightbracket() { return token(tokentype::tk_rightbracket); } + static token leftbrace() { return token(tokentype::tk_leftbrace); } + static token rightbrace() { return token(tokentype::tk_rightbrace); } + static token dot() { return token(tokentype::tk_dot); } + static token comma() { return token(tokentype::tk_comma); } + static token colon() { return token(tokentype::tk_colon); } + static token doublecolon() { return token(tokentype::tk_doublecolon); } + static token semicolon() { return token(tokentype::tk_semicolon); } + static token singleassign() { return token(tokentype::tk_singleassign); } + static token assign_token() { return token(tokentype::tk_assign); } + static token yields() { return token(tokentype::tk_yields); } + + static token star_token() { return token(tokentype::tk_star); } + + static token type() { return token(tokentype::tk_type); } + static token def() { return token(tokentype::tk_def); } + static token lambda() { return token(tokentype::tk_lambda); } + static token 
if_token() { return token(tokentype::tk_if); } + static token let() { return token(tokentype::tk_let); } + static token in() { return token(tokentype::tk_in); } + static token end() { return token(tokentype::tk_end); } + + tokentype tk_type() const { return tk_type_; } + const std::string & text() const { return text_; } + + bool is_valid() const { return tk_type_ != tokentype::tk_invalid; } + bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; } + + /** expect input matching + * [+|-][0-9][0-9]* + **/ + std::int64_t i64_value() const; + /** expect input matching + * [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]* + **/ + double f64_value() const; + + /** print human-readable token representation on stream @p os **/ + void print(std::ostream & os) const; + + private: + /** category for this token **/ + tokentype tk_type_ = tokentype::tk_invalid; + + /** characters comprising this token. + * only provided for certain token types: + * + * tk_i64 + * tk_f64 + * tk_string + * tk_symbol + **/ + std::string text_; + }; /*token*/ + + template + std::int64_t + token::i64_value() const { + if (tk_type_ != tokentype::tk_i64) { + throw (std::runtime_error + (tostr("token::i64_value", + ": token with type tk found where tk_i64 expected", + xtag("tk", tk_type_)))); + } + + if (text_.empty()) { + throw (std::runtime_error + (tostr("token::i64_value", + ": unexpected empty input string for tk_i64 token"))); + } + + int sign = 1; + int value = 0; + { + auto ix = text_.begin(); + auto end_ix = text_.end(); + + CharT ch = *ix; + + if (ch == '+') { + ++ix; + } else if (ch == '-') { + sign = -1; + ++ix; + } + + if (ix == end_ix) { + throw (std::runtime_error + (tostr("token::i64_value", + ": input text found where at least one digit expected", + xtag("text", text_)))); + } + + for (; ix != end_ix; ++ix) { + CharT ch = *ix; + + if ((ch >= '0') && (ch <= '9')) { + value *= 10; + value += (ch - '0'); + } else { + throw (std::runtime_error + (tostr("token::i64_value", + ": unexpected 
char ch in integer token", + xtag("ch", ch)))); + } + } + } + + return sign * value; + } /*i64_value*/ + + template + double + token::f64_value() const { + if (tk_type_ != tokentype::tk_f64) { + throw (std::runtime_error + (tostr("token::f64_value", + ": token with type tk found where tk_f64 expected", + xtag("tk", tk_type_)))); + } + + if (text_.empty()) { + throw (std::runtime_error + (tostr("token::f64_value", + ": unexpected empty input string for tk_f64 token"))); + } + + int sign = 1; + /* integer representing denormalized unsigned mantissa + * (mantissa scaled by smallest power of 10 sufficient to make + * it an integer) + */ + std::int64_t mantissa = 0; + /* counts #of digits to the right of decimal point '.' */ + int rh_digits = 0; + /* sign of exponent */ + int exp_sign = 1; + /* value of exponenct = integer to the right of 'e' or 'E' */ + int exponent = 0; + + /* floating-point value will represent + * sign * mantissa * 10^(sign*exponent - rh_digits) + */ + { + auto ix = text_.begin(); + auto end_ix = text_.end(); + + CharT ch = *ix; + + if (ch == '+') { + ++ix; + } else if (ch == '-') { + sign = -1; + ++ix; + } + + if (ix == end_ix) { + throw (std::runtime_error + (tostr("token::f64_value", + ": input text found where at least one digit expected", + xtag("text", text_)))); + } + + /* true iff decimal point '.' 
present in mantissa */ + bool have_decimal_point = false; + /* true iff exponent prefix 'e' or 'E' present */ + //bool have_exponent = false; + /* counts number of digits in mantissa + * (both before and after, but not including, any decimal point + */ + int m_digits = 0; + /* digits to the left of decimal point */ + int lh_digits = 0; + + /* loop over mantissa digits */ + for (; ix != end_ix; ++ix) { + CharT ch = *ix; + + if (ch == '.') { + if (have_decimal_point) { + throw (std::runtime_error + (tostr("token::f64_value", + ": input text found where at most one decimal point expected", + xtag("text", text_)))); + } + + have_decimal_point = true; + lh_digits = m_digits; + } else if ((ch >= '0') && (ch <= '9')) { + mantissa *= 10; + mantissa += (ch - '0'); + ++m_digits; + } else if (ch == 'e' || ch == 'E') { + //have_exponent = true; + break; // done with mantissa + } else { + throw (std::runtime_error + (tostr("token::i64_value", + ": unexpected char ch in integer token", + xtag("ch", ch)))); + } + } + + if (have_decimal_point) + rh_digits = m_digits - lh_digits; + + if (ix != end_ix) { + /* continue to read exponent */ + + /* skip e|E */ + ++ix; + + if (ix == end_ix) { + throw (std::runtime_error + (tostr("token::f64_value", + ": on input text, expect at least one digit following exponent marker e|E", + xtag("text", text_)))); + } + + CharT ch = *ix; + + if (ch == '+') { + ++ix; /*skip*/ + } else if (ch == '-') { + exp_sign = -1; + ++ix; + } + + for (; ix != end_ix; ++ix) { + CharT ch = *ix; + + if ((ch >= '0') && (ch <= '9')) { + exponent *= 10; + exponent += (ch - '0'); + } else { + throw (std::runtime_error + (tostr("token::f64_value", + "; on input text, expect only digits following" + " (possibly signed) exponenct marker", + xtag("text", text_)))); + } + } + } + } + + /* floating-point value will represent + * sign * mantissa * 10^(sign*exponent - rh_digits) + */ + + double mantissa_f64 = sign * mantissa; + +#ifdef OBSOLETE_DEBUG + std::cerr << xtag("text", 
text_) + << xtag("rh_digits", rh_digits) + << xtag("mantissa_f64", mantissa_f64) + << xtag("exp_sign", exp_sign) + << xtag("exponent", exponent) + << std::endl; +#endif + + double retval = (mantissa_f64 + * detail::pow10((exp_sign * exponent) + - rh_digits)); + + return retval; + } /*f64_value*/ + + template + void + token::print(std::ostream & os) const { + os << ""; + } /*print*/ + + template + inline std::ostream & + operator<< (std::ostream & os, + const token & tk) + { + tk.print(os); + return os; + } + } /*Namespace scm*/ +} /*namespace xo*/ + + +/* end token.hpp */ diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp new file mode 100644 index 00000000..ba340b1a --- /dev/null +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -0,0 +1,775 @@ +/* file tokenizer.hpp + * + * author: Roland Conybeare, Jul 2024 + */ + +#pragma once + +#include "token.hpp" +#include "span.hpp" +#include "xo/indentlog/scope.hpp" +#include + +namespace xo { + namespace scm { + /** + * Use: + * @code + * using tokenizer_type = tokenizer; + * using span_type = tokenizer_type::span_type; + * + * tokenizer_type tkz; + * span_type input = ...; + * + * while !input.empty() { + * auto res = tkz.scan(input); + * const auto & tk = res.first; + * + * // do something with tk if tk.is_valid() + * + * input = input.after_prefix(res.second); + * } + * + * if endofinput { + * auto tk = tzk.notify_eof() + * + * // do something with tk if tk.is_valid() + * } + * + * // expect !tkz.has_prefix() + * + * @endcode + **/ + template + class tokenizer { + public: + using token_type = token; + using span_type = span; + using scan_result = std::pair; + + public: + tokenizer() = default; + + /** identifies whitespace chars. + * These are chars that do not belong to any token. + * They are not permitted to appear within + * a symbol or string token. + * Appearance of a whitespace char forces completion of + * preceding token. 
+ **/ + bool is_whitespace(CharT ch) const; + + /** identifies punctuation chars. + * These are chars that are not permitted to appear within + * a symbol token. Instead they force completion of + * a preceding token, and start a new token with themselves + **/ + bool is_1char_punctuation(CharT ch) const; + + /** more-relaxed version of is_1char_punctuation. + * Chars that are not permitted to appear within a symbol token, + * but may form a token when combined with the next character + **/ + bool is_2char_punctuation(CharT ch) const; + + /** true if tokenizer contains stored prefix of + * possibly-incomplete token + **/ + bool has_prefix() const { return !prefix_.empty(); } + + /** assemble token from text @p token_text + **/ + token_type assemble_token(const span_type & token_text) const; + + /** scan for next input token, given @p input. + * Note tokenizer can consume input (e.g. whitespace) + * without completing a token + * + * @return {parsed token, consumed span} + **/ + scan_result scan(const span_type & input); + + /** When eof is false, same as scan(input). + * When eof is true and scan(input) does not report a token, + * return notify_eof() + **/ + scan_result scan2(const span_type & input, bool eof); + + /** notify end of input, resolve any stored input **/ + token_type notify_eof(); + + private: + /** Accumulate partial token here. + * This will happen if input sent to @ref tokenizer::scan + * ends without a determinate token boundary.
+ **/ + std::string prefix_; + }; /*tokenizer*/ + + template + bool + tokenizer::is_whitespace(CharT ch) const { + switch(ch) { + case ' ': return true; + case '\t': return true; + case '\n': return true; + case '\r': return true; + } + + return false; + } + + template + bool + tokenizer::is_1char_punctuation(CharT ch) const { + switch(ch) { + case '<': + return true; + case '>': + return true; + case '(': + return true; + case ')': + return true; + case '[': + return true; + case ']': + return true; + case '{': + return true; + case '}': + return true; + case ',': + return true; + case ';': + return true; + case ':': + /* can't be 1char punctuation -- can begin assignment token */ + return false; + case '=': + return true; + case '-': + /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e-9 */ + return false; + case '+': + /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */ + return false; + case '*': + /* not punctuation -- allowed in symbol */ + return false; + case '/': + /* not punctuation -- for symmetry with +,- */ + return false; + case '.': + /* can't be punctuation -- can appear inside f64 token: e.g. 
1.23 */ + return false; + } + + return false; + } + + template + bool + tokenizer::is_2char_punctuation(CharT ch) const { + switch(ch) { + case ':': + /* can begin := */ + return true; + } + + return false; + } + + template + auto + tokenizer::assemble_token(const span_type & token_text) const -> token_type + { + constexpr bool c_debug_flag = true; + + /* literal|pretty|streamlined */ + log_config::style = function_style::streamlined; + + scope log(XO_DEBUG(c_debug_flag)); + log && log(xtag("token_text", token_text)); + + tokentype tk_type = tokentype::tk_invalid; + std::string tk_text; + + const CharT * tk_start = token_text.lo(); + const CharT * tk_end = token_text.hi(); + + const CharT * ix = tk_start; + + /* switch here applies to the first character in a token */ + switch (*ix) { + case '-': + case '+': + if (token_text.size() == 1) { + /* standalone '+' or '-' */ + if (*ix == '+') + tk_type = tokentype::tk_plus; + else if(*ix == '-') + tk_type = tokentype::tk_minus; + } + + /** fall through to numeric literal code below **/ + ; + case '.': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + /* examples of valid floating-point numbers: + * .0 + * 1e0 + * 1e + * 0. + * +1e0 + * -1e0 + * +1E+2 + * -1E+2 + * -0.123e-10 + * non-examples: + * . + * - + * + + * e0 + * .e0 + * -.e-0 + * +.e+0 + * + * in particular: to be recognized as a number, + * must contain at least one digit + */ + + log && log("possible number-token"); + + /* true if initial sign -/+ encountered */ + bool sign_flag = false; + /* true if '.' encountered */ + bool period_flag = false; + /* true if 'e' | 'E' encountered. 
+ */ + bool exponent_flag = false; + /* true when sign '-' | '+' precedes exponenct digits */ + bool exponent_sign_flag = false; + /* true when at least one digit follows exponent marker */ + bool exponent_digit_flag = false; + /* true if at least one digit encountered */ + bool number_flag = false; + + /* token will be one of: {i64, f64, dot}: */ + for(; ix != token_text.hi(); ++ix) { + if((*ix == '-') || (*ix == '+')) { + /* sign allowed: + * 1. before period and before first digit + * 2. after exponent + */ + if (!period_flag && !number_flag && !sign_flag) { + sign_flag = true; + } else if (exponent_flag && !exponent_digit_flag) { + exponent_sign_flag = true; + } else { + throw std::runtime_error + (tostr("tokenizer::assemble_token", + ": improperly placed sign indicator", + xtag("pos", ix - tk_start), + xtag("char", *ix))); + } + } else if(*ix == '.') { + if (period_flag) { + throw (std::runtime_error + (tostr("tokenizer::assemble_token", + ": duplicate decimal point", + xtag("pos", ix - tk_start), + xtag("char", *ix)))); + } + + period_flag = true; + } else if((*ix == 'e') || (*ix == 'E')) { + if (exponent_flag) { + throw (std::runtime_error + (tostr("tokenizer::assemble_token", + ": duplicate exponent marker", + xtag("pos", ix - tk_start), + xtag("char", *ix)))); + } + + exponent_flag = true; + } else if(isdigit(*ix)) { + if (exponent_flag) { + /* need digit before exponent to recognize as number */ + exponent_digit_flag = true; + } else { + number_flag = true; + } + } else { + /* invalid input */ + throw (std::runtime_error + (tostr("tokenizer::assemble_token", + ": unexpected character in numeric constant", + xtag("pos", ix - tk_start), + xtag("char", *ix)))); + } + } + + if (number_flag) { + if (period_flag || exponent_flag) { + tk_type = tokentype::tk_f64; + } else { + tk_type = tokentype::tk_i64; + } + } else if (period_flag && !exponent_flag) { + tk_type = tokentype::tk_dot; + } else { + /* not a valid token */ + } + + log && log(xtag("sign_flag", 
sign_flag)); + log && log(xtag("period_flag", period_flag), + xtag("exponent_flag", exponent_flag), + xtag("exponent_sign_flag", exponent_sign_flag), + xtag("number_flag", number_flag)); + log && log(xtag("tk_type", tk_type)); + + break; + } + case '*': + if (token_text.size() == 1) { + /* standalone '*' */ + tk_type = tokentype::tk_star; + ++ix; + } else { + /* '*' isn't punctuation -- but may allow appearance in a longer token + * + * thinking that x*y is a symbol with an embedded '*' character; + * in particular want to support kebab-case symbols like 'foo-config' + */ + } + break; + case '/': + if (token_text.size() == 1) { + /* standalone '/' */ + tk_type = tokentype::tk_slash; + ++ix; + } + break; + case '"': + { + log && log("recognize string-token"); + + tk_type = tokentype::tk_string; + + tk_text.reserve(token_text.hi() - token_text.lo()); + + ++ix; /*skip initial " char*/ + + for (; ix != token_text.hi(); ++ix) { + log && log(xtag("*ix", *ix)); + + bool endofstring = false; + + switch(*ix) { + case '"': + endofstring = true; + + /* skip final " char, don't capture */ + ++ix; + + break; + case '\\': + /* skip escape char, don't capture */ + ++ix; + + if (ix == token_text.hi()) { + throw std::runtime_error + (tostr("tokenizer::assemble_token", + ": malformed string literal", + xtag("input", std::string_view(token_text.lo(), + token_text.hi())))); + } + + switch(*ix) { + case '\\': + log && log(xtag("*ix", *ix), xtag("escaped", "t")); + tk_text.push_back(*ix); + break; + case 'n': + log && log(xtag("*ix", *ix), xtag("newline", "t")); + tk_text.push_back('\n'); + break; + case 't': + log && log(xtag("*ix", *ix), xtag("tab", "t")); + tk_text.push_back('\t'); + break; + case 'r': + log && log(xtag("*ix", *ix), xtag("cr", "t")); + tk_text.push_back('\r'); + break; + case '"': + log && log(xtag("*ix", *ix), xtag("quote", "t")); + tk_text.push_back('"'); + break; + default: + throw std::runtime_error + (tostr("tokenizer::assemble_token", + ": unexpected \\-escaped 
char", + xtag("char", *ix))); + } + break; + default: + tk_text.push_back(*ix); + break; + } + + if (endofstring) + break; + } + + if (ix != token_text.hi()) { + throw std::runtime_error + (tostr("tokenizer::assemble_token", + ": expected \" to end string literal", + xtag("input", std::string_view(token_text.lo(), + token_text.hi())))); + } + + log && log(tostr("tokenizer::assemble_token", + xtag("tk_text", tk_text))); + + break; + } + case 'a': case 'A': + case 'b': case 'B': + case 'c': case 'C': + case 'd': case 'D': + case 'e': case 'E': + case 'f': case 'F': + case 'g': case 'G': + case 'h': case 'H': + case 'i': case 'I': + case 'j': case 'J': + case 'k': case 'K': + case 'l': case 'L': + case 'm': case 'M': + case 'n': case 'N': + case 'o': case 'O': + case 'p': case 'P': + case 'q': case 'Q': + case 'r': case 'R': + case 's': case 'S': + case 't': case 'T': + case 'u': case 'U': + case 'v': case 'V': + case 'w': case 'W': + case 'x': case 'X': + case 'y': case 'Y': + case 'z': case 'Z': + { + /* symbol/identifier must begin with a letter? + * we want to accept some other chars too. + * specifically want to allow identifiers: + * this-is-the-way + * this+is+also+the+way + * how/much/is/that/doggy + * put*an*asterisk*in*that + * something%special% + * + * like pure lisp, we don't allow: + * - identifier beginning with digit + * - period . 
+ * + * unlike pure lisp, we don't allow anywhere in a symbol: + * - colon : + * - semicolon ; + * - comma , + * + * also we don't allow symbols to begin with special chars + */ + + tk_type = tokentype::tk_symbol; + break; + } + case '<': + tk_type = tokentype::tk_leftangle; + ++ix; + break; + case '>': + tk_type = tokentype::tk_rightangle; + ++ix; + break; + case '(': + tk_type = tokentype::tk_leftparen; + ++ix; + break; + case ')': + tk_type = tokentype::tk_rightparen; + ++ix; + break; + case '[': + tk_type = tokentype::tk_leftbracket; + ++ix; + break; + case ']': + tk_type = tokentype::tk_rightbracket; + ++ix; + break; + case '{': + tk_type = tokentype::tk_leftbrace; + ++ix; + break; + case '}': + tk_type = tokentype::tk_rightbrace; + ++ix; + break; + case ',': + tk_type = tokentype::tk_comma; + ++ix; + break; + case ';': + tk_type = tokentype::tk_semicolon; + ++ix; + break; + case ':': + { + log && log("colon or assignment token"); + /* lookahead must stay inside [tk_start, tk_end): only peek when a 2nd char exists */ + if (((ix + 1) != tk_end) && (*(ix + 1) == '=')) { + tk_type = tokentype::tk_assign; + ++ix; + ++ix; + } else { + tk_type = tokentype::tk_colon; + ++ix; + } + break; + } + case '=': + tk_type = tokentype::tk_singleassign; + ++ix; + break; + default: + break; + } + /* ix may equal tk_end here (numeric scan consumed whole token, e.g. ".e0"); report first char then */ + if (tk_type == tokentype::tk_invalid) { + throw std::runtime_error(tostr("tokenizer::assemble_token", + ": unexpected input x", + xtag("x", (ix != tk_end) ? *ix : *tk_start))); + } + + if ((tk_type == tokentype::tk_i64) + || (tk_type == tokentype::tk_f64) + || (tk_type == tokentype::tk_symbol)) + { + /* re-parse in token::i64_value() / token::f64_value() */ + tk_text = std::string(tk_start, tk_end); + } else if (tk_type == tokentype::tk_string) { + ; /* nothing to do here -- desired tk_text already constructed */ + } + + if (tk_type == tokentype::tk_symbol) { + /* check for keywords */ + + bool keep_text = false; + + if (tk_text == "type") { + tk_type = tokentype::tk_type; + } else if (tk_text == "def") { + tk_type = tokentype::tk_def; + } else if (tk_text == "lambda") { + tk_type = tokentype::tk_lambda; + } else if
(tk_text == "if") { + tk_type = tokentype::tk_if; + } else if (tk_text == "let") { + tk_type = tokentype::tk_let; + } else if (tk_text == "in") { + tk_type = tokentype::tk_in; + } else if (tk_text == "end") { + tk_type = tokentype::tk_end; + } else { + /* keep as symbol */ + keep_text = true; + } + + if (!keep_text) + tk_text.clear(); + } + + return token_type(tk_type, std::move(tk_text)); + } /*assemble_token*/ + + template + auto + tokenizer::scan(const span_type & input) -> scan_result + { + constexpr bool c_debug_flag = true; + scope log(XO_DEBUG(c_debug_flag)); + + log && log(xtag("input", input)); + + const CharT * ix = input.lo(); + + /* skip whitespace; bounds test comes first so *ix is never read at input.hi() */ + while ((ix != input.hi()) && is_whitespace(*ix)) + ++ix; + + if(ix == input.hi()) { + /* no-op */ + return { + token_type::invalid(), + input.prefix_upto(ix) + }; + } + + /* here: *ix is not whitespace */ + + auto whitespace = input.prefix_upto(ix); + + log && log(xtag("whitespace.size", whitespace.size())); + + /* tk_start points to beginning of token + * (after any whitespace) + */ + const CharT * tk_start = ix; + + if (is_1char_punctuation(*ix)) { + /* 1-character token */ + ++ix; + } else if (is_2char_punctuation(*ix)) { + CharT ch1 = *ix; + + (void)ch1; + + ++ix; + + if (ix == input.hi()) { + /* need more input to know if/when token complete */ + this->prefix_ += std::string(tk_start, input.hi()); + + log && log(xtag("captured-prefix", this->prefix_)); + } else { + CharT ch2 = *ix; + + if (((ch2 >= '0') && (ch2 <= '9')) + || ((ch2 >= 'A') && (ch2 <= 'Z')) + || ((ch2 >= 'a') && (ch2 <= 'z'))) + { + /* treat as 1 char punctuation */ + ; + } else { + /* include next char */ + ++ix; + } + } + } else if (*ix == '"') { + bool complete_flag = false; + + /* 1. embedded space/tab allowed in string literal. + * 2. embedded newline/cr not allowed.
+ */ + CharT prev_ch = '"'; + + ++ix; + + for (; ix != input.hi(); ++ix) { + /* looking for unescaped " char to end literal */ + if (*ix == '"') { + if (prev_ch != '\\') { + ++ix; /* include terminating " for assemble_token */ + complete_flag = true; + break; + } + } else if ((*ix == '\n') || (*ix == '\r')) { + throw std::runtime_error + (tostr("tokenizer::scan", + ": must use \\n or \\r to encode newline/cr in" + " string literal")); + } + /* a backslash that is itself escaped must not escape the next char (so "a\\" terminates) */ + prev_ch = ((prev_ch == '\\') && (*ix == '\\')) ? CharT('"') : *ix; + } + + if (!complete_flag) { + /* need more input to know if/when token complete */ + this->prefix_ += std::string(tk_start, input.hi()); + + log && log(xtag("captured-prefix", this->prefix_)); + } + } else { + /* scan until: + * - whitespace + * - punctuation + */ + for (; ix != input.hi(); ++ix) { + if (is_whitespace(*ix) + || is_1char_punctuation(*ix) + || is_2char_punctuation(*ix)) + { + break; + } + } + + if (ix == input.hi()) { + /* need more input to know if/when token complete */ + this->prefix_ += std::string(tk_start, input.hi()); + + log && log(xtag("captured-prefix", this->prefix_)); + } + } + + auto token_span = input.after_prefix(whitespace).prefix_upto(ix); + + token tk + = (this->prefix_.empty() + ? assemble_token(token_span) + : token_type(tokentype::tk_invalid)); + + return scan_result + { tk, input.prefix(whitespace.size() + token_span.size()) }; + } /*scan*/ + + template + auto + tokenizer::scan2(const span_type & input, bool eof) -> scan_result { + auto sr = this->scan(input); + + if (!sr.first.is_valid() && eof) { + sr.first = this->notify_eof(); + /* always consume remainder of input here. + * ambiguous prefix can represent at most one token + */ + sr.second = input; + } + + return sr; + } + + template + auto + tokenizer::notify_eof() -> token_type { + constexpr bool c_debug_flag = true; + + scope log(XO_DEBUG(c_debug_flag)); + + token tk + = (this->prefix_.empty() + ?
token_type(tokentype::tk_invalid) + : assemble_token(span_type(&prefix_[0], + &prefix_[prefix_.size()]))); + + this->prefix_.clear(); + + return tk; + } /*notify_eof*/ + } /*namespace scm*/ +} /*namespace xo*/ + +/* end tokenizer.hpp */ diff --git a/xo-tokenizer/include/xo/tokenizer/tokentype.hpp b/xo-tokenizer/include/xo/tokenizer/tokentype.hpp new file mode 100644 index 00000000..6da013d9 --- /dev/null +++ b/xo-tokenizer/include/xo/tokenizer/tokentype.hpp @@ -0,0 +1,158 @@ +/** @file tokentype.hpp + * + * author: Roland Conybeare, Jul 2024 + **/ + +#pragma once + +#include "xo/indentlog/print/tag.hpp" // for STRINGIFY +#include + +namespace xo { + namespace scm { + /** @enum tokentype + * @brief enum to identify different schematica input token types + * + * Schematica code examples: + * + * type point :: { xcoord : f64, ycoord: f64 }; + * type matrix :: array; // 2-d array + * + * decl hypot(x : f64, y : f64) -> f64; + * + * def hypot(x : f64, y : f64) { + * let + * x2 = (x * x); + * y2 = (y * y); + * hypot2 = (x2 + y2); + * in + * sqrt(hypot2); + * }; + * + * def someconst 4; + * + * def foo(v : vec) { + * def (pi : f64) = 3.1415926; + * def (h : (f64,f64) -> f64) = hypot; + * + * h = hypot3; + * }; + * + * def matrixproduct(x : matrix, y : matrix) { + * [i,j : x.row(i) * y.col(j)]; + * }; + **/ + enum class tokentype { + /** sentinel value **/ + tk_invalid = -1, + + /** an integer constant (signed 64-bit integer) **/ + tk_i64, + + /** a 64-bit floating-point constant **/ + tk_f64, + + /** a string literal **/ + tk_string, + + /** a symbol **/ + tk_symbol, + + /** left-hand parenthesis '(' **/ + tk_leftparen, + + /** right-hand parenthesis ')' **/ + tk_rightparen, + + /** left-hand bracket '[' **/ + tk_leftbracket, + + /** right-hand bracket ']' **/ + tk_rightbracket, + + /** left-hand brace '{' **/ + tk_leftbrace, + + /** right-hand brace '}' **/ + tk_rightbrace, + + /** left-hand angle bracket '<' **/ + tk_leftangle, + + /** right-hand angle bracket '>' **/ + 
tk_rightangle, + + /** dot '.' **/ + tk_dot, + + /** comma ',' **/ + tk_comma, + + /** colon ':' **/ + tk_colon, + + /** double-colon '::' **/ + tk_doublecolon, + + /** semi-colon ';' **/ + tk_semicolon, + + /** '=' **/ + tk_singleassign, + + /** ':=' **/ + tk_assign, + + /** '->' **/ + tk_yields, + + /** note: operators not treated as punctuation + * 'do-always' is a legal variable name, + * as is 'maybe*2', 'maybe+1', 'path/to/foo' + **/ + + /** operator '+' **/ + tk_plus, + /** operator '-' **/ + tk_minus, + /** operator '*' **/ + tk_star, + /** operator '/' **/ + tk_slash, + + /** keyword 'type' **/ + tk_type, + + /** keyword 'def' **/ + tk_def, + + /** keyword 'lambda' **/ + tk_lambda, + + /** keyword 'if' **/ + tk_if, + + /** keyword 'let' **/ + tk_let, + + /** keyword 'in' **/ + tk_in, + + /** keyword 'end' **/ + tk_end, + + n_tokentype /* comes last, counts #of entries */ + }; /*tokentype*/ + + extern char const * + tokentype_descr(tokentype tk_type); + + inline std::ostream & + operator<< (std::ostream & os, tokentype tk_type) { + os << tokentype_descr(tk_type); + return os; + } + } /*namespace scm*/ +} /*namespace xo*/ + +/* end tokentype.hpp */ diff --git a/xo-tokenizer/src/tokenizer/CMakeLists.txt b/xo-tokenizer/src/tokenizer/CMakeLists.txt new file mode 100644 index 00000000..cad846f4 --- /dev/null +++ b/xo-tokenizer/src/tokenizer/CMakeLists.txt @@ -0,0 +1,14 @@ +# tokenizer/CMakeLists.txt + +set(SELF_LIB xo_tokenizer) +set(SELF_SRCS + tokentype.cpp + token.cpp) + +xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS}) +#xo_dependency(${SELF_LIB} refcnt) +xo_dependency(${SELF_LIB} indentlog) +#xo_dependency(${SELF_LIB} subsys) +#xo_boost_dependency(${SELF_LIB}) + +# end CMakeLists.txt diff --git a/xo-tokenizer/src/tokenizer/token.cpp b/xo-tokenizer/src/tokenizer/token.cpp new file mode 100644 index 00000000..6438dee1 --- /dev/null +++ b/xo-tokenizer/src/tokenizer/token.cpp @@ -0,0 +1,9 @@ +/** @file token.cpp + *
+ * author: Roland Conybeare + **/ + +#include "token.hpp" +#include "xo/indentlog/print/tag.hpp" + +/** end token.cpp **/ diff --git a/xo-tokenizer/src/tokenizer/tokentype.cpp b/xo-tokenizer/src/tokenizer/tokentype.cpp new file mode 100644 index 00000000..b7172118 --- /dev/null +++ b/xo-tokenizer/src/tokenizer/tokentype.cpp @@ -0,0 +1,67 @@ +/* file tokentype.cpp + * + * author: Roland Conybeare + */ + +#include "tokentype.hpp" + +namespace xo { + namespace scm { + char const * + tokentype_descr(tokentype tk_type) + { +#define CASE(x) case tokentype::x: return STRINGIFY(x) + + switch(tk_type) { + CASE(tk_i64); + CASE(tk_f64); + CASE(tk_string); + CASE(tk_symbol); + CASE(tk_leftparen); + + CASE(tk_rightparen); + CASE(tk_leftbracket); + CASE(tk_rightbracket); + CASE(tk_leftbrace); + CASE(tk_rightbrace); + + CASE(tk_leftangle); + CASE(tk_rightangle); + CASE(tk_dot); + CASE(tk_comma); + CASE(tk_colon); + + CASE(tk_doublecolon); + CASE(tk_semicolon); + CASE(tk_singleassign); + CASE(tk_assign); + CASE(tk_yields); + + CASE(tk_plus); + CASE(tk_minus); + CASE(tk_star); + CASE(tk_slash); + + CASE(tk_type); + CASE(tk_def); + CASE(tk_lambda); + CASE(tk_if); + CASE(tk_let); + + CASE(tk_in); + CASE(tk_end); + + case tokentype::tk_invalid: + case tokentype::n_tokentype: + return "?tokentype"; + } + +#undef CASE + + return "???"; + } /*tokentype_descr*/ + } /*namespace scm*/ +} /*namespace xo*/ + + +/* end tokentype.cpp */ diff --git a/xo-tokenizer/utest/CMakeLists.txt b/xo-tokenizer/utest/CMakeLists.txt new file mode 100644 index 00000000..cc080294 --- /dev/null +++ b/xo-tokenizer/utest/CMakeLists.txt @@ -0,0 +1,13 @@ +# build unittest tokenizer/utest + +set(SELF_EXECUTABLE_NAME utest.tokenizer) +set(SELF_SOURCE_FILES + tokenizer_utest_main.cpp + tokenizer.test.cpp + token.test.cpp) + +xo_add_utest_executable(${SELF_EXECUTABLE_NAME} ${SELF_SOURCE_FILES}) +xo_self_dependency(${SELF_EXECUTABLE_NAME} xo_tokenizer) +xo_external_target_dependency(${SELF_EXECUTABLE_NAME} Catch2 
Catch2::Catch2) + +# end CMakeLists.txt diff --git a/xo-tokenizer/utest/token.test.cpp b/xo-tokenizer/utest/token.test.cpp new file mode 100644 index 00000000..160420b0 --- /dev/null +++ b/xo-tokenizer/utest/token.test.cpp @@ -0,0 +1,260 @@ +/* file token.test.cpp + * + * author: Roland Conybeare + */ + +#include "xo/tokenizer/token.hpp" +#include +#include + +namespace xo { + using token = xo::scm::token; + using xo::scm::tokentype; + + namespace ut { + struct testcase_i64 { + std::string text_; + bool expect_throw_; + std::int64_t expected_; + }; + + std::vector s_testcase_v = { + {"", true, 0}, + {"0", false, 0}, + {"-", true, 0}, + {"+", true, 0}, + {"-0", false, 0}, + {"+0", false, 0}, + {"1", false, 1}, + {"-1", false, -1}, + {"9", false, 9}, + {"-9", false, -9}, + {"12", false, 12}, + {"+12", false, 12}, + {"-12", false, -12}, + {"99", false, 99}, + {"-99", false, -99}, + {"123x", true, 0}, + }; + + TEST_CASE("parse-i64", "[token]") { + for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { + INFO(xtag("i_tc", i_tc)); + + auto const & testcase = s_testcase_v[i_tc]; + + token tk(tokentype::tk_i64, + testcase.text_); + + REQUIRE(tk.tk_type() == tokentype::tk_i64); + + bool throw_flag = false; + try { + std::int64_t x = tk.i64_value(); + + REQUIRE(x == testcase.expected_); + } catch (std::exception & ex) { + throw_flag = true; + } + + REQUIRE(throw_flag == testcase.expect_throw_); + } + } + + TEST_CASE("error-i64", "[token]") { + token tk(tokentype::tk_i64, "+"); + + bool throw_flag = false; + + try { + tk.i64_value(); + } catch(std::exception & ex) { + throw_flag = true; + } + + REQUIRE(throw_flag); + } + + namespace { + struct testcase_f64 { + std::string text_; + bool expect_throw_; + double expected_; + }; + + std::vector s_testcase_v = { + {"", true, 0}, + {"0", false, 0}, + {"-", true, 0}, + {"+", true, 0}, + {"-0", false, 0}, + + {"+0", false, 0}, + {"1", false, 1}, + {"-1", false, -1}, + {"9", false, 9}, + {"-9", false, -9}, + + 
{"12", false, 12}, + {"+12", false, 12}, + {"-12", false, -12}, + {"99", false, 99}, + {"-99", false, -99}, + + {"123x", true, 0}, + {"0.0", false, 0.0}, + {"0.1", false, 0.1}, + {"0.12", false, 0.12}, + {"0.123", false, 0.123}, + + {"0.1234", false, 0.1234}, + {"0.12345", false, 0.12345}, + {"0.123456", false, 0.123456}, + {"0.1234567", false, 0.1234567}, + {"0.12345678", false, 0.12345678}, + + {"0.123456789", false, 0.123456789}, + {"+0.0", false, 0.0}, + {"+0.1", false, 0.1}, + {"+0.12", false, 0.12}, + {"+0.123", false, 0.123}, + + {"+0.1234", false, 0.1234}, + {"+0.12345", false, 0.12345}, + {"+0.123456", false, 0.123456}, + {"+0.1234567", false, 0.1234567}, + {"+0.12345678", false, 0.12345678}, + + {"+0.123456789", false, 0.123456789}, + {"+0.0e0", false, 0.0}, + {"+0.1e0", false, 0.1}, + {"+0.12e0", false, 0.12}, + {"+0.123e0", false, 0.123}, + + {"+0.1234e0", false, 0.1234}, + {"+0.12345e0", false, 0.12345}, + {"+0.123456e0", false, 0.123456}, + {"+0.1234567e0", false, 0.1234567}, + {"+0.12345678e0", false, 0.12345678}, + + {"+0.123456789e0", false, 0.123456789}, + {"+0.0e1", false, 00.}, + {"+0.1e1", false, 01.}, + {"+0.12e1", false, 01.2}, + {"+0.123e1", false, 01.23}, + + {"+0.1234e1", false, 01.234}, + {"+0.12345e1", false, 01.2345}, + {"+0.123456e1", false, 01.23456}, + {"+0.1234567e1", false, 01.234567}, + {"+0.12345678e1", false, 01.2345678}, + + {"+0.123456789e1", false, 01.23456789}, + {"+0.0E1", false, 00.}, + {"+0.1E1", false, 01.}, + {"+0.12E1", false, 01.2}, + {"+0.123E1", false, 01.23}, + + {"+0.1234E1", false, 01.234}, + {"+0.12345E1", false, 01.2345}, + {"+0.123456E1", false, 01.23456}, + {"+0.1234567E1", false, 01.234567}, + {"+0.12345678E1", false, 01.2345678}, + + {"+0.123456789E1", false, 01.23456789}, + {"+0.0e9", false, 0.0}, + {"+0.1e9", false, 0.1e9}, + {"+0.12e9", false, 0.12e9}, + {"+0.123e9", false, 0.123e9}, + + {"+0.1234e9", false, 0.1234e9}, + {"+0.12345e9", false, 0.12345e9}, + {"+0.123456e9", false, 0.123456e9}, + 
{"+0.1234567e9", false, 0.1234567e9}, + {"+0.12345678e9", false, 0.12345678e9}, + + {"+0.123456789e9", false, 0.123456789e9}, + {"-0.0", false, -0.0}, + {"-0.1", false, -0.1}, + {"-0.12", false, -0.12}, + {"-0.123", false, -0.123}, + + {"-0.1234", false, -0.1234}, + {"-0.12345", false, -0.12345}, + {"-0.123456", false, -0.123456}, + {"-0.1234567", false, -0.1234567}, + {"-0.12345678", false, -0.12345678}, + + {"-0.123456789", false, -0.123456789}, + {"00.", false, 0.0}, + {"01.", false, 1.0}, + {"01.2", false, 1.2}, + {"01.23", false, 1.23}, + + {"01.234", false, 1.234}, + {"01.2345", false, 1.2345}, + {"01.23456", false, 1.23456}, + {"01.234567", false, 1.234567}, + {"01.2345678", false, 1.2345678}, + + {"01.23456789", false, 1.23456789}, + {"0.0", false, 0.0}, + {"1.2", false, 1.2}, + {"12.", false, 12.0}, + {"12.3", false, 12.3}, + + {"12.34", false, 12.34}, + {"12.345", false, 12.345}, + {"12.3456", false, 12.3456}, + {"12.34567", false, 12.34567}, + {"12.345678", false, 12.345678}, + + {"12.3456789", false, 12.3456789}, + {"01.23", false, 1.23}, + {"12.3", false, 12.3}, + {"123.", false, 123.0}, + {"123.4", false, 123.4}, + + {"123.45", false, 123.45}, + {"123.456", false, 123.456}, + {"123.4567", false, 123.4567}, + {"123.45678", false, 123.45678}, + {"123.456789", false, 123.456789}, + }; + + TEST_CASE("parse-f64", "[token]") { + for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { + auto const & testcase = s_testcase_v[i_tc]; + + INFO(tostr(xtag("i_tc", i_tc), + xtag("text", testcase.text_) + )); + + token tk(tokentype::tk_f64, + testcase.text_); + + REQUIRE(tk.tk_type() == tokentype::tk_f64); + + bool throw_flag = false; + std::string ex_msg; + + try { + double x = tk.f64_value(); + + REQUIRE(x == Approx(testcase.expected_).epsilon(1.0e-15)); + } catch (std::exception & ex) { + ex_msg = ex.what(); + + throw_flag = true; + } + + INFO(xtag("ex_msg", ex_msg)); + + REQUIRE(throw_flag == testcase.expect_throw_); + } + } + } 
/*namespace*/ + } /*namespace ut*/ +} /*namespace xo*/ + +/* end token.test.cpp */ diff --git a/xo-tokenizer/utest/tokenizer.test.cpp b/xo-tokenizer/utest/tokenizer.test.cpp new file mode 100644 index 00000000..44600e7f --- /dev/null +++ b/xo-tokenizer/utest/tokenizer.test.cpp @@ -0,0 +1,268 @@ +/* file tokenizer.test.cpp + * + * author: Roland Conybeare + */ + +#include "xo/tokenizer/tokenizer.hpp" +#include + +namespace xo { + using xo::scm::tokentype; + using token = xo::scm::token; + using xo::scm::span; + + namespace ut { + namespace { + struct testcase_tkz { + std::string input_; + bool expect_throw_; + token expected_tk_; + bool consume_all_; + }; + + std::vector + s_testcase_v = { + {"<", false, token::leftangle(), true}, + {">", false, token::rightangle(), true}, + + {"(", false, token::leftparen(), true}, + {")", false, token::rightparen(), true}, + + {"[", false, token::leftbracket(), true}, + {"]", false, token::rightbracket(), true}, + + {"{", false, token::leftbrace(), true}, + {" {", false, token::leftbrace(), true}, + + {"\t{", false, token::leftbrace(), true}, + {"\n{", false, token::leftbrace(), true}, + {"}", false, token::rightbrace(), true}, + + {"0", false, token::i64_token("0"), true}, + {"1", false, token::i64_token("1"), true}, + {"12", false, token::i64_token("12"), true}, + {"123", false, token::i64_token("123"), true}, + {"1234", false, token::i64_token("1234"), true}, + + {"0 ", false, token::i64_token("0"), false}, + {"1 ", false, token::i64_token("1"), false}, + {"12 ", false, token::i64_token("12"), false}, + {"123 ", false, token::i64_token("123"), false}, + {"1234 ", false, token::i64_token("1234"), false}, + + {"1<", false, token::i64_token("1"), false}, + {"1>", false, token::i64_token("1"), false}, + {"1(", false, token::i64_token("1"), false}, + {"1)", false, token::i64_token("1"), false}, + {"1[", false, token::i64_token("1"), false}, + {"1]", false, token::i64_token("1"), false}, + {"1{", false, token::i64_token("1"), false}, 
+ {"1}", false, token::i64_token("1"), false}, + {"1;", false, token::i64_token("1"), false}, + {"1:", false, token::i64_token("1"), false}, + {"1,", false, token::i64_token("1"), false}, + + {".1", false, token::f64_token(".1"), true}, + {".12", false, token::f64_token(".12"), true}, + {".123", false, token::f64_token(".123"), true}, + + {"+.1", false, token::f64_token("+.1"), true}, + {"+.12", false, token::f64_token("+.12"), true}, + {"+.123", false, token::f64_token("+.123"), true}, + + {"-.1", false, token::f64_token("-.1"), true}, + {"-.12", false, token::f64_token("-.12"), true}, + {"-.123", false, token::f64_token("-.123"), true}, + + {"1.", false, token::f64_token("1."), true}, + {"1.2", false, token::f64_token("1.2"), true}, + {"1.23", false, token::f64_token("1.23"), true}, + + {"1e0", false, token::f64_token("1e0"), true}, + {"1e-1", false, token::f64_token("1e-1"), true}, + {"1e1", false, token::f64_token("1e1"), true}, + {"1e+1", false, token::f64_token("1e+1"), true}, + + {"\"hello\"", false, token::string_token("hello"), true}, + /* tokenizer sees this input: + * "\"hi\", she said" + */ + {"\"\\\"hi\\\", she said\"", false, token::string_token("\"hi\", she said"), true}, + /* tokenizer sees this input: + * "look ma, newline ->\n<- " + */ + {"\"look ma, newline ->\\n<- \"", false, + token::string_token("look ma, newline ->\n<- "), true}, + /* tokenizer sees this input: + * "tab to the right [\t], to the right [\t]" + */ + {"\"tab to the right [\\t], to the right [\\t]\"", false, + token::string_token("tab to the right [\t], to the right [\t]"), true}, + + {":", false, token::colon(), true}, + {":=", false, token::assign_token(), true}, + + {"symbol", false, token::symbol_token("symbol"), true}, + + {"type", false, token::type(), true}, + {"def", false, token::def(), true}, + {"lambda", false, token::lambda(), true}, + {"if", false, token::if_token(), true}, + {"let", false, token::let(), true}, + {"in", false, token::in(), true}, + {"end", false, 
token::end(), true}, + + {"*", false, token::star_token(), true}, + }; + } + + TEST_CASE("tokenizer", "[tokenizer]") { + for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { + const testcase_tkz & testcase = s_testcase_v[i_tc]; + + INFO(xtag("input", testcase.input_)); + INFO(xtag("i_tc", i_tc)); + + using tokenizer + = xo::scm::tokenizer; + + tokenizer tkz; + tokenizer::span_type + in_span(testcase.input_.c_str(), + testcase.input_.c_str() + testcase.input_.size()); + + auto out = tkz.scan(in_span); + + auto tk = out.first; + + if (tk.is_invalid()) + tk = tkz.notify_eof(); + + REQUIRE(tk.tk_type() == testcase.expected_tk_.tk_type()); + if (tk.tk_type() == tokentype::tk_i64) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.i64_value() == testcase.expected_tk_.i64_value()); + } else if (tk.tk_type() == tokentype::tk_f64) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.f64_value() == testcase.expected_tk_.f64_value()); + } else if(tk.tk_type() == tokentype::tk_string) + { + /* tk.text() can be empty, consider input "" */ + REQUIRE(tk.text() == testcase.expected_tk_.text()); + } else if(tk.tk_type() == tokentype::tk_symbol) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.text() == testcase.expected_tk_.text()); + } else { + REQUIRE(tk.text().empty()); + } + + /* must consume all input for tests we're doing here */ + if (testcase.consume_all_) + REQUIRE(out.second == in_span); + else + REQUIRE(out.second != in_span); + } + } + + namespace { + struct testcase2_tkz { + std::string input_; + bool expect_throw_; + std::vector expected_tk_v_; + }; + + std::vector + s_testcase2_v = { + {"def foo : f64 = 3.141;", + false, + {token::def(), + token::symbol_token("foo"), + token::colon(), + token::symbol_token("f64"), + token::singleassign(), + token::f64_token("3.141"), + token::semicolon() + }}, + {"def foo = lambda (x : f64) { def y = x * x; y; }", + false, + {token::def(), + token::symbol_token("foo"), + token::singleassign(), + token::lambda(), + 
token::leftparen(), + token::symbol_token("x"), + token::colon(), + token::symbol_token("f64"), + token::rightparen(), + token::leftbrace(), + token::def(), + token::symbol_token("y"), + token::singleassign(), + token::symbol_token("x"), + token::star_token(), + token::symbol_token("x"), + token::semicolon(), + token::symbol_token("y"), + token::semicolon(), + token::rightbrace() + }} + }; + } + + TEST_CASE("tokenizer2", "[tokenizer]") { + for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) { + const testcase2_tkz & testcase = s_testcase2_v[i_tc]; + + INFO(xtag("input", testcase.input_)); + INFO(xtag("i_tc", i_tc)); + + using tokenizer + = xo::scm::tokenizer; + + tokenizer tkz; + tokenizer::span_type + in_span(testcase.input_.c_str(), + testcase.input_.c_str() + testcase.input_.size()); + + for (int i_tk = 0, n_tk = testcase.expected_tk_v_.size(); + i_tk < n_tk; ++i_tk) + { + INFO(xtag("i_tk", i_tk)); + + auto res = tkz.scan2(in_span, in_span.empty()); + const auto & tk = res.first; + + if (tk.is_valid()) + REQUIRE(tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type()); + if (tk.tk_type() == tokentype::tk_i64) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value()); + } else if (tk.tk_type() == tokentype::tk_f64) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value()); + } else if(tk.tk_type() == tokentype::tk_string) + { + /* tk.text() can be empty, consider input "" */ + REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text()); + } else if(tk.tk_type() == tokentype::tk_symbol) + { + REQUIRE(!tk.text().empty()); + REQUIRE(tk.text() == testcase.expected_tk_v_[i_tk].text()); + } else { + REQUIRE(tk.text().empty()); + } + + in_span = in_span.after_prefix(res.second); + } + } + } /*TEST_CASE(tokenizer2)*/ + + } /*namespace ut*/ +} /*namespace xo*/ + +/* end tokenizer.test.cpp */ diff --git a/xo-tokenizer/utest/tokenizer_utest_main.cpp 
b/xo-tokenizer/utest/tokenizer_utest_main.cpp new file mode 100644 index 00000000..c5e273c4 --- /dev/null +++ b/xo-tokenizer/utest/tokenizer_utest_main.cpp @@ -0,0 +1,6 @@ +/* file tokenizer_utest_main.cpp */ + +#define CATCH_CONFIG_MAIN +#include "catch2/catch.hpp" + +/* end tokenizer_utest_main.cpp */