diff --git a/CMakeLists.txt b/CMakeLists.txt index 90d5df2b..0e7d37ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,6 @@ add_subdirectory(xo-pyjit) # ---------------------------------------------------------------- # documentation. must follow add_subdirectory() for satellite projects -xo_umbrella_doxygen_deps(xo_flatstring xo_ratio) +xo_umbrella_doxygen_deps(xo_flatstring xo_ratio xo_unit xo_tokenizer xo_jit) xo_umbrella_doxygen_config() -xo_umbrella_sphinx_config(index.rst docs/install.rst) +xo_umbrella_sphinx_config(index.rst docs/install.rst docs/glossary.rst) diff --git a/cmake/xo-bootstrap-macros.cmake b/cmake/xo-bootstrap-macros.cmake new file mode 100644 index 00000000..aba31169 --- /dev/null +++ b/cmake/xo-bootstrap-macros.cmake @@ -0,0 +1,35 @@ +# ---------------------------------------------------------------- +# for example: +# $ PREFIX=/usr/local # for example +# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build +# +# will get +# CMAKE_MODULE_PATH +# from xo-cmake-config --cmake-module-path +# +# and expect .cmake macros in +# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake +# ---------------------------------------------------------------- + +find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED) + +if ("${XO_CMAKE_CONFIG_EXECUTABLE}" STREQUAL "XO_CMAKE_CONFIG_EXECUTABLE-NOT_FOUND") + message(FATAL "could not find xo-cmake-config executable") +endif() + +message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}") + +if (NOT XO_SUBMODULE_BUILD) + if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix)) + # default to typical install location for xo-project-macros + execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path OUTPUT_VARIABLE CMAKE_MODULE_PATH) + message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") + endif() +endif() + +# needs to have been installed somewhere on CMAKE_MODULE_PATH, +# (e.g. 
from xo-cmake with the same value for CMAKE_INSTALL_PREFIX) +# +include(xo_macros/xo_cxx) + +xo_cxx_bootstrap_message() diff --git a/conf.py b/conf.py index cf43946b..1cb86062 100644 --- a/conf.py +++ b/conf.py @@ -44,3 +44,11 @@ pygments_style = 'sphinx' html_theme = 'sphinx_rtd_theme' html_static_path = ['_static'] html_favicon = '_static/img/favicon.ico' + +# disable caching (at least helpful in development) + +html_meta = { + 'http-equiv=Cache-Control': 'no-cache, no-store, must-revalidate', + 'http-equiv=Pragma': 'no-cache', + 'http-equiv=Expires': '0' +} diff --git a/default.nix b/default.nix index 15461d85..29ab7c9b 100644 --- a/default.nix +++ b/default.nix @@ -109,7 +109,7 @@ let # xo-expression = self.callPackage pkgs/xo-expression.nix {}; xo-pyexpression = self.callPackage pkgs/xo-pyexpression.nix {}; - xo-tokenizer = self.callPackage pkgs/xo-tokenizer.nix {}; + xo-tokenizer = self.callPackage pkgs/xo-tokenizer.nix { buildDocs = true; }; xo-reader = self.callPackage pkgs/xo-reader.nix {}; xo-jit = self.callPackage pkgs/xo-jit.nix { #stdenv = jitStdenv; @@ -152,11 +152,18 @@ pkgs.mkShell { pkgs.python3Packages.python pkgs.python3Packages.pybind11 pkgs.python3Packages.sphinx-rtd-theme + #pkgs.python3Packages.sphinx-autobuild # needs patch for typeguard; defer for now pkgs.python3Packages.breathe pkgs.python3Packages.sphinxcontrib-ditaa pkgs.python3Packages.sphinxcontrib-plantuml pkgs.python3Packages.pillow + pkgs.gdb + + pkgs.emacs + pkgs.ditaa + pkgs.ripgrep + pkgs.git pkgs.cloc pkgs.sphinx @@ -169,6 +176,8 @@ pkgs.mkShell { pkgs.eigen pkgs.cmake pkgs.catch2 + pkgs.zlib + pkgs.unzip ]; shellHook = '' diff --git a/docs/glossary.rst b/docs/glossary.rst index 03bb0751..b6b43838 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -1 +1,12 @@ -scm = schematika +.. _glossary: + +Glossary +-------- + +.. glossary:: + schematika + scm + | Experimental programming language. + | Designed for convenient integration with C++ and python. + +.. 
toctree:: diff --git a/docs/install.rst b/docs/install.rst index 33995524..611e34d9 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -92,8 +92,11 @@ Aternatively can enter nix environment, then follow instructions for cmake build # etc +Development +=========== + LSP Setup -========= +--------- To setup xo-umbrella2 build to work with a language server: @@ -105,3 +108,13 @@ To setup xo-umbrella2 build to work with a language server: In this case subsystem LSP setup should be omitted, git root is ``path/to/xo-umbrella2``, not ``path/to/xo-umbrella2/xo-ratio`` etc. + +Sphinx Autobuild Setup +---------------------- + +To serve cache-busting headers + +.. code-block:: + + $ cd xo-umbrella2 + $ sphinx-autobuild . .build/sphinx/html --port 3000 diff --git a/index.rst b/index.rst index 32ccaaf8..441d1ef4 100644 --- a/index.rst +++ b/index.rst @@ -17,5 +17,6 @@ Some features: kalman filters, stochastic processes, complex event processing, s xo-unit/docs/index xo-tokenizer/docs/index xo-jit/docs/index + glossary genindex search diff --git a/xo-tokenizer/docs/CMakeLists.txt b/xo-tokenizer/docs/CMakeLists.txt index 2943563d..c930092b 100644 --- a/xo-tokenizer/docs/CMakeLists.txt +++ b/xo-tokenizer/docs/CMakeLists.txt @@ -1,5 +1,8 @@ -# xo-tokenizer/CMakeLists.txt +# xo-tokenizer/docs/CMakeLists.txt xo_doxygen_collect_deps() xo_docdir_doxygen_config() -xo_docdir_sphinx_config(index.rst install.rst) +xo_docdir_sphinx_config( + index.rst install.rst examples.rst implementation.rst + token-class.rst tokenizer-error-class.rst span-class.rst tokentype-enum.rst +) diff --git a/xo-tokenizer/docs/_static/README b/xo-tokenizer/docs/_static/README new file mode 100644 index 00000000..8230095c --- /dev/null +++ b/xo-tokenizer/docs/_static/README @@ -0,0 +1 @@ +add any static {.html, .js, ..} files for sphinx to pickup here \ No newline at end of file diff --git a/xo-tokenizer/docs/_static/img/favicon.ico b/xo-tokenizer/docs/_static/img/favicon.ico new file mode 100644 index 
00000000..15da2145 Binary files /dev/null and b/xo-tokenizer/docs/_static/img/favicon.ico differ diff --git a/xo-tokenizer/docs/conf.py b/xo-tokenizer/docs/conf.py new file mode 100644 index 00000000..31e2f0b2 --- /dev/null +++ b/xo-tokenizer/docs/conf.py @@ -0,0 +1,39 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = 'xo tokenizer documentation' +copyright = '2024-2025, Roland Conybeare' +author = 'Roland Conybeare' + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +#extensions = [] +extensions = [ "breathe", + "sphinx.ext.mathjax", # inline math + "sphinx.ext.autodoc", # generate info from docstrings + "sphinxcontrib.ditaa", # diagrams-through-ascii-art + "sphinxcontrib.plantuml" # text -> uml diagrams + ] + +# note: breathe requires doxygen xml output -> must have GENERATE_XML = YES in Doxyfile.in +# match project name in Doxyfile.in +breathe_default_project = "xodoxxml" + +templates_path = ['_templates'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +pygments_style = 'sphinx' + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +#html_theme = 'alabaster' +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] +html_favicon = '_static/img/favicon.ico' diff --git a/xo-tokenizer/docs/examples.rst b/xo-tokenizer/docs/examples.rst index 7a6df0ce..72e890e6 100644 --- a/xo-tokenizer/docs/examples.rst +++ b/xo-tokenizer/docs/examples.rst @@ -1,6 +1,6 @@ .. _examples: -.. 
toctree +.. toctree:: :maxdepth: 2 Examples @@ -31,19 +31,28 @@ See ``xo-tokenizer/examples/tokenrepl`` for (slighly elaborated) version of code // input may contain multiple tokens while (!input.empty()) { - auto [tk, nread] = tkz.scan(input); + auto [tk, consumed, error] = tkz.scan(input); if (tk.is_valid()) { cout << tk; } - input = input.after_prefix(nread); + input = input.after_prefix(consumed.size()); } } - auto tk = tkz.notify_eof(); + auto [tk, consumed, error] = tkz.notify_eof(spxn_type::from_string(input_str)); if (tk.is_valid()) { cout << tk; } } + +.. code-block:: + :linenos: + + $ .build/xo-tokenizer/utest/utest.tokenizer + > 123 + + > 123e5 + diff --git a/xo-tokenizer/docs/implementation.rst b/xo-tokenizer/docs/implementation.rst new file mode 100644 index 00000000..62a01100 --- /dev/null +++ b/xo-tokenizer/docs/implementation.rst @@ -0,0 +1,36 @@ +.. _implementation: + +.. toctree:: + :maxdepth: 2 + +Components +========== + +Library dependency tower for *xo-tokenizer*: + +.. ditaa:: + + +-----------------+ + | xo_unit | + +-----------------+ + | xo_indentlog | + +-----------------+ + | xo_cmake | + +-----------------+ + +Install instructions :doc:`here` + +Abstraction tower for *xo-tokenizer* components: + +.. ditaa:: + :--scale: 0.85 + + +-----------------------------------------+----------+ + | tokenizer | | + +-----------------------------------------+ | + | scan_result | | + +-----------------+-----------------------+ buffer | + | token | tokenizer_error | | + +-----------------+-----------------------+ | + | tokentype | span | | + +-----------------+-----------------------+----------+ diff --git a/xo-tokenizer/docs/index.rst b/xo-tokenizer/docs/index.rst index 91075db3..bcbf1f6a 100644 --- a/xo-tokenizer/docs/index.rst +++ b/xo-tokenizer/docs/index.rst @@ -1,6 +1,6 @@ .. xo-tokenizer documentation master file. 
-xo-tokenizer documentation +Xo-tokenizer documentation ========================== xo-tokenizer provides a tokenizer for the Schematika language. @@ -15,5 +15,8 @@ may appear in variable names: ``one-of-those-days`` is an ordinary symbol. install examples - genindex - search + implementation + token-class + tokenizer-error-class + span-class + tokentype-enum diff --git a/xo-tokenizer/docs/install.rst b/xo-tokenizer/docs/install.rst index 55eacbb3..94f17794 100644 --- a/xo-tokenizer/docs/install.rst +++ b/xo-tokenizer/docs/install.rst @@ -1,8 +1,23 @@ .. _install: -.. toctree +.. toctree:: :maxdepth: 2 +Source +====== + +Source code lives on github `here`_ + +.. _here: https://github.com/rconybea/xo-tokenizer + +To clone from git: + +.. code-block:: bash + + git clone https://github.com/rconybea/xo-tokenizer + +Tested with gcc 13.3 + Install ======= diff --git a/xo-tokenizer/docs/span-class.rst b/xo-tokenizer/docs/span-class.rst new file mode 100644 index 00000000..a935d309 --- /dev/null +++ b/xo-tokenizer/docs/span-class.rst @@ -0,0 +1,84 @@ + +.. _span-class: + +Span +==== + +Identify an unowned contiguous memory range + +Context +------- + +.. ditaa:: + :--scale: 0.85 + + +-----------------------------------------+----------+ + | tokenizer | | + +-----------------------------------------+ | + | scan_result | | + +-----------------+-----------------------+ buffer | + | token | tokenizer_error | | + +-----------------+-----------------------+ | + | tokentype |cBLU span | | + +-----------------+-----------------------+----------+ + +.. code-block:: cpp + + #include + +.. uml:: + :scale: 99% + :align: center + + allowmixing + + object span1<> + span1 : lo = p + span2 : hi = p+25 + + object dest<> + dest : def fact(n : i64) { ... } + + +- Identify a sequence of characters stored in contiguous memory. + +- Lightweight, consists of a pair of pointers. + +- Does not own storage. Lifetime management for target memory is + up to the caller. + + +Class +----- + +.. 
doxygenclass:: xo::scm::span + +Member Variables +---------------- + +.. doxygengroup:: span-instance-vars + +Type Traits +----------- + +.. doxygengroup:: span-type-traits + +Constructors +------------ + +.. doxygengroup:: span-ctors + +Access Methods +-------------- + +.. doxygengroup:: span-access-methods + +General Methods +--------------- + +.. doxygengroup:: span-general-methods + +Operators +--------- + +.. doxygengroup:: span-operators diff --git a/xo-tokenizer/docs/token-class.rst b/xo-tokenizer/docs/token-class.rst new file mode 100644 index 00000000..e0f58299 --- /dev/null +++ b/xo-tokenizer/docs/token-class.rst @@ -0,0 +1,94 @@ + +.. _token-class: + +Token +===== + +Represent a single lexical token in the Schematika language + +Context +------- + +.. ditaa:: + :--scale: 0.85 + + +-----------------------------------------+----------+ + | tokenizer | | + +-----------------------------------------+ | + | scan_result | | + +-----------------+-----------------------+ buffer | + |cBLU token | tokenizer_error | | + +-----------------+-----------------------+ | + | tokentype | span | | + +-----------------+-----------------------+----------+ + +.. code-block:: cpp + + #include + +.. uml:: + :scale: 99% + :align: center + + allowmixing + + object tk1<> + tk1 : tk_type = tk_i64 + tk1 : text = "123" + + object tk2<> + tk2 : tk_type = tk_string + tk2 : text = "the quick brown fox" + +- Represent a single lexical token + +- Does not share any storage with original input stream + (maintains a local copy). + +- Remembers copied input extent. + Convert on demand to native untagged representation + +Example +------- + +.. code-block:: cpp + + void foo() { + using namespace xo::scm; + + token tk = token::i64_token("123"); + + tk.is_valid(); // -> true + tk.text(); // -> "123"s; + + tk.tk_type(); // -> tokentype::tk_i64 + tk.i64_value(); // -> 123 + + cout << tk << endl; // -> + } + +Class +----- + +.. 
doxygenclass:: xo::scm::token + + +Instance Variables +------------------ + +.. doxygengroup:: token-instance-vars + +Constructors +------------ + +.. doxygengroup:: token-ctors + +Access Methods +-------------- + +.. doxygengroup:: token-access-methods + +General Methods +--------------- + +.. doxygengroup:: token-general-methods diff --git a/xo-tokenizer/docs/tokenizer-class.rst b/xo-tokenizer/docs/tokenizer-class.rst new file mode 100644 index 00000000..dac20549 --- /dev/null +++ b/xo-tokenizer/docs/tokenizer-class.rst @@ -0,0 +1,27 @@ + +.. _tokenizer-class: + +Tokenizer +========= + +Parse a Schematika character stream into lexical tokens + +Context +------- + +.. ditaa:: + :--scale: 0.85 + + +-----------------------------------------+----------+ + |cBLU tokenizer | | + +-----------------------------------------+ | + | scan_result | | + +-----------------+-----------------------+ buffer | + | token | tokenizer_error | | + +-----------------+-----------------------+ | + | tokentype | span | | + +-----------------+-----------------------+----------+ + +.. code-block:: cpp + + #include diff --git a/xo-tokenizer/docs/tokenizer-error-class.rst b/xo-tokenizer/docs/tokenizer-error-class.rst new file mode 100644 index 00000000..0ff72520 --- /dev/null +++ b/xo-tokenizer/docs/tokenizer-error-class.rst @@ -0,0 +1,52 @@ + +.. _tokenizer-error-class: + +Tokenizer Error +=============== + +Represent a possible tokenizer error result, including parsing context + +Context +------- + +.. ditaa:: + :--scale: 0.85 + + +-----------------------------------------+----------+ + | tokenizer | | + +-----------------------------------------+ | + | scan_result | | + +-----------------+-----------------------+ buffer | + | token |cBLU tokenizer_error | | + +-----------------+-----------------------+ | + | tokentype | span | | + +-----------------+-----------------------+----------+ + +.. code-block:: cpp + + #include + +Class +------ + +.. 
doxygenclass:: xo::scm::tokenizer_error + +Instance Variables +------------------ + +.. doxygengroup:: tokenizer-error-instance-vars + +Constructors +------------ + +.. doxygengroup:: tokenizer-error-ctors + +Access Methods +-------------- + +.. doxygengroup:: tokenizer-error-access-methods + +General Methods +--------------- + +.. doxygengroup:: tokenizer-error-general-methods diff --git a/xo-tokenizer/docs/tokentype-enum.rst b/xo-tokenizer/docs/tokentype-enum.rst new file mode 100644 index 00000000..7342c700 --- /dev/null +++ b/xo-tokenizer/docs/tokentype-enum.rst @@ -0,0 +1,34 @@ + +.. _tokentype-enum: + +Tokentype +========= + +Distinguish different lexical tokens for the Schematika language. + +Context +------- + +.. ditaa:: + :--scale: 0.85 + + +-----------------------------------------+----------+ + | tokenizer | | + +-----------------------------------------+ | + | scan_result | | + +-----------------+-----------------------+ buffer | + | token | tokenizer_error | | + +-----------------+-----------------------+ | + |cBLU tokentype | span | | + +-----------------+-----------------------+----------+ + +.. code-block:: cpp + + #include + +Enum +---- + +.. doxygenfunction:: xo::scm::tokentype_descr + +.. 
doxygenfunction:: xo::scm::operator<<(std::ostream&,tokentype) diff --git a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp index ce69b98f..77606a0b 100644 --- a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp +++ b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp @@ -41,21 +41,35 @@ main() { if (tk.is_valid()) { cout << tk << endl; } else if (error.is_error()) { - cout << "parsing error: " << error << endl; - /* discard remainder of input line */ + cout << "parsing error: " << endl; + error.report(cout); + break; } - input = input.after_prefix(consumed.size()); + input = tkz.consume(consumed, input); + //input = input.after_prefix(consumed.size()); } + + /* discard stashed remainder of input line + * (for nicely-formatted errors) + */ + tkz.discard_current_line(); } - auto [tk, consumed, error] = tkz.notify_eof(span_type::from_string(input_str)); + { + span_type input = span_type::from_string(input_str); - if (tk.is_valid()) { - cout << tk << endl; - } else if (error.is_error()) { - cout << "parsing error: " << error << endl; + auto [tk, consumed, error] = tkz.notify_eof(input); + + input = tkz.consume(consumed, input); + + if (tk.is_valid()) { + cout << tk << endl; + } else if (error.is_error()) { + cout << "parsing error: " << endl; + error.report(cout); + } } } diff --git a/xo-tokenizer/include/xo/tokenizer/error_token.hpp b/xo-tokenizer/include/xo/tokenizer/error_token.hpp new file mode 100644 index 00000000..e69de29b diff --git a/xo-tokenizer/include/xo/tokenizer/scan_result.hpp b/xo-tokenizer/include/xo/tokenizer/scan_result.hpp index 9d15b90f..fbc29105 100644 --- a/xo-tokenizer/include/xo/tokenizer/scan_result.hpp +++ b/xo-tokenizer/include/xo/tokenizer/scan_result.hpp @@ -10,8 +10,10 @@ namespace xo { namespace scm { - /** @brief Represent result of parsing one input token. + /** @class scan_result + * @brief Represent result of parsing one input token. 
* + * @code * Possible outcomes fall into several categories * (with T: @c token_.is_valid(), E: @cerror_.is_error()) * @@ -21,6 +23,7 @@ namespace xo { * | true | false | parsed token in T | * | false | true | parse error in E | * + * @endcode **/ template class scan_result { @@ -37,6 +40,7 @@ namespace xo { static scan_result make_whitespace(const span_type & prefix_input); static scan_result make_partial(const span_type & prefix_input); + static scan_result make_error(const error_type & error); bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); } bool is_token() const { return token_.is_valid(); } @@ -67,6 +71,12 @@ namespace xo { return scan_result(token_type::invalid(), prefix_input /*consumed*/); } + template + auto scan_result::make_error(const error_type & error) -> scan_result + { + return scan_result(token_type::invalid(), span_type::make_null(), error); + } + } /*namespace scm*/ } /*namespace xo*/ diff --git a/xo-tokenizer/include/xo/tokenizer/span.hpp b/xo-tokenizer/include/xo/tokenizer/span.hpp index 6c9c5262..5381a440 100644 --- a/xo-tokenizer/include/xo/tokenizer/span.hpp +++ b/xo-tokenizer/include/xo/tokenizer/span.hpp @@ -11,21 +11,32 @@ namespace xo { namespace scm { /** @class span compression/span.hpp * - * @brief Represents a contiguous memory range, without ownership. + * @brief A contiguous range of characters, without ownership. * * @tparam CharT type for elements referred to by this span. 
**/ template class span { public: - /** @brief typealias for span size (in units of CharT) **/ + /** @defgroup span-type-traits span type traits **/ + ///@{ + + /** typealias for span size (in units of CharT) **/ using size_type = std::uint64_t; + ///@} + public: - /** @brief create span for the contiguous memory range [@p lo, @p hi) **/ + /** @defgroup span-ctors span constructors **/ + ///@{ + + /** Create span for the contiguous memory range [@p lo, @p hi) **/ span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} - /** @brief create a null span (i.e. with null @p lo, @p hi pointers) **/ + /** Create a null span (i.e. with null @p lo, @p hi pointers) + * A null span can be concatenated with any other span + * without triggering matching-endpoint asserts. + **/ static span make_null() { return span(nullptr, nullptr); } /** @brief create span for C-style string @p cstr **/ @@ -65,16 +76,20 @@ namespace xo { return span(lo, hi); } - ///@{ + ///@} - /** @name getters **/ + /** @defgroup span-access-methods **/ + ///@{ CharT * lo() const { return lo_; } /* get member span::lo_ */ CharT * hi() const { return hi_; } /* get member span::hi_ */ ///@} - /** @brief create new span over supplied type, + /** @defgroup span-general-methods **/ + ///@{ + + /** Create new span over supplied type, * with identical (possibly misaligned) endpoints. * * @warning @@ -121,7 +136,9 @@ namespace xo { return after_prefix(prefix.size()); } - /** @brief create span starting with position p **/ + /** Create span starting with position @p p. + * Does boundary checking; will return empty span if @p p is outside @c [lo_,hi) + **/ span suffix_from(CharT * p) const { if ((lo_ <= p) && (p <= hi_)) return span(p, hi_); @@ -129,13 +146,16 @@ namespace xo { return span(hi_, hi_); } - /** @brief true iff this span is null. distinct from empty. **/ + /** true iff this span is null. distinct from empty. 
**/ bool is_null() const { return lo_ == nullptr && hi_ == nullptr; } - /** @brief true iff this span is empty (comprises 0 elements). **/ + /** true iff this span is empty (comprises 0 elements). **/ bool empty() const { return lo_ == hi_; } - /** @brief report the number of elements (of type CharT) in this span. **/ + /** report the number of elements (of type CharT) in this span. **/ size_type size() const { return hi_ - lo_; } + /** increase extent of this span to include @p x. + * Requires @c hi() == @c x.lo() + **/ span & operator+=(const span & x) { if (hi_ == x.lo_) { hi_ = x.hi_; @@ -154,15 +174,18 @@ namespace xo { << " :text " << xo::print::quot(std::string_view(lo_, hi_)) << ">"; } + ///@} private: + /** @defgroup span-instance-vars **/ ///@{ - /** @brief start of span + /** start of span. Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) **/ CharT * lo_ = nullptr; - /** @brief end of span + + /** @brief end of span. Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) **/ CharT * hi_ = nullptr; @@ -170,6 +193,12 @@ namespace xo { ///@} }; /*span*/ + /** @defgroup span-operators **/ + ///@{ + + /** compare spans for equality. + * Two spans are equal iff both endpoints match exactly. + **/ template inline bool operator==(const span & lhs, const span & rhs) { @@ -177,6 +206,9 @@ namespace xo { && (lhs.hi() == rhs.hi())); } + /** compare spans for inequality. + * Two spans are unequal if either paired endpoint differs. + **/ template inline bool operator!=(const span & lhs, const span & rhs) { @@ -184,6 +216,7 @@ namespace xo { || (lhs.hi() != rhs.hi())); } + /** print a summary of @p x on stream @p os. 
Intended for diagnostics **/ template inline std::ostream & operator<<(std::ostream & os, @@ -191,5 +224,33 @@ namespace xo { x.print(os); return os; } + + ///@} } /*namespace scm*/ + + namespace print { + template + class printspan_impl { + public: + printspan_impl(xo::scm::span x) : span_{x} {} + + xo::scm::span span_; + }; + + template + printspan_impl printspan(const xo::scm::span& span) { + return printspan_impl(span); + } + + template + inline std::ostream & + operator<< (std::ostream & os, + const printspan_impl & x) + { + for (const CharT * p = x.span_.lo(); p < x.span_.hi(); ++p) + os << *p; + + return os; + } + } } /*namespace xo*/ diff --git a/xo-tokenizer/include/xo/tokenizer/token.hpp b/xo-tokenizer/include/xo/tokenizer/token.hpp index c9132183..68666da8 100644 --- a/xo-tokenizer/include/xo/tokenizer/token.hpp +++ b/xo-tokenizer/include/xo/tokenizer/token.hpp @@ -43,75 +43,137 @@ namespace xo { } } + /** @class token + * @brief Represent a Schematika lexical token + **/ template class token { public: + /** @defgroup token-ctors token constructors **/ + ///@{ + + /** default ctor creates token with type @c tk_invalid **/ token() = default; + /** create token with type @c tk_type and input text @c text **/ token(tokentype tk_type, const std::string & text = "") : tk_type_{tk_type}, text_{text} {} + /** create invalid token (same as null ctor, but explicit) **/ static token invalid() { return token(); } + /** Create token representing 64-bit signed integer literal parsed from decimal @p txt. + * The string @p txt must be a decimal integer literal, since @ref i64_value re-parses @p txt. + **/ static token i64_token(const std::string & txt) { return token(tokentype::tk_i64, txt); } + /** create token representing 64-bit floating-point literal parsed from decimal @p txt + * The string @p txt must be a decimal floating-point literal, since @ref f64_value re-parses @p txt. 
+ **/ static token f64_token(const std::string & txt) { return token(tokentype::tk_f64, txt); } + /** create token representing literal string parsed from @p txt **/ static token string_token(const std::string & txt) { return token(tokentype::tk_string, txt); } + /** create token representing a symbol parsed from @p txt. + * Note that not all strings are valid symbol names. + **/ static token symbol_token(const std::string & txt) { return token(tokentype::tk_symbol, txt); } + /** token representing left angle bracket @c "<" **/ static token leftangle() { return token(tokentype::tk_leftangle); } + /** token representing right angle bracket @c ">" **/ static token rightangle() { return token(tokentype::tk_rightangle); } + /** token representing left parenthesis @c "(" **/ static token leftparen() { return token(tokentype::tk_leftparen); } + /** token representing right parenthesis @c ")" **/ static token rightparen() { return token(tokentype::tk_rightparen); } + /** token representing left bracket @c "[" **/ static token leftbracket() { return token(tokentype::tk_leftbracket); } + /** token representing right bracket @c "]" **/ static token rightbracket() { return token(tokentype::tk_rightbracket); } + /** token representing left brace @c "{" **/ static token leftbrace() { return token(tokentype::tk_leftbrace); } + /** token representing right brace @c "}" **/ static token rightbrace() { return token(tokentype::tk_rightbrace); } + /** token representing period @c "." 
**/ static token dot() { return token(tokentype::tk_dot); } + /** token representing comma @c "," **/ static token comma() { return token(tokentype::tk_comma); } + /** token representing colon @c ":" **/ static token colon() { return token(tokentype::tk_colon); } + /** token representing double-colon @c "::" **/ static token doublecolon() { return token(tokentype::tk_doublecolon); } + /** token representing semicolon @c ";" **/ static token semicolon() { return token(tokentype::tk_semicolon); } + /** token representing single-assignment @c "=" **/ static token singleassign() { return token(tokentype::tk_singleassign); } + /** token representing unrestricted assignment @c ":=" **/ static token assign_token() { return token(tokentype::tk_assign); } + /** token representing indirection @c "->" **/ static token yields() { return token(tokentype::tk_yields); } + /** token for @c "+" **/ static token plus_token() { return token(tokentype::tk_plus); } + /** token for @c "-" **/ static token minus_token() { return token(tokentype::tk_minus); } + /** token for @c "*" **/ static token star_token() { return token(tokentype::tk_star); } + /** token for @c "/" **/ static token slash_token() { return token(tokentype::tk_slash); } + /** token representing keyword @c type **/ static token type() { return token(tokentype::tk_type); } + /** token representing keyword @c def **/ static token def() { return token(tokentype::tk_def); } + /** token representing keyword @c lambda **/ static token lambda() { return token(tokentype::tk_lambda); } + /** token representing keyword @c if **/ static token if_token() { return token(tokentype::tk_if); } + /** token representing keyword @c let **/ static token let() { return token(tokentype::tk_let); } + /** token representing keyword @c in **/ static token in() { return token(tokentype::tk_in); } + /** token representing keyword @c end **/ static token end() { return token(tokentype::tk_end); } + ///@} + + /** @defgroup token-access-methods **/ + 
///@{ + tokentype tk_type() const { return tk_type_; } const std::string & text() const { return text_; } + ///@} + + /** @defgroup token-general-methods **/ + ///@{ + + /** true if token understood to represent valid input + * i.e. any token type except @c tk_invalid + **/ bool is_valid() const { return tk_type_ != tokentype::tk_invalid; } + /** true for sentinel token with type tk_invalid **/ bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; } - /** expect input matching - * [+|-][0-9][0-9]* - **/ + /** expect input matching @c "[+|-][0-9][0-9]*" **/ std::int64_t i64_value() const; - /** expect input matching - * [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]* - **/ + + /** expect input matching @c "[+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]*" **/ double f64_value() const; /** print human-readable token representation on stream @p os **/ void print(std::ostream & os) const; + ///@} + private: + /** @defgroup token-instance-vars **/ + ///@{ + /** category for this token **/ tokentype tk_type_ = tokentype::tk_invalid; @@ -124,6 +186,8 @@ namespace xo { * tk_symbol **/ std::string text_; + + ///@} }; /*token*/ template diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp index e36a54af..f0ebb4be 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -13,9 +13,15 @@ namespace xo { namespace scm { - /** + /** @class tokenizer + * @brief Parse a Schematika character stream into lexical tokens + * * Use: + * * @code + * // see xo-tokenizer/example/tokenrepl/tokenrepl.cpp + * // for exact working code + * * using tokenizer_type = tokenizer; * using span_type = tokenizer_type::span_type; * @@ -24,21 +30,19 @@ namespace xo { * * while (!input.empty()) { * auto res = tkz.scan(input); - * const auto & tk = res.first; + * auto [tk, consumed, error] = res.first; * * // do something with tk if tk.is_valid() * - * input = input.after_prefix(res.second); + * 
input = tkz.consume(res.second, input); * } * * if endofinput { - * auto tk = tzk.notify_eof() + * auto [tk, consumed, error] = tzk.notify_eof() * - * // do something with tk if tk.is_valid() + * // do something with (final) tk if tk.is_valid() * } * - * // expect !tkz.has_prefix() - * * @endcode * * See tokentype.hpp for token types @@ -47,6 +51,7 @@ namespace xo { class tokenizer { public: using token_type = token; + using error_type = tokenizer_error; using span_type = span; using result_type = scan_result; @@ -122,11 +127,22 @@ namespace xo { **/ result_type scan2(const span_type & input, bool eof); + /** @retval span with @p consumed permanently removed from @p input. + * + * Purpose of this method is to update @ref current_pos_. + **/ + span_type consume(const span_type & consumed, const span_type & input); + + /** discard current line after error. Just cleans up error-reporting state **/ + void discard_current_line(); + /** notify end of input, resolving any ambiguous input stashed in .prefix **/ result_type notify_eof(const span_type & input); private: + void capture_current_line(const span_type & input); + result_type scan_completion(const span_type & whitespace, const CharT* token_end, const span_type & input); @@ -134,8 +150,10 @@ namespace xo { private: /** true to log tokenizer activity to stdout **/ bool debug_flag_ = false; - /** remember start of current line here **/ + /** remember current input line. Used only to report errors **/ span_type current_line_ = span_type::make_null(); + /** current input position within @ref current_line_ **/ + size_t current_pos_ = 0; /** Accumulate partial token here. * This will happen if input sent to @ref tokenizer::scan * ends without a determinate token boundary. 
@@ -348,29 +366,35 @@ namespace xo { } else if (exponent_flag && !exponent_digit_flag) { exponent_sign_flag = true; } else { - throw std::runtime_error - (tostr("tokenizer::assemble_token", - ": improperly placed sign indicator", - xtag("pos", ix - tk_start), - xtag("char", *ix))); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "improperly placed sign indicator", + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } } else if (*ix == '.') { if (period_flag) { - throw (std::runtime_error - (tostr("tokenizer::assemble_token", - ": duplicate decimal point", - xtag("pos", ix - tk_start), - xtag("char", *ix)))); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "duplicate decimal point in numeric literal", + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } period_flag = true; } else if ((*ix == 'e') || (*ix == 'E')) { if (exponent_flag) { - throw (std::runtime_error - (tostr("tokenizer::assemble_token", - ": duplicate exponent marker", - xtag("pos", ix - tk_start), - xtag("char", *ix)))); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "duplicate exponent marker in numeric literal", + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } exponent_flag = true; @@ -382,12 +406,13 @@ namespace xo { number_flag = true; } } else { - /* invalid input */ - throw (std::runtime_error - (tostr("tokenizer::assemble_token", - ": unexpected character in numeric constant", - xtag("pos", ix - tk_start), - xtag("char", *ix)))); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "unexpected character in numeric constant" /*error_description*/, + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } } @@ -443,11 +468,12 @@ namespace xo { ++ix; /*skip initial " char*/ + /* true on final " */ + bool endofstring = false; + for (; ix != token_text.hi(); ++ix) { log && 
log(xtag("*ix", *ix)); - bool endofstring = false; - switch(*ix) { case '"': endofstring = true; @@ -461,11 +487,13 @@ namespace xo { ++ix; if (ix == token_text.hi()) { - throw std::runtime_error - (tostr("tokenizer::assemble_token", - ": malformed string literal", - xtag("input", std::string_view(token_text.lo(), - token_text.hi())))); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "expecting key following escape character \\", + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } switch(*ix) { @@ -490,10 +518,13 @@ namespace xo { tk_text.push_back('"'); break; default: - throw std::runtime_error - (tostr("tokenizer::assemble_token", - ": unexpected \\-escaped char", - xtag("char", *ix))); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "expecting one of n|r|\"|\\ following escape \\", + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } break; default: @@ -505,12 +536,14 @@ namespace xo { break; } - if (ix != token_text.hi()) { - throw std::runtime_error - (tostr("tokenizer::assemble_token", - ": expected \" to end string literal", - xtag("input", std::string_view(token_text.lo(), - token_text.hi())))); + if (!endofstring) { + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "missing terminating '\"' to complete literal string", + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } log && log(tostr("tokenizer::assemble_token", @@ -632,9 +665,13 @@ namespace xo { } if (tk_type == tokentype::tk_invalid) { - throw std::runtime_error(tostr("tokenizer::assemble_token", - ": unexpected input x", - xtag("x", *ix))); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "illegal input character", + current_line_, + current_pos_, + initial_whitespace, + (ix - tk_start))); } if ((tk_type == tokentype::tk_i64) @@ -719,6 +756,27 @@ namespace xo { } + template + void + 
tokenizer::capture_current_line(const span_type & input) + { + // see discard_current_line() + + scope log(XO_DEBUG(debug_flag_)); + + /* look ahead to {end of line, end of input}, whichever comes first */ + const CharT * sol = input.lo(); + const CharT * eol = sol; + + while ((eol < input.hi()) && (*eol != '\n')) + ++eol; + + this->current_line_ = span_type(sol, eol); + this->current_pos_ = 0; + + log && log(xtag("current_line", print::printspan(current_line_))); + } + template auto tokenizer::scan(const span_type & input) -> result_type @@ -729,21 +787,22 @@ namespace xo { const CharT * ix = input.lo(); + if (this->current_line_.is_null()) { + this->capture_current_line(input); + } + /* skip whitespace + remember beginning of most recent line */ while (is_whitespace(*ix) && (ix != input.hi())) { - if (is_newline(*ix)) { ++ix; - /* look ahead to {end of line, end of input}, whichever comes first */ - const CharT * sol = ix; - const CharT * eol = ix; - while ((eol < input.hi()) && (*eol != '\n')) - ++eol; - - this->current_line_ = span_type(sol, eol); + this->capture_current_line(span_type(ix, input.hi())); } else { ++ix; + +#ifdef OBSOLETE + ++(this->current_pos_); +#endif } } @@ -818,10 +877,12 @@ namespace xo { break; } } else if ((*ix == '\n') || (*ix == '\r')) { - throw std::runtime_error - (tostr("tokenizer::scan", - ": must use \\n or \\r to encode newline/cr in" - " string literal")); + return result_type::make_error + (error_type(__FUNCTION__ /*src_function*/, + "must use \\n or \\r to encode newline/cr in string literal", + current_line_, current_pos_, + whitespace.size(), + (ix - tk_start))); } prev_ch = *ix; @@ -945,6 +1006,25 @@ namespace xo { sr2.error()); } + template + auto + tokenizer::consume(const span_type & consumed, const span_type & input) -> span_type + { + this->current_pos_ += consumed.size(); + + return input.after_prefix(consumed); + } + + template + void + tokenizer::discard_current_line() + { + // see capture_current_line() + + 
this->current_line_ = span_type::make_null(); + this->current_pos_ = 0; + } + template auto tokenizer::notify_eof(const span_type & input) -> result_type { diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp index 3f5b5944..c12fad72 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp @@ -7,47 +7,95 @@ #include "tokentype.hpp" #include "span.hpp" +#include namespace xo { namespace scm { - /** represent a lexing error, with context **/ + /** @class tokenizer_error + * @brief represent a lexing error, with context + * + * @tparam CharT representation for single characters + **/ template class tokenizer_error { public: using span_type = span; public: - /** @brief default ctor represent a not-an-error error object **/ + /** @defgroup tokenizer-error-ctors **/ + ///@{ + + /** Default ctor represent a not-an-error sentinel object **/ tokenizer_error() = default; - tokenizer_error(char const * src_function, - char const* error_description, - span_type input_line, size_t error_pos) + /** Constructor to capture parsing error context + * @p tk_start current position on entry to scanner + * @p whitespace number of chars initial whitespace + * @p error_pos error location relative to token start + **/ + tokenizer_error(const char * src_function, + const char * error_description, + span_type input_line, + size_t tk_start, + size_t whitespace, + size_t error_pos) : src_function_{src_function}, error_description_{error_description}, input_line_{input_line}, + tk_entry_{tk_start}, + whitespace_{whitespace}, error_pos_{error_pos} {} + ///@} - char const* src_function() const { return src_function_; } - char const* error_description() const { return error_description_; } - size_t error_pos() const { return error_pos_; } + /** @defgroup tokenizer-error-access-methods **/ + ///@{ + + const char * src_function() const { return src_function_; } 
+ const char * error_description() const { return error_description_; } const span_type& input_line() const { return input_line_; } + size_t tk_start() const { return tk_entry_; } + size_t whitespace() const { return whitespace_; } + size_t error_pos() const { return error_pos_; } - bool is_not_an_error() const { return error_description_ == nullptr; } + ///@} + + /** @defgroup tokenizer-error-general-methods **/ + ///@{ + + /** true, except for a sentinel error object **/ bool is_error() const { return error_description_ != nullptr; } + /** true except for object in sentinel state **/ + bool is_not_an_error() const { return error_description_ == nullptr; } + /** Print representation to stream @p os. Intended for tokenizer diagnostics. + * For Schematika errors prefer @ref report + **/ void print(std::ostream & os) const; + /** Print human-oriented error report on @p os. **/ + void report(std::ostream & os) const; + + ///@} + private: + /** @defgroup tokenizer-error-instance-vars **/ + ///@{ + /** source location (in tokenizer) at which error identified **/ char const * src_function_ = nullptr; /** static error description **/ char const * error_description_ = nullptr; - /** position (relative to line_.lo) of error **/ - size_t error_pos_ = 0; - /** complete input line (to the extent available) - * containing error + /** complete current input line (to the extent captured) + * that contains error **/ span_type input_line_ = span_type::make_null(); + /** position (relative to line_.lo) of token start where error encountered **/ + size_t tk_entry_ = 0; + /** number of characters of initial whitespace skipped before token start **/ + size_t whitespace_ = 0; + /** position (relative to @ref tk_entry_) of error **/ + size_t error_pos_ = 0; + + ///@} }; /*error_token*/ template @@ -56,11 +104,41 @@ namespace xo { os << ""; } + template + void + tokenizer_error::report(std::ostream & os) const { + using namespace std; + + if (error_description_) { + const char * prefix = 
"input: "; + const size_t tk_indent = strlen(prefix) + tk_entry_ + whitespace_; + //const size_t msg_length = strlen(error_description_); + + const size_t error_pos = 1 + tk_entry_ + whitespace_ + error_pos_; + + os << "char: " << error_pos << endl; + os << prefix; + for (const char *p = input_line_.lo(), *e = input_line_.hi(); p < e; ++p) + os << *p; + os << endl; + os << std::setw(tk_indent) << " "; + + for (size_t i = 0; i < error_pos_; ++i) { + os << '_'; + } + os << '^' << endl; + + os << error_description_ << endl; + } + } + template inline std::ostream & operator<< (std::ostream & os, diff --git a/xo-tokenizer/include/xo/tokenizer/tokentype.hpp b/xo-tokenizer/include/xo/tokenizer/tokentype.hpp index 6a3ef8a6..35ff841c 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokentype.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokentype.hpp @@ -11,10 +11,11 @@ namespace xo { namespace scm { /** @enum tokentype - * @brief enum to identify different schematica input token types + * Enum to identify different schematika input token types * * Schematica code examples: * + * @code * type point :: { xcoord : f64, ycoord : f64 }; * type matrix :: array; // 2-d array * @@ -41,6 +42,7 @@ namespace xo { * def matrixproduct(x : matrix, y : matrix) { * [i, j : x.row(i) * y.col(j)]; * }; + * @endcode **/ enum class tokentype { /** sentinel value **/ @@ -58,52 +60,52 @@ namespace xo { /** a symbol **/ tk_symbol, - /** left-hand parenthesis '(' **/ + /** left-hand parenthesis @c '(' **/ tk_leftparen, - /** right-hand parenthesis ')' **/ + /** right-hand parenthesis @c ')' **/ tk_rightparen, - /** left-hand bracket '[' **/ + /** left-hand bracket @c '[' **/ tk_leftbracket, - /** right-hand bracket ']' **/ + /** right-hand bracket @c ']' **/ tk_rightbracket, - /** left-hand brace '{' **/ + /** left-hand brace @c '{' **/ tk_leftbrace, - /** right-hand brace '}' **/ + /** right-hand brace @c '}' **/ tk_rightbrace, - /** left-hand angle bracket '<' **/ + /** left-hand angle bracket @c 
'<' **/ tk_leftangle, - /** right-hand angle bracket '>' **/ + /** right-hand angle bracket @c '>' **/ tk_rightangle, - /** dot '.' **/ + /** dot @c '.' **/ tk_dot, - /** comma ',' **/ + /** comma @c ',' **/ tk_comma, - /** colon ':' **/ + /** colon @c ':' **/ tk_colon, - /** double-colon '::' **/ + /** double-colon @c '::' **/ tk_doublecolon, - /** semi-colon ';' **/ + /** semi-colon @c ';' **/ tk_semicolon, - /** '=' **/ + /** single equals sign @c '=' **/ tk_singleassign, - /** ':=' **/ + /** assignment @c ':=' **/ tk_assign, - /** '->' **/ + /** indirection @c '->' **/ tk_yields, /** note: operators not treated as punctuation @@ -111,47 +113,53 @@ namespace xo { * as is 'maybe*2', 'maybe+1', 'path/to/foo' **/ - /** operator '+' **/ + /** operator @c '+' **/ tk_plus, - /** operator '-' **/ + /** operator @c '-' **/ tk_minus, - /** operator '*' **/ + /** operator @c '*' **/ tk_star, - /** operator '/' **/ + /** operator @c '/' **/ tk_slash, - /** keyword 'type' **/ + /** keyword @c 'type' **/ tk_type, - /** keyword 'def' **/ + /** keyword @c 'def' **/ tk_def, - /** keyword 'lambda' **/ + /** keyword @c 'lambda' **/ tk_lambda, - /** keyword 'if' **/ + /** keyword @c 'if' **/ tk_if, - /** keyword 'let' **/ + /** keyword @c 'let' **/ tk_let, - /** keyword 'in' **/ + /** keyword @c 'in' **/ tk_in, - /** keyword 'end' **/ + /** keyword @c 'end' **/ tk_end, - n_tokentype /* comes last, counts #of entries */ + /** counts number of entries **/ + n_tokentype }; /*tokentype*/ + /** String representation for enum value. 
+ * For example @c tokentype_descr(tokentype::tk_if) -> @c "if" + **/ extern char const * tokentype_descr(tokentype tk_type); + /** Print enum value for @p tk_type on stream @p os **/ inline std::ostream & operator<< (std::ostream & os, tokentype tk_type) { os << tokentype_descr(tk_type); return os; } + } /*namespace scm*/ } /*namespace xo*/ diff --git a/xo-tokenizer/utest/tokenizer.test.cpp b/xo-tokenizer/utest/tokenizer.test.cpp index ed2cb515..ea4c886d 100644 --- a/xo-tokenizer/utest/tokenizer.test.cpp +++ b/xo-tokenizer/utest/tokenizer.test.cpp @@ -19,15 +19,17 @@ namespace xo { * On second pass, enable verbose logging **/ struct rehearser { + rehearser(std::uint32_t att = 0) : attention_{att} {} + /* expect at most one iterator to exist per TestRehearser instance **/ struct iterator { - iterator(rehearser* parent, std::uint32_t attention) : parent_{parent}, attention_{attention} {} + explicit iterator(rehearser* parent) : parent_{parent} {} iterator& operator++(); - std::uint32_t operator*() { return attention_; } + std::uint32_t operator*() { return parent_->attention_; } bool operator==(const iterator& ix2) const { - return (parent_ == ix2.parent_) && (attention_ == ix2.attention_); + return (parent_ == ix2.parent_); } rehearser* parent_ = nullptr; @@ -35,11 +37,12 @@ namespace xo { }; + bool is_first_pass() const { return attention_ == 0; } bool is_second_pass() const { return attention_ == 1; } bool enable_debug() const { return is_second_pass(); } - iterator begin() { return iterator(this, 0); } - iterator end() { return iterator(this, 2); } + iterator begin() { return iterator(this); } + iterator end() { return iterator(nullptr); } public: /** pass number: 0 or 1 **/ @@ -50,23 +53,27 @@ namespace xo { auto rehearser::iterator::operator++() -> iterator& { - ++attention_; + if (parent_) + ++(parent_->attention_); - if (parent_->ok_flag_ && attention_ == 1) { + if (parent_->ok_flag_ && (parent_->attention_ == 1)) { /* skip 2nd pass */ - ++attention_; + 
++(parent_->attention_); } + if (parent_->attention_ == 2) + parent_ = nullptr; + return *this; } /* use this instead of REQUIRE(expr) in context of a test_rehearser */ -# define REHEARSE(rehearser, expr) \ - if (rehearser.is_second_pass()) { \ - REQUIRE((expr)); \ - } else { \ - REQUIRE(true); \ - rehearser.ok_flag_ &= (expr); \ +# define REHEARSE(rehearser, expr) \ + if (rehearser.is_first_pass()) { \ + bool _f = (expr); \ + rehearser.ok_flag_ = rehearser.ok_flag_ && _f; \ + } else { \ + REQUIRE(expr); \ } /* note: trivial REQUIRE() call in else branch bc we still want @@ -300,12 +307,14 @@ namespace xo { token::semicolon(), token::rightbrace() }}, +#ifdef TODO {"a.b", false, {token::symbol_token("a"), token::dot(), token::symbol_token("b") }}, +#endif {"a,b", false, {token::symbol_token("a"), @@ -431,6 +440,132 @@ namespace xo { } } /*TEST_CASE(tokenizer2)*/ + namespace { + using tkz_error_type = xo::scm::tokenizer_error; + using span_type = xo::scm::span; + + struct testcase_error { + std::string input_; + tkz_error_type expect_error_; + }; + + testcase_error + make_testcase(const char * input, const char * src_function, const char * error_descr, + size_t tk_start, size_t whitespace, size_t error_pos) + { + testcase_error retval; + retval.input_ = input; + retval.expect_error_ = tkz_error_type(src_function, error_descr, + span_type::from_string(retval.input_), + tk_start, whitespace, error_pos); + return retval; + } + + std::vector + s_testcase3_v = { + // 012345678 + // --------v + make_testcase("123.456ez", + "assemble_token", + "unexpected character in numeric constant", + 0, 0, 8), + // 01 + // -v + make_testcase("1-3", + "assemble_token", + "improperly placed sign indicator", + 0, 0, 1), + // 012 + // --v + make_testcase("1..2", + "assemble_token", + "duplicate decimal point in numeric literal", + 0, 0, 2), + // 0123456 + // ------v + make_testcase("1.23e4e", + "assemble_token", + "duplicate exponent marker in numeric literal", + 0, 0, 6), + // tokenizer 
sees string ["\"] + // 0 1 2 3 + // - - - v + make_testcase("\"\\\"", + "assemble_token", + "missing terminating '\"' to complete literal string", + //"expect \\ to escape one of n|t|r|\"|\\ in string literal", + 0, 0, 3), + // tokenizer sees literal with embedded newline + // 1 2 3 + // 01234567890123456789012345678901 2 + // -------------------------------- v + make_testcase("\"everything was going fine until\n\"", + "scan", + "must use \\n or \\r to encode newline/cr in string literal", + 0, 0, 32), + // tokenizer sees string ["\] + // 0 1 2 + // - - v + make_testcase("\"\\", + "assemble_token", + "expecting key following escape character \\", + 0, 0, 2), + // tokenizer sees string ["\q"] + // 0 12 + // - -v + make_testcase("\"\\q\"", + "assemble_token", + "expecting one of n|r|\"|\\ following escape \\", + 0, 0, 2), + // + make_testcase("#", + "assemble_token", + "illegal input character", + 0, 0, 0), + }; + + TEST_CASE("tokenizer3", "[tokenizer]") { + /* testing error handling */ + + using tokenizer = xo::scm::tokenizer; + + constexpr bool c_force_debug = true; + + for (std::size_t i_tc = 0, n_tc = s_testcase3_v.size(); i_tc < n_tc; ++i_tc) { + const testcase_error & testcase = s_testcase3_v[i_tc]; + + rehearser rh(0); + + for (auto _ : rh) { + scope log(XO_DEBUG2(c_force_debug || rh.enable_debug(), "tokenizer3")); + + log && log(xtag("pass", _), xtag("ok(-)", rh.ok_flag_)); + log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_)); + + tokenizer tkz(c_force_debug || rh.enable_debug()); + + auto in_span = tokenizer::span_type::from_string(testcase.input_); + + auto sr = tkz.scan2(in_span, true /*eof*/); + + REHEARSE(rh, sr.is_error()); + + if (sr.error().src_function()) { + REHEARSE(rh, std::string(sr.error().src_function()) == std::string(testcase.expect_error_.src_function())); + } + if (sr.error().error_description()) { + REHEARSE(rh, std::string(sr.error().error_description()) == std::string(testcase.expect_error_.error_description())); + } + 
REHEARSE(rh, sr.error().whitespace() == testcase.expect_error_.whitespace()); + REHEARSE(rh, sr.error().tk_start() == testcase.expect_error_.tk_start()); + REHEARSE(rh, sr.error().error_pos() == testcase.expect_error_.error_pos()); + + log && log(xtag("ok(+)", rh.ok_flag_)); + } + } + } + } + } /*namespace ut*/ } /*namespace xo*/ diff --git a/xo-unit/docs/scaled-unit-class.rst b/xo-unit/docs/scaled-unit-class.rst index 42725e00..ec32844c 100644 --- a/xo-unit/docs/scaled-unit-class.rst +++ b/xo-unit/docs/scaled-unit-class.rst @@ -30,7 +30,7 @@ Context Introduction ------------ -.. code-block::cpp +.. code-block:: cpp #include