xo-tokenizer: docs + error-handling improvement

drop exceptions for return-value error object
This commit is contained in:
Roland Conybeare 2025-06-23 23:08:12 -05:00
commit 6fbfd065a2
30 changed files with 1086 additions and 162 deletions

View file

@ -115,6 +115,6 @@ add_subdirectory(xo-pyjit)
# ---------------------------------------------------------------- # ----------------------------------------------------------------
# documentation. must follow add_subdirectory() for satellite projects # documentation. must follow add_subdirectory() for satellite projects
xo_umbrella_doxygen_deps(xo_flatstring xo_ratio) xo_umbrella_doxygen_deps(xo_flatstring xo_ratio xo_unit xo_tokenizer xo_jit)
xo_umbrella_doxygen_config() xo_umbrella_doxygen_config()
xo_umbrella_sphinx_config(index.rst docs/install.rst) xo_umbrella_sphinx_config(index.rst docs/install.rst docs/glossary.rst)

View file

@ -0,0 +1,35 @@
# ----------------------------------------------------------------
# for example:
# $ PREFIX=/usr/local # for example
# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build
#
# will get
# CMAKE_MODULE_PATH
# from xo-cmake-config --cmake-module-path
#
# and expect .cmake macros in
# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake
# ----------------------------------------------------------------
# Locate the xo-cmake-config helper script.
#
# REQUIRED makes find_program() abort the configure step on its own
# (CMake >= 3.18).  The explicit check below is kept as a backstop for
# CMake versions that do not implement REQUIRED.  On failure find_program()
# leaves the variable set to "<VAR>-NOTFOUND", which if() evaluates as false.
find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED)
if (NOT XO_CMAKE_CONFIG_EXECUTABLE)
    # FATAL_ERROR (not FATAL) is the message mode that actually stops processing
    message(FATAL_ERROR "could not find xo-cmake-config executable")
endif()
message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}")

# When built standalone (not as a submodule of an umbrella build),
# and the caller either left CMAKE_MODULE_PATH empty or passed the literal
# placeholder value "prefix", ask xo-cmake-config where the xo cmake macros live.
if (NOT XO_SUBMODULE_BUILD)
    if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL "prefix"))
        # default to typical install location for xo-project-macros.
        # Strip trailing whitespace so a newline printed by the helper
        # does not end up embedded in the module search path.
        execute_process(
            COMMAND "${XO_CMAKE_CONFIG_EXECUTABLE}" --cmake-module-path
            OUTPUT_VARIABLE CMAKE_MODULE_PATH
            RESULT_VARIABLE xo_cmake_config_result
            OUTPUT_STRIP_TRAILING_WHITESPACE)
        if (NOT "${xo_cmake_config_result}" STREQUAL "0")
            message(FATAL_ERROR
                "${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path failed (result=${xo_cmake_config_result})")
        endif()
        message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}")
    endif()
endif()
# needs to have been installed somewhere on CMAKE_MODULE_PATH,
# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX)
#
include(xo_macros/xo_cxx)
xo_cxx_bootstrap_message()

View file

@ -44,3 +44,11 @@ pygments_style = 'sphinx'
html_theme = 'sphinx_rtd_theme' html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static'] html_static_path = ['_static']
html_favicon = '_static/img/favicon.ico' html_favicon = '_static/img/favicon.ico'
# disable caching (at least helpful in development)
html_meta = {
'http-equiv=Cache-Control': 'no-cache, no-store, must-revalidate',
'http-equiv=Pragma': 'no-cache',
'http-equiv=Expires': '0'
}

View file

@ -109,7 +109,7 @@ let
# #
xo-expression = self.callPackage pkgs/xo-expression.nix {}; xo-expression = self.callPackage pkgs/xo-expression.nix {};
xo-pyexpression = self.callPackage pkgs/xo-pyexpression.nix {}; xo-pyexpression = self.callPackage pkgs/xo-pyexpression.nix {};
xo-tokenizer = self.callPackage pkgs/xo-tokenizer.nix {}; xo-tokenizer = self.callPackage pkgs/xo-tokenizer.nix { buildDocs = true; };
xo-reader = self.callPackage pkgs/xo-reader.nix {}; xo-reader = self.callPackage pkgs/xo-reader.nix {};
xo-jit = self.callPackage pkgs/xo-jit.nix { #stdenv = jitStdenv; xo-jit = self.callPackage pkgs/xo-jit.nix { #stdenv = jitStdenv;
@ -152,11 +152,18 @@ pkgs.mkShell {
pkgs.python3Packages.python pkgs.python3Packages.python
pkgs.python3Packages.pybind11 pkgs.python3Packages.pybind11
pkgs.python3Packages.sphinx-rtd-theme pkgs.python3Packages.sphinx-rtd-theme
#pkgs.python3Packages.sphinx-autobuild # needs patch for typeguard; defer for now
pkgs.python3Packages.breathe pkgs.python3Packages.breathe
pkgs.python3Packages.sphinxcontrib-ditaa pkgs.python3Packages.sphinxcontrib-ditaa
pkgs.python3Packages.sphinxcontrib-plantuml pkgs.python3Packages.sphinxcontrib-plantuml
pkgs.python3Packages.pillow pkgs.python3Packages.pillow
pkgs.gdb
pkgs.emacs
pkgs.ditaa
pkgs.ripgrep
pkgs.git
pkgs.cloc pkgs.cloc
pkgs.sphinx pkgs.sphinx
@ -169,6 +176,8 @@ pkgs.mkShell {
pkgs.eigen pkgs.eigen
pkgs.cmake pkgs.cmake
pkgs.catch2 pkgs.catch2
pkgs.zlib
pkgs.unzip
]; ];
shellHook = '' shellHook = ''

View file

@ -1 +1,12 @@
scm = schematika .. _glossary:
Glossary
--------
.. glossary::
schematika
scm
| Experimental programming language.
| Designed for convenient integration with C++ and python.
.. toctree::

View file

@ -92,8 +92,11 @@ Aternatively can enter nix environment, then follow instructions for cmake build
# etc # etc
Development
===========
LSP Setup LSP Setup
========= ---------
To setup xo-umbrella2 build to work with a language server: To setup xo-umbrella2 build to work with a language server:
@ -105,3 +108,13 @@ To setup xo-umbrella2 build to work with a language server:
In this case subsystem LSP setup should be omitted, git root is ``path/to/xo-umbrella2``, In this case subsystem LSP setup should be omitted, git root is ``path/to/xo-umbrella2``,
not ``path/to/xo-umbrella2/xo-ratio`` etc. not ``path/to/xo-umbrella2/xo-ratio`` etc.
Sphinx Autobuild Setup
----------------------
To serve cache-busting headers
.. code-block::
$ cd xo-umbrella2
$ sphinx-autobuild . .build/sphinx/html --port 3000

View file

@ -17,5 +17,6 @@ Some features: kalman filters, stochastic processes, complex event processing, s
xo-unit/docs/index xo-unit/docs/index
xo-tokenizer/docs/index xo-tokenizer/docs/index
xo-jit/docs/index xo-jit/docs/index
glossary
genindex genindex
search search

View file

@ -1,5 +1,8 @@
# xo-tokenizer/CMakeLists.txt # xo-tokenizer/docs/CMakeLists.txt
xo_doxygen_collect_deps() xo_doxygen_collect_deps()
xo_docdir_doxygen_config() xo_docdir_doxygen_config()
xo_docdir_sphinx_config(index.rst install.rst) xo_docdir_sphinx_config(
index.rst install.rst examples.rst implementation.rst
token-class.rst tokenizer-error-class.rst span-class.rst tokentype-enum.rst
)

1
xo-tokenizer/docs/_static/README vendored Normal file
View file

@ -0,0 +1 @@
add any static {.html, .js, ..} files for sphinx to pickup here

Binary file not shown.

After

Width:  |  Height:  |  Size: 302 KiB

39
xo-tokenizer/docs/conf.py Normal file
View file

@ -0,0 +1,39 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = 'xo tokenizer documentation'
copyright = '2024-2025, Roland Conybeare'
author = 'Roland Conybeare'
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
#extensions = []
extensions = [ "breathe",
"sphinx.ext.mathjax", # inline math
"sphinx.ext.autodoc", # generate info from docstrings
"sphinxcontrib.ditaa", # diagrams-through-ascii-art
"sphinxcontrib.plantuml" # text -> uml diagrams
]
# note: breathe requires doxygen xml output -> must have GENERATE_XML = YES in Doxyfile.in
# match project name in Doxyfile.in
breathe_default_project = "xodoxxml"
templates_path = ['_templates']
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
pygments_style = 'sphinx'
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
#html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'
html_static_path = ['_static']
html_favicon = '_static/img/favicon.ico'

View file

@ -1,6 +1,6 @@
.. _examples: .. _examples:
.. toctree .. toctree::
:maxdepth: 2 :maxdepth: 2
Examples Examples
@ -31,19 +31,28 @@ See ``xo-tokenizer/examples/tokenrepl`` for (slighly elaborated) version of code
// input may contain multiple tokens // input may contain multiple tokens
while (!input.empty()) { while (!input.empty()) {
auto [tk, nread] = tkz.scan(input); auto [tk, consumed, error] = tkz.scan(input);
if (tk.is_valid()) { if (tk.is_valid()) {
cout << tk; cout << tk;
} }
input = input.after_prefix(nread); input = input.after_prefix(consumed.size());
} }
} }
auto tk = tkz.notify_eof(); auto [tk, consumed, error] = tkz.notify_eof(span_type::from_string(input_str));
if (tk.is_valid()) { if (tk.is_valid()) {
cout << tk; cout << tk;
} }
} }
.. code-block::
:linenos:
$ .build/xo-tokenizer/utest/utest.tokenizer
> 123
<token :type tk_i64 :text 123>
> 123e5
<token :type tk_f64 :text 123e5>

View file

@ -0,0 +1,36 @@
.. _implementation:
.. toctree::
:maxdepth: 2
Components
==========
Library dependency tower for *xo-tokenizer*:
.. ditaa::
+-----------------+
| xo_unit |
+-----------------+
| xo_indentlog |
+-----------------+
| xo_cmake |
+-----------------+
Install instructions :doc:`here<install>`
Abstraction tower for *xo-tokenizer* components:
.. ditaa::
:--scale: 0.85
+-----------------------------------------+----------+
| tokenizer | |
+-----------------------------------------+ |
| scan_result | |
+-----------------+-----------------------+ buffer |
| token | tokenizer_error | |
+-----------------+-----------------------+ |
| tokentype | span | |
+-----------------+-----------------------+----------+

View file

@ -1,6 +1,6 @@
.. xo-tokenizer documentation master file. .. xo-tokenizer documentation master file.
xo-tokenizer documentation Xo-tokenizer documentation
========================== ==========================
xo-tokenizer provides a tokenizer for the Schematika language. xo-tokenizer provides a tokenizer for the Schematika language.
@ -15,5 +15,8 @@ may appear in variable names: ``one-of-those-days`` is an ordinary symbol.
install install
examples examples
genindex implementation
search token-class
tokenizer-error-class
span-class
tokentype-enum

View file

@ -1,8 +1,23 @@
.. _install: .. _install:
.. toctree .. toctree::
:maxdepth: 2 :maxdepth: 2
Source
======
Source code lives on github `here`_
.. _here: https://github.com/rconybea/xo-tokenizer
To clone from git:
.. code-block:: bash
git clone https://github.com/rconybea/xo-tokenizer
Tested with gcc 13.3
Install Install
======= =======

View file

@ -0,0 +1,84 @@
.. _span-class:
Span
====
Identify an unowned contiguous memory range
Context
-------
.. ditaa::
:--scale: 0.85
+-----------------------------------------+----------+
| tokenizer | |
+-----------------------------------------+ |
| scan_result | |
+-----------------+-----------------------+ buffer |
| token | tokenizer_error | |
+-----------------+-----------------------+ |
| tokentype |cBLU span | |
+-----------------+-----------------------+----------+
.. code-block:: cpp
#include <xo/tokenizer/span.hpp>
.. uml::
:scale: 99%
:align: center
allowmixing
object span1<<span>>
span1 : lo = p
span1 : hi = p+25
object dest<<memory>>
dest : def fact(n : i64) { ... }
- Identify a sequence of characters stored in contiguous memory.
- Lightweight, consists of a pair of pointers.
- Does not own storage. Lifetime management for target memory is
up to the caller.
Class
-----
.. doxygenclass:: xo::scm::span
Member Variables
----------------
.. doxygengroup:: span-instance-vars
Type Traits
-----------
.. doxygengroup:: span-type-traits
Constructors
------------
.. doxygengroup:: span-ctors
Access Methods
--------------
.. doxygengroup:: span-access-methods
General Methods
---------------
.. doxygengroup:: span-general-methods
Operators
---------
.. doxygengroup:: span-operators

View file

@ -0,0 +1,94 @@
.. _token-class:
Token
=====
Represent a single lexical token in the Schematika language
Context
-------
.. ditaa::
:--scale: 0.85
+-----------------------------------------+----------+
| tokenizer | |
+-----------------------------------------+ |
| scan_result | |
+-----------------+-----------------------+ buffer |
|cBLU token | tokenizer_error | |
+-----------------+-----------------------+ |
| tokentype | span | |
+-----------------+-----------------------+----------+
.. code-block:: cpp
#include <xo/tokenizer/token.hpp>
.. uml::
:scale: 99%
:align: center
allowmixing
object tk1<<token>>
tk1 : tk_type = tk_i64
tk1 : text = "123"
object tk2<<token>>
tk2 : tk_type = tk_string
tk2 : text = "the quick brown fox"
- Represent a single lexical token
- Does not share any storage with original input stream
(maintains a local copy).
- Remembers copied input extent.
Convert on demand to native untagged representation
Example
-------
.. code-block:: cpp
void foo() {
using namespace xo::scm;
token<char> tk = token<char>::i64_token("123");
tk.is_valid(); // -> true
tk.text(); // -> "123"s;
tk.tk_type(); // -> tokentype::tk_i64
tk.i64_value(); // -> 123
cout << tk << endl; // -> <token :type i64 :text 123>
}
Class
-----
.. doxygenclass:: xo::scm::token
Instance Variables
------------------
.. doxygengroup:: token-instance-vars
Constructors
------------
.. doxygengroup:: token-ctors
Access Methods
--------------
.. doxygengroup:: token-access-methods
General Methods
---------------
.. doxygengroup:: token-general-methods

View file

@ -0,0 +1,27 @@
.. _tokenizer-class:
Tokenizer
=========
Parse a Schematika character stream into lexical tokens
Context
-------
.. ditaa::
:--scale: 0.85
+-----------------------------------------+----------+
|cBLU tokenizer | |
+-----------------------------------------+ |
| scan_result | |
+-----------------+-----------------------+ buffer |
| token | tokenizer_error | |
+-----------------+-----------------------+ |
| tokentype | span | |
+-----------------+-----------------------+----------+
.. code-block:: cpp
#include <xo/tokenizer/tokenizer.hpp>

View file

@ -0,0 +1,52 @@
.. _tokenizer-error-class:
Tokenizer Error
===============
Represent a possible tokenizer error result, including parsing context
Context
-------
.. ditaa::
:--scale: 0.85
+-----------------------------------------+----------+
| tokenizer | |
+-----------------------------------------+ |
| scan_result | |
+-----------------+-----------------------+ buffer |
| token |cBLU tokenizer_error | |
+-----------------+-----------------------+ |
| tokentype | span | |
+-----------------+-----------------------+----------+
.. code-block:: cpp
#include <xo/tokenizer/tokenizer_error.hpp>
Class
------
.. doxygenclass:: xo::scm::tokenizer_error
Instance Variables
------------------
.. doxygengroup:: tokenizer-error-instance-vars
Constructors
------------
.. doxygengroup:: tokenizer-error-ctors
Access Methods
--------------
.. doxygengroup:: tokenizer-error-access-methods
General Methods
---------------
.. doxygengroup:: tokenizer-error-general-methods

View file

@ -0,0 +1,34 @@
.. _tokentype-enum:
Tokentype
=========
Distinguish different lexical tokens for the Schematika language.
Context
-------
.. ditaa::
:--scale: 0.85
+-----------------------------------------+----------+
| tokenizer | |
+-----------------------------------------+ |
| scan_result | |
+-----------------+-----------------------+ buffer |
| token | tokenizer_error | |
+-----------------+-----------------------+ |
|cBLU tokentype | span | |
+-----------------+-----------------------+----------+
.. code-block:: cpp
#include <xo/tokenizer/tokentype.hpp>
Enum
----
.. doxygenfunction:: xo::scm::tokentype_descr
.. doxygenfunction:: xo::scm::operator<<(std::ostream&,tokentype)

View file

@ -41,21 +41,35 @@ main() {
if (tk.is_valid()) { if (tk.is_valid()) {
cout << tk << endl; cout << tk << endl;
} else if (error.is_error()) { } else if (error.is_error()) {
cout << "parsing error: " << error << endl; cout << "parsing error: " << endl;
/* discard remainder of input line */ error.report(cout);
break; break;
} }
input = input.after_prefix(consumed.size()); input = tkz.consume(consumed, input);
//input = input.after_prefix(consumed.size());
} }
/* discard stashed remainder of input line
* (for nicely-formatted errors)
*/
tkz.discard_current_line();
} }
auto [tk, consumed, error] = tkz.notify_eof(span_type::from_string(input_str)); {
span_type input = span_type::from_string(input_str);
if (tk.is_valid()) { auto [tk, consumed, error] = tkz.notify_eof(input);
cout << tk << endl;
} else if (error.is_error()) { input = tkz.consume(consumed, input);
cout << "parsing error: " << error << endl;
if (tk.is_valid()) {
cout << tk << endl;
} else if (error.is_error()) {
cout << "parsing error: " << endl;
error.report(cout);
}
} }
} }

View file

@ -10,8 +10,10 @@
namespace xo { namespace xo {
namespace scm { namespace scm {
/** @brief Represent result of parsing one input token. /** @class scan_result
* @brief Represent result of parsing one input token.
* *
* @code
* Possible outcomes fall into several categories * Possible outcomes fall into several categories
* (with T: @c token_.is_valid(), E: @cerror_.is_error()) * (with T: @c token_.is_valid(), E: @cerror_.is_error())
* *
@ -21,6 +23,7 @@ namespace xo {
* | true | false | parsed token in T | * | true | false | parsed token in T |
* | false | true | parse error in E | * | false | true | parse error in E |
* *
* @endcode
**/ **/
template <typename CharT> template <typename CharT>
class scan_result { class scan_result {
@ -37,6 +40,7 @@ namespace xo {
static scan_result make_whitespace(const span_type & prefix_input); static scan_result make_whitespace(const span_type & prefix_input);
static scan_result make_partial(const span_type & prefix_input); static scan_result make_partial(const span_type & prefix_input);
static scan_result make_error(const error_type & error);
bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); } bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); }
bool is_token() const { return token_.is_valid(); } bool is_token() const { return token_.is_valid(); }
@ -67,6 +71,12 @@ namespace xo {
return scan_result(token_type::invalid(), prefix_input /*consumed*/); return scan_result(token_type::invalid(), prefix_input /*consumed*/);
} }
/** Build a scan_result that reports @p error.
 *  The result carries an invalid token (@c token_type::invalid())
 *  and a null span (@c span_type::make_null()) alongside the error.
 *
 *  @param error  error to store in the returned result
 *  @return scan_result holding @p error with no valid token
 **/
template <typename CharT>
auto scan_result<CharT>::make_error(const error_type & error) -> scan_result
{
return scan_result(token_type::invalid(), span_type::make_null(), error);
}
} /*namespace scm*/ } /*namespace scm*/
} /*namespace xo*/ } /*namespace xo*/

View file

@ -11,21 +11,32 @@ namespace xo {
namespace scm { namespace scm {
/** @class span compression/span.hpp /** @class span compression/span.hpp
* *
* @brief Represents a contiguous memory range, without ownership. * @brief A contiguous range of characters, without ownership.
* *
* @tparam CharT type for elements referred to by this span. * @tparam CharT type for elements referred to by this span.
**/ **/
template <typename CharT> template <typename CharT>
class span { class span {
public: public:
/** @brief typealias for span size (in units of CharT) **/ /** @defgroup span-type-traits span type traits **/
///@{
/** typealias for span size (in units of CharT) **/
using size_type = std::uint64_t; using size_type = std::uint64_t;
///@}
public: public:
/** @brief create span for the contiguous memory range [@p lo, @p hi) **/ /** @defgroup span-ctors span constructors **/
///@{
/** Create span for the contiguous memory range [@p lo, @p hi) **/
span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
/** @brief create a null span (i.e. with null @p lo, @p hi pointers) **/ /** Create a null span (i.e. with null @p lo, @p hi pointers)
* A null span can be concatenated with any other span
* without triggering matching-endpoint asserts.
**/
static span make_null() { return span(nullptr, nullptr); } static span make_null() { return span(nullptr, nullptr); }
/** @brief create span for C-style string @p cstr **/ /** @brief create span for C-style string @p cstr **/
@ -65,16 +76,20 @@ namespace xo {
return span(lo, hi); return span(lo, hi);
} }
///@{ ///@}
/** @name getters **/ /** @defgroup span-access-methods **/
///@{
CharT * lo() const { return lo_; } /* get member span::lo_ */ CharT * lo() const { return lo_; } /* get member span::lo_ */
CharT * hi() const { return hi_; } /* get member span::hi_ */ CharT * hi() const { return hi_; } /* get member span::hi_ */
///@} ///@}
/** @brief create new span over supplied type, /** @defgroup span-general-methods **/
///@{
/** Create new span over supplied type,
* with identical (possibly misaligned) endpoints. * with identical (possibly misaligned) endpoints.
* *
* @warning * @warning
@ -121,7 +136,9 @@ namespace xo {
return after_prefix(prefix.size()); return after_prefix(prefix.size());
} }
/** @brief create span starting with position p **/ /** Create span starting with position @p p.
* Does boundary checking; will return empty span if @p p is outside @c [lo_,hi_]
**/
span suffix_from(CharT * p) const { span suffix_from(CharT * p) const {
if ((lo_ <= p) && (p <= hi_)) if ((lo_ <= p) && (p <= hi_))
return span(p, hi_); return span(p, hi_);
@ -129,13 +146,16 @@ namespace xo {
return span(hi_, hi_); return span(hi_, hi_);
} }
/** @brief true iff this span is null. distinct from empty. **/ /** true iff this span is null. distinct from empty. **/
bool is_null() const { return lo_ == nullptr && hi_ == nullptr; } bool is_null() const { return lo_ == nullptr && hi_ == nullptr; }
/** @brief true iff this span is empty (comprises 0 elements). **/ /** true iff this span is empty (comprises 0 elements). **/
bool empty() const { return lo_ == hi_; } bool empty() const { return lo_ == hi_; }
/** @brief report the number of elements (of type CharT) in this span. **/ /** report the number of elements (of type CharT) in this span. **/
size_type size() const { return hi_ - lo_; } size_type size() const { return hi_ - lo_; }
/** increase extent of this span to include @p x.
* Requires @c hi() == @c x.lo()
**/
span & operator+=(const span & x) { span & operator+=(const span & x) {
if (hi_ == x.lo_) { if (hi_ == x.lo_) {
hi_ = x.hi_; hi_ = x.hi_;
@ -154,15 +174,18 @@ namespace xo {
<< " :text " << xo::print::quot(std::string_view(lo_, hi_)) << " :text " << xo::print::quot(std::string_view(lo_, hi_))
<< ">"; << ">";
} }
///@}
private: private:
/** @defgroup span-instance-vars **/
///@{ ///@{
/** @brief start of span /** start of span.
Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
**/ **/
CharT * lo_ = nullptr; CharT * lo_ = nullptr;
/** @brief end of span
/** @brief end of span.
Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
**/ **/
CharT * hi_ = nullptr; CharT * hi_ = nullptr;
@ -170,6 +193,12 @@ namespace xo {
///@} ///@}
}; /*span*/ }; /*span*/
/** @defgroup span-operators **/
///@{
/** compare spans for equality.
* Two spans are equal iff both endpoints match exactly.
**/
template <typename CharT> template <typename CharT>
inline bool inline bool
operator==(const span<CharT> & lhs, const span<CharT> & rhs) { operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
@ -177,6 +206,9 @@ namespace xo {
&& (lhs.hi() == rhs.hi())); && (lhs.hi() == rhs.hi()));
} }
/** compare spans for inequality.
* Two spans are unequal if either paired endpoint differs.
**/
template <typename CharT> template <typename CharT>
inline bool inline bool
operator!=(const span<CharT> & lhs, const span<CharT> & rhs) { operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
@ -184,6 +216,7 @@ namespace xo {
|| (lhs.hi() != rhs.hi())); || (lhs.hi() != rhs.hi()));
} }
/** print a summary of @p x on stream @p os. Intended for diagnostics **/
template <typename CharT> template <typename CharT>
inline std::ostream & inline std::ostream &
operator<<(std::ostream & os, operator<<(std::ostream & os,
@ -191,5 +224,33 @@ namespace xo {
x.print(os); x.print(os);
return os; return os;
} }
///@}
} /*namespace scm*/ } /*namespace scm*/
namespace print {
    /** Thin wrapper around a span, used to select the
     *  element-by-element stream inserter defined below.
     *  Stores the span by value (a copy of the endpoint pair).
     **/
    template <typename CharT>
    class printspan_impl {
    public:
        printspan_impl(xo::scm::span<CharT> x) : span_{x} {}

        /** span whose elements will be written **/
        xo::scm::span<CharT> span_;
    };

    /** Wrap @p span so that inserting the result into a stream
     *  writes the span's elements.
     **/
    template <typename CharT>
    printspan_impl<CharT> printspan(const xo::scm::span<CharT>& span) {
        return printspan_impl<CharT>(span);
    }

    /** Write each element of the wrapped span to @p os, in order,
     *  from @c lo() up to (but not including) @c hi().
     *  Returns @p os.
     **/
    template <typename CharT>
    inline std::ostream &
    operator<< (std::ostream & os,
                const printspan_impl<CharT> & x)
    {
        const CharT * cursor = x.span_.lo();

        while (cursor < x.span_.hi()) {
            os << *cursor;
            ++cursor;
        }

        return os;
    }
}
} /*namespace xo*/ } /*namespace xo*/

View file

@ -43,75 +43,137 @@ namespace xo {
} }
} }
/** @class token
* @brief Represent a Schematika lexical token
**/
template <typename CharT> template <typename CharT>
class token { class token {
public: public:
/** @defgroup token-ctors token constructors **/
///@{
/** default ctor creates token with type @c tk_invalid **/
token() = default; token() = default;
/** create token with type @c tk_type and input text @c text **/
token(tokentype tk_type, const std::string & text = "") token(tokentype tk_type, const std::string & text = "")
: tk_type_{tk_type}, text_{text} {} : tk_type_{tk_type}, text_{text} {}
/** create invalid token (same as null ctor, but explicit) **/
static token invalid() { return token(); } static token invalid() { return token(); }
/** Create token representing 64-bit signed integer literal parsed from decimal @p txt.
* The string @p txt must be a decimal integer literal, since @ref i64_value re-parses @p txt.
**/
static token i64_token(const std::string & txt) { static token i64_token(const std::string & txt) {
return token(tokentype::tk_i64, txt); return token(tokentype::tk_i64, txt);
} }
/** create token representing 64-bit floating-point literal parsed from decimal @p txt
* The string @p txt must be a decimal floating-point literal, since @ref f64_value re-parses @p txt.
**/
static token f64_token(const std::string & txt) { static token f64_token(const std::string & txt) {
return token(tokentype::tk_f64, txt); return token(tokentype::tk_f64, txt);
} }
/** create token representing literal string parsed from @p txt **/
static token string_token(const std::string & txt) { static token string_token(const std::string & txt) {
return token(tokentype::tk_string, txt); return token(tokentype::tk_string, txt);
} }
/** create token representing a symbol parsed from @p txt.
* Note that not all strings are valid symbol names.
**/
static token symbol_token(const std::string & txt) { static token symbol_token(const std::string & txt) {
return token(tokentype::tk_symbol, txt); return token(tokentype::tk_symbol, txt);
} }
/** token representing left angle bracket @c "<" **/
static token leftangle() { return token(tokentype::tk_leftangle); } static token leftangle() { return token(tokentype::tk_leftangle); }
/** token representing right angle bracket @c ">" **/
static token rightangle() { return token(tokentype::tk_rightangle); } static token rightangle() { return token(tokentype::tk_rightangle); }
/** token representing left parenthesis @c "(" **/
static token leftparen() { return token(tokentype::tk_leftparen); } static token leftparen() { return token(tokentype::tk_leftparen); }
/** token representing right parenthesis @c ")" **/
static token rightparen() { return token(tokentype::tk_rightparen); } static token rightparen() { return token(tokentype::tk_rightparen); }
/** token representing left bracket @c "[" **/
static token leftbracket() { return token(tokentype::tk_leftbracket); } static token leftbracket() { return token(tokentype::tk_leftbracket); }
/** token representing right bracket @c "]" **/
static token rightbracket() { return token(tokentype::tk_rightbracket); } static token rightbracket() { return token(tokentype::tk_rightbracket); }
/** token representing left brace @c "{" **/
static token leftbrace() { return token(tokentype::tk_leftbrace); } static token leftbrace() { return token(tokentype::tk_leftbrace); }
/** token representing right brace @c "}" **/
static token rightbrace() { return token(tokentype::tk_rightbrace); } static token rightbrace() { return token(tokentype::tk_rightbrace); }
/** token representing period @c "." **/
static token dot() { return token(tokentype::tk_dot); } static token dot() { return token(tokentype::tk_dot); }
/** token representing comma @c "," **/
static token comma() { return token(tokentype::tk_comma); } static token comma() { return token(tokentype::tk_comma); }
/** token representing colon @c ":" **/
static token colon() { return token(tokentype::tk_colon); } static token colon() { return token(tokentype::tk_colon); }
/** token representing double-colon @c "::" **/
static token doublecolon() { return token(tokentype::tk_doublecolon); } static token doublecolon() { return token(tokentype::tk_doublecolon); }
/** token representing semicolon @c ";" **/
static token semicolon() { return token(tokentype::tk_semicolon); } static token semicolon() { return token(tokentype::tk_semicolon); }
/** token representing single-assignment @c "=" **/
static token singleassign() { return token(tokentype::tk_singleassign); } static token singleassign() { return token(tokentype::tk_singleassign); }
/** token representing unrestricted assignment @c ":=" **/
static token assign_token() { return token(tokentype::tk_assign); } static token assign_token() { return token(tokentype::tk_assign); }
/** token representing indirection @c "->" **/
static token yields() { return token(tokentype::tk_yields); } static token yields() { return token(tokentype::tk_yields); }
/** token for @c "+" **/
static token plus_token() { return token(tokentype::tk_plus); } static token plus_token() { return token(tokentype::tk_plus); }
/** token for @c "-" **/
static token minus_token() { return token(tokentype::tk_minus); } static token minus_token() { return token(tokentype::tk_minus); }
/** token for @c "*" **/
static token star_token() { return token(tokentype::tk_star); } static token star_token() { return token(tokentype::tk_star); }
/** token for @c "/" **/
static token slash_token() { return token(tokentype::tk_slash); } static token slash_token() { return token(tokentype::tk_slash); }
/** token representing keyword @c type **/
static token type() { return token(tokentype::tk_type); } static token type() { return token(tokentype::tk_type); }
/** token representing keyword @c def **/
static token def() { return token(tokentype::tk_def); } static token def() { return token(tokentype::tk_def); }
/** token representing keyword @c lambda **/
static token lambda() { return token(tokentype::tk_lambda); } static token lambda() { return token(tokentype::tk_lambda); }
/** token representing keyword @c if **/
static token if_token() { return token(tokentype::tk_if); } static token if_token() { return token(tokentype::tk_if); }
/** token representing keyword @c let **/
static token let() { return token(tokentype::tk_let); } static token let() { return token(tokentype::tk_let); }
/** token representing keyword @c in **/
static token in() { return token(tokentype::tk_in); } static token in() { return token(tokentype::tk_in); }
/** token representing keyword @c end **/
static token end() { return token(tokentype::tk_end); } static token end() { return token(tokentype::tk_end); }
///@}
/** @defgroup token-access-methods **/
///@{
tokentype tk_type() const { return tk_type_; } tokentype tk_type() const { return tk_type_; }
const std::string & text() const { return text_; } const std::string & text() const { return text_; }
///@}
/** @defgroup token-general-methods **/
///@{
/** true if token understood to represent valid input
* i.e. any token type except @c tk_invalid
**/
bool is_valid() const { return tk_type_ != tokentype::tk_invalid; } bool is_valid() const { return tk_type_ != tokentype::tk_invalid; }
/** true for sentinel token with type tk_invalid **/
bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; } bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; }
/** expect input matching /** expect input matching @c "[+|-][0-9][0-9]*" **/
* [+|-][0-9][0-9]*
**/
std::int64_t i64_value() const; std::int64_t i64_value() const;
/** expect input matching
* [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]* /** expect input matching @c "[+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]*" **/
**/
double f64_value() const; double f64_value() const;
/** print human-readable token representation on stream @p os **/ /** print human-readable token representation on stream @p os **/
void print(std::ostream & os) const; void print(std::ostream & os) const;
///@}
private: private:
/** @defgroup token-instance-vars **/
///@{
/** category for this token **/ /** category for this token **/
tokentype tk_type_ = tokentype::tk_invalid; tokentype tk_type_ = tokentype::tk_invalid;
@ -124,6 +186,8 @@ namespace xo {
* tk_symbol * tk_symbol
**/ **/
std::string text_; std::string text_;
///@}
}; /*token*/ }; /*token*/
template <typename CharT> template <typename CharT>

View file

@ -13,9 +13,15 @@
namespace xo { namespace xo {
namespace scm { namespace scm {
/** /** @class tokenizer
* @brief Parse a Schematika character stream into lexical tokens
*
* Use: * Use:
*
* @code * @code
* // see xo-tokenizer/example/tokenrepl/tokenrepl.cpp
* // for exact working code
*
* using tokenizer_type = tokenizer<char>; * using tokenizer_type = tokenizer<char>;
* using span_type = tokenizer_type::span_type; * using span_type = tokenizer_type::span_type;
* *
@ -24,21 +30,19 @@ namespace xo {
* *
* while (!input.empty()) { * while (!input.empty()) {
* auto res = tkz.scan(input); * auto res = tkz.scan(input);
* const auto & tk = res.first; * auto [tk, consumed, error] = res.first;
* *
* // do something with tk if tk.is_valid() * // do something with tk if tk.is_valid()
* *
* input = input.after_prefix(res.second); * input = tkz.consume(res.second, input);
* } * }
* *
* if endofinput { * if endofinput {
* auto tk = tzk.notify_eof() * auto [tk, consumed, error] = tzk.notify_eof()
* *
* // do something with tk if tk.is_valid() * // do something with (final) tk if tk.is_valid()
* } * }
* *
* // expect !tkz.has_prefix()
*
* @endcode * @endcode
* *
* See tokentype.hpp for token types * See tokentype.hpp for token types
@ -47,6 +51,7 @@ namespace xo {
class tokenizer { class tokenizer {
public: public:
using token_type = token<CharT>; using token_type = token<CharT>;
using error_type = tokenizer_error<CharT>;
using span_type = span<const CharT>; using span_type = span<const CharT>;
using result_type = scan_result<CharT>; using result_type = scan_result<CharT>;
@ -122,11 +127,22 @@ namespace xo {
**/ **/
result_type scan2(const span_type & input, bool eof); result_type scan2(const span_type & input, bool eof);
/** @retval span with @p consumed permanently removed from @p input.
*
* Purpose of this method is to update @ref current_pos_.
**/
span_type consume(const span_type & consumed, const span_type & input);
/** discard current line after error. Just cleans up error-reporting state **/
void discard_current_line();
/** notify end of input, resolving any ambiguous input stashed in .prefix /** notify end of input, resolving any ambiguous input stashed in .prefix
**/ **/
result_type notify_eof(const span_type & input); result_type notify_eof(const span_type & input);
private: private:
void capture_current_line(const span_type & input);
result_type scan_completion(const span_type & whitespace, result_type scan_completion(const span_type & whitespace,
const CharT* token_end, const CharT* token_end,
const span_type & input); const span_type & input);
@ -134,8 +150,10 @@ namespace xo {
private: private:
/** true to log tokenizer activity to stdout **/ /** true to log tokenizer activity to stdout **/
bool debug_flag_ = false; bool debug_flag_ = false;
/** remember start of current line here **/ /** remember current input line. Used only to report errors **/
span_type current_line_ = span_type::make_null(); span_type current_line_ = span_type::make_null();
/** current input position within @ref current_line_ **/
size_t current_pos_ = 0;
/** Accumulate partial token here. /** Accumulate partial token here.
* This will happen if input sent to @ref tokenizer::scan * This will happen if input sent to @ref tokenizer::scan
* ends without a determinate token boundary. * ends without a determinate token boundary.
@ -348,29 +366,35 @@ namespace xo {
} else if (exponent_flag && !exponent_digit_flag) { } else if (exponent_flag && !exponent_digit_flag) {
exponent_sign_flag = true; exponent_sign_flag = true;
} else { } else {
throw std::runtime_error return result_type::make_error
(tostr("tokenizer::assemble_token", (error_type(__FUNCTION__ /*src_function*/,
": improperly placed sign indicator", "improperly placed sign indicator",
xtag("pos", ix - tk_start), current_line_,
xtag("char", *ix))); current_pos_,
initial_whitespace,
(ix - tk_start)));
} }
} else if (*ix == '.') { } else if (*ix == '.') {
if (period_flag) { if (period_flag) {
throw (std::runtime_error return result_type::make_error
(tostr("tokenizer::assemble_token", (error_type(__FUNCTION__ /*src_function*/,
": duplicate decimal point", "duplicate decimal point in numeric literal",
xtag("pos", ix - tk_start), current_line_,
xtag("char", *ix)))); current_pos_,
initial_whitespace,
(ix - tk_start)));
} }
period_flag = true; period_flag = true;
} else if ((*ix == 'e') || (*ix == 'E')) { } else if ((*ix == 'e') || (*ix == 'E')) {
if (exponent_flag) { if (exponent_flag) {
throw (std::runtime_error return result_type::make_error
(tostr("tokenizer::assemble_token", (error_type(__FUNCTION__ /*src_function*/,
": duplicate exponent marker", "duplicate exponent marker in numeric literal",
xtag("pos", ix - tk_start), current_line_,
xtag("char", *ix)))); current_pos_,
initial_whitespace,
(ix - tk_start)));
} }
exponent_flag = true; exponent_flag = true;
@ -382,12 +406,13 @@ namespace xo {
number_flag = true; number_flag = true;
} }
} else { } else {
/* invalid input */ return result_type::make_error
throw (std::runtime_error (error_type(__FUNCTION__ /*src_function*/,
(tostr("tokenizer::assemble_token", "unexpected character in numeric constant" /*error_description*/,
": unexpected character in numeric constant", current_line_,
xtag("pos", ix - tk_start), current_pos_,
xtag("char", *ix)))); initial_whitespace,
(ix - tk_start)));
} }
} }
@ -443,11 +468,12 @@ namespace xo {
++ix; /*skip initial " char*/ ++ix; /*skip initial " char*/
/* true on final " */
bool endofstring = false;
for (; ix != token_text.hi(); ++ix) { for (; ix != token_text.hi(); ++ix) {
log && log(xtag("*ix", *ix)); log && log(xtag("*ix", *ix));
bool endofstring = false;
switch(*ix) { switch(*ix) {
case '"': case '"':
endofstring = true; endofstring = true;
@ -461,11 +487,13 @@ namespace xo {
++ix; ++ix;
if (ix == token_text.hi()) { if (ix == token_text.hi()) {
throw std::runtime_error return result_type::make_error
(tostr("tokenizer::assemble_token", (error_type(__FUNCTION__ /*src_function*/,
": malformed string literal", "expecting key following escape character \\",
xtag("input", std::string_view(token_text.lo(), current_line_,
token_text.hi())))); current_pos_,
initial_whitespace,
(ix - tk_start)));
} }
switch(*ix) { switch(*ix) {
@ -490,10 +518,13 @@ namespace xo {
tk_text.push_back('"'); tk_text.push_back('"');
break; break;
default: default:
throw std::runtime_error return result_type::make_error
(tostr("tokenizer::assemble_token", (error_type(__FUNCTION__ /*src_function*/,
": unexpected \\-escaped char", "expecting one of n|r|\"|\\ following escape \\",
xtag("char", *ix))); current_line_,
current_pos_,
initial_whitespace,
(ix - tk_start)));
} }
break; break;
default: default:
@ -505,12 +536,14 @@ namespace xo {
break; break;
} }
if (ix != token_text.hi()) { if (!endofstring) {
throw std::runtime_error return result_type::make_error
(tostr("tokenizer::assemble_token", (error_type(__FUNCTION__ /*src_function*/,
": expected \" to end string literal", "missing terminating '\"' to complete literal string",
xtag("input", std::string_view(token_text.lo(), current_line_,
token_text.hi())))); current_pos_,
initial_whitespace,
(ix - tk_start)));
} }
log && log(tostr("tokenizer::assemble_token", log && log(tostr("tokenizer::assemble_token",
@ -632,9 +665,13 @@ namespace xo {
} }
if (tk_type == tokentype::tk_invalid) { if (tk_type == tokentype::tk_invalid) {
throw std::runtime_error(tostr("tokenizer::assemble_token", return result_type::make_error
": unexpected input x", (error_type(__FUNCTION__ /*src_function*/,
xtag("x", *ix))); "illegal input character",
current_line_,
current_pos_,
initial_whitespace,
(ix - tk_start)));
} }
if ((tk_type == tokentype::tk_i64) if ((tk_type == tokentype::tk_i64)
@ -719,6 +756,27 @@ namespace xo {
} }
template <typename CharT>
void
tokenizer<CharT>::capture_current_line(const span_type & input)
{
    /* counterpart of discard_current_line() */

    scope log(XO_DEBUG(debug_flag_));

    /* Remember the line that begins at the front of @p input, for error reporting.
     * The line extends up to (not including) the first '\n',
     * or to the end of @p input when no '\n' is present.
     */
    const CharT * const line_begin = input.lo();
    const CharT * line_end = line_begin;

    for (; line_end < input.hi(); ++line_end) {
        if (*line_end == '\n')
            break;
    }

    /* a freshly captured line always starts at position 0 */
    current_line_ = span_type(line_begin, line_end);
    current_pos_ = 0;

    log && log(xtag("current_line", print::printspan(current_line_)));
}
template <typename CharT> template <typename CharT>
auto auto
tokenizer<CharT>::scan(const span_type & input) -> result_type tokenizer<CharT>::scan(const span_type & input) -> result_type
@ -729,21 +787,22 @@ namespace xo {
const CharT * ix = input.lo(); const CharT * ix = input.lo();
if (this->current_line_.is_null()) {
this->capture_current_line(input);
}
/* skip whitespace + remember beginning of most recent line */ /* skip whitespace + remember beginning of most recent line */
while (is_whitespace(*ix) && (ix != input.hi())) { while (is_whitespace(*ix) && (ix != input.hi())) {
if (is_newline(*ix)) { if (is_newline(*ix)) {
++ix; ++ix;
/* look ahead to {end of line, end of input}, whichever comes first */
const CharT * sol = ix;
const CharT * eol = ix;
while ((eol < input.hi()) && (*eol != '\n')) this->capture_current_line(span_type(ix, input.hi()));
++eol;
this->current_line_ = span_type(sol, eol);
} else { } else {
++ix; ++ix;
#ifdef OBSOLETE
++(this->current_pos_);
#endif
} }
} }
@ -818,10 +877,12 @@ namespace xo {
break; break;
} }
} else if ((*ix == '\n') || (*ix == '\r')) { } else if ((*ix == '\n') || (*ix == '\r')) {
throw std::runtime_error return result_type::make_error
(tostr("tokenizer::scan", (error_type(__FUNCTION__ /*src_function*/,
": must use \\n or \\r to encode newline/cr in" "must use \\n or \\r to encode newline/cr in string literal",
" string literal")); current_line_, current_pos_,
whitespace.size(),
(ix - tk_start)));
} }
prev_ch = *ix; prev_ch = *ix;
@ -945,6 +1006,25 @@ namespace xo {
sr2.error()); sr2.error());
} }
template <typename CharT>
auto
tokenizer<CharT>::consume(const span_type & consumed, const span_type & input) -> span_type
{
    /* advance position within the current line by the amount consumed,
     * so that later error reports point at the right column
     */
    current_pos_ += consumed.size();

    /* what is left of @p input once @p consumed is dropped from its front */
    span_type remainder = input.after_prefix(consumed);

    return remainder;
}
template <typename CharT>
void
tokenizer<CharT>::discard_current_line()
{
    /* counterpart of capture_current_line():
     * forget the remembered line and rewind the position within it
     */
    current_pos_ = 0;
    current_line_ = span_type::make_null();
}
template <typename CharT> template <typename CharT>
auto auto
tokenizer<CharT>::notify_eof(const span_type & input) -> result_type { tokenizer<CharT>::notify_eof(const span_type & input) -> result_type {

View file

@ -7,47 +7,95 @@
#include "tokentype.hpp" #include "tokentype.hpp"
#include "span.hpp" #include "span.hpp"
#include <iomanip>
namespace xo { namespace xo {
namespace scm { namespace scm {
/** represent a lexing error, with context **/ /** @class tokenizer_error
* @brief represent a lexing error, with context
*
* @tparam CharT representation for single characters
**/
template <typename CharT> template <typename CharT>
class tokenizer_error { class tokenizer_error {
public: public:
using span_type = span<const CharT>; using span_type = span<const CharT>;
public: public:
/** @brief default ctor represent a not-an-error error object **/ /** @defgroup tokenizer-error-ctors **/
///@{
/** Default ctor represent a not-an-error sentinel object **/
tokenizer_error() = default; tokenizer_error() = default;
tokenizer_error(char const * src_function, /** Constructor to capture parsing error context
char const* error_description, * @p tk_start current position on entry to scanner
span_type input_line, size_t error_pos) * @p whitespace number of chars initial whitespace
* @p error_pos error location relative to token start
**/
tokenizer_error(const char * src_function,
const char * error_description,
span_type input_line,
size_t tk_start,
size_t whitespace,
size_t error_pos)
: src_function_{src_function}, : src_function_{src_function},
error_description_{error_description}, error_description_{error_description},
input_line_{input_line}, input_line_{input_line},
tk_entry_{tk_start},
whitespace_{whitespace},
error_pos_{error_pos} {} error_pos_{error_pos} {}
///@}
char const* src_function() const { return src_function_; } /** @defgroup tokenizer-error-access-methods **/
char const* error_description() const { return error_description_; } ///@{
size_t error_pos() const { return error_pos_; }
const char * src_function() const { return src_function_; }
const char * error_description() const { return error_description_; }
const span_type& input_line() const { return input_line_; } const span_type& input_line() const { return input_line_; }
size_t tk_start() const { return tk_entry_; }
size_t whitespace() const { return whitespace_; }
size_t error_pos() const { return error_pos_; }
bool is_not_an_error() const { return error_description_ == nullptr; } ///@}
/** @defgroup tokenizer-error-general-methods **/
///@{
/** true, except for a sentinel error object **/
bool is_error() const { return error_description_ != nullptr; } bool is_error() const { return error_description_ != nullptr; }
/** true only for an object in sentinel (not-an-error) state **/
bool is_not_an_error() const { return error_description_ == nullptr; }
/** Print representation to stream @p os. Intended for tokenizer diagnostics.
* For Schematika errors prefer @ref report
**/
void print(std::ostream & os) const; void print(std::ostream & os) const;
/** Print human-oriented error report on @p os. **/
void report(std::ostream & os) const;
///@}
private: private:
/** @defgroup tokenizer-error-instance-vars **/
///@{
/** source location (in tokenizer) at which error identified **/ /** source location (in tokenizer) at which error identified **/
char const * src_function_ = nullptr; char const * src_function_ = nullptr;
/** static error description **/ /** static error description **/
char const * error_description_ = nullptr; char const * error_description_ = nullptr;
/** position (relative to line_.lo) of error **/ /** complete current input line (to the extent captured)
size_t error_pos_ = 0; * that contains error
/** complete input line (to the extent available)
* containing error
**/ **/
span_type input_line_ = span_type::make_null(); span_type input_line_ = span_type::make_null();
/** position (relative to start of @ref input_line_) of token start where error encountered **/
size_t tk_entry_ = 0;
/** number of characters of initial whitespace skipped before token start **/
size_t whitespace_ = 0;
/** position (relative to @ref tk_entry_) of error **/
size_t error_pos_ = 0;
///@}
}; /*error_token*/ }; /*error_token*/
template <typename CharT> template <typename CharT>
@ -56,11 +104,41 @@ namespace xo {
os << "<tokenizer-error" os << "<tokenizer-error"
<< xtag("src-function", src_function_) << xtag("src-function", src_function_)
<< xtag("message", error_description_) << xtag("message", error_description_)
<< xtag("error-pos", error_pos_)
<< xtag("input", input_line_) << xtag("input", input_line_)
<< xtag("whitespace", whitespace_)
<< xtag("tk-start", tk_entry_)
<< xtag("error-pos", error_pos_)
<< ">"; << ">";
} }
template <typename CharT>
void
tokenizer_error<CharT>::report(std::ostream & os) const {
    using namespace std;

    /* sentinel (not-an-error) object: nothing to report */
    if (!error_description_)
        return;

    /* Report layout:
     *
     *   char: <1-based column of offending character>
     *   input: <captured input line>
     *          ____^          (underscores span the token up to the error)
     *   <error description>
     */
    const char * label = "input: ";

    /* columns to skip so the marker row starts under the token's first char */
    const size_t marker_indent = strlen(label) + tk_entry_ + whitespace_;
    /* 1-based column of the offending character within the input line */
    const size_t column = 1 + tk_entry_ + whitespace_ + error_pos_;

    os << "char: " << column << endl;

    os << label;
    const char * cursor = input_line_.lo();
    const char * const line_end = input_line_.hi();
    while (cursor < line_end) {
        os << *cursor;
        ++cursor;
    }
    os << endl;

    /* setw pads the single blank out to marker_indent characters */
    os << std::setw(marker_indent) << " ";
    size_t remaining = error_pos_;
    while (remaining > 0) {
        os << '_';
        --remaining;
    }
    os << '^' << endl;

    os << error_description_ << endl;
}
template <typename CharT> template <typename CharT>
inline std::ostream & inline std::ostream &
operator<< (std::ostream & os, operator<< (std::ostream & os,

View file

@ -11,10 +11,11 @@
namespace xo { namespace xo {
namespace scm { namespace scm {
/** @enum tokentype /** @enum tokentype
* @brief enum to identify different schematica input token types * Enum to identify different schematika input token types
* *
* Schematica code examples: * Schematica code examples:
* *
* @code
* type point :: { xcoord : f64, ycoord : f64 }; * type point :: { xcoord : f64, ycoord : f64 };
* type matrix :: array<double, 2>; // 2-d array * type matrix :: array<double, 2>; // 2-d array
* *
@ -41,6 +42,7 @@ namespace xo {
* def matrixproduct(x : matrix, y : matrix) { * def matrixproduct(x : matrix, y : matrix) {
* [i, j : x.row(i) * y.col(j)]; * [i, j : x.row(i) * y.col(j)];
* }; * };
* @endcode
**/ **/
enum class tokentype { enum class tokentype {
/** sentinel value **/ /** sentinel value **/
@ -58,52 +60,52 @@ namespace xo {
/** a symbol **/ /** a symbol **/
tk_symbol, tk_symbol,
/** left-hand parenthesis '(' **/ /** left-hand parenthesis @c '(' **/
tk_leftparen, tk_leftparen,
/** right-hand parenthesis ')' **/ /** right-hand parenthesis @c ')' **/
tk_rightparen, tk_rightparen,
/** left-hand bracket '[' **/ /** left-hand bracket @c '[' **/
tk_leftbracket, tk_leftbracket,
/** right-hand bracket ']' **/ /** right-hand bracket @c ']' **/
tk_rightbracket, tk_rightbracket,
/** left-hand brace '{' **/ /** left-hand brace @c '{' **/
tk_leftbrace, tk_leftbrace,
/** right-hand brace '}' **/ /** right-hand brace @c '}' **/
tk_rightbrace, tk_rightbrace,
/** left-hand angle bracket '<' **/ /** left-hand angle bracket @c '<' **/
tk_leftangle, tk_leftangle,
/** right-hand angle bracket '>' **/ /** right-hand angle bracket @c '>' **/
tk_rightangle, tk_rightangle,
/** dot '.' **/ /** dot @c '.' **/
tk_dot, tk_dot,
/** comma ',' **/ /** comma @c ',' **/
tk_comma, tk_comma,
/** colon ':' **/ /** colon @c ':' **/
tk_colon, tk_colon,
/** double-colon '::' **/ /** double-colon @c '::' **/
tk_doublecolon, tk_doublecolon,
/** semi-colon ';' **/ /** semi-colon @c ';' **/
tk_semicolon, tk_semicolon,
/** '=' **/ /** single equals sign @c '=' **/
tk_singleassign, tk_singleassign,
/** ':=' **/ /** assignment @c ':=' **/
tk_assign, tk_assign,
/** '->' **/ /** indirection @c '->' **/
tk_yields, tk_yields,
/** note: operators not treated as punctuation /** note: operators not treated as punctuation
@ -111,47 +113,53 @@ namespace xo {
* as is 'maybe*2', 'maybe+1', 'path/to/foo' * as is 'maybe*2', 'maybe+1', 'path/to/foo'
**/ **/
/** operator '+' **/ /** operator @c '+' **/
tk_plus, tk_plus,
/** operator '-' **/ /** operator @c '-' **/
tk_minus, tk_minus,
/** operator '*' **/ /** operator @c '*' **/
tk_star, tk_star,
/** operator '/' **/ /** operator @c '/' **/
tk_slash, tk_slash,
/** keyword 'type' **/ /** keyword @c 'type' **/
tk_type, tk_type,
/** keyword 'def' **/ /** keyword @c 'def' **/
tk_def, tk_def,
/** keyword 'lambda' **/ /** keyword @c 'lambda' **/
tk_lambda, tk_lambda,
/** keyword 'if' **/ /** keyword @c 'if' **/
tk_if, tk_if,
/** keyword 'let' **/ /** keyword @c 'let' **/
tk_let, tk_let,
/** keyword 'in' **/ /** keyword @c 'in' **/
tk_in, tk_in,
/** keyword 'end' **/ /** keyword @c 'end' **/
tk_end, tk_end,
n_tokentype /* comes last, counts #of entries */ /** counts number of entries **/
n_tokentype
}; /*tokentype*/ }; /*tokentype*/
/** String representation for enum value.
* For example @c tokentype_descr(tokentype::tk_if) -> @c "if"
**/
extern char const * extern char const *
tokentype_descr(tokentype tk_type); tokentype_descr(tokentype tk_type);
/** Print enum value for @p tk_type on stream @p os **/
inline std::ostream & inline std::ostream &
operator<< (std::ostream & os, tokentype tk_type) { operator<< (std::ostream & os, tokentype tk_type) {
os << tokentype_descr(tk_type); os << tokentype_descr(tk_type);
return os; return os;
} }
} /*namespace scm*/ } /*namespace scm*/
} /*namespace xo*/ } /*namespace xo*/

View file

@ -19,15 +19,17 @@ namespace xo {
* On second pass, enable verbose logging * On second pass, enable verbose logging
**/ **/
struct rehearser { struct rehearser {
rehearser(std::uint32_t att = 0) : attention_{att} {}
/* expect at most one iterator to exist per TestRehearser instance **/ /* expect at most one iterator to exist per TestRehearser instance **/
struct iterator { struct iterator {
iterator(rehearser* parent, std::uint32_t attention) : parent_{parent}, attention_{attention} {} explicit iterator(rehearser* parent) : parent_{parent} {}
iterator& operator++(); iterator& operator++();
std::uint32_t operator*() { return attention_; } std::uint32_t operator*() { return parent_->attention_; }
bool operator==(const iterator& ix2) const { bool operator==(const iterator& ix2) const {
return (parent_ == ix2.parent_) && (attention_ == ix2.attention_); return (parent_ == ix2.parent_);
} }
rehearser* parent_ = nullptr; rehearser* parent_ = nullptr;
@ -35,11 +37,12 @@ namespace xo {
}; };
bool is_first_pass() const { return attention_ == 0; }
bool is_second_pass() const { return attention_ == 1; } bool is_second_pass() const { return attention_ == 1; }
bool enable_debug() const { return is_second_pass(); } bool enable_debug() const { return is_second_pass(); }
iterator begin() { return iterator(this, 0); } iterator begin() { return iterator(this); }
iterator end() { return iterator(this, 2); } iterator end() { return iterator(nullptr); }
public: public:
/** pass number: 0 or 1 **/ /** pass number: 0 or 1 **/
@ -50,23 +53,27 @@ namespace xo {
auto rehearser::iterator::operator++() -> iterator& auto rehearser::iterator::operator++() -> iterator&
{ {
++attention_; if (parent_)
++(parent_->attention_);
if (parent_->ok_flag_ && attention_ == 1) { if (parent_->ok_flag_ && (parent_->attention_ == 1)) {
/* skip 2nd pass */ /* skip 2nd pass */
++attention_; ++(parent_->attention_);
} }
if (parent_->attention_ == 2)
parent_ = nullptr;
return *this; return *this;
} }
/* use this instead of REQUIRE(expr) in context of a test_rehearser */ /* use this instead of REQUIRE(expr) in context of a test_rehearser */
# define REHEARSE(rehearser, expr) \ # define REHEARSE(rehearser, expr) \
if (rehearser.is_second_pass()) { \ if (rehearser.is_first_pass()) { \
REQUIRE((expr)); \ bool _f = (expr); \
} else { \ rehearser.ok_flag_ = rehearser.ok_flag_ && _f; \
REQUIRE(true); \ } else { \
rehearser.ok_flag_ &= (expr); \ REQUIRE(expr); \
} }
/* note: trivial REQUIRE() call in else branch bc we still want /* note: trivial REQUIRE() call in else branch bc we still want
@ -300,12 +307,14 @@ namespace xo {
token::semicolon(), token::semicolon(),
token::rightbrace() token::rightbrace()
}}, }},
#ifdef TODO
{"a.b", {"a.b",
false, false,
{token::symbol_token("a"), {token::symbol_token("a"),
token::dot(), token::dot(),
token::symbol_token("b") token::symbol_token("b")
}}, }},
#endif
{"a,b", {"a,b",
false, false,
{token::symbol_token("a"), {token::symbol_token("a"),
@ -431,6 +440,132 @@ namespace xo {
} }
} /*TEST_CASE(tokenizer2)*/ } /*TEST_CASE(tokenizer2)*/
namespace {
using tkz_error_type = xo::scm::tokenizer_error<char>;
using span_type = xo::scm::span<const char>;
/** One error-handling testcase: tokenizer input paired with the error
 *  the tokenizer is expected to report for it.
 **/
struct testcase_error {
/** text fed to the tokenizer **/
std::string input_;
/** expected error. The tokenizer3 test compares src_function, error_description,
 *  whitespace, tk_start and error_pos; it does not compare input_line
 **/
tkz_error_type expect_error_;
};
/** Build one error-handling testcase.
 *
 *  @p input        tokenizer input. Must outlive the returned testcase,
 *                  because the expected error keeps a span over this buffer
 *                  (string literals satisfy this)
 *  @p src_function expected tokenizer function reporting the error
 *  @p error_descr  expected error description
 *  @p tk_start     expected input position on entry to scanner
 *  @p whitespace   expected number of chars of initial whitespace
 *  @p error_pos    expected error location relative to token start
 **/
testcase_error
make_testcase(const char * input, const char * src_function, const char * error_descr,
              size_t tk_start, size_t whitespace, size_t error_pos)
{
    /* Take the span over the caller's buffer, not over retval.input_:
     * a std::string's storage relocates whenever the testcase is copied or moved
     * (return by value, copy out of the vector's initializer list),
     * so a span over retval.input_ would be left pointing at dead storage.
     */
    const char * input_end = input + std::char_traits<char>::length(input);

    testcase_error retval;
    retval.input_ = input;
    retval.expect_error_ = tkz_error_type(src_function, error_descr,
                                          span_type(input, input_end),
                                          tk_start, whitespace, error_pos);

    return retval;
}
/* Inputs that the tokenizer is expected to reject, with the expected error details.
 * make_testcase() arguments, in order:
 *   input, reporting function, error description, tk_start, whitespace, error_pos
 * The ruler comments above each case number the input characters;
 * 'v' marks the character at the expected error_pos.
 */
std::vector<testcase_error>
s_testcase3_v = {
// non-numeric character inside a numeric literal
// 012345678
// --------v
make_testcase("123.456ez",
"assemble_token",
"unexpected character in numeric constant",
0, 0, 8),
// sign after a digit, outside an exponent
// 01
// -v
make_testcase("1-3",
"assemble_token",
"improperly placed sign indicator",
0, 0, 1),
// second decimal point
// 012
// --v
make_testcase("1..2",
"assemble_token",
"duplicate decimal point in numeric literal",
0, 0, 2),
// second exponent marker
// 0123456
// ------v
make_testcase("1.23e4e",
"assemble_token",
"duplicate exponent marker in numeric literal",
0, 0, 6),
// tokenizer sees string ["\"]
// (the escaped quote does not terminate the literal)
// 0 1 2 3
// - - - v
make_testcase("\"\\\"",
"assemble_token",
"missing terminating '\"' to complete literal string",
//"expect \\ to escape one of n|t|r|\"|\\ in string literal",
0, 0, 3),
// tokenizer sees literal with embedded newline
// 1 2 3
// 01234567890123456789012345678901 2
// -------------------------------- v
make_testcase("\"everything was going fine until\n\"",
"scan",
"must use \\n or \\r to encode newline/cr in string literal",
0, 0, 32),
// tokenizer sees string ["\]
// (input ends right after the escape character)
// 0 1 2
// - - v
make_testcase("\"\\",
"assemble_token",
"expecting key following escape character \\",
0, 0, 2),
// tokenizer sees string ["\q"]
// (q is not a recognized escape key)
// 0 12
// - -v
make_testcase("\"\\q\"",
"assemble_token",
"expecting one of n|r|\"|\\ following escape \\",
0, 0, 2),
// single character expected to be rejected as illegal input, at position 0
make_testcase("#",
"assemble_token",
"illegal input character",
0, 0, 0),
};
/* Error-handling test: every input in s_testcase3_v must make the tokenizer
 * return an error result whose details match the expected error.
 */
TEST_CASE("tokenizer3", "[tokenizer]") {
/* testing error handling */
using tokenizer = xo::scm::tokenizer<char>;
/* NOTE(review): with this set to true, logging and tokenizer debug are enabled on
 * every pass, not just the rehearser's second pass -- confirm this is intended
 */
constexpr bool c_force_debug = true;
for (std::size_t i_tc = 0, n_tc = s_testcase3_v.size(); i_tc < n_tc; ++i_tc) {
const testcase_error & testcase = s_testcase3_v[i_tc];
/* fresh rehearser per testcase, starting at pass 0 */
rehearser rh(0);
/* loop variable _ is the current pass number */
for (auto _ : rh) {
scope log(XO_DEBUG2(c_force_debug || rh.enable_debug(), "tokenizer3"));
log && log(xtag("pass", _), xtag("ok(-)", rh.ok_flag_));
log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_));
/* new tokenizer on each pass, so no state carries over between passes */
tokenizer tkz(c_force_debug || rh.enable_debug());
auto in_span = tokenizer::span_type::from_string(testcase.input_);
/* whole input presented at once, flagged as end-of-input */
auto sr = tkz.scan2(in_span, true /*eof*/);
REHEARSE(rh, sr.is_error());
/* guards: avoid constructing std::string from a null pointer
 * when no error was reported
 */
if (sr.error().src_function()) {
REHEARSE(rh, std::string(sr.error().src_function()) == std::string(testcase.expect_error_.src_function()));
}
if (sr.error().error_description()) {
REHEARSE(rh, std::string(sr.error().error_description()) == std::string(testcase.expect_error_.error_description()));
}
/* position details; input_line is not compared */
REHEARSE(rh, sr.error().whitespace() == testcase.expect_error_.whitespace());
REHEARSE(rh, sr.error().tk_start() == testcase.expect_error_.tk_start());
REHEARSE(rh, sr.error().error_pos() == testcase.expect_error_.error_pos());
log && log(xtag("ok(+)", rh.ok_flag_));
}
}
}
}
} /*namespace ut*/ } /*namespace ut*/
} /*namespace xo*/ } /*namespace xo*/

View file

@ -30,7 +30,7 @@ Context
Introduction Introduction
------------ ------------
.. code-block::cpp .. code-block:: cpp
#include <xo/unit/scaled_unit.hpp> #include <xo/unit/scaled_unit.hpp>