From 8c36bbce285093ad784794f434385acbe6b382da Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Sat, 19 Jul 2025 16:47:59 -0500 Subject: [PATCH] reader reports tokenizer errors through normal return --- xo-reader/examples/exprrepl/exprrepl.cpp | 12 ++++- xo-reader/examples/exprreplxx/exprreplxx.cpp | 18 ++++++- xo-reader/include/xo/reader/envframestack.hpp | 2 + .../include/xo/reader/exprstatestack.hpp | 2 + xo-reader/include/xo/reader/parser.hpp | 10 +++- xo-reader/include/xo/reader/reader.hpp | 17 ++++-- xo-reader/include/xo/reader/reader_error.hpp | 53 +++++++++++++++++++ xo-reader/src/reader/exprstatestack.cpp | 5 ++ xo-reader/src/reader/parser.cpp | 7 +++ xo-reader/src/reader/reader.cpp | 43 ++++++++++----- xo-tokenizer/example/tokenrepl/tokenrepl.cpp | 2 +- xo-tokenizer/include/xo/tokenizer/span.hpp | 17 ++++-- .../include/xo/tokenizer/tokenizer.hpp | 12 +++-- .../include/xo/tokenizer/tokenizer_error.hpp | 4 +- 14 files changed, 174 insertions(+), 30 deletions(-) create mode 100644 xo-reader/include/xo/reader/reader_error.hpp diff --git a/xo-reader/examples/exprrepl/exprrepl.cpp b/xo-reader/examples/exprrepl/exprrepl.cpp index 42bf4baa..bf771d02 100644 --- a/xo-reader/examples/exprrepl/exprrepl.cpp +++ b/xo-reader/examples/exprrepl/exprrepl.cpp @@ -61,10 +61,15 @@ main() { input = span_type::from_string(input_str); while (!input.empty()) { - auto [expr, consumed, psz] = rdr.read_expr(input, eof); + auto [expr, consumed, psz, error] = rdr.read_expr(input, eof); if (expr) { cout << expr << endl; + } else if (error.is_error()) { + cout << "parsing error: " << endl; + error.report(cout); + + break; } input = input.after_prefix(consumed); @@ -72,9 +77,12 @@ main() { } } - auto [expr, _1, _2] = rdr.read_expr(input, true /*eof*/); + auto [expr, _1, _2, error] = rdr.read_expr(input, true /*eof*/); if (expr) { cout << expr << endl; + } else if (error.is_error()) { + cout << "parsing error: " << endl; + error.report(cout); } } diff --git 
a/xo-reader/examples/exprreplxx/exprreplxx.cpp b/xo-reader/examples/exprreplxx/exprreplxx.cpp index 4b3660e4..17abcbe6 100644 --- a/xo-reader/examples/exprreplxx/exprreplxx.cpp +++ b/xo-reader/examples/exprreplxx/exprreplxx.cpp @@ -93,27 +93,41 @@ main() { input = span_type::from_string(input_str); while (!input.empty()) { - auto [expr, consumed, psz] = rdr.read_expr(input, eof); + auto [expr, consumed, psz, error] = rdr.read_expr(input, eof); if (expr) { ppconfig ppc; ppstate_standalone pps(&cout, 0, &ppc); pps.prettyn(expr); + } else if (error.is_error()) { + cout << "parsing error (detected in " << error.src_function() << "): " << endl; + error.report(cout); + break; } input = input.after_prefix(consumed); parser_stack_size = psz; } + + /* here: input.empty() or error encountered */ + + /* discard stashed remainder of input line + * (for nicely-formatted errors) + */ + rdr.reset_to_idle_toplevel(); } - auto [expr, _1, _2] = rdr.read_expr(input, true /*eof*/); + auto [expr, _1, _2, error] = rdr.read_expr(input, true /*eof*/); if (expr) { ppconfig ppc; ppstate_standalone pps(&cout, 0, &ppc); pps.prettyn>(rp(expr)); + } else if (error.is_error()) { + cout << "parsing error (detected in " << error.src_function() << "): " << endl; + error.report(cout); } rx.history_save("repl_history.txt"); diff --git a/xo-reader/include/xo/reader/envframestack.hpp b/xo-reader/include/xo/reader/envframestack.hpp index 2ec256fe..1f00a3f3 100644 --- a/xo-reader/include/xo/reader/envframestack.hpp +++ b/xo-reader/include/xo/reader/envframestack.hpp @@ -40,6 +40,8 @@ namespace xo { void push_envframe(const rp & x); rp pop_envframe(); + void reset_to_toplevel() { stack_.resize(1); } + /** relative to top-of-stack. 
* 0 -> top (last in), z-1 -> bottom (first in) **/ diff --git a/xo-reader/include/xo/reader/exprstatestack.hpp b/xo-reader/include/xo/reader/exprstatestack.hpp index e8f51d45..1186c552 100644 --- a/xo-reader/include/xo/reader/exprstatestack.hpp +++ b/xo-reader/include/xo/reader/exprstatestack.hpp @@ -29,6 +29,8 @@ namespace xo { void push_exprstate(std::unique_ptr exs); std::unique_ptr pop_exprstate(); + void reset_to_toplevel(); + /** relative to top-of-stack. * 0 -> top (last in), z-1 -> bottom (first in) **/ diff --git a/xo-reader/include/xo/reader/parser.hpp b/xo-reader/include/xo/reader/parser.hpp index 4e89e886..496a8623 100644 --- a/xo-reader/include/xo/reader/parser.hpp +++ b/xo-reader/include/xo/reader/parser.hpp @@ -104,7 +104,8 @@ namespace xo { * $varname(n) : $typename(n)) [-> $typename[ret]] * body-expr * [ end $functionname ] - * literal-expr = integer-literal + * literal-expr = boolean-literal + * | integer-literal * | fp-literal * | string-literal * | symbol-literal @@ -211,6 +212,13 @@ namespace xo { **/ rp include_token(const token_type & tk); + /** reset to starting parsing state. + * use this after encountering an error, to avoid cascade of + * spurious secondary errors.. particularly important when + * invoked as part of a REPL. 
+ **/ + void reset_to_idle_toplevel(); + /** print human-readable representation on stream @p os **/ void print(std::ostream & os) const; diff --git a/xo-reader/include/xo/reader/reader.hpp b/xo-reader/include/xo/reader/reader.hpp index 841c3a4d..ffbdee56 100644 --- a/xo-reader/include/xo/reader/reader.hpp +++ b/xo-reader/include/xo/reader/reader.hpp @@ -6,6 +6,7 @@ #pragma once #include "parser.hpp" +#include "reader_error.hpp" #include "xo/expression/Expression.hpp" #include "xo/expression/pretty_expression.hpp" #include "xo/tokenizer/tokenizer.hpp" @@ -19,8 +20,8 @@ namespace xo { using Expression = xo::ast::Expression; using span_type = span; - reader_result(rp expr, span_type rem, std::size_t psz) - : expr_{std::move(expr)}, rem_{rem}, parser_stack_size_{psz} {} + reader_result(rp expr, span_type rem, std::size_t psz, const reader_error & error) + : expr_{std::move(expr)}, rem_{rem}, parser_stack_size_{psz}, error_{error} {} /** true if reader parsed a complete expression **/ bool expr_complete() const { return expr_.get(); } @@ -37,6 +38,9 @@ namespace xo { * will be zero whenever @ref expr_ is non-null **/ std::size_t parser_stack_size_ = 0; + + /** error description, whenever .error_.is_error() is true **/ + reader_error error_; }; /** @@ -53,7 +57,7 @@ namespace xo { * * for (auto rem = input; !rem.empty();) { * // res: (parsed-expr, used) - * auto res = rdr.read_expr(rem, eof); + * auto res = rdr.read_expr(rem, eof); * * if (res.first) { * // do something with res.first (parsed expr) @@ -112,6 +116,13 @@ namespace xo { **/ reader_result read_expr(const span_type & input, bool eof); + /** reset to known starting point after encountering an error. + * - remainder of stashed current line. + * Necessary for well-formatted error reporting. 
+ * - current parsing state + **/ + void reset_to_idle_toplevel(); + private: /** tokenizer: text -> tokens **/ tokenizer_type tokenizer_; diff --git a/xo-reader/include/xo/reader/reader_error.hpp b/xo-reader/include/xo/reader/reader_error.hpp new file mode 100644 index 00000000..d23502d6 --- /dev/null +++ b/xo-reader/include/xo/reader/reader_error.hpp @@ -0,0 +1,53 @@ +/* reader_error.hpp + * + * Author: Roland Conybeare, Jul 2025 + */ + +#include "xo/tokenizer/tokenizer_error.hpp" + +namespace xo { + namespace scm { + class reader_error { + public: + using input_state_type = typename tokenizer_error::input_state_type; + + public: + /** default ctor represents a not-an-error sentinel object **/ + reader_error() = default; + /** construct to capture parsing error context + * @ + **/ + reader_error(const char * src_function, + const char * error_description, + const input_state_type & input_state, + size_t error_pos) + : tk_error_{src_function, error_description, input_state, error_pos} + {} + + const tokenizer_error & tk_error() const { return tk_error_; } + + /** true, except for sentinel not-an-error object **/ + bool is_error() const { return tk_error_.is_error(); } + /** false, except for object in sentinel state **/ + bool is_not_an_error() const { return tk_error_.is_not_an_error(); } + + const char * src_function() const { return tk_error_.src_function(); } + + /** print error representation to stream @p os. Intended for parser/tokenizer + * diagnostics. For Schematika errors prefer @ref report + **/ + void print(std::ostream & os) const { tk_error_.print(os); } + + /** print human-oriented error report on @p os. 
**/ + void report(std::ostream & os) const { tk_error_.report(os); } + + private: + /** for parser-level errors, will still use this for + * {src function, error description, input state and error pos} + **/ + tokenizer_error tk_error_; + }; + } +} + +/* end reader_error.hpp */ diff --git a/xo-reader/src/reader/exprstatestack.cpp b/xo-reader/src/reader/exprstatestack.cpp index bb38305a..354586d7 100644 --- a/xo-reader/src/reader/exprstatestack.cpp +++ b/xo-reader/src/reader/exprstatestack.cpp @@ -20,6 +20,11 @@ namespace xo { return *(stack_[z-1]); } + void + exprstatestack::reset_to_toplevel() { + this->stack_.resize(1); + } + void exprstatestack::push_exprstate(std::unique_ptr exs) { constexpr bool c_debug_flag = true; diff --git a/xo-reader/src/reader/parser.cpp b/xo-reader/src/reader/parser.cpp index dfa34658..2d8e3818 100644 --- a/xo-reader/src/reader/parser.cpp +++ b/xo-reader/src/reader/parser.cpp @@ -92,6 +92,13 @@ namespace xo { return retval; } /*include_token*/ + void + parser::reset_to_idle_toplevel() + { + xs_stack_.reset_to_toplevel(); + env_stack_.reset_to_toplevel(); + } /*reset_to_idle_toplevel*/ + void parser::print(std::ostream & os) const { os << "tokenizer_.scan2(input, eof); - const auto & tk = sr.get_token(); - const span_type & used_span = sr.consumed(); + auto [tk, used_span, error] = this->tokenizer_.scan2(input, eof); log && log(xtag("consumed", used_span)); log && log(xtag("input.pre", input)); - input = input.after_prefix(used_span); - - log && log(xtag("expr_span.pre", expr_span)); - + input = this->tokenizer_.consume(used_span, input); expr_span += used_span; if (tk.is_valid()) { @@ -63,18 +60,33 @@ namespace xo { xtag("expr", expr)); /* token completes an expression -> victory */ - return reader_result(expr, expr_span, parser_.stack_size()); + return reader_result(expr, expr_span, parser_.stack_size(), reader_error()); } else { /* token did not complete an expression * (e.g. 
token for '[') * - * input span may contain more tokens -> iterate + * input span may contain more tokens -> iterate */ } } else { - assert(input.empty()); + if (error.is_error()) { + /* tokenizer detected an error */ - /* no more tokens in input */ + std::cout << "tokenizer error pre-report:" << std::endl; + error.report(std::cout); + + return reader_result(nullptr, expr_span, parser_.stack_size(), + reader_error(error.src_function(), + error.error_description(), + error.input_state(), + error.error_pos())); + } else { + /* control should not come here */ + + assert(input.empty()); + } + + /* no more tokens in input */ break; } } @@ -99,7 +111,14 @@ namespace xo { log && log(xtag("outcome", "noop")); - return reader_result(nullptr, expr_span, parser_.stack_size()); + return reader_result(nullptr, expr_span, parser_.stack_size(), reader_error()); + } + + void + reader::reset_to_idle_toplevel() + { + this->tokenizer_.discard_current_line(); + this->parser_.reset_to_idle_toplevel(); } } /*namespace scm*/ diff --git a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp index 9afe0059..d6eacfea 100644 --- a/xo-tokenizer/example/tokenrepl/tokenrepl.cpp +++ b/xo-tokenizer/example/tokenrepl/tokenrepl.cpp @@ -41,7 +41,7 @@ main() { if (tk.is_valid()) { cout << tk << endl; } else if (error.is_error()) { - cout << "parsing error: " << endl; + cout << "tokenizer error: " << endl; error.report(cout); break; diff --git a/xo-tokenizer/include/xo/tokenizer/span.hpp b/xo-tokenizer/include/xo/tokenizer/span.hpp index bb1eeccb..8cf7a4a7 100644 --- a/xo-tokenizer/include/xo/tokenizer/span.hpp +++ b/xo-tokenizer/include/xo/tokenizer/span.hpp @@ -104,6 +104,18 @@ namespace xo { /** @defgroup span-general-methods **/ ///@{ + /** @brief strip prefix until first occurrence of '\n', including the newline **/ + void discard_until_newline() { + for (const CharT * p = lo_; p < hi_; ++p) { + if (*p == '\n') { + lo_ = p + 1; + return; + } + } + + lo_ = 
hi_; + } + /** Create new span over supplied type, * with identical (possibly misaligned) endpoints. * @@ -142,8 +154,7 @@ namespace xo { /** @brief create span with @p prefix of this span removed **/ span after_prefix(const span & prefix) const { - assert(prefix.lo() == lo_); - if (prefix.lo() != lo_) { + if (!prefix.is_null() && (prefix.lo() != lo_)) { throw std::runtime_error ("after_prefix: expected prefix of this span"); } @@ -174,7 +185,7 @@ namespace xo { span & operator+=(const span & x) { if (hi_ == x.lo_) { hi_ = x.hi_; - } else { + } else if (!x.is_null()) { assert(false); } diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp index 75e4d4d1..74cc43f6 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -31,12 +31,16 @@ namespace xo { * span_type input = ...; * * while (!input.empty()) { - * auto res = tkz.scan(input); - * auto [tk, consumed, error] = res.first; + * auto [tk, consumed, error] = tkz.scan(input); * - * // do something with tk if tk.is_valid() + * if (tk.is_valid()) { + * // do something with tk + * } else if (error.is_error()) { + * error.report(cout); + * break; + * } * - * input = tkz.consume(res.second, input); + * input = tkz.consume(consumed, input); * } * * if endofinput { diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp index 9488b62b..743e3faf 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp @@ -27,7 +27,7 @@ namespace xo { /** @defgroup tokenizer-error-ctors **/ ///@{ - /** Default ctor represent a not-an-error sentinel object **/ + /** Default ctor represents a not-an-error sentinel object **/ tokenizer_error() = default; /** Constructor to capture parsing error context * @p tk_start current position on entry to scanner @@ -69,7 +69,7 @@ namespace xo { /** 
true, except for a sentinel error object **/ bool is_error() const { return error_description_ != nullptr; } - /** true except for object in sentinel state **/ + /** false except for object in sentinel state **/ bool is_not_an_error() const { return error_description_ == nullptr; } /** Print representation to stream @p os. Intended for tokenizer diagnostics.