+ xo-tokenizer2 xo-reader2 xo-expression2 xo-interpreter2

2nd gen schematika interpreter using fomo
This commit is contained in:
Roland Conybeare 2026-01-10 12:39:09 -05:00
commit f7bd3b0db3
41 changed files with 3566 additions and 9 deletions

View file

@ -0,0 +1,230 @@
/* @file TkInputState.hpp
*
* author: Roland Conybeare, Jun 2025
*/
#pragma once
#include "span.hpp"
namespace xo {
namespace scm {
/** Outcome reported by @ref capture_current_line **/
enum class input_error {
    /** success: an input line was identified and captured **/
    ok = 0,

    /** input was incomplete and should not have been handed
     *  to @ref capture_current_line.
     *  note: the last line of input is submitted with eof_flag=true
     **/
    incomplete,

    /** trailing enumerator, not an outcome in its own right;
     *  its value equals the number of enumerators listed before it
     **/
    N
};
/** @class input_state
* @brief Track detailed input position for use in error messages
*
* input characters fall into two categories:
* - consumed: memory can be reclaimed/recycled
* - buffered: memory will be retained unaltered until consumed
*
* remarks:
* - always in one of two states:
* - empty
* - contains exactly one line of input
* - also record current input position.
* Use this for example to identify where tokenizer rejected input.
* - .current_pos advances by one token
*
* - buffered characters always form a single contiguous range.
* - input_state does not own any storage; storage is owned elsewhere
*
* @text
*
* <------------------.current_line------------------>
* > <-- .whitespace
* cccccccccccccccccccccccccccccccc__TTTTTTTTxxxxxxxxx
* ^ ^ ^
* .current_line.lo | .current_line.hi
* .current_pos
*
* <----prev_line----> <----current_line---->
* > <--whitespace
* ppppppppppppppppppp cccccccccccc__TTTTTTTT
* ^
*
* @endtext
**/
class TkInputState {
public:
    /** @defgroup input-state-type-traits input-state type traits **/
    ///@{

    /** character type for tokenizer input **/
    using CharT = char;
    /** read-only contiguous run of tokenizer input characters **/
    using span_type = span<const CharT>;

    ///@}

public:
    /** @defgroup input-state-ctors input_state constructors **/
    ///@{

    /** Empty state: null current line, all positions zero, logging off **/
    TkInputState() = default;

    /** Empty state, with input-activity logging controlled by @p debug_flag **/
    explicit TkInputState(bool debug_flag)
        : debug_flag_{debug_flag}
    {}

    /** Build a state directly from @p current_line, @p current_pos and @p whitespace.
     *  Exists for unit tests; the tokenizer itself does not use it.
     *  The token-start position and debug flag keep their defaults (0, false).
     **/
    explicit TkInputState(const span_type & current_line,
                          size_t current_pos,
                          size_t whitespace)
        : current_line_{current_line},
          current_pos_{current_pos},
          whitespace_{whitespace}
    {}

    ///@}

    /** @defgroup input-state-static-methods input_state static methods **/
    ///@{

    /** true when @p ch is the newline character '\n' **/
    static bool is_newline(CharT ch);

    /** true when @p ch is a whitespace char.
     *  Whitespace chars belong to no token and may not appear inside
     *  a symbol or string token; encountering one forces completion
     *  of the token that precedes it.
     **/
    static bool is_whitespace(CharT ch);

    ///@}

    /** @defgroup input-state-access-methods **/
    ///@{

#pragma GCC diagnostic push
#ifndef __APPLE__
#pragma GCC diagnostic ignored "-Wchanges-meaning"
#endif
    const span_type & current_line() const { return current_line_; }
#pragma GCC diagnostic pop

    size_t tk_start() const { return tk_start_; }
    size_t current_pos() const { return current_pos_; }
    size_t whitespace() const { return whitespace_; }
    bool debug_flag() const { return debug_flag_; }

    ///@}

    /** @defgroup input-state-general-methods **/
    ///@{

    /** Copy of this input state with @p n fewer chars consumed.
     *  Lets a caller recover the state that held before a complete,
     *  but error-triggering, token.
     **/
    TkInputState rewind(std::size_t n) const;

    /** Capture the prefix of @p input that ends at the first newline,
     *  and put the read position at the start of that line.
     *
     *  Alters:
     *    .current_line
     *    .current_pos
     *
     *  @return pair of {error code, span covering the first line of @p input,
     *          trailing newline included}
     **/
    std::pair<input_error, span_type> capture_current_line(const span_type & input,
                                                           bool eof_flag);

    /** Hand back the current line and drop it from this input state
     *  in one step.
     *
     *  Alters:
     *    .current_line
     *    .current_pos
     *    .whitespace
     **/
    span_type consume_current_line();

    /** Prepare input state for the start of the next line.
     *  An expression parser may call this to throw away the rest of
     *  an input line after a parsing error.
     *
     *  Alters:
     *    .current_line
     *    .current_pos
     *    .whitespace
     **/
    void discard_current_line();

    /** Move the input position forward by @p z
     *
     *  Alters:
     *    .current_pos
     **/
    void advance(size_t z);

    /** Move .current_pos forward to @p pos.
     *  Require: @p pos lies in @ref current_line_
     **/
    void advance_until(const CharT * pos);

    /** Skip the run of whitespace that begins at the current read position.
     *
     *  Presumes the input position sits at the end of a token;
     *  on return @ref whitespace_ holds the number of whitespace
     *  characters skipped.
     *
     *  @return pointer to the first non-whitespace character after
     *          @ref current_pos_, or @ref current_line_.hi when the end
     *          of the buffered line was reached.
     *
     *  Alters:
     *    .whitespace
     **/
    const CharT * skip_leading_whitespace();

    ///@}

private:
    /** @defgroup input-state-instance-vars input_state instance variables **/
    ///@{

    /** current input line, kept only so errors can be reported **/
    span_type current_line_{};
    /** start of last token within @ref current_line_ **/
    size_t tk_start_ = 0;
    /** input position within @ref current_line_ **/
    size_t current_pos_ = 0;
    /** number of whitespace chars since end of preceding token,
     *  or last newline, whichever is less
     **/
    size_t whitespace_ = 0;
    /** true to log input activity **/
    bool debug_flag_ = false;

    ///@}
}; /*TkInputState*/
/** Write a one-line diagnostic form of @p x on @p os:
 *  token start, current position, text of the current line, whitespace count.
 **/
inline std::ostream &
operator<<(std::ostream & os,
           const TkInputState & x)
{
    using xo::print::unq;

    const auto & line = x.current_line();
    /* pointer + length form; covers the same characters as [lo, hi) */
    const std::size_t line_z = static_cast<std::size_t>(line.hi() - line.lo());

    os << "<input_state";
    os << xtag("tk", x.tk_start());
    os << xtag("pos", x.current_pos());
    os << xtag("line", unq(std::string_view(line.lo(), line_z)));
    os << xtag("whitespace", x.whitespace());
    os << ">";

    return os;
}
} /*namespace scm*/
} /*namespace xo*/
/* end TkInputState.hpp */

View file

@ -0,0 +1,226 @@
/* file Token.hpp
*
* author: Roland Conybeare, Jul 2024
*/
#pragma once
#include "tokentype.hpp"
#include "xo/indentlog/print/tag.hpp"
#include <stdexcept>
#include <ostream>
#include <string>
#include <cstdint>
namespace xo {
namespace scm {
namespace detail {
    /** Compute @p a * @p b ^ @p p by repeated squaring. Intended for @p p >= 0;
     *  when @p p <= 0 the loop never runs and @p a is returned unchanged.
     **/
    constexpr double
    pow_aux(double a, double b, int p) {
        for (int remaining = p; remaining > 0; ) {
            const bool odd = (remaining % 2 == 1);

            if (odd) {
                /* a * b^(2q+1) = (a*b) * b^(2q) */
                a *= b;
                --remaining;
            } else {
                /* a * b^(2q) = a * (b^2)^q */
                b *= b;
                remaining /= 2;
            }
        }

        /* exponent exhausted: a * b^0 = a */
        return a;
    }

    /** Compute 10 ^ @p p; a negative @p p yields the reciprocal of 10 ^ (-p) **/
    constexpr double
    pow10(int p) {
        return (p >= 0)
            ? pow_aux(1.0, 10.0, p)
            : 1.0 / pow_aux(1.0, 10.0, -p);
    }
}
/** @class token
* @brief Represent a Schematika lexical token
**/
class Token {
public:
/** @defgroup token-ctors token constructors **/
///@{
/** default ctor creates token with type @c tk_invalid **/
Token() = default;
/** create token with type @c tk_type and input text @c text **/
Token(tokentype tk_type, const std::string & text = "")
: tk_type_{tk_type}, text_{text} {}
/** create invalid token (same as null ctor, but explicit) **/
static Token invalid() { return Token(); }
/** Create token representing a boolean literal from text @p txt
* @p txt must be @c true or @c false
**/
static Token bool_token(const std::string & txt) {
return Token(tokentype::tk_bool, txt);
}
/** Create token representing 64-bit signed integer literal parsed from decimal @p txt.
* The string @p txt must be a decimal integer literal, since @ref i64_value re-parses @p txt.
**/
static Token i64_token(const std::string & txt) {
return Token(tokentype::tk_i64, txt);
}
/** create token representing 64-bit floating-point literal parsed from decimal @p txt
* The string @p txt must be a decimal floating-point literal, since @ref f64_value re-parses @p txt.
**/
static Token f64_token(const std::string & txt) {
return Token(tokentype::tk_f64, txt);
}
/** create token representing literal string parsed from @p txt **/
static Token string_token(const std::string & txt) {
return Token(tokentype::tk_string, txt);
}
/** create token representing a symbol parsed from @p txt.
* Note that not all strings are valid symbol names.
**/
static Token symbol_token(const std::string & txt) {
return Token(tokentype::tk_symbol, txt);
}
/** token representing left angle bracket @c "<" **/
static Token leftangle() { return Token(tokentype::tk_leftangle); }
/** token representing right angle bracket @c ">" **/
static Token rightangle() { return Token(tokentype::tk_rightangle); }
/** token representing left parenthesis @c "(" **/
static Token leftparen() { return Token(tokentype::tk_leftparen); }
/** Token representing right parenthesis @c ")" **/
static Token rightparen() { return Token(tokentype::tk_rightparen); }
/** token representing left bracket @c "[" **/
static Token leftbracket() { return Token(tokentype::tk_leftbracket); }
/** token representing right bracket @c "]" **/
static Token rightbracket() { return Token(tokentype::tk_rightbracket); }
/** token representing left brace @c "{" **/
static Token leftbrace() { return Token(tokentype::tk_leftbrace); }
/** token representing right brace @c "}' **/
static Token rightbrace() { return Token(tokentype::tk_rightbrace); }
/** token representing period @c "." **/
static Token dot() { return Token(tokentype::tk_dot); }
/** token representing comma @c "," **/
static Token comma() { return Token(tokentype::tk_comma); }
/** token representing colon @c ":" **/
static Token colon() { return Token(tokentype::tk_colon); }
/** token representing double-colo @c "::" **/
static Token doublecolon() { return Token(tokentype::tk_doublecolon); }
/** token representing semicolon @c ";" **/
static Token semicolon() { return Token(tokentype::tk_semicolon); }
/** token representing single-assignment @c "=" **/
static Token singleassign() { return Token(tokentype::tk_singleassign); }
/** token representing unrestricted assignment @c ":=" **/
static Token assign_token() { return Token(tokentype::tk_assign); }
/** token representing indirection @c "->" **/
static Token yields() { return Token(tokentype::tk_yields); }
/** token for @c "+" **/
static Token plus_token() { return Token(tokentype::tk_plus); }
/** token for @c "-" **/
static Token minus_token() { return Token(tokentype::tk_minus); }
/** token for @c "*" **/
static Token star_token() { return Token(tokentype::tk_star); }
/** token for @c "/" **/
static Token slash_token() { return Token(tokentype::tk_slash); }
/** token representing keyword @c type **/
static Token type() { return Token(tokentype::tk_type); }
/** token representing keyword @c def **/
static Token def() { return Token(tokentype::tk_def); }
/** token representing keyword @c lambda **/
static Token lambda() { return Token(tokentype::tk_lambda); }
/** token representing keyword @c if **/
static Token if_token() { return Token(tokentype::tk_if); }
/** token representing keyword @c else **/
static Token else_token() { return Token(tokentype::tk_else); }
/** token representing keyword @c let **/
static Token let() { return Token(tokentype::tk_let); }
/** token representing keyword @c in **/
static Token in() { return Token(tokentype::tk_in); }
/** token representing keyword @c end **/
static Token end() { return Token(tokentype::tk_end); }
///@}
/** @defgroup token-access-methods **/
///@{
tokentype tk_type() const { return tk_type_; }
const std::string & text() const { return text_; }
///@}
/** @defgroup token-general-methods **/
///@{
/** true if token understood to represent valid input
* i.e. any token type except @c tk_invalid
**/
bool is_valid() const { return tk_type_ != tokentype::tk_invalid; }
/** true for sentinel token with type tk_invalid **/
bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; }
/** true for tokens with variable text. false for those with fixed textual representation **/
bool has_variable_text() const { return (tk_type_ == tokentype::tk_i64
|| tk_type_ == tokentype::tk_f64
|| tk_type_ == tokentype::tk_string
|| tk_type_ == tokentype::tk_symbol); }
/** expect input matching @c true or @c false **/
bool bool_value() const;
/** expect input matching @c [+|-][0-9][0-9]* **/
std::int64_t i64_value() const;
/** expect input matching @c [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]* **/
double f64_value() const;
/** print human-readable token representation on stream @p os **/
void print(std::ostream & os) const;
///@}
private:
/** @defgroup token-instance-vars **/
///@{
/** category for this token **/
tokentype tk_type_ = tokentype::tk_invalid;
/** characters comprising this token.
* only provided for certain token types:
*
* tk_i64
* tk_f64
* tk_string
* tk_symbol
**/
std::string text_;
///@}
};
/** Stream insertion for Token: delegates to Token::print, then yields @p os **/
inline std::ostream &
operator<< (std::ostream & os, const Token & tk)
{
    tk.print(os);

    return os;
}
} /*namespace scm*/
/* NOTE(review): this guard tests the lowercase name `ppdetail_atomic`, whereas the
 * macro invoked inside it is the uppercase PPDETAIL_ATOMIC -- confirm which
 * name the guard is meant to test.
 * NOTE(review): the macro argument names xo::scm::token<char>, but the class
 * declared in this header is the non-template xo::scm::Token -- confirm the
 * argument is not left over from an earlier templated version.
 */
#ifndef ppdetail_atomic
namespace print {
PPDETAIL_ATOMIC(xo::scm::token<char>);
}
#endif
} /*namespace xo*/
/* end Token.hpp */

View file

@ -0,0 +1,167 @@
/* file Tokenizer.hpp
*
* author: Roland Conybeare, Jul 2024
*/
#pragma once
#include "Token.hpp"
#include "TkInputState.hpp"
#include "span.hpp"
#include "scan_result.hpp"
#include "xo/indentlog/scope.hpp"
#include "xo/indentlog/print/ppdetail_atomic.hpp"
#include <cassert>
namespace xo {
namespace scm {
/** @class Tokenizer
* @brief Parse a Schematika character stream into lexical tokens
*
* Use:
*
* @code
* // see xo-tokenizer2/example/tokenrepl/tokenrepl.cpp
* // for exact working code
*
* using tokenizer_type = Tokenizer;
* using span_type = tokenizer_type::span_type;
*
* tokenizer_type tkz;
* span_type input = ...;
*
* while (!input.empty()) {
*     // second argument is eof_flag
*     auto [tk, consumed, error] = tkz.scan(input, false);
*
*     if (tk.is_valid()) {
*         // do something with tk
*     } else if (error.is_error()) {
*         error.report(cout);
*         break;
*     }
*
*     input = input.after_prefix(consumed);
* }
*
* if endofinput {
*     // NOTE(review): Tokenizer declares no notify_eof(); presumably the final
*     // call is scan() with eof_flag=true -- confirm against tokenrepl.cpp
*     auto [tk, consumed, error] = tkz.scan(input, true);
*
* // do something with (final) tk if tk.is_valid()
* }
*
* @endcode
*
* See tokentype.hpp for token types
**/
class Tokenizer {
public:
    using CharT = char;
    using token_type = Token;
    using error_type = TokenizerError;
    using span_type = span<const CharT>;
    using input_state_type = TkInputState;
    using result_type = scan_result;

public:
    /** @defgroup tokenizer-ctors tokenizer constructors **/
    ///@{

    /** Create tokenizer; @p debug_flag defaults to false **/
    Tokenizer(bool debug_flag = false);

    ///@}

    /** @defgroup tokenizer-access-methods tokenizer access methods **/
    ///@{

#pragma GCC diagnostic push
#ifndef __APPLE__
#pragma GCC diagnostic ignored "-Wchanges-meaning"
#endif
    const TkInputState & input_state() const { return input_state_; }
#pragma GCC diagnostic pop

    ///@}

    /** @defgroup tokenizer-general-methods tokenizer methods **/
    ///@{

    /** true for punctuation chars.
     *  Such chars may not appear inside a symbol token: they force
     *  completion of any preceding token and begin a new token themselves.
     **/
    static bool is_1char_punctuation(CharT ch);

    /** looser variant of @ref is_1char_punctuation.
     *  Chars that may not appear inside a symbol token, but that may
     *  combine with the following character to form a token.
     **/
    static bool is_2char_punctuation(CharT ch);

    /** Build a token from @p token_text.
     *
     *  @p initial_whitespace  amount of whitespace being consumed from input.
     *  @p token_text          part of the input line that represents a single token.
     *  @p p_input_state       input state holding the input line; on an error
     *                         its current line is cleared on exit.
     *
     *  retval.consumed represents some possibly-empty prefix of the input.
     **/
    static scan_result assemble_token(std::size_t initial_whitespace,
                                      const span_type & token_text,
                                      TkInputState * p_input_state);

    /** degenerate form of assemble_token(), for use on reaching end-of-file **/
    static scan_result assemble_final_token(const span_type & token_text,
                                            TkInputState * p_input_state);

    /** true when the tokenizer is holding the stored prefix of a
     *  possibly-incomplete token
     **/
    bool has_prefix() const { return prefix_.size() > 0; }

    /** Scan @p input for the next token.
     *  Note:
     *  - the tokenizer may consume input (e.g. whitespace)
     *    without completing a token
     *  - the tokenizer remembers the extent of the last input line for
     *    which parsing has begun but not finished; at least that part of
     *    the input span must stay valid across successive scan calls
     *
     *  @return scan_result carrying the parsed token, the consumed span
     *          and any error
     **/
    scan_result scan(const span_type & input,
                     bool eof_flag);

    /** Discard current line after an error. Only tidies error-reporting state **/
    void discard_current_line();

    ///@}

private:
    /** @defgroup tokenizer-instance-vars tokenizer instance variables **/
    ///@{

    /** input state (line#, pos, ..) kept for error messages.
     *  There's an ordering problem here:
     *  1. input_state_.skip_leading_whitespace() advances
     *     current line automagically when it sees \n
     *  2. need to capture value of @ref input_state_ _before_ newline
     *  3. but need newline to end token
     *  Also recall input_state_type is needed for reporting errors.
     **/
    input_state_type input_state_;

    /** partial token accumulates here.
     *  This happens when input passed to @ref Tokenizer::scan ends
     *  without whitespace, so the extent of the last available token
     *  cannot yet be determined
     **/
    std::string prefix_;

    ///@}
}; /*Tokenizer*/
} /*namespace scm*/
} /*namespace xo*/
/* end Tokenizer.hpp */

View file

@ -0,0 +1,114 @@
/* file TokenizerError.hpp
*
* author: Roland Conybeare, Jun 2025
*/
#pragma once
#include "TkInputState.hpp"
#include "tokentype.hpp"
#include "span.hpp"
#include <iomanip>
namespace xo {
namespace scm {
/** @class TokenizerError
* @brief represent a lexing error, with context
*
* Not a template: the character type @c CharT is fixed to @c char.
**/
class TokenizerError {
public:
    using CharT = char;
    using span_type = span<const CharT>;

public:
    /** @defgroup tokenizer-error-ctors **/
    ///@{

    /** Default-constructed object is the not-an-error sentinel **/
    TokenizerError() = default;

    /** Capture the context of a lexing error.
     *
     *  @p src_function       name of the tokenizer function that found the error
     *  @p error_description  error text; an empty description leaves the object
     *                        indistinguishable from the sentinel
     *  @p input_state        input state at the time of the error (copied)
     *  @p error_pos          error location relative to token start
     **/
    TokenizerError(const char * src_function,
                   std::string error_description,
                   const TkInputState & input_state,
                   size_t error_pos)
        : src_function_{src_function},
          error_description_{std::move(error_description)},
          input_state_{input_state},
          error_pos_{error_pos}
    {
        /* logging is enabled by the debug flag carried in the supplied input state */
        scope log(XO_DEBUG(input_state.debug_flag()));

        log && log(xtag("input_state.current_pos", input_state.current_pos()),
                   xtag("error_pos", error_pos));
    }

    ///@}

    /** @defgroup tokenizer-error-access-methods **/
    ///@{

    const char * src_function() const { return src_function_; }
    const std::string & error_description() const { return error_description_; }

#pragma GCC diagnostic push
#ifndef __APPLE__
#pragma GCC diagnostic ignored "-Wchanges-meaning"
#endif
    const TkInputState & input_state() const { return input_state_; }
#pragma GCC diagnostic pop

    /** token start, reported as the captured input state's current position **/
    size_t tk_start() const { return input_state_.current_pos(); }
    size_t whitespace() const { return input_state_.whitespace(); }
    size_t error_pos() const { return error_pos_; }

    ///@}

    /** @defgroup tokenizer-error-general-methods **/
    ///@{

    /** true, except for a sentinel error object (one with empty description) **/
    bool is_error() const { return !error_description_.empty(); }

    /** false, except for an object in the sentinel state **/
    bool is_not_an_error() const { return !is_error(); }

    /** Print representation on stream @p os. Intended for tokenizer diagnostics.
     *  For Schematika errors prefer @ref report
     **/
    void print(std::ostream & os) const;

    /** Print human-oriented error report on @p os. **/
    void report(std::ostream & os) const;

    ///@}

private:
    /** @defgroup tokenizer-error-vars **/
    ///@{

    /** source location (in tokenizer) at which error was identified **/
    char const * src_function_ = nullptr;

    /** static error description **/
    std::string error_description_;

    /** input state associated with this error.
     *  Sufficient to precisely locate it with context.
     **/
    TkInputState input_state_;

    /** position of error, relative to token start **/
    size_t error_pos_ = 0;

    ///@}
}; /*TokenizerError*/
/** Stream insertion for TokenizerError: delegates to TokenizerError::print, then yields @p os **/
inline std::ostream &
operator<< (std::ostream & os, const TokenizerError & tkerr)
{
    tkerr.print(os);

    return os;
}
} /*namespace scm*/
} /*namespace xo*/
/* end TokenizerError.hpp */

View file

@ -0,0 +1,328 @@
/** @file buffer.hpp **/
#pragma once
#include "span.hpp"
#include <cassert>
#include <cstdint>
#include <new>
#include <type_traits>
#include <utility>
namespace xo {
namespace scm {
/**
* @class buffer buffer.hpp
*
* @brief Container for a (possibly owned) FIFO queue of chars
*
* @tparam CharT. buffer element type.
*
* @code
* .buf
*
* +------------------------------------------+
* | | ... | | X| ... | X| | ... | |
* +------------------------------------------+
* ^ ^ ^ ^
* 0 .lo .hi .buf_z
*
* <-contents-><----avail----->
* @endcode
*
* Buffer does not support wrapped content:
* content that has not been consumed always occupies contiguous memory.
*
* Example:
* @code
* // 1.
* buffer<char> buf(64*1024);
* buf.empty() -> true
* buf.buf_z() -> 65536
* buf.lo_pos() -> 0
* buf.hi_pos() -> 0
* buf.contents() -> empty span
* buf.avail() -> span entire buffer memory
*
* // write to (a prefix of) buf.avail()
* ::strncpy(buf.buf(), "hello, world\n", 13);
* buf.produce(span_type(buf.buf(), buf.buf() + 13));
*
* buf.lo_pos() -> 0
* buf.hi_pos() -> 13
* buf.contents() -> "hello, world\n";
*
*
* // examine stored content (does not change buffer state)
* auto span = buf.contents();
* cerr << string_view(span.lo(), span.hi()); // "hello, world\n"
*
* // consume (a prefix of) stored content
* buf.consume(span.prefix(7));
*
* buf.lo_pos() -> 7
* buf.hi_pos() -> 13
* buf.contents() -> "world\n"
*
* // consuming all remain content resets to original state
* buf.consume(buf.contents());
*
* buf.empty() -> true
* buf.hi_pos() -> 0 // not 13!
*
* // 2.
* buffer<char> buf;
* buf.empty() -> true
* buf.buf_z() -> 0
* buf.lo_pos() -> 0
* buf.hi_pos() -> 0
* buf.contents() -> empty span
* buf.avail() -> empty span
*
* // allocate memory separately from ctor
* buf.alloc(64*1024);
* @endcode
**/
template <typename CharT>
class buffer {
public:
    /** @brief typealias for span of CharT **/
    using span_type = span<CharT>;
    /** @brief typealias for buffer size (counts CharT's, not bytes) **/
    using size_type = std::uint64_t;

public:
    /** @brief create empty buffer.
        Does not allocate any storage; @see alloc
    **/
    buffer() = default;

    /** @brief create empty buffer, and possibly allocate storage.
        @param buf_z   Buffer size. allocate storage (owned by this buffer) if >0.
        @param align_z Align to this value, e.g. 8 to align storage on an 8-byte boundary.
                       Remembered so the storage can be released with the matching
                       aligned deallocation function.
    **/
    buffer(size_type buf_z,
           size_type align_z = sizeof(char))
        : is_owner_{true},
          buf_{buf_z ? (new (std::align_val_t(align_z)) CharT [buf_z]) : nullptr},
          buf_z_{buf_z},
          lo_pos_{0},
          hi_pos_{0},
          align_z_{align_z}
    {}

    /** @brief buffer is not copyable **/
    buffer(buffer const & x) = delete;

    /** @brief move constructor. Takes over the state (and any owned storage) of @p x.
        @post @p x is in a valid, empty, non-owning state
    **/
    buffer(buffer && x) noexcept
        : is_owner_{std::exchange(x.is_owner_, false)},
          buf_{std::exchange(x.buf_, nullptr)},
          buf_z_{std::exchange(x.buf_z_, 0)},
          lo_pos_{std::exchange(x.lo_pos_, 0)},
          hi_pos_{std::exchange(x.hi_pos_, 0)},
          align_z_{std::exchange(x.align_z_, sizeof(char))}
    {}

    /** @brief destructor. Release storage if owned **/
    ~buffer() { this->reset(); }

    /** @name Access methods **/
    ///@{

    /** @brief start of buffer memory **/
    CharT * buf() const { return buf_; }
    /** @brief buffer size (number of characters) **/
    size_type buf_z() const { return buf_z_; }
    /** @brief current start position within buffer **/
    size_type lo_pos() const { return lo_pos_; }
    /** @brief current end position within buffer **/
    size_type hi_pos() const { return hi_pos_; }

    ///@}

    /** @brief readonly access to a single buffer element.
        Relative to start of buffer (ignores current consume position).
        No bounds check is performed.
    **/
    CharT const & operator[](size_type i) const { return buf_[i]; }

    /** @brief return span for current buffer contents **/
    span_type contents() const { return span_type(buf_ + lo_pos_,
                                                  buf_ + hi_pos_); }

    /** @brief returns span for writable buffer contents (unused suffix following produce position) **/
    span_type avail() const { return span_type(buf_ + hi_pos_,
                                               buf_ + buf_z_); }

    /** @brief @c true iff buffer is empty **/
    bool empty() const { return lo_pos_ == hi_pos_; }

    /**
       @brief update buffer produce position, after (independently) writing contents of span to it

       @pre left endpoint of @p span equals buffer produce position (@c .hi_pos)
       @pre right endpoint of @p span within bounds of buffer memory range
       @post right endpoint of @p span equals buffer produce position.
    **/
    void produce(span_type const & span) {
        assert(span.lo() == buf_ + hi_pos_);
        /* enforce the documented upper bound: produced content must fit in buffer memory */
        assert(hi_pos_ + span.size() <= buf_z_);

        hi_pos_ += span.size();
    }

    /**
       @brief update buffer consume position, when done with contents of span

       @pre left endpoint of @p span equals buffer consume position (@c .lo_pos)
       @pre right endpoint of @p span within bounds of buffer memory range
       @post Either
         buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0.
         buffer is non-empty, right endpoint of @p span equals new buffer consume position.
    **/
    void consume(span_type const & span) {
        if (span.size()) {
            assert(span.lo() == buf_ + lo_pos_);

            lo_pos_ += span.size();
        } else {
            /* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos,
             * we don't want to blow up when called with an empty span -- argument
             * may represent some pre-reset location in buffer
             */
        }

        /* fully drained: rewind both positions so the whole buffer is available again */
        if (lo_pos_ == hi_pos_) {
            lo_pos_ = 0;
            hi_pos_ = 0;
        }
    }

    /**
       @brief allocate buffer with desired amount of memory.
       Any existing state (and owned storage) is discarded first.

       @param buf_z   desired buffer size
       @param align_z alignment; buffer memory will be aligned on this byte-boundary.
    **/
    void alloc(size_type buf_z, size_type align_z = sizeof(char)) {
        /* properly reset (+ discard) any existing state */
        this->reset();

        is_owner_ = true;
        if (buf_z)
            buf_ = new (std::align_val_t(align_z)) CharT [buf_z];
        buf_z_ = buf_z;
        lo_pos_ = 0;
        hi_pos_ = 0;
        /* needed later to pick the matching deallocation function */
        align_z_ = align_z;
    }

    /**
       @brief attach buffer to (unowned) range of @p buf_z elements starting at @p buf[0]

       Buffer is not responsible for managing storage.

       @post
       1. buffer is empty
       @post
       2. buffer read position = buffer write position = 0
    **/
    void setbuf(CharT * buf, size_type buf_z) {
        /* properly reset (+ discard) any existing state */
        this->reset();

        is_owner_ = false;
        lo_pos_ = 0;
        hi_pos_ = 0;
        buf_ = buf;
        buf_z_ = buf_z;
    }

    /**
       @brief revert buffer to empty state and possibly zero it

       @param zero_buffer_flag   Zero buffer contents iff this is true

       @post
       1. buffer is empty
       @post
       2. buffer read position = buffer write position = 0
    **/
    void clear2empty(bool zero_buffer_flag) {
        if (buf_ && zero_buffer_flag)
            explicit_bzero(buf_, buf_z_ * sizeof(CharT));

        lo_pos_ = 0;
        hi_pos_ = 0;
    }

    /**
       @brief swap representation with another buffer instance.
    **/
    void swap (buffer & x) noexcept {
        std::swap(is_owner_, x.is_owner_);
        std::swap(buf_, x.buf_);
        std::swap(buf_z_, x.buf_z_);
        std::swap(lo_pos_, x.lo_pos_);
        std::swap(hi_pos_, x.hi_pos_);
        std::swap(align_z_, x.align_z_);
    }

    /**
       @brief reset buffer to an empty state and recover owned storage
    **/
    void reset() {
        if (is_owner_ && buf_)
            this->release_storage();

        is_owner_ = false;
        buf_ = nullptr;
        buf_z_ = 0;
        lo_pos_ = 0;
        hi_pos_ = 0;
        align_z_ = sizeof(char);
    }

    /**
       @brief move-assignment operator.
       Storage owned by the left-hand side is released before taking over @p x;
       self-assignment is a no-op.

       @param x  right-hand-side to move from.

       @post
       @p x is in a valid, empty, non-owning state
    **/
    buffer & operator= (buffer && x) noexcept {
        if (this != &x) {
            /* without this, storage previously owned by *this would leak */
            this->reset();

            is_owner_ = std::exchange(x.is_owner_, false);
            buf_ = std::exchange(x.buf_, nullptr);
            buf_z_ = std::exchange(x.buf_z_, 0);
            lo_pos_ = std::exchange(x.lo_pos_, 0);
            hi_pos_ = std::exchange(x.hi_pos_, 0);
            align_z_ = std::exchange(x.align_z_, sizeof(char));
        }

        return *this;
    }

    /** @brief buffer is not copy-assignable */
    buffer & operator= (const buffer & x) = delete;

private:
    /** @brief free owned storage at @c buf_ (caller has checked ownership and non-null).

        Storage was obtained from the aligned form of @c operator @c new[], so it is
        handed back to the aligned form of @c operator @c delete[] with the same alignment.
        This direct call is used only for trivially-destructible element types,
        where no destructors need to run and (on mainstream ABIs) the array
        new-expression adds no bookkeeping in front of the elements; other element
        types keep the previous @c delete[] behaviour.
    **/
    void release_storage() noexcept {
        if constexpr (std::is_trivially_destructible_v<CharT>) {
            ::operator delete[](static_cast<void *>(buf_), std::align_val_t(align_z_));
        } else {
            delete [] buf_;
        }
    }

private:
    /** @brief true iff buffer is responsible for freeing storage at @c buf_ **/
    bool is_owner_ = false;
    /** @brief buffer contents. buffer memory comprises @c buf_[0] up to (not including) @c buf_[buf_z_] **/
    CharT * buf_ = nullptr;
    /** @brief buffer size (in units of CharT) **/
    size_type buf_z_ = 0;
    /** @brief buffer read (consume) position

        @invariant
        0 <= lo_pos_ <= hi_pos_ <= buf_z_
    **/
    size_type lo_pos_ = 0;
    /** @brief buffer write (produce) position

        @invariant
        0 <= lo_pos_ <= hi_pos_ <= buf_z_
    **/
    size_type hi_pos_ = 0;
    /** @brief alignment requested when owned storage was allocated.
        Only meaningful while @c is_owner_ is true and @c buf_ is non-null.
    **/
    size_type align_z_ = sizeof(char);
};
/** @brief Free-function @c swap for @c buffer<CharT>, so that buffers are swappable.
    Forwards to the member function @c buffer::swap.
**/
template <typename CharT>
inline void
swap(buffer<CharT> & lhs, buffer<CharT> & rhs)
{
    lhs.swap(rhs);
}
} /*namespace scm*/
} /*namespace xo*/
/* end buffer.hpp */

View file

@ -0,0 +1,81 @@
/* file scan_result.hpp
*
* author: Roland Conybeare, Jun 2025
*/
#pragma once
#include "Token.hpp"
#include "TokenizerError.hpp"
#include "TkInputState.hpp"
namespace xo {
namespace scm {
/** @class scan_result
* @brief Represent result of parsing one input token.
*
* @code
* Possible outcomes fall into several categories
* (with T: @c token_.is_valid(), E: @c error_.is_error())
*
* | T | E | description |
* |-------+-------+-------------------------------------|
* | false | false | end of input, including end of line |
* | true | false | parsed token in T |
* | false | true | parse error in E |
*
* @endcode
**/
class scan_result {
public:
using CharT = char;
using token_type = Token;
using span_type = span<const CharT>;
using error_type = TokenizerError;
using input_state_type = TkInputState;
public:
scan_result(const Token & token,
const span_type & consumed,
const TokenizerError & error = TokenizerError())
: token_{token}, consumed_{consumed}, error_{error} {}
static scan_result make_whitespace(const span_type & prefix_input);
static scan_result make_partial(const span_type & prefix_input);
/**
* @p error_src can be __FUNCTION__ from site where error generated.
* @p error_msg error message
* @p error_pos error position, relative to start of token
* @p input_state_ref input state object;
* copied into scan_result, and leaving input_state_ref.current_line cleared
**/
static scan_result make_error_consume_current_line(const char * error_src,
std::string error_msg,
size_t error_pos,
input_state_type & input_state_ref);
bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); }
bool is_token() const { return token_.is_valid(); }
bool is_error() const { return error_.is_error(); }
const Token & get_token() const { return token_; }
const span_type & consumed() const { return consumed_; }
const TokenizerError & error() const { return error_; }
public:
/** Successfully parsed token, whenever tk_type != tokentype::tk_invalid.
* Will be tokentype::tk_invalid in normal cause of events for valid input,
* when consuming whitespace
**/
token_type token_;
/** input span represented by .token, on success. Otherwise not defined **/
span_type consumed_;
/** error description, whenever .error_.is_error() is true **/
TokenizerError error_;
};
} /*namespace scm*/
} /*namespace xo*/
/* end scan_result.hpp */

View file

@ -0,0 +1,291 @@
/** @file span.hpp **/
#pragma once
#include "xo/indentlog/scope.hpp"
#include "xo/indentlog/print/ppdetail_atomic.hpp"
#include <ostream>
#include <cstdint>
#include <cassert>
namespace xo {
namespace scm {
/** @class span compression/span.hpp
*
* @brief A contiguous range of characters, without ownership.
*
* @tparam CharT type for elements referred to by this span.
**/
template <typename CharT>
class span {
public:
/** @defgroup span-type-traits span type traits **/
///@{
/** typealias for span size (in units of CharT) **/
using size_type = std::uint64_t;
///@}
public:
/** @defgroup span-ctors span constructors **/
///@{
/** null span **/
span() : lo_{nullptr}, hi_{nullptr} {}
/** Create span for the contiguous memory range [@p lo, @p hi) **/
span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {}
/** explicit conversion from span<U> **/
template<typename CharU>
span(const span<CharU> & other,
std::enable_if_t<std::is_convertible_v<CharU*, CharT*>
&& !std::is_same_v<CharU, CharT>> * = nullptr)
: lo_{other.lo()}, hi_{other.hi()} {}
/** copy ctor (explicit to avoid ambiguity with template ctor) **/
span(const span & other) = default;
span & operator=(const span & other) = default;
/** Create a null span (i.e. with null @p lo, @p hi pointers)
* A null span can be concatenated with any other span
* without triggering matching-endpoint asserts.
**/
static span make_null() { return span(static_cast<CharT*>(nullptr), static_cast<CharT*>(nullptr)); }
/** @brief create span for C-style string @p cstr
 *
 *  The span runs from @p cstr up to (not including) its terminating NUL,
 *  with the length taken from strlen. A null @p cstr gives a span whose
 *  @c lo and @c hi are both null.
 *
 *  NOTE(review): @p cstr (a pointer to const) initializes a @c CharT* without a
 *  cast, so this presumably only compiles when @c CharT is const-qualified -- confirm.
 *  NOTE(review): strlen is called unqualified and no <cstring> include is
 *  visible in this header -- confirm it arrives through another include.
 **/
static span from_cstr(const CharT * cstr) {
CharT * lo = cstr;
CharT * hi = cstr ? cstr + strlen(cstr) : nullptr;
return span(lo, hi);
}
/** @brief create span from std::string @p str
 *
 *  The span covers exactly the @c str.size() characters of @p str
 *  (terminating NUL excluded); an empty string gives an empty span.
 *  The span refers to storage owned by @p str, so @p str must outlive it.
 *
 *  Endpoints come from @c data() / @c size() rather than from dereferencing
 *  iterators: dereferencing @c str.end() is not permitted.
 **/
static span from_string(const std::string& str) {
    CharT * lo = str.data();
    CharT * hi = lo + str.size();

    return span(lo, hi);
}
/** @brief concatenate two contiguous spans */
static span concat(const span & span1, const span & span2) {
if (span1.is_null())
return span2;
if (span2.is_null())
return span1;
if (span1.hi() != span2.lo()) {
scope log(XO_DEBUG(true));
log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo()));
}
assert(span1.hi() == span2.lo());
CharT * lo = span1.lo();
CharT * hi = span2.hi();
return span(lo, hi);
}
///@}
/** @defgroup span-access-methods **/
///@{
CharT * lo() const { return lo_; } /* get member span::lo_ */
CharT * hi() const { return hi_; } /* get member span::hi_ */
///@}
/** @defgroup span-general-methods **/
///@{
/** @brief strip prefix until first occurence of '\n', including the newline **/
void discard_until_newline() {
for (const CharT * p = lo_; p < hi_; ++p) {
if (*p == '\n') {
lo_ = p + 1;
return;
}
}
lo_ = hi_;
}
/** Create new span over supplied type,
* with identical (possibly misaligned) endpoints.
*
* @warning
* 1. New span uses exactly the same memory addresses.
* Endpoint pointers may not be aligned.
* 2. Implementation assumes code compiled with
* @code -fno-strict-aliasing @endcode enabled.
*
* @tparam OtherT element type for new span
**/
template <typename OtherT>
span<OtherT>
cast() const { return span<OtherT>(reinterpret_cast<OtherT *>(lo_),
reinterpret_cast<OtherT *>(hi_)); }
/** @brief create span including the first @p z members of this span. **/
span prefix(size_type z) const { return span(lo_, lo_ + z); }
/** @brief create span representing prefix up to (but not including) @p *p
**/
span prefix_upto(CharT * p) const {
if (p <= hi_)
return span(lo_, p);
else
return span(lo_, hi_);
}
/** @brief create span with first @p z members of this span removed **/
span after_prefix(size_type z) const {
if (lo_ + z > hi_)
z = hi_ - lo_;
return span(lo_ + z, hi_);
}
/** @brief create span with @p prefix of this span removed **/
span after_prefix(const span & prefix) const {
if (!prefix.is_null() && (prefix.lo() != lo_)) {
throw std::runtime_error
("after_prefix: expected prefix of this span");
}
return after_prefix(prefix.size());
}
/** Create span starting with position @p p.
* Does boundary checking; will return empty span if @p p is outside @c [lo_,hi)
**/
span suffix_from(CharT * p) const {
if ((lo_ <= p) && (p <= hi_))
return span(p, hi_);
else
return span(hi_, hi_);
}
/** true iff this span is null. distinct from empty. **/
bool is_null() const { return lo_ == nullptr && hi_ == nullptr; }
/** true iff this span is empty (comprises 0 elements). **/
bool empty() const { return lo_ == hi_; }
/** report the number of elements (of type CharT) in this span. **/
size_type size() const { return hi_ - lo_; }
/** increase extent of this spans to include @p x.
* Requires @c hi() == @c x.lo()
**/
span & operator+=(const span & x) {
if (hi_ == x.lo_) {
hi_ = x.hi_;
} else if (!x.is_null()) {
assert(false);
}
return *this;
}
/** print representation for this span on stream @p os **/
void print(std::ostream & os) const {
os << "<span"
<< xtag("addr", (void*)lo_)
<< xtag("size", size())
<< " :text " << xo::print::quot(std::string_view(lo_, hi_))
<< ">";
}
///@}
private:
/** @defgroup span-instance-vars **/
///@{
/** start of span.
Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
**/
CharT * lo_ = nullptr;
/** @brief end of span.
Span comprises memory address between @p lo (inclusive) and @p hi (exclusive)
**/
CharT * hi_ = nullptr;
///@}
}; /*span*/
/** @defgroup span-operators **/
///@{
/** Equality test for spans.
 *  True iff @p lhs and @p rhs have the same @c lo() and the same @c hi().
 *  Only the endpoints are compared, not the characters they delimit.
 **/
template <typename CharT>
inline bool
operator==(const span<CharT> & lhs, const span<CharT> & rhs) {
    if (lhs.lo() != rhs.lo())
        return false;

    return lhs.hi() == rhs.hi();
}
/** Inequality test for spans.
 *  True iff @p lhs and @p rhs differ in @c lo(), in @c hi(), or in both.
 **/
template <typename CharT>
inline bool
operator!=(const span<CharT> & lhs, const span<CharT> & rhs) {
    const bool same_lo = (lhs.lo() == rhs.lo());
    const bool same_hi = (lhs.hi() == rhs.hi());

    return !(same_lo && same_hi);
}
/** Stream inserter for spans; intended for diagnostics.
 *  Delegates to @c span::print and returns @p os so inserters can be chained.
 **/
template <typename CharT>
inline std::ostream &
operator<<(std::ostream & os, const span<CharT> & x)
{
    x.print(os);

    return os;
}
///@}
} /*namespace scm*/
namespace print {
/** Wrapper holding a copy of a span (two pointers, no character data).
 *  Inserting a printspan_impl on a stream writes the span's elements
 *  one at a time, instead of the diagnostic summary from span::print.
 **/
template <typename CharT>
class printspan_impl {
public:
    printspan_impl(xo::scm::span<CharT> x) : span_(x) {}

    /** span whose elements get printed **/
    xo::scm::span<CharT> span_;
};
/** Wrap @p span in a printspan_impl, so it can be inserted on a stream
 *  as its raw elements.
 **/
template <typename CharT>
printspan_impl<CharT> printspan(const xo::scm::span<CharT>& span) {
    printspan_impl<CharT> wrapper(span);

    return wrapper;
}
/** Write each element of the wrapped span to @p os, in order from
 *  @c lo() up to (not including) @c hi().
 **/
template <typename CharT>
inline std::ostream &
operator<< (std::ostream & os,
            const printspan_impl<CharT> & x)
{
    const CharT * cursor = x.span_.lo();

    while (cursor < x.span_.hi()) {
        os << *cursor;
        ++cursor;
    }

    return os;
}
#ifndef ppdetail_atomic
    /* Apply PPDETAIL_ATOMIC_BODY to both printable span types.
     * NOTE(review): presumably this hooks the types into the indentlog
     * pretty-printer as atomic values -- confirm against ppdetail_atomic.hpp.
     * NOTE(review): guard tests lowercase `ppdetail_atomic`; confirm intended.
     */
    template <typename CharT>
    PPDETAIL_ATOMIC_BODY(printspan_impl<CharT>);

    template <typename CharT>
    PPDETAIL_ATOMIC_BODY(xo::scm::span<CharT>);
#endif
}
} /*namespace xo*/

View file

@ -0,0 +1,192 @@
/** @file tokentype.hpp
*
* author: Roland Conybeare, Jul 2024
**/
#pragma once
#include "xo/indentlog/print/tag.hpp" // for STRINGIFY
#include "xo/indentlog/print/ppdetail_atomic.hpp"
#include <ostream>
namespace xo {
namespace scm {
/** @enum tokentype
 * Enum to identify different schematika input token types
 *
 * Schematika code examples:
 *
 * @code
 * type point :: { xcoord : f64, ycoord : f64 };
 * type matrix :: array<double, 2>; // 2-d array
 *
 * decl hypot(x : f64, y : f64) -> f64;
 *
 * def hypot(x : f64, y : f64) {
 *   let
 *     x2 = (x * x);
 *     y2 = (y * y);
 *     hypot2 = (x2 + y2);
 *   in
 *     sqrt(hypot2);
 * };
 *
 * def someconst 4;
 *
 * def foo(v : vec<i32>) {
 *   def (pi : f64) = 3.1415926;
 *   def (h : (f64,f64) -> f64) = hypot;
 *
 *   h = hypot3;
 * };
 *
 * def matrixproduct(x : matrix, y : matrix) {
 *   [i, j : x.row(i) * y.col(j)];
 * };
 * @endcode
 *
 * Enumerators after @c tk_invalid are numbered consecutively from 0,
 * so @c n_tokentype equals the number of valid token types.
 **/
enum class tokentype {
/** sentinel value; the only negative enumerator **/
tk_invalid = -1,
/** a boolean constant **/
tk_bool,
/** an integer constant (signed 64-bit integer) **/
tk_i64,
/** a 64-bit floating-point constant **/
tk_f64,
/** a string literal **/
tk_string,
/** a symbol **/
tk_symbol,
/** left-hand parenthesis @c '(' **/
tk_leftparen,
/** right-hand parenthesis @c ')' **/
tk_rightparen,
/** left-hand bracket @c '[' **/
tk_leftbracket,
/** right-hand bracket @c ']' **/
tk_rightbracket,
/** left-hand brace @c '{' **/
tk_leftbrace,
/** right-hand brace @c '}' **/
tk_rightbrace,
/** left-hand angle bracket @c '<' **/
tk_leftangle,
/** right-hand angle bracket @c '>' **/
tk_rightangle,
/** less-or-equal @c '<=' **/
tk_lessequal,
/** greater-or-equal @c '>=' **/
tk_greatequal,
/** dot @c '.' **/
tk_dot,
/** comma @c ',' **/
tk_comma,
/** colon @c ':' **/
tk_colon,
/** double-colon @c '::' **/
tk_doublecolon,
/** semicolon @c ';' **/
tk_semicolon,
/** single equals sign @c '=' **/
tk_singleassign,
/** assignment @c ':=' **/
tk_assign,
/** indirection @c '->' **/
tk_yields,
/* note: operators are not treated as punctuation.
 * 'do-always' is a legal variable name,
 * as is 'maybe*2', 'maybe+1', 'path/to/foo'
 */
/** operator @c '+' **/
tk_plus,
/** operator @c '-' **/
tk_minus,
/** operator @c '*' **/
tk_star,
/** operator @c '/' **/
tk_slash,
/** operator @c '==' **/
tk_cmpeq,
/** operator @c '!=' **/
tk_cmpne,
/** keyword @c 'type' **/
tk_type,
/** keyword @c 'def' **/
tk_def,
/** keyword @c 'lambda' **/
tk_lambda,
/** keyword @c 'if' **/
tk_if,
/** keyword @c 'then' **/
tk_then,
/** keyword @c 'else' **/
tk_else,
/** keyword @c 'let' **/
tk_let,
/** keyword @c 'in' **/
tk_in,
/** keyword @c 'end' **/
tk_end,
/** number of valid token types (not itself a token type); keep last **/
n_tokentype
}; /*tokentype*/
/** Describe @p tk_type as a C string.
 *  For example @c tokentype_descr(tokentype::tk_if) -> @c "if"
 *
 *  Declared here only; the definition is not part of this header.
 **/
const char *
tokentype_descr(tokentype tk_type);
/** Stream inserter for tokentype:
 *  writes the text from @ref tokentype_descr for @p tk_type to @p os.
 **/
inline std::ostream &
operator<< (std::ostream & os, tokentype tk_type)
{
    return os << tokentype_descr(tk_type);
}
} /*namespace scm*/
/* Apply PPDETAIL_ATOMIC to tokentype, inside namespace xo::print.
 * NOTE(review): presumably this hooks tokentype into the indentlog
 * pretty-printer as an atomic value -- confirm against ppdetail_atomic.hpp.
 * NOTE(review): the guard tests lowercase `ppdetail_atomic`, while the macro
 * used is uppercase PPDETAIL_ATOMIC -- confirm the guard name is intended.
 */
#ifndef ppdetail_atomic
namespace print {
PPDETAIL_ATOMIC(xo::scm::tokentype);
} /*namespace print*/
#endif
} /*namespace xo*/
/* end tokentype.hpp */