From 7f1afac903c2a018e6c72927fb1be9615d018f95 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 21 Nov 2025 09:19:06 -0500 Subject: [PATCH] xo-tokenizer: refactor + satisfy clang on osx --- .../include/xo/interpreter/GlobalEnv.hpp | 7 +- .../interpreter/VirtualSchematikaMachine.hpp | 2 +- xo-interpreter/src/interpreter/GlobalEnv.cpp | 3 +- xo-interpreter/src/interpreter/Schematika.cpp | 3 +- .../include/xo/tokenizer/input_state.hpp | 10 ++- .../include/xo/tokenizer/tokenizer.hpp | 88 +++++++++++-------- 6 files changed, 70 insertions(+), 43 deletions(-) diff --git a/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp b/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp index 0dcc392c..f6e660b9 100644 --- a/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp +++ b/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp @@ -16,7 +16,10 @@ namespace xo { /** Create top-level global environment, allocating via @p mm. * Expect one of these per interpreter session. **/ - static gp make_empty(gc::IAlloc * mm, const rp & symtab); + static gp make_empty(gc::IAlloc * mm, + const rp & symtab); + + gc::IAlloc * get_mm() const { return mm_; } // inherited from Object.. virtual TaggedPtr self_tp() const final override; @@ -30,7 +33,7 @@ namespace xo { private: /** memory manager to use **/ - gc::IAlloc * mm_; + gc::IAlloc * mm_ = nullptr; /** global symbol table. * variables known to @c symtab_ are represented by diff --git a/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp b/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp index da582e2e..ca4ede2a 100644 --- a/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp +++ b/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp @@ -57,7 +57,7 @@ namespace xo { void report_error(const std::string & err); /** implementation class; contains instruction implementations **/ - friend class VsmOps; + friend struct VsmOps; private: /** program counter. 
diff --git a/xo-interpreter/src/interpreter/GlobalEnv.cpp b/xo-interpreter/src/interpreter/GlobalEnv.cpp index dd94b6b5..7eee0f86 100644 --- a/xo-interpreter/src/interpreter/GlobalEnv.cpp +++ b/xo-interpreter/src/interpreter/GlobalEnv.cpp @@ -17,7 +17,8 @@ namespace xo { } GlobalEnv::GlobalEnv(gc::IAlloc * mm, - const rp & symtab) : mm_{mm}, symtab_{symtab} + const rp & symtab) : mm_{mm}, + symtab_{symtab} {} TaggedPtr diff --git a/xo-interpreter/src/interpreter/Schematika.cpp b/xo-interpreter/src/interpreter/Schematika.cpp index 870d9e7a..19a6907e 100644 --- a/xo-interpreter/src/interpreter/Schematika.cpp +++ b/xo-interpreter/src/interpreter/Schematika.cpp @@ -8,6 +8,7 @@ #include "xo/reader/reader.hpp" #include #include +#include // for STDIN_FILENO on OSX namespace xo { using xo::gc::IAlloc; @@ -231,7 +232,7 @@ namespace xo { // ----- Schematika ----- - Schematika::Schematika(const Config & cfg) : p_impl_{std::move(Impl::make(cfg))} + Schematika::Schematika(const Config & cfg) : p_impl_{Impl::make(cfg)} {} Schematika::~Schematika() diff --git a/xo-tokenizer/include/xo/tokenizer/input_state.hpp b/xo-tokenizer/include/xo/tokenizer/input_state.hpp index 8e73321f..0e93512d 100644 --- a/xo-tokenizer/include/xo/tokenizer/input_state.hpp +++ b/xo-tokenizer/include/xo/tokenizer/input_state.hpp @@ -91,7 +91,11 @@ namespace xo { /** Skip prefix of input comprising whitespace. * Return pointer to first non-whitespace character in @p input, - * or @c input.hi if input contains only whitespace + * or @c input.hi if input contains only whitespace. 
+ * + * if @p input contains any newlines, preserves suffix after last + * such newline in @p current_line_ + * **/ const CharT * skip_leading_whitespace(const span_type & input); @@ -105,7 +109,7 @@ namespace xo { span current_line_ = span(); /** current input position within @ref current_line_ **/ size_t current_pos_ = 0; - /** whitespace since end of preceding token, + /** number of whitespace chars since end of preceding token, * or last newline, whichever is less **/ size_t whitespace_ = 0; @@ -114,7 +118,7 @@ namespace xo { bool debug_flag_ = false; ///@} - }; + }; /*input_state*/ template bool diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp index 8d6ac215..0dd46877 100644 --- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp @@ -90,21 +90,16 @@ namespace xo { * a symbol token. Instead they force completion of * a preceding token, and start a new token with themselves **/ - bool is_1char_punctuation(CharT ch) const; + static bool is_1char_punctuation(CharT ch); /** more-relazed version of is_1char_punctuation. * Chars that are not permitted to appear within a symbol token, * but may form token combined with next character **/ - bool is_2char_punctuation(CharT ch) const; - - /** true if tokenizer contains stored prefix of - * possibly-incomplete token - **/ - bool has_prefix() const { return !prefix_.empty(); } + static bool is_2char_punctuation(CharT ch); /** assemble token from text @p token_text. - * @p token_text will often but not always represent a subset of @p input. + * @p token_text will often (but not always) represent a subset of @p input. 
* (For example consider multi-line string literals) * Also the span @p token_text may (in uncommon cases) * have been copied to separate storage from @p input @@ -115,13 +110,20 @@ namespace xo { * * retval.consumed will represent some possibly-empty prefix of @p input **/ - result_type assemble_token(std::size_t initial_whitespace, - std::size_t initial_token_prefix_from_input, - const span_type & token_text, - const span_type & input) const; + static result_type assemble_token(std::size_t initial_whitespace, + std::size_t initial_token_prefix_from_input, + const span_type & token_text, + const span_type & input, + const input_state_type & input_state); /** degenerate version of assemble_token() on reaching end-of-file **/ - result_type assemble_final_token(const span_type & token_text) const; + static result_type assemble_final_token(const span_type & token_text, + const input_state_type & input_state); + + /** true if tokenizer contains stored prefix of + * possibly-incomplete token + **/ + bool has_prefix() const { return !prefix_.empty(); } /** scan for next input token, given @p input. * Note: @@ -160,7 +162,8 @@ namespace xo { private: result_type scan_completion(const span_type & whitespace, const CharT* token_end, - const span_type & input); + const span_type & input, + const input_state_type & input_state); private: /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ @@ -168,7 +171,11 @@ namespace xo { /** track input state (line#,pos,..) for error messages. * There's an ordering problem here: - * 1. input_state_.skip_leading_whitespace() advances current line when it sees newline. + * 1. input_state_.skip_leading_whitespace() advances current line automagically + * when it sees \n + * 2. need to capture value of @ref input_state_ _before_ newline + * 3. but need newline to end token + * Also recall input_state_type needed for reporting errors. **/ input_state_type input_state_; /** Accumulate partial token here. 
@@ -187,7 +194,7 @@ namespace xo { template bool - tokenizer::is_1char_punctuation(CharT ch) const { + tokenizer::is_1char_punctuation(CharT ch) { switch(ch) { case '(': return true; @@ -247,7 +254,7 @@ namespace xo { template bool - tokenizer::is_2char_punctuation(CharT ch) const { + tokenizer::is_2char_punctuation(CharT ch) { /* can't put '-' here, because of the way it appears in numeric literals * characters here may not appear in symbol names */ @@ -278,12 +285,13 @@ namespace xo { tokenizer::assemble_token(std::size_t initial_whitespace, std::size_t initial_token_prefix_from_input, const span_type & token_text, - const span_type & input) const -> result_type + const span_type & input, + const input_state_type & input_state) -> result_type { /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; - scope log(XO_DEBUG(input_state_.debug_flag())); + scope log(XO_DEBUG(input_state.debug_flag())); log && log(xtag("token_text", token_text), xtag("initial_whitespace", initial_whitespace), xtag("initial_token_prefix_from_input", initial_token_prefix_from_input), @@ -386,7 +394,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "improperly placed sign indicator", - input_state_, + input_state, (ix - tk_start) )); } @@ -395,7 +403,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate decimal point in numeric literal", - input_state_, + input_state, (ix - tk_start))); } @@ -405,7 +413,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate exponent marker in numeric literal", - input_state_, + input_state, (ix - tk_start))); } @@ -421,7 +429,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "unexpected character in numeric constant" /*error_description*/, - input_state_, + input_state, (ix - tk_start))); } } @@ -524,7 +532,7 @@ namespace xo { return 
result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting key following escape character \\", - input_state_, + input_state, (ix - tk_start))); } @@ -553,7 +561,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting one of n|r|\"|\\ following escape \\", - input_state_, + input_state, (ix - tk_start))); } break; @@ -570,7 +578,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "missing terminating '\"' to complete literal string", - input_state_, + input_state, (ix - tk_start))); } @@ -712,7 +720,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "illegal input character", - input_state_, + input_state, (ix - tk_start))); } @@ -767,21 +775,26 @@ namespace xo { input.prefix(initial_whitespace + initial_token_prefix_from_input)); } /*assemble_token*/ + /* TODO: input_state_ as argument ? */ template auto - tokenizer::assemble_final_token(const span_type & token_text) const -> result_type + tokenizer::assemble_final_token(const span_type & token_text, + const input_state_type & input_state) -> result_type { return assemble_token(0 /*initial_whitespace*/, 0 /*initial_token_prefix_from_input*/, token_text, - span_type::make_null()); + span_type::make_null(), + input_state); } + /* TODO: prefix_, input_state_ as arguments */ template auto tokenizer::scan_completion(const span_type & whitespace, const CharT* token_end, - const span_type & input) -> result_type { + const span_type & input, + const input_state_type & input_state) -> result_type { auto token_span = input.after_prefix(whitespace).prefix_upto(token_end); @@ -789,7 +802,8 @@ namespace xo { return assemble_token(whitespace.size(), token_span.size() /*initial_token_prefix_from_input*/, token_span, - input); + input, + input_state); } else { /* whatever we stashed in .prefix_, should be consumed from input. 
* control here implies reached end of input with either @@ -927,7 +941,8 @@ namespace xo { /* include next char and complete token */ ++ix; - return scan_completion(whitespace_span, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input, + this->input_state_); } /* here: -123, -.5e-21 for example */ @@ -945,7 +960,8 @@ namespace xo { if (ch2 != '=') { /* ignore next char and complete token */ - return scan_completion(whitespace_span, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input, + this->input_state_); } /* here: >= for example */ @@ -995,7 +1011,8 @@ namespace xo { } } - return scan_completion(whitespace_span, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input, + this->input_state_); } /*scan*/ template @@ -1052,7 +1069,8 @@ namespace xo { */ return result_type::make_whitespace(input); } else { - auto retval = assemble_final_token(span_type::from_string(prefix_)); + auto retval = assemble_final_token(span_type::from_string(prefix_), + this->input_state_); this->prefix_.clear();