From 7f1afac903c2a018e6c72927fb1be9615d018f95 Mon Sep 17 00:00:00 2001
From: Roland Conybeare <rconybeare@gmail.com>
Date: Fri, 21 Nov 2025 09:19:06 -0500
Subject: [PATCH] xo-tokenizer: refactor + satisfy clang on osx

---
 .../include/xo/interpreter/GlobalEnv.hpp      |  7 +-
 .../interpreter/VirtualSchematikaMachine.hpp  |  2 +-
 xo-interpreter/src/interpreter/GlobalEnv.cpp  |  3 +-
 xo-interpreter/src/interpreter/Schematika.cpp |  3 +-
 .../include/xo/tokenizer/input_state.hpp      | 10 ++-
 .../include/xo/tokenizer/tokenizer.hpp        | 88 +++++++++++--------
 6 files changed, 70 insertions(+), 43 deletions(-)
diff --git a/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp b/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp
index 0dcc392c..f6e660b9 100644
--- a/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp
+++ b/xo-interpreter/include/xo/interpreter/GlobalEnv.hpp
@@ -16,7 +16,10 @@ namespace xo {
             /** Create top-level global environment, allocating via @p mm.
              *  Expect one of these per interpreter session.
              **/
-            static gp<GlobalEnv> make_empty(gc::IAlloc * mm, const rp<GlobalSymtab> & symtab);
+            static gp<GlobalEnv> make_empty(gc::IAlloc * mm,
+                                            const rp<GlobalSymtab> & symtab);
+
+            gc::IAlloc * get_mm() const { return mm_; }
 
             // inherited from Object..
             virtual TaggedPtr self_tp() const final override;
@@ -30,7 +33,7 @@ namespace xo {
 
         private:
             /** memory manager to use **/
-            gc::IAlloc * mm_;
+            gc::IAlloc * mm_ = nullptr;
 
             /** global symbol table.
              *  variables known to @c symtab_ are represented by
diff --git a/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp b/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp
index da582e2e..ca4ede2a 100644
--- a/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp
+++ b/xo-interpreter/include/xo/interpreter/VirtualSchematikaMachine.hpp
@@ -57,7 +57,7 @@ namespace xo {
             void report_error(const std::string & err);
 
             /** implementation class; contains instruction implementations **/
-            friend class VsmOps;
+            friend struct VsmOps;
 
         private:
             /** program counter.
diff --git a/xo-interpreter/src/interpreter/GlobalEnv.cpp b/xo-interpreter/src/interpreter/GlobalEnv.cpp
index dd94b6b5..7eee0f86 100644
--- a/xo-interpreter/src/interpreter/GlobalEnv.cpp
+++ b/xo-interpreter/src/interpreter/GlobalEnv.cpp
@@ -17,7 +17,8 @@ namespace xo {
         }
 
         GlobalEnv::GlobalEnv(gc::IAlloc * mm,
-                             const rp<GlobalSymtab> & symtab) : mm_{mm}, symtab_{symtab}
+                             const rp<GlobalSymtab> & symtab) : mm_{mm},
+                                                                symtab_{symtab}
         {}
 
         TaggedPtr
diff --git a/xo-interpreter/src/interpreter/Schematika.cpp b/xo-interpreter/src/interpreter/Schematika.cpp
index 870d9e7a..19a6907e 100644
--- a/xo-interpreter/src/interpreter/Schematika.cpp
+++ b/xo-interpreter/src/interpreter/Schematika.cpp
@@ -8,6 +8,7 @@
 #include "xo/reader/reader.hpp"
 #include <replxx.hxx>
 #include <ostream>
+#include <unistd.h> // for STDIN_FILENO on OSX
 
 namespace xo {
     using xo::gc::IAlloc;
@@ -231,7 +232,7 @@ namespace xo {
 
         // ----- Schematika -----
 
-        Schematika::Schematika(const Config & cfg) : p_impl_{std::move(Impl::make(cfg))}
+        Schematika::Schematika(const Config & cfg) : p_impl_{Impl::make(cfg)}
         {}
 
         Schematika::~Schematika()
diff --git a/xo-tokenizer/include/xo/tokenizer/input_state.hpp b/xo-tokenizer/include/xo/tokenizer/input_state.hpp
index 8e73321f..0e93512d 100644
--- a/xo-tokenizer/include/xo/tokenizer/input_state.hpp
+++ b/xo-tokenizer/include/xo/tokenizer/input_state.hpp
@@ -91,7 +91,11 @@ namespace xo {
 
             /** Skip prefix of input comprising whitespace.
              *  Return pointer to first non-whitespace character in @p input,
-             *  or @c input.hi if input contains only whitespace
+             *  or @c input.hi if input contains only whitespace.
+             *
+             *  If @p input contains any newlines, preserves the suffix after the last
+             *  such newline in @ref current_line_
+             *
              **/
             const CharT * skip_leading_whitespace(const span_type & input);
 
@@ -105,7 +109,7 @@ namespace xo {
             span<const CharT> current_line_ = span<const CharT>();
             /** current input position within @ref current_line_ **/
             size_t current_pos_ = 0;
-            /** whitespace since end of preceding token,
+            /** number of whitespace chars since end of preceding token,
              *  or last newline, whichever is less
              **/
             size_t whitespace_ = 0;
@@ -114,7 +118,7 @@ namespace xo {
             bool debug_flag_ = false;
 
             ///@}
-        };
+        }; /*input_state*/
 
         template <typename CharT>
         bool
diff --git a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp
index 8d6ac215..0dd46877 100644
--- a/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp
+++ b/xo-tokenizer/include/xo/tokenizer/tokenizer.hpp
@@ -90,21 +90,16 @@ namespace xo {
              *  a symbol token.  Instead they force completion of
              *  a preceding token,  and start a new token with themselves
              **/
-            bool is_1char_punctuation(CharT ch) const;
+            static bool is_1char_punctuation(CharT ch);
 
             /** more-relazed version of is_1char_punctuation.
              *  Chars that are not permitted to appear within a symbol token,
              *  but may form token combined with next character
              **/
-            bool is_2char_punctuation(CharT ch) const;
-
-            /** true if tokenizer contains stored prefix of
-             *  possibly-incomplete token
-             **/
-            bool has_prefix() const { return !prefix_.empty(); }
+            static bool is_2char_punctuation(CharT ch);
 
             /** assemble token from text @p token_text.
-             *  @p token_text will often but not always represent a subset of @p input.
+             *  @p token_text will often (but not always) represent a subset of @p input.
              *  (For example consider multi-line string literals)
              *  Also the span @p token_text may (in uncommon cases)
              *  have been copied to separate storage from @p input
@@ -115,13 +110,20 @@ namespace xo {
              *
              *  retval.consumed will represent some possibly-empty prefix of @p input
              **/
-            result_type assemble_token(std::size_t initial_whitespace,
-                                       std::size_t initial_token_prefix_from_input,
-                                       const span_type & token_text,
-                                       const span_type & input) const;
+            static result_type assemble_token(std::size_t initial_whitespace,
+                                              std::size_t initial_token_prefix_from_input,
+                                              const span_type & token_text,
+                                              const span_type & input,
+                                              const input_state_type & input_state);
 
             /** degenerate version of assemble_token() on reaching end-of-file **/
-            result_type assemble_final_token(const span_type & token_text) const;
+            static result_type assemble_final_token(const span_type & token_text,
+                                                    const input_state_type & input_state);
+
+            /** true if tokenizer contains stored prefix of
+             *  possibly-incomplete token
+             **/
+            bool has_prefix() const { return !prefix_.empty(); }
 
             /** scan for next input token,  given @p input.
              *  Note:
@@ -160,7 +162,8 @@ namespace xo {
         private:
             result_type scan_completion(const span_type & whitespace,
                                         const CharT* token_end,
-                                        const span_type & input);
+                                        const span_type & input,
+                                        const input_state_type & input_state);
 
         private:
             /** @defgroup tokenizer-instance-vars tokenizer instance variables **/
@@ -168,7 +171,11 @@ namespace xo {
 
             /** track input state (line#,pos,..) for error messages.
              *  There's an ordering problem here:
-             *  1. input_state_.skip_leading_whitespace() advances current line when it sees newline.
+             *  1. input_state_.skip_leading_whitespace() advances current line automagically
+             *     when it sees \n
+             *  2. need to capture value of @ref input_state_ _before_ newline
+             *  3. but need newline to end token
+             *  Also recall input_state_type needed for reporting errors.
              **/
             input_state_type input_state_;
             /** Accumulate partial token here.
@@ -187,7 +194,7 @@ namespace xo {
 
         template <typename CharT>
         bool
-        tokenizer<CharT>::is_1char_punctuation(CharT ch) const {
+        tokenizer<CharT>::is_1char_punctuation(CharT ch) {
             switch(ch) {
             case '(':
                 return true;
@@ -247,7 +254,7 @@ namespace xo {
 
         template <typename CharT>
         bool
-        tokenizer<CharT>::is_2char_punctuation(CharT ch) const {
+        tokenizer<CharT>::is_2char_punctuation(CharT ch) {
             /* can't put '-' here, because of the way it appears in numeric literals
              * characters here may not appear in symbol names
              */
@@ -278,12 +285,13 @@ namespace xo {
         tokenizer<CharT>::assemble_token(std::size_t initial_whitespace,
                                          std::size_t initial_token_prefix_from_input,
                                          const span_type & token_text,
-                                         const span_type & input) const -> result_type
+                                         const span_type & input,
+                                         const input_state_type & input_state) -> result_type
         {
             /* literal|pretty|streamlined */
             log_config::style = function_style::streamlined;
 
-            scope log(XO_DEBUG(input_state_.debug_flag()));
+            scope log(XO_DEBUG(input_state.debug_flag()));
             log && log(xtag("token_text", token_text),
                        xtag("initial_whitespace", initial_whitespace),
                        xtag("initial_token_prefix_from_input", initial_token_prefix_from_input),
@@ -386,7 +394,7 @@ namespace xo {
                                 return result_type::make_error
                                     (error_type(__FUNCTION__ /*src_function*/,
                                                 "improperly placed sign indicator",
-                                                input_state_,
+                                                input_state,
                                                 (ix - tk_start)
                                         ));
                             }
@@ -395,7 +403,7 @@ namespace xo {
                                 return result_type::make_error
                                     (error_type(__FUNCTION__ /*src_function*/,
                                                 "duplicate decimal point in numeric literal",
-                                                input_state_,
+                                                input_state,
                                                 (ix - tk_start)));
                             }
 
@@ -405,7 +413,7 @@ namespace xo {
                                 return result_type::make_error
                                     (error_type(__FUNCTION__ /*src_function*/,
                                                 "duplicate exponent marker in numeric literal",
-                                                input_state_,
+                                                input_state,
                                                 (ix - tk_start)));
                             }
 
@@ -421,7 +429,7 @@ namespace xo {
                             return result_type::make_error
                                 (error_type(__FUNCTION__ /*src_function*/,
                                             "unexpected character in numeric constant" /*error_description*/,
-                                            input_state_,
+                                            input_state,
                                             (ix - tk_start)));
                         }
                     }
@@ -524,7 +532,7 @@ namespace xo {
                             return result_type::make_error
                                 (error_type(__FUNCTION__ /*src_function*/,
                                             "expecting key following escape character \\",
-                                            input_state_,
+                                            input_state,
                                             (ix - tk_start)));
                         }
 
@@ -553,7 +561,7 @@ namespace xo {
                             return result_type::make_error
                                 (error_type(__FUNCTION__ /*src_function*/,
                                             "expecting one of n|r|\"|\\ following escape \\",
-                                            input_state_,
+                                            input_state,
                                             (ix - tk_start)));
                         }
                         break;
@@ -570,7 +578,7 @@ namespace xo {
                     return result_type::make_error
                         (error_type(__FUNCTION__ /*src_function*/,
                                     "missing terminating '\"' to complete literal string",
-                                    input_state_,
+                                    input_state,
                                     (ix - tk_start)));
                 }
 
@@ -712,7 +720,7 @@ namespace xo {
                 return result_type::make_error
                     (error_type(__FUNCTION__ /*src_function*/,
                                 "illegal input character",
-                                input_state_,
+                                input_state,
                                 (ix - tk_start)));
             }
 
@@ -767,21 +775,26 @@ namespace xo {
                                input.prefix(initial_whitespace + initial_token_prefix_from_input));
         } /*assemble_token*/
 
+        /* input state is passed explicitly as input_state; this->input_state_ is not used here */
         template <typename CharT>
         auto
-        tokenizer<CharT>::assemble_final_token(const span_type & token_text) const -> result_type
+        tokenizer<CharT>::assemble_final_token(const span_type & token_text,
+                                               const input_state_type & input_state) -> result_type
         {
             return assemble_token(0 /*initial_whitespace*/,
                                   0 /*initial_token_prefix_from_input*/,
                                   token_text,
-                                  span_type::make_null());
+                                  span_type::make_null(),
+                                  input_state);
         }
 
+        /* TODO: prefix_ as argument (input_state_ is already passed in as input_state) */
         template <typename CharT>
         auto
         tokenizer<CharT>::scan_completion(const span_type & whitespace,
                                           const CharT* token_end,
-                                          const span_type & input) -> result_type {
+                                          const span_type & input,
+                                          const input_state_type & input_state) -> result_type {
 
             auto token_span = input.after_prefix(whitespace).prefix_upto(token_end);
 
@@ -789,7 +802,8 @@ namespace xo {
                 return assemble_token(whitespace.size(),
                                       token_span.size() /*initial_token_prefix_from_input*/,
                                       token_span,
-                                      input);
+                                      input,
+                                      input_state);
             } else {
                 /* whatever we stashed in .prefix_, should be consumed from input.
                  * control here implies reached end of input with either
@@ -927,7 +941,8 @@ namespace xo {
                             /* include next char and complete token */
                             ++ix;
 
-                            return scan_completion(whitespace_span, ix /*token_end*/, input);
+                            return scan_completion(whitespace_span, ix /*token_end*/, input,
+                                                   this->input_state_);
                         }
 
                         /* here: -123, -.5e-21 for example */
@@ -945,7 +960,8 @@ namespace xo {
 
                         if (ch2 != '=') {
                             /* ignore next char and complete token */
-                            return scan_completion(whitespace_span, ix /*token_end*/, input);
+                            return scan_completion(whitespace_span, ix /*token_end*/, input,
+                                                   this->input_state_);
                         }
 
                         /* here: >= for example */
@@ -995,7 +1011,8 @@ namespace xo {
                 }
             }
 
-            return scan_completion(whitespace_span, ix /*token_end*/, input);
+            return scan_completion(whitespace_span, ix /*token_end*/, input,
+                                   this->input_state_);
         } /*scan*/
 
         template <typename CharT>
@@ -1052,7 +1069,8 @@ namespace xo {
                  */
                 return result_type::make_whitespace(input);
             } else {
-                auto retval = assemble_final_token(span_type::from_string(prefix_));
+                auto retval = assemble_final_token(span_type::from_string(prefix_),
+                                                   this->input_state_);
 
                 this->prefix_.clear();