From 77dbbdcb2239567a295bc4d4cb0c32fbf1be70c0 Mon Sep 17 00:00:00 2001 From: Roland Conybeare Date: Fri, 21 Nov 2025 09:19:06 -0500 Subject: [PATCH] xo-tokenizer: refactor + satisfy clang on osx --- include/xo/tokenizer/input_state.hpp | 10 +++- include/xo/tokenizer/tokenizer.hpp | 88 +++++++++++++++++----------- 2 files changed, 60 insertions(+), 38 deletions(-) diff --git a/include/xo/tokenizer/input_state.hpp b/include/xo/tokenizer/input_state.hpp index 8e73321f..0e93512d 100644 --- a/include/xo/tokenizer/input_state.hpp +++ b/include/xo/tokenizer/input_state.hpp @@ -91,7 +91,11 @@ namespace xo { /** Skip prefix of input comprising whitespace. * Return pointer to first non-whitespace character in @p input, - * or @c input.hi if input contains only whitespace + * or @c input.hi if input contains only whitespace. + * + * if @p input contains any newlines, preserves suffix after last + * such newline in @p current_line_ + * **/ const CharT * skip_leading_whitespace(const span_type & input); @@ -105,7 +109,7 @@ namespace xo { span current_line_ = span(); /** current input position within @ref current_line_ **/ size_t current_pos_ = 0; - /** whitespace since end of preceding token, + /** number of whitespace chars since end of preceding token, * or last newline, whichever is less **/ size_t whitespace_ = 0; @@ -114,7 +118,7 @@ namespace xo { bool debug_flag_ = false; ///@} - }; + }; /*input_state*/ template bool diff --git a/include/xo/tokenizer/tokenizer.hpp b/include/xo/tokenizer/tokenizer.hpp index 8d6ac215..0dd46877 100644 --- a/include/xo/tokenizer/tokenizer.hpp +++ b/include/xo/tokenizer/tokenizer.hpp @@ -90,21 +90,16 @@ namespace xo { * a symbol token. Instead they force completion of * a preceding token, and start a new token with themselves **/ - bool is_1char_punctuation(CharT ch) const; + static bool is_1char_punctuation(CharT ch); /** more-relazed version of is_1char_punctuation. 
* Chars that are not permitted to appear within a symbol token, * but may form token combined with next character **/ - bool is_2char_punctuation(CharT ch) const; - - /** true if tokenizer contains stored prefix of - * possibly-incomplete token - **/ - bool has_prefix() const { return !prefix_.empty(); } + static bool is_2char_punctuation(CharT ch); /** assemble token from text @p token_text. - * @p token_text will often but not always represent a subset of @p input. + * @p token_text will often (but not always) represent a subset of @p input. * (For example consider multi-line string literals) * Also the span @p token_text may (in uncommon cases) * have been copied to separate storage from @p input @@ -115,13 +110,20 @@ namespace xo { * * retval.consumed will represent some possibly-empty prefix of @p input **/ - result_type assemble_token(std::size_t initial_whitespace, - std::size_t initial_token_prefix_from_input, - const span_type & token_text, - const span_type & input) const; + static result_type assemble_token(std::size_t initial_whitespace, + std::size_t initial_token_prefix_from_input, + const span_type & token_text, + const span_type & input, + const input_state_type & input_state); /** degenerate version of assemble_token() on reaching end-of-file **/ - result_type assemble_final_token(const span_type & token_text) const; + static result_type assemble_final_token(const span_type & token_text, + const input_state_type & input_state); + + /** true if tokenizer contains stored prefix of + * possibly-incomplete token + **/ + bool has_prefix() const { return !prefix_.empty(); } /** scan for next input token, given @p input. 
* Note: @@ -160,7 +162,8 @@ namespace xo { private: result_type scan_completion(const span_type & whitespace, const CharT* token_end, - const span_type & input); + const span_type & input, + const input_state_type & input_state); private: /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ @@ -168,7 +171,11 @@ namespace xo { /** track input state (line#,pos,..) for error messages. * There's an ordering problem here: - * 1. input_state_.skip_leading_whitespace() advances current line when it sees newline. + * 1. input_state_.skip_leading_whitespace() advances current line automagically + * when it sees \n + * 2. need to capture value of @ref input_state_ _before_ newline + * 3. but need newline to end token + * Also recall input_state_type needed for reporting errors. **/ input_state_type input_state_; /** Accumulate partial token here. @@ -187,7 +194,7 @@ namespace xo { template bool - tokenizer::is_1char_punctuation(CharT ch) const { + tokenizer::is_1char_punctuation(CharT ch) { switch(ch) { case '(': return true; @@ -247,7 +254,7 @@ namespace xo { template bool - tokenizer::is_2char_punctuation(CharT ch) const { + tokenizer::is_2char_punctuation(CharT ch) { /* can't put '-' here, because of the way it appears in numeric literals * characters here may not appear in symbol names */ @@ -278,12 +285,13 @@ namespace xo { tokenizer::assemble_token(std::size_t initial_whitespace, std::size_t initial_token_prefix_from_input, const span_type & token_text, - const span_type & input) const -> result_type + const span_type & input, + const input_state_type & input_state) -> result_type { /* literal|pretty|streamlined */ log_config::style = function_style::streamlined; - scope log(XO_DEBUG(input_state_.debug_flag())); + scope log(XO_DEBUG(input_state.debug_flag())); log && log(xtag("token_text", token_text), xtag("initial_whitespace", initial_whitespace), xtag("initial_token_prefix_from_input", initial_token_prefix_from_input), @@ -386,7 +394,7 @@ 
namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "improperly placed sign indicator", - input_state_, + input_state, (ix - tk_start) )); } @@ -395,7 +403,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate decimal point in numeric literal", - input_state_, + input_state, (ix - tk_start))); } @@ -405,7 +413,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "duplicate exponent marker in numeric literal", - input_state_, + input_state, (ix - tk_start))); } @@ -421,7 +429,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "unexpected character in numeric constant" /*error_description*/, - input_state_, + input_state, (ix - tk_start))); } } @@ -524,7 +532,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting key following escape character \\", - input_state_, + input_state, (ix - tk_start))); } @@ -553,7 +561,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "expecting one of n|r|\"|\\ following escape \\", - input_state_, + input_state, (ix - tk_start))); } break; @@ -570,7 +578,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "missing terminating '\"' to complete literal string", - input_state_, + input_state, (ix - tk_start))); } @@ -712,7 +720,7 @@ namespace xo { return result_type::make_error (error_type(__FUNCTION__ /*src_function*/, "illegal input character", - input_state_, + input_state, (ix - tk_start))); } @@ -767,21 +775,26 @@ namespace xo { input.prefix(initial_whitespace + initial_token_prefix_from_input)); } /*assemble_token*/ + /* TODO: input_state_ as argument ? 
*/ template auto - tokenizer::assemble_final_token(const span_type & token_text) const -> result_type + tokenizer::assemble_final_token(const span_type & token_text, + const input_state_type & input_state) -> result_type { return assemble_token(0 /*initial_whitespace*/, 0 /*initial_token_prefix_from_input*/, token_text, - span_type::make_null()); + span_type::make_null(), + input_state); } + /* TODO: prefix_, input_state_ as arguments */ template auto tokenizer::scan_completion(const span_type & whitespace, const CharT* token_end, - const span_type & input) -> result_type { + const span_type & input, + const input_state_type & input_state) -> result_type { auto token_span = input.after_prefix(whitespace).prefix_upto(token_end); @@ -789,7 +802,8 @@ namespace xo { return assemble_token(whitespace.size(), token_span.size() /*initial_token_prefix_from_input*/, token_span, - input); + input, + input_state); } else { /* whatever we stashed in .prefix_, should be consumed from input. * control here implies reached end of input with either @@ -927,7 +941,8 @@ namespace xo { /* include next char and complete token */ ++ix; - return scan_completion(whitespace_span, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input, + this->input_state_); } /* here: -123, -.5e-21 for example */ @@ -945,7 +960,8 @@ namespace xo { if (ch2 != '=') { /* ignore next char and complete token */ - return scan_completion(whitespace_span, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input, + this->input_state_); } /* here: >= for example */ @@ -995,7 +1011,8 @@ namespace xo { } } - return scan_completion(whitespace_span, ix /*token_end*/, input); + return scan_completion(whitespace_span, ix /*token_end*/, input, + this->input_state_); } /*scan*/ template @@ -1052,7 +1069,8 @@ namespace xo { */ return result_type::make_whitespace(input); } else { - auto retval = assemble_final_token(span_type::from_string(prefix_)); + 
auto retval = assemble_final_token(span_type::from_string(prefix_), + this->input_state_); this->prefix_.clear();