xo-tokenizer: + doc for tokenizer + other doc-related improvements

2025-06-25 23:50:30 -05:00 · 2025-06-25 23:50:30 -05:00 · 573afb6030
commit 573afb6030
parent fbfd5b2861
9 changed files with 143 additions and 30 deletions
--- a/include/xo/tokenizer/input_state.hpp
+++ b/include/xo/tokenizer/input_state.hpp
@ -38,7 +38,7 @@ namespace xo {

            ///@}

-            /** @defgroup input-state static methods **/
+            /** @defgroup input-state-static-methods input_state static methods **/
            ///@{

            /** recognize the newline character '\n' **/
@ -80,7 +80,7 @@ namespace xo {
            void discard_current_line();

            /** Add @p z to current position **/
-            void consume(size_t z) { current_pos_ += z; }
+            void consume(size_t z);

            /** Skip prefix of input comprising whitespace.
             *  Return pointer to first non-whitespace character in @p input,
@ -91,7 +91,7 @@ namespace xo {
            ///@}

        private:
-            /** @defgroup input-state-instance-vars **/
+            /** @defgroup input-state-instance-vars input_state instance variables **/
            ///@{

            /** remember current input line.  Used only to report errors **/
@ -128,6 +128,16 @@ namespace xo {
            return false;
        }

+        template <typename CharT>
+        void
+        input_state<CharT>::consume(size_t z) {  // out-of-line definition: adds z to current_pos_ and logs the result
+            scope log(XO_DEBUG(debug_flag_));  // scoped logger configured from debug_flag_ (presumably active only when the flag is set)
+
+            this->current_pos_ += z;
+
+            log && log(xtag("z", z), xtag("current_pos", current_pos_));  // short-circuit: emits only when log tests true; current_pos_ already includes z here
+        }
+
        template <typename CharT>
        void
        input_state<CharT>::discard_current_line() {
--- a/include/xo/tokenizer/tokenizer.hpp
+++ b/include/xo/tokenizer/tokenizer.hpp
@ -58,8 +58,16 @@ namespace xo {
            using result_type = scan_result<CharT>;

        public:
+            /** @defgroup tokenizer-ctors tokenizer constructors **/
+            ///@{
+
            tokenizer(bool debug_flag = false);

+            ///@}
+
+            /** @defgroup tokenizer-general-methods tokenizer methods **/
+            ///@{
+
            /** identifies punctuation chars.
             *  These are chars that are not permitted to appear within
             *  a symbol token.  Instead they force completion of
@ -130,19 +138,26 @@ namespace xo {
             **/
            result_type notify_eof(const span_type & input);

+            ///@}
+
        private:
            result_type scan_completion(const span_type & whitespace,
                                        const CharT* token_end,
                                        const span_type & input);

        private:
+            /** @defgroup tokenizer-instance-vars tokenizer instance variables **/
+            ///@{
+
            /** track input state (line#,pos,..) for error messages **/
            input_state_type input_state_;
            /** Accumulate partial token here.
             *  This will happen if input sent to @ref tokenizer::scan
-             *  ends without a determinate token boundary.
+             *  ends without trailing whitespace, so that the extent of the last available token is not yet determined
             **/
            std::string prefix_;
+
+            ///@}
        }; /*tokenizer*/

        template <typename CharT>
@ -338,7 +353,8 @@ namespace xo {
                                                //current_line_,
                                                //current_pos_,
                                                //initial_whitespace,
-                                                (ix - tk_start)));
+                                                (ix - tk_start)
+                                        ));
                            }
                        } else if (*ix == '.') {
                            if (period_flag) {
@ -378,9 +394,6 @@ namespace xo {
                                (error_type(__FUNCTION__ /*src_function*/,
                                            "unexpected character in numeric constant" /*error_description*/,
                                            input_state_,
-                                            //current_line_,
-                                            //current_pos_,
-                                            //initial_whitespace,
                                            (ix - tk_start)));
                        }
                    }
--- a/include/xo/tokenizer/tokenizer_error.hpp
+++ b/include/xo/tokenizer/tokenizer_error.hpp
@ -36,16 +36,17 @@ namespace xo {
            tokenizer_error(const char * src_function,
                            const char * error_description,
                            const input_state_type & input_state,
-                            //span_type input_line,
-                            //size_t tk_start,
-                            //size_t whitespace,
                            size_t error_pos)
                : src_function_{src_function},
                  error_description_{error_description},
                  input_state_{input_state},
-                  //tk_entry_{tk_start},
-                  //whitespace_{whitespace},
-                  error_pos_{error_pos} {}
+                  error_pos_{error_pos}
+                {
+                    scope log(XO_DEBUG(input_state.debug_flag()));
+
+                    log && log(xtag("input_state.current_pos", input_state.current_pos()),
+                               xtag("error_pos", error_pos));
+                }
            ///@}

            /** @defgroup tokenizer-error-access-methods **/
@ -57,7 +58,6 @@ namespace xo {
 #pragma GCC diagnostic ignored "-Wchanges-meaning"
            const input_state_type & input_state() const { return input_state_; }
 #pragma GCC diagnostic pop
-            //const span_type& input_line() const { return input_line_; }
            size_t tk_start() const { return input_state_.current_pos(); }
            size_t whitespace() const { return input_state_.whitespace(); }
            size_t error_pos() const { return error_pos_; }
@ -94,8 +94,6 @@ namespace xo {
             *  Sufficient to precisely locate it with context.
             **/
            input_state_type input_state_;
-            /** position (relative to line_.lo) of token start where error encountered **/
-            size_t tk_entry_ = 0;
            /** position (relative to @ref tk_entry_) of error **/
            size_t error_pos_ = 0;

@ -110,7 +108,6 @@ namespace xo {
               << xtag("message", error_description_)
               << xtag("input", input_state_.current_line())
               << xtag("whitespace", input_state_.whitespace())
-               << xtag("tk-start", tk_entry_)
               << xtag("error-pos", error_pos_)
               << ">";
        }
@ -122,10 +119,13 @@ namespace xo {

            if (error_description_) {
                const char * prefix = "input: ";
-                const size_t tk_indent = strlen(prefix) + tk_entry_ + input_state_.whitespace();
-                //const size_t msg_length = strlen(error_description_);
-
-                const size_t error_pos = 1 + tk_entry_ + input_state_.whitespace() + error_pos_;
+                /* input_state.current_pos: position of first character following preceding token.
+                 * input_state.whitespace:  whitespace between current_pos and start of failing token
+                 * error_pos:               position (relative to start of failing token) at which failure was detected
+                 */
+                const size_t tk_start = input_state_.current_pos() + input_state_.whitespace();
+                const size_t tk_indent = (strlen(prefix) + tk_start);
+                const size_t error_pos = 1 + tk_start + error_pos_;

                os << "char: " << error_pos << endl;
                os << prefix;