diff --git a/.xo-tokenizer/.gitignore b/.xo-tokenizer/.gitignore deleted file mode 100644 index 3d3a7826..00000000 --- a/.xo-tokenizer/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# emacs workspace config -.projectile -# clangd working space (see emacs+lsp) -.cache -# typical cmake build directory (source-tree-nephew) -.build* -# symlink to builddir/compile_commands.json; should be set manually in dev sandbox -compile_commands.json diff --git a/.xo-tokenizer/CMakeLists.txt b/.xo-tokenizer/CMakeLists.txt deleted file mode 100644 index 896c1b97..00000000 --- a/.xo-tokenizer/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# xo-tokenizer/CMakeLists.txt - -cmake_minimum_required(VERSION 3.10) - -project(xo_tokenizer VERSION 0.1) - -include(GNUInstallDirs) -include(cmake/xo-bootstrap-macros.cmake) - -xo_cxx_toplevel_options3() - -# ---------------------------------------------------------------- -# c++ settings - -set(PROJECT_CXX_FLAGS "") -#set(PROJECT_CXX_FLAGS "-fconcepts-diagnostics-depth=2") -add_definitions(${PROJECT_CXX_FLAGS}) - -# ---------------------------------------------------------------- - -add_subdirectory(src/tokenizer) -add_subdirectory(example) -#add_subdirectory(utest) # tests failing, temporarily remove -xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets) - -if (XO_ENABLE_EXAMPLES) - install(TARGETS xo_tokenizer_repl DESTINATION bin/xo/example/tokenizer) -endif() - -# ---------------------------------------------------------------- -# docs targets depend on all the other library/utest targets -# -add_subdirectory(docs) diff --git a/.xo-tokenizer/README.md b/.xo-tokenizer/README.md deleted file mode 100644 index 3f0befba..00000000 --- a/.xo-tokenizer/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# schematica tokenizer library - -## Getting Started - -### build + install 'xo-cmake` dependency - -- [github/Rconybea/xo-cmake](https://github.com/Rconybea/xo-cmake) - -Installs a few cmake ingredients, along with a build assistant `xo-build` for XO projects such as this one. - -### build + install other required XO dependencies -``` -$ xo-build --clone --configure --build --install xo-indentlog -$ xo-build --clone --configure --build --install xo-refnct -$ xo-build --clone --configure --build --install xo-subsys -$ xo-build --clone --configure --build --install xo-reflectutil -``` - -Note: can use `-n` to dry-run here - -### copy `xo-tokenizer` repository locally -``` -$ xo-build --clone xo-tokenizer -``` - -or equivalently -``` -$ git clone git@github.com:Rconybea/xo-tokenizer.git -``` - -### build + install `xo-tokenizer` - -``` -$ xo-build --configure --build --install xo-tokenizer -``` - -or equivalently: - -``` -$ PREFIX=/usr/local # or wherever you prefer -$ cmake -DCMAKE_INSTALL_PREFIX=${PREFIX} -S xo-tokenizer -B xo-tokenizer/.build -$ cmake --build xo-tokenizer/.build -$ cmake --install xo-tokenizer/.build -``` - -### build for unit test coverage -``` -$ cmake -DCMAKE_BUILD_TYPE=coverage -DCMAKE_INSTALL_PREFIX=$PREFIX xo-tokenizer/.build-ccov -$ cmake --build xo-tokenizer/.build-ccov -``` - -### LSP support -``` -$ cd xo-tokenizer -$ ln -s .build/compile_commands.json # lsp will look for compile_commands.json in the root of the source tree -``` diff --git a/.xo-tokenizer/cmake/xo-bootstrap-macros.cmake b/.xo-tokenizer/cmake/xo-bootstrap-macros.cmake deleted file mode 100644 index 592272c0..00000000 --- a/.xo-tokenizer/cmake/xo-bootstrap-macros.cmake +++ /dev/null @@ -1,41 +0,0 @@ -# ---------------------------------------------------------------- -# for example: -# $ PREFIX=/usr/local # for example -# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build -# -# will get -# CMAKE_MODULE_PATH -# from xo-cmake-config --cmake-module-path -# -# and expect .cmake macros in -# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake -# ---------------------------------------------------------------- - -find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED) - -if ("${XO_CMAKE_CONFIG_EXECUTABLE}" STREQUAL "XO_CMAKE_CONFIG_EXECUTABLE-NOT_FOUND") - message(FATAL "could not find xo-cmake-config executable") -endif() - -message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}") - -if (XO_SUBMODULE_BUILD) - if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix)) - # local version of xo-cmake macros - set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/xo-cmake/cmake") - message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") - endif() -else() - if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix)) - # default to typical install location for xo-project-macros - execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path OUTPUT_VARIABLE CMAKE_MODULE_PATH) - message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") - endif() -endif() - -# needs to have been installed somewhere on CMAKE_MODULE_PATH, -# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX) -# -include(xo_macros/xo_cxx) - -xo_cxx_bootstrap_message() diff --git a/.xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in b/.xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in deleted file mode 100644 index e1b8fe7a..00000000 --- a/.xo-tokenizer/cmake/xo_tokenizerConfig.cmake.in +++ /dev/null @@ -1,9 +0,0 @@ -@PACKAGE_INIT@ - -include(CMakeFindDependencyMacro) -#find_dependency(refcnt) -find_dependency(indentlog) -#find_dependency(subsys) -include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") -include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Share.cmake") -check_required_components("@PROJECT_NAME@") diff --git a/.xo-tokenizer/docs/CMakeLists.txt b/.xo-tokenizer/docs/CMakeLists.txt deleted file mode 100644 index 30289162..00000000 --- a/.xo-tokenizer/docs/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# xo-tokenizer/docs/CMakeLists.txt - -xo_doxygen_collect_deps() -xo_docdir_doxygen_config() -xo_docdir_sphinx_config( - index.rst install.rst examples.rst implementation.rst - input-state-class.rst scan-result-class.rst schematika-tokens.rst span-class.rst - token-class.rst tokenizer-error-class.rst tokentype-enum.rst -) diff --git a/.xo-tokenizer/docs/README b/.xo-tokenizer/docs/README deleted file mode 100644 index ea8a9a25..00000000 --- a/.xo-tokenizer/docs/README +++ /dev/null @@ -1,41 +0,0 @@ -standalone build - - +-----------------------------------------------+ - | cmake | - | CMakeLists.txt | - | $PREFIX/share/cmake/xo_macros/xo_cxx.cmake | - +-----------------------------------------------+ - | - | +----------------------+ - +------------------------------------------------->| .build/docs/Doxyfile | - | +----------------------+ - | ^ - | (cmake | - | /------------/ - | | - | +---------------------------------------+ +-----------------+ - +---->| doxygen |--->| .build/docs/dox | - | | $PREFIX/share/xo-macros/Doxyfile.in | | +- html/ | - | +---------------------------------------+ | +- xml/ | - | +-----------------+ - | | - | /------------/ - | | - | v - | +---------------------------------------+ +--------------------+ - \---->| sphinx |--->| .build/docs/sphinx | - | +- conf.py | | +- html/ | - | +- _static/ | +--------------------+ - | +- *.rst | - +---------------------------------------+ - -umbrella build relies on top-level cmake macros - -files - - README this file - CMakeLists.txt build entry point - conf.py sphinx config - _static static files for sphinx - - index.rst toplevel sphinx document; entry point diff --git a/.xo-tokenizer/docs/_static/README b/.xo-tokenizer/docs/_static/README deleted file mode 100644 index 8230095c..00000000 --- a/.xo-tokenizer/docs/_static/README +++ /dev/null @@ -1 +0,0 @@ -add any static {.html, .js, ..} files for sphinx to pickup here \ No newline at end of file diff --git a/.xo-tokenizer/docs/_static/img/favicon.ico b/.xo-tokenizer/docs/_static/img/favicon.ico deleted file mode 100644 index 15da2145..00000000 Binary files a/.xo-tokenizer/docs/_static/img/favicon.ico and /dev/null differ diff --git a/.xo-tokenizer/docs/conf.py b/.xo-tokenizer/docs/conf.py deleted file mode 100644 index 31e2f0b2..00000000 --- a/.xo-tokenizer/docs/conf.py +++ /dev/null @@ -1,39 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# For the full list of built-in configuration values, see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Project information ----------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information - -project = 'xo tokenizer documentation' -copyright = '2024-2025, Roland Conybeare' -author = 'Roland Conybeare' - -# -- General configuration --------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration - -#extensions = [] -extensions = [ "breathe", - "sphinx.ext.mathjax", # inline math - "sphinx.ext.autodoc", # generate info from docstrings - "sphinxcontrib.ditaa", # diagrams-through-ascii-art - "sphinxcontrib.plantuml" # text -> uml diagrams - ] - -# note: breathe requires doxygen xml output -> must have GENERATE_XML = YES in Doxyfile.in -# match project name in Doxyfile.in -breathe_default_project = "xodoxxml" - -templates_path = ['_templates'] -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -pygments_style = 'sphinx' - -# -- Options for HTML output ------------------------------------------------- -# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output - -#html_theme = 'alabaster' -html_theme = 'sphinx_rtd_theme' -html_static_path = ['_static'] -html_favicon = '_static/img/favicon.ico' diff --git a/.xo-tokenizer/docs/examples.rst b/.xo-tokenizer/docs/examples.rst deleted file mode 100644 index 16f963a8..00000000 --- a/.xo-tokenizer/docs/examples.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. _examples: - -.. toctree:: - :maxdepth: 2 - -Examples -======== - -See ``xo-tokenizer/examples/tokenrepl`` for (slighly elaborated) version of code below - -.. code-block:: cpp - :linenos: - - #include "xo/tokenizer/tokenizer.hpp" - - int - main() { - using namespace xo::scm; - using namespace std; - - using tokenizer_type = tokenizer; - using span_type = tokenizer_type::span_type; - - tokenizer_type tkz; - string input_str; - - while (getline(cin, input_str)) { - // we want tokenizer to see newline, it's syntax - input_str.push_back('\n'); - span_type input(input_str.begin(), input_str.end()); - - // input may contain multiple tokens - while (!input.empty()) { - auto [tk, consumed, error] = tkz.scan(input); - - if (tk.is_valid()) { - cout << tk; - } - - input = input.after_prefix(consumed.size()); - } - } - - auto [tk, consumed, error] = tkz.notify_eof(spxn_type::from_string(input_str)); - - if (tk.is_valid()) { - cout << tk; - } else if (error.is_error()) { - cout << "parsing error: " << endl; - error.report(cout); - } - } - -Reminder: enable building examples with ``cmake -DXO_ENABLE_EXAMPLES=1 ..`` - -.. code-block:: - :linenos: - - $ .build/xo-tokenizer/example/tokenrepl/xo_tokenizer_repl - > 123 - - > 123e5 - - > def sq(x: i64) -> i64 { x * x } - - - - - - - - - - - - - - - -Example of error reporting (via ``error.report(cout)`` above) - -.. code-block:: - :linenos: - - $ .build/xo-tokenizer/example/tokenrepl/xo_tokenizer_repl - - > 123q - parsing error: - char: 4 - input: 123q - ---^ - unexpected character in numeric constant - - > (8 * 8 * 123fd) - parsing error: - char: 13 - input: (8 * 8 * 123fd) - ---^ - unexpected character in numeric constant diff --git a/.xo-tokenizer/docs/implementation.rst b/.xo-tokenizer/docs/implementation.rst deleted file mode 100644 index 1063f4fa..00000000 --- a/.xo-tokenizer/docs/implementation.rst +++ /dev/null @@ -1,38 +0,0 @@ -.. _implementation: - -.. toctree:: - :maxdepth: 2 - -Components -========== - -Library dependency tower for *xo-tokenizer*: - -.. ditaa:: - - +-----------------+ - | xo_tokenizer | - +-----------------+ - | xo_indentlog | - +-----------------+ - | xo_cmake | - +-----------------+ - -Install instructions :doc:`here` - -Abstraction tower for *xo-tokenizer* components: - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - | tokenizer | | - +-----------------------------------------+ | - | scan_result | | - +-----------------+-----------------------+ | - | | tokenizer_error | buffer | - | token +-----------------------+ | - | | input_state | | - +-----------------+-----------------------+ | - | tokentype | span | | - +-----------------+-----------------------+----------+ diff --git a/.xo-tokenizer/docs/index.rst b/.xo-tokenizer/docs/index.rst deleted file mode 100644 index 6efad465..00000000 --- a/.xo-tokenizer/docs/index.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. xo-tokenizer documentation master file. - -xo-tokenizer documentation -========================== - -xo-tokenizer provides a tokenizer for the Schematika language. - -Syntax is generally C-like, but with some important differences. -Notably, characters used for arithmetic operators (``+``, ``-``, ``*``, ``/``) -may appear in variable names: ``one-of-those-days`` is an ordinary symbol. - -Typically applications would use xo-reader to construct Schematika expressions -instead of interacting directly with ``xo::scm::tokenizer``. - -.. toctree:: - :maxdepth: 2 - :caption: xo-tokenizer contents - - install - examples - schematika-tokens - implementation - tokenizer-class - scan-result-class - token-class - tokenizer-error-class - input-state-class - span-class - tokentype-enum diff --git a/.xo-tokenizer/docs/input-state-class.rst b/.xo-tokenizer/docs/input-state-class.rst deleted file mode 100644 index d995868e..00000000 --- a/.xo-tokenizer/docs/input-state-class.rst +++ /dev/null @@ -1,77 +0,0 @@ - -.. _input-state-class: - -Input State -=========== - -Track detailed state of input stream to collect information useful for detailed error reporting - -Context -------- - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - | tokenizer | | - +-----------------------------------------+ | - | scan_result | | - +-----------------+-----------------------+ | - | | tokenizer_error | buffer | - | token +-----------------------+ | - | |cBLU input_state | | - +-----------------+-----------------------+ | - | tokentype | span | | - +-----------------+-----------------------+----------+ - -.. code-block:: cpp - - #include - -.. uml:: - :scale: 99% - :align: center - - allowmixing - - object in1<> - in1 : current_line = input - in1 : current_pos - in1 : whitespace - in1 : debug_flag - - object input - input : (x * y * 123d) - - input o-- sp1 - - -Class ------ - -.. doxygenclass:: xo::scm::input_state - -Instance Variables ------------------- - -.. doxygengroup:: input-state-instance-vars - -Constructors ------------- - -.. doxygengroup:: input-state-ctors - -Static Methods --------------- - -.. doxygengroup:: input-state-static-methods - -Access Methods --------------- - -.. doxygengroup:: input-state-access-methods - -General Methods ---------------- - -.. doxygengroup:: input-state-general-methods diff --git a/.xo-tokenizer/docs/install.rst b/.xo-tokenizer/docs/install.rst deleted file mode 100644 index c9ab8598..00000000 --- a/.xo-tokenizer/docs/install.rst +++ /dev/null @@ -1,111 +0,0 @@ -.. _install: - -.. toctree:: - :maxdepth: 2 - -Source -====== - -Souce code lives on github `here`_ - -.. _here: https://github.com/rconybea/xo-tokenizer - -To clone from git: - -.. code-block:: bash - - git clone https://github.com/rconybea/xo-tokenizer - -Tested with gcc 13.3 - -Install -======= - -``xo-tokenizer`` uses supporting library ``xo-indentlog`` and cmake macros ``xo-cmake``. -These are on github: - -- `xo-tokenizer source`_ (Schematika tokenizer) -- `xo-indentlog source`_ (structured logging) -- `xo-cmake source`_ (shared cmake macros) - -.. _xo-tokenizer source: https://github.com/rconybea/xo-tokenizer -.. _xo-indentlog source: https://github.com/rconybea/indentlog -.. _xo-cmake source: https://github.com/rconybea/xo-cmake - -Installing from source ----------------------- - -Install scripts for `xo-tokenizer` and `xo-indentlog` depend on helper scripts installed from `xo-cmake`. - -Preamble: - -.. code-block:: bash - - mkdir -p ~/proj/xo - cd ~/proj/xo - - git clone https://github.com/rconybea/xo-cmake - - PREFIX=/usr/local # ..or desired installation prefix - - # want PREFIX/bin in PATH to use xo-cmake helpers - PATH=$PREFIX/bin:$PATH - -Install `xo-cmake`: - -.. code-block:: bash - - cmake -B xo-cmake/.build -S xo-cmake - cmake --build xo-cmake/.build -j # placeholder, can omit for now - cmake --install xo-cmake/.build - -Install `xo-indentlog`: - -.. code-block:: bash - - xo-build --clone --configure --build --install indentlog - -Install `xo-tokenizer`: - -.. code-block:: bash - - xo-build --clone --configure --build --install xo-tokenizer - -Directories under ``PREFIX`` will then contain: - -.. code-block:: - - PREFIX - +- bin - | +- xo-build - | +- xo-cmake-config - | \- xo-cmake-lcov-harness - +- include - | \- xo - | +- indentlog/ - | \- tokenizer/ - +- lib - | \- cmake - | +- indentlog/ - | \- xo_tokenizer/ - +- share - \- cmake - \- xo_macros - +- Doxyfile.in - +- gen-ccov.in - \- xo-bootstrap-macros.cmake - -Use CMake Support ------------------ - -To use built-in cmake suport, when using ``xo-tokenizer`` from another project: - -Make sure ``PREFIX/lib/cmake`` is searched by cmake (if necessary, include it in ``CMAKE_PREFIX_PATH``) - -Add to ``CMakeLists.txt``: - -.. code-block:: cmake - - FindPackage(xo_tokenizer CONFIG REQUIRED) - - target_link_libraries(mytarget INTERFACE xo_tokenizer) diff --git a/.xo-tokenizer/docs/scan-result-class.rst b/.xo-tokenizer/docs/scan-result-class.rst deleted file mode 100644 index 6581a839..00000000 --- a/.xo-tokenizer/docs/scan-result-class.rst +++ /dev/null @@ -1,29 +0,0 @@ - -.. _scan-result-class: - -Scan Result -=========== - -Represent the result of a tokenizer scan call - -Context -------- - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - | tokenizer | | - +-----------------------------------------+ | - |cBLU scan_result | | - +-----------------+-----------------------+ | - | | tokenizer_error | buffer | - | token +-----------------------+ | - | | input_state | | - +-----------------+-----------------------+ | - | tokentype | span | | - +-----------------+-----------------------+----------+ - -.. code-block:: cpp - - #include diff --git a/.xo-tokenizer/docs/schematika-tokens.rst b/.xo-tokenizer/docs/schematika-tokens.rst deleted file mode 100644 index 3d99a7ee..00000000 --- a/.xo-tokenizer/docs/schematika-tokens.rst +++ /dev/null @@ -1,105 +0,0 @@ -.. _schematika-tokens: - -Schematika Tokens -================= - -.. list-table:: Schematika Tokens - :widths: 15 30 30 - :header-rows: 1 - - * - tokentype - - examples - - description - * - tk_i64 - - ``123``, ``-8`` - - 64-bit integer literal - * - tk_f64 - - ``1.234``, ``-10``., ``-1.981e-10``, ``3e6`` - - 64-bit floating-point literal - * - tk_string - - ``"hello"``, ``"Q: \"what's up?\"\nA: \"parsing!\""`` - - string literal. Usual escapes ``\n``, ``\r``, ``\t``, ``\"``, ``\\`` - * - tk_symbol - - ``apple``, ``funKy``, ``x123``, ``_mumble``, ``hyphenated-var`` - - symbol name - * - tk_type - - ``type`` - - keyword - * - tk_def - - ``def`` - - keyword - * - tk_lambda - - ``lambda`` - - keyword - * - tk_if - - ``if`` - - keyword - * - tk_let - - ``let`` - - keyword - * - tk_in - - ``in`` - - keyword - * - tk_end - - ``end`` - - keyword - * - tk_leftparen - - ``(`` - - - * - tk_rightparen - - ``)`` - - - * - tl_leftbracket - - ``[`` - - - * - tk_rightbracket - - ``]`` - - - * - tk_leftbrace - - ``{`` - - - * - tk_rightbrace - - ``}`` - - - * - tk_leftangle - - ``<`` - - - * - tk_rightangle - - ``>`` - - - * - tk_dot - - ``.`` - - - * - tk_comma - - ``,`` - - - * - tk_colon - - ``:`` - - - * - tk_doublecolon - - ``::`` - - - * - tk_semicolon - - ``;`` - - - * - tk_singleassign - - ``=`` - - - * - tk_assign - - ``:=`` - - - * - tk_yields - - ``->`` - - - * - tk_plus - - ``+`` - - allowed in symbol - * - tk_minus - - ``-`` - - allowed in symbol - * - tk_star - - ``*`` - - allowed in symbol - * - tk_slash - - ``/`` - - allowed in symbol diff --git a/.xo-tokenizer/docs/span-class.rst b/.xo-tokenizer/docs/span-class.rst deleted file mode 100644 index b641ca1f..00000000 --- a/.xo-tokenizer/docs/span-class.rst +++ /dev/null @@ -1,87 +0,0 @@ - -.. _span-class: - -Span -==== - -Identify an unowned contiguous memory range - -Context -------- - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - | tokenizer | | - +-----------------------------------------+ | - | scan_result | | - +-----------------+-----------------------+ | - | | tokenizer_error | buffer | - | token +-----------------------+ | - | | input_state | | - +-----------------+-----------------------+ | - | tokentype |cBLU span | | - +-----------------+-----------------------+----------+ - -.. code-block:: cpp - - #include - -.. uml:: - :scale: 99% - :align: center - - allowmixing - - object span1<> - span1 : lo = p - span1 : hi = p+25 - - object dest<> - dest : def fact(n : i64) { ... } - - span1 o-- dest - -- Identify a sequence of characters stored in contiguous memory. - -- Lightweight, consists of a pair of pointers. - -- Does not own storage. Lifetime management for target memory is - up to the caller. - - -Class ------ - -.. doxygenclass:: xo::scm::span - -Member Variables ----------------- - -.. doxygengroup:: span-instance-vars - -Type Traits ------------ - -.. doxygengroup:: span-type-traits - -Constructors ------------- - -.. doxygengroup:: span-ctors - -Access Methods --------------- - -.. doxygengroup:: span-access-methods - -General Methods ---------------- - -.. doxygengroup:: span-general-methods - -Operators ---------- - -.. doxygengroup:: span-operators diff --git a/.xo-tokenizer/docs/token-class.rst b/.xo-tokenizer/docs/token-class.rst deleted file mode 100644 index 8d19a852..00000000 --- a/.xo-tokenizer/docs/token-class.rst +++ /dev/null @@ -1,96 +0,0 @@ - -.. _token-class: - -Token -===== - -Represent a single lexical token in the Schematika language - -Context -------- - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - | tokenizer | | - +-----------------------------------------+ | - | scan_result | | - +-----------------+-----------------------+ | - |cBLU | tokenizer_error | buffer | - | token +-----------------------+ | - | | input_state | | - +-----------------+-----------------------+ | - | tokentype | span | | - +-----------------+-----------------------+----------+ - -.. code-block:: cpp - - #include - -.. uml:: - :scale: 99% - :align: center - - allowmixing - - object tk1<> - tk1 : tk_type = tk_i64 - tk1 : text = "123" - - object tk2<> - tk2 : tk_type = tk_string - tk2 : text = "the quick brown fox" - -- Represent a single lexical token - -- Does not share any storage with original input stream - (maintains a local copy). - -- Remembers copied input extent. - Convert on demand to native untagged representation - -Example -------- - -.. code-block:: cpp - - void foo() { - using namespace xo::scm; - - token tk = token::i64_token("123"); - - tk.is_valid(); // -> true - tk.text(); // -> "123"s; - - tk.tk_type(); // -> tokentype::tk_i64 - tk.i64_value(); // -> 123 - - cout << tk << endl; // -> - } - -Class ------ - -.. doxygenclass:: xo::scm::token - - -Instance Variables ------------------- - -.. doxygengroup:: token-instance-vars - -Constructors ------------- - -.. doxygengroup:: token-ctors - -Access Methods --------------- - -.. doxygengroup:: token-access-methods - -General Methods ---------------- - -.. doxygengroup:: token-general-methods diff --git a/.xo-tokenizer/docs/tokenizer-class.rst b/.xo-tokenizer/docs/tokenizer-class.rst deleted file mode 100644 index 4903dae9..00000000 --- a/.xo-tokenizer/docs/tokenizer-class.rst +++ /dev/null @@ -1,68 +0,0 @@ - -.. _tokenizer-class: - -Tokenizer -========= - -Parse a Schematika character stream into lexical tokens - -Context -------- - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - |cBLU tokenizer | | - +-----------------------------------------+ | - | scan_result | | - +-----------------+-----------------------+ | - | | tokenizer_error | buffer | - | token +-----------------------+ | - | | input_state | | - +-----------------+-----------------------+ | - | tokentype | span | | - +-----------------+-----------------------+----------+ - -.. code-block:: cpp - - #include - -.. uml:: - :scale: 99% - :align: center - - allowmixing - - object tkz1<> - tkz1 : input_state = ins1 - - object ins1<> - ins1 : current_line = (9 * 8) - - tkz1 o-- ins1 - -- Assemble a stream of lexical tokens from a text stream. - -- Lexical errors reported via scan_result instance; - errors reported with detailed context - -Class ------ - -.. doxygenclass:: xo::scm::tokenizer - -Instance Variables ------------------- - -.. doxygengroup:: tokenizer-instance-vars - -Constructors ------------- - -.. doxygengroup:: tokenizer-ctors - -Methods -------- - -.. doxygengroup:: tokenizer-general-methods diff --git a/.xo-tokenizer/docs/tokenizer-error-class.rst b/.xo-tokenizer/docs/tokenizer-error-class.rst deleted file mode 100644 index 848f2e98..00000000 --- a/.xo-tokenizer/docs/tokenizer-error-class.rst +++ /dev/null @@ -1,54 +0,0 @@ - -.. _tokenizer-error-class - -Tokenizer Error -=============== - -Represent a possible tokenizer error result, including parsing context - -Context -------- - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - | tokenizer | | - +-----------------------------------------+ | - | scan_result | | - +-----------------+-----------------------+ | - | |cBLU tokenizer_error | buffer | - | token +-----------------------+ | - | | input_state | | - +-----------------+-----------------------+ | - | tokentype | span | | - +-----------------+-----------------------+----------+ - -.. code-block:: cpp - - #include - -Class ------- - -.. doxygenclass:: xo::scm::tokenizer_error - -Instance Variables ------------------- - -.. doxygengroup:: tokenizer-error-vars - -Constructors ------------- - -.. doxygengroup:: tokenizer-error-ctors - -Access Methods --------------- - -.. doxygengroup:: tokenizer-error-access-methods - -General Methods ---------------- - -.. doxygengroup:: tokenizer-error-general-methods diff --git a/.xo-tokenizer/docs/tokentype-enum.rst b/.xo-tokenizer/docs/tokentype-enum.rst deleted file mode 100644 index 0f371dda..00000000 --- a/.xo-tokenizer/docs/tokentype-enum.rst +++ /dev/null @@ -1,36 +0,0 @@ - -.. _tokentype-enum: - -Tokentype -========= - -Distinguish different lexical tokens for the Schematika language. - -Context -------- - -.. ditaa:: - :--scale: 0.85 - - +-----------------------------------------+----------+ - | tokenizer | | - +-----------------------------------------+ | - | scan_result | | - +-----------------+-----------------------+ | - | | tokenizer_error | buffer | - | token +-----------------------+ | - | | input_state | | - +-----------------+-----------------------+ | - |cBLU tokentype | span | | - +-----------------+-----------------------+----------+ - -.. code-block:: cpp - - #include - -Enum ----- - -.. doxygenfunction:: xo::scm::tokentype_descr - -.. doxygenfunction:: xo::scm::operator<<(std::ostream&,tokentype) diff --git a/.xo-tokenizer/example/CMakeLists.txt b/.xo-tokenizer/example/CMakeLists.txt deleted file mode 100644 index e761ade5..00000000 --- a/.xo-tokenizer/example/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(tokenrepl) diff --git a/.xo-tokenizer/example/tokenrepl/CMakeLists.txt b/.xo-tokenizer/example/tokenrepl/CMakeLists.txt deleted file mode 100644 index 60243b7e..00000000 --- a/.xo-tokenizer/example/tokenrepl/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# xo-tokenizer/example/tokenrepl/CMakeLists.txt - -set(SELF_EXE xo_tokenizer_repl) -set(SELF_SRCS tokenrepl.cpp) - -if (XO_ENABLE_EXAMPLES) - xo_add_executable(${SELF_EXE} ${SELF_SRCS}) - xo_self_dependency(${SELF_EXE} xo_tokenizer) -endif() - -# end CMakeLists.txt diff --git a/.xo-tokenizer/example/tokenrepl/tokenrepl.cpp b/.xo-tokenizer/example/tokenrepl/tokenrepl.cpp deleted file mode 100644 index 61f6ea74..00000000 --- a/.xo-tokenizer/example/tokenrepl/tokenrepl.cpp +++ /dev/null @@ -1,71 +0,0 @@ -/** @file tokenrepl.cpp **/ - -#include "xo/tokenizer/tokenizer.hpp" -#include -#include // for isatty - -bool repl_getline(bool interactive, - std::istream & in, - std::ostream & out, - std::string & input) -{ - if (interactive) { - out << "> "; - std::flush(out); - } - - return static_cast(std::getline(in, input)); -} - -int -main() { - using namespace xo::scm; - using namespace std; - - using tokenizer_type = tokenizer; - using span_type = tokenizer_type::span_type; - - xo::log_config::min_log_level = xo::log_level::severe; - - bool interactive = isatty(STDIN_FILENO); - - tokenizer_type tkz(xo::log_config::min_log_level <= xo::log_level::info); - string input_str; - - size_t line_no = 1; - - constexpr std::size_t c_maxlines = 25; - - while (repl_getline(interactive, cin, cout, input_str)) { - // we want tokenizer to see newline, it's syntax - input_str.push_back('\n'); - span_type input = span_type::from_string(input_str); - - // reminder: input may contain multiple tokens - while (!input.empty()) { - auto [tk, consumed, error] = tkz.scan(input, false /*!eof*/); - - if (tk.is_valid()) { - cout << tk << endl; - } else if (error.is_error()) { - cout << "tokenizer error: " << endl; - error.report(cout); - - break; - } - - input = input.after_prefix(consumed); - } - - /* here: input.empty() or error encountered */ - - ++line_no; - - if (line_no > c_maxlines) { - cout << "always exit after " << c_maxlines << " lines of input" << endl; - break; - } - } -} - -/** end tokenrepl.cpp */ diff --git a/.xo-tokenizer/include/xo/tokenizer/buffer.hpp b/.xo-tokenizer/include/xo/tokenizer/buffer.hpp deleted file mode 100644 index 7b19316b..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/buffer.hpp +++ /dev/null @@ -1,328 +0,0 @@ -/** @file buffer.hpp **/ - -#pragma once - -#include "span.hpp" -#include -#include -#include -#include - -namespace xo { - namespace scm { - /** - * @class buffer buffer.hpp - * - * @brief Container for a (possibly owned) FIFO queue of chars - * - * @tparam CharT. buffer element type. - * - * @code - * .buf - * - * +------------------------------------------+ - * | | ... | | X| ... | X| | ... | | - * +------------------------------------------+ - * ^ ^ ^ ^ - * 0 .lo .hi .buf_z - * - * <-contents-><----avail-----> - * @endcode - * - * Buffer does not support wrapped content: - * content that has not been consumed always occupies contiguous memory. - * - * Example: - * @code - * // 1. - * buffer buf(64*1024); - * buf.empty() -> true - * buf.buf_z() -> 65536 - * buf.lo_pos() -> 0 - * buf.hi_pos() -> 65536 - * buf.contents() -> empty span - * buf.avail() -> span entire buffer memory - * - * // write to (a prefix of) buf.avail() - * ::strncpy(buf.buf(), "hello, world\n", 13); - * buf.produce(span_type(buf.buf(), buf.buf() + 13)); - * - * buf.lo_pos() -> 0 - * buf.hi_pos() -> 13 - * buf.contents() -> "hello, world\n"; - * - * - * // examine stored content (does not change buffer state) - * auto span = buf.contents(); - * cerr << string_view(span.lo(), span.hi()); // "hello, world\n" - * - * // consume (a prefix of) stored content - * buf.consume(span.prefix(7); - * - * buf.lo_pos() -> 7 - * buf.hi_pos() -> 13 - * buf.contents() -> "world\n" - * - * // consuming all remain content resets to original state - * buf.consume(buf.contents()); - * - * buf.empty() -> true - * buf.hi_pos() -> 0 // not 13! - * - * // 2. - * buffer buf; - * buf.empty() -> true - * buf.buf_z() -> 0 - * buf.lo_pos() -> 0 - * buf.hi_pos() -> 0 - * buf.contents() -> empty span - * buf.avail() -> empty span - * - * // allocate memory separately from ctor - * buf.alloc(64*1024); - * @endcode - **/ - template - class buffer { - public: - /** @brief typealias for span of CharT **/ - using span_type = span; - /** @brief typealias for buffer size (counts CharT's, not bytes) **/ - using size_type = std::uint64_t; - - public: - /** @brief create empty buffer. - - Does not allocate any storage; @see alloc - **/ - buffer() = default; - /** @brief create empty buffer, and possibly allocate storage. - - @param buf_z Buffer size. allocate storage (owned by this buffer) if >0. - @param align_z Align to this value, e.g. 8 to align storage on an 8-byte boundary - **/ - buffer(size_type buf_z, - size_type align_z = sizeof(char)) - : is_owner_{true}, - buf_{buf_z ? (new (std::align_val_t(align_z)) CharT [buf_z]) : nullptr}, - buf_z_{buf_z}, - lo_pos_{0}, - hi_pos_{0} - {} - /** @brief buffer is not copyable **/ - buffer(buffer const & x) = delete; - /** @brief destructor. Release storage if owned **/ - ~buffer() { this->reset(); } - - /** @name Access methods **/ - ///@{ - - /** @brief start of buffer memory **/ - CharT * buf() const { return buf_; } - /** @brief buffer size (number of characters) **/ - size_type buf_z() const { return buf_z_; } - /** @brief current start position within buffer **/ - size_type lo_pos() const { return lo_pos_; } - /** @brief current end position within buffer **/ - size_type hi_pos() const { return hi_pos_; } - - ///@} - - /** @brief readonly access to a single buffer element. - - Relative to start of buffer (ignores current consume position) - **/ - CharT const & operator[](size_type i) const { return buf_[i]; } - - /** @brief return span for current buffer contents **/ - span_type contents() const { return span_type(buf_ + lo_pos_, - buf_ + hi_pos_); } - /** @brief returns span for writable buffer contents (unused prefix following produce position **/ - span_type avail() const { return span_type(buf_ + hi_pos_, - buf_ + buf_z_); } - - /** @brief @c true iff buffer is empty **/ - bool empty() const { return lo_pos_ == hi_pos_; } - - - /** - @brief update buffer produce position, after (independently) writing contents of span to it - - @pre left endpoint of @p span equals buffer produce position (@c .hi_pos) - @pre right endpoint of @p span within bounds of buffer memory range - @post right endpoint of @p span equals buffer produce position. - **/ - void produce(span_type const & span) { - assert(span.lo() == buf_ + hi_pos_); - - hi_pos_ += span.size(); - } - - /** - @brief update buffer consume position, when done with contents of span - - @pre left endpoint of @p span equals buffer consume position (@c .lo_pos) - @pre right endpoint of @p span within bounds of buffer memory range - @post Either - buffer is empty, with @c .lo_pos = @c .hi_pos = @c 0. - buffer is non-empty, right endpoint of @p span equals new buffer consume position. - **/ - void consume(span_type const & span) { - if (span.size()) { - assert(span.lo() == buf_ + lo_pos_); - - lo_pos_ += span.size(); - } else { - /* since .consume() that arrives at empty contents also resets .lo_pos .hi_pos, - * we don't want to blow up when called with an empty span -- argument - * may represent some pre-reset location in buffer - */ - } - - if (lo_pos_ == hi_pos_) { - lo_pos_ = 0; - hi_pos_ = 0; - } - } - - /** - @brief allocate buffer with desired amount of memory - - @param buf_z desired buffer size - @param align_z alignment; buffer memory will be aligned on this byte-boundary. - **/ - void alloc(size_type buf_z, size_type align_z = sizeof(char)) { - /* properly reset (+ discard) any existing state */ - this->reset(); - - is_owner_ = true; - if (buf_z) - buf_ = new (std::align_val_t(align_z)) CharT [buf_z]; - buf_z_ = buf_z; - lo_pos_ = 0; - hi_pos_ = 0; - } - - /** - @brief attach buffer to (unowned) range of @p buf_z bytes starting at @p buf[0] - - Buffer is not responsible for managing storage. - - @post - 1. buffer is empty - @post - 2. buffer read position = buffer write position = 0 - **/ - void setbuf(CharT * buf, size_type buf_z) { - /* properly reset (+ discard) any existing state */ - this->reset(); - - is_owner_ = false; - lo_pos_ = 0; - hi_pos_ = 0; - buf_ = buf; - buf_z_ = buf_z; - } - - /** - @brief revert buffer to empty state and possibly zero it - - @param zero_buffer_flag Zero buffer contents iff this is true - - @post - 1. buffer is empty - @post - 2. buffer read position = buffer write position = 0 - **/ - void clear2empty(bool zero_buffer_flag) { - if (buf_ && zero_buffer_flag) - explicit_bzero(buf_, buf_z_ * sizeof(CharT)); - - lo_pos_ = 0; - hi_pos_ = 0; - } - - /** - @brief swap representation with another buffer instance. - **/ - void swap (buffer & x) { - std::swap(is_owner_, x.is_owner_); - std::swap(buf_, x.buf_); - std::swap(buf_z_, x.buf_z_); - std::swap(lo_pos_, x.lo_pos_); - std::swap(hi_pos_, x.hi_pos_); - } - - /** - @brief reset buffer to an empty state and recover owned storage - **/ - void reset() { - if (is_owner_ && buf_) - delete [] buf_; - - is_owner_ = false; - buf_ = nullptr; - buf_z_ = 0; - lo_pos_ = 0; - hi_pos_ = 0; - } - - /** - @brief move-assignment operator. - @param x right-hand-side to move from. - - @post - @p x is in a valid, empty, - **/ - buffer & operator= (buffer && x) { - is_owner_ = x.is_owner_; - buf_ = x.buf_; - buf_z_ = x.buf_z_; - lo_pos_ = x.lo_pos_; - hi_pos_ = x.hi_pos_; - - x.is_owner_ = false; - x.lo_pos_ = 0; - x.hi_pos_ = 0; - x.buf_ = nullptr; - x.buf_z_ = 0; - - return *this; - } - - /** @brief buffer is not assignable */ - buffer & operator= (buffer & x) = delete; - - private: - /** @brief true iff buffer is responsible for freeing storage at @c buf_ **/ - bool is_owner_ = false; - /** @brief buffer contents. buffer memory comprises @c buf_[0] to @c buf_[buf_z_] **/ - CharT * buf_ = nullptr; - /** @brief buffer size (in units of CharT) **/ - size_type buf_z_ = 0; - - /** @brief buffer read (consume) position - - @invariant - 0 <= lo_pos_ <= hi_pos_ < buf_z_ - **/ - size_type lo_pos_ = 0; - /** @brief buffer write (produce) position - - @invariant - 0 <= hi_pos_ < hi_pos_ < buf_z_ - **/ - size_type hi_pos_ = 0; - }; - - /** @brief Overload for @c swap, so that @c buffer swappable **/ - template - inline void - swap(buffer & lhs, - buffer & rhs) { - lhs.swap(rhs); - } - } /*namespace scm*/ -} /*namespace xo*/ - -/* end buffer.hpp */ diff --git a/.xo-tokenizer/include/xo/tokenizer/input_state.hpp b/.xo-tokenizer/include/xo/tokenizer/input_state.hpp deleted file mode 100644 index fbff7e57..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/input_state.hpp +++ /dev/null @@ -1,363 +0,0 @@ -/* @file input_state.hpp - * - * author: Roland Conybeare, Jun 2025 - */ - -#pragma once - -#include "span.hpp" - -namespace xo { - namespace scm { - /** enum to report outcome of @ref capture_current_line **/ - enum class input_error { - /** normal return, input line successfully identified and captured **/ - ok = 0, - /** incomplete input; should not have been submitted to @ref capture_current_line. - * note: submit last line of input with eof_flag=true - **/ - incomplete, - N - }; - - /** @class input_state - * @brief Track detailed input position for use in error messages - * - * input characters fall into two categories: - * - consumed: memory can be reclaimed/recycled - * - buffered: memory will be retained unaltered until consumed - * - * remarks: - * - always in one of two states: - * - empty - * - contains exactly one line of input - * - also record current input position. - * Use this for example to identify where tokenizer rejected input. - * - .current_pos advances by one token - * - * - buffered characters always form a single contiguous range. - * - input_state does not own any storage; storage is owned elsewhere - * - * @text - * - * <------------------.current_line------------------> - * > <-- .whitespace - * cccccccccccccccccccccccccccccccc__TTTTTTTTxxxxxxxxx - * ^ ^ ^ - * .current_line.lo | .current_line.hi - * .current_pos - * - * <----prev_line----> <----current_line----> - * > <--whitespace - * ppppppppppppppppppp cccccccccccc__TTTTTTTT - * ^ - * - * @endtext - **/ - template - class input_state { - public: - /** @defgroup input-state-type-traits input-state type straits **/ - ///@{ - - /** type representing a contiguous span of tokenizer input characters **/ - using span_type = span; - - ///@} - - public: - /** @defgroup input-state-ctors input_state constructors **/ - ///@{ - - input_state() = default; - explicit input_state(bool debug_flag) : debug_flag_{debug_flag} {} - /** Create instance with supplied @p current_line, @p current_pos, @p whitespace. - * Introduced for unit tests, not used in tokenizer. - **/ - explicit input_state(const span& current_line, - size_t current_pos, - size_t whitespace) : current_line_{current_line}, - current_pos_{current_pos}, - whitespace_{whitespace} {} - - ///@} - - /** @defgroup input-state-static-methods input_state static methods **/ - ///@{ - - /** recognize the newline character '\n' **/ - static bool is_newline(CharT ch); - /** identifies whitespace chars. - * These are chars that do not belong to any token. - * They are not permitted to appear within - * a symbol or string token. - * Appearance of a whitespace char forces completioon of - * preceding token. - **/ - static bool is_whitespace(CharT ch); - - ///@} - - /** @defgroup input-state-access-methods **/ - ///@{ - -#pragma GCC diagnostic push -#ifndef __APPLE__ -#pragma GCC diagnostic ignored "-Wchanges-meaning" -#endif - const span_type & current_line() const { return current_line_; } -#pragma GCC diagnostic pop - size_t tk_start() const { return tk_start_; } - size_t current_pos() const { return current_pos_; } - size_t whitespace() const { return whitespace_; } - bool debug_flag() const { return debug_flag_; } - - ///@} - - /** @defgroup input-state-general-methods **/ - ///@{ - - /** Input state less @p n chars. - * Use to recover input state before a complete but error-triggering token - **/ - input_state rewind(std::size_t n) const; - - /** Capture prefix of @p input up to first newline. - * Set read position to start of line. - * - * Alters: - * .current_line - * .current_pos - * - * Return pair comprising error code and input span representing first line - * (including trailing newline) from @p input. - **/ - std::pair capture_current_line(const span_type & input, - bool eof_flag); - - /** atomically return current line while discarding it from input state - * - * Alters - * .current_line - * .current_pos - * .whitespace - **/ - span_type consume_current_line(); - - /** Reset input state for start of next line. - * Expression parser may use this to discard remainder of input line - * after a parsing error. - * - * Alters: - * .current_line - * .current_pos - * .whitespace - **/ - void discard_current_line(); - - /** Advance input position by @p z - * - * Alters: - * .current_pos - **/ - void advance(size_t z); - - /** Advance .current_pos to pos. - * Require: pos in @ref current_line_ - **/ - void advance_until(const CharT * pos); - - /** Skip prefix of input, starting at current read position, - * comprising only whitespace. - * - * Presume input position is at end of token; - * on return @ref whitespace_ counts number of whitespace characters - * skipped. - * - * Return pointer to first non-whitespace character after @ref current_pos_ - * or @ref current_line_.hi if reached end of buffered line. - * - * Alters: - * .whitespace - **/ - const CharT * skip_leading_whitespace(); - - ///@} - - private: - /** @defgroup input-state-instance-vars input_state instance variables **/ - ///@{ - - /** remember current input line. Used only to report errors **/ - span current_line_ = span(); - /** start of last token within @ref current_line_ **/ - size_t tk_start_ = 0; - /** input position within @ref current_line_ **/ - size_t current_pos_ = 0; - /** number of whitespace chars since end of preceding token, - * or last newline, whichever is less - **/ - size_t whitespace_ = 0; - - /** true to log input activity */ - bool debug_flag_ = false; - - ///@} - }; /*input_state*/ - - template - bool - input_state::is_newline(CharT ch) { - return (ch == '\n'); - } - - template - bool - input_state::is_whitespace(CharT ch) { - switch(ch) { - case ' ': return true; - case '\t': return true; - case '\n': return true; - case '\r': return true; - } - - return false; - } - - template - input_state - input_state::rewind(std::size_t n) const { - return input_state(this->current_line_, - (n <= current_pos_) ? current_pos_ - n : 0, - 0 /*whitespace*/); - } - - template - void - input_state::advance(size_t z) { - scope log(XO_DEBUG(debug_flag_)); - - this->current_pos_ += z; - - log && log(xtag("z", z), xtag("current_pos", current_pos_)); - } - - template - void - input_state::advance_until(const CharT * pos) { - scope log(XO_DEBUG(debug_flag_)); - - assert(current_line_.lo() <= pos && pos <= current_line_.hi()); - - this->current_pos_ = pos - current_line_.lo(); - - log && log(xtag("current_pos", current_pos_)); - } - - template - auto - input_state::consume_current_line() -> span_type { - span_type retval = current_line_; - - this->discard_current_line(); - - return retval; - } - - template - void - input_state::discard_current_line() { - this->current_line_ = span_type::make_null(); - this->current_pos_ = 0; - this->whitespace_ = 0; - } - - template - auto - input_state::capture_current_line(const span_type & input, - bool eof_flag) -> std::pair - { - // see also discard_current_line() - // note: must capture entirety of first line, - // for example including leading whitespace. - // See discussion in tokenizer scan() method - - scope log(XO_DEBUG(debug_flag_)); - - /* look ahead to {end of line, end of input}, whichever comes first */ - const CharT * sol = input.lo(); - const CharT * eol = sol; - - if (sol == current_line_.lo()) { - log && log("short-circuit - current line already stashed"); - - /* nothing to do here */ - return std::make_pair(input_error::ok, current_line_); - } - - while ((eol < input.hi()) && (*eol != '\n')) - ++eol; - - if (*eol == '\n') { - /* include \n at end-of-line */ - ++eol; - } else { - if (!eof_flag) { - /* caller expected to provide complete line of input. complain and ignore */ - return std::make_pair(input_error::incomplete, - input.prefix(0ul)); - } - } - - this->current_line_ = span_type(sol, eol); - this->current_pos_ = 0; - this->whitespace_ = 0; - - log && log(xtag("current_line", print::printspan(current_line_)), - xtag("current_pos", current_pos_)); - - return std::make_pair(input_error::ok, - span_type(sol, eol)); - } - - template - const CharT * - input_state::skip_leading_whitespace() - { - scope log(XO_DEBUG(debug_flag_)); - - const CharT * ix = current_line_.lo() + current_pos_; - - this->whitespace_ = 0; - - /* skip whitespace + remember beginning of most recent line */ - while (is_whitespace(*ix) && (ix != current_line_.hi())) { - ++ix; - - ++(this->whitespace_); - } - - this->tk_start_ = ix - current_line_.lo(); - this->current_pos_ = ix - current_line_.lo(); - - return ix; - } - - template - inline std::ostream & - operator<<(std::ostream & os, - const input_state& x) - { - using xo::print::unq; - - os << ""; - - return os; - } - } -} diff --git a/.xo-tokenizer/include/xo/tokenizer/scan_result.hpp b/.xo-tokenizer/include/xo/tokenizer/scan_result.hpp deleted file mode 100644 index 975edf63..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/scan_result.hpp +++ /dev/null @@ -1,112 +0,0 @@ -/* file scan_result.hpp - * - * author: Roland Conybeare, Jun 2025 - */ - -#pragma once - -#include "token.hpp" -#include "tokenizer_error.hpp" -#include "input_state.hpp" - -namespace xo { - namespace scm { - /** @class scan_result - * @brief Represent result of parsing one input token. - * - * @code - * Possible outcomes fall into several categories - * (with T: @c token_.is_valid(), E: @cerror_.is_error()) - * - * | T | E | description | - * |-------+-------+-------------------------------------| - * | false | false | end of input, including end of line | - * | true | false | parsed token in T | - * | false | true | parse error in E | - * - * @endcode - **/ - template - class scan_result { - public: - using token_type = token; - using span_type = span; - using error_type = tokenizer_error; - using input_state_type = input_state; - - public: - scan_result(const token_type & token, - const span_type & consumed, - const error_type & error = error_type()) - : token_{token}, consumed_{consumed}, error_{error} {} - - static scan_result make_whitespace(const span_type & prefix_input); - static scan_result make_partial(const span_type & prefix_input); - /** - * @p error_src can be __FUNCTION__ from site where error generated. - * @p error_msg error message - * @p error_pos error position, relative to start of token - * @p input_state_ref input state object; - * copied into scan_result, and leaving input_state_ref.current_line cleared - **/ - static scan_result make_error_consume_current_line(const char * error_src, - std::string error_msg, - size_t error_pos, - input_state_type & input_state_ref); - - bool is_eof_or_ambiguous() const { return token_.is_invalid() && error_.is_not_an_error(); } - bool is_token() const { return token_.is_valid(); } - bool is_error() const { return error_.is_error(); } - - const token_type & get_token() const { return token_; } - const span_type & consumed() const { return consumed_; } - const error_type & error() const { return error_; } - - public: - /** Successfully parsed token, whenever tk_type != tokentype::tk_invalid. - * Will be tokentype::tk_invalid in normal cause of events for valid input, - * when consuming whitespace - **/ - token_type token_; - /** input span represented by .token, on success. Otherwise not defined **/ - span_type consumed_; - /** error description, whenever .error_.is_error() is true **/ - error_type error_; - }; - - template - auto scan_result::make_whitespace(const span_type& whitespace_input) -> scan_result - { - return scan_result(token_type::invalid(), whitespace_input /*consumed*/); - } - - template - auto scan_result::make_partial(const span_type& prefix_input) -> scan_result - { - return scan_result(token_type::invalid(), prefix_input /*consumed*/); - } - - template - auto - scan_result::make_error_consume_current_line(const char * error_src, - std::string error_msg, - size_t error_pos, - input_state_type & input_state_ref) -> scan_result - { - /* report+consume entire input line */ - - /* copy before altered by .consume_current_line() */ - input_state_type input_state_copy = input_state_ref; - - return scan_result(token_type::invalid(), - input_state_ref.consume_current_line(), - error_type(error_src, - error_msg, - input_state_copy, - error_pos)); - } - - } /*namespace scm*/ -} /*namespace xo*/ - -/* end scan_result.hpp */ diff --git a/.xo-tokenizer/include/xo/tokenizer/span.hpp b/.xo-tokenizer/include/xo/tokenizer/span.hpp deleted file mode 100644 index 8cf7a4a7..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/span.hpp +++ /dev/null @@ -1,291 +0,0 @@ -/** @file span.hpp **/ - -#pragma once - -#include "xo/indentlog/scope.hpp" -#include "xo/indentlog/print/ppdetail_atomic.hpp" -#include -#include -#include - -namespace xo { - namespace scm { - /** @class span compression/span.hpp - * - * @brief A contiguous range of characters, without ownership. - * - * @tparam CharT type for elements referred to by this span. - **/ - template - class span { - public: - /** @defgroup span-type-traits span type traits **/ - ///@{ - - /** typealias for span size (in units of CharT) **/ - using size_type = std::uint64_t; - - ///@} - - public: - /** @defgroup span-ctors span constructors **/ - ///@{ - - /** null span **/ - span() : lo_{nullptr}, hi_{nullptr} {} - - /** Create span for the contiguous memory range [@p lo, @p hi) **/ - span(CharT * lo, CharT * hi) : lo_{lo}, hi_{hi} {} - - /** explicit conversion from span **/ - template - span(const span & other, - std::enable_if_t - && !std::is_same_v> * = nullptr) - : lo_{other.lo()}, hi_{other.hi()} {} - - /** copy ctor (explicit to avoid ambiguity with template ctor) **/ - span(const span & other) = default; - span & operator=(const span & other) = default; - - /** Create a null span (i.e. with null @p lo, @p hi pointers) - * A null span can be concatenated with any other span - * without triggering matching-endpoint asserts. - **/ - static span make_null() { return span(static_cast(nullptr), static_cast(nullptr)); } - - /** @brief create span for C-style string @p cstr **/ - static span from_cstr(const CharT * cstr) { - CharT * lo = cstr; - CharT * hi = cstr ? cstr + strlen(cstr) : nullptr; - - return span(lo, hi); - } - - /** @brief create span from std::string @p str **/ - static span from_string(const std::string& str) { - CharT * lo = &(*str.begin()); - CharT * hi = &(*str.end()); - - return span(lo, hi); - } - - /** @brief concatenate two contiguous spans */ - static span concat(const span & span1, const span & span2) { - if (span1.is_null()) - return span2; - if (span2.is_null()) - return span1; - - if (span1.hi() != span2.lo()) { - scope log(XO_DEBUG(true)); - - log && log(xtag("span1.hi", (void*)span1.hi()), xtag("span2.lo", (void*)span2.lo())); - } - - assert(span1.hi() == span2.lo()); - - CharT * lo = span1.lo(); - CharT * hi = span2.hi(); - - return span(lo, hi); - } - - ///@} - - /** @defgroup span-access-methods **/ - ///@{ - - CharT * lo() const { return lo_; } /* get member span::lo_ */ - CharT * hi() const { return hi_; } /* get member span::hi_ */ - - ///@} - - /** @defgroup span-general-methods **/ - ///@{ - - /** @brief strip prefix until first occurence of '\n', including the newline **/ - void discard_until_newline() { - for (const CharT * p = lo_; p < hi_; ++p) { - if (*p == '\n') { - lo_ = p + 1; - return; - } - } - - lo_ = hi_; - } - - /** Create new span over supplied type, - * with identical (possibly misaligned) endpoints. - * - * @warning - * 1. New span uses exactly the same memory addresses. - * Endpoint pointers may not be aligned. - * 2. Implementation assumes code compiled with - * @code -fno-strict-aliasing @endcode enabled. - * - * @tparam OtherT element type for new span - **/ - template - span - cast() const { return span(reinterpret_cast(lo_), - reinterpret_cast(hi_)); } - - /** @brief create span including the first @p z members of this span. **/ - span prefix(size_type z) const { return span(lo_, lo_ + z); } - - /** @brief create span representing prefix up to (but not including) @p *p - **/ - span prefix_upto(CharT * p) const { - if (p <= hi_) - return span(lo_, p); - else - return span(lo_, hi_); - } - - /** @brief create span with first @p z members of this span removed **/ - span after_prefix(size_type z) const { - if (lo_ + z > hi_) - z = hi_ - lo_; - - return span(lo_ + z, hi_); - } - - /** @brief create span with @p prefix of this span removed **/ - span after_prefix(const span & prefix) const { - if (!prefix.is_null() && (prefix.lo() != lo_)) { - throw std::runtime_error - ("after_prefix: expected prefix of this span"); - } - - return after_prefix(prefix.size()); - } - - /** Create span starting with position @p p. - * Does boundary checking; will return empty span if @p p is outside @c [lo_,hi) - **/ - span suffix_from(CharT * p) const { - if ((lo_ <= p) && (p <= hi_)) - return span(p, hi_); - else - return span(hi_, hi_); - } - - /** true iff this span is null. distinct from empty. **/ - bool is_null() const { return lo_ == nullptr && hi_ == nullptr; } - /** true iff this span is empty (comprises 0 elements). **/ - bool empty() const { return lo_ == hi_; } - /** report the number of elements (of type CharT) in this span. **/ - size_type size() const { return hi_ - lo_; } - - /** increase extent of this spans to include @p x. - * Requires @c hi() == @c x.lo() - **/ - span & operator+=(const span & x) { - if (hi_ == x.lo_) { - hi_ = x.hi_; - } else if (!x.is_null()) { - assert(false); - } - - return *this; - } - - /** print representation for this span on stream @p os **/ - void print(std::ostream & os) const { - os << ""; - } - ///@} - - private: - /** @defgroup span-instance-vars **/ - ///@{ - - /** start of span. - Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) - **/ - CharT * lo_ = nullptr; - - /** @brief end of span. - Span comprises memory address between @p lo (inclusive) and @p hi (exclusive) - **/ - CharT * hi_ = nullptr; - - ///@} - }; /*span*/ - - /** @defgroup span-operators **/ - ///@{ - - /** compare spans for equality. - * Two spans are equal iff both endpoints match exactly. - **/ - template - inline bool - operator==(const span & lhs, const span & rhs) { - return ((lhs.lo() == rhs.lo()) - && (lhs.hi() == rhs.hi())); - } - - /** compare spans for inequality. - * Two spans are unequal if either paired endpoint differs. - **/ - template - inline bool - operator!=(const span & lhs, const span & rhs) { - return ((lhs.lo() != rhs.lo()) - || (lhs.hi() != rhs.hi())); - } - - /** print a summary of @p x on stream @p os. Intended for diagnostics **/ - template - inline std::ostream & - operator<<(std::ostream & os, - const span & x) { - x.print(os); - return os; - } - - ///@} - } /*namespace scm*/ - - namespace print { - template - class printspan_impl { - public: - printspan_impl(xo::scm::span x) : span_{x} {} - - xo::scm::span span_; - }; - - template - printspan_impl printspan(const xo::scm::span& span) { - return printspan_impl(span); - } - - template - inline std::ostream & - operator<< (std::ostream & os, - const printspan_impl & x) - { - for (const CharT * p = x.span_.lo(); p < x.span_.hi(); ++p) - os << *p; - - return os; - } - -#ifndef ppdetail_atomic - template \ - PPDETAIL_ATOMIC_BODY(printspan_impl); - - template \ - PPDETAIL_ATOMIC_BODY(xo::scm::span); -#endif - - } -} /*namespace xo*/ diff --git a/.xo-tokenizer/include/xo/tokenizer/token.hpp b/.xo-tokenizer/include/xo/tokenizer/token.hpp deleted file mode 100644 index 689a4512..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/token.hpp +++ /dev/null @@ -1,473 +0,0 @@ -/* file token.hpp - * - * author: Roland Conybeare, Jul 2024 - */ - -#pragma once - -#include "tokentype.hpp" -#include "xo/indentlog/print/tag.hpp" -#include -#include -#include -#include - -namespace xo { - namespace scm { - namespace detail { - /* compute a * b^p, p >= 0 */ - constexpr double - pow_aux(double a, double b, int p) { - while (p > 0) { - if (p % 2 == 1) { - /* a * b^p = a * b^(2q + 1) = a.b * 10^(2q) */ - a *= b; - p -= 1; - } else { - /* a * b^p = a * b^(2q) = a * (b^2)^q */ - b = b * b; - p /= 2; - } - } - - /* a * b^0 = a */ - return a; - } - - constexpr double - pow10(int p) { - if (p >= 0) - return pow_aux(1.0, 10.0, p); - else - return 1.0 / pow_aux(1.0, 10.0, -p); - } - } - - /** @class token - * @brief Represent a Schematika lexical token - **/ - template - class token { - public: - /** @defgroup token-ctors token constructors **/ - ///@{ - - /** default ctor creates token with type @c tk_invalid **/ - token() = default; - /** create token with type @c tk_type and input text @c text **/ - token(tokentype tk_type, const std::string & text = "") - : tk_type_{tk_type}, text_{text} {} - - /** create invalid token (same as null ctor, but explicit) **/ - static token invalid() { return token(); } - /** Create token representing a boolean literal from text @p txt - * @p txt must be @c true or @c false - **/ - static token bool_token(const std::string & txt) { - return token(tokentype::tk_bool, txt); - } - /** Create token representing 64-bit signed integer literal parsed from decimal @p txt. - * The string @p txt must be a decimal integer literal, since @ref i64_value re-parses @p txt. - **/ - static token i64_token(const std::string & txt) { - return token(tokentype::tk_i64, txt); - } - /** create token representing 64-bit floating-point literal parsed from decimal @p txt - * The string @p txt must be a decimal floating-point literal, since @ref f64_value re-parses @p txt. - **/ - static token f64_token(const std::string & txt) { - return token(tokentype::tk_f64, txt); - } - /** create token representing literal string parsed from @p txt **/ - static token string_token(const std::string & txt) { - return token(tokentype::tk_string, txt); - } - /** create token representing a symbol parsed from @p txt. - * Note that not all strings are valid symbol names. - **/ - static token symbol_token(const std::string & txt) { - return token(tokentype::tk_symbol, txt); - } - /** token representing left angle bracket @c "<" **/ - static token leftangle() { return token(tokentype::tk_leftangle); } - /** token representing right angle bracket @c ">" **/ - static token rightangle() { return token(tokentype::tk_rightangle); } - /** token representing left parenthesis @c "(" **/ - static token leftparen() { return token(tokentype::tk_leftparen); } - /** token representing right parenthesis @c ")" **/ - static token rightparen() { return token(tokentype::tk_rightparen); } - /** token representing left bracket @c "[" **/ - static token leftbracket() { return token(tokentype::tk_leftbracket); } - /** token representing right bracket @c "]" **/ - static token rightbracket() { return token(tokentype::tk_rightbracket); } - /** token representing left brace @c "{" **/ - static token leftbrace() { return token(tokentype::tk_leftbrace); } - /** token representing right brace @c "}' **/ - static token rightbrace() { return token(tokentype::tk_rightbrace); } - /** token representing period @c "." **/ - static token dot() { return token(tokentype::tk_dot); } - /** token representing comma @c "," **/ - static token comma() { return token(tokentype::tk_comma); } - /** token representing colon @c ":" **/ - static token colon() { return token(tokentype::tk_colon); } - /** token representing double-colo @c "::" **/ - static token doublecolon() { return token(tokentype::tk_doublecolon); } - /** token representing semicolon @c ";" **/ - static token semicolon() { return token(tokentype::tk_semicolon); } - /** token representing single-assignment @c "=" **/ - static token singleassign() { return token(tokentype::tk_singleassign); } - /** token representing unrestricted assignment @c ":=" **/ - static token assign_token() { return token(tokentype::tk_assign); } - /** token representing indirection @c "->" **/ - static token yields() { return token(tokentype::tk_yields); } - - /** token for @c "+" **/ - static token plus_token() { return token(tokentype::tk_plus); } - /** token for @c "-" **/ - static token minus_token() { return token(tokentype::tk_minus); } - /** token for @c "*" **/ - static token star_token() { return token(tokentype::tk_star); } - /** token for @c "/" **/ - static token slash_token() { return token(tokentype::tk_slash); } - - /** token representing keyword @c type **/ - static token type() { return token(tokentype::tk_type); } - /** token representing keyword @c def **/ - static token def() { return token(tokentype::tk_def); } - /** token representing keyword @c lambda **/ - static token lambda() { return token(tokentype::tk_lambda); } - /** token representing keyword @c if **/ - static token if_token() { return token(tokentype::tk_if); } - /** token representing keyword @c else **/ - static token else_token() { return token(tokentype::tk_else); } - /** token representing keyword @c let **/ - static token let() { return token(tokentype::tk_let); } - /** token representing keyword @c in **/ - static token in() { return token(tokentype::tk_in); } - /** token representing keyword @c end **/ - static token end() { return token(tokentype::tk_end); } - - ///@} - - /** @defgroup token-access-methods **/ - ///@{ - - tokentype tk_type() const { return tk_type_; } - const std::string & text() const { return text_; } - - ///@} - - /** @defgroup token-general-methods **/ - ///@{ - - /** true if token understood to represent valid input - * i.e. any token type except @c tk_invalid - **/ - bool is_valid() const { return tk_type_ != tokentype::tk_invalid; } - /** true for sentinel token with type tk_invalid **/ - bool is_invalid() const { return tk_type_ == tokentype::tk_invalid; } - - /** true for tokens with variable text. false for those with fixed textual representation **/ - bool has_variable_text() const { return (tk_type_ == tokentype::tk_i64 - || tk_type_ == tokentype::tk_f64 - || tk_type_ == tokentype::tk_string - || tk_type_ == tokentype::tk_symbol); } - - /** expect input matching @c true or @c false **/ - bool bool_value() const; - - /** expect input matching @c [+|-][0-9][0-9]* **/ - std::int64_t i64_value() const; - - /** expect input matching @c [+|-][0-9]*[.][0-9]*[e|E][+|-][0-9]* **/ - double f64_value() const; - - /** print human-readable token representation on stream @p os **/ - void print(std::ostream & os) const; - - ///@} - - private: - /** @defgroup token-instance-vars **/ - ///@{ - - /** category for this token **/ - tokentype tk_type_ = tokentype::tk_invalid; - - /** characters comprising this token. - * only provided for certain token types: - * - * tk_i64 - * tk_f64 - * tk_string - * tk_symbol - **/ - std::string text_; - - ///@} - }; - - template - bool - token::bool_value() const { - if (tk_type_ != tokentype::tk_bool) { - throw (std::runtime_error - (tostr("token::bool_value", - ": token with type tk found where tk_bool expected", - xtag("tk", tk_type_)))); - } - - if (text_ == "true") - return true; - if (text_ == "false") - return false; - - throw (std::runtime_error - (tostr("token::bool_value", - ": unexpected input string tk_bool token", - xtag("text", text_)))); - - return false; - } - - template - std::int64_t - token::i64_value() const { - if (tk_type_ != tokentype::tk_i64) { - throw (std::runtime_error - (tostr("token::i64_value", - ": token with type tk found where tk_i64 expected", - xtag("tk", tk_type_)))); - } - - if (text_.empty()) { - throw (std::runtime_error - (tostr("token::i64_value", - ": unexpected empty input string for tk_i64 token"))); - } - - int sign = 1; - int value = 0; - { - auto ix = text_.begin(); - auto end_ix = text_.end(); - - CharT ch = *ix; - - if (ch == '+') { - ++ix; - } else if (ch == '-') { - sign = -1; - ++ix; - } - - if (ix == end_ix) { - throw (std::runtime_error - (tostr("token::i64_value", - ": input text found where at least one digit expected", - xtag("text", text_)))); - } - - for (; ix != end_ix; ++ix) { - CharT ch = *ix; - - if ((ch >= '0') && (ch <= '9')) { - value *= 10; - value += (ch - '0'); - } else { - throw (std::runtime_error - (tostr("token::i64_value", - ": unexpected char ch in integer token", - xtag("ch", ch)))); - } - } - } - - return sign * value; - } /*i64_value*/ - - template - double - token::f64_value() const { - if (tk_type_ != tokentype::tk_f64) { - throw (std::runtime_error - (tostr("token::f64_value", - ": token with type tk found where tk_f64 expected", - xtag("tk", tk_type_)))); - } - - if (text_.empty()) { - throw (std::runtime_error - (tostr("token::f64_value", - ": unexpected empty input string for tk_f64 token"))); - } - - int sign = 1; - /* integer representing denormalized unsigned mantissa - * (mantissa scaled by smallest power of 10 sufficient to make - * it an integer) - */ - std::int64_t mantissa = 0; - /* counts #of digits to the right of decimal point '.' */ - int rh_digits = 0; - /* sign of exponent */ - int exp_sign = 1; - /* value of exponenct = integer to the right of 'e' or 'E' */ - int exponent = 0; - - /* floating-point value will represent - * sign * mantissa * 10^(sign*exponent - rh_digits) - */ - { - auto ix = text_.begin(); - auto end_ix = text_.end(); - - CharT ch = *ix; - - if (ch == '+') { - ++ix; - } else if (ch == '-') { - sign = -1; - ++ix; - } - - if (ix == end_ix) { - throw (std::runtime_error - (tostr("token::f64_value", - ": input text found where at least one digit expected", - xtag("text", text_)))); - } - - /* true iff decimal point '.' present in mantissa */ - bool have_decimal_point = false; - /* true iff exponent prefix 'e' or 'E' present */ - //bool have_exponent = false; - /* counts number of digits in mantissa - * (both before and after, but not including, any decimal point - */ - int m_digits = 0; - /* digits to the left of decimal point */ - int lh_digits = 0; - - /* loop over mantissa digits */ - for (; ix != end_ix; ++ix) { - CharT ch = *ix; - - if (ch == '.') { - if (have_decimal_point) { - throw (std::runtime_error - (tostr("token::f64_value", - ": input text found where at most one decimal point expected", - xtag("text", text_)))); - } - - have_decimal_point = true; - lh_digits = m_digits; - } else if ((ch >= '0') && (ch <= '9')) { - mantissa *= 10; - mantissa += (ch - '0'); - ++m_digits; - } else if (ch == 'e' || ch == 'E') { - //have_exponent = true; - break; // done with mantissa - } else { - throw (std::runtime_error - (tostr("token::i64_value", - ": unexpected char ch in integer token", - xtag("ch", ch)))); - } - } - - if (have_decimal_point) - rh_digits = m_digits - lh_digits; - - if (ix != end_ix) { - /* continue to read exponent */ - - /* skip e|E */ - ++ix; - - if (ix == end_ix) { - throw (std::runtime_error - (tostr("token::f64_value", - ": on input text, expect at least one digit following exponent marker e|E", - xtag("text", text_)))); - } - - CharT ch = *ix; - - if (ch == '+') { - ++ix; /*skip*/ - } else if (ch == '-') { - exp_sign = -1; - ++ix; - } - - for (; ix != end_ix; ++ix) { - CharT ch = *ix; - - if ((ch >= '0') && (ch <= '9')) { - exponent *= 10; - exponent += (ch - '0'); - } else { - throw (std::runtime_error - (tostr("token::f64_value", - "; on input text, expect only digits following" - " (possibly signed) exponenct marker", - xtag("text", text_)))); - } - } - } - } - - /* floating-point value will represent - * sign * mantissa * 10^(sign*exponent - rh_digits) - */ - - double mantissa_f64 = sign * mantissa; - -#ifdef OBSOLETE_DEBUG - std::cerr << xtag("text", text_) - << xtag("rh_digits", rh_digits) - << xtag("mantissa_f64", mantissa_f64) - << xtag("exp_sign", exp_sign) - << xtag("exponent", exponent) - << std::endl; -#endif - - double retval = (mantissa_f64 - * detail::pow10((exp_sign * exponent) - - rh_digits)); - - return retval; - } /*f64_value*/ - - template - void - token::print(std::ostream & os) const { - os << ""; - } /*print*/ - - template - inline std::ostream & - operator<< (std::ostream & os, - const token & tk) - { - tk.print(os); - return os; - } - } /*Namespace scm*/ - -#ifndef ppdetail_atomic - namespace print { - PPDETAIL_ATOMIC(xo::scm::token); - } -#endif - -} /*namespace xo*/ - -/* end token.hpp */ diff --git a/.xo-tokenizer/include/xo/tokenizer/tokenizer.hpp b/.xo-tokenizer/include/xo/tokenizer/tokenizer.hpp deleted file mode 100644 index e589b400..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/tokenizer.hpp +++ /dev/null @@ -1,1057 +0,0 @@ -/* file tokenizer.hpp - * - * author: Roland Conybeare, Jul 2024 - */ - -#pragma once - -#include "token.hpp" -#include "input_state.hpp" -#include "span.hpp" -#include "scan_result.hpp" -#include "xo/indentlog/scope.hpp" -#include "xo/indentlog/print/ppdetail_atomic.hpp" -#include - -namespace xo { - namespace scm { - /** @class tokenizer - * @brief Parse a Schematika character stream into lexical tokens - * - * Use: - * - * @code - * // see xo-tokenizer/example/tokenrepl/tokenrepl.cpp - * // for exact working code - * - * using tokenizer_type = tokenizer; - * using span_type = tokenizer_type::span_type; - * - * tokenizer_type tkz; - * span_type input = ...; - * - * while (!input.empty()) { - * auto [tk, consumed, error] = tkz.scan(input); - * - * if (tk.is_valid()) { - * // do something with tk - * } else if (error.is_error()) { - * error.report(cout); - * break; - * } - * - * input = tkz.consume(consumed, input); - * } - * - * if endofinput { - * auto [tk, consumed, error] = tzk.notify_eof() - * - * // do something with (final) tk if tk.is_valid() - * } - * - * @endcode - * - * See tokentype.hpp for token types - **/ - template - class tokenizer { - public: - using token_type = token; - using error_type = tokenizer_error; - using span_type = span; - using input_state_type = input_state; - using result_type = scan_result; - - public: - /** @defgroup tokenizer-ctors tokenizer constructors **/ - ///@{ - - tokenizer(bool debug_flag = false); - - ///@} - - /** @defgroup tokenizer-access-methods tokenizer access methods **/ - ///@{ - -#pragma GCC diagnostic push -#ifndef __APPLE__ -#pragma GCC diagnostic ignored "-Wchanges-meaning" -#endif - const input_state & input_state() const { return input_state_; } -#pragma GCC diagnostic pop - - ///@} - - /** @defgroup tokenizer-general-methods tokenizer methods **/ - ///@{ - - /** identifies punctuation chars. - * These are chars that are not permitted to appear within - * a symbol token. Instead they force completion of - * a preceding token, and start a new token with themselves - **/ - static bool is_1char_punctuation(CharT ch); - - /** more-relazed version of is_1char_punctuation. - * Chars that are not permitted to appear within a symbol token, - * but may form token combined with next character - **/ - static bool is_2char_punctuation(CharT ch); - - /** assemble token from text @p token_text. - * @p initial_whitespace Amount of whitespace input being consumed from input. - * @p token_text subset of input_line representing a single token. - * @p input_state input state containing input_line - * - * retval.consumed will represent some possibly-empty prefix of @p input - **/ - static result_type assemble_token(std::size_t initial_whitespace, - const span_type & token_text, - input_state_type & input_state); - - /** degenerate version of assemble_token() on reaching end-of-file **/ - static result_type assemble_final_token(const span_type & token_text, - const input_state_type & input_state); - - /** true if tokenizer contains stored prefix of - * possibly-incomplete token - **/ - bool has_prefix() const { return !prefix_.empty(); } - - /** scan for next input token, given @p input. - * Note: - * - tokenizer can consume input (e.g. whitespace) - * without completing a token - * - input will remember the extent of the last line of input - * for which parsing has begun, but not completed. - * It's required that at least that portion of the input span - * remain valid across scan(), scan2() calls - * - * @return {parsed token, consumed span} - **/ - result_type scan(const span_type & input, - bool eof_flag); - - /** discard current line after error. Just cleans up error-reporting state **/ - void discard_current_line(); - - ///@} - - private: - /** @defgroup tokenizer-instance-vars tokenizer instance variables **/ - ///@{ - - /** track input state (line#,pos,..) for error messages. - * There's an ordering problem here: - * 1. input_state_.skip_leading_whitespace() advances current line automagically - * when it sees \n - * 2. need to capture value of @ref input_state_ _before_ newline - * 3. but neeed newline to end token - * Also recall input_state_type needed for reporting errors. - **/ - input_state_type input_state_; - /** Accumulate partial token here. - * This will happen if input sent to @ref tokenizer::scan - * ends without whitespace such that last available token's extent is not determined - **/ - std::string prefix_; - - ///@} - }; /*tokenizer*/ - - template - tokenizer::tokenizer(bool debug_flag) - : input_state_{debug_flag} - {} - - template - bool - tokenizer::is_1char_punctuation(CharT ch) { - switch(ch) { - case '(': - return true; - case ')': - return true; - case '[': - return true; - case ']': - return true; - case '{': - return true; - case '}': - return true; - case '<': - /* can't be 1char punctuation -- can begin lessequal token */ - return false; - case '>': - /* can't be 1char punctuation -- can begin greatequal token, - * and appears in tk_yields token - */ - return false; - case ',': - return true; - case ';': - return true; - case ':': - /* can't be 1char punctuation -- can begin assignment token */ - return false; - case '=': - /* can't be 1char punctuation -- can begin comparison token '==' */ - return false; - case '!': - /* can't be 1char punctuation -- can begin comparison token '!=' */ - return false; - case '-': - /* can't be punctuation - * - can appear inside f64 token: e.g. 1.23e-9. - * - begins tk_yields token: -> - */ - return false; - case '+': - /* can't be punctuation -- can appear inside f64 token: e.g. 1.23e+4 */ - return false; - case '*': - /* not punctuation -- allowed in symbol */ - return false; - case '/': - /* not punctuation -- for symmetry with +,- */ - return false; - case '.': - /* can't be punctuation -- can appear inside f64 token: e.g. 1.23 */ - return false; - } - - return false; - } - - template - bool - tokenizer::is_2char_punctuation(CharT ch) { - /* can't put '-' here, because of the way it appears in numeric literals - * characters here may not appear in symbol names - */ - - switch(ch) { - case '<': - /* can begin <= */ - return true; - case '>': - /* can begin >= */ - return true; - case ':': - /* can begin := */ - return true; - case '=': - /* can begin == */ - return true; - case '!': - /* can begin != */ - return true; - } - - return false; - } - - template - auto - tokenizer::assemble_token(std::size_t initial_whitespace, - const span_type & token_text, - input_state_type & input_state_ref) -> result_type - { - /* literal|pretty|streamlined */ - log_config::style = function_style::streamlined; - - scope log(XO_DEBUG(input_state_ref.debug_flag())); - log && log(xtag("token_text", token_text), - xtag("initial_whitespace", initial_whitespace), - xtag("input_state", input_state_ref)); - - tokentype tk_type = tokentype::tk_invalid; - std::string tk_text; - - const CharT * tk_start = token_text.lo(); - const CharT * tk_end = token_text.hi(); - - const CharT * ix = tk_start; - - /* switch here applies to the first character in a token */ - switch (*ix) { - case '-': - case '+': - if (token_text.size() == 1) { - /* standalone '+' or '-' */ - if (*ix == '+') - tk_type = tokentype::tk_plus; - else if(*ix == '-') - tk_type = tokentype::tk_minus; - } - - /** fall through to numeric literal code below **/ - [[fallthrough]]; - case '.': - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - /* examples of valid floating-point numbers: - * .0 - * 1e0 - * 1e - * 0. - * +1e0 - * -1e0 - * +1E+2 - * -1E+2 - * -0.123e-10 - * non-examples: - * . - * - - * + - * e0 - * .e0 - * -.e-0 - * +.e+0 - * - * in particular: to be recognized as a number, - * must contain at least one digit - */ - - log && log("possible number-token"); - - /* true if initial sign -/+ encountered */ - bool sign_flag = false; - /* true if '.' encountered */ - bool period_flag = false; - /* true if 'e' | 'E' encountered. - */ - bool exponent_flag = false; - /* true when sign '-' | '+' precedes exponenct digits */ - bool exponent_sign_flag = false; - /* true when at least one digit follows exponent marker */ - bool exponent_digit_flag = false; - /* true if at least one digit encountered */ - bool number_flag = false; - - log && log(xtag("*ix", *ix), - xtag("tk.length", token_text.size())); - if (log && (ix + 1 < tk_end)) - log(xtag("*(ix+1)", *(ix + 1))); - - if ((*ix == '-') && (ix + 2 == token_text.hi()) && (*(ix + 1) == '>')) { - /* composing exactly '->' */ - tk_type = tokentype::tk_yields; - } else { - /* token (if valid) will be one of: {tk_i64, tk_f64, tk_dot}: */ - for (; ix != token_text.hi(); ++ix) { - if ((*ix == '-') || (*ix == '+')) { - /* sign allowed: - * 1. before period and before first digit - * 2. after exponent - */ - if (!period_flag && !number_flag && !sign_flag) { - sign_flag = true; - } else if (exponent_flag && !exponent_digit_flag) { - exponent_sign_flag = true; - } else { - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "improperly placed sign indicator", - (ix - tk_start), - input_state_ref); - } - } else if (*ix == '.') { - if (period_flag) { - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "duplicate decimal point in numeric literal", - (ix - tk_start), - input_state_ref); - } - - period_flag = true; - } else if ((*ix == 'e') || (*ix == 'E')) { - if (exponent_flag) { - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "duplicate exponent marker in numeric literal", - (ix - tk_start), - input_state_ref); - } - - exponent_flag = true; - } else if (isdigit(*ix)) { - if (exponent_flag) { - /* need digit before exponent to recognize as number */ - exponent_digit_flag = true; - } else { - number_flag = true; - } - } else { - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "unexpected character in numeric constant" /*error_description*/, - (ix - tk_start), - input_state_ref); - } - } - - if (number_flag) { - if (period_flag || exponent_flag) { - tk_type = tokentype::tk_f64; - } else { - tk_type = tokentype::tk_i64; - } - } else if (period_flag && !exponent_flag) { - tk_type = tokentype::tk_dot; - } else { - /* not a valid token */ - } - - log && log(xtag("sign_flag", sign_flag)); - log && log(xtag("period_flag", period_flag), - xtag("exponent_flag", exponent_flag), - xtag("exponent_sign_flag", exponent_sign_flag), - xtag("number_flag", number_flag)); - log && log(xtag("tk_type", tk_type)); - } - - break; - } - case '*': - if (token_text.size() == 1) { - /* standalone '*' */ - tk_type = tokentype::tk_star; - ++ix; - } else { - /* '*' isn't punctuation -- but may allow appearance in a longer token - * - * thinking that x*y is a symbol with an embedded '*' character; - * in particular want to support kebab-case symbols like 'foo-config' - */ - } - break; - case '/': - if (token_text.size() == 1) { - /* standalone '/' */ - tk_type = tokentype::tk_slash; - ++ix; - } - break; - case '=': - log && log("singleassign or cmpeq token"); - - if (*(ix + 1) == '=') { - tk_type = tokentype::tk_cmpeq; - ++ix; - ++ix; - } else { - /* standalone '=' */ - tk_type = tokentype::tk_singleassign; - ++ix; - } - break; - case '!': - if (*(ix + 1) == '=') { - tk_type = tokentype::tk_cmpne; - ++ix; - ++ix; - } else { - /* standlone '!' */ - - // TODO - } - break; - case '"': - { - log && log("recognize string-token"); - - tk_type = tokentype::tk_string; - - tk_text.reserve(token_text.hi() - token_text.lo()); - - ++ix; /*skip initial " char*/ - - /* true on final " */ - bool endofstring = false; - - for (; ix != token_text.hi(); ++ix) { - log && log(xtag("*ix", *ix)); - - switch(*ix) { - case '"': - endofstring = true; - - /* skip final " char, don't capture */ - ++ix; - - break; - case '\\': - /* skip escape char, don't capture */ - ++ix; - - if (ix == token_text.hi()) { - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "expecting key following escape character \\", - (ix - tk_start), - input_state_ref); - } - - switch(*ix) { - case '\\': - log && log(xtag("*ix", *ix), xtag("escaped", "t")); - tk_text.push_back(*ix); - break; - case 'n': - log && log(xtag("*ix", *ix), xtag("newline", "t")); - tk_text.push_back('\n'); - break; - case 't': - log && log(xtag("*ix", *ix), xtag("tab", "t")); - tk_text.push_back('\t'); - break; - case 'r': - log && log(xtag("*ix", *ix), xtag("cr", "t")); - tk_text.push_back('\r'); - break; - case '"': - log && log(xtag("*ix", *ix), xtag("quote", "t")); - tk_text.push_back('"'); - break; - default: - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "expecting one of n|r|\"|\\ following escape \\", - (ix - tk_start), - input_state_ref); - } - break; - default: - tk_text.push_back(*ix); - break; - } - - if (endofstring) - break; - } - - if (!endofstring) { - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "missing terminating '\"' to complete literal string", - (ix - tk_start), - input_state_ref); - } - - log && log(tostr("tokenizer::assemble_token", - xtag("tk_text", tk_text))); - - break; - } - case 'a': case 'A': - case 'b': case 'B': - case 'c': case 'C': - case 'd': case 'D': - case 'e': case 'E': - case 'f': case 'F': - case 'g': case 'G': - case 'h': case 'H': - case 'i': case 'I': - case 'j': case 'J': - case 'k': case 'K': - case 'l': case 'L': - case 'm': case 'M': - case 'n': case 'N': - case 'o': case 'O': - case 'p': case 'P': - case 'q': case 'Q': - case 'r': case 'R': - case 's': case 'S': - case 't': case 'T': - case 'u': case 'U': - case 'v': case 'V': - case 'w': case 'W': - case 'x': case 'X': - case 'y': case 'Y': - case 'z': case 'Z': - { - /* symbol/identifier must begin with a letter? - * we want to accept some other chars too. - * specifically want to allow identifiers: - * this-is-the-way - * this+is+also+the+way - * how/much/is/that/doggy - * put*an*asterisk*in*that - * something%special% - * - * like pure lisp, we don't allow: - * - identifier beginning with digit - * - period . - * - * unlike pure lisp, we don't allow anywhere in a symbol: - * - colon : - * - semicolon ; - * - comma , - * - * also we don't allow symbols to begin with special chars - */ - - tk_type = tokentype::tk_symbol; - break; - } - case '<': - { - log && log("leftangle or lessequal token"); - - if (*(ix + 1) == '=') { - tk_type = tokentype::tk_lessequal; - ++ix; - ++ix; - } else { - tk_type = tokentype::tk_leftangle; - ++ix; - } - break; - } - case '>': - { - log && log("rightangle or greatequal token"); - - if (*(ix + 1) == '=') { - tk_type = tokentype::tk_greatequal; - ++ix; - ++ix; - } else { - tk_type = tokentype::tk_rightangle; - ++ix; - } - break; - } - case '(': - tk_type = tokentype::tk_leftparen; - ++ix; - break; - case ')': - tk_type = tokentype::tk_rightparen; - ++ix; - break; - case '[': - tk_type = tokentype::tk_leftbracket; - ++ix; - break; - case ']': - tk_type = tokentype::tk_rightbracket; - ++ix; - break; - case '{': - tk_type = tokentype::tk_leftbrace; - ++ix; - break; - case '}': - tk_type = tokentype::tk_rightbrace; - ++ix; - break; - case ',': - tk_type = tokentype::tk_comma; - ++ix; - break; - case ';': - tk_type = tokentype::tk_semicolon; - ++ix; - break; - case ':': - { - log && log("colon or assignment token"); - - if (*(ix + 1) == '=') { - tk_type = tokentype::tk_assign; - ++ix; - ++ix; - } else { - tk_type = tokentype::tk_colon; - ++ix; - } - break; - } - default: - break; - } - - if (tk_type == tokentype::tk_invalid) { - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "illegal input character", - (ix - tk_start), - input_state_ref); - } - - if ((tk_type == tokentype::tk_i64) - || (tk_type == tokentype::tk_f64) - || (tk_type == tokentype::tk_symbol)) - { - /* note: capturing token text here; - * for numeric literals will re-parse in token::i64_value() / token::f64_value() - */ - tk_text = std::string(tk_start, tk_end); - } else if (tk_type == tokentype::tk_string) { - ; /* nothing to do here -- desired tk_text already constructed */ - } - - if (tk_type == tokentype::tk_symbol) { - /* check for keywords */ - - bool keep_text = false; - - if ((tk_text == "true") || (tk_text == "false")) { - tk_type = tokentype::tk_bool; - keep_text = true; - } else if (tk_text == "type") { - tk_type = tokentype::tk_type; - } else if (tk_text == "def") { - tk_type = tokentype::tk_def; - } else if (tk_text == "lambda") { - tk_type = tokentype::tk_lambda; - } else if (tk_text == "if") { - tk_type = tokentype::tk_if; - } else if (tk_text == "then") { - tk_type = tokentype::tk_then; - } else if (tk_text == "else") { - tk_type = tokentype::tk_else; - } else if (tk_text == "let") { - tk_type = tokentype::tk_let; - } else if (tk_text == "in") { - tk_type = tokentype::tk_in; - } else if (tk_text == "end") { - tk_type = tokentype::tk_end; - } else { - /* keep as symbol */ - keep_text = true; - } - - if (!keep_text) - tk_text.clear(); - } - - /* input.prefix(0): - * require caller preserves current input line until it's entirely exhausted - */ - return result_type(token_type(tk_type, std::move(tk_text)), - input_state_ref.current_line().prefix(0)); - } /*assemble_token*/ - - /* TODO: input_state_ as argument ? */ - template - auto - tokenizer::assemble_final_token(const span_type & token_text, - const input_state_type & input_state) -> result_type - { - return assemble_token(0 /*initial_whitespace*/, - token_text, - input_state); - } - - template - auto - tokenizer::scan(const span_type & input, - bool eof_flag) -> result_type - { - scope log(XO_DEBUG(input_state_.debug_flag())); - - log && log(xtag("input", input)); - - /* - Always at beginning of token when scan() invoked - * - scan will not report any portion of line as consumed until it has - * emitted all tokens in that line. - * rationale: caller is allowed to discard storage that - * scan() reports as consumed. But will be holding that line - * until all tokens have been read. - * - this means caller will typically call scan() - * with the same input span multiple times - */ - - /* automagically no-ops when the same input presented twice */ - this->input_state_.capture_current_line(input, eof_flag); - - const CharT * ix = this->input_state_.skip_leading_whitespace(); - - if(ix == input.hi()) { - log && log("end input -> consume current line"); - - /* entirety of current line has been tokenized - * -> caller may consume it - */ - return result_type::make_whitespace(this->input_state_.consume_current_line()); - } - - /* ix: if ix < input.hi: first non-whitespace character after input_state_.current_pos_ */ - - // TODO: - // 1. hoist complete_flag up here - // 2. use in each branch - // 3. common check for prefix-capturing after if-cascade below done - - /* here: *ix is not whitespace */ - - auto whitespace_z = input_state_.whitespace(); - - log && log(xtag("whitespace_z", whitespace_z)); - - /* tk_start points to known beginning of token - * (after any whitespace) - * - * goal is to leave ix pointing to 1 char past the end of the token - */ - const CharT * tk_start = ix; - - if (is_1char_punctuation(*ix)) { - /* 1-character token */ - ++ix; - } else if (is_2char_punctuation(*ix)) { - CharT ch1 = *ix; - - (void)ch1; - - ++ix; - -#ifdef OBSOLETE // no longer a thing. either input ends in whitespace, or ends translation unit - if (ix == input.hi()) { - /* need more input to know if/when token complete */ - this->prefix_ += std::string(tk_start, input.hi()); - - log && log(xtag("captured-prefix1", this->prefix_)); - } else -#endif - { - CharT ch2 = *ix; - - if (((ch2 >= '0') && (ch2 <= '9')) - || ((ch2 >= 'A') && (ch2 <= 'Z')) - || ((ch2 >= 'a') && (ch2 <= 'z'))) - { - /* treat as 1 char punctuation */ - ; - } else { - /* include next char */ - ++ix; - } - } - } else if (*ix == '"') { - bool complete_flag = false; - - /* 1. embedded space/tab allowed in string literal. - * 2. embedded newline/cr not allowed. - */ - CharT prev_ch = '"'; - - ++ix; - - for (; ix != input.hi(); ++ix) { - /* looking for unescaped " char to end literal */ - if (*ix == '"') { - if (prev_ch != '\\') { - ++ix; /* include terminating " for assemble_token */ - complete_flag = true; - break; - } - } else if ((*ix == '\n') || (*ix == '\r')) { - log && log ("string literal with naked newline or CR"); - - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "must use \\n or \\r to encode newline/cr in string literal", - (ix - tk_start), - this->input_state_); - } - - prev_ch = *ix; - } - - if (!complete_flag) { - log && log("unterminated string literal"); - - return result_type::make_error_consume_current_line - (__FUNCTION__ /*src_function*/, - "unterminated string literal", - (ix - tk_start), - this->input_state_); - } - } else { - /* ix is start of some token */ - - if (*ix == '-') { - /* this section load-bearing for input '->' scanning from beginning of token */ - ++ix; - - if (ix == input.hi()) { - /* need more input to know if/when token complete -- see captured-prefix5 below */ - } else { - CharT ch2 = *ix; - - if (ch2 == '>') { - /* include next char and complete token */ - ++ix; - - log && log("complete '->' token"); - - this->input_state_.advance_until(ix); - - return assemble_token(whitespace_z, - span_type(tk_start, ix) /*token*/, - input_state_); - } - - /* here: -123, -.5e-21 for example */ - } - } else if (*ix == '>') { - /* this section load-bearing for input '>=' scanning from beginning of token. - * Need this because '>' necessarily excluded from is_1char_punctuation() - */ - ++ix; - - if (ix == input.hi()) { - /* need more input to know if/when token complete -- see captured-prefix5 below */ - } else { - CharT ch2 = *ix; - - if (ch2 != '=') { - log && log("complete '>=' token"); - - this->input_state_.advance_until(ix); - - /* ignore next char and complete token */ - return assemble_token(whitespace_z, - span_type(tk_start, ix) /*token*/, - this->input_state_); - } - - /* here: >= for example */ - } - } - - /* scan until: - * - whitespace - * - punctuation - */ - for (; ix != input.hi(); ++ix) { - if (input_state_type::is_whitespace(*ix) - || is_1char_punctuation(*ix) - || is_2char_punctuation(*ix)) - { - break; - } - - /* this section load-bearing for input '>' after beginning of a token, e.g. p> */ - if ((ix > tk_start) && (*ix == '>')) - break; - - /* this section load-bearing for input '->' at the end of another token, e.g. p->q */ - if (*ix == '-') { - if (ix + 1 == input.hi()) { - /* need more input to know if/when token complete - * - * apple-banana parses as: {tk_symbol: apple-banana} - * apple-> parses as: {tk_symbol: apple} {tk_yields} - * apple- illegal (may not end symbol with '-') - */ - break; - } - - if (*(ix + 1) == '>') { - /* treat '->' as punctuation; complete preceding token */ - break; - } - } - } - -#ifdef OBSOLETE - if (ix == input.hi()) { - /* need more input to know if/when token complete */ - this->prefix_ += std::string(tk_start, input.hi()); - - log && log(xtag("captured-prefix5", this->prefix_)); - } -#endif - } - - log && log("assemble token z", xtag("token_z", ix - tk_start)); - - assert(tk_start < ix); - - this->input_state_.advance_until(ix); - - return assemble_token(whitespace_z, - span_type(tk_start, ix) /*token*/, - this->input_state_); - } /*scan*/ - -#ifdef OBSOLETE - template - auto - tokenizer::scan2(const span_type & input, bool eof) -> result_type { - scope log(XO_DEBUG(input_state_.debug_flag())); - - auto sr = this->scan(input); - - if (sr.is_token() || sr.is_error() || !eof) - return sr; - - /* control here only if input contains no unambiguous tokens. - * This implies it contains _at most one_ final token. - */ - - span_type input2 = input.after_prefix(sr.consumed()); - - /* need to include src.consumed() in retval */ - - auto sr2 = this->notify_eof(input2); - - return result_type(sr2.get_token(), - span_type::concat(sr.consumed(), sr2.consumed()), - sr2.error()); - } -#endif - -#ifdef OBSOLETE - template - auto - tokenizer::consume(const span_type & consumed, - const span_type & input) -> span_type - { - this->input_state_.consume(consumed.size()); - - return input.after_prefix(consumed); - } -#endif - - template - void - tokenizer::discard_current_line() - { - this->input_state_.discard_current_line(); - } - -#ifdef OBSOLETE - template - auto - tokenizer::notify_eof(const span_type & input) -> result_type { - scope log(XO_DEBUG(input_state_.debug_flag())); - - log && log(xtag("prefix_", prefix_), xtag("prefix_.size", prefix_.size()), xtag("input", input)); - - /* almost meretricious to include input here, - * when called from scan2() it can only be whitespace - */ - return result_type::make_whitespace(input); - } /*notify_eof*/ -#endif - } /*namespace scm*/ -} /*namespace xo*/ - -/* end tokenizer.hpp */ diff --git a/.xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp b/.xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp deleted file mode 100644 index 6a673e53..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/tokenizer_error.hpp +++ /dev/null @@ -1,162 +0,0 @@ -/* file tokenizer_error.hpp - * - * author: Roland Conybeare, Jun 2025 - */ - -#pragma once - -#include "input_state.hpp" -#include "tokentype.hpp" -#include "span.hpp" -#include - -namespace xo { - namespace scm { - /** @class tokenizer_error - * @brief represent a lexing error, with context - * - * @tparam CharT representation for single characters - **/ - template - class tokenizer_error { - public: - using input_state_type = input_state; - using span_type = span; - - public: - /** @defgroup tokenizer-error-ctors **/ - ///@{ - - /** Default ctor represents a not-an-error sentinel object **/ - tokenizer_error() = default; - /** Constructor to capture parsing error context - * @p tk_start current position on entry to scanner - * @p error_pos error location relative to token start - **/ - tokenizer_error(const char * src_function, - std::string error_description, - const input_state_type & input_state, - size_t error_pos) - : src_function_{src_function}, - error_description_{std::move(error_description)}, - input_state_{input_state}, - error_pos_{error_pos} - { - scope log(XO_DEBUG(input_state.debug_flag())); - - log && log(xtag("input_state.current_pos", input_state.current_pos()), - xtag("error_pos", error_pos)); - } - ///@} - - /** @defgroup tokenizer-error-access-methods **/ - ///@{ - - const char * src_function() const { return src_function_; } - const std::string & error_description() const { return error_description_; } -#pragma GCC diagnostic push -#ifndef __APPLE__ -#pragma GCC diagnostic ignored "-Wchanges-meaning" -#endif - const input_state_type & input_state() const { return input_state_; } -#pragma GCC diagnostic pop - size_t tk_start() const { return input_state_.current_pos(); } - size_t whitespace() const { return input_state_.whitespace(); } - size_t error_pos() const { return error_pos_; } - - ///@} - - /** @defgroup tokenizer-error-general-methods **/ - ///@{ - - /** true, except for a sentinel error object **/ - bool is_error() const { return !error_description_.empty(); } - /** false except for object in sentinel state **/ - bool is_not_an_error() const { return error_description_.empty(); } - - /** Print representation to stream @p os. Intended for tokenizer diagnostics. - * For Schematika errors prefer @ref report - **/ - void print(std::ostream & os) const; - - /** Print human-oriented error report on @p os. **/ - void report(std::ostream & os) const; - - ///@} - - private: - /** @defgroup tokenizer-error-vars **/ - ///@{ - - /** source location (in tokenizer) at which error identified **/ - char const * src_function_ = nullptr; - /** static error description **/ - std::string error_description_; - /** input state associated with this error. - * Sufficient to precisely locate it with context. - **/ - input_state_type input_state_; - /** position (relative to @ref tk_entry_) of error **/ - size_t error_pos_ = 0; - - ///@} - }; /*error_token*/ - - template - void - tokenizer_error::print(std::ostream & os) const { - os << ""; - } - - template - void - tokenizer_error::report(std::ostream & os) const { - using namespace std; - - if (!error_description_.empty()) { - const char * prefix = "input: "; - /* input_state.tk_start: position of first character in token - * input_state.current_pos: position of first character following preceding token. - * error_pos: position (relative to start) at which failure detected - */ - const size_t tk_start = input_state_.tk_start(); - const size_t tk_indent = (strlen(prefix) + tk_start); - const size_t error_pos = 1 + tk_start + error_pos_; - - os << "token col: " << tk_start << ", error col: " << error_pos << "\n"; - os << prefix; - for (const char *p = input_state_.current_line().lo(), - *e = input_state_.current_line().hi(); p < e; ++p) - { - os << *p; - } - //os << endl; - os << std::setw(tk_indent) << " "; - - for (size_t i = 0; i < error_pos_; ++i) { - os << '_'; - } - os << '^' << endl; - - os << error_description_ << endl; - } - } - - template - inline std::ostream & - operator<< (std::ostream & os, - const tokenizer_error & tkerr) - { - tkerr.print(os); - return os; - } - } /*namespace scm*/ -} /*namespace xo*/ - -/* end tokenizer_error.hpp */ diff --git a/.xo-tokenizer/include/xo/tokenizer/tokentype.hpp b/.xo-tokenizer/include/xo/tokenizer/tokentype.hpp deleted file mode 100644 index eeeb7dd0..00000000 --- a/.xo-tokenizer/include/xo/tokenizer/tokentype.hpp +++ /dev/null @@ -1,192 +0,0 @@ -/** @file tokentype.hpp - * - * author: Roland Conybeare, Jul 2024 - **/ - -#pragma once - -#include "xo/indentlog/print/tag.hpp" // for STRINGIFY -#include "xo/indentlog/print/ppdetail_atomic.hpp" -#include - -namespace xo { - namespace scm { - /** @enum tokentype - * Enum to identify different schematika input token types - * - * Schematica code examples: - * - * @code - * type point :: { xcoord : f64, ycoord : f64 }; - * type matrix :: array; // 2-d array - * - * decl hypot(x : f64, y : f64) -> f64; - * - * def hypot(x : f64, y : f64) { - * let - * x2 = (x * x); - * y2 = (y * y); - * hypot2 = (x2 + y2); - * in - * sqrt(hypot2); - * }; - * - * def someconst 4; - * - * def foo(v : vec) { - * def (pi : f64) = 3.1415926; - * def (h : (f64,f64) -> f64) = hypot; - * - * h = hypot3; - * }; - * - * def matrixproduct(x : matrix, y : matrix) { - * [i, j : x.row(i) * y.col(j)]; - * }; - * @endcode - **/ - enum class tokentype { - /** sentinel value **/ - tk_invalid = -1, - - /** a boolean constant **/ - tk_bool, - - /** an integer constant (signed 64-bit integer) **/ - tk_i64, - - /** a 64-bit floating-point constant **/ - tk_f64, - - /** a string literal **/ - tk_string, - - /** a symbol **/ - tk_symbol, - - /** left-hand parenthesis @c '(' **/ - tk_leftparen, - - /** right-hand parenthesis @c ')' **/ - tk_rightparen, - - /** left-hand bracket @c '[' **/ - tk_leftbracket, - - /** right-hand bracket @c ']' **/ - tk_rightbracket, - - /** left-hand brace @c '{' **/ - tk_leftbrace, - - /** right-hand brace @c '}' **/ - tk_rightbrace, - - /** left-hand angle bracket @c '<' **/ - tk_leftangle, - - /** right-hand angle bracket @c '>' **/ - tk_rightangle, - - /** less-equal @c '<=' **/ - tk_lessequal, - - /** great-equal @c '>=' **/ - tk_greatequal, - - /** dot @c '.' **/ - tk_dot, - - /** comma @c ',' **/ - tk_comma, - - /** colon @c ':' **/ - tk_colon, - - /** double-colon @c '::' **/ - tk_doublecolon, - - /** semi-colon @c ';' **/ - tk_semicolon, - - /** single equals sign @c '=' **/ - tk_singleassign, - - /** assignment @c ':=' **/ - tk_assign, - - /** indirection @c '->' **/ - tk_yields, - - /** note: operators not treated as punctuation - * 'do-always' is a legal variable name, - * as is 'maybe*2', 'maybe+1', 'path/to/foo' - **/ - - /** operator @c '+' **/ - tk_plus, - /** operator @c '-' **/ - tk_minus, - /** operator @c '*' **/ - tk_star, - /** operator @c '/' **/ - tk_slash, - - /** operator @c '==' **/ - tk_cmpeq, - /** operator @c '!=' **/ - tk_cmpne, - - /** keyword @c 'type' **/ - tk_type, - - /** keyword @c 'def' **/ - tk_def, - - /** keyword @c 'lambda' **/ - tk_lambda, - - /** keyword @c 'if' **/ - tk_if, - - /** keyworkd @c 'then' **/ - tk_then, - - /** keyword @c 'else' **/ - tk_else, - - /** keyword @c 'let' **/ - tk_let, - - /** keyword @c 'in' **/ - tk_in, - - /** keyword @c 'end' **/ - tk_end, - - /** counts number of entries **/ - n_tokentype - }; /*tokentype*/ - - /** String representation for enum value. - * For example @c tokentype_descr(tokentype::tk_if) -> @c "if" - **/ - extern char const * - tokentype_descr(tokentype tk_type); - - /** Print enum value for @p tk_type on stream @p os **/ - inline std::ostream & - operator<< (std::ostream & os, tokentype tk_type) { - os << tokentype_descr(tk_type); - return os; - } - } /*namespace scm*/ - -#ifndef ppdetail_atomic - namespace print { - PPDETAIL_ATOMIC(xo::scm::tokentype); - } /*namespace print*/ -#endif -} /*namespace xo*/ - -/* end tokentype.hpp */ diff --git a/.xo-tokenizer/src/tokenizer/CMakeLists.txt b/.xo-tokenizer/src/tokenizer/CMakeLists.txt deleted file mode 100644 index 505b2040..00000000 --- a/.xo-tokenizer/src/tokenizer/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -# tokenizer/CMakeLists.txt - -set(SELF_LIB xo_tokenizer) -set(SELF_SRCS - tokentype.cpp - token.cpp) - -xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS}) -xo_dependency(${SELF_LIB} indentlog) - -# end CMakeLists.txt diff --git a/.xo-tokenizer/src/tokenizer/token.cpp b/.xo-tokenizer/src/tokenizer/token.cpp deleted file mode 100644 index 2ed92ad5..00000000 --- a/.xo-tokenizer/src/tokenizer/token.cpp +++ /dev/null @@ -1,9 +0,0 @@ -/** @file token.cpp - * - * author: Roland Conybeare - **/ - -#include "token.hpp" -#include - -/** end token.cpp **/ diff --git a/.xo-tokenizer/src/tokenizer/tokentype.cpp b/.xo-tokenizer/src/tokenizer/tokentype.cpp deleted file mode 100644 index 33d683de..00000000 --- a/.xo-tokenizer/src/tokenizer/tokentype.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/* file tokentype.cpp - * - * author: Roland Conybeare - */ - -#include "tokentype.hpp" - -namespace xo { - namespace scm { - char const * - tokentype_descr(tokentype tk_type) - { -#define CASE(x) case tokentype::x: return STRINGIFY(x) - - switch(tk_type) { - CASE(tk_bool); - CASE(tk_i64); - CASE(tk_f64); - CASE(tk_string); - CASE(tk_symbol); - CASE(tk_leftparen); - - CASE(tk_rightparen); - CASE(tk_leftbracket); - CASE(tk_rightbracket); - CASE(tk_leftbrace); - CASE(tk_rightbrace); - - CASE(tk_leftangle); - CASE(tk_rightangle); - CASE(tk_lessequal); - CASE(tk_greatequal); - CASE(tk_dot); - CASE(tk_comma); - CASE(tk_colon); - - CASE(tk_doublecolon); - CASE(tk_semicolon); - CASE(tk_singleassign); - CASE(tk_assign); - CASE(tk_yields); - - CASE(tk_plus); - CASE(tk_minus); - CASE(tk_star); - CASE(tk_slash); - - CASE(tk_cmpeq); - CASE(tk_cmpne); - - CASE(tk_type); - CASE(tk_def); - CASE(tk_lambda); - CASE(tk_if); - CASE(tk_then); - CASE(tk_else); - CASE(tk_let); - - CASE(tk_in); - CASE(tk_end); - - case tokentype::tk_invalid: - case tokentype::n_tokentype: - return "?tokentype"; - } - -#undef CASE - - return "???"; - } /*tokentype_descr*/ - } /*namespace scm*/ -} /*namespace xo*/ - -/* end tokentype.cpp */ diff --git a/.xo-tokenizer/utest/CMakeLists.txt b/.xo-tokenizer/utest/CMakeLists.txt deleted file mode 100644 index cc080294..00000000 --- a/.xo-tokenizer/utest/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# build unittest tokenizer/utest - -set(SELF_EXECUTABLE_NAME utest.tokenizer) -set(SELF_SOURCE_FILES - tokenizer_utest_main.cpp - tokenizer.test.cpp - token.test.cpp) - -xo_add_utest_executable(${SELF_EXECUTABLE_NAME} ${SELF_SOURCE_FILES}) -xo_self_dependency(${SELF_EXECUTABLE_NAME} xo_tokenizer) -xo_external_target_dependency(${SELF_EXECUTABLE_NAME} Catch2 Catch2::Catch2) - -# end CMakeLists.txt diff --git a/.xo-tokenizer/utest/token.test.cpp b/.xo-tokenizer/utest/token.test.cpp deleted file mode 100644 index 80ee6e4f..00000000 --- a/.xo-tokenizer/utest/token.test.cpp +++ /dev/null @@ -1,266 +0,0 @@ -/* file token.test.cpp - * - * author: Roland Conybeare - */ - -#include "xo/tokenizer/token.hpp" -#include -#include - -namespace xo { - using token = xo::scm::token; - using xo::scm::tokentype; - - namespace ut { - // also see tokenizer.test.cpp for syntax - - namespace test2 { - struct testcase_i64 { - std::string text_; - bool expect_throw_; - std::int64_t expected_; - }; - - std::vector s_testcase_v = { - {"", true, 0}, - {"0", false, 0}, - {"-", true, 0}, - {"+", true, 0}, - {"-0", false, 0}, - {"+0", false, 0}, - {"1", false, 1}, - {"-1", false, -1}, - {"9", false, 9}, - {"-9", false, -9}, - {"12", false, 12}, - {"+12", false, 12}, - {"-12", false, -12}, - {"99", false, 99}, - {"-99", false, -99}, - {"123x", true, 0}, - }; - - TEST_CASE("parse-i64", "[token]") { - for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { - INFO(xtag("i_tc", i_tc)); - - auto const & testcase = s_testcase_v[i_tc]; - - token tk(tokentype::tk_i64, - testcase.text_); - - REQUIRE(tk.tk_type() == tokentype::tk_i64); - - bool throw_flag = false; - try { - std::int64_t x = tk.i64_value(); - - REQUIRE(x == testcase.expected_); - } catch (std::exception & ex) { - throw_flag = true; - } - - REQUIRE(throw_flag == testcase.expect_throw_); - } - } - } - - namespace test3 { - TEST_CASE("error-i64", "[token]") { - token tk(tokentype::tk_i64, "+"); - - bool throw_flag = false; - - try { - tk.i64_value(); - } catch(std::exception & ex) { - throw_flag = true; - } - - REQUIRE(throw_flag); - } - } - - namespace test4 { - struct testcase_f64 { - std::string text_; - bool expect_throw_; - double expected_; - }; - - std::vector s_testcase_v = { - {"", true, 0}, - {"0", false, 0}, - {"-", true, 0}, - {"+", true, 0}, - {"-0", false, 0}, - - {"+0", false, 0}, - {"1", false, 1}, - {"-1", false, -1}, - {"9", false, 9}, - {"-9", false, -9}, - - {"12", false, 12}, - {"+12", false, 12}, - {"-12", false, -12}, - {"99", false, 99}, - {"-99", false, -99}, - - {"123x", true, 0}, - {"0.0", false, 0.0}, - {"0.1", false, 0.1}, - {"0.12", false, 0.12}, - {"0.123", false, 0.123}, - - {"0.1234", false, 0.1234}, - {"0.12345", false, 0.12345}, - {"0.123456", false, 0.123456}, - {"0.1234567", false, 0.1234567}, - {"0.12345678", false, 0.12345678}, - - {"0.123456789", false, 0.123456789}, - {"+0.0", false, 0.0}, - {"+0.1", false, 0.1}, - {"+0.12", false, 0.12}, - {"+0.123", false, 0.123}, - - {"+0.1234", false, 0.1234}, - {"+0.12345", false, 0.12345}, - {"+0.123456", false, 0.123456}, - {"+0.1234567", false, 0.1234567}, - {"+0.12345678", false, 0.12345678}, - - {"+0.123456789", false, 0.123456789}, - {"+0.0e0", false, 0.0}, - {"+0.1e0", false, 0.1}, - {"+0.12e0", false, 0.12}, - {"+0.123e0", false, 0.123}, - - {"+0.1234e0", false, 0.1234}, - {"+0.12345e0", false, 0.12345}, - {"+0.123456e0", false, 0.123456}, - {"+0.1234567e0", false, 0.1234567}, - {"+0.12345678e0", false, 0.12345678}, - - {"+0.123456789e0", false, 0.123456789}, - {"+0.0e1", false, 00.}, - {"+0.1e1", false, 01.}, - {"+0.12e1", false, 01.2}, - {"+0.123e1", false, 01.23}, - - {"+0.1234e1", false, 01.234}, - {"+0.12345e1", false, 01.2345}, - {"+0.123456e1", false, 01.23456}, - {"+0.1234567e1", false, 01.234567}, - {"+0.12345678e1", false, 01.2345678}, - - {"+0.123456789e1", false, 01.23456789}, - {"+0.0E1", false, 00.}, - {"+0.1E1", false, 01.}, - {"+0.12E1", false, 01.2}, - {"+0.123E1", false, 01.23}, - - {"+0.1234E1", false, 01.234}, - {"+0.12345E1", false, 01.2345}, - {"+0.123456E1", false, 01.23456}, - {"+0.1234567E1", false, 01.234567}, - {"+0.12345678E1", false, 01.2345678}, - - {"+0.123456789E1", false, 01.23456789}, - {"+0.0e9", false, 0.0}, - {"+0.1e9", false, 0.1e9}, - {"+0.12e9", false, 0.12e9}, - {"+0.123e9", false, 0.123e9}, - - {"+0.1234e9", false, 0.1234e9}, - {"+0.12345e9", false, 0.12345e9}, - {"+0.123456e9", false, 0.123456e9}, - {"+0.1234567e9", false, 0.1234567e9}, - {"+0.12345678e9", false, 0.12345678e9}, - - {"+0.123456789e9", false, 0.123456789e9}, - {"-0.0", false, -0.0}, - {"-0.1", false, -0.1}, - {"-0.12", false, -0.12}, - {"-0.123", false, -0.123}, - - {"-0.1234", false, -0.1234}, - {"-0.12345", false, -0.12345}, - {"-0.123456", false, -0.123456}, - {"-0.1234567", false, -0.1234567}, - {"-0.12345678", false, -0.12345678}, - - {"-0.123456789", false, -0.123456789}, - {"00.", false, 0.0}, - {"01.", false, 1.0}, - {"01.2", false, 1.2}, - {"01.23", false, 1.23}, - - {"01.234", false, 1.234}, - {"01.2345", false, 1.2345}, - {"01.23456", false, 1.23456}, - {"01.234567", false, 1.234567}, - {"01.2345678", false, 1.2345678}, - - {"01.23456789", false, 1.23456789}, - {"0.0", false, 0.0}, - {"1.2", false, 1.2}, - {"12.", false, 12.0}, - {"12.3", false, 12.3}, - - {"12.34", false, 12.34}, - {"12.345", false, 12.345}, - {"12.3456", false, 12.3456}, - {"12.34567", false, 12.34567}, - {"12.345678", false, 12.345678}, - - {"12.3456789", false, 12.3456789}, - {"01.23", false, 1.23}, - {"12.3", false, 12.3}, - {"123.", false, 123.0}, - {"123.4", false, 123.4}, - - {"123.45", false, 123.45}, - {"123.456", false, 123.456}, - {"123.4567", false, 123.4567}, - {"123.45678", false, 123.45678}, - {"123.456789", false, 123.456789}, - }; - - TEST_CASE("parse-f64", "[token]") { - for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { - auto const & testcase = s_testcase_v[i_tc]; - - INFO(tostr(xtag("i_tc", i_tc), - xtag("text", testcase.text_) - )); - - token tk(tokentype::tk_f64, - testcase.text_); - - REQUIRE(tk.tk_type() == tokentype::tk_f64); - - bool throw_flag = false; - std::string ex_msg; - - try { - double x = tk.f64_value(); - - REQUIRE(x == Approx(testcase.expected_).epsilon(1.0e-15)); - } catch (std::exception & ex) { - ex_msg = ex.what(); - - throw_flag = true; - } - - INFO(xtag("ex_msg", ex_msg)); - - REQUIRE(throw_flag == testcase.expect_throw_); - } - } - } /*namespace*/ - } /*namespace ut*/ -} /*namespace xo*/ - -/* end token.test.cpp */ diff --git a/.xo-tokenizer/utest/tokenizer.test.cpp b/.xo-tokenizer/utest/tokenizer.test.cpp deleted file mode 100644 index 604b9d25..00000000 --- a/.xo-tokenizer/utest/tokenizer.test.cpp +++ /dev/null @@ -1,576 +0,0 @@ -/* file tokenizer.test.cpp - * - * author: Roland Conybeare - */ - -#include "xo/tokenizer/tokenizer.hpp" -#include - -namespace xo { - using xo::scm::tokentype; - using token = xo::scm::token; - using xo::scm::span; - - namespace ut { - /** Two-pass test harness. - * - * First pass - verify test assertions. - * Second pass only if first pass failed. - * On second pass, enable verbose logging - **/ - struct rehearser { - rehearser(std::uint32_t att = 0) : attention_{att} {} - - /* expect at most one iterator to exist per TestRehearser instance **/ - struct iterator { - explicit iterator(rehearser* parent) : parent_{parent} {} - - iterator& operator++(); - std::uint32_t operator*() { return parent_->attention_; } - - bool operator==(const iterator& ix2) const { - return (parent_ == ix2.parent_); - } - - rehearser* parent_ = nullptr; - std::uint32_t attention_ = 0; - - }; - - bool is_first_pass() const { return attention_ == 0; } - bool is_second_pass() const { return attention_ == 1; } - bool enable_debug() const { return is_second_pass(); } - - iterator begin() { return iterator(this); } - iterator end() { return iterator(nullptr); } - - public: - /** pass number: 0 or 1 **/ - std::uint32_t attention_ = 0; - /** @brief set to true when test starts; false if first pass fails **/ - bool ok_flag_ = true; - }; - - auto rehearser::iterator::operator++() -> iterator& - { - if (parent_) - ++(parent_->attention_); - - if (parent_->ok_flag_ && (parent_->attention_ == 1)) { - /* skip 2nd pass */ - ++(parent_->attention_); - } - - if (parent_->attention_ == 2) - parent_ = nullptr; - - return *this; - } - - /* use this instead of REQUIRE(expr) in context of a test_rehearser */ -# define REHEARSE(rehearser, expr) \ - if (rehearser.is_first_pass()) { \ - bool _f = (expr); \ - rehearser.ok_flag_ = rehearser.ok_flag_ && _f; \ - } else { \ - REQUIRE(expr); \ - } - - /* note: trivial REQUIRE() call in else branch bc we still want - * catch2 to count assertions when verification succeeds - */ -# define REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr) \ - if (catch_flag) { \ - REQUIRE((expr)); \ - } else { \ - REQUIRE(true); \ - ok_flag &= (expr); \ - } - -# define REQUIRE_ORFAIL(ok_flag, catch_flag, expr) \ - REQUIRE_ORCAPTURE(ok_flag, catch_flag, expr); \ - if (!ok_flag) \ - return ok_flag - - namespace { - struct testcase_tkz { - std::string input_; - bool expect_throw_; - token expected_tk_; - bool consume_all_; - }; - - std::vector - s_testcase_v = { - /* - * - * expect_throw consume_all - * v v - */ - {"<", false, token::leftangle(), true}, - /* possible prefix of >= */ - {">", false, token::rightangle(), true}, - {"> ", false, token::rightangle(), true}, - - {"(", false, token::leftparen(), true}, - {")", false, token::rightparen(), true}, - - {"[", false, token::leftbracket(), true}, - {"]", false, token::rightbracket(), true}, - - {"{", false, token::leftbrace(), true}, - {" {", false, token::leftbrace(), true}, - - {"\t{", false, token::leftbrace(), true}, - {"\n{", false, token::leftbrace(), true}, - {"}", false, token::rightbrace(), true}, - - {"0", false, token::i64_token("0"), true}, - {"1", false, token::i64_token("1"), true}, - {"12", false, token::i64_token("12"), true}, - {"123", false, token::i64_token("123"), true}, - {"1234", false, token::i64_token("1234"), true}, - - {"0 ", false, token::i64_token("0"), false}, - {"1 ", false, token::i64_token("1"), false}, - {"12 ", false, token::i64_token("12"), false}, - {"123 ", false, token::i64_token("123"), false}, - {"1234 ", false, token::i64_token("1234"), false}, - - {"1<", false, token::i64_token("1"), false}, - {"1>", false, token::i64_token("1"), false}, - {"1(", false, token::i64_token("1"), false}, - {"1)", false, token::i64_token("1"), false}, - {"1[", false, token::i64_token("1"), false}, - {"1]", false, token::i64_token("1"), false}, - {"1{", false, token::i64_token("1"), false}, - {"1}", false, token::i64_token("1"), false}, - {"1;", false, token::i64_token("1"), false}, - {"1:", false, token::i64_token("1"), false}, - {"1,", false, token::i64_token("1"), false}, - - {".1", false, token::f64_token(".1"), true}, - {".12", false, token::f64_token(".12"), true}, - {".123", false, token::f64_token(".123"), true}, - - {"+.1", false, token::f64_token("+.1"), true}, - {"+.12", false, token::f64_token("+.12"), true}, - {"+.123", false, token::f64_token("+.123"), true}, - - {"-.1", false, token::f64_token("-.1"), true}, - {"-.12", false, token::f64_token("-.12"), true}, - {"-.123", false, token::f64_token("-.123"), true}, - - {"1.", false, token::f64_token("1."), true}, - {"1.2", false, token::f64_token("1.2"), true}, - {"1.23", false, token::f64_token("1.23"), true}, - - {"1e0", false, token::f64_token("1e0"), true}, - {"1e-1", false, token::f64_token("1e-1"), true}, - {"1e1", false, token::f64_token("1e1"), true}, - {"1e+1", false, token::f64_token("1e+1"), true}, - - {"\"hello\"", false, token::string_token("hello"), true}, - /* tokenizer sees this input: - * "\"hi\", she said" - */ - {"\"\\\"hi\\\", she said\"", false, token::string_token("\"hi\", she said"), true}, - /* tokenizer sees this input: - * "look ma, newline ->\n<- " - */ - {"\"look ma, newline ->\\n<- \"", false, - token::string_token("look ma, newline ->\n<- "), true}, - /* tokenizer sees this input: - * "tab to the right [\t], to the right [\t]" - */ - {"\"tab to the right [\\t], to the right [\\t]\"", false, - token::string_token("tab to the right [\t], to the right [\t]"), true}, - - {".", false, token::dot(), true}, - {":", false, token::colon(), true}, - {",", false, token::comma(), true}, - {"=", false, token::singleassign(), true}, - {":=", false, token::assign_token(), true}, - {"->", false, token::yields(), true}, - - {"+", false, token::plus_token(), true}, - {"-", false, token::minus_token(), true}, - {"*", false, token::star_token(), true}, - {"/", false, token::slash_token(), true}, - - {"symbol", false, token::symbol_token("symbol"), true}, - {"another-symbol", false, token::symbol_token("another-symbol"), true}, - - {"type", false, token::type(), true}, - {"def", false, token::def(), true}, - {"lambda", false, token::lambda(), true}, - {"if", false, token::if_token(), true}, - {"let", false, token::let(), true}, - {"in", false, token::in(), true}, - {"end", false, token::end(), true}, - - }; - } - - TEST_CASE("tokenizer", "[tokenizer]") { - for (std::size_t i_tc = 0, n_tc = s_testcase_v.size(); i_tc < n_tc; ++i_tc) { - - const testcase_tkz & testcase = s_testcase_v[i_tc]; - - rehearser rh; - - for (auto _ : rh) { - scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer")); - - log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_)); - - using tokenizer - = xo::scm::tokenizer; - - tokenizer tkz(rh.enable_debug()); - tokenizer::span_type - in_span(testcase.input_.c_str(), - testcase.input_.c_str() + testcase.input_.size()); - - auto sr = tkz.scan(in_span, true /*eof*/); - - REHEARSE(rh, sr.get_token().tk_type() == testcase.expected_tk_.tk_type()); - if (sr.get_token().tk_type() == tokentype::tk_i64) - { - REHEARSE(rh, !sr.get_token().text().empty()); - REHEARSE(rh, sr.get_token().i64_value() == testcase.expected_tk_.i64_value()); - } else if (sr.get_token().tk_type() == tokentype::tk_f64) - { - REHEARSE(rh, !sr.get_token().text().empty()); - REHEARSE(rh, sr.get_token().f64_value() == testcase.expected_tk_.f64_value()); - } else if(sr.get_token().tk_type() == tokentype::tk_string) - { - /* sr.get_token().text() can be empty, consider input "" */ - REHEARSE(rh, sr.get_token().text() == testcase.expected_tk_.text()); - } else if(sr.get_token().tk_type() == tokentype::tk_symbol) - { - REHEARSE(rh, !sr.get_token().text().empty()); - REHEARSE(rh, sr.get_token().text() == testcase.expected_tk_.text()); - } else { - REHEARSE(rh, sr.get_token().text().empty()); - } - - /* must consume all input for tests we're doing here */ - if (testcase.consume_all_) { - REHEARSE(rh, sr.consumed() == in_span); - } else { - REHEARSE(rh, sr.consumed() != in_span); - } - } - } - } - - namespace { - struct testcase2_tkz { - std::string input_; - bool expect_throw_; - std::vector expected_tk_v_; - }; - - std::vector - s_testcase2_v = { - {"def foo : f64 = 3.141;", - false, - {token::def(), - token::symbol_token("foo"), - token::colon(), - token::symbol_token("f64"), - token::singleassign(), - token::f64_token("3.141"), - token::semicolon() - }}, - {"def foo = lambda (x : f64) { def y = x * x; y; }", - false, - {token::def(), - token::symbol_token("foo"), - token::singleassign(), - token::lambda(), - token::leftparen(), - token::symbol_token("x"), - token::colon(), - token::symbol_token("f64"), - token::rightparen(), - token::leftbrace(), - token::def(), - token::symbol_token("y"), - token::singleassign(), - token::symbol_token("x"), - token::star_token(), - token::symbol_token("x"), - token::semicolon(), - token::symbol_token("y"), - token::semicolon(), - token::rightbrace() - }}, -#ifdef TODO - {"a.b", - false, - {token::symbol_token("a"), - token::dot(), - token::symbol_token("b") - }}, -#endif - {"a,b", - false, - {token::symbol_token("a"), - token::comma(), - token::symbol_token("b") - }}, - {"a:b", - false, - {token::symbol_token("a"), - token::colon(), - token::symbol_token("b") - }}, - {"a;b", - false, - {token::symbol_token("a"), - token::semicolon(), - token::symbol_token("b") - }}, - {"a:=b", - false, - {token::symbol_token("a"), - token::assign_token(), - token::symbol_token("b") - }}, - {"a=b", - false, - {token::symbol_token("a"), - token::singleassign(), - token::symbol_token("b") - }}, - {"p->q", - false, - {token::symbol_token("p"), - token::yields(), - token::symbol_token("q") - }}, - {"a + b", - false, - {token::symbol_token("a"), - token::plus_token(), - token::symbol_token("b") - }}, - {"a - b", - false, - {token::symbol_token("a"), - token::minus_token(), - token::symbol_token("b") - }}, - {"a-b", - false, - {token::symbol_token("a-b"), - }}, - {"(apple)", - false, - {token::leftparen(), - token::symbol_token("apple"), - token::rightparen() - }}, - {"", - false, - {token::leftangle(), - token::symbol_token("apple"), - token::rightangle() - }}, - }; - } - - TEST_CASE("tokenizer2", "[tokenizer]") { - /* this time testing token sequences */ - - using tokenizer = xo::scm::tokenizer; - - for (std::size_t i_tc = 0, n_tc = s_testcase2_v.size(); i_tc < n_tc; ++i_tc) { - const testcase2_tkz & testcase = s_testcase2_v[i_tc]; - - rehearser rh; - - for (auto _ : rh) { - scope log(XO_DEBUG2(rh.enable_debug(), "tokenizer2")); - - log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_)); - - tokenizer tkz(rh.enable_debug()); - - tokenizer::span_type - in_span(testcase.input_.c_str(), - testcase.input_.c_str() + testcase.input_.size()); - - for (int i_tk = 0, n_tk = testcase.expected_tk_v_.size(); - i_tk < n_tk; ++i_tk) - { - log && log(xtag("i_tk", i_tk)); - - auto sr = tkz.scan(in_span, in_span.empty()); - const auto & tk = sr.get_token(); - - if (tk.is_valid()) { - REHEARSE(rh, tk.tk_type() == testcase.expected_tk_v_[i_tk].tk_type()); - } - if (tk.tk_type() == tokentype::tk_i64) - { - REHEARSE(rh, !tk.text().empty()); - REHEARSE(rh, tk.i64_value() == testcase.expected_tk_v_[i_tk].i64_value()); - } else if (tk.tk_type() == tokentype::tk_f64) - { - REHEARSE(rh, !tk.text().empty()); - REHEARSE(rh, tk.f64_value() == testcase.expected_tk_v_[i_tk].f64_value()); - } else if(tk.tk_type() == tokentype::tk_string) - { - /* tk.text() can be empty, consider input "" */ - REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text()); - } else if(tk.tk_type() == tokentype::tk_symbol) - { - REHEARSE(rh, !tk.text().empty()); - REHEARSE(rh, tk.text() == testcase.expected_tk_v_[i_tk].text()); - } else { - REHEARSE(rh, tk.text().empty()); - } - - in_span = in_span.after_prefix(sr.consumed()); - } - } - } - } /*TEST_CASE(tokenizer2)*/ - - namespace { - using tkz_error_type = xo::scm::tokenizer_error; - using input_state_type = xo::scm::input_state; - using span_type = xo::scm::span; - - struct testcase_error { - std::string input_; - tkz_error_type expect_error_; - }; - - testcase_error - make_testcase(const char * input, const char * src_function, const char * error_descr, - size_t tk_start, size_t whitespace, size_t error_pos) - { - size_t line_no = 1; - - testcase_error retval; - retval.input_ = input; - retval.expect_error_ = tkz_error_type(src_function, error_descr, - input_state_type(span_type::from_string(retval.input_), - tk_start, whitespace), - error_pos); - return retval; - } - - std::vector - s_testcase3_v = { - // 012345678 - // --------v - make_testcase("123.456ez", - "assemble_token", - "unexpected character in numeric constant", - 0, 0, 8), - // 01 - // -v - make_testcase("1-3", - "assemble_token", - "improperly placed sign indicator", - 0, 0, 1), - // 012 - // --v - make_testcase("1..2", - "assemble_token", - "duplicate decimal point in numeric literal", - 0, 0, 2), - // o 0123456 - // ------v - make_testcase("1.23e4e", - "assemble_token", - "duplicate exponent marker in numeric literal", - 0, 0, 6), - // tokenizer sees string ["\"] - // 0 1 2 3 - // - - - v - make_testcase("\"\\\"", - "assemble_token", - "missing terminating '\"' to complete literal string", - //"expect \\ to escape one of n|t|r|\"|\\ in string literal", - 0, 0, 3), - // tokenizer sees literal with embedded newline - // 1 2 3 - // 01234567890123456789012345678901 2 - // -------------------------------- v - make_testcase("\"everything was going fine until\n\"", - "scan", - "must use \\n or \\r to encode newline/cr in string literal", - 0, 0, 32), - // tokenizer sees string ["\] - // 0 1 2 - // - - v - make_testcase("\"\\", - "assemble_token", - "expecting key following escape character \\", - 0, 0, 2), - // tokenizer sees string ["\q"] - // 0 12 - // - -v - make_testcase("\"\\q\"", - "assemble_token", - "expecting one of n|r|\"|\\ following escape \\", - 0, 0, 2), - // - make_testcase("#", - "assemble_token", - "illegal input character", - 0, 0, 0), - }; - - TEST_CASE("tokenizer3", "[tokenizer]") { - /* testing error handling */ - - using tokenizer = xo::scm::tokenizer; - - constexpr bool c_force_debug = false; - - for (std::size_t i_tc = 0, n_tc = s_testcase3_v.size(); i_tc < n_tc; ++i_tc) { - const testcase_error & testcase = s_testcase3_v[i_tc]; - - rehearser rh(0); - - for (auto _ : rh) { - scope log(XO_DEBUG2(c_force_debug || rh.enable_debug(), "tokenizer3")); - - log && log(xtag("pass", _), xtag("ok(-)", rh.ok_flag_)); - log && log(xtag("i_tc", i_tc), xtag("input", testcase.input_)); - - tokenizer tkz(c_force_debug || rh.enable_debug()); - - auto in_span = tokenizer::span_type::from_string(testcase.input_); - - auto sr = tkz.scan(in_span, true /*eof*/); - - REHEARSE(rh, sr.is_error()); - - if (sr.error().src_function()) { - REHEARSE(rh, std::string(sr.error().src_function()) == std::string(testcase.expect_error_.src_function())); - } - if (!sr.error().error_description().empty()) { - REHEARSE(rh, std::string(sr.error().error_description()) == std::string(testcase.expect_error_.error_description())); - } - REHEARSE(rh, sr.error().whitespace() == testcase.expect_error_.whitespace()); - REHEARSE(rh, sr.error().tk_start() == testcase.expect_error_.tk_start()); - REHEARSE(rh, sr.error().error_pos() == testcase.expect_error_.error_pos()); - - log && log(xtag("ok(+)", rh.ok_flag_)); - } - } - } - } - - } /*namespace ut*/ -} /*namespace xo*/ - -/* end tokenizer.test.cpp */ diff --git a/.xo-tokenizer/utest/tokenizer_utest_main.cpp b/.xo-tokenizer/utest/tokenizer_utest_main.cpp deleted file mode 100644 index c5e273c4..00000000 --- a/.xo-tokenizer/utest/tokenizer_utest_main.cpp +++ /dev/null @@ -1,6 +0,0 @@ -/* file tokenizer_utest_main.cpp */ - -#define CATCH_CONFIG_MAIN -#include "catch2/catch.hpp" - -/* end tokenizer_utest_main.cpp */