diff --git a/xo-expression/include/xo/expression/Constant.hpp b/xo-expression/include/xo/expression/Constant.hpp index a302a43b..e5c9b0a9 100644 --- a/xo-expression/include/xo/expression/Constant.hpp +++ b/xo-expression/include/xo/expression/Constant.hpp @@ -88,7 +88,7 @@ namespace xo { value_td_{Reflect::require()}, value_(x) { - static_assert(std::is_standard_layout_v && std::is_trivial_v); + //static_assert(std::is_standard_layout_v && std::is_trivial_v); } private: diff --git a/xo-interpreter/include/xo/interpreter/BuiltinPrimitives.hpp b/xo-interpreter/include/xo/interpreter/BuiltinPrimitives.hpp index feb85051..e19fc61e 100644 --- a/xo-interpreter/include/xo/interpreter/BuiltinPrimitives.hpp +++ b/xo-interpreter/include/xo/interpreter/BuiltinPrimitives.hpp @@ -14,19 +14,6 @@ namespace xo { public: using ObjectConverter = xo::obj::ObjectConverter; - /** install conversions for PrimitiveExpr -> Primitive - * for particular function pointer types Fn. - * - * Source type from xo-expression - * Dest type from xo-object. - * - * Module dependence goes the other way - * i.e. 
xo-interpreter -uses-> xo-expression - * -uses-> xo-object - * For this reason rejected adding a virtual method to PrimitiveExprInterface - **/ - static void install_interpreter_conversions(ObjectConverter * target); - template static void install_pm(gc::IAlloc * mm, rp pm_expr, gp env) { gp rhs diff --git a/xo-interpreter/src/interpreter/BuiltinPrimitives.cpp b/xo-interpreter/src/interpreter/BuiltinPrimitives.cpp index 7f49a069..bead7c87 100644 --- a/xo-interpreter/src/interpreter/BuiltinPrimitives.cpp +++ b/xo-interpreter/src/interpreter/BuiltinPrimitives.cpp @@ -23,12 +23,6 @@ namespace xo { return x + y; } - void - BuiltinPrimitives::install_interpreter_conversions(ObjectConverter * /*target*/) - { - /* abandoning this path */ - } - void BuiltinPrimitives::install(gc::IAlloc * mm, gp env) { diff --git a/xo-interpreter/src/interpreter/VirtualSchematikaMachine.cpp b/xo-interpreter/src/interpreter/VirtualSchematikaMachine.cpp index 9a02f22b..9a9c7ddf 100644 --- a/xo-interpreter/src/interpreter/VirtualSchematikaMachine.cpp +++ b/xo-interpreter/src/interpreter/VirtualSchematikaMachine.cpp @@ -110,7 +110,6 @@ namespace xo { toplevel_env_{env}, log_level_{ll} { - BuiltinPrimitives::install_interpreter_conversions(&object_converter_); } // ----- VirtualSchematikaMachine ----- @@ -321,7 +320,15 @@ namespace xo { VSM_CONTINUE(); } else { - /* see ObjectConverter::ctor to add more builtin types */ + /* see ObjectConverter::ctor to add more builtin types + * + * generally conversion for a type Foo will appear in Foo.hpp + * see + * xo/object/Boolean.hpp + * xo/object/Integer.hpp + * xo/object/Float.hpp + * xo/object/String.hpp + */ VSM_ERROR(tostr("constant_op: unable to convert native value to object", xtag("id", expr->value_tp().td()->id()), diff --git a/xo-object/include/xo/object/String.hpp b/xo-object/include/xo/object/String.hpp index 91975752..aadbdecd 100644 --- a/xo-object/include/xo/object/String.hpp +++ b/xo-object/include/xo/object/String.hpp @@ -3,11 +3,41 @@ 
* author: Roland Conybeare, Aug 2025 */ -#include "xo/alloc/IAlloc.hpp" #include "xo/alloc/Object.hpp" +#include "ObjectConversion.hpp" +#include "xo/alloc/IAlloc.hpp" namespace xo { namespace obj { + /** unicode terminology (via https://utf8everywhere.org) + * 1. code unit: + * bit combination that represents a unit of encoded text. + * 8-bits for utf-8 i.e. code-unit = char + * 2. code point: + * a numerical value in the unicode namespace, e.g. U+3243F + * one or more code units encode a code point. + * utf-8 uses 1-4 code units to encode each code point. + * 3. abstract character: + * inherently open, because includes characters that are not + * (yet) representable in unicode. + * 4. (en)coded character: + * mapping between code points and abstract character. + * for example U+1F428 is coded character for emoji named 'KOALA' + * caveats: + * - some code points do not have abstract characters assigned. + * - some code points are reserved for non-characters + * (e.g. null, newline ..) + * - some abstract characters require multiple code points: + * for example requiring a composition sequence + * - some abstract characters have multiple encodings + * 5. user-perceived character. whatever you think that means. + * May be language-dependent. + * 6. grapheme cluster. a sequence of coded characters that + * "belong together". for example backspace would erase + * a grapheme cluster atomically. + * 7. a shape within a font. A sequence of code points maps to + * a sequence of glyphs. + **/ class String : public Object { public: enum class owner { unique, shared }; @@ -32,6 +62,18 @@ namespace xo { const char * c_str() const { return chars_; } std::size_t length() const; + /** Approximate number of columns (if using a fixed-width font) occupied + * by this string. Obtained by counting bytes up to null terminator, + * omitting utf-8 continuation bytes, i.e. bytes with high bit set + * and 2nd-highest bit clear. 
+             *
+             *  @text
+             *    bits: 76543210
+             *          10______
+             *  @endtext
+             **/
+            std::size_t columns() const;
+
             // inherited from Object..
             virtual TaggedPtr self_tp() const final override;
             virtual void display(std::ostream & os) const final override;
@@ -48,11 +90,56 @@ namespace xo {
             /** true iff storage in @ref chars_ is owned by this String. **/
             owner owner_ = owner::shared;
 
-            /** length of @ref chars_ in bytes (storage allocated, not necessarily string length) **/
+            /** length of @ref chars_ in bytes (storage allocated, not necessarily string length).
+             *  Includes null terminator
+             **/
             std::size_t z_chars_ = 0;
 
-            /** string contents. always null-terminated **/
+            /** utf-8 string contents. always null-terminated.
+             *  Note that this is #of bytes
+             **/
             char * chars_ = nullptr;
         };
+
+        /** Conversions between a native std::string and a @ref String object.
+         *  Published through the ObjectConversion specialization below.
+         **/
+        struct ObjectConversion_String {
+            /** make a String from the contents of @p x, via String::copy(mm, x.c_str()) **/
+            static gp to_object(gc::IAlloc * mm, std::string x) {
+                return String::copy(mm, x.c_str());
+            }
+
+            /** copy the contents of @p x into a std::string.
+             *
+             *  When String::from(x) gives a null pointer (presumably: x is not a
+             *  String) the result is an empty string. Every path must return
+             *  a value: flowing off the end of a non-void function is undefined
+             *  behaviour.
+             *
+             *  NOTE(review): object_to_string() in ObjectConverter.cpp throws
+             *  std::runtime_error for the same condition. Matching that here
+             *  needs <stdexcept> in this header -- confirm which policy is wanted.
+             *
+             *  note: ignores allocator, always uses heap.
+             *  This will affect operation of primitives (if any) that
+             *  expect std::string. Alternative would be use IAlloc*,
+             *  with Blob wrapper (or without if/when need for iterable
+             *  memory is dropped).
+             **/
+            static std::string from_object(gc::IAlloc *, gp x) {
+                gp x_str = String::from(x);
+
+                if (!x_str.get()) {
+                    /* nothing to copy; still return a well-defined value */
+                    return std::string();
+                }
+
+                return std::string(x_str->c_str());
+            }
+        };
+
+        template <>
+        struct ObjectConversion : public ObjectConversion_String {};
     } /*namespace obj*/
 } /*namespace xo*/
diff --git a/xo-object/src/object/ObjectConverter.cpp b/xo-object/src/object/ObjectConverter.cpp
index 2c15e594..8ec4c620 100644
--- a/xo-object/src/object/ObjectConverter.cpp
+++ b/xo-object/src/object/ObjectConverter.cpp
@@ -7,6 +7,7 @@
 #include "Integer.hpp"
 #include "Float.hpp"
 #include "Boolean.hpp"
+#include "String.hpp"
 #include "TaggedPtr.hpp"
 
 #include "xo/alloc/Blob.hpp"
@@ -121,6 +122,30 @@ namespace xo {
 
             return Reflect::make_tp(bool_obj->value() ?
&s_true : &s_false);
         }
+
+            gp
+            string_to_object(IAlloc * mm, const TaggedPtr & src)
+            {
+                std::string * native = src.recover_native();
+                // precondition: src refers to a std::string (verified in debug builds only)
+                assert(native);
+
+                return String::copy(mm, native->c_str());
+            }
+
+            TaggedPtr
+            object_to_string(IAlloc * /*mm*/, gp obj)
+            {
+                gp string_obj = String::from(obj);
+                // null result from String::from(): obj is rejected as not-a-String
+                if (!string_obj.get()) {
+                    throw std::runtime_error(tostr("Object obj found where String expected",
+                                                   xtag("obj", obj)));
+                }
+                // TODO: no storage strategy yet for the std::string that the returned TaggedPtr would refer to.
+                // throw rather than assert(false): with NDEBUG the assert vanishes and control falls off the end
+                throw std::runtime_error("object_to_string: String -> std::string conversion not implemented");
+            }
         }
 
         ObjectConverter::ObjectConverter()
@@ -131,6 +156,8 @@ namespace xo {
             this->establish_conversion(&float_to_object, &object_to_float);
 
             this->establish_conversion(&bool_to_object, &object_to_bool);
+
+            this->establish_conversion(&string_to_object, &object_to_string);
         }
 
         gp
diff --git a/xo-object/src/object/String.cpp b/xo-object/src/object/String.cpp
index fdc3d3ba..b164c14a 100644
--- a/xo-object/src/object/String.cpp
+++ b/xo-object/src/object/String.cpp
@@ -99,6 +99,30 @@ namespace xo {
             return ::strlen(chars_);
         }
 
+        /* Approximate column count: number of bytes before the null terminator,
+         * not counting utf-8 continuation bytes (bit pattern 10xxxxxx).
+         * Never reads at or beyond chars_ + z_chars_.
+         */
+        std::size_t
+        String::columns() const
+        {
+            std::size_t n_column = 0;
+
+            /* test (p < e) before reading *p: an unterminated buffer, or
+             * chars_ == nullptr with z_chars_ == 0, must not be dereferenced
+             */
+            for (const char * p = chars_, * e = chars_ + z_chars_; (p < e) && *p; ++p) {
+                const unsigned char code_unit = static_cast<unsigned char>(*p);
+
+                if ((code_unit & 0xc0) != 0x80) {
+                    /* not a continuation byte: counts as one column */
+                    ++n_column;
+                }
+            }
+
+            return n_column;
+        }
+
         TaggedPtr
         String::self_tp() const {
             return Reflect::make_tp(const_cast(this));
diff --git a/xo-object/utest/String.test.cpp b/xo-object/utest/String.test.cpp
index 85e5c28c..17253553 100644
--- a/xo-object/utest/String.test.cpp
+++ b/xo-object/utest/String.test.cpp
@@ -44,6 +44,9 @@ namespace xo {
 
             std::vector s_testcase_v = {
                 Testcase_String(1024, 4096, 512, 512, {"hello"}),
+                // in emacs: C-x 8 RET lambda
+                //
+                Testcase_String(1024, 4096, 512, 512, {"λ"}),
                 Testcase_String(1024, 4096, 512, 512, {"hello", ", world!"})
             };
         }
@@ -168,6 +171,31 @@ namespace xo {
             }
         }
 
+        TEST_CASE("String.columns", "[String][unicode]")
+        {
+            const bool c_debug_flag = false;
+            up arena = ArenaAlloc::make("testarena",
16*1024, c_debug_flag); + + Object::mm = arena.get(); + + gp s0 = String::copy(""); + + REQUIRE(s0->columns() == 0); + REQUIRE(s0->length() == 0); + + gp s1 = String::copy("l"); + + REQUIRE(s1->columns() == 1); + REQUIRE(s1->length() == 1); + + gp s2 = String::copy("λ"); + + REQUIRE(s2->columns() == 1); + /* two code units in code point */ + REQUIRE(s2->length() == 2); + } + TEST_CASE("String.append", "[String]") { const bool c_debug_flag = false; diff --git a/xo-ordinaltree/include/xo/ordinaltree/RedBlackTree.hpp b/xo-ordinaltree/include/xo/ordinaltree/RedBlackTree.hpp index 7a3ea1e4..2c021c9b 100644 --- a/xo-ordinaltree/include/xo/ordinaltree/RedBlackTree.hpp +++ b/xo-ordinaltree/include/xo/ordinaltree/RedBlackTree.hpp @@ -338,7 +338,7 @@ namespace xo { ContentsType contents_; /* accumulator for some binary function of Values. * must be associative, since value will be produced - * by any testing of calls to Reduce::combine(). + * by any ordering of calls to Reduce::combine(). * * e.g. 
{a, b, c, d} could be reduced by: * r(r(a,b), r(c,d)) diff --git a/xo-reader/include/xo/reader/expect_expr_xs.hpp b/xo-reader/include/xo/reader/expect_expr_xs.hpp index 91cc7f31..6f6513e3 100644 --- a/xo-reader/include/xo/reader/expect_expr_xs.hpp +++ b/xo-reader/include/xo/reader/expect_expr_xs.hpp @@ -68,6 +68,9 @@ namespace xo { virtual void on_f64_token(const token_type & tk, parserstatemachine * p_psm) override; + virtual void on_string_token(const token_type & tk, + parserstatemachine * p_psm) final override; + /** update exprstate in response to a successfully-parsed subexpression **/ virtual void on_expr(bp expr, parserstatemachine * p_psm) override; diff --git a/xo-reader/include/xo/reader/exprseq_xs.hpp b/xo-reader/include/xo/reader/exprseq_xs.hpp index 22b07f9e..60b483ad 100644 --- a/xo-reader/include/xo/reader/exprseq_xs.hpp +++ b/xo-reader/include/xo/reader/exprseq_xs.hpp @@ -59,6 +59,8 @@ namespace xo { parserstatemachine * p_psm) override; virtual void on_f64_token(const token_type & tk, parserstatemachine * p_psm) override; + virtual void on_string_token(const token_type & tk, + parserstatemachine * p_psm) final override; // ----- victory methods ----- diff --git a/xo-reader/include/xo/reader/exprstate.hpp b/xo-reader/include/xo/reader/exprstate.hpp index c21f453f..ed3187c3 100644 --- a/xo-reader/include/xo/reader/exprstate.hpp +++ b/xo-reader/include/xo/reader/exprstate.hpp @@ -218,6 +218,10 @@ namespace xo { virtual void on_f64_token(const token_type & tk, parserstatemachine * p_psm); + /** handle incoming string-literal token **/ + virtual void on_string_token(const token_type & tk, + parserstatemachine * p_psm); + protected: /** throw exception when next token is inconsistent with * parsing state diff --git a/xo-reader/src/reader/expect_expr_xs.cpp b/xo-reader/src/reader/expect_expr_xs.cpp index c297b969..10caafeb 100644 --- a/xo-reader/src/reader/expect_expr_xs.cpp +++ b/xo-reader/src/reader/expect_expr_xs.cpp @@ -173,7 +173,7 @@ namespace xo { 
*/ progress_xs::start(var.promote(), p_psm); -#ifdef NOT_YET + #ifdef NOT_YET p_stack->push_exprstate(exprstate(exprstatetype::expr_progress, Variable::make(name, type))); #endif @@ -227,6 +227,21 @@ namespace xo { p_psm); } + void + expect_expr_xs::on_string_token(const token_type & tk, + parserstatemachine * p_psm) + { + scope log(XO_DEBUG(p_psm->debug_flag())); + + /* e.g. + * def msg = "hello, world"; + * \----tk----/ + */ + progress_xs::start + (Constant::make(tk.text()), + p_psm); + } + void expect_expr_xs::on_expr(bp expr, parserstatemachine * p_psm) diff --git a/xo-reader/src/reader/expect_type_xs.cpp b/xo-reader/src/reader/expect_type_xs.cpp index 93fd1bbd..62d797f5 100644 --- a/xo-reader/src/reader/expect_type_xs.cpp +++ b/xo-reader/src/reader/expect_type_xs.cpp @@ -45,6 +45,8 @@ namespace xo { if (tk.text() == "bool") td = Reflect::require(); + else if (tk.text() == "str") + td = Reflect::require(); else if (tk.text() == "f64") td = Reflect::require(); else if(tk.text() == "f32") diff --git a/xo-reader/src/reader/exprseq_xs.cpp b/xo-reader/src/reader/exprseq_xs.cpp index 7bbb30b6..48307d77 100644 --- a/xo-reader/src/reader/exprseq_xs.cpp +++ b/xo-reader/src/reader/exprseq_xs.cpp @@ -181,7 +181,7 @@ namespace xo { scope log(XO_DEBUG(p_psm->debug_flag())); - constexpr const char * c_self_name = "exprseq_xs::on_i64_token"; + constexpr const char * c_self_name = "exprseq_xs::on_f64_token"; if (xseqtype_ == exprseqtype::toplevel_interactive) { @@ -199,6 +199,44 @@ namespace xo { } } + void + exprseq_xs::on_string_token(const token_type & tk, + parserstatemachine * p_psm) + { + using xo::scm::Constant; + + scope log(XO_DEBUG(p_psm->debug_flag())); + + constexpr const char * c_self_name = "exprseq_xs::on_string_token"; + + if (xseqtype_ == exprseqtype::toplevel_interactive) + { + // remark: + // 1. Constant is an expression. At present (nov 2025) these are + // reference-counted (+ leak in xo-interpreter). + // 2. 
Could fix leak by adding a finalization feature to GC. + // Do intend to eventually support finalization, + // but not to use it here. + // 3. Instead mean to change allocation strategy for Expression + // to use GC instead. + // 4. As intermediate step try migrating Expression hierarchy + // to support arena allocation. + // See xo/alloc/ArenaAllocT.hpp + assoc'd unit test + // + progress_xs::start(Constant::make(tk.text()), p_psm); + } else { + /* policy: don't allow literals as toplevel expressions + * unless interactive session. + */ + const char * exp = get_expect_str(); + + this->illegal_input_on_token(c_self_name, + tk, + exp, + p_psm); + } + } + void exprseq_xs::on_typedescr(TypeDescr /*td*/, parserstatemachine * /*p_psm*/) diff --git a/xo-reader/src/reader/exprstate.cpp b/xo-reader/src/reader/exprstate.cpp index 3b631e26..5bbfcb86 100644 --- a/xo-reader/src/reader/exprstate.cpp +++ b/xo-reader/src/reader/exprstate.cpp @@ -336,7 +336,7 @@ namespace xo { { scope log(XO_DEBUG(p_psm->debug_flag())); - constexpr const char * c_self_name = "exprstate::on_bool"; + constexpr const char * c_self_name = "exprstate::on_bool_token"; const char * exp = get_expect_str(); this->illegal_input_on_token(c_self_name, tk, exp, p_psm); @@ -348,7 +348,7 @@ namespace xo { { scope log(XO_DEBUG(p_psm->debug_flag())); - constexpr const char * c_self_name = "exprstate::on_i64"; + constexpr const char * c_self_name = "exprstate::on_i64_token"; const char * exp = get_expect_str(); this->illegal_input_on_token(c_self_name, tk, exp, p_psm); @@ -360,7 +360,19 @@ namespace xo { { scope log(XO_DEBUG(p_psm->debug_flag())); - constexpr const char * c_self_name = "exprstate::on_f64"; + constexpr const char * c_self_name = "exprstate::on_f64_token"; + const char * exp = get_expect_str(); + + this->illegal_input_on_token(c_self_name, tk, exp, p_psm); + } + + void + exprstate::on_string_token(const token_type & tk, + parserstatemachine * p_psm) + { + scope log(XO_DEBUG(p_psm->debug_flag())); + + 
constexpr const char * c_self_name = "exprstate::on_string_token"; const char * exp = get_expect_str(); this->illegal_input_on_token(c_self_name, tk, exp, p_psm); @@ -399,7 +411,7 @@ namespace xo { return; case tokentype::tk_string: - assert(false); + this->on_string_token(tk, p_psm); return; case tokentype::tk_symbol: