xo-interpreter: handle literal strings. Broken memory model.

This commit is contained in:
Roland Conybeare 2025-11-29 20:19:33 -05:00
commit cf846b2f8d
16 changed files with 234 additions and 32 deletions

View file

@ -3,11 +3,41 @@
* author: Roland Conybeare, Aug 2025
*/
#include "xo/alloc/IAlloc.hpp"
#include "xo/alloc/Object.hpp"
#include "ObjectConversion.hpp"
#include "xo/alloc/IAlloc.hpp"
#include <stdexcept>
#include <string>
namespace xo {
namespace obj {
/** unicode terminology (via https://utf8everywhere.org)
* 1. code unit:
* bit combination that represents a unit of encoded text.
* 8-bits for utf-8 i.e. code-unit = char
* 2. code point:
* a numerical value in the unicode namespace, e.g. U+3243F
* one or more code units encode a code point.
* utf-8 uses 1-4 code units to encode each code point.
* 3. abstract character:
* inherently open, because includes characters that are not
* (yet) representable in unicode.
* 4. (en)coded character:
* mapping between code points and abstract character.
* for example U+1F428 is coded character for emoji named 'KOALA'
* caveats:
* - some code points do not have abstract characters assigned.
* - some code points are reserved for non-characters
* (e.g. null, newline ..)
* - some abstract characters require multiple code points:
* for example requiring a composition sequence
* - some abstract characters have multiple encodings
* 5. user-perceived character. whatever you think that means.
* May be language-dependent.
* 6. grapheme cluster. a sequence of coded characters that
* "belong together". for example backspace would erase
* a grapheme cluster atomically.
* 7. glyph: a shape within a font. A sequence of code points maps to
* a sequence of glyphs.
**/
class String : public Object {
public:
enum class owner { unique, shared };
@ -32,6 +62,18 @@ namespace xo {
const char * c_str() const { return chars_; }
std::size_t length() const;
/** Approximate number of columns (if using a fixed-width font) occupied
* by this string. Obtained by counting bytes up to null terminator,
* omitting utf-8 continuation bytes, i.e. bytes with high bit set
* and 2nd-highest bit clear.
*
* @text
* bits: 76543210
* 10______
* @endtext
**/
std::size_t columns() const;
// inherited from Object..
virtual TaggedPtr self_tp() const final override;
virtual void display(std::ostream & os) const final override;
@ -48,11 +90,36 @@ namespace xo {
/** true iff storage in @ref chars_ is owned by this String.
**/
owner owner_ = owner::shared;
/** length of @ref chars_ in bytes (storage allocated, not necessarily string length) **/
/** length of @ref chars_ in bytes (storage allocated, not necessarily string length).
* Includes null terminator
**/
std::size_t z_chars_ = 0;
/** string contents. always null-terminated **/
/** utf-8 string contents. always null-terminated.
* Note that this is #of bytes
**/
char * chars_ = nullptr;
};
/** Conversions between native std::string and String objects.
 *  Both directions go through a null-terminated C string (c_str()).
 **/
struct ObjectConversion_String {
    /** Build an object from the native string @p x.
     *
     *  @param mm  allocator handed to String::copy
     *  @param x   native string to convert
     *  @return result of String::copy(mm, x.c_str())
     *
     *  NOTE(review): contents pass through c_str(), so presumably anything
     *  after an embedded '\0' is dropped -- confirm against String::copy.
     **/
    static gp<Object> to_object(gc::IAlloc * mm, std::string x) {
        return String::copy(mm, x.c_str());
    }

    /** Extract a native std::string from object @p x.
     *
     *  @param x  object to convert; must be recognized by String::from
     *  @return std::string built from the String's c_str()
     *  @throws std::runtime_error when String::from(x) yields a null pointer
     **/
    static std::string from_object(gc::IAlloc *, gp<Object> x) {
        gp<String> x_str = String::from(x);

        if (!x_str.get()) {
            /* without this, control would reach the end of a non-void
             * function, which is undefined behavior
             */
            throw std::runtime_error
                ("ObjectConversion_String::from_object: String expected");
        }

        /* note: ignores allocator, always uses heap.
         * This will affect operation of primitives (if any) that
         * expect std::string. Alternative would be to use IAlloc*,
         * with Blob wrapper (or without if/when need for iterable
         * memory is dropped).
         */
        return std::string(x_str->c_str());
    }
};
/** Conversions for std::string are supplied by ObjectConversion_String
 *  (struct inheritance is public by default).
 **/
template <>
struct ObjectConversion<std::string> : ObjectConversion_String {};
} /*namespace obj*/
} /*namespace xo*/

View file

@ -7,6 +7,7 @@
#include "Integer.hpp"
#include "Float.hpp"
#include "Boolean.hpp"
#include "String.hpp"
#include "TaggedPtr.hpp"
#include "xo/alloc/Blob.hpp"
@ -121,6 +122,30 @@ namespace xo {
return Reflect::make_tp(bool_obj->value() ? &s_true : &s_false);
}
/** Convert the native std::string referred to by @p src into a String object.
 *
 *  @param mm   allocator handed to String::copy
 *  @param src  tagged pointer; expected to recover as std::string
 *  @return result of String::copy(mm, native->c_str())
 *  @throws std::runtime_error when src.recover_native<std::string>() is null
 *          (presumably because src does not refer to a std::string)
 **/
gp<Object>
string_to_object(IAlloc * mm, const TaggedPtr & src)
{
    std::string * native = src.recover_native<std::string>();

    if (!native) {
        /* explicit check instead of assert(): assert compiles away under
         * NDEBUG, which would leave a null dereference below
         */
        throw std::runtime_error
            ("string_to_object: std::string expected in tagged pointer");
    }

    return String::copy(mm, native->c_str());
}
/** Convert a String object to a tagged pointer referring to a std::string.
 *
 *  Not implemented yet: after validating the argument this function always
 *  throws, because there is no agreed place to store the resulting
 *  std::string.
 *
 *  @param obj  object to convert; must be recognized by String::from
 *  @throws std::runtime_error always: either @p obj is not a String,
 *          or the conversion itself is unimplemented
 **/
TaggedPtr
object_to_string(IAlloc * /*mm*/, gp<Object> obj)
{
    gp<String> string_obj = String::from(obj);

    if (!string_obj.get()) {
        throw std::runtime_error(tostr("Object obj found where String expected",
                                       xtag("obj", obj)));
    }

    // still don't have a good solution for this yet.
    // Throw instead of assert(false): with NDEBUG the assert vanishes and
    // control would reach the end of a non-void function (undefined behavior)
    throw std::runtime_error
        ("object_to_string: String -> std::string conversion not implemented");
}
}
ObjectConverter::ObjectConverter()
@ -131,6 +156,8 @@ namespace xo {
this->establish_conversion<double>(&float_to_object<double>, &object_to_float<double>);
this->establish_conversion<bool>(&bool_to_object, &object_to_bool);
this->establish_conversion<std::string>(&string_to_object, &object_to_string);
}
gp<Object>

View file

@ -99,6 +99,22 @@ namespace xo {
return ::strlen(chars_);
}
/** Count bytes of chars_ that are not utf-8 continuation bytes
 *  (continuation byte: top two bits are 10), stopping at the null
 *  terminator or after z_chars_ bytes, whichever comes first.
 *
 *  @return number of non-continuation bytes; 0 when chars_ is null
 **/
std::size_t
String::columns() const
{
    if (!chars_) {
        /* no storage (e.g. default member values): nothing to count */
        return 0;
    }

    std::size_t n_col = 0;

    /* bounds test comes before the dereference, so we never read past
     * the z_chars_ bytes of storage
     */
    for (const char * p = chars_, * e = chars_ + z_chars_; (p < e) && *p; ++p) {
        const unsigned char code_unit = static_cast<unsigned char>(*p);

        /* 10xxxxxx -> continuation byte, does not start a new code point */
        if ((code_unit & 0xc0u) != 0x80u) {
            ++n_col;
        }
    }

    return n_col;
}
TaggedPtr
String::self_tp() const {
return Reflect::make_tp(const_cast<String*>(this));

View file

@ -44,6 +44,9 @@ namespace xo {
/* Table of String testcases.
 * NOTE(review): the four numeric arguments are interpreted by the
 * Testcase_String constructor, which is not visible here -- see its
 * definition for their meaning.
 */
std::vector<Testcase_String>
s_testcase_v = {
Testcase_String(1024, 4096, 512, 512, {"hello"}),
// non-ascii testcase: greek small letter lambda.
// to type it in emacs: C-x 8 RET lambda
Testcase_String(1024, 4096, 512, 512, {"λ"}),
Testcase_String(1024, 4096, 512, 512, {"hello", ", world!"})
};
}
@ -168,6 +171,31 @@ namespace xo {
}
}
/* Verify String::columns() against String::length() for an empty string,
 * a one-byte character, and a two-byte utf-8 character.
 */
TEST_CASE("String.columns", "[String][unicode]")
{
    const bool c_debug_flag = false;

    /* all strings below are allocated from this arena via Object::mm */
    up<ArenaAlloc> test_arena = ArenaAlloc::make("testarena",
                                                 16*1024, c_debug_flag);
    Object::mm = test_arena.get();

    /* empty string: no columns, no bytes */
    gp<String> empty_str = String::copy("");
    REQUIRE(empty_str->columns() == 0);
    REQUIRE(empty_str->length() == 0);

    /* single ascii character: one column, one byte */
    gp<String> ascii_str = String::copy("l");
    REQUIRE(ascii_str->columns() == 1);
    REQUIRE(ascii_str->length() == 1);

    /* lambda: one column.. */
    gp<String> lambda_str = String::copy("λ");
    REQUIRE(lambda_str->columns() == 1);
    /* ..but encoded as two code units */
    REQUIRE(lambda_str->length() == 2);
}
TEST_CASE("String.append", "[String]")
{
const bool c_debug_flag = false;