/* file ExplicitDist.hpp * * author: Roland Conybeare, Oct 2022 */ #pragma once #include "Distribution.hpp" #include "Normal.hpp" #include "xo/indentlog/scope.hpp" #include "xo/indentlog/print/vector.hpp" #include "xo/indentlog/print/tostr.hpp" #include #include #include namespace xo { using xo::xtag; namespace distribution { class ProbabilityBucket { public: ProbabilityBucket() = default; double weight() const { return wt_; } double cdf() const { return cdf_; } /* note: when calling this, must invalidate ExplicitDist.cdf_valid_flag */ void scale_weight(double k) { this->wt_ *= k; } void assign_weight(double w) { this->wt_ = w; } /* implementation method: only ExplicitDist.renormalize() should call this */ void assign_cdf(double x) { this->cdf_ = x; } void display(std::ostream & os) const { using xo::xtag; os << "wt_) << xtag("cdf", this->cdf_) << ">"; } /*display*/ private: /* probability assigned to this bucket. * updated in-place by ExplicitDist.include_sample() */ double wt_ = 0.0; /* cumulative probability assigned to this bucket b * and all buckets b[i] for domain values < b * * invalidated by ExplicitDist.scale_bucket() * see ExplicitDist. */ double cdf_ = 0.0; }; /*ProbabilityBucket*/ inline std::ostream & operator<<(std::ostream & os, ProbabilityBucket const & x) { x.display(os); return os; } /*operator<<*/ namespace detail { /* three kinds of index here: * 1. lo: non-negative index into ExplicitDist.lo_v_[] * 2. hi: non-negative index into ExplicitDist.hi_v_[] * 3. signed: if >=0: index into ExplicitDist.hi_v_[] * if <0: index into ExplicitDist.lo_v_[] * * with lz=.lo_v.size(), hz=hi_v.size(): * * .hi_v[0] .hi_v[hz-1] * | v v * | * +--+--+ +--+--|--+--+ +--+ * | | | ... | | | | | ... | | * +--+--+ +--+--|--+--+ +--+ * ^ ^ * .lo_v[lz-1] .lo_v[0] * * ^ ^ ^ ^ * signed -lz -1| 0 ... hz-1 */ class ExplicitDistIndexUtil { public: static size_t signed2hi(int32_t signed_ix) { return signed_ix; } static size_t signed2lo(int32_t signed_ix) { return -signed_ix - 1; } static int32_t lo2signed(size_t lo_ix) { return -static_cast(lo_ix)-1; } static int32_t hi2signed(size_t hi_ix) { return hi_ix; } static ProbabilityBucket const * signed2bucket_c(int32_t signed_ix, std::vector const * lo_v, std::vector const * hi_v) { if (signed_ix >= 0) { size_t hi_ix = signed2hi(signed_ix); if (hi_v && (hi_ix < hi_v->size())) return &(*hi_v)[hi_ix]; } else { size_t lo_ix = signed2lo(signed_ix); if (lo_v && (lo_ix < lo_v->size())) return &(*lo_v)[lo_ix]; } return nullptr; } /*signed2bucket_c*/ static ProbabilityBucket * signed2bucket(int32_t signed_ix, std::vector * lo_v, std::vector * hi_v) { ProbabilityBucket const * b = signed2bucket_c(signed_ix, lo_v, hi_v); return const_cast(b); } /*signed2bucket*/ }; /*ExplicitDistIndexUtil*/ class ExplicitDistIterator : public ExplicitDistIndexUtil { public: ExplicitDistIterator() = default; ExplicitDistIterator(int32_t signed_ix, std::vector * lo_v, std::vector * hi_v) : signed_ix_{signed_ix}, p_lo_v_{lo_v}, p_hi_v_{hi_v} {} ProbabilityBucket & operator*() { ProbabilityBucket * pb = signed2bucket(this->signed_ix_, this->p_lo_v_, this->p_hi_v_); if (!pb) { throw std::runtime_error("ExplicitDistIterator: attempt to deref invalid iterator"); } return *pb; } /*operator**/ ExplicitDistIterator & operator++() { ++(this->signed_ix_); return *this; } ExplicitDistIterator & operator--() { --(this->signed_ix_); return *this; } private: /* signed iterator */ int32_t signed_ix_ = 0; /* will be ExplicitDist.lo_v[] */ std::vector * p_lo_v_ = nullptr; /* will be ExplicitDist.hi_v[] */ std::vector * p_hi_v_ = nullptr; }; /*ExplicitDistIterator*/ } /*namespace detail*/ /* explicit distribution with buckets. * (if you want to avoid bucketing at the expense of memory, * see StdEmpirical) * * sample-independent buckets can be faster to the extent * {#of buckets} << {#of samples} * * in particular * | StdEmpirical | PackedEmpirical * ------------------------------+--------------+----------------- * update existing sample/bucket | O(log(n)) | O(1) * create new sample/bucket | O(log(n)) | O(log(n)) * cdf | O(log(n)) | O(n) * * PackedEmpirical offers scaling + renormalization * * since .cdf() is slow, .ks_stat_1sided() is supported only in StdEmpirical. * (could generalize in future to some other implementation with fast .cdf()) * * Require: * Domain is something with metric, e.g. int|double. * categorical domain not supported here. * * see also: statistics/Histogram. Histogram keeps more per-bucket statistics */ template class ExplicitDist : public Distribution, public detail::ExplicitDistIndexUtil { public: using WeightVector = std::vector; using iterator = detail::ExplicitDistIterator; public: static ref::rp make(Domain bucket_dx, Domain ref_value) { return new ExplicitDist(bucket_dx, ref_value); } /* create distribution with n buckets of width bucket_dx, * covering range [ref_value, ref_value + n * bucket_dx] */ static ref::rp make_n(size_t n, Domain bucket_dx, Domain ref_value) { return new ExplicitDist(n, bucket_dx, ref_value); } size_t n_bucket() const { return this->lo_v_.size() + this->hi_v_.size(); } /* lub domain value with .cdf(lo) = 0 */ Domain lo() const { std::size_t lz = this->lo_v_.size(); return this->ref_value_ - lz * this->bucket_dx_; } /*lo*/ /* glb domain value with .cdf(hi) = 1 */ Domain hi() const { std::size_t hz = this->hi_v_.size(); return this->ref_value_ + hz * this->bucket_dx_; } /*hi*/ Domain bucket_lo(int32_t signed_ix) const { return this->ref_value_ + signed_ix * this->bucket_dx_; } /*bucket_lo*/ Domain bucket_mid(int32_t signed_ix) const { return this->ref_value_ + (0.5 + signed_ix) * this->bucket_dx_; } /*bucket_mid*/ Domain bucket_hi(int32_t signed_ix) const { return this->bucket_lo(signed_ix + 1); } /*bucket_hi*/ iterator begin() { return iterator(lo2signed(this->lo_v_.size() - 1), &(this->lo_v_), &(this->hi_v_)); } iterator end() { return iterator(hi2signed(this->hi_v_.size()), &(this->lo_v_), &(this->hi_v_)); } /* probability density at domain value x */ double density(Domain const & x) const { /* careful! may need to renormalize */ { ExplicitDist * self = const_cast *>(this); self->check_renormalize(); } auto v = this->lookup_bucket(x); ProbabilityBucket const * b = v.first; if (b) { /* probability density is constant within each bucket */ return b->weight() / this->bucket_dx_; } else { return 0.0; } } /*density*/ /* pair {bucket glb, density} for all buckets, in increasing domain order */ std::vector> density_v() const { /* careful! may need to renormalize */ { ExplicitDist * self = const_cast *>(this); self->check_renormalize(); } size_t n = this->n_bucket(); std::vector> retval(n); double w2d = 1.0 / this->bucket_dx_; for(size_t i=0; ibucket_lo(i); ProbabilityBucket const * b = this->lookup_signed_bucket(i); assert(b); double density = b->weight() * w2d; retval[i] = std::make_pair(lo, density); } return retval; } /*density_v*/ /* return: * .first signed bucket index ix; * refers to .lo_v[1-ix] if ix<0; * refers to .hi_v[+ix] if ix>=0 * .second fractional weight within bucket associated with x. * not scaled by ProbabilityWeight.weight */ std::pair signed_bucket_index(Domain const & x) const { double ix_f = ::floor((x - this->ref_value_) / this->bucket_dx_); int32_t ix = ix_f; /* * ^ * |............ * | : | * | wt : | unit area * | : | * +-----------+ * bucket_lo ^ bucket_lo + .bucket_dx * x */ Domain bucket_lo = this->ref_value_ + (ix * this->bucket_dx_); double wt = (x - bucket_lo) / this->bucket_dx_; return std::make_pair(ix, wt); } /*signed_bucket_index*/ ProbabilityBucket const * lookup_signed_bucket(int32_t signed_ix) const { return signed2bucket_c(signed_ix, &(this->lo_v_), &(this->hi_v_)); } /*lookup_signed_bucket*/ /* non-const version with write access */ ProbabilityBucket * lookup_signed_bucket(int32_t signed_ix) { return signed2bucket(signed_ix, &(this->lo_v_), &(this->hi_v_)); } /*lookup_signed_bucket*/ /* return null pointer if bucket that would contain x not represented * .second is fractional weight (relative to unity) within selected bucket */ std::pair lookup_bucket(Domain const & x) const { /* signed index. * ix>=0 => .lo_v[] * ix< 0 => .hi_v[] */ std::pair v = this->signed_bucket_index(x); int32_t ix = v.first; double fraction_wt = v.second; ProbabilityBucket const * pw = this->lookup_signed_bucket(ix); return std::make_pair(pw, fraction_wt); } /*lookup_bucket*/ /* non-const version */ std::pair lookup_bucket(Domain const & x) { ExplicitDist const * c_self = this; std::pair v = c_self->lookup_bucket(x); ProbabilityBucket * b = const_cast(v.first); double fraction_wt = v.second; return std::make_pair(b, fraction_wt); } /*lookup_bucket*/ /* like .lookup_bucket(), but create bucket if not already present * .second reports fractional weight (relative to unity) within bucket * that contains x. */ std::pair establish_bucket(Domain const & x) { /* signed index. * ix>=0 => .lo_v[] * ix< 0 => .hi_v[] */ std::pair v = this->signed_bucket_index(x); int32_t ix = v.first; double fraction_wt = v.second; ProbabilityBucket * pw = nullptr; if (ix >= 0) { size_t hi_ix = signed2hi(ix); if (hi_ix >= this->hi_v_.size()) { /* need to expand .hi_v[] */ this->hi_v_.resize(hi_ix+1); } pw = &(this->hi_v_[hi_ix]); } else { size_t lo_ix = signed2lo(ix); if (lo_ix >= this->lo_v_.size()) { /* need to expand .lo_v[] */ this->lo_v_.resize(lo_ix+1); } pw = &(this->lo_v_[lo_ix]); } return std::make_pair(pw, fraction_wt); } /*establish_bucket*/ void scale_bucket_by_signed_index(int32_t signed_ix, double k) { ProbabilityBucket * b = this->lookup_signed_bucket(signed_ix); if (b) { b->scale_weight(k); this->cdf_valid_flag_ = false; } else { /* if bucket isn't represented, then corresponding weight is zero, * both before and after scaling. In this special case * representation didn't change -> don't invalidate cdf */ } } /*scale_bucket_by_signed_index*/ /* scale probability assigned to bucket for domain value x, by factor k */ void scale_bucket(Domain const & x, double k) { assert(k >= 0.0); std::pair v = this->signed_bucket_index(x); this->scale_bucket_by_signed_index(v.first, k); } /*scale_bucket*/ /* scale probability weight for buckets in [lo, hi] by fn(p) * for a point p in each bucket, however: * under no circumstances try to evaluate fn() outside [lo, hi] * (relevant if either lo/hi fall inside a bucket) */ template void scale_interval(Domain const & lo, Domain const & hi, Function && fn) { XO_SCOPE_DISABLED(lscope); /* note: using inclusive upper index bounds here; * variying from idiomatic c++ style for symmetry */ int32_t min_bucket_ix = lo2signed(this->lo_v_.size() - 1); int32_t max_bucket_ix = hi2signed(this->hi_v_.size() - 1); int32_t lo_ix = this->signed_bucket_index(lo).first; int32_t hi_ix = this->signed_bucket_index(hi).first; /* start_ix: lowest explicit bucket associating with [lo, hi) */ int32_t start_ix = std::max(min_bucket_ix, lo_ix + 1); /* end_ix: highest explicit bucket associating with [lo, hi) */ int32_t end_ix = std::min(max_bucket_ix, hi_ix); if (lscope.enabled()) { lscope.log(xtag("min_bucket_ix", min_bucket_ix), xtag("max_bucket_ix", max_bucket_ix), xtag("lo_ix", lo_ix), xtag("hi_ix", hi_ix), xtag("start_ix", start_ix), xtag("end_ix", end_ix)); } /* for endpoints: avoid evaluating fn() outside [lo, hi] */ if (min_bucket_ix <= lo_ix) { double lo_k = fn(lo); if (lscope.enabled()) lscope.log("A", xtag("lo_ix", lo_ix), xtag("lo_k", lo_k)); this->scale_bucket_by_signed_index(lo_ix, lo_k); } for(int32_t ix = start_ix; ix <= end_ix; ++ix) { double k = fn(this->bucket_mid(ix)); if (lscope.enabled()) lscope.log("B", xtag("ix", ix), xtag("k", k)); this->scale_bucket_by_signed_index(ix, k); } /* for endpoints: avoid evaluating fn() outside [lo, hi] */ if (hi_ix <= max_bucket_ix) { double hi_k = fn(hi); if (lscope.enabled()) lscope.log("C", xtag("hi_ix", hi_ix), xtag("hi_k", hi_k)); this->scale_bucket_by_signed_index(hi_ix, hi_k); } } /*scale_interval*/ template void scale_all(Function && fn) { this->scale_interval(this->lo(), this->hi(), fn); } /*scale_all*/ /* convenience: scale by scaled normal cdf. * * support use case where: * 1. explicit dist represents distribution of asset value; * 2. assume one party A to trade/order has noisy signal, * with normally-distributed error around (unknown) true value s, * with variance o^2; * 3. observe a trade/order at some price p, * giving us one-sided information about A's knowledge in two scenarios: * i. A trades * ii. A does not trade * * We will be applying Bayes' rule to update prior. * If buy(p) = {A buys}, * N(x) is cumulative normal distribution: * N(x) -> 0 as x -> -oo; * N(x) -> 1 as x -> +oo; * * -1/2 2 * (d/dx)N(x) = (2.pi) . exp(-x /2) * then: * P{s=s'} * P{s=s'|buy(p)} = ---------.P{buy(p)|s=s'} * P{buy(p)} * * P{s=s'} * = ---------.N[(1/o)(s'-p)], * P{buy(p)} * * In this scenario, P{s=s'} is our prior, represented by distribution *this. * The unconditional probability P{buy(p)} is not a function of s', so: * * / * | * P{buy(p)} = | P{s=s'} . N[(1/o)(s'-p)] . ds' * | * / * * in other words can scale explicit prior *this by N[(1/o)(s'-p)] then renormalize * so total probability weight is 1. * * Conversely, if event is {A sells}, similar argument leads to scaling * by N[(1/o)(p-s')] * * sign. scale by N[(1/sigma).sign.(x-mean)] */ void scale_by_normal_cdf(int sign, Domain const & mean, Domain const & sigma) { auto fn([sign, mean, sigma](Domain const & s) { return Normal::cdf_impl(sign * (s - mean) / sigma); }); this->scale_all(fn); } /*scale_by_normal_cdf*/ // ----- inherited from Distribution ----- /* note: marked const; actually "logically const" */ virtual double cdf(Domain const & x) const override { /* .cdf() is slow here, because partial sums aren't stored * --> have to sum over O(n) buckets */ { ExplicitDist * self = const_cast *>(this); self->check_renormalize(); } std::pair v = this->lookup_bucket(x); ProbabilityBucket const * b = v.first; double fraction_wt = v.second; if (b) { /* for bucket that x belongs to, treat as uniformly distributed * fraction_wt = 1.0 -> 100% of b -> b.cdf() * fraction_wt = 0.0 -> 0% of b -> b.cdf() - b.weight() */ return b->cdf() + (fraction_wt - 1.0) * b->weight(); } else { if (x >= this->ref_value_) { /* x falls above farthest .hi_v[] */ return 1.0; } else { /* x falls below farthest .lo_v[] */ return 0.0; } } } /*cdf*/ /* O(n). restores .cdf_valid_flag */ void renormalize() { /* [1] compute sum of probability weights * [2] rescale all buckets so sum is 1. * [3] restore .cdf_valid_flag */ /* [1] */ double lo_sum_prob = 0.0; for (ProbabilityBucket & b : this->lo_v_) { assert(lo_sum_prob >= 0.0); lo_sum_prob += b.weight(); } double hi_sum_prob = 0.0; for (ProbabilityBucket & b : this->hi_v_) { assert(hi_sum_prob >= 0.0); hi_sum_prob += b.weight(); } assert(lo_sum_prob + hi_sum_prob > 0.0); /* [2] */ double renorm_factor = 1.0 / (lo_sum_prob + hi_sum_prob); { double lo_tail = lo_sum_prob; /* iterate over buckets < .ref_value, in /descending/ domain order */ for (ProbabilityBucket & b : this->lo_v_) { b.scale_weight(renorm_factor); b.assign_cdf(lo_tail); /* reduce tail after assign cdf to b, since * lo_tail includes b's probability weight */ lo_tail -= b.weight(); /* numerical roundoff can cause this */ if (lo_tail < 0.0) lo_tail = 0.0; } } { double cum_prob = lo_sum_prob; /* iterate over buckets >= .refvalue, in /ascending/ domain order */ for (ProbabilityBucket & b : this->hi_v_) { b.scale_weight(renorm_factor); /* increase cum_prob before assign cdf to b, * since b.cdf should include b's probability weight */ cum_prob += b.weight(); /* numerical roundoff can cause this */ if (cum_prob >= 1.0) cum_prob = 1.0; b.assign_cdf(cum_prob); } } /* [3] */ this->cdf_valid_flag_ = true; } /*renormalize*/ void check_renormalize() { if (!this->cdf_valid_flag_) { this->renormalize(); } } /*check_renormalize*/ void display(std::ostream & os) const { os << "cdf_valid_flag_); os << xtag("bucket_dx", this->bucket_dx_); os << xtag("ref_value", this->ref_value_); os << xtag("lz", this->lo_v_.size()); os << xtag("hz", this->hi_v_.size()); os << xtag("lo_v", this->lo_v_); os << xtag("hi_v", this->hi_v_); os << ">"; } /*display*/ std::string display_string() const { return xo::tostr(*this); } private: ExplicitDist(Domain bucket_dx, Domain ref_value) : cdf_valid_flag_{true}, bucket_dx_{bucket_dx}, ref_value_{ref_value}, lo_v_{}, hi_v_{1} { assert(bucket_dx_ > 0.0); /* must have at least one bucket, since need total probability weight = 1 */ this->hi_v_[0].assign_weight(1.0); this->hi_v_[0].assign_cdf(1.0); } ExplicitDist(size_t n, Domain bucket_dx, Domain ref_value) : cdf_valid_flag_{true}, bucket_dx_{bucket_dx}, ref_value_{ref_value}, lo_v_{}, hi_v_{n} { assert(bucket_dx > 0.0); /* must have at least one bucket, since need total probability weight = 1 */ assert(n > 0); double w = 1.0 / n; for(size_t i = 0; ihi_v_[i].assign_weight(w); this->hi_v_[i].assign_cdf((i+1)*w); } } /*ctor*/ private: /* with lz=.lo_v.size(), hz=hi_v.size(): * * .ref_value - .bucket_dx * lz * | .ref_value * | | .hi_v[0] .hi_v[hz-1] * v v v v * | * +--+--+ +--+--|--+--+ +--+ * | | | ... | | | | | ... | | * +--+--+ +--+--|--+--+ +--+ * | * ^ ^ ^ * .lo_v[lz-1] .lo_v[0] .ref_value + .bucket_dx * hz * */ /* .cdf_valid_flag: * -> false whenever .scale_bucket() runs. * -> true whenever .renormalize() runs. * * if false, then *this is 'not a probability distribution': * Sum b[i].weight != 1.0 (summing over probability buckets .lo_v[], .hi_v[]) * i */ bool cdf_valid_flag_ = true; /* width of each bucket */ Domain bucket_dx_; /* natural value here would be 0. * use .pos_v[] for values >= .ref_value * use .neg_v[] for values < .ref_value */ Domain ref_value_; /* buckets for domain values < .ref_value * * with dx = .bucket_dx * .lo_v[i] represents weights in range * [.ref_value - (i+1) * dx, .ref_value - i * dx) */ WeightVector lo_v_; /* buckets for domain values > .ref_value * * with dx = .bucket_dx * .hi_v[i] represents weights in range * [.ref_value + i * dx, .ref_value + (i+1) * dx)) */ WeightVector hi_v_; }; /*ExplicitDist*/ template inline std::ostream & operator<<(std::ostream & os, ExplicitDist const & x) { x.display(os); return os; } /*operator<<*/ } /*namespace distribution*/ } /*namespace xo*/ /* end ExplicitDist.hpp */