/* @file Histogram.hpp */ #pragma once #include "statistics/SampleStatistics.hpp" #include "logutil/scope.hpp" #include #include #include namespace xo { namespace statistics { /* sample statistics for a histogram bucket * (editorial: compare with distribution::Counter) */ class Bucket { public: Bucket() = default; Bucket(uint32_t n_sample, double sum, double mean, double mom2) : n_sample_(n_sample), sum_(sum), mean_(mean), moment2_(mom2) {} uint32_t n_sample() const { return n_sample_; } double sum() const { return sum_; } double mean() const { return mean_; } double sample_variance() const { return (n_sample_ > 1) ? moment2_ / (n_sample_ - 1) : 0.0; } double standard_error() const { return ::sqrt(this->sample_variance()); } /* to estimate standard error of the mean: * 0. let nk = .n_sample be the #of samples falling into this bin. * n is the total #of samples across all bins. * (i.e. Histogram.n_sample) * 1. imagine probability of a sample falling in this bin * is the observed frequency p = (.n_sample / n) * 2. imagine a Bernoulli random variable Bp(i) associated with each sample x(i) * {1, with probability p; 0 with probability q=1-p}) * 3. each Bp(i) has mean p, variance p(1-p) * 4. sum of the Bp(1) .. Bp(n) has mean n.p = nk, * variance * n.p.(1-p) * = n.(nk/n).(1 - nk/n) * = nk.(1 - nk/n) * (by central limit theorem we can treat this as approximately normal * for sufficiently large n) * 5. standard error of Sum{Bp(i)} * will be * sqrt(nk.(1 - nk/n)) */ double n_sample_stderr(uint32_t n) const { double nr = 1.0 / n; uint32_t nk = this->n_sample_; return ::sqrt(nk * (1.0 - nk * nr)); } /*n_sample_stderr*/ /* add one sample, x, to this bucket */ void include_sample(double x) { using logutil::scope; using logutil::xtag; constexpr char const * c_self = "Bucket::include_sample"; constexpr bool c_logging_enabled = false; /* size of sample _before_ adding x */ int n = this->n_sample_; this->n_sample_ = n+1; this->sum_ += x; double mean_n = this->mean_; double mom2_n = this->moment2_; double mean_np1 = SampleStatistics::update_online_mean(x, n, mean_n); double mom2_np1 = SampleStatistics::update_online_moment2(x, mean_np1, mean_n, mom2_n); scope lscope(c_self, c_logging_enabled); if(c_logging_enabled) { lscope.log("update", xtag("x", x), xtag("n", n), xtag("sum", sum_), xtag("mean(n)", mean_n), xtag("mom2(n)", mom2_n), xtag("mean(n+1)", mean_np1), xtag("mom2(n+1)", mom2_np1)); } this->mean_ = mean_np1; this->moment2_ = mom2_np1; } /*include_sample*/ private: /* #of samples in this bucket (will be #of times .sample() has been called) */ uint32_t n_sample_ = 0; /* sum of samples in this bucket */ double sum_ = 0.0; /* mean of values in this bucket * -- use online algo to avoid catastrophic errors for large #samples */ double mean_ = 0.0; double moment2_ = 0.0; }; /*Bucket*/ /* accumulate histogram on sampled data */ class Histogram { public: using const_iterator = std::vector::const_iterator; public: Histogram(uint32_t n_interior_bucket, double lo_bucket, double hi_bucket) : n_interior_bucket_(n_interior_bucket), lo_bucket_(lo_bucket), hi_bucket_(hi_bucket), bucket_v_(n_interior_bucket + 2) {} uint32_t n_sample() const { return n_sample_; } uint32_t n_bucket() const { return n_interior_bucket_ + 2; } double bucket_width() const { return (this->hi_bucket_ - this->lo_bucket_) / this->n_interior_bucket_; } const_iterator begin() const { return bucket_v_.begin(); } const_iterator end() const { return bucket_v_.end(); } Bucket const & lookup(uint32_t ix) const { return this->bucket_v_[ix]; } /* compute bucket representing pooled sample combining * contents of buckets [lo .. hi) */ Bucket pooled(uint32_t lo, uint32_t hi) const { /* NOTE: for pooled bucket, may want to compute "reliability variance", * i.e. report * M2 / (N - (sum(nk^2) / N)) * instead of * M2 / (N - 1) */ uint32_t n_sample = 0; double sum = 0.0; double mean = 0.0; double mom2 = 0.0; for(uint32_t i = lo; ilookup(i); n_sample += bucket.n_sample(); /* note that sum is not numerically well-behaved if summing * over a large #of buckets */ sum += bucket.sum(); double prev_mean = mean; /* relative weight of bucket b(i) relative to pooled statistics * from buckets b(lo) .. b(i-1) */ double wt = (bucket.n_sample() / static_cast(n_sample)); /* similar to SampleStatistics::update_online_mean() */ mean = prev_mean + wt * (bucket.mean() - prev_mean); /* similar to SampleStatistics::update_online_moment2() */ mom2 = (mom2 + (bucket.n_sample() * (bucket.mean() - prev_mean) * (bucket.mean() - mean))); } return Bucket(n_sample, sum, mean, mom2); } /*pooled*/ double bucket_lo_edge(uint32_t ix) const { if(ix == 0) { return -std::numeric_limits::infinity(); } else { return this->lo_bucket_ + (ix - 1) * this->bucket_width(); } } /*bucket_lo_edge*/ double bucket_hi_edge(uint32_t ix) const { if(ix < n_interior_bucket_ + 1) return this->lo_bucket_ + ix * this->bucket_width(); else return std::numeric_limits::infinity(); } /*bucket_hi_edge*/ /* index (into .bucket_v[]) of bucket to use for a sample with value x */ uint32_t bucket_ix(double x) const { if(x < this->lo_bucket_) return 0; if(x < this->hi_bucket_) return 1 + static_cast((x - this->lo_bucket_) / this->bucket_width()); return this->n_interior_bucket_ + 1; } /*bucket_ix*/ void include_sample(double x) { uint32_t ix = this->bucket_ix(x); ++(this->n_sample_); this->bucket_v_[ix].include_sample(x); } /*include_sample*/ private: /* #of samples across all buckets */ uint32_t n_sample_ = 0; /* #of interior buckets: split [.lo_bucket, .hi_bucket] into * equally-spaced intervals of width (.hi_bucket - .lo_bucket) / .n_bucket */ uint32_t n_interior_bucket_ = 0; /* right edge of first bucket (left edge is -oo) */ double lo_bucket_ = 0.0; /* left edge of last bucket (right edge is +oo) */ double hi_bucket_ = 0.0; /* hisogram buckets */ std::vector bucket_v_; }; /*Histogram*/ } /*namespace statistics*/ } /*namespace xo*/ /* end Histogram.hpp */