initial implementation
This commit is contained in:
commit
49d2fd3757
5 changed files with 425 additions and 0 deletions
64
CMakeLists.txt
Normal file
64
CMakeLists.txt
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
# xo-statistics/CMakeLists.txt
#
# build + install rules for the header-only xo_statistics library

cmake_minimum_required(VERSION 3.10)

project(xo_statistics VERSION 1.0)
enable_language(CXX)

include(xo_macros/xo_cxx)
include(xo_macros/code-coverage)

# ----------------------------------------------------------------
# unit test setup

enable_testing()
# activate code coverage for all executables + libraries (when configured with -DCODE_COVERAGE=ON)
add_code_coverage()
# 1. assuming that /nix/store/ prefixes .hpp files belonging to gcc, catch2 etc.
#    we're not interested in code coverage for these sources.
# 2. exclude the utest/ subdir, we don't need coverage on the unit tests themselves;
#    rather, want coverage on the code that the unit tests exercise.
#
add_code_coverage_all_targets(EXCLUDE /nix/store/* utest/*)

# ----------------------------------------------------------------
# bespoke (usually temporary) c++ settings
#
# PROJECT_CXX_FLAGS holds extra compiler options (a ;-separated list when
# more than one is needed). These are compiler options, not preprocessor
# definitions, so they are passed through add_compile_options() rather
# than add_definitions().

set(PROJECT_CXX_FLAGS "")
#set(PROJECT_CXX_FLAGS "-fconcepts-diagnostics-depth=2")
add_compile_options(${PROJECT_CXX_FLAGS})

# ----------------------------------------------------------------
# common include paths etc.

xo_toplevel_compile_options()

# ----------------------------------------------------------------
# external dependencies
#
# set CMAKE_INSTALL_PREFIX to analog of /usr
# to use .cmake assistants from /usr/lib/cmake/indentlog
#
# xo_dependency(..)

# ----------------------------------------------------------------

#add_subdirectory(example)
#add_subdirectory(utest)

# ----------------------------------------------------------------
# output targets

set(SELF_LIB xo_statistics)
xo_add_headeronly_library(${SELF_LIB})

# ----------------------------------------------------------------
# standard install + provide find_package() support

xo_install_library4(${SELF_LIB} ${PROJECT_NAME}Targets)
xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)

# ----------------------------------------------------------------
# install additional components

#install(TARGETS statistics_ex1 DESTINATION bin/xo-statistics/example)
|
||||||
4
cmake/xo_statisticsConfig.cmake.in
Normal file
4
cmake/xo_statisticsConfig.cmake.in
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
# Package-config template for find_package(@PROJECT_NAME@).
# NOTE(review): presumably instantiated by xo_export_cmake_config() via
# CMakePackageConfigHelpers; @PACKAGE_INIT@ and @PROJECT_NAME@ are
# substituted at configure time -- confirm against the xo_macros sources.
@PACKAGE_INIT@
|
||||||
|
|
||||||
|
# pull in the exported targets file that sits next to this config file
include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
|
||||||
|
# check the components requested through find_package() for this package
check_required_components("@PROJECT_NAME@")
|
||||||
10
include/statistics/Accumulator.hpp
Normal file
10
include/statistics/Accumulator.hpp
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
/* @file Accumulator.hpp */

#pragma once

namespace xo {
    namespace statistics {
        /* placeholder for a statistics accumulator;
         * declares no members yet
         */
        class Accumulator {
        }; /*Accumulator*/
    } /*namespace statistics*/
} /*namespace xo*/
|
||||||
|
|
||||||
|
/* end Accumulator.hpp */
|
||||||
217
include/statistics/Histogram.hpp
Normal file
217
include/statistics/Histogram.hpp
Normal file
|
|
@ -0,0 +1,217 @@
|
||||||
|
/* @file Histogram.hpp */
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "statistics/SampleStatistics.hpp"
#include "logutil/scope.hpp"
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>
|
||||||
|
|
||||||
|
namespace xo {
|
||||||
|
namespace statistics {
|
||||||
|
/* sample statistics for a histogram bucket
|
||||||
|
* (editorial: compare with distribution::Counter)
|
||||||
|
*/
|
||||||
|
class Bucket {
|
||||||
|
public:
|
||||||
|
Bucket() = default;
|
||||||
|
Bucket(uint32_t n_sample, double sum, double mean, double mom2)
|
||||||
|
: n_sample_(n_sample), sum_(sum), mean_(mean), moment2_(mom2) {}
|
||||||
|
|
||||||
|
uint32_t n_sample() const { return n_sample_; }
|
||||||
|
double sum() const { return sum_; }
|
||||||
|
double mean() const { return mean_; }
|
||||||
|
double sample_variance() const { return (n_sample_ > 1) ? moment2_ / (n_sample_ - 1) : 0.0; }
|
||||||
|
double standard_error() const { return ::sqrt(this->sample_variance()); }
|
||||||
|
|
||||||
|
/* to estimate standard error of the mean:
|
||||||
|
* 0. let nk = .n_sample be the #of samples falling into this bin.
|
||||||
|
* n is the total #of samples across all bins.
|
||||||
|
* (i.e. Histogram.n_sample)
|
||||||
|
* 1. imagine probability of a sample falling in this bin
|
||||||
|
* is the observed frequency p = (.n_sample / n)
|
||||||
|
* 2. imagine a Bernoulli random variable Bp(i) associated with each sample x(i)
|
||||||
|
* {1, with probability p; 0 with probability q=1-p})
|
||||||
|
* 3. each Bp(i) has mean p, variance p(1-p)
|
||||||
|
* 4. sum of the Bp(1) .. Bp(n) has mean n.p = nk,
|
||||||
|
* variance
|
||||||
|
* n.p.(1-p)
|
||||||
|
* = n.(nk/n).(1 - nk/n)
|
||||||
|
* = nk.(1 - nk/n)
|
||||||
|
* (by central limit theorem we can treat this as approximately normal
|
||||||
|
* for sufficiently large n)
|
||||||
|
* 5. standard error of Sum{Bp(i)}
|
||||||
|
* will be
|
||||||
|
* sqrt(nk.(1 - nk/n))
|
||||||
|
*/
|
||||||
|
double n_sample_stderr(uint32_t n) const {
|
||||||
|
double nr = 1.0 / n;
|
||||||
|
uint32_t nk = this->n_sample_;
|
||||||
|
|
||||||
|
return ::sqrt(nk * (1.0 - nk * nr));
|
||||||
|
} /*n_sample_stderr*/
|
||||||
|
|
||||||
|
/* add one sample, x, to this bucket */
|
||||||
|
void include_sample(double x) {
|
||||||
|
using logutil::scope;
|
||||||
|
using logutil::xtag;
|
||||||
|
|
||||||
|
constexpr char const * c_self = "Bucket::include_sample";
|
||||||
|
constexpr bool c_logging_enabled = false;
|
||||||
|
|
||||||
|
/* size of sample _before_ adding x */
|
||||||
|
int n = this->n_sample_;
|
||||||
|
|
||||||
|
this->n_sample_ = n+1;
|
||||||
|
this->sum_ += x;
|
||||||
|
|
||||||
|
double mean_n = this->mean_;
|
||||||
|
double mom2_n = this->moment2_;
|
||||||
|
double mean_np1 = SampleStatistics::update_online_mean(x, n, mean_n);
|
||||||
|
double mom2_np1 = SampleStatistics::update_online_moment2(x,
|
||||||
|
mean_np1, mean_n,
|
||||||
|
mom2_n);
|
||||||
|
scope lscope(c_self, c_logging_enabled);
|
||||||
|
if(c_logging_enabled) {
|
||||||
|
lscope.log("update",
|
||||||
|
xtag("x", x), xtag("n", n),
|
||||||
|
xtag("sum", sum_),
|
||||||
|
xtag("mean(n)", mean_n),
|
||||||
|
xtag("mom2(n)", mom2_n),
|
||||||
|
xtag("mean(n+1)", mean_np1),
|
||||||
|
xtag("mom2(n+1)", mom2_np1));
|
||||||
|
}
|
||||||
|
|
||||||
|
this->mean_ = mean_np1;
|
||||||
|
this->moment2_ = mom2_np1;
|
||||||
|
} /*include_sample*/
|
||||||
|
|
||||||
|
private:
|
||||||
|
/* #of samples in this bucket (will be #of times .sample() has been called) */
|
||||||
|
uint32_t n_sample_ = 0;
|
||||||
|
/* sum of samples in this bucket */
|
||||||
|
double sum_ = 0.0;
|
||||||
|
/* mean of values in this bucket
|
||||||
|
* -- use online algo to avoid catastrophic errors for large #samples
|
||||||
|
*/
|
||||||
|
double mean_ = 0.0;
|
||||||
|
double moment2_ = 0.0;
|
||||||
|
}; /*Bucket*/
|
||||||
|
|
||||||
|
/* accumulate histogram on sampled data */
|
||||||
|
class Histogram {
|
||||||
|
public:
|
||||||
|
using const_iterator = std::vector<Bucket>::const_iterator;
|
||||||
|
|
||||||
|
public:
|
||||||
|
Histogram(uint32_t n_interior_bucket, double lo_bucket, double hi_bucket)
|
||||||
|
: n_interior_bucket_(n_interior_bucket),
|
||||||
|
lo_bucket_(lo_bucket),
|
||||||
|
hi_bucket_(hi_bucket),
|
||||||
|
bucket_v_(n_interior_bucket + 2)
|
||||||
|
{}
|
||||||
|
|
||||||
|
uint32_t n_sample() const { return n_sample_; }
|
||||||
|
uint32_t n_bucket() const { return n_interior_bucket_ + 2; }
|
||||||
|
|
||||||
|
double bucket_width() const { return (this->hi_bucket_ - this->lo_bucket_) / this->n_interior_bucket_; }
|
||||||
|
|
||||||
|
const_iterator begin() const { return bucket_v_.begin(); }
|
||||||
|
const_iterator end() const { return bucket_v_.end(); }
|
||||||
|
Bucket const & lookup(uint32_t ix) const { return this->bucket_v_[ix]; }
|
||||||
|
|
||||||
|
/* compute bucket representing pooled sample combining
|
||||||
|
* contents of buckets [lo .. hi)
|
||||||
|
*/
|
||||||
|
Bucket pooled(uint32_t lo, uint32_t hi) const {
|
||||||
|
/* NOTE: for pooled bucket, may want to compute "reliability variance",
|
||||||
|
* i.e. report
|
||||||
|
* M2 / (N - (sum(nk^2) / N))
|
||||||
|
* instead of
|
||||||
|
* M2 / (N - 1)
|
||||||
|
*/
|
||||||
|
|
||||||
|
uint32_t n_sample = 0;
|
||||||
|
double sum = 0.0;
|
||||||
|
double mean = 0.0;
|
||||||
|
double mom2 = 0.0;
|
||||||
|
|
||||||
|
for(uint32_t i = lo; i<hi; ++i) {
|
||||||
|
Bucket const & bucket = this->lookup(i);
|
||||||
|
|
||||||
|
n_sample += bucket.n_sample();
|
||||||
|
/* note that sum is not numerically well-behaved if summing
|
||||||
|
* over a large #of buckets
|
||||||
|
*/
|
||||||
|
sum += bucket.sum();
|
||||||
|
|
||||||
|
double prev_mean = mean;
|
||||||
|
/* relative weight of bucket b(i) relative to pooled statistics
|
||||||
|
* from buckets b(lo) .. b(i-1)
|
||||||
|
*/
|
||||||
|
double wt = (bucket.n_sample() / static_cast<double>(n_sample));
|
||||||
|
|
||||||
|
/* similar to SampleStatistics::update_online_mean() */
|
||||||
|
mean = prev_mean + wt * (bucket.mean() - prev_mean);
|
||||||
|
/* similar to SampleStatistics::update_online_moment2() */
|
||||||
|
mom2 = (mom2 + (bucket.n_sample()
|
||||||
|
* (bucket.mean() - prev_mean)
|
||||||
|
* (bucket.mean() - mean)));
|
||||||
|
}
|
||||||
|
|
||||||
|
return Bucket(n_sample, sum, mean, mom2);
|
||||||
|
} /*pooled*/
|
||||||
|
|
||||||
|
double bucket_lo_edge(uint32_t ix) const {
|
||||||
|
if(ix == 0) {
|
||||||
|
return -std::numeric_limits<double>::infinity();
|
||||||
|
} else {
|
||||||
|
return this->lo_bucket_ + (ix - 1) * this->bucket_width();
|
||||||
|
}
|
||||||
|
} /*bucket_lo_edge*/
|
||||||
|
|
||||||
|
double bucket_hi_edge(uint32_t ix) const {
|
||||||
|
if(ix < n_interior_bucket_ + 1)
|
||||||
|
return this->lo_bucket_ + ix * this->bucket_width();
|
||||||
|
else
|
||||||
|
return std::numeric_limits<double>::infinity();
|
||||||
|
} /*bucket_hi_edge*/
|
||||||
|
|
||||||
|
/* index (into .bucket_v[]) of bucket to use for a sample with value x */
|
||||||
|
uint32_t bucket_ix(double x) const {
|
||||||
|
if(x < this->lo_bucket_)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if(x < this->hi_bucket_)
|
||||||
|
return 1 + static_cast<uint32_t>((x - this->lo_bucket_) / this->bucket_width());
|
||||||
|
|
||||||
|
return this->n_interior_bucket_ + 1;
|
||||||
|
} /*bucket_ix*/
|
||||||
|
|
||||||
|
void include_sample(double x) {
|
||||||
|
uint32_t ix = this->bucket_ix(x);
|
||||||
|
|
||||||
|
++(this->n_sample_);
|
||||||
|
this->bucket_v_[ix].include_sample(x);
|
||||||
|
} /*include_sample*/
|
||||||
|
|
||||||
|
private:
|
||||||
|
/* #of samples across all buckets */
|
||||||
|
uint32_t n_sample_ = 0;
|
||||||
|
/* #of interior buckets: split [.lo_bucket, .hi_bucket] into
|
||||||
|
* equally-spaced intervals of width (.hi_bucket - .lo_bucket) / .n_bucket
|
||||||
|
*/
|
||||||
|
uint32_t n_interior_bucket_ = 0;
|
||||||
|
/* right edge of first bucket (left edge is -oo) */
|
||||||
|
double lo_bucket_ = 0.0;
|
||||||
|
/* left edge of last bucket (right edge is +oo) */
|
||||||
|
double hi_bucket_ = 0.0;
|
||||||
|
|
||||||
|
/* hisogram buckets */
|
||||||
|
std::vector<Bucket> bucket_v_;
|
||||||
|
}; /*Histogram*/
|
||||||
|
} /*namespace statistics*/
|
||||||
|
} /*namespace xo*/
|
||||||
|
|
||||||
|
/* end Histogram.hpp */
|
||||||
130
include/statistics/SampleStatistics.hpp
Normal file
130
include/statistics/SampleStatistics.hpp
Normal file
|
|
@ -0,0 +1,130 @@
|
||||||
|
/* @file SampleStatistics.hpp */
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
namespace xo {
|
||||||
|
namespace statistics {
|
||||||
|
/* accumulate statistics online for a sample:
 * sample count, mean, and sum of squared deviations from the mean (M2)
 */
class SampleStatistics {
public:
    SampleStatistics() = default;

    /* given we have a sample S(n) of size n with given mean,
     * compute mean of sample with one event x added
     *
     * x.     new sample value
     * n.     #of samples *preceding* x
     * mean.  mean of the n preceding samples
     */
    static double update_online_mean(double x, uint32_t n, double mean) {
        /* to update mean in a numerically stable way:
         * avoid computing running sample sum, to avoid
         * adding floating point numbers with distant magnitudes;
         * instead compute correction to the mean directly
         *
         *   mean(S(n))   := Sum{i=1..n} x(i) / n
         *
         * so
         *   mean(S(n+1))  = Sum{i=1..n+1} x(i) / (n+1)
         *
         *                 = (n / (n+1)) . (x(n+1) / n + Sum{i=1..n} x(i) / n)
         *
         *                 = x(n+1) / (n+1) + (n / (n+1)) . mean(S(n))
         *
         *                 = x(n+1) / (n+1) + mean(S(n)) - mean(S(n)) / (n+1)
         *
         *                 = mean(S(n)) + (x(n+1) - mean(S(n))) / (n+1)
         *
         * n+1 is formed in floating point, so it cannot wrap around
         * when n is at the top of the uint32_t range
         */
        return mean + ((1.0 / (n + 1.0)) * (x - mean));
    } /*update_online_mean*/

    /*
     * with S(n) = Sn = {set of n samples},
     *      u(n) = mean(Sn)
     *
     * (with mean, variance meaning "estimate for")
     *
     *   variance(Sn) := (1/n) . Sum{i=1..n} x(i)^2  -  ((1/n) . Sum{i=1..n} x(i))^2
     *
     * using Welford's recurrence for 2nd moment:
     *
     * define
     *   M2(n+1) := M2(n) + (x(n+1) - mean(S(n)))
     *                      . (x(n+1) - mean(S(n+1)))
     *
     * then unbiased variance estimate for S(n+1) is:
     *
     *   M2(n+1)
     *   -------
     *      n
     *
     * x.         new sample value
     * mean_np1.  mean estimate for S(n+1)
     * mean_n.    mean estimate for S(n)
     * moment2.   2nd moment for S(n)
     */
    static double update_online_moment2(double x,
                                        double mean_np1, double mean_n,
                                        double moment2)
    {
        return moment2 + (x - mean_n) * (x - mean_np1);
    } /*update_online_moment2*/

    uint32_t n_sample() const { return n_sample_; }
    double mean() const { return mean_; }
    double moment2() const { return moment2_; }
    /* 'sample variance' = variance estimate,
     * applying Bessel correction for sample bias
     *
     * meaningful for n_sample >= 2; reports 0.0 for smaller samples
     * (same convention as Bucket::sample_variance), instead of dividing
     * by an underflowed (n_sample - 1) or by zero
     */
    double sample_variance() const {
        return (n_sample_ > 1) ? moment2_ / (n_sample_ - 1) : 0.0;
    }

    /* biased variance estimate
     *   = (1 - 1/n) * .sample_variance(),   n = .n_sample
     *
     * .variance() -> .sample_variance() as sample size -> +oo
     *
     * meaningful for n_sample >= 1; reports 0.0 for an empty sample
     * instead of 0/0 = NaN
     */
    double variance() const {
        return (n_sample_ > 0) ? moment2_ / n_sample_ : 0.0;
    }

    /* add one sample, x: update count, mean and 2nd moment */
    void include_sample(double x) {
        /* statistics for S(n), i.e. before x is added */
        uint32_t const n = this->n_sample_;
        double const mean_n = this->mean_;

        double const mean_np1 = update_online_mean(x, n, mean_n);
        /* argument order follows the signature: (x, mean_np1, mean_n, moment2) */
        double const moment2_np1 = update_online_moment2(x, mean_np1, mean_n, this->moment2_);

        this->n_sample_ = n + 1;
        this->mean_ = mean_np1;
        this->moment2_ = moment2_np1;
    } /*include_sample*/

private:
    /* #of samples seen (#of calls to .include_sample()) */
    uint32_t n_sample_ = 0;
    /* estimated mean */
    double mean_ = 0.0;
    /* 2nd moment about the mean: running sum of squared deviations (Welford's M2).
     * divide by n_sample or (n_sample - 1) to get a variance estimate
     */
    double moment2_ = 0.0;
}; /*SampleStatistics*/
|
||||||
|
} /*namespace statistics*/
|
||||||
|
} /*namespace xo*/
|
||||||
|
|
||||||
|
/* end SampleStatistics.hpp */
|
||||||
Loading…
Add table
Add a link
Reference in a new issue