diff --git a/xo-statistics/.gitignore b/xo-statistics/.gitignore new file mode 100644 index 00000000..13c0afb7 --- /dev/null +++ b/xo-statistics/.gitignore @@ -0,0 +1,6 @@ +# clangd working space (see emacs+lsp) +.cache +# typical cmake build directory (source-tree-nephew) +.build* +# symlink to builddir/compile_commands.json; should be set manually in dev sandbox +compile_commands.json diff --git a/xo-statistics/CMakeLists.txt b/xo-statistics/CMakeLists.txt new file mode 100644 index 00000000..c01852a9 --- /dev/null +++ b/xo-statistics/CMakeLists.txt @@ -0,0 +1,39 @@ +# xo-statistics/CMakeLists.txt + +cmake_minimum_required(VERSION 3.10) + +project(xo_statistics VERSION 1.0) + +include(GNUInstallDirs) +include(cmake/xo-bootstrap-macros.cmake) + +xo_cxx_toplevel_options3() + +# ---------------------------------------------------------------- +# bespoke (usually temporary) c++ settings + +set(PROJECT_CXX_FLAGS "") +#set(PROJECT_CXX_FLAGS "-fconcepts-diagnostics-depth=2") +add_definitions(${PROJECT_CXX_FLAGS}) + +# ---------------------------------------------------------------- + +#add_subdirectory(example) +#add_subdirectory(utest) + +# ---------------------------------------------------------------- +# output targets + +set(SELF_LIB xo_statistics) +xo_add_headeronly_library(${SELF_LIB}) + +# ---------------------------------------------------------------- +# standard install + provide find_package() support + +xo_install_library4(${SELF_LIB} ${PROJECT_NAME}Targets) +xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets) + +# ---------------------------------------------------------------- +# install additional components + +#install(TARGETS statistics_ex1 DESTINATION bin/xo-statistics/example) diff --git a/xo-statistics/cmake/xo-bootstrap-macros.cmake b/xo-statistics/cmake/xo-bootstrap-macros.cmake new file mode 100644 index 00000000..aba31169 --- /dev/null +++ b/xo-statistics/cmake/xo-bootstrap-macros.cmake @@ -0,0 +1,35 @@ +# ---------------------------------------------------------------- +# for example: +# $ PREFIX=/usr/local # for example +# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build +# +# will get +# CMAKE_MODULE_PATH +# from xo-cmake-config --cmake-module-path +# +# and expect .cmake macros in +# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake +# ---------------------------------------------------------------- + +find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED) + +if ("${XO_CMAKE_CONFIG_EXECUTABLE}" STREQUAL "XO_CMAKE_CONFIG_EXECUTABLE-NOT_FOUND") + message(FATAL "could not find xo-cmake-config executable") +endif() + +message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}") + +if (NOT XO_SUBMODULE_BUILD) + if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix)) + # default to typical install location for xo-project-macros + execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path OUTPUT_VARIABLE CMAKE_MODULE_PATH) + message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}") + endif() +endif() + +# needs to have been installed somewhere on CMAKE_MODULE_PATH, +# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX) +# +include(xo_macros/xo_cxx) + +xo_cxx_bootstrap_message() diff --git a/xo-statistics/cmake/xo_statisticsConfig.cmake.in b/xo-statistics/cmake/xo_statisticsConfig.cmake.in new file mode 100644 index 00000000..9c15f36a --- /dev/null +++ b/xo-statistics/cmake/xo_statisticsConfig.cmake.in @@ -0,0 +1,4 @@ +@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") +check_required_components("@PROJECT_NAME@") diff --git a/xo-statistics/include/xo/statistics/Accumulator.hpp b/xo-statistics/include/xo/statistics/Accumulator.hpp new file mode 100644 index 00000000..69763e38 --- /dev/null +++ b/xo-statistics/include/xo/statistics/Accumulator.hpp @@ -0,0 +1,10 @@ +/* @file Accumulator.hpp */ + +namespace xo { + nmaespace statistics { + class Accumulator { + }; /*Accumulator*/ + } /*namespace statistics*/ +} /*namespace xo*/ + +/* end Accumulator.hpp */ diff --git a/xo-statistics/include/xo/statistics/Histogram.hpp b/xo-statistics/include/xo/statistics/Histogram.hpp new file mode 100644 index 00000000..d2219f23 --- /dev/null +++ b/xo-statistics/include/xo/statistics/Histogram.hpp @@ -0,0 +1,217 @@ +/* @file Histogram.hpp */ + +#pragma once + +#include "statistics/SampleStatistics.hpp" +#include "logutil/scope.hpp" +#include +#include +#include + +namespace xo { + namespace statistics { + /* sample statistics for a histogram bucket + * (editorial: compare with distribution::Counter) + */ + class Bucket { + public: + Bucket() = default; + Bucket(uint32_t n_sample, double sum, double mean, double mom2) + : n_sample_(n_sample), sum_(sum), mean_(mean), moment2_(mom2) {} + + uint32_t n_sample() const { return n_sample_; } + double sum() const { return sum_; } + double mean() const { return mean_; } + double sample_variance() const { return (n_sample_ > 1) ? moment2_ / (n_sample_ - 1) : 0.0; } + double standard_error() const { return ::sqrt(this->sample_variance()); } + + /* to estimate standard error of the mean: + * 0. let nk = .n_sample be the #of samples falling into this bin. + * n is the total #of samples across all bins. + * (i.e. Histogram.n_sample) + * 1. imagine probability of a sample falling in this bin + * is the observed frequency p = (.n_sample / n) + * 2. imagine a Bernoulli random variable Bp(i) associated with each sample x(i) + * {1, with probability p; 0 with probability q=1-p}) + * 3. each Bp(i) has mean p, variance p(1-p) + * 4. sum of the Bp(1) .. Bp(n) has mean n.p = nk, + * variance + * n.p.(1-p) + * = n.(nk/n).(1 - nk/n) + * = nk.(1 - nk/n) + * (by central limit theorem we can treat this as approximately normal + * for sufficiently large n) + * 5. standard error of Sum{Bp(i)} + * will be + * sqrt(nk.(1 - nk/n)) + */ + double n_sample_stderr(uint32_t n) const { + double nr = 1.0 / n; + uint32_t nk = this->n_sample_; + + return ::sqrt(nk * (1.0 - nk * nr)); + } /*n_sample_stderr*/ + + /* add one sample, x, to this bucket */ + void include_sample(double x) { + using logutil::scope; + using logutil::xtag; + + constexpr char const * c_self = "Bucket::include_sample"; + constexpr bool c_logging_enabled = false; + + /* size of sample _before_ adding x */ + int n = this->n_sample_; + + this->n_sample_ = n+1; + this->sum_ += x; + + double mean_n = this->mean_; + double mom2_n = this->moment2_; + double mean_np1 = SampleStatistics::update_online_mean(x, n, mean_n); + double mom2_np1 = SampleStatistics::update_online_moment2(x, + mean_np1, mean_n, + mom2_n); + scope lscope(c_self, c_logging_enabled); + if(c_logging_enabled) { + lscope.log("update", + xtag("x", x), xtag("n", n), + xtag("sum", sum_), + xtag("mean(n)", mean_n), + xtag("mom2(n)", mom2_n), + xtag("mean(n+1)", mean_np1), + xtag("mom2(n+1)", mom2_np1)); + } + + this->mean_ = mean_np1; + this->moment2_ = mom2_np1; + } /*include_sample*/ + + private: + /* #of samples in this bucket (will be #of times .sample() has been called) */ + uint32_t n_sample_ = 0; + /* sum of samples in this bucket */ + double sum_ = 0.0; + /* mean of values in this bucket + * -- use online algo to avoid catastrophic errors for large #samples + */ + double mean_ = 0.0; + double moment2_ = 0.0; + }; /*Bucket*/ + + /* accumulate histogram on sampled data */ + class Histogram { + public: + using const_iterator = std::vector::const_iterator; + + public: + Histogram(uint32_t n_interior_bucket, double lo_bucket, double hi_bucket) + : n_interior_bucket_(n_interior_bucket), + lo_bucket_(lo_bucket), + hi_bucket_(hi_bucket), + bucket_v_(n_interior_bucket + 2) + {} + + uint32_t n_sample() const { return n_sample_; } + uint32_t n_bucket() const { return n_interior_bucket_ + 2; } + + double bucket_width() const { return (this->hi_bucket_ - this->lo_bucket_) / this->n_interior_bucket_; } + + const_iterator begin() const { return bucket_v_.begin(); } + const_iterator end() const { return bucket_v_.end(); } + Bucket const & lookup(uint32_t ix) const { return this->bucket_v_[ix]; } + + /* compute bucket representing pooled sample combining + * contents of buckets [lo .. hi) + */ + Bucket pooled(uint32_t lo, uint32_t hi) const { + /* NOTE: for pooled bucket, may want to compute "reliability variance", + * i.e. report + * M2 / (N - (sum(nk^2) / N)) + * instead of + * M2 / (N - 1) + */ + + uint32_t n_sample = 0; + double sum = 0.0; + double mean = 0.0; + double mom2 = 0.0; + + for(uint32_t i = lo; ilookup(i); + + n_sample += bucket.n_sample(); + /* note that sum is not numerically well-behaved if summing + * over a large #of buckets + */ + sum += bucket.sum(); + + double prev_mean = mean; + /* relative weight of bucket b(i) relative to pooled statistics + * from buckets b(lo) .. b(i-1) + */ + double wt = (bucket.n_sample() / static_cast(n_sample)); + + /* similar to SampleStatistics::update_online_mean() */ + mean = prev_mean + wt * (bucket.mean() - prev_mean); + /* similar to SampleStatistics::update_online_moment2() */ + mom2 = (mom2 + (bucket.n_sample() + * (bucket.mean() - prev_mean) + * (bucket.mean() - mean))); + } + + return Bucket(n_sample, sum, mean, mom2); + } /*pooled*/ + + double bucket_lo_edge(uint32_t ix) const { + if(ix == 0) { + return -std::numeric_limits::infinity(); + } else { + return this->lo_bucket_ + (ix - 1) * this->bucket_width(); + } + } /*bucket_lo_edge*/ + + double bucket_hi_edge(uint32_t ix) const { + if(ix < n_interior_bucket_ + 1) + return this->lo_bucket_ + ix * this->bucket_width(); + else + return std::numeric_limits::infinity(); + } /*bucket_hi_edge*/ + + /* index (into .bucket_v[]) of bucket to use for a sample with value x */ + uint32_t bucket_ix(double x) const { + if(x < this->lo_bucket_) + return 0; + + if(x < this->hi_bucket_) + return 1 + static_cast((x - this->lo_bucket_) / this->bucket_width()); + + return this->n_interior_bucket_ + 1; + } /*bucket_ix*/ + + void include_sample(double x) { + uint32_t ix = this->bucket_ix(x); + + ++(this->n_sample_); + this->bucket_v_[ix].include_sample(x); + } /*include_sample*/ + + private: + /* #of samples across all buckets */ + uint32_t n_sample_ = 0; + /* #of interior buckets: split [.lo_bucket, .hi_bucket] into + * equally-spaced intervals of width (.hi_bucket - .lo_bucket) / .n_bucket + */ + uint32_t n_interior_bucket_ = 0; + /* right edge of first bucket (left edge is -oo) */ + double lo_bucket_ = 0.0; + /* left edge of last bucket (right edge is +oo) */ + double hi_bucket_ = 0.0; + + /* hisogram buckets */ + std::vector bucket_v_; + }; /*Histogram*/ + } /*namespace statistics*/ +} /*namespace xo*/ + +/* end Histogram.hpp */ diff --git a/xo-statistics/include/xo/statistics/SampleStatistics.hpp b/xo-statistics/include/xo/statistics/SampleStatistics.hpp new file mode 100644 index 00000000..a7d595b0 --- /dev/null +++ b/xo-statistics/include/xo/statistics/SampleStatistics.hpp @@ -0,0 +1,130 @@ +/* @file SampleStatistics.hpp */ + +#pragma once + +#include + +namespace xo { + namespace statistics { + /* accumlate statistics online for a sample */ + class SampleStatistics { + public: + SampleStatistics() = default; + + /* given we have a sample S(n) of size n with given mean, + * compute mean of sample with one event x added + * + * n. #of samples *preceding* x + */ + static double update_online_mean(double x, uint32_t n, double mean) { + /* to update mean in a numerically stable way: + * avoid computing running sample sum, to avoid + * adding floating point numbers with distant magnitudes; + * instead compute correction to the mean directly + * + * n / x(i) \ + * mean(Sn) := Sum | ----- | + * i=1 \ n / + * + * so + * n+1 / x(i) \ + * mean(S(n+1)) = Sum | ----- | + * i=1 \ n+1 / + * + * n n+1 / x(i) \ + * = --- Sum | ----- | + * n+1 i=1 \ n / + * + * n / x(n+1) n x(i) \ + * = --- | ------ + Sum ---- | + * n+1 \ n i=1 n / + * + * x(n+1) / n \ + * = ------ + | --- . mean(S(n)) | + * n+1 \ n+1 / + * + * x(n+1) / -1 \ + * = ------ + mean(S(n)) + | --- . mean(S(n)) | + * n+1 \ n+1 / + * + * = mean(S(n)) + (x(n+1) - mean(S(n))) / (n+1) + */ + return mean + ((1.0 / (n+1)) * (x - mean)); + } /*update_online_mean*/ + + /* + * with S(n) = Sn = {set of n samples}, + * u(n) = mean(Sn) + * + * (with mean, variance meaning "estimate for") + * + * 1 n / 2 \ / 1 \ 2 + * variance(Sn) := --- . Sum | (x(i) | - | --- . Sum x(i) | + * n i=1 \ / \ n i=1 / + * + * using Welford's recurrence for 2nd moment: + * + * define + * M2(n+1) := M2(n) + (x(n+1) - mean(S(n))) + * . (x(n+1) - mean(S(n+1)) + * + * then unbiased variance estimate for S(n+1) is: + * + * M2(n+1) + * ------- + * n + * + * x. new sample value + * mean_np1. mean estimate for S(n+1) + * mean_n. mean estimate for S(n) + * moment2. 2nd moment for S(n) + */ + static double update_online_moment2(double x, + double mean_np1, double mean_n, + double moment2) + { + return moment2 + (x - mean_n) * (x - mean_np1); + } /*update_online_moment2*/ + + uint32_t n_sample() const { return n_sample_; } + double mean() const { return mean_; } + double moment2() const { return moment2_; } + /* 'sample variance' = variance estimate, + * applying Bessel correction for sample bias + * + * require: n_sample >= 2 + */ + double sample_variance() const { return moment2_ / (n_sample_ - 1); } + + /* biased variance estimate + * = (1 - 1/(n+1)) * .sample_variance() + * + * .variance() -> .sample_variance() as sample size -> +oo + * + * require: n_sample >= 1 + */ + double variance() const { return moment2_ / n_sample_; } + + void include_sample(double x) { + /* n+1 */ + uint32_t np1 = this->n_sample_ + 1; + + double mean_np1 = update_online_mean(x, this->n_sample_, this->mean_); + double moment2_np1 = update_online_moment2(x, this->mean_, mean_np1, this->moment2_); + + this->n_sample_ = np1; + this->mean_ = mean_np1; + this->moment2_ = moment2_np1; + } /*include_sample*/ + + private: + uint32_t n_sample_ = 0; + /* estimated mean */ + double mean_ = 0.0; + /* estimated 2nd moment E[X^2] */ + double moment2_ = 0.0; + }; /*SampleStatistics*/ + } /*namespace statistics*/ +} /*namespace xo*/ + +/* end SampleStatistics.hpp */