Add 'xo-distribution/' from commit '036ca5d817'
git-subtree-dir: xo-distribution git-subtree-mainline:4e022df686git-subtree-split:036ca5d817
This commit is contained in:
commit
ecd019e8bb
18 changed files with 1644 additions and 0 deletions
6
xo-distribution/.gitignore
vendored
Normal file
6
xo-distribution/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
# clangd working space (see emacs+lsp)
|
||||
.cache
|
||||
# typical cmake build directory (source-tree-nephew)
|
||||
.build*
|
||||
# symlink to builddir/compile_commands.json; should be set manually in dev sandbox
|
||||
compile_commands.json
|
||||
16
xo-distribution/CMakeLists.txt
Normal file
16
xo-distribution/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
# xo-distribution/CMakeLists.txt
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
project(xo_distribution VERSION 1.0)
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(cmake/xo-bootstrap-macros.cmake)
|
||||
|
||||
xo_cxx_toplevel_options3()
|
||||
|
||||
#add_subdirectory(example)
|
||||
add_subdirectory(src/distribution) # note refcnt dep -> not header-only
|
||||
add_subdirectory(utest)
|
||||
|
||||
xo_export_cmake_config(${PROJECT_NAME} ${PROJECT_VERSION} ${PROJECT_NAME}Targets)
|
||||
35
xo-distribution/cmake/xo-bootstrap-macros.cmake
Normal file
35
xo-distribution/cmake/xo-bootstrap-macros.cmake
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# ----------------------------------------------------------------
|
||||
# for example:
|
||||
# $ PREFIX=/usr/local # for example
|
||||
# $ cmake -DCMAKE_MODULE_PATH=prefix -DCMAKE_INSTALL_PREFIX=$PREFIX -B .build
|
||||
#
|
||||
# will get
|
||||
# CMAKE_MODULE_PATH
|
||||
# from xo-cmake-config --cmake-module-path
|
||||
#
|
||||
# and expect .cmake macros in
|
||||
# CMAKE_MODULE_PATH/xo_macros/xo_cxx.cmake
|
||||
# ----------------------------------------------------------------
|
||||
|
||||
find_program(XO_CMAKE_CONFIG_EXECUTABLE NAMES xo-cmake-config REQUIRED)
|
||||
|
||||
if ("${XO_CMAKE_CONFIG_EXECUTABLE}" STREQUAL "XO_CMAKE_CONFIG_EXECUTABLE-NOT_FOUND")
|
||||
message(FATAL "could not find xo-cmake-config executable")
|
||||
endif()
|
||||
|
||||
message(STATUS "XO_CMAKE_CONFIG_EXECUTABLE=${XO_CMAKE_CONFIG_EXECUTABLE}")
|
||||
|
||||
if (NOT XO_SUBMODULE_BUILD)
|
||||
if (("${CMAKE_MODULE_PATH}" STREQUAL "") OR ("${CMAKE_MODULE_PATH}" STREQUAL prefix))
|
||||
# default to typical install location for xo-project-macros
|
||||
execute_process(COMMAND ${XO_CMAKE_CONFIG_EXECUTABLE} --cmake-module-path OUTPUT_VARIABLE CMAKE_MODULE_PATH)
|
||||
message(STATUS "CMAKE_MODULE_PATH=${CMAKE_MODULE_PATH}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# needs to have been installed somewhere on CMAKE_MODULE_PATH,
|
||||
# (e.g. from xo-cmake with the same value for CMAKE_INSTALL_PREFIX)
|
||||
#
|
||||
include(xo_macros/xo_cxx)
|
||||
|
||||
xo_cxx_bootstrap_message()
|
||||
6
xo-distribution/cmake/xo_distributionConfig.cmake.in
Normal file
6
xo-distribution/cmake/xo_distributionConfig.cmake.in
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
@PACKAGE_INIT@
|
||||
|
||||
include(CMakeFindDependencyMacro)
|
||||
find_dependency(refcnt)
|
||||
include("${CMAKE_CURRENT_LIST_DIR}/xo_distributionTargets.cmake")
|
||||
check_required_components("@PROJECT_NAME@")
|
||||
20
xo-distribution/include/xo/distribution/Distribution.hpp
Normal file
20
xo-distribution/include/xo/distribution/Distribution.hpp
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
/* @file Distribution.hpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "xo/refcnt/Refcounted.hpp"
|
||||
|
||||
namespace xo {
|
||||
namespace distribution {
|
||||
/* abstract api for a cumulative probability distribution.
|
||||
* over supplied Domain
|
||||
*/
|
||||
template<typename Domain>
|
||||
class Distribution : public ref::Refcount {
|
||||
public:
|
||||
virtual double cdf(Domain const & x) const = 0;
|
||||
}; /*Distribution*/
|
||||
} /*namespace distribution*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end Distribution.hpp */
|
||||
41
xo-distribution/include/xo/distribution/Empirical.hpp
Normal file
41
xo-distribution/include/xo/distribution/Empirical.hpp
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
/* @file Empirical.hpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "xo/distribution/Distribution.hpp"
|
||||
#include "xo/ordinaltree/RedBlackTree.hpp"
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
#include <map>
|
||||
#include <cstdint>
|
||||
|
||||
namespace xo {
|
||||
namespace distribution {
|
||||
|
||||
/* representation for counter,
|
||||
* recording #of samples with the same value
|
||||
*/
|
||||
using CounterRep = uint32_t;
|
||||
|
||||
/* counter; for use with StdEmpirical distribution below
|
||||
*/
|
||||
class Counter {
|
||||
public:
|
||||
Counter() = default;
|
||||
Counter(CounterRep n) : count_(n) {}
|
||||
|
||||
CounterRep count() const { return count_; }
|
||||
|
||||
void incr() { ++count_; }
|
||||
|
||||
operator CounterRep () const { return count_; }
|
||||
|
||||
Counter & operator+=(CounterRep n) { count_ += n; return *this; }
|
||||
|
||||
private:
|
||||
CounterRep count_ = 0;
|
||||
}; /*Counter*/
|
||||
|
||||
} /*namespace disitribution*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end Empirical.hpp */
|
||||
748
xo-distribution/include/xo/distribution/ExplicitDist.hpp
Normal file
748
xo-distribution/include/xo/distribution/ExplicitDist.hpp
Normal file
|
|
@ -0,0 +1,748 @@
|
|||
/* file ExplicitDist.hpp
|
||||
*
|
||||
* author: Roland Conybeare, Oct 2022
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Distribution.hpp"
|
||||
#include "Normal.hpp"
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
#include "xo/indentlog/print/vector.hpp"
|
||||
#include "xo/indentlog/print/tostr.hpp"
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
|
||||
namespace xo {
|
||||
using xo::xtag;
|
||||
|
||||
namespace distribution {
|
||||
class ProbabilityBucket {
|
||||
public:
|
||||
ProbabilityBucket() = default;
|
||||
|
||||
double weight() const { return wt_; }
|
||||
double cdf() const { return cdf_; }
|
||||
|
||||
/* note: when calling this, must invalidate ExplicitDist.cdf_valid_flag */
|
||||
void scale_weight(double k) { this->wt_ *= k; }
|
||||
|
||||
void assign_weight(double w) { this->wt_ = w; }
|
||||
/* implementation method: only ExplicitDist.renormalize() should call this */
|
||||
void assign_cdf(double x) { this->cdf_ = x; }
|
||||
|
||||
void display(std::ostream & os) const {
|
||||
using xo::xtag;
|
||||
|
||||
os << "<ProbabilityBucket"
|
||||
<< xtag("wt", this->wt_)
|
||||
<< xtag("cdf", this->cdf_)
|
||||
<< ">";
|
||||
} /*display*/
|
||||
|
||||
private:
|
||||
/* probability assigned to this bucket.
|
||||
* updated in-place by ExplicitDist.include_sample()
|
||||
*/
|
||||
double wt_ = 0.0;
|
||||
/* cumulative probability assigned to this bucket b
|
||||
* and all buckets b[i] for domain values < b
|
||||
*
|
||||
* invalidated by ExplicitDist.scale_bucket()
|
||||
* see ExplicitDist.
|
||||
*/
|
||||
double cdf_ = 0.0;
|
||||
}; /*ProbabilityBucket*/
|
||||
|
||||
inline std::ostream & operator<<(std::ostream & os, ProbabilityBucket const & x) {
|
||||
x.display(os);
|
||||
return os;
|
||||
} /*operator<<*/
|
||||
|
||||
namespace detail {
|
||||
/* three kinds of index here:
|
||||
* 1. lo: non-negative index into ExplicitDist.lo_v_[]
|
||||
* 2. hi: non-negative index into ExplicitDist.hi_v_[]
|
||||
* 3. signed: if >=0: index into ExplicitDist.hi_v_[]
|
||||
* if <0: index into ExplicitDist.lo_v_[]
|
||||
*
|
||||
* with lz=.lo_v.size(), hz=hi_v.size():
|
||||
*
|
||||
* .hi_v[0] .hi_v[hz-1]
|
||||
* | v v
|
||||
* |
|
||||
* +--+--+ +--+--|--+--+ +--+
|
||||
* | | | ... | | | | | ... | |
|
||||
* +--+--+ +--+--|--+--+ +--+
|
||||
* ^ ^
|
||||
* .lo_v[lz-1] .lo_v[0]
|
||||
*
|
||||
* ^ ^ ^ ^
|
||||
* signed -lz -1| 0 ... hz-1
|
||||
*/
|
||||
class ExplicitDistIndexUtil {
|
||||
public:
|
||||
static size_t signed2hi(int32_t signed_ix) { return signed_ix; }
|
||||
static size_t signed2lo(int32_t signed_ix) { return -signed_ix - 1; }
|
||||
static int32_t lo2signed(size_t lo_ix) { return -static_cast<int32_t>(lo_ix)-1; }
|
||||
static int32_t hi2signed(size_t hi_ix) { return hi_ix; }
|
||||
|
||||
static ProbabilityBucket const * signed2bucket_c(int32_t signed_ix,
|
||||
std::vector<ProbabilityBucket> const * lo_v,
|
||||
std::vector<ProbabilityBucket> const * hi_v) {
|
||||
if (signed_ix >= 0) {
|
||||
size_t hi_ix = signed2hi(signed_ix);
|
||||
|
||||
if (hi_v && (hi_ix < hi_v->size()))
|
||||
return &(*hi_v)[hi_ix];
|
||||
} else {
|
||||
size_t lo_ix = signed2lo(signed_ix);
|
||||
|
||||
if (lo_v && (lo_ix < lo_v->size()))
|
||||
return &(*lo_v)[lo_ix];
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
} /*signed2bucket_c*/
|
||||
|
||||
static ProbabilityBucket * signed2bucket(int32_t signed_ix,
|
||||
std::vector<ProbabilityBucket> * lo_v,
|
||||
std::vector<ProbabilityBucket> * hi_v) {
|
||||
ProbabilityBucket const * b = signed2bucket_c(signed_ix, lo_v, hi_v);
|
||||
return const_cast<ProbabilityBucket *>(b);
|
||||
} /*signed2bucket*/
|
||||
|
||||
}; /*ExplicitDistIndexUtil*/
|
||||
|
||||
class ExplicitDistIterator : public ExplicitDistIndexUtil {
|
||||
public:
|
||||
ExplicitDistIterator() = default;
|
||||
ExplicitDistIterator(int32_t signed_ix,
|
||||
std::vector<ProbabilityBucket> * lo_v,
|
||||
std::vector<ProbabilityBucket> * hi_v)
|
||||
: signed_ix_{signed_ix}, p_lo_v_{lo_v}, p_hi_v_{hi_v} {}
|
||||
|
||||
ProbabilityBucket & operator*() {
|
||||
ProbabilityBucket * pb = signed2bucket(this->signed_ix_,
|
||||
this->p_lo_v_,
|
||||
this->p_hi_v_);
|
||||
if (!pb) {
|
||||
throw std::runtime_error("ExplicitDistIterator: attempt to deref invalid iterator");
|
||||
}
|
||||
|
||||
return *pb;
|
||||
} /*operator**/
|
||||
|
||||
ExplicitDistIterator & operator++() { ++(this->signed_ix_); return *this; }
|
||||
ExplicitDistIterator & operator--() { --(this->signed_ix_); return *this; }
|
||||
|
||||
private:
|
||||
/* signed iterator */
|
||||
int32_t signed_ix_ = 0;
|
||||
/* will be ExplicitDist.lo_v[] */
|
||||
std::vector<ProbabilityBucket> * p_lo_v_ = nullptr;
|
||||
/* will be ExplicitDist.hi_v[] */
|
||||
std::vector<ProbabilityBucket> * p_hi_v_ = nullptr;
|
||||
}; /*ExplicitDistIterator*/
|
||||
} /*namespace detail*/
|
||||
|
||||
/* explicit distribution with buckets.
|
||||
* (if you want to avoid bucketing at the expense of memory,
|
||||
* see StdEmpirical)
|
||||
*
|
||||
* sample-independent buckets can be faster to the extent
|
||||
* {#of buckets} << {#of samples}
|
||||
*
|
||||
* in particular
|
||||
* | StdEmpirical | PackedEmpirical
|
||||
* ------------------------------+--------------+-----------------
|
||||
* update existing sample/bucket | O(log(n)) | O(1)
|
||||
* create new sample/bucket | O(log(n)) | O(log(n))
|
||||
* cdf | O(log(n)) | O(n)
|
||||
*
|
||||
* PackedEmpirical offers scaling + renormalization
|
||||
*
|
||||
* since .cdf() is slow, .ks_stat_1sided() is supported only in StdEmpirical.
|
||||
* (could generalize in future to some other implementation with fast .cdf())
|
||||
*
|
||||
* Require:
|
||||
* Domain is something with metric, e.g. int|double.
|
||||
* categorical domain not supported here.
|
||||
*
|
||||
* see also: statistics/Histogram. Histogram keeps more per-bucket statistics
|
||||
*/
|
||||
template<typename Domain>
|
||||
class ExplicitDist : public Distribution<Domain>, public detail::ExplicitDistIndexUtil {
|
||||
public:
|
||||
using WeightVector = std::vector<ProbabilityBucket>;
|
||||
using iterator = detail::ExplicitDistIterator;
|
||||
|
||||
public:
|
||||
static rp<ExplicitDist> make(Domain bucket_dx, Domain ref_value) {
|
||||
return new ExplicitDist(bucket_dx, ref_value);
|
||||
}
|
||||
/* create distribution with n buckets of width bucket_dx,
|
||||
* covering range [ref_value, ref_value + n * bucket_dx]
|
||||
*/
|
||||
static rp<ExplicitDist> make_n(size_t n, Domain bucket_dx, Domain ref_value) {
|
||||
return new ExplicitDist(n, bucket_dx, ref_value);
|
||||
}
|
||||
|
||||
size_t n_bucket() const { return this->lo_v_.size() + this->hi_v_.size(); }
|
||||
|
||||
/* lub domain value with .cdf(lo) = 0 */
|
||||
Domain lo() const {
|
||||
std::size_t lz = this->lo_v_.size();
|
||||
|
||||
return this->ref_value_ - lz * this->bucket_dx_;
|
||||
} /*lo*/
|
||||
|
||||
/* glb domain value with .cdf(hi) = 1 */
|
||||
Domain hi() const {
|
||||
std::size_t hz = this->hi_v_.size();
|
||||
|
||||
return this->ref_value_ + hz * this->bucket_dx_;
|
||||
} /*hi*/
|
||||
|
||||
Domain bucket_lo(int32_t signed_ix) const {
|
||||
return this->ref_value_ + signed_ix * this->bucket_dx_;
|
||||
} /*bucket_lo*/
|
||||
|
||||
Domain bucket_mid(int32_t signed_ix) const {
|
||||
return this->ref_value_ + (0.5 + signed_ix) * this->bucket_dx_;
|
||||
} /*bucket_mid*/
|
||||
|
||||
Domain bucket_hi(int32_t signed_ix) const {
|
||||
return this->bucket_lo(signed_ix + 1);
|
||||
} /*bucket_hi*/
|
||||
|
||||
iterator begin() { return iterator(lo2signed(this->lo_v_.size() - 1),
|
||||
&(this->lo_v_),
|
||||
&(this->hi_v_)); }
|
||||
iterator end() { return iterator(hi2signed(this->hi_v_.size()),
|
||||
&(this->lo_v_),
|
||||
&(this->hi_v_)); }
|
||||
|
||||
/* probability density at domain value x */
|
||||
double density(Domain const & x) const {
|
||||
/* careful! may need to renormalize */
|
||||
{
|
||||
ExplicitDist<Domain> * self = const_cast<ExplicitDist<Domain> *>(this);
|
||||
|
||||
self->check_renormalize();
|
||||
}
|
||||
|
||||
auto v = this->lookup_bucket(x);
|
||||
ProbabilityBucket const * b = v.first;
|
||||
|
||||
if (b) {
|
||||
/* probability density is constant within each bucket */
|
||||
return b->weight() / this->bucket_dx_;
|
||||
} else {
|
||||
return 0.0;
|
||||
}
|
||||
} /*density*/
|
||||
|
||||
/* pair {bucket glb, density} for all buckets, in increasing domain order */
|
||||
std::vector<std::pair<Domain, double>> density_v() const {
|
||||
/* careful! may need to renormalize */
|
||||
{
|
||||
ExplicitDist<Domain> * self = const_cast<ExplicitDist<Domain> *>(this);
|
||||
|
||||
self->check_renormalize();
|
||||
}
|
||||
|
||||
size_t n = this->n_bucket();
|
||||
std::vector<std::pair<Domain, double>> retval(n);
|
||||
|
||||
double w2d = 1.0 / this->bucket_dx_;
|
||||
|
||||
for(size_t i=0; i<n; ++i) {
|
||||
Domain lo = this->bucket_lo(i);
|
||||
ProbabilityBucket const * b = this->lookup_signed_bucket(i);
|
||||
|
||||
assert(b);
|
||||
|
||||
double density = b->weight() * w2d;
|
||||
|
||||
retval[i] = std::make_pair(lo, density);
|
||||
}
|
||||
|
||||
return retval;
|
||||
} /*density_v*/
|
||||
|
||||
/* return:
|
||||
* .first signed bucket index ix;
|
||||
* refers to .lo_v[1-ix] if ix<0;
|
||||
* refers to .hi_v[+ix] if ix>=0
|
||||
* .second fractional weight within bucket associated with x.
|
||||
* not scaled by ProbabilityWeight.weight
|
||||
*/
|
||||
std::pair<int32_t, double> signed_bucket_index(Domain const & x) const {
|
||||
double ix_f = ::floor((x - this->ref_value_) / this->bucket_dx_);
|
||||
int32_t ix = ix_f;
|
||||
|
||||
/*
|
||||
* ^
|
||||
* |............
|
||||
* | : |
|
||||
* | wt : | unit area
|
||||
* | : |
|
||||
* +-----------+
|
||||
* bucket_lo ^ bucket_lo + .bucket_dx
|
||||
* x
|
||||
*/
|
||||
|
||||
Domain bucket_lo = this->ref_value_ + (ix * this->bucket_dx_);
|
||||
double wt = (x - bucket_lo) / this->bucket_dx_;
|
||||
|
||||
return std::make_pair(ix, wt);
|
||||
} /*signed_bucket_index*/
|
||||
|
||||
ProbabilityBucket const * lookup_signed_bucket(int32_t signed_ix) const {
|
||||
return signed2bucket_c(signed_ix, &(this->lo_v_), &(this->hi_v_));
|
||||
} /*lookup_signed_bucket*/
|
||||
|
||||
/* non-const version with write access */
|
||||
ProbabilityBucket * lookup_signed_bucket(int32_t signed_ix) {
|
||||
return signed2bucket(signed_ix, &(this->lo_v_), &(this->hi_v_));
|
||||
} /*lookup_signed_bucket*/
|
||||
|
||||
/* return null pointer if bucket that would contain x not represented
|
||||
* .second is fractional weight (relative to unity) within selected bucket
|
||||
*/
|
||||
std::pair<ProbabilityBucket const *, double> lookup_bucket(Domain const & x) const {
|
||||
/* signed index.
|
||||
* ix>=0 => .lo_v[]
|
||||
* ix< 0 => .hi_v[]
|
||||
*/
|
||||
std::pair<int32_t, double> v = this->signed_bucket_index(x);
|
||||
|
||||
int32_t ix = v.first;
|
||||
double fraction_wt = v.second;
|
||||
ProbabilityBucket const * pw = this->lookup_signed_bucket(ix);
|
||||
|
||||
return std::make_pair(pw, fraction_wt);
|
||||
} /*lookup_bucket*/
|
||||
|
||||
/* non-const version */
|
||||
std::pair<ProbabilityBucket *, double> lookup_bucket(Domain const & x) {
|
||||
ExplicitDist<Domain> const * c_self = this;
|
||||
|
||||
std::pair<ProbabilityBucket const *, double> v = c_self->lookup_bucket(x);
|
||||
ProbabilityBucket * b = const_cast<ProbabilityBucket *>(v.first);
|
||||
double fraction_wt = v.second;
|
||||
|
||||
return std::make_pair(b, fraction_wt);
|
||||
} /*lookup_bucket*/
|
||||
|
||||
/* like .lookup_bucket(), but create bucket if not already present
|
||||
* .second reports fractional weight (relative to unity) within bucket
|
||||
* that contains x.
|
||||
*/
|
||||
std::pair<ProbabilityBucket*, double> establish_bucket(Domain const & x) {
|
||||
/* signed index.
|
||||
* ix>=0 => .lo_v[]
|
||||
* ix< 0 => .hi_v[]
|
||||
*/
|
||||
std::pair<int32_t, double> v = this->signed_bucket_index(x);
|
||||
|
||||
int32_t ix = v.first;
|
||||
double fraction_wt = v.second;
|
||||
ProbabilityBucket * pw = nullptr;
|
||||
|
||||
if (ix >= 0) {
|
||||
size_t hi_ix = signed2hi(ix);
|
||||
|
||||
if (hi_ix >= this->hi_v_.size()) {
|
||||
/* need to expand .hi_v[] */
|
||||
this->hi_v_.resize(hi_ix+1);
|
||||
}
|
||||
|
||||
pw = &(this->hi_v_[hi_ix]);
|
||||
} else {
|
||||
size_t lo_ix = signed2lo(ix);
|
||||
|
||||
if (lo_ix >= this->lo_v_.size()) {
|
||||
/* need to expand .lo_v[] */
|
||||
this->lo_v_.resize(lo_ix+1);
|
||||
}
|
||||
|
||||
pw = &(this->lo_v_[lo_ix]);
|
||||
}
|
||||
|
||||
return std::make_pair(pw, fraction_wt);
|
||||
} /*establish_bucket*/
|
||||
|
||||
void scale_bucket_by_signed_index(int32_t signed_ix, double k) {
|
||||
ProbabilityBucket * b = this->lookup_signed_bucket(signed_ix);
|
||||
|
||||
if (b) {
|
||||
b->scale_weight(k);
|
||||
|
||||
this->cdf_valid_flag_ = false;
|
||||
} else {
|
||||
/* if bucket isn't represented, then corresponding weight is zero,
|
||||
* both before and after scaling. In this special case
|
||||
* representation didn't change -> don't invalidate cdf
|
||||
*/
|
||||
}
|
||||
} /*scale_bucket_by_signed_index*/
|
||||
|
||||
/* scale probability assigned to bucket for domain value x, by factor k */
|
||||
void scale_bucket(Domain const & x, double k) {
|
||||
assert(k >= 0.0);
|
||||
|
||||
std::pair<int32_t, double> v = this->signed_bucket_index(x);
|
||||
|
||||
this->scale_bucket_by_signed_index(v.first, k);
|
||||
} /*scale_bucket*/
|
||||
|
||||
/* scale probability weight for buckets in [lo, hi] by fn(p)
|
||||
* for a point p in each bucket, however:
|
||||
* under no circumstances try to evaluate fn() outside [lo, hi]
|
||||
* (relevant if either lo/hi fall inside a bucket)
|
||||
*/
|
||||
template<typename Function>
|
||||
void scale_interval(Domain const & lo,
|
||||
Domain const & hi,
|
||||
Function && fn)
|
||||
{
|
||||
XO_SCOPE_DISABLED(lscope);
|
||||
|
||||
/* note: using inclusive upper index bounds here;
|
||||
* variying from idiomatic c++ style for symmetry
|
||||
*/
|
||||
|
||||
int32_t min_bucket_ix = lo2signed(this->lo_v_.size() - 1);
|
||||
int32_t max_bucket_ix = hi2signed(this->hi_v_.size() - 1);
|
||||
|
||||
int32_t lo_ix = this->signed_bucket_index(lo).first;
|
||||
int32_t hi_ix = this->signed_bucket_index(hi).first;
|
||||
|
||||
/* start_ix: lowest explicit bucket associating with [lo, hi) */
|
||||
int32_t start_ix = std::max(min_bucket_ix, lo_ix + 1);
|
||||
/* end_ix: highest explicit bucket associating with [lo, hi) */
|
||||
int32_t end_ix = std::min(max_bucket_ix, hi_ix);
|
||||
|
||||
if (lscope.enabled()) {
|
||||
lscope.log(xtag("min_bucket_ix", min_bucket_ix),
|
||||
xtag("max_bucket_ix", max_bucket_ix),
|
||||
xtag("lo_ix", lo_ix),
|
||||
xtag("hi_ix", hi_ix),
|
||||
xtag("start_ix", start_ix),
|
||||
xtag("end_ix", end_ix));
|
||||
}
|
||||
|
||||
/* for endpoints: avoid evaluating fn() outside [lo, hi] */
|
||||
if (min_bucket_ix <= lo_ix) {
|
||||
double lo_k = fn(lo);
|
||||
|
||||
if (lscope.enabled())
|
||||
lscope.log("A", xtag("lo_ix", lo_ix), xtag("lo_k", lo_k));
|
||||
|
||||
this->scale_bucket_by_signed_index(lo_ix, lo_k);
|
||||
}
|
||||
|
||||
for(int32_t ix = start_ix; ix <= end_ix; ++ix) {
|
||||
double k = fn(this->bucket_mid(ix));
|
||||
|
||||
if (lscope.enabled())
|
||||
lscope.log("B", xtag("ix", ix), xtag("k", k));
|
||||
|
||||
this->scale_bucket_by_signed_index(ix, k);
|
||||
}
|
||||
|
||||
/* for endpoints: avoid evaluating fn() outside [lo, hi] */
|
||||
if (hi_ix <= max_bucket_ix) {
|
||||
double hi_k = fn(hi);
|
||||
|
||||
if (lscope.enabled())
|
||||
lscope.log("C", xtag("hi_ix", hi_ix), xtag("hi_k", hi_k));
|
||||
|
||||
this->scale_bucket_by_signed_index(hi_ix, hi_k);
|
||||
}
|
||||
} /*scale_interval*/
|
||||
|
||||
template<typename Function>
|
||||
void scale_all(Function && fn)
|
||||
{
|
||||
this->scale_interval(this->lo(), this->hi(), fn);
|
||||
} /*scale_all*/
|
||||
|
||||
/* convenience: scale by scaled normal cdf.
|
||||
*
|
||||
* support use case where:
|
||||
* 1. explicit dist represents distribution of asset value;
|
||||
* 2. assume one party A to trade/order has noisy signal,
|
||||
* with normally-distributed error around (unknown) true value s,
|
||||
* with variance o^2;
|
||||
* 3. observe a trade/order at some price p,
|
||||
* giving us one-sided information about A's knowledge in two scenarios:
|
||||
* i. A trades
|
||||
* ii. A does not trade
|
||||
*
|
||||
* We will be applying Bayes' rule to update prior.
|
||||
* If buy(p) = {A buys},
|
||||
* N(x) is cumulative normal distribution:
|
||||
* N(x) -> 0 as x -> -oo;
|
||||
* N(x) -> 1 as x -> +oo;
|
||||
*
|
||||
* -1/2 2
|
||||
* (d/dx)N(x) = (2.pi) . exp(-x /2)
|
||||
* then:
|
||||
* P{s=s'}
|
||||
* P{s=s'|buy(p)} = ---------.P{buy(p)|s=s'}
|
||||
* P{buy(p)}
|
||||
*
|
||||
* P{s=s'}
|
||||
* = ---------.N[(1/o)(s'-p)],
|
||||
* P{buy(p)}
|
||||
*
|
||||
* In this scenario, P{s=s'} is our prior, represented by distribution *this.
|
||||
* The unconditional probability P{buy(p)} is not a function of s', so:
|
||||
*
|
||||
* /
|
||||
* |
|
||||
* P{buy(p)} = | P{s=s'} . N[(1/o)(s'-p)] . ds'
|
||||
* |
|
||||
* /
|
||||
*
|
||||
* in other words can scale explicit prior *this by N[(1/o)(s'-p)] then renormalize
|
||||
* so total probability weight is 1.
|
||||
*
|
||||
* Conversely, if event is {A sells}, similar argument leads to scaling
|
||||
* by N[(1/o)(p-s')]
|
||||
*
|
||||
* sign. scale by N[(1/sigma).sign.(x-mean)]
|
||||
*/
|
||||
void scale_by_normal_cdf(int sign, Domain const & mean, Domain const & sigma) {
|
||||
auto fn([sign, mean, sigma](Domain const & s) {
|
||||
return Normal::cdf_impl(sign * (s - mean) / sigma);
|
||||
});
|
||||
|
||||
this->scale_all(fn);
|
||||
} /*scale_by_normal_cdf*/
|
||||
|
||||
// ----- inherited from Distribution<Domain> -----
|
||||
|
||||
/* note: marked const; actually "logically const" */
|
||||
virtual double cdf(Domain const & x) const override {
|
||||
/* .cdf() is slow here, because partial sums aren't stored
|
||||
* --> have to sum over O(n) buckets
|
||||
*/
|
||||
{
|
||||
ExplicitDist<Domain> * self = const_cast<ExplicitDist<Domain> *>(this);
|
||||
|
||||
self->check_renormalize();
|
||||
}
|
||||
|
||||
std::pair<ProbabilityBucket const *, double> v = this->lookup_bucket(x);
|
||||
|
||||
ProbabilityBucket const * b = v.first;
|
||||
double fraction_wt = v.second;
|
||||
|
||||
if (b) {
|
||||
/* for bucket that x belongs to, treat as uniformly distributed
|
||||
* fraction_wt = 1.0 -> 100% of b -> b.cdf()
|
||||
* fraction_wt = 0.0 -> 0% of b -> b.cdf() - b.weight()
|
||||
*/
|
||||
return b->cdf() + (fraction_wt - 1.0) * b->weight();
|
||||
} else {
|
||||
if (x >= this->ref_value_) {
|
||||
/* x falls above farthest .hi_v[] */
|
||||
return 1.0;
|
||||
} else {
|
||||
/* x falls below farthest .lo_v[] */
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
} /*cdf*/
|
||||
|
||||
/* O(n). restores .cdf_valid_flag */
|
||||
void renormalize() {
|
||||
/* [1] compute sum of probability weights
|
||||
* [2] rescale all buckets so sum is 1.
|
||||
* [3] restore .cdf_valid_flag
|
||||
*/
|
||||
|
||||
/* [1] */
|
||||
double lo_sum_prob = 0.0;
|
||||
for (ProbabilityBucket & b : this->lo_v_) {
|
||||
assert(lo_sum_prob >= 0.0);
|
||||
lo_sum_prob += b.weight();
|
||||
}
|
||||
|
||||
double hi_sum_prob = 0.0;
|
||||
for (ProbabilityBucket & b : this->hi_v_) {
|
||||
assert(hi_sum_prob >= 0.0);
|
||||
hi_sum_prob += b.weight();
|
||||
}
|
||||
|
||||
assert(lo_sum_prob + hi_sum_prob > 0.0);
|
||||
|
||||
/* [2] */
|
||||
double renorm_factor = 1.0 / (lo_sum_prob + hi_sum_prob);
|
||||
|
||||
{
|
||||
double lo_tail = lo_sum_prob;
|
||||
|
||||
/* iterate over buckets < .ref_value, in /descending/ domain order */
|
||||
for (ProbabilityBucket & b : this->lo_v_) {
|
||||
b.scale_weight(renorm_factor);
|
||||
b.assign_cdf(lo_tail);
|
||||
|
||||
/* reduce tail after assign cdf to b, since
|
||||
* lo_tail includes b's probability weight
|
||||
*/
|
||||
lo_tail -= b.weight();
|
||||
|
||||
/* numerical roundoff can cause this */
|
||||
if (lo_tail < 0.0)
|
||||
lo_tail = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
double cum_prob = lo_sum_prob;
|
||||
|
||||
/* iterate over buckets >= .refvalue, in /ascending/ domain order */
|
||||
for (ProbabilityBucket & b : this->hi_v_) {
|
||||
b.scale_weight(renorm_factor);
|
||||
|
||||
/* increase cum_prob before assign cdf to b,
|
||||
* since b.cdf should include b's probability weight
|
||||
*/
|
||||
cum_prob += b.weight();
|
||||
|
||||
/* numerical roundoff can cause this */
|
||||
if (cum_prob >= 1.0)
|
||||
cum_prob = 1.0;
|
||||
|
||||
b.assign_cdf(cum_prob);
|
||||
}
|
||||
}
|
||||
|
||||
/* [3] */
|
||||
this->cdf_valid_flag_ = true;
|
||||
} /*renormalize*/
|
||||
|
||||
void check_renormalize() {
|
||||
if (!this->cdf_valid_flag_) {
|
||||
this->renormalize();
|
||||
}
|
||||
} /*check_renormalize*/
|
||||
|
||||
void display(std::ostream & os) const {
|
||||
os << "<ExplicitDist";
|
||||
os << xtag("cdf_valid_flag", this->cdf_valid_flag_);
|
||||
os << xtag("bucket_dx", this->bucket_dx_);
|
||||
os << xtag("ref_value", this->ref_value_);
|
||||
os << xtag("lz", this->lo_v_.size());
|
||||
os << xtag("hz", this->hi_v_.size());
|
||||
os << xtag("lo_v", this->lo_v_);
|
||||
os << xtag("hi_v", this->hi_v_);
|
||||
os << ">";
|
||||
} /*display*/
|
||||
|
||||
std::string display_string() const { return xo::tostr(*this); }
|
||||
|
||||
private:
|
||||
ExplicitDist(Domain bucket_dx, Domain ref_value)
|
||||
: cdf_valid_flag_{true},
|
||||
bucket_dx_{bucket_dx},
|
||||
ref_value_{ref_value},
|
||||
lo_v_{},
|
||||
hi_v_{1}
|
||||
{
|
||||
assert(bucket_dx_ > 0.0);
|
||||
|
||||
/* must have at least one bucket, since need total probability weight = 1 */
|
||||
this->hi_v_[0].assign_weight(1.0);
|
||||
this->hi_v_[0].assign_cdf(1.0);
|
||||
}
|
||||
|
||||
ExplicitDist(size_t n, Domain bucket_dx, Domain ref_value)
|
||||
: cdf_valid_flag_{true},
|
||||
bucket_dx_{bucket_dx},
|
||||
ref_value_{ref_value},
|
||||
lo_v_{},
|
||||
hi_v_{n}
|
||||
{
|
||||
assert(bucket_dx > 0.0);
|
||||
/* must have at least one bucket, since need total probability weight = 1 */
|
||||
assert(n > 0);
|
||||
|
||||
double w = 1.0 / n;
|
||||
|
||||
for(size_t i = 0; i<n; ++i) {
|
||||
this->hi_v_[i].assign_weight(w);
|
||||
this->hi_v_[i].assign_cdf((i+1)*w);
|
||||
}
|
||||
} /*ctor*/
|
||||
|
||||
private:
|
||||
/* with lz=.lo_v.size(), hz=hi_v.size():
|
||||
*
|
||||
* .ref_value - .bucket_dx * lz
|
||||
* | .ref_value
|
||||
* | | .hi_v[0] .hi_v[hz-1]
|
||||
* v v v v
|
||||
* |
|
||||
* +--+--+ +--+--|--+--+ +--+
|
||||
* | | | ... | | | | | ... | |
|
||||
* +--+--+ +--+--|--+--+ +--+
|
||||
* |
|
||||
* ^ ^ ^
|
||||
* .lo_v[lz-1] .lo_v[0] .ref_value + .bucket_dx * hz
|
||||
*
|
||||
*/
|
||||
|
||||
/* .cdf_valid_flag:
|
||||
* -> false whenever .scale_bucket() runs.
|
||||
* -> true whenever .renormalize() runs.
|
||||
*
|
||||
* if false, then *this is 'not a probability distribution':
|
||||
* Sum b[i].weight != 1.0 (summing over probability buckets .lo_v[], .hi_v[])
|
||||
* i
|
||||
*/
|
||||
bool cdf_valid_flag_ = true;
|
||||
|
||||
/* width of each bucket */
|
||||
Domain bucket_dx_;
|
||||
|
||||
/* natural value here would be 0.
|
||||
* use .pos_v[] for values >= .ref_value
|
||||
* use .neg_v[] for values < .ref_value
|
||||
*/
|
||||
Domain ref_value_;
|
||||
|
||||
/* buckets for domain values < .ref_value
|
||||
*
|
||||
* with dx = .bucket_dx
|
||||
* .lo_v[i] represents weights in range
|
||||
* [.ref_value - (i+1) * dx, .ref_value - i * dx)
|
||||
*/
|
||||
WeightVector lo_v_;
|
||||
|
||||
/* buckets for domain values > .ref_value
|
||||
*
|
||||
* with dx = .bucket_dx
|
||||
* .hi_v[i] represents weights in range
|
||||
* [.ref_value + i * dx, .ref_value + (i+1) * dx))
|
||||
*/
|
||||
WeightVector hi_v_;
|
||||
|
||||
}; /*ExplicitDist*/
|
||||
|
||||
template<typename Domain>
|
||||
inline std::ostream &
|
||||
operator<<(std::ostream & os, ExplicitDist<Domain> const & x) {
|
||||
x.display(os);
|
||||
return os;
|
||||
} /*operator<<*/
|
||||
} /*namespace distribution*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end ExplicitDist.hpp */
|
||||
94
xo-distribution/include/xo/distribution/Exponential.hpp
Normal file
94
xo-distribution/include/xo/distribution/Exponential.hpp
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
/* @file Exponential.hpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "xo/distribution/Distribution.hpp"
|
||||
#include <limits>
|
||||
#include <cmath>
|
||||
|
||||
namespace xo {
|
||||
namespace distribution {
|
||||
/* Exponential probability distribution */
|
||||
class Exponential : public Distribution<double> {
|
||||
public:
|
||||
explicit Exponential(double lm) : lambda_(lm) {}
|
||||
|
||||
/* exponential probability density:
|
||||
*
|
||||
* -lm.x
|
||||
* p(x) = { lm . e , x > 0
|
||||
* { 0 , x <= 0
|
||||
*
|
||||
*/
|
||||
static double density_impl(double lambda, double x) {
|
||||
if(x <= 0.0)
|
||||
return 0.0;
|
||||
|
||||
return lambda * ::exp(-lambda * x);
|
||||
} /*density_impl*/
|
||||
|
||||
/* exponential distribution:
|
||||
*
|
||||
* -lm.x
|
||||
* F(x) = 1 - e , x > 0
|
||||
* F(x) = 0 , x <= 0
|
||||
*/
|
||||
static double distr_impl(double lambda, double x) {
|
||||
if(x <= 0.0)
|
||||
return 0.0;
|
||||
|
||||
return 1.0 - ::exp(-lambda * x);
|
||||
} /*distr_impl*/
|
||||
|
||||
/* compute x: F(x)=y, where F(x)
|
||||
* is the cumulative exponential probability distributio.
|
||||
*
|
||||
* -lm.x
|
||||
* F(x) = 1 - e , x>0
|
||||
*
|
||||
* -lm.x
|
||||
* e = 1 - F(x)
|
||||
*
|
||||
* -lm.x = ln(1 - F(x))
|
||||
*
|
||||
* x = -ln(1 - F(x)) / lm, 0 < F(x) < 1
|
||||
*/
|
||||
static double distr_inverse_impl(double lambda, double Fx) {
|
||||
if(Fx < 0.0)
|
||||
return -1.0 * std::numeric_limits<double>::infinity();
|
||||
if(Fx >= 1.0)
|
||||
return +1.0 * std::numeric_limits<double>::infinity();
|
||||
|
||||
return (-1.0 / lambda) * ::log(1.0 - Fx);
|
||||
} /*distr_inverse_impl*/
|
||||
|
||||
double lambda() const { return lambda_; }
|
||||
|
||||
double density(double x) const {
|
||||
return density_impl(this->lambda_, x);
|
||||
} /*density*/
|
||||
|
||||
double distribution(double x) const {
|
||||
return distr_impl(this->lambda_, x);
|
||||
} /*distribution*/
|
||||
|
||||
double distribution_inverse(double y) const {
|
||||
return distr_inverse_impl(this->lambda_, y);
|
||||
} /*distribution_inverse*/
|
||||
|
||||
// ----- inherited from Distribution<double> -----
|
||||
|
||||
virtual double cdf(double const & x) const override {
|
||||
return distribution(x);
|
||||
} /*cdf*/
|
||||
|
||||
private:
|
||||
/* intensity parameter.
|
||||
* require: lambda > 0
|
||||
*/
|
||||
double lambda_ = 1.0;
|
||||
}; /*Exponential*/
|
||||
} /*namespace distribution*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end Exponential.cpp */
|
||||
257
xo-distribution/include/xo/distribution/KolmogorovSmirnov.hpp
Normal file
257
xo-distribution/include/xo/distribution/KolmogorovSmirnov.hpp
Normal file
|
|
@ -0,0 +1,257 @@
|
|||
/* @file KolmogorovSmirnov.hpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Distribution.hpp"
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
|
||||
namespace xo {
|
||||
/* Kolmogorov-Smirnov probability distribution */
|
||||
namespace distribution {
|
||||
|
||||
class KolmogorovSmirnov : public Distribution<double> {
|
||||
public:
|
||||
KolmogorovSmirnov() = default;
|
||||
|
||||
/* kolmogorov-smirnov (KS) cumulative distribution
|
||||
*
|
||||
* cdf is defined by the series:
|
||||
*
|
||||
* +oo / j 2 2 \
|
||||
* P1(x) = 1 + 2 Sum | (-1) .exp(-2.j .x ) |
|
||||
* j=1 \ /
|
||||
*
|
||||
* this converges rapidly for x > 1. expanding the first few terms:
|
||||
*
|
||||
* / 2 2 2 2 2 2 2 \
|
||||
* P1(x) ~= 1 + 2.| -exp(-2.x ) + exp(-2.2 .x ) - exp(-2.3 .x ) + exp(-2.4 .x ) |
|
||||
* \ /
|
||||
*
|
||||
* / 2 2 4 2 9 2 16 \
|
||||
* = 1 + 2.| -exp(-2.x ) + exp(-2.x ) - exp(2.x ) + exp(-2.x ) |
|
||||
* \ /
|
||||
*
|
||||
* / 4 9 16 \
|
||||
* = 1 + 2.| T(x) + T(x) + T(x) + T(x) |
|
||||
* \ /
|
||||
*
|
||||
* with T(x) = .term_aux(1, x)
|
||||
*
|
||||
* with x=1:
|
||||
* term1_aux(1, 1): -0.135335
|
||||
* term1_aux(2, 1): 3.35463e-4
|
||||
* term1_aux(3, 1): -1.52300e-8
|
||||
* term1_aux(4, 1): 1.26642e-14
|
||||
* term1_aux(5, 1): -1.92875e-22
|
||||
*
|
||||
* with x=1.18:
|
||||
* term1_aux(1, 1): -0.0617414
|
||||
* term1_aux(2, 1): 1.45314e-5
|
||||
* term1_aux(3, 1): -1.30374e-11
|
||||
* term1_aux(4, 1): 4.45890e-20
|
||||
* term1_aux(5, 1): -5.83124e-31
|
||||
*
|
||||
* ----------------------------------------------------------------
|
||||
*
|
||||
* There's an alternative series for the KS distribution,
|
||||
* that converges rapidly for small x (x < ~1.18)
|
||||
*
|
||||
* / 2 2 \
|
||||
* sqrt(2.pi) +oo / | (2j - 1) .pi | \
|
||||
* P2(x) = ---------- . Sum | exp | - ------------ | |
|
||||
* x j=1 \ | 2 | /
|
||||
* \ 8.x /
|
||||
*
|
||||
* / 2 \
|
||||
* | pi |
|
||||
* witu U(x) = exp | - ---- |
|
||||
* | 2 |
|
||||
* \ 8.x /
|
||||
* we have
|
||||
* / 2 \
|
||||
* sqrt(2.pi) +oo | (2j - 1) |
|
||||
* P2(x) = ---------- . Sum | U(x) |
|
||||
* x j=1 | |
|
||||
* \ /
|
||||
*
|
||||
* / \
|
||||
* sqrt(2.pi) | 9 25 49 |
|
||||
* = ---------- . | U(x) + U(x) + U(x) + U(x) + .. |
|
||||
* x | |
|
||||
* \ /
|
||||
*
|
||||
* with x=1.0:
|
||||
* term2_aux(1, 1): 0.291213
|
||||
* term2_aux(2, 1): 1.50625e-5
|
||||
* term2_aux(3, 1): 4.02964e-14
|
||||
* term2_aux(4, 1): 5.57599e-27
|
||||
*
|
||||
* with x=1.18:
|
||||
* term2_aux(1, 1.18): 0.412292
|
||||
* term2_aux(2, 1.18): 3.44223e-4
|
||||
* term2_aux(3, 1.18): 2.39945e-10
|
||||
* term2_aux(4, 1.18): 1.39652e-19
|
||||
*/
|
||||
static double term1_aux(uint32_t j, double x) {
|
||||
double f = ::exp(-2.0 * x * x);
|
||||
/* sgn:
|
||||
* +1 for j in {2, 4, 6, ..}
|
||||
* -1 for j in {1, 3, 5, ..}
|
||||
*/
|
||||
int32_t sgn = (((j & 0x1) == 0) ? +1 : -1);
|
||||
|
||||
if(j == 1) {
|
||||
return sgn * f;
|
||||
} else {
|
||||
return sgn * ::pow(f, j*j);
|
||||
}
|
||||
} /*term1_aux*/
|
||||
|
||||
/* implements P1(x) above; truncating at 4 terms.
|
||||
* use .distr1() for x > ~1.18
|
||||
*/
|
||||
static double distr1_impl(double x) {
|
||||
double f = term1_aux(1.0, x);
|
||||
double f2 = f*f; /*f^2*/
|
||||
double f4 = f2*f2; /*f^4*/
|
||||
double f8 = f4*f4; /*f^8*/
|
||||
double f9 = f8*f; /*f^9*/
|
||||
double f16 = f8*f8; /*f^16*/
|
||||
|
||||
double r = ((f16 + f9) + f4) + f;
|
||||
|
||||
return 1.0 + 2.0*r;
|
||||
} /*distr1_impl*/
|
||||
|
||||
/* pi: 3.141592... */
|
||||
static constexpr double c_pi = M_PI;
|
||||
/* pi^2 / 8 */
|
||||
static constexpr double c_pi2_8 = 0.125 * c_pi * c_pi;
|
||||
|
||||
/* computes
|
||||
*
|
||||
* (2j-1)^2
|
||||
* U(x)
|
||||
*
|
||||
* require:
|
||||
* - j >= 1
|
||||
*/
|
||||
static double term2_aux(uint32_t j, double x) {
|
||||
double u = ::exp(-c_pi2_8 / (x * x));
|
||||
|
||||
if(j == 1) {
|
||||
return u;
|
||||
} else {
|
||||
uint32_t j2m1 = 2*j - 1;
|
||||
|
||||
return ::pow(u, j2m1 * j2m1);
|
||||
}
|
||||
} /*term2_aux*/
|
||||
|
||||
/* implements P2(x) above; truncating at 4 terms */
|
||||
static double distr2_impl(double x) {
|
||||
static double c_sqrt_2pi
|
||||
= ::sqrt(2.0 * c_pi);
|
||||
|
||||
double scale = c_sqrt_2pi / x;
|
||||
|
||||
double u = term2_aux(1, x);
|
||||
double u2 = u*u; /*u^2*/
|
||||
double u4 = u2*u2; /*u^4*/
|
||||
double u8 = u4*u4; /*u^8*/
|
||||
double u16 = u8*u8; /*u^16*/
|
||||
double u32 = u16*u16; /*u^32*/
|
||||
|
||||
double u9 = u8*u; /*u^9*/
|
||||
double u25 = u16*u8*u; /*u^25*/
|
||||
double u49 = u32*u16*u; /*u^49*/
|
||||
|
||||
double r = ((u49 + u25) + u9) + u;
|
||||
|
||||
return scale * r;
|
||||
|
||||
} /*distr2_impl*/
|
||||
|
||||
static double distr_impl(double x) {
|
||||
using xo::tostr;
|
||||
|
||||
constexpr char const * c_self = "KolmogorovSmirnov::distr_impl";
|
||||
|
||||
if(x < 0.0)
|
||||
throw std::runtime_error(tostr(c_self, "KS(x) cdf defined for x>=0"));
|
||||
|
||||
if(x == 0.0)
|
||||
return 0;
|
||||
|
||||
if(x < 1.18) {
|
||||
/* P2(x) converges fastest */
|
||||
return distr2_impl(x);
|
||||
} else {
|
||||
/* P1(x) converges fastest */
|
||||
return distr1_impl(x);
|
||||
}
|
||||
} /*distr_impl*/
|
||||
|
||||
/* p-value for significance of a particular value of the KS-statistic
|
||||
* obtain this statistic for a sample distribution using
|
||||
* Empirical.ks_stat_1sided(dexp)
|
||||
* for some target distribution dexp
|
||||
*
|
||||
* ne.
|
||||
* for 1-sided test: #of points in sample, i.e. Empirical.n_sample()
|
||||
* for 2-sided test: (n1 . n2) / (n1 + n2)
|
||||
* D.
|
||||
* max difference between observed and expected cumulative distributions.
|
||||
* see Empirical.ks_stat_1sided()
|
||||
*/
|
||||
static double ks_pvalue(uint32_t ne, double D)
|
||||
{
|
||||
using xo::scope;
|
||||
using xo::xtag;
|
||||
|
||||
constexpr bool logging_enabled_flag = false;
|
||||
|
||||
scope log(XO_DEBUG(logging_enabled_flag));
|
||||
|
||||
double ne_sqrt = ::sqrt(ne);
|
||||
|
||||
/* argument to KS-distribution.
|
||||
* x in [0, +oo)
|
||||
*
|
||||
* A large value for x -> small value for 1 - KSdist(x),
|
||||
* i.e. represents probability that a sample of size ne with a
|
||||
* KS-statistic of magnitude D or larger, could be drawn from
|
||||
* distribution dexp.
|
||||
*/
|
||||
double x = ne_sqrt + 0.12 + (0.11 / ne_sqrt);
|
||||
|
||||
double xD = x * D;
|
||||
|
||||
double pvalue = 1.0 - distr_impl(xD);
|
||||
|
||||
log && log(xtag("ne", ne),
|
||||
xtag("D", D),
|
||||
xtag("ne_sqrt", ne_sqrt),
|
||||
xtag("x", x),
|
||||
xtag("xD", xD),
|
||||
xtag("pvalue", pvalue));
|
||||
|
||||
return pvalue;
|
||||
} /*ks_pvalue*/
|
||||
|
||||
/* cumulative distribution function */
|
||||
double distribution(double x) const {
|
||||
return distr_impl(x);
|
||||
} /*distribution*/
|
||||
|
||||
// ----- inherited from Distribution<double> -----
|
||||
virtual double cdf(double const & x) const {
|
||||
return this->distribution(x);
|
||||
} /*cdf*/
|
||||
}; /*KolmogorovSmirnov*/
|
||||
} /*namespace distribution*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end KolmogorovSmirnov.hpp */
|
||||
54
xo-distribution/include/xo/distribution/Normal.hpp
Normal file
54
xo-distribution/include/xo/distribution/Normal.hpp
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
/* @file Normal.hpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Distribution.hpp"
|
||||
#include <cmath>
|
||||
|
||||
namespace xo {
|
||||
namespace distribution {
|
||||
/* the guassian distribution, with mean 0 and variance 1
|
||||
*/
|
||||
class Normal : public Distribution<double> {
|
||||
public:
|
||||
Normal() = default;
|
||||
|
||||
/* N(0,1): mean 0, sdev 1 */
|
||||
static rp<Normal> unit() { return new Normal(); }
|
||||
|
||||
/* normal probability density:
|
||||
*
|
||||
* x^2
|
||||
* -(1/2) 1/2
|
||||
* p(x) = e / (2.pi)
|
||||
*/
|
||||
static double density(double x) {
|
||||
static double c_sqrt_2pi = ::sqrt(2 * M_PI);
|
||||
|
||||
return ::exp(-0.5 * x * x) / c_sqrt_2pi;
|
||||
} /*density*/
|
||||
|
||||
/* cumulative distribution function for N(0,1):
|
||||
*
|
||||
* / x
|
||||
* |
|
||||
* | p(x).dx
|
||||
* |
|
||||
* / -oo
|
||||
*
|
||||
* where p(x) is the normal density function p(x) = e^[-x^2/2]
|
||||
*/
|
||||
static double cdf_impl(double x) {
|
||||
return 0.5 * std::erfc(-M_SQRT1_2 * x);
|
||||
} /*cdf_impl*/
|
||||
|
||||
// ----- inherited from Distribution<double> -----
|
||||
|
||||
virtual double cdf(double const & x) const override {
|
||||
return cdf_impl(x);
|
||||
} /*cdf*/
|
||||
}; /*Normal*/
|
||||
} /*namespace distribution*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end Normal.hpp */
|
||||
226
xo-distribution/include/xo/distribution/StdEmpirical.hpp
Normal file
226
xo-distribution/include/xo/distribution/StdEmpirical.hpp
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
/* @file StdEmpirical.hpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Empirical.hpp"
|
||||
#include "xo/ordinaltree/RedBlackTree.hpp"
|
||||
#include "xo/indentlog/scope.hpp"
|
||||
#include <map>
|
||||
#include <cstdint>
|
||||
|
||||
namespace xo {
|
||||
namespace distribution {
|
||||
/* an empirical distribution over a given domain
|
||||
* (e.g. double as proxy for IR),
|
||||
* obtained by sorting equally-weighted samples
|
||||
*/
|
||||
template<typename Domain>
|
||||
class StdEmpirical : public Distribution<Domain> {
|
||||
public:
|
||||
using SampleMap = xo::tree::RedBlackTree<Domain,
|
||||
Counter,
|
||||
xo::tree::SumReduce<uint32_t>>;
|
||||
using const_iterator = typename SampleMap::const_iterator;
|
||||
|
||||
public:
|
||||
StdEmpirical() = default;
|
||||
|
||||
uint32_t n_sample() const { return n_sample_; }
|
||||
const_iterator begin() const { return sample_map_.begin(); }
|
||||
const_iterator end() const { return sample_map_.end(); }
|
||||
|
||||
/* compute kolmogorov-smirnov statistic with a non-sampled distribution.
|
||||
* if d2 is sampled, should use .ks_stat_2sided() instead
|
||||
*/
|
||||
std::pair<double, double> ks_stat_1sided(Distribution<Domain> const & d2) const {
|
||||
using xo::scope;
|
||||
using xo::xtag;
|
||||
|
||||
constexpr char const * c_self = "Empirical::ks_stat_1sided";
|
||||
constexpr bool c_logging_enabled = false;
|
||||
|
||||
scope lscope(c_self, c_logging_enabled);
|
||||
|
||||
double ks_stat = 0.0;
|
||||
|
||||
/* for i'th loop iteration below:
|
||||
* xj_sum = sum of all x[j] with j<=i
|
||||
*/
|
||||
uint32_t xj_sum = 0;
|
||||
|
||||
/* #of sample in this distribution, as double */
|
||||
double nr = 1.0 / this->n_sample();
|
||||
|
||||
/* loop over elements x[i] of this (sampled) distribution,
|
||||
* compare cdf(x[i]) with d2.cdf(x[i])
|
||||
*
|
||||
* KS stat is the maximum observed difference.
|
||||
*/
|
||||
for(auto const & point : this->sample_map_) {
|
||||
Domain const & xi = point.first;
|
||||
uint32_t xi_count = point.second;
|
||||
|
||||
xj_sum += xi_count;
|
||||
|
||||
/* p1 = xi_sum / n1, where n1 = .n_sample() */
|
||||
double p1 = xj_sum * nr;
|
||||
double p2 = d2.cdf(xi);
|
||||
|
||||
double dp = std::abs(p1 - p2);
|
||||
|
||||
if(c_logging_enabled)
|
||||
lscope.log(c_self,
|
||||
xtag("xi", xi),
|
||||
xtag("xi_count", xi_count),
|
||||
xtag("xj_sum", xj_sum),
|
||||
xtag("p1", p1),
|
||||
xtag("p2", p2),
|
||||
xtag("dp", dp));
|
||||
|
||||
ks_stat = std::max(ks_stat, dp);
|
||||
}
|
||||
|
||||
return std::pair<double, double>(this->n_sample(), ks_stat);
|
||||
} /*ks_stat_1sided*/
|
||||
|
||||
/* compute kolmogorov-smirnov statistic with a sampled distribution;
|
||||
* assess likelihood that both samples come from the same population.
|
||||
*/
|
||||
std::pair<double, double> ks_stat_2sided(StdEmpirical<Domain> const & d2) {
|
||||
/* loop once over both sample distributions;
|
||||
* algorithm is O(n1 + n2) for two empirical
|
||||
* distributions with n1,n2 points respectively
|
||||
*/
|
||||
|
||||
/* return value observed here */
|
||||
double ks_stat = 0.0;
|
||||
|
||||
auto ix1 = this->sample_map_.begin();
|
||||
auto end_ix1 = this->sample_map_.end();
|
||||
|
||||
auto ix2 = d2.sample_map_.begin();
|
||||
auto end_ix2 = this->sample_map_.end();
|
||||
|
||||
uint32_t xj1_sum = 0;
|
||||
uint32_t xj2_sum = 0;
|
||||
|
||||
uint32_t n1 = this->n_sample();
|
||||
uint32_t n2 = d2.n_sample();
|
||||
|
||||
double nr1 = 1.0 / n1;
|
||||
double nr2 = 1.0 / n2;
|
||||
|
||||
/* ^
|
||||
* 1| **.
|
||||
* | ...*..
|
||||
* | . *
|
||||
* | **********
|
||||
* | * .
|
||||
* | .........
|
||||
* | . *
|
||||
* | ********
|
||||
* | ..*.....
|
||||
* +----------------------->
|
||||
* ^ ^
|
||||
* ix1 ix2
|
||||
*/
|
||||
|
||||
while ((ix1 != end_ix1) || (ix2 != end_ix2)) {
|
||||
/* on each iteration, compare sample distributions
|
||||
* at smallest of (ix1->first, ix2->first)
|
||||
*/
|
||||
|
||||
bool advance_ix1_flag = false;
|
||||
bool advance_ix2_flag = false;
|
||||
|
||||
if (ix1 == end_ix1) {
|
||||
/* only ix2 dereferenceable */
|
||||
advance_ix2_flag = true;
|
||||
} else if (ix2 == end_ix2) {
|
||||
/* only ix1 dereferenceable */
|
||||
advance_ix1_flag = true;
|
||||
} else {
|
||||
/* ix1,ix2 both dereferenceable */
|
||||
|
||||
advance_ix1_flag = (ix1->first <= ix2->first);
|
||||
advance_ix2_flag = (ix2->first <= ix1->first);
|
||||
}
|
||||
|
||||
if (advance_ix1_flag) {
|
||||
xj1_sum += ix1->first;
|
||||
++ix1;
|
||||
}
|
||||
if (advance_ix2_flag) {
|
||||
xj2_sum += ix2->first;
|
||||
++ix2;
|
||||
}
|
||||
|
||||
double p1 = xj1_sum * nr1;
|
||||
double p2 = xj2_sum * nr2;
|
||||
|
||||
double dp = std::abs(p1 - p2);
|
||||
|
||||
ks_stat = std::max(ks_stat, dp);
|
||||
}
|
||||
|
||||
/* ne = effective #of points to use when comparing 2 sample dist/s */
|
||||
double ne = (n1 * n2) / static_cast<double>(n1 + n2);
|
||||
|
||||
return std::pair<double, double>(ne, ks_stat);
|
||||
|
||||
#ifdef OBSOLETE
|
||||
/* NOTE: this will cost O(n.log(n))
|
||||
* (for empirical distributions with n points)
|
||||
* if we loop over both sets of points in parallel,
|
||||
* can get O(n) solution
|
||||
*
|
||||
*/
|
||||
|
||||
/* loop over points in *this */
|
||||
double ks1 = this->ks_stat_1sided(d2);
|
||||
/* loop over points in d2 */
|
||||
double ks2 = d2.ks_stat_1sided(*this);
|
||||
|
||||
return std::max(ks1, ks2);
|
||||
#endif
|
||||
} /*ks_stat_2sided*/
|
||||
|
||||
// ----- inherited from Empirical<Domain> -----
|
||||
|
||||
/* introduce one new sample into this distribution */
|
||||
virtual void include_sample(Domain const & x) {
|
||||
++(this->n_sample_);
|
||||
|
||||
/* note: xo::tree::RedBlackTree doesn't provide the usual reference result
|
||||
* from operator[]; it needs to intervene after assignment to update
|
||||
* order statistics
|
||||
*/
|
||||
auto lhs = this->sample_map_[x];
|
||||
|
||||
lhs += 1;
|
||||
} /*include_sample*/
|
||||
|
||||
// ----- inherited from Distribution<Domain> -----
|
||||
|
||||
virtual double cdf(Domain const & x) const override {
|
||||
/* computes #of samples with values <= x */
|
||||
uint32_t nx = this->sample_map_.reduce_lub(x, true /*is_closed*/);
|
||||
size_t n = this->n_sample();
|
||||
|
||||
return static_cast<double>(nx) / n;
|
||||
} /*cdf*/
|
||||
|
||||
private:
|
||||
/* count #of calls to .include_sample() */
|
||||
CounterRep n_sample_ = 0;
|
||||
|
||||
/* .sample_map_[x] counts the #of times
|
||||
* .include_sample(x) has been called.
|
||||
*/
|
||||
SampleMap sample_map_;
|
||||
}; /*StdEmpirical*/
|
||||
|
||||
} /*namespace distribution*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end StdEmpirical.hpp */
|
||||
57
xo-distribution/include/xo/distribution/Uniform.hpp
Normal file
57
xo-distribution/include/xo/distribution/Uniform.hpp
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
/* @file Uniform.hpp */
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "Distribution.hpp"
|
||||
|
||||
namespace xo {
|
||||
namespace distribution {
|
||||
/* uniform distribution on an interval */
|
||||
class Uniform : public Distribution<double> {
|
||||
public:
|
||||
Uniform(double lo, double hi) : lo_(lo), hi_(hi) {}
|
||||
|
||||
static rp<Uniform> unit() { return new Uniform(0.0, 1.0); }
|
||||
|
||||
static double density_impl(double lo, double hi, double x) {
|
||||
if (x <= lo)
|
||||
return 0.0;
|
||||
if (x >= hi)
|
||||
return 0.0;
|
||||
|
||||
return 1.0 / (hi - lo);
|
||||
} /*density_impl*/
|
||||
|
||||
static double distr_impl(double lo, double hi, double x) {
|
||||
if (x <= lo)
|
||||
return 0.0;
|
||||
if (x >= hi)
|
||||
return 1.0;
|
||||
|
||||
return (x - lo) / (hi - lo);
|
||||
} /*distr_impl*/
|
||||
|
||||
double lo() const { return lo_; }
|
||||
double hi() const { return hi_; }
|
||||
|
||||
double density(double x) const {
|
||||
return density_impl(this->lo_, this->hi_, x);
|
||||
} /*density*/
|
||||
|
||||
double distribution(double x) const {
|
||||
return distr_impl(this->lo_, this->hi_, x);
|
||||
} /*distribution*/
|
||||
|
||||
// ----- inherited from Distribution<double> -----
|
||||
|
||||
virtual double cdf(double const & x) const override {
|
||||
return this->distribution(x);
|
||||
} /*cdf*/
|
||||
|
||||
private:
|
||||
/* Invariant: .lo < .hi */
|
||||
double lo_ = 0.0;
|
||||
double hi_ = 1.0;
|
||||
}; /*Uniform*/
|
||||
} /*namespace distribution*/
|
||||
} /*namespace xo*/
|
||||
5
xo-distribution/src/distribution/CMakeLists.txt
Normal file
5
xo-distribution/src/distribution/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
set(SELF_LIB xo_distribution)
|
||||
set(SELF_SRCS Normal.cpp)
|
||||
|
||||
xo_add_shared_library4(${SELF_LIB} ${PROJECT_NAME}Targets ${PROJECT_VERSION} 1 ${SELF_SRCS})
|
||||
xo_dependency(${SELF_LIB} refcnt)
|
||||
9
xo-distribution/src/distribution/Normal.cpp
Normal file
9
xo-distribution/src/distribution/Normal.cpp
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
/* @file Normal.cpp */
|
||||
|
||||
#include "Normal.hpp"
|
||||
|
||||
namespace xo {
|
||||
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end Normal.cpp */
|
||||
13
xo-distribution/utest/CMakeLists.txt
Normal file
13
xo-distribution/utest/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
# build unittest distribution/utest
|
||||
|
||||
set(SELF_EXE utest.distribution)
|
||||
set(SELF_SRCS
|
||||
distribution_utest_main.cpp
|
||||
Normal.test.cpp
|
||||
Uniform.test.cpp)
|
||||
|
||||
xo_add_utest_executable(${SELF_EXE} ${SELF_SRCS})
|
||||
xo_self_dependency(${SELF_EXE} xo_distribution)
|
||||
xo_external_target_dependency(${SELF_EXE} Catch2 Catch2::Catch2)
|
||||
|
||||
# end CMakeLists.txt
|
||||
24
xo-distribution/utest/Normal.test.cpp
Normal file
24
xo-distribution/utest/Normal.test.cpp
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
/* @file Normal.test.cpp */
|
||||
|
||||
#include "xo/distribution/Normal.hpp"
|
||||
#include <catch2/catch.hpp>
|
||||
|
||||
namespace xo {
|
||||
using xo::distribution::Normal;
|
||||
|
||||
namespace ut {
|
||||
TEST_CASE("normal", "[distribution]") {
|
||||
auto n01 = Normal::unit();
|
||||
|
||||
CHECK(n01->cdf(-3.0) == Approx(0.001349898).margin(1e-9));
|
||||
CHECK(n01->cdf(-2.0) == Approx(0.0227501319).margin(1e-9));
|
||||
CHECK(n01->cdf(-1.0) == Approx(0.1586552539).margin(1e-9));
|
||||
CHECK(n01->cdf(0.0) == 0.5);
|
||||
CHECK(n01->cdf(1.0) == 1.0 - n01->cdf(-1.0));
|
||||
CHECK(n01->cdf(2.0) == 1.0 - n01->cdf(-2.0));
|
||||
CHECK(n01->cdf(3.0) == 1.0 - n01->cdf(-3.0));
|
||||
} /*TEST_CASE(normal)*/
|
||||
} /*namespace ut*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end Normal.test.cpp */
|
||||
27
xo-distribution/utest/Uniform.test.cpp
Normal file
27
xo-distribution/utest/Uniform.test.cpp
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
/* @file Uniform.test.cpp */
|
||||
|
||||
#include "xo/distribution/Uniform.hpp"
|
||||
#include <catch2/catch.hpp>
|
||||
|
||||
namespace xo {
|
||||
using xo::distribution::Uniform;
|
||||
|
||||
namespace ut {
|
||||
TEST_CASE("uniform", "[distribution]") {
|
||||
auto u = Uniform::unit();
|
||||
|
||||
CHECK(u->cdf(-3.0) == 0.0);
|
||||
CHECK(u->cdf(-2.0) == 0.0);
|
||||
CHECK(u->cdf(-1.0) == 0.0);
|
||||
CHECK(u->cdf(0.0) == 0.0);
|
||||
CHECK(u->cdf(0.05) == 0.05);
|
||||
CHECK(u->cdf(0.5) == 0.5);
|
||||
CHECK(u->cdf(0.95) == 0.95);
|
||||
CHECK(u->cdf(1.0) == 1.0);
|
||||
CHECK(u->cdf(2.0) == 1.0);
|
||||
CHECK(u->cdf(3.0) == 1.0);
|
||||
} /*TEST_CASE(uniform)*/
|
||||
} /*namespace ut*/
|
||||
} /*namespace xo*/
|
||||
|
||||
/* end Uniform.test.cpp */
|
||||
6
xo-distribution/utest/distribution_utest_main.cpp
Normal file
6
xo-distribution/utest/distribution_utest_main.cpp
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
/* file distribution_utest_main.cpp */
|
||||
|
||||
#define CATCH_CONFIG_MAIN
|
||||
#include "catch2/catch.hpp"
|
||||
|
||||
/* end distribution_utest_main.cpp */
|
||||
Loading…
Add table
Add a link
Reference in a new issue