diff --git a/.travis.yml b/.travis.yml index 0329a8a53b..fb5d43fec3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,8 +7,6 @@ r_binary_packages: - devtools - testthat - roxygen2 -r_github_packages: - - jimhester/lintr matrix: include: @@ -64,7 +62,6 @@ matrix: - valgrind --leak-check=full --error-exitcode=1 ./build/grf exclude:[characterization] - name: "grf R package" - env: LINTR_COMMENT_BOT=false before_install: - cd r-package/grf script: diff --git a/DEVELOPING.md b/DEVELOPING.md index fd79e5ba61..2da76dbd77 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -16,8 +16,7 @@ The core forest implementation is written in C++, with an R interface powered by ## R package -To build the R package from source, cd into `r-package` and run `build_package.R`. Required development dependencies are listed there -(note: it is recommended to install the latest [lintr](https://github.com/jimhester/lintr) package with `devtools::install_github("jimhester/lintr")`). This mimics the tests run when submitting a pull request. +To build the R package from source, cd into `r-package` and run `build_package.R`. Required development dependencies are listed there. This mimics the tests run when submitting a pull request. An alternative development workflow is to use the accompanying grf.Rproj and build and test the package with RStudio's build menu, which can be convenient for quickly iterating C++/R code changes. diff --git a/core/src/commons/utility.cpp b/core/src/commons/utility.cpp index d9117d8d32..e4b4e06bef 100644 --- a/core/src/commons/utility.cpp +++ b/core/src/commons/utility.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include "utility.h" #include "SparseData.h" diff --git a/core/src/commons/utility.h b/core/src/commons/utility.h index 6ff539fcb7..390863a7e0 100644 --- a/core/src/commons/utility.h +++ b/core/src/commons/utility.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/core/src/forest/ForestTrainer.cpp b/core/src/forest/ForestTrainer.cpp index ac4b4430f5..86aa807c83 100644 --- a/core/src/forest/ForestTrainer.cpp +++ b/core/src/forest/ForestTrainer.cpp @@ -23,6 +23,7 @@ #include "commons/utility.h" #include "ForestTrainer.h" +#include "random/random.hpp" ForestTrainer::ForestTrainer(std::shared_ptr relabeling_strategy, std::shared_ptr splitting_rule_factory, @@ -87,7 +88,7 @@ std::vector> ForestTrainer::train_batch( size_t ci_group_size = options.get_ci_group_size(); std::mt19937_64 random_number_generator(options.get_random_seed() + start); - std::uniform_int_distribution udist; + nonstd::uniform_int_distribution udist; std::vector> trees; if (ci_group_size == 1) { diff --git a/core/src/sampling/RandomSampler.cpp b/core/src/sampling/RandomSampler.cpp index fb61dc2093..152a11b2cd 100644 --- a/core/src/sampling/RandomSampler.cpp +++ b/core/src/sampling/RandomSampler.cpp @@ -14,6 +14,7 @@ #include #include +#include #include "RandomSampler.h" @@ -52,7 +53,7 @@ void RandomSampler::subsample(const std::vector& samples, double sample_fraction, std::vector& subsamples) { std::vector shuffled_sample(samples); - std::shuffle(shuffled_sample.begin(), shuffled_sample.end(), random_number_generator); + nonstd::shuffle(shuffled_sample.begin(), shuffled_sample.end(), random_number_generator); uint subsample_size = (uint) std::ceil(samples.size() * sample_fraction); subsamples.resize(subsample_size); @@ -66,7 +67,7 @@ void RandomSampler::subsample(const std::vector& samples, std::vector& subsamples, std::vector& oob_samples) { std::vector 
shuffled_sample(samples); - std::shuffle(shuffled_sample.begin(), shuffled_sample.end(), random_number_generator); + nonstd::shuffle(shuffled_sample.begin(), shuffled_sample.end(), random_number_generator); size_t subsample_size = (size_t) std::ceil(samples.size() * sample_fraction); subsamples.resize(subsample_size); @@ -84,7 +85,7 @@ void RandomSampler::subsample_with_size(const std::vector& samples, size_t subsample_size, std::vector& subsamples) { std::vector shuffled_sample(samples); - std::shuffle(shuffled_sample.begin(), shuffled_sample.end(), random_number_generator); + nonstd::shuffle(shuffled_sample.begin(), shuffled_sample.end(), random_number_generator); subsamples.resize(subsample_size); std::copy(shuffled_sample.begin(), @@ -133,7 +134,7 @@ void RandomSampler::shuffle_and_split(std::vector& samples, // Fill with 0..n_all-1 and shuffle std::iota(samples.begin(), samples.end(), 0); - std::shuffle(samples.begin(), samples.end(), random_number_generator); + nonstd::shuffle(samples.begin(), samples.end(), random_number_generator); samples.resize(size); } @@ -159,7 +160,7 @@ void RandomSampler::draw_simple(std::vector& result, std::vector temp; temp.resize(max, false); - std::uniform_int_distribution unif_dist(0, max - 1 - skip.size()); + nonstd::uniform_int_distribution unif_dist(0, max - 1 - skip.size()); for (size_t i = 0; i < num_samples; ++i) { size_t draw; do { @@ -191,7 +192,7 @@ void RandomSampler::draw_fisher_yates(std::vector& result, // Draw without replacement using Fisher Yates algorithm for (size_t i = result.size() - 1; i > 0; --i) { - std::uniform_int_distribution distribution(0, i); + nonstd::uniform_int_distribution distribution(0, i); size_t j = distribution(random_number_generator); std::swap(result[i], result[j]); } @@ -210,7 +211,7 @@ void RandomSampler::draw_weighted(std::vector& result, std::vector temp; temp.resize(max + 1, false); - std::discrete_distribution<> weighted_dist(weights.begin(), weights.end()); + nonstd::discrete_distribution<> weighted_dist(weights.begin(), weights.end()); for (size_t i = 0; i < num_samples; ++i) { size_t draw; do { @@ -222,6 +223,6 @@ void RandomSampler::draw_weighted(std::vector& result, } size_t RandomSampler::sample_poisson(size_t mean) { - std::poisson_distribution distribution(mean); + nonstd::poisson_distribution distribution(mean); return distribution(random_number_generator); } diff --git a/core/src/sampling/RandomSampler.h b/core/src/sampling/RandomSampler.h index 8a492ed8b8..cf1736defd 100644 --- a/core/src/sampling/RandomSampler.h +++ b/core/src/sampling/RandomSampler.h @@ -22,6 +22,8 @@ #include "commons/globals.h" #include "commons/utility.h" #include "SamplingOptions.h" +#include "random/random.hpp" +#include "random/algorithm.hpp" #include #include diff --git a/core/src/tree/Tree.h b/core/src/tree/Tree.h index 2de4f58188..ea1a646864 100644 --- a/core/src/tree/Tree.h +++ b/core/src/tree/Tree.h @@ -19,7 +19,6 @@ #define GRF_TREE_H_ #include -#include #include #include "commons/globals.h" diff --git a/core/third_party/random/README.md b/core/third_party/random/README.md new file mode 100644 index 0000000000..65303352e0 --- /dev/null +++ b/core/third_party/random/README.md @@ -0,0 +1,73 @@ +# third_party/random + +## What are these files? + +They are copies of `random` and `algorithm` headers from the [llvm](https://github.com/llvm-mirror/libcxx/blob/master/include/) standard library. 
+
+
+## Motivation
+
+Users reported that random number generation was not stable across machines: the same seed could produce different results on different platforms. See issue [#379](https://github.com/grf-labs/grf/issues/379).
+
+As [pointed out](https://github.com/grf-labs/grf/issues/379#issuecomment-480641123) by @jtibshirani:
+
+> the mersenne twister has the same implementation across platforms, the other random methods may differ from compiler to compiler
+
+
+In PR [#469](https://github.com/grf-labs/grf/pull/469), @halflearned included this reduced copy of the relevant headers, ensuring random number generation is done in a consistent way across compilers.
+
+
+## How to reproduce
+
+#### File `random.hpp`
+
+Extract only the relevant classes and functions from the llvm `random` header:
+
++ `__independent_bits_engine`
++ `__clz`
++ `__log2`
++ `__log2_imp`
++ `__generate_canonical`
++ `uniform_int_distribution`
++ `poisson_distribution`
++ `uniform_real_distribution`
++ `normal_distribution`
++ `exponential_distribution`
++ `discrete_distribution`
+
+From each class, remove all methods associated with `operator<<`.
+
+Find and remove the following `_LIBCPP` macros:
++ `_LIBCPP_BEGIN_NAMESPACE_STD`
++ `_LIBCPP_CONSTEXPR`
++ `_LIBCPP_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK`
++ `_LIBCPP_END_NAMESPACE_STD`
++ `_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER`
++ `_LIBCPP_INLINE_VISIBILITY`
++ `_LIBCPP_MSVCRT`
++ `_LIBCPP_POP_MACROS`
++ `_LIBCPP_PUSH_MACROS`
++ `_LIBCPP_RANDOM`
++ `_LIBCPP_TEMPLATE_VIS`
++ `_LIBCPP_TYPE_VIS`
++ `_LIBCPP_USING_DEV_RANDOM`
+
+Find and replace the prefix:
++ `_VSTD::` -> `std::`
+
+Add `namespace nonstd`.
+
+
+#### File `algorithm.hpp`
+
+Include the modified `random.hpp`.
+
+Extract the relevant function:
+
++ `shuffle`
+
+Replace the prefix:
+
++ `std::uniform_int_distribution` -> `nonstd::uniform_int_distribution`
+
+Add `namespace nonstd`.
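+
+## Example
+
+A minimal usage sketch (illustration only, not part of the vendored headers): grf keeps seeding `std::mt19937_64`, whose output is fully specified by the C++ standard, and only swaps the distributions and `shuffle` for the `nonstd` versions, so the mapping from engine output to draws no longer depends on the compiler's standard library:
+
+```cpp
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "random/algorithm.hpp"
+#include "random/random.hpp"
+
+int main() {
+  // The engine itself is portable: mt19937_64 produces the same raw stream everywhere.
+  std::mt19937_64 engine(42);
+
+  // Portable draws in [0, 99]: the same sequence under libstdc++, libc++, and MSVC.
+  nonstd::uniform_int_distribution<size_t> unif(0, 99);
+  for (int i = 0; i < 5; ++i) {
+    std::cout << unif(engine) << " ";
+  }
+  std::cout << std::endl;
+
+  // Portable shuffle, built on the same uniform_int_distribution.
+  std::vector<int> v = {1, 2, 3, 4, 5};
+  nonstd::shuffle(v.begin(), v.end(), engine);
+  return 0;
+}
+```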
diff --git a/core/third_party/random/algorithm.hpp b/core/third_party/random/algorithm.hpp new file mode 100644 index 0000000000..d4b15e29c5 --- /dev/null +++ b/core/third_party/random/algorithm.hpp @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include "random/random.hpp" + +#ifndef _GRFSTD_ALGORITHM +#define _GRFSTD_ALGORITHM + +namespace nonstd { + + template + void shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last, +#ifndef _LIBCPP_CXX03_LANG + _UniformRandomNumberGenerator &&__g) +#else + _UniformRandomNumberGenerator& __g) +#endif + { + typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type; + typedef nonstd::uniform_int_distribution _Dp; + typedef typename _Dp::param_type _Pp; + difference_type __d = __last - __first; + if (__d > 1) { + _Dp __uid; + for (--__last, --__d; __first < __last; ++__first, --__d) { + difference_type __i = __uid(__g, _Pp(0, __d)); + if (__i != difference_type(0)) + std::swap(*__first, *(__first + __i)); + } + } + } +} + +#endif // _GRFSTD_ALGORITHM diff --git a/core/third_party/random/random.hpp b/core/third_party/random/random.hpp new file mode 100644 index 0000000000..52f5658842 --- /dev/null +++ b/core/third_party/random/random.hpp @@ -0,0 +1,1098 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef _LIBGRF_RANDOM +#define _LIBGRF_RANDOM + +namespace nonstd { + + +// Precondition: __x != 0 + inline + unsigned __clz(unsigned __x) { +#ifndef _LIBCPP_COMPILER_MSVC + return static_cast(__builtin_clz(__x)); +#else + static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); + static_assert(sizeof(unsigned long) == 4, ""); + unsigned long where; + // Search from LSB to MSB for first set bit. + // Returns zero if no set bit is found. + if (_BitScanReverse(&where, __x)) + return 31 - where; + return 32; // Undefined Behavior. +#endif + } + + inline + unsigned long __clz(unsigned long __x) { +#ifndef _LIBCPP_COMPILER_MSVC + return static_cast(__builtin_clzl(__x)); +#else + static_assert(sizeof(unsigned) == sizeof(unsigned long), ""); + return __clz(static_cast(__x)); +#endif + } + + inline + unsigned long long __clz(unsigned long long __x) { +#ifndef _LIBCPP_COMPILER_MSVC + return static_cast(__builtin_clzll(__x)); +#else + unsigned long where; +// BitScanReverse scans from MSB to LSB for first set bit. +// Returns 0 if no set bit is found. +#if defined(_LIBCPP_HAS_BITSCAN64) + if (_BitScanReverse64(&where, __x)) + return static_cast(63 - where); +#else + // Scan the high 32 bits. + if (_BitScanReverse(&where, static_cast(__x >> 32))) + return 63 - (where + 32); // Create a bit offset from the MSB. + // Scan the low 32 bits. + if (_BitScanReverse(&where, static_cast(__x))) + return 63 - where; +#endif + return 64; // Undefined Behavior. +#endif // _LIBCPP_COMPILER_MSVC + } + + +// __independent_bits_engine + + template + struct __log2_imp { + static const size_t value = _Xp & ((unsigned long long) (1) << _Rp) ? 
_Rp + : __log2_imp<_Xp, _Rp - 1>::value; + }; + + template + struct __log2_imp<_Xp, 0> { + static const size_t value = 0; + }; + + template + struct __log2_imp<0, _Rp> { + static const size_t value = _Rp + 1; + }; + + template + struct __log2 { + static const size_t value = __log2_imp<_Xp, + sizeof(_UIntType) * __CHAR_BIT__ - 1>::value; + }; + + template + class __independent_bits_engine { + public: + // types + typedef _UIntType result_type; + + private: + typedef typename _Engine::result_type _Engine_result_type; + typedef typename std::conditional + < + sizeof(_Engine_result_type) <= sizeof(result_type), + result_type, + _Engine_result_type + >::type _Working_result_type; + + _Engine &__e_; + size_t __w_; + size_t __w0_; + size_t __n_; + size_t __n0_; + _Working_result_type __y0_; + _Working_result_type __y1_; + _Engine_result_type __mask0_; + _Engine_result_type __mask1_; + +#ifdef _LIBCPP_CXX03_LANG + static const _Working_result_type _Rp = _Engine::_Max - _Engine::_Min + + _Working_result_type(1); +#else + static const _Working_result_type _Rp = _Engine::max() - _Engine::min() + + _Working_result_type(1); +#endif + static const size_t __m = __log2<_Working_result_type, _Rp>::value; + static const size_t _WDt = std::numeric_limits<_Working_result_type>::digits; + static const size_t _EDt = std::numeric_limits<_Engine_result_type>::digits; + + public: + // constructors and seeding functions + __independent_bits_engine(_Engine &__e, size_t __w); + + // generating functions + result_type operator()() { return __eval(std::integral_constant()); } + + private: + result_type __eval(std::false_type); + + result_type __eval(std::true_type); + }; + + template + __independent_bits_engine<_Engine, _UIntType> + ::__independent_bits_engine(_Engine &__e, size_t __w) + : __e_(__e), + __w_(__w) { + __n_ = __w_ / __m + (__w_ % __m != 0); + __w0_ = __w_ / __n_; + if (_Rp == 0) + __y0_ = _Rp; + else if (__w0_ < _WDt) + __y0_ = (_Rp >> __w0_) << __w0_; + else + __y0_ = 0; + if (_Rp - __y0_ > __y0_ / __n_) { + ++__n_; + __w0_ = __w_ / __n_; + if (__w0_ < _WDt) + __y0_ = (_Rp >> __w0_) << __w0_; + else + __y0_ = 0; + } + __n0_ = __n_ - __w_ % __n_; + if (__w0_ < _WDt - 1) + __y1_ = (_Rp >> (__w0_ + 1)) << (__w0_ + 1); + else + __y1_ = 0; + __mask0_ = __w0_ > 0 ? _Engine_result_type(~0) >> (_EDt - __w0_) : + _Engine_result_type(0); + __mask1_ = __w0_ < _EDt - 1 ? 
+ _Engine_result_type(~0) >> (_EDt - (__w0_ + 1)) : + _Engine_result_type(~0); + } + + template + inline + _UIntType + __independent_bits_engine<_Engine, _UIntType>::__eval(std::false_type) { + return static_cast(__e_() & __mask0_); + } + + template + _UIntType + __independent_bits_engine<_Engine, _UIntType>::__eval(std::true_type) { + const size_t _WRt = std::numeric_limits::digits; + result_type _Sp = 0; + for (size_t __k = 0; __k < __n0_; ++__k) { + _Engine_result_type __u; + do { + __u = __e_() - _Engine::min(); + } while (__u >= __y0_); + if (__w0_ < _WRt) + _Sp <<= __w0_; + else + _Sp = 0; + _Sp += __u & __mask0_; + } + for (size_t __k = __n0_; __k < __n_; ++__k) { + _Engine_result_type __u; + do { + __u = __e_() - _Engine::min(); + } while (__u >= __y1_); + if (__w0_ < _WRt - 1) + _Sp <<= __w0_ + 1; + else + _Sp = 0; + _Sp += __u & __mask1_; + } + return _Sp; + } + + +// uniform_int_distribution + + template + class uniform_int_distribution { + public: + // types + typedef _IntType result_type; + + class param_type { + result_type __a_; + result_type __b_; + public: + typedef uniform_int_distribution distribution_type; + + explicit param_type(result_type __a = 0, + result_type __b = std::numeric_limits::max()) + : __a_(__a), __b_(__b) {} + + result_type a() const { return __a_; } + + result_type b() const { return __b_; } + + friend bool operator==(const param_type &__x, const param_type &__y) { + return __x.__a_ == __y.__a_ && __x.__b_ == __y.__b_; + } + + friend bool operator!=(const param_type &__x, const param_type &__y) { return !(__x == __y); } + }; + + private: + param_type __p_; + + public: + // constructors and reset functions + explicit uniform_int_distribution(result_type __a = 0, + result_type __b = std::numeric_limits::max()) + : __p_(param_type(__a, __b)) {} + + explicit uniform_int_distribution(const param_type &__p) : __p_(__p) {} + + void reset() {} + + // generating functions + template + result_type operator()(_URNG &__g) { return (*this)(__g, __p_); } + + template + result_type operator()(_URNG &__g, const param_type &__p); + + // property functions + result_type a() const { return __p_.a(); } + + result_type b() const { return __p_.b(); } + + param_type param() const { return __p_; } + + void param(const param_type &__p) { __p_ = __p; } + + result_type min() const { return a(); } + + result_type max() const { return b(); } + + friend bool operator==(const uniform_int_distribution &__x, + const uniform_int_distribution &__y) { return __x.__p_ == __y.__p_; } + + friend bool operator!=(const uniform_int_distribution &__x, + const uniform_int_distribution &__y) { return !(__x == __y); } + }; + + template + template + typename uniform_int_distribution<_IntType>::result_type + uniform_int_distribution<_IntType>::operator()(_URNG &__g, const param_type &__p) { + typedef typename std::conditional::type _UIntType; + const _UIntType _Rp = __p.b() - __p.a() + _UIntType(1); + if (_Rp == 1) + return __p.a(); + const size_t _Dt = std::numeric_limits<_UIntType>::digits; + typedef __independent_bits_engine<_URNG, _UIntType> _Eng; + if (_Rp == 0) + return static_cast(_Eng(__g, _Dt)()); + size_t __w = _Dt - __clz(_Rp) - 1; + if ((_Rp & (std::numeric_limits<_UIntType>::max() >> (_Dt - __w))) != 0) + ++__w; + _Eng __e(__g, __w); + _UIntType __u; + do { + __u = __e(); + } while (__u >= _Rp); + return static_cast(__u + __p.a()); + } + +#if _LIBCPP_STD_VER <= 14 || defined(_LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE) \ + || defined(_LIBCPP_BUILDING_LIBRARY) + + class __rs_default; + + 
__rs_default __rs_get(); + + class __rs_default { + static unsigned __c_; + + __rs_default(); + + public: + typedef uint_fast32_t result_type; + + static const result_type _Min = 0; + static const result_type _Max = 0xFFFFFFFF; + + __rs_default(const __rs_default &); + + ~__rs_default(); + + result_type operator()(); + + static result_type min() { return _Min; } + + static result_type max() { return _Max; } + + friend __rs_default __rs_get(); + }; + + __rs_default __rs_get(); + + template + void + random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last) { + typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type; + typedef uniform_int_distribution _Dp; + typedef typename _Dp::param_type _Pp; + difference_type __d = __last - __first; + if (__d > 1) { + _Dp __uid; + __rs_default __g = __rs_get(); + for (--__last, --__d; __first < __last; ++__first, --__d) { + difference_type __i = __uid(__g, _Pp(0, __d)); + if (__i != difference_type(0)) + swap(*__first, *(__first + __i)); + } + } + } + + template + void + random_shuffle(_RandomAccessIterator __first, _RandomAccessIterator __last, +#ifndef _LIBCPP_CXX03_LANG + _RandomNumberGenerator &&__rand) +#else + _RandomNumberGenerator& __rand) +#endif + { + typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type; + difference_type __d = __last - __first; + if (__d > 1) { + for (--__last; __first < __last; ++__first, --__d) { + difference_type __i = __rand(__d); + swap(*__first, *(__first + __i)); + } + } + } + +#endif + + template + + _SampleIterator __sample(_PopulationIterator __first, + _PopulationIterator __last, _SampleIterator __output_iter, + _Distance __n, + _UniformRandomNumberGenerator &__g, + std::input_iterator_tag) { + + _Distance __k = 0; + for (; __first != __last && __k < __n; ++__first, (void) ++__k) + __output_iter[__k] = *__first; + _Distance __sz = __k; + for (; __first != __last; ++__first, (void) ++__k) { + _Distance __r = uniform_int_distribution<_Distance>(0, __k)(__g); + if (__r < __sz) + __output_iter[__r] = *__first; + } + return __output_iter + std::min(__n, __k); + } + + + +// generate_canonical + + template + _RealType + __generate_canonical(_URNG &__g) { + const size_t _Dt = std::numeric_limits<_RealType>::digits; + const size_t __b = _Dt < __bits ? 
_Dt : __bits; +#ifdef _LIBCPP_CXX03_LANG + const size_t __logR = __log2::value; +#else + const size_t __logR = __log2::value; +#endif + const size_t __k = __b / __logR + (__b % __logR != 0) + (__b == 0); + const _RealType _Rp = _URNG::max() - _URNG::min() + _RealType(1); + _RealType __base = _Rp; + _RealType _Sp = __g() - _URNG::min(); + for (size_t __i = 1; __i < __k; ++__i, __base *= _Rp) + _Sp += (__g() - _URNG::min()) * __base; + return _Sp / __base; + } + + + + +// poisson distribution + + template + class poisson_distribution { + public: + // types + typedef _IntType result_type; + + class param_type { + double __mean_; + double __s_; + double __d_; + double __l_; + double __omega_; + double __c0_; + double __c1_; + double __c2_; + double __c3_; + double __c_; + + public: + typedef poisson_distribution distribution_type; + + explicit param_type(double __mean = 1.0); + + + double mean() const { return __mean_; } + + friend + bool operator==(const param_type &__x, const param_type &__y) { return __x.__mean_ == __y.__mean_; } + + friend + bool operator!=(const param_type &__x, const param_type &__y) { return !(__x == __y); } + + friend class poisson_distribution; + }; + + private: + param_type __p_; + + public: + // constructors and reset functions + + explicit poisson_distribution(double __mean = 1.0) : __p_(__mean) {} + + + explicit poisson_distribution(const param_type &__p) : __p_(__p) {} + + + void reset() {} + + // generating functions + template + + result_type operator()(_URNG &__g) { return (*this)(__g, __p_); } + + template + result_type operator()(_URNG &__g, const param_type &__p); + + // property functions + + double mean() const { return __p_.mean(); } + + + param_type param() const { return __p_; } + + + void param(const param_type &__p) { __p_ = __p; } + + + result_type min() const { return 0; } + + + result_type max() const { return std::numeric_limits::max(); } + + friend + bool operator==(const poisson_distribution &__x, + const poisson_distribution &__y) { return __x.__p_ == __y.__p_; } + + friend + bool operator!=(const poisson_distribution &__x, + const poisson_distribution &__y) { return !(__x == __y); } + }; + +//#define std std::_LIBCPP_NAMESPACE + + template + poisson_distribution<_IntType>::param_type::param_type(double __mean) + : __mean_(__mean) { + if (__mean_ < 10) { + __s_ = 0; + __d_ = 0; + __l_ = std::exp(-__mean_); + __omega_ = 0; + __c3_ = 0; + __c2_ = 0; + __c1_ = 0; + __c0_ = 0; + __c_ = 0; + } else { + __s_ = std::sqrt(__mean_); + __d_ = 6 * __mean_ * __mean_; + __l_ = static_cast(__mean_ - 1.1484); + __omega_ = .3989423 / __s_; + double __b1_ = .4166667E-1 / __mean_; + double __b2_ = .3 * __b1_ * __b1_; + __c3_ = .1428571 * __b1_ * __b2_; + __c2_ = __b2_ - 15. * __c3_; + __c1_ = __b1_ - 6. * __b2_ + 45. * __c3_; + __c0_ = 1. - __b1_ + 3. * __b2_ - 15. 
* __c3_; + __c_ = .1069 / __mean_; + } + } + + +// uniform_real_distribution + + template + class uniform_real_distribution { + public: + // types + typedef _RealType result_type; + + class param_type { + result_type __a_; + result_type __b_; + public: + typedef uniform_real_distribution distribution_type; + + + explicit param_type(result_type __a = 0, + result_type __b = 1) + : __a_(__a), __b_(__b) {} + + + result_type a() const { return __a_; } + + + result_type b() const { return __b_; } + + friend + bool operator==(const param_type &__x, const param_type &__y) { + return __x.__a_ == __y.__a_ && __x.__b_ == __y.__b_; + } + + friend + bool operator!=(const param_type &__x, const param_type &__y) { return !(__x == __y); } + }; + + private: + param_type __p_; + + public: + // constructors and reset functions + + explicit uniform_real_distribution(result_type __a = 0, result_type __b = 1) + : __p_(param_type(__a, __b)) {} + + + explicit uniform_real_distribution(const param_type &__p) : __p_(__p) {} + + + void reset() {} + + // generating functions + template + + result_type operator()(_URNG &__g) { return (*this)(__g, __p_); } + + template + result_type operator()(_URNG &__g, const param_type &__p); + + // property functions + + result_type a() const { return __p_.a(); } + + + result_type b() const { return __p_.b(); } + + + param_type param() const { return __p_; } + + + void param(const param_type &__p) { __p_ = __p; } + + + result_type min() const { return a(); } + + + result_type max() const { return b(); } + + friend + bool operator==(const uniform_real_distribution &__x, + const uniform_real_distribution &__y) { return __x.__p_ == __y.__p_; } + + friend + bool operator!=(const uniform_real_distribution &__x, + const uniform_real_distribution &__y) { return !(__x == __y); } + }; + + template + template + inline + typename uniform_real_distribution<_RealType>::result_type + uniform_real_distribution<_RealType>::operator()(_URNG &__g, const param_type &__p) { + return (__p.b() - __p.a()) + * __generate_canonical<_RealType, std::numeric_limits<_RealType>::digits>(__g) + + __p.a(); + } + + +// normal_distribution + + template + class normal_distribution { + public: + // types + typedef _RealType result_type; + + class param_type { + result_type __mean_; + result_type __stddev_; + public: + typedef normal_distribution distribution_type; + + + explicit param_type(result_type __mean = 0, result_type __stddev = 1) + : __mean_(__mean), __stddev_(__stddev) {} + + + result_type mean() const { return __mean_; } + + + result_type stddev() const { return __stddev_; } + + friend + bool operator==(const param_type &__x, const param_type &__y) { + return __x.__mean_ == __y.__mean_ && __x.__stddev_ == __y.__stddev_; + } + + friend + bool operator!=(const param_type &__x, const param_type &__y) { return !(__x == __y); } + }; + + private: + param_type __p_; + result_type _V_; + bool _V_hot_; + + public: + // constructors and reset functions + + explicit normal_distribution(result_type __mean = 0, result_type __stddev = 1) + : __p_(param_type(__mean, __stddev)), _V_hot_(false) {} + + + explicit normal_distribution(const param_type &__p) + : __p_(__p), _V_hot_(false) {} + + + void reset() { _V_hot_ = false; } + + // generating functions + template + + result_type operator()(_URNG &__g) { return (*this)(__g, __p_); } + + template + result_type operator()(_URNG &__g, const param_type &__p); + + // property functions + + result_type mean() const { return __p_.mean(); } + + + result_type stddev() const { 
return __p_.stddev(); } + + + param_type param() const { return __p_; } + + + void param(const param_type &__p) { __p_ = __p; } + + + result_type min() const { return -std::numeric_limits::infinity(); } + + + result_type max() const { return std::numeric_limits::infinity(); } + + friend + bool operator==(const normal_distribution &__x, + const normal_distribution &__y) { + return __x.__p_ == __y.__p_ && __x._V_hot_ == __y._V_hot_ && + (!__x._V_hot_ || __x._V_ == __y._V_); + } + + friend + bool operator!=(const normal_distribution &__x, + const normal_distribution &__y) { return !(__x == __y); } + + }; + + template + template + _RealType + normal_distribution<_RealType>::operator()(_URNG &__g, const param_type &__p) { + result_type _Up; + if (_V_hot_) { + _V_hot_ = false; + _Up = _V_; + } else { + uniform_real_distribution _Uni(-1, 1); + result_type __u; + result_type __v; + result_type __s; + do { + __u = _Uni(__g); + __v = _Uni(__g); + __s = __u * __u + __v * __v; + } while (__s > 1 || __s == 0); + result_type _Fp = std::sqrt(-2 * std::log(__s) / __s); + _V_ = __v * _Fp; + _V_hot_ = true; + _Up = __u * _Fp; + } + return _Up * __p.stddev() + __p.mean(); + } + + + + +// exponential_distribution + + template + class exponential_distribution { + public: + // types + typedef _RealType result_type; + + class param_type { + result_type __lambda_; + public: + typedef exponential_distribution distribution_type; + + + explicit param_type(result_type __lambda = 1) : __lambda_(__lambda) {} + + + result_type lambda() const { return __lambda_; } + + friend + bool operator==(const param_type &__x, const param_type &__y) { return __x.__lambda_ == __y.__lambda_; } + + friend + bool operator!=(const param_type &__x, const param_type &__y) { return !(__x == __y); } + }; + + private: + param_type __p_; + + public: + // constructors and reset functions + + explicit exponential_distribution(result_type __lambda = 1) + : __p_(param_type(__lambda)) {} + + + explicit exponential_distribution(const param_type &__p) : __p_(__p) {} + + + void reset() {} + + // generating functions + template + + result_type operator()(_URNG &__g) { return (*this)(__g, __p_); } + + template + result_type operator()(_URNG &__g, const param_type &__p); + + // property functions + + result_type lambda() const { return __p_.lambda(); } + + + param_type param() const { return __p_; } + + + void param(const param_type &__p) { __p_ = __p; } + + + result_type min() const { return 0; } + + + result_type max() const { return std::numeric_limits::infinity(); } + + friend + bool operator==(const exponential_distribution &__x, + const exponential_distribution &__y) { return __x.__p_ == __y.__p_; } + + friend + bool operator!=(const exponential_distribution &__x, + const exponential_distribution &__y) { return !(__x == __y); } + }; + + template + template + _RealType + exponential_distribution<_RealType>::operator()(_URNG &__g, const param_type &__p) { + return -std::log + ( + result_type(1) - + __generate_canonical::digits>(__g) + ) + / __p.lambda(); + } + + +// poisson distribution + + template + template + _IntType + poisson_distribution<_IntType>::operator()(_URNG &__urng, const param_type &__pr) { + result_type __x; + uniform_real_distribution __urd; + if (__pr.__mean_ < 10) { + __x = 0; + for (double __p = __urd(__urng); __p > __pr.__l_; ++__x) + __p *= __urd(__urng); + } else { + double __difmuk; + double __g = __pr.__mean_ + __pr.__s_ * normal_distribution()(__urng); + double __u; + if (__g > 0) { + __x = static_cast(__g); + if (__x >= 
__pr.__l_) + return __x; + __difmuk = __pr.__mean_ - __x; + __u = __urd(__urng); + if (__pr.__d_ * __u >= __difmuk * __difmuk * __difmuk) + return __x; + } + exponential_distribution __edist; + for (bool __using_exp_dist = false; true; __using_exp_dist = true) { + double __e; + if (__using_exp_dist || __g < 0) { + double __t; + do { + __e = __edist(__urng); + __u = __urd(__urng); + __u += __u - 1; + __t = 1.8 + (__u < 0 ? -__e : __e); + } while (__t <= -.6744); + __x = __pr.__mean_ + __pr.__s_ * __t; + __difmuk = __pr.__mean_ - __x; + __using_exp_dist = true; + } + double __px; + double __py; + if (__x < 10) { + const double __fac[] = {1, 1, 2, 6, 24, 120, 720, 5040, + 40320, 362880}; + __px = -__pr.__mean_; + __py = std::pow(__pr.__mean_, (double) __x) / __fac[__x]; + } else { + double __del = .8333333E-1 / __x; + __del -= 4.8 * __del * __del * __del; + double __v = __difmuk / __x; + if (std::abs(__v) > 0.25) + __px = __x * std::log(1 + __v) - __difmuk - __del; + else + __px = __x * __v * __v * (((((((.1250060 * __v + -.1384794) * + __v + .1421878) * __v + -.1661269) * __v + .2000118) * + __v + -.2500068) * __v + .3333333) * __v + -.5) - __del; + __py = .3989423 / std::sqrt(__x); + } + double __r = (0.5 - __difmuk) / __pr.__s_; + double __r2 = __r * __r; + double __fx = -0.5 * __r2; + double __fy = __pr.__omega_ * (((__pr.__c3_ * __r2 + __pr.__c2_) * + __r2 + __pr.__c1_) * __r2 + __pr.__c0_); + if (__using_exp_dist) { + if (__pr.__c_ * std::abs(__u) <= __py * std::exp(__px + __e) - + __fy * std::exp(__fx + __e)) + break; + } else { + if (__fy - __u * __fy <= __py * std::exp(__px - __fx)) + break; + } + } + } + return __x; + } + + +// discrete_distribution + + template + class discrete_distribution { + public: + // types + typedef _IntType result_type; + + class param_type { + std::vector __p_; + public: + typedef discrete_distribution distribution_type; + + + param_type() {} + + template + + param_type(_InputIterator __f, _InputIterator __l) + : __p_(__f, __l) { __init(); } + +#ifndef _LIBCPP_CXX03_LANG + + + param_type(std::initializer_list __wl) + : __p_(__wl.begin(), __wl.end()) { __init(); } + +#endif // _LIBCPP_CXX03_LANG + + template + param_type(size_t __nw, double __xmin, double __xmax, + _UnaryOperation __fw); + + std::vector probabilities() const; + + friend + bool operator==(const param_type &__x, const param_type &__y) { return __x.__p_ == __y.__p_; } + + friend + bool operator!=(const param_type &__x, const param_type &__y) { return !(__x == __y); } + + private: + void __init(); + + friend class discrete_distribution; + + }; + + private: + param_type __p_; + + public: + // constructor and reset functions + + discrete_distribution() {} + + template + + discrete_distribution(_InputIterator __f, _InputIterator __l) + : __p_(__f, __l) {} + +#ifndef _LIBCPP_CXX03_LANG + + + discrete_distribution(std::initializer_list __wl) + : __p_(__wl) {} + +#endif // _LIBCPP_CXX03_LANG + + template + + discrete_distribution(size_t __nw, double __xmin, double __xmax, + _UnaryOperation __fw) + : __p_(__nw, __xmin, __xmax, __fw) {} + + + explicit discrete_distribution(const param_type &__p) + : __p_(__p) {} + + + void reset() {} + + // generating functions + template + + result_type operator()(_URNG &__g) { return (*this)(__g, __p_); } + + template + result_type operator()(_URNG &__g, const param_type &__p); + + // property functions + + std::vector probabilities() const { return __p_.probabilities(); } + + + param_type param() const { return __p_; } + + + void param(const param_type &__p) { __p_ 
= __p; }
+
+        result_type min() const { return 0; }
+
+        result_type max() const { return __p_.__p_.size(); }
+
+        friend
+        bool operator==(const discrete_distribution &__x,
+                        const discrete_distribution &__y) { return __x.__p_ == __y.__p_; }
+
+        friend
+        bool operator!=(const discrete_distribution &__x,
+                        const discrete_distribution &__y) { return !(__x == __y); }
+
+    };
+
+    template
+    template
+    discrete_distribution<_IntType>::param_type::param_type(size_t __nw,
+                                                            double __xmin,
+                                                            double __xmax,
+                                                            _UnaryOperation __fw) {
+        if (__nw > 1) {
+            __p_.reserve(__nw - 1);
+            double __d = (__xmax - __xmin) / __nw;
+            double __d2 = __d / 2;
+            for (size_t __k = 0; __k < __nw; ++__k)
+                __p_.push_back(__fw(__xmin + __k * __d + __d2));
+            __init();
+        }
+    }
+
+    template
+    void
+    discrete_distribution<_IntType>::param_type::__init() {
+        if (!__p_.empty()) {
+            if (__p_.size() > 1) {
+                double __s = std::accumulate(__p_.begin(), __p_.end(), 0.0);
+                for (std::vector::iterator __i = __p_.begin(), __e = __p_.end();
+                     __i < __e; ++__i)
+                    *__i /= __s;
+                std::vector __t(__p_.size() - 1);
+                std::partial_sum(__p_.begin(), __p_.end() - 1, __t.begin());
+                swap(__p_, __t);
+            } else {
+                __p_.clear();
+                __p_.shrink_to_fit();
+            }
+        }
+    }
+
+    template
+    std::vector
+    discrete_distribution<_IntType>::param_type::probabilities() const {
+        size_t __n = __p_.size();
+        std::vector __p(__n + 1);
+        std::adjacent_difference(__p_.begin(), __p_.end(), __p.begin());
+        if (__n > 0)
+            __p[__n] = 1 - __p_[__n - 1];
+        else
+            __p[0] = 1;
+        return __p;
+    }
+
+    template
+    template
+    _IntType
+    discrete_distribution<_IntType>::operator()(_URNG &__g, const param_type &__p) {
+        uniform_real_distribution __gen;
+        return static_cast<_IntType>(
+                std::upper_bound(__p.__p_.begin(), __p.__p_.end(), __gen(__g)) -
+                __p.__p_.begin());
+    }
+
+} // namespace nonstd
+#endif // _LIBGRF_RANDOM
diff --git a/r-package/build_package.R b/r-package/build_package.R
index 323788b42b..11c35182c2 100755
--- a/r-package/build_package.R
+++ b/r-package/build_package.R
@@ -11,24 +11,9 @@
 library(Rcpp)
 library(devtools)
 library(testthat)
 library(roxygen2)
-library(lintr)
 
 package.name <- "grf"
 
-# Check code style consistency
-linters <- with_defaults(
-  line_length_linter(120), # Max line length = 120
-  object_name_linter = NULL, # To allow variable.names + function_names
-  commented_code_linter = NULL, # Misc. false positives
-  object_usage_linter = NULL, # Misc. false positives
-  spaces_left_parentheses_linter = NULL, # Misc. false positives
-  default = default_linters);
-
-lint_res <- lint_package(path = package.name, linters = linters, exclusions = file.path(package.name, "R/RcppExports.R"))
-lint_res
-
-if (length(lint_res) > 0) quit(status = 1)
-
 # If built for CRAN, exclude all tests except ones with "cran" in the filename
 # by adding the following regex to .Rbuildignore.
 if (!is.na(args[1]) && args[1] == "--as-cran") {
diff --git a/r-package/grf/R/causal_tuning.R b/r-package/grf/R/causal_tuning.R
index 3e4c17c1a6..3d1993a124 100644
--- a/r-package/grf/R/causal_tuning.R
+++ b/r-package/grf/R/causal_tuning.R
@@ -84,7 +84,7 @@
 #' @export
 tune_causal_forest <- function(X, Y, W, Y.hat, W.hat,
                                sample.weights = NULL,
-                               num.fit.trees = 200,
+                               num.fit.trees = 100,
                                num.fit.reps = 50,
                                num.optimize.reps = 1000,
                                min.node.size = NULL,
@@ -123,6 +123,14 @@ tune_causal_forest <- function(X, Y, W, Y.hat, W.hat,
   all.params <- get_initial_params(min.node.size, sample.fraction, mtry, alpha, imbalance.penalty)
   fixed.params <- all.params[!is.na(all.params)]
   tuning.params <- all.params[is.na(all.params)]
+  default.params <- c(
+    min.node.size = validate_min_node_size(min.node.size),
+    sample.fraction = validate_sample_fraction(sample.fraction),
+    mtry = validate_mtry(mtry, X),
+    alpha = validate_alpha(alpha),
+    imbalance.penalty = validate_imbalance_penalty(imbalance.penalty)
+  )
+  default.params[!is.na(all.params)] <- all.params[!is.na(all.params)]
 
   if (length(tuning.params) == 0) {
     return(list("error" = NA, "params" = c(all.params)))
@@ -134,7 +142,7 @@
   colnames(fit.draws) <- names(tuning.params)
 
   compute.oob.predictions <- TRUE
-  debiased.errors <- apply(fit.draws, 1, function(draw) {
+  small.forest.errors <- apply(fit.draws, 1, function(draw) {
     params <- c(fixed.params, get_params_from_draw(X, draw))
     small.forest <- causal_train(
       data$default, data$sparse,
@@ -165,35 +173,147 @@
     mean(prediction$debiased.error, na.rm = TRUE)
   })
 
+  if (any(is.na(small.forest.errors))) {
+    warning(paste0(
+      "Could not tune causal forest because some small forest error estimates were NA.\n",
+      "Consider increasing tuning argument num.fit.trees."
+    ))
+    out <- get_tuning_output(params = c(default.params), status = "failure")
+    return(out)
+  }
+
+  if (sd(small.forest.errors) / mean(small.forest.errors) < 1e-10) {
+    warning(paste0(
+      "Could not tune causal forest because small forest errors were nearly constant.\n",
+      "Consider increasing tuning argument num.fit.trees."
+    ))
+    out <- get_tuning_output(params = c(default.params), status = "failure")
+    return(out)
+  }
+
   # Fit the 'dice kriging' model to these error estimates.
   # Note that in the 'km' call, the kriging package prints a large amount of information
   # about the fitting process. Here, capture its console output and discard it.
-  variance.guess <- rep(var(debiased.errors) / 2, nrow(fit.draws))
-  env <- new.env()
-  capture.output(env$kriging.model <-
-                   DiceKriging::km(
-                     design = data.frame(fit.draws),
-                     response = debiased.errors,
-                     noise.var = variance.guess
-                   ))
-  kriging.model <- env$kriging.model
+  variance.guess <- rep(var(small.forest.errors) / 2, nrow(fit.draws))
+  kriging.model <- tryCatch(
+    {
+      capture.output(
+        model <- DiceKriging::km(
+          design = data.frame(fit.draws),
+          response = small.forest.errors,
+          noise.var = variance.guess
+        )
+      )
+      model
+    },
+    error = function(e) {
+      warning(paste0("DiceKriging threw the following error during causal tuning: \n", e))
+      return(NULL)
+    }
+  )
+  if (is.null(kriging.model)) {
+    warning("Tuning was attempted but failed. Reverting to default parameters.")
+    out <- get_tuning_output(params = default.params, status = "failure")
+    return(out)
+  }
 
   # To determine the optimal parameter values, predict using the kriging model at a large
   # number of random values, then select those that produced the lowest error.
   optimize.draws <- matrix(runif(num.optimize.reps * num.params), num.optimize.reps, num.params)
   colnames(optimize.draws) <- names(tuning.params)
-  model.surface <- predict(kriging.model, newdata = data.frame(optimize.draws), type = "SK")
-
+  model.surface <- predict(kriging.model, newdata = data.frame(optimize.draws), type = "SK")$mean
   tuned.params <- get_params_from_draw(X, optimize.draws)
-  grid <- cbind(error = model.surface$mean, tuned.params)
-  optimal.draw <- which.min(grid[, "error"])
-  optimal.param <- grid[optimal.draw, ]
-  out <- list(
-    error = optimal.param[1], params = c(fixed.params, optimal.param[-1]),
-    grid = grid
+  grid <- cbind(error = c(model.surface), tuned.params)
+  small.forest.optimal.draw <- which.min(grid[, "error"])
+  small.forest.optimal.params <- grid[small.forest.optimal.draw, -1]
+
+  # To avoid the possibility of selection bias, re-train a moderately-sized forest
+  # at the value chosen by the method above
+  retrained.forest.params <- c(fixed.params, grid[small.forest.optimal.draw, -1])
+  retrained.forest.num.trees <- num.fit.trees * 4
+  retrained.forest <- causal_train(
+    data$default, data$sparse,
+    outcome.index, treatment.index, sample.weight.index,
+    !is.null(sample.weights),
+    retrained.forest.params["mtry"],
+    retrained.forest.num.trees,
+    retrained.forest.params["min.node.size"],
+    retrained.forest.params["sample.fraction"],
+    honesty,
+    coerce_honesty_fraction(honesty.fraction),
+    prune.empty.leaves,
+    ci.group.size,
+    reduced.form.weight,
+    retrained.forest.params["alpha"],
+    retrained.forest.params["imbalance.penalty"],
+    stabilize.splits,
+    clusters,
+    samples.per.cluster,
+    compute.oob.predictions,
+    num.threads,
+    seed
+  )
+
+  retrained.forest.prediction <- causal_predict_oob(
+    retrained.forest, data$default, data$sparse,
+    outcome.index, treatment.index, num.threads, FALSE
   )
-  class(out) <- c("tuning_output")
+
+  retrained.forest.error <- mean(retrained.forest.prediction$debiased.error, na.rm = TRUE)
+
+  # Train a forest with default parameters, and check its predicted error.
+  # This improves our chances of not doing worse than default
+  num.default.forest.trees <- num.fit.trees * 4
+
+  default.forest <- causal_train(
+    data$default, data$sparse,
+    outcome.index, treatment.index, sample.weight.index,
+    !is.null(sample.weights),
+    default.params["mtry"],
+    num.default.forest.trees,
+    default.params["min.node.size"],
+    default.params["sample.fraction"],
+    honesty,
+    coerce_honesty_fraction(honesty.fraction),
+    prune.empty.leaves,
+    ci.group.size,
+    reduced.form.weight,
+    default.params["alpha"],
+    default.params["imbalance.penalty"],
+    stabilize.splits,
+    clusters,
+    samples.per.cluster,
+    compute.oob.predictions,
+    num.threads,
+    seed
+  )
+
+  default.forest.prediction <- causal_predict_oob(
+    default.forest, data$default, data$sparse,
+    outcome.index, treatment.index, num.threads, FALSE
+  )
+
+  default.forest.error <- mean(default.forest.prediction$debiased.error, na.rm = TRUE)
+
+  # Now compare predicted errors: default parameters vs retrained parameters
+  if (default.forest.error < retrained.forest.error) {
+    out <- get_tuning_output(
+      error = default.forest.error,
+      params = default.params,
+      grid = NA,
+      status = "default"
+    )
+  } else {
+    out <- get_tuning_output(
+      error = retrained.forest.error,
+      params = retrained.forest.params,
+      grid = grid,
+      status = "tuned"
+    )
+  }
 
   out
 }
diff --git a/r-package/grf/R/plot.R b/r-package/grf/R/plot.R
index ff05fd9977..6e9ea89073 100644
--- a/r-package/grf/R/plot.R
+++ b/r-package/grf/R/plot.R
@@ -3,7 +3,7 @@
 #' If it is a non-leaf node: show its splitting variable and splitting value
 #' @param tree the tree to convert
 #' @param index the index of the current node
-#' @keyword internal
+#' @keywords internal
 create_dot_body <- function(tree, index = 1) {
   node <- tree$nodes[[index]]
@@ -68,7 +68,7 @@ size = ", num_samples, '"];')
 #' This function generates a GraphViz representation of the tree,
 #' which is then written into `dot_string`.
 #' @param tree the tree to convert
-#' @keyword internal
+#' @keywords internal
 export_graphviz <- function(tree) {
   header <- "digraph nodes { \n node [shape=box] ;"
   footer <- "}"
diff --git a/r-package/grf/R/print.R b/r-package/grf/R/print.R
index 949a9a085e..2ab3851000 100644
--- a/r-package/grf/R/print.R
+++ b/r-package/grf/R/print.R
@@ -94,30 +94,47 @@ print.boosted_regression_forest <- function(x, ...) {
 #' @importFrom stats aggregate quantile
 #' @export
 print.tuning_output <- function(x, tuning.quantiles = seq(0, 1, 0.2), ...) {
-  grid <- x$grid
-  out <- lapply(colnames(grid)[-1], function(name) {
-    q <- quantile(grid[, name], probs = tuning.quantiles)
-    # Cannot form for example quintiles for mtry if the number of variables is
-    # less than 5, so here we just truncate the groups.
-    if (length(unique(q) < length(q))) {
-      q <- unique(q)
-    }
-    rank <- cut(grid[, name], q, include.lowest = TRUE)
-    out <- aggregate(grid[, "error"], by = list(rank), FUN = mean)
-    colnames(out) <- c(name, "error")
-    out
-  })
-
-  err <- x$error
-  params <- x$params[colnames(grid)[-1]]
-  opt <- formatC(c(err, params))
-
-  cat("Optimal tuning parameters: \n")
-  cat(paste0(names(opt), ": ", opt, "\n"))
-
-  cat("Average error by ", length(tuning.quantiles) - 1, "-quantile:\n", sep = "")
-  for (i in out) {
+  if (x$status == "failure") {
+    cat("Tuning status: failure.\n")
+    cat("This indicates tuning was attempted but failed due to an error, and we fell back to default parameters: \n\n")
+    params <- x$params
+    cat(paste0(names(params), ": ", params, "\n"))
+  } else if (x$status == "default") {
+    cat("Tuning status: default.\n")
+    cat("This indicates tuning was attempted. ")
+    cat("However, we could not find parameters that were expected to perform better than default: \n\n")
+    params <- x$params
+    cat(paste0(names(params), ": ", params, "\n"))
+  } else if (x$status == "tuned") {
+    cat("Tuning status: tuned.\n")
+    cat("This indicates tuning found parameters that are expected to perform better than default. \n\n")
+    grid <- x$grid
+    out <- lapply(colnames(grid)[-1], function(name) {
+      q <- quantile(grid[, name], probs = tuning.quantiles)
+      # Cannot form for example quintiles for mtry if the number of variables is
+      # less than 5, so here we just truncate the groups.
+      if (length(unique(q)) < length(q)) {
+        q <- unique(q)
+      }
+      rank <- cut(grid[, name], q, include.lowest = TRUE)
+      out <- aggregate(grid[, "error"], by = list(rank), FUN = mean)
+      colnames(out) <- c(name, "error")
+      out
+    })
+
+    cat(paste0("Predicted debiased error: ", x$error, "\n\n"))
+
+    cat("Tuned parameters: \n")
+    cat(paste0(names(x$params), ": ", x$params, "\n"))
     cat("\n")
-    print(i, row.names = FALSE)
+
+    cat("Average error by ", length(tuning.quantiles) - 1, "-quantile:\n", sep = "")
+    for (i in out) {
+      cat("\n")
+      print(i, row.names = FALSE)
+    }
+  } else {
+    stop(paste0("Error while reading tuning output. ",
", + "Parameter 'status' must be one of 'failure', 'default', or 'tuned'")) } } diff --git a/r-package/grf/R/regression_tuning.R b/r-package/grf/R/regression_tuning.R index bc1dc61f91..5b8b3a61e8 100644 --- a/r-package/grf/R/regression_tuning.R +++ b/r-package/grf/R/regression_tuning.R @@ -109,6 +109,14 @@ tune_regression_forest <- function(X, Y, fixed.params <- all.params[!is.na(all.params)] tuning.params <- all.params[is.na(all.params)] + default.params <- c( + min.node.size = validate_min_node_size(min.node.size), + sample.fraction = validate_sample_fraction(sample.fraction), + mtry = validate_mtry(mtry, X), + alpha = validate_alpha(alpha), + imbalance.penalty = validate_imbalance_penalty(imbalance.penalty) + ) + default.params[!is.na(all.params)] <- all.params[!is.na(all.params)] if (length(tuning.params) == 0) { return(list("error" = NA, "params" = c(all.params))) @@ -120,7 +128,7 @@ tune_regression_forest <- function(X, Y, colnames(fit.draws) <- names(tuning.params) compute.oob.predictions <- TRUE - debiased.errors <- apply(fit.draws, 1, function(draw) { + small.forest.errors <- apply(fit.draws, 1, function(draw) { params <- c(fixed.params, get_params_from_draw(X, draw)) small.forest <- regression_train( data$default, data$sparse, outcome.index, sample.weight.index, @@ -150,35 +158,139 @@ tune_regression_forest <- function(X, Y, mean(error, na.rm = TRUE) }) + if (any(is.na(small.forest.errors))) { + warning(paste0( + "Could not tune causal forest because all small forest error estimates were NA.\n", + "Consider increasing tuning argument num.fit.trees." + )) + out <- get_tuning_output(params = c(all.params), status = "failure") + return(out) + } + + if (sd(small.forest.errors) / mean(small.forest.errors) < 1e-10) { + warning(paste0( + "Could not tune causal forest because small forest errors were nearly constant.\n", + "Consider increasing argument num.fit.trees." + )) + out <- get_tuning_output(params = c(all.params), status = "failure") + return(out) + } + # Fit the 'dice kriging' model to these error estimates. # Note that in the 'km' call, the kriging package prints a large amount of information - # about the fitting process. Here, capture its console output and discard it. - variance.guess <- rep(var(debiased.errors) / 2, nrow(fit.draws)) - env <- new.env() - capture.output(env$kriging.model <- - DiceKriging::km( - design = data.frame(fit.draws), - response = debiased.errors, - noise.var = variance.guess - )) - kriging.model <- env$kriging.model + # about the fitting process. Heres, capture its console output and discard it. + variance.guess <- rep(var(small.forest.errors) / 2, nrow(fit.draws)) + kriging.model <- tryCatch( + { + capture.output( + model <- DiceKriging::km( + design = data.frame(fit.draws), + response = small.forest.errors, + noise.var = variance.guess + ) + ) + model + }, + error = function(e) { + warning(paste0("Dicekriging threw the following error during regression tuning: \n", e)) + return(NULL) + } + ) + if (is.null(kriging.model)) { + warning("Tuning was attempted but failed. Reverting to default parameters.") + out <- get_tuning_output(params = default.parameters, status = "failure") + return(out) + } # To determine the optimal parameter values, predict using the kriging model at a large # number of random values, then select those that produced the lowest error. 
   optimize.draws <- matrix(runif(num.optimize.reps * num.params), num.optimize.reps, num.params)
   colnames(optimize.draws) <- names(tuning.params)
-  model.surface <- predict(kriging.model, newdata = data.frame(optimize.draws), type = "SK")
-
+  model.surface <- predict(kriging.model, newdata = data.frame(optimize.draws), type = "SK")$mean
   tuned.params <- get_params_from_draw(X, optimize.draws)
-  grid <- cbind(error = model.surface$mean, tuned.params)
-  optimal.draw <- which.min(grid[, "error"])
-  optimal.param <- grid[optimal.draw, ]
-  out <- list(
-    error = optimal.param[1], params = c(fixed.params, optimal.param[-1]),
-    grid = grid
+  grid <- cbind(error = c(model.surface), tuned.params)
+  small.forest.optimal.draw <- which.min(grid[, "error"])
+  small.forest.optimal.params <- grid[small.forest.optimal.draw, -1]
+
+  # To avoid the possibility of selection bias, re-train a moderately-sized forest
+  # at the value chosen by the method above
+  retrained.forest.params <- c(fixed.params, grid[small.forest.optimal.draw, -1])
+  retrained.forest.num.trees <- num.fit.trees * 4
+  retrained.forest <- regression_train(
+    data$default, data$sparse, outcome.index, sample.weight.index,
+    !is.null(sample.weights),
+    as.numeric(retrained.forest.params["mtry"]),
+    retrained.forest.num.trees,
+    as.numeric(retrained.forest.params["min.node.size"]),
+    as.numeric(retrained.forest.params["sample.fraction"]),
+    honesty,
+    coerce_honesty_fraction(honesty.fraction),
+    prune.empty.leaves,
+    ci.group.size,
+    as.numeric(retrained.forest.params["alpha"]),
+    as.numeric(retrained.forest.params["imbalance.penalty"]),
+    clusters,
+    samples.per.cluster,
+    compute.oob.predictions,
+    num.threads,
+    seed
+  )
+
+  retrained.forest.prediction <- regression_predict_oob(
+    retrained.forest, data$default, data$sparse,
+    outcome.index, num.threads, FALSE
   )
-  class(out) <- c("tuning_output")
+
+  retrained.forest.error <- mean(retrained.forest.prediction$debiased.error, na.rm = TRUE)
+
+  # Train a forest with default parameters, and check its predicted error.
+  # This improves our chances of not doing worse than default
+  num.default.forest.trees <- num.fit.trees * 4
+
+  default.forest <- regression_train(
+    data$default, data$sparse, outcome.index, sample.weight.index,
+    !is.null(sample.weights),
+    as.numeric(default.params["mtry"]),
+    num.default.forest.trees,
+    as.numeric(default.params["min.node.size"]),
+    as.numeric(default.params["sample.fraction"]),
+    honesty,
+    coerce_honesty_fraction(honesty.fraction),
+    prune.empty.leaves,
+    ci.group.size,
+    as.numeric(default.params["alpha"]),
+    as.numeric(default.params["imbalance.penalty"]),
+    clusters,
+    samples.per.cluster,
+    compute.oob.predictions,
+    num.threads,
+    seed
+  )
+
+  default.forest.prediction <- regression_predict_oob(
+    default.forest, data$default, data$sparse,
+    outcome.index, num.threads, FALSE
+  )
+
+  default.forest.error <- mean(default.forest.prediction$debiased.error, na.rm = TRUE)
+
+  # Now compare predicted errors: default parameters vs retrained parameters
+  if (default.forest.error < retrained.forest.error) {
+    out <- get_tuning_output(
+      error = default.forest.error,
+      params = default.params,
+      grid = NA,
+      status = "default"
+    )
+  } else {
+    out <- get_tuning_output(
+      error = retrained.forest.error,
+      params = retrained.forest.params,
+      grid = grid,
+      status = "tuned"
+    )
+  }
 
   out
 }
diff --git a/r-package/grf/R/tuning_utilities.R b/r-package/grf/R/tuning_utilities.R
index da0949a228..a84a3793dc 100644
--- a/r-package/grf/R/tuning_utilities.R
+++ b/r-package/grf/R/tuning_utilities.R
@@ -33,3 +33,9 @@ get_params_from_draw <- function(X, draws) {
     }
   }, FUN.VALUE = numeric(n))
 }
+
+get_tuning_output <- function(status, params, error = NA, grid = NA) {
+  out <- list(status = status, params = params, error = error, grid = grid)
+  class(out) <- c("tuning_output")
+  out
+}
diff --git a/r-package/grf/src/random b/r-package/grf/src/random
new file mode 120000
index 0000000000..7c66ed57ec
--- /dev/null
+++ b/r-package/grf/src/random
@@ -0,0 +1 @@
+../../../core/third_party/random
\ No newline at end of file
diff --git a/r-package/grf/tests/testthat/test_regression_forest.R b/r-package/grf/tests/testthat/test_regression_forest.R
index cc2e6fbe09..ef2751afcc 100755
--- a/r-package/grf/tests/testthat/test_regression_forest.R
+++ b/r-package/grf/tests/testthat/test_regression_forest.R
@@ -221,8 +221,8 @@ test_that("a non-pruned honest regression forest has lower MSE than a pruned hon
   X <- matrix(rnorm(n * p), n, p)
   Y <- abs(X[, 1]) + 0.1 * rnorm(n)
 
-  f1 <- regression_forest(X, Y, honesty = TRUE, honesty.fraction = 0.9, prune = TRUE)
-  f2 <- regression_forest(X, Y, honesty = TRUE, honesty.fraction = 0.9, prune = FALSE)
+  f1 <- regression_forest(X, Y, honesty = TRUE, honesty.fraction = 0.9, prune.empty.leaves = TRUE)
+  f2 <- regression_forest(X, Y, honesty = TRUE, honesty.fraction = 0.9, prune.empty.leaves = FALSE)
 
   mse.pruned <- mean((predict(f1)$predictions - Y)^2)
   mse.notpruned <- mean((predict(f2)$predictions - Y)^2)
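For context, a short sketch of how the reworked tuning flow surfaces its outcome (hypothetical session; assumes the package is built from this branch). Tuning now always returns a `tuning_output` object whose `status` field is `"tuned"`, `"default"`, or `"failure"`, and `print.tuning_output` dispatches on it:

```r
library(grf)

n <- 400; p <- 5
X <- matrix(rnorm(n * p), n, p)
W <- rbinom(n, 1, 0.5)
Y <- pmax(X[, 1], 0) * W + X[, 2] + rnorm(n)

# Nuisance estimates: out-of-bag Y.hat, constant W.hat under randomization.
Y.hat <- predict(regression_forest(X, Y))$predictions
W.hat <- rep(0.5, n)

tuned <- tune_causal_forest(X, Y, W, Y.hat, W.hat)
tuned$status  # "tuned", "default", or "failure"
print(tuned)  # dispatches to the new print.tuning_output method
```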