-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathJenksTest.cpp
390 lines (317 loc) · 10.3 KB
/
JenksTest.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
#define _ITERATOR_DEBUG_LEVEL 0
#include <assert.h>
#include <vector>
#include <algorithm>
#include "set/DataCompare.h"
using break_array = std::vector<Float64>;
typedef std::size_t SizeT;
typedef SizeT CountType;
typedef std::pair<double, CountType> ValueCountPair;
typedef std::vector<ValueCountPair> ValueCountPairContainer;
// helper funcs
template <typename T> T Min(T a, T b) { return (a<b) ? a : b; }
SizeT GetTotalCount(const ValueCountPairContainer& vcpc)
{
SizeT sum = 0;
ValueCountPairContainer::const_iterator
i = vcpc.begin(),
e = vcpc.end();
for(sum = 0; i!=e; ++i)
sum += (*i).second;
return sum;
}
// helper struct JenksFisher
struct JenksFisher // captures the intermediate data and methods for the calculation of Natural Class Breaks.
{
JenksFisher(const ValueCountPairContainer& vcpc, SizeT k)
: m_M(vcpc.size())
, m_K(k)
, m_BufSize(vcpc.size()-(k-1))
, m_PrevSSM(m_BufSize)
, m_CurrSSM(m_BufSize)
, m_CB(m_BufSize * (m_K-1))
, m_CBPtr()
{
m_CumulValues.reserve(vcpc.size());
double cwv=0;
CountType cw = 0, w;
for(SizeT i=0; i!=m_M; ++i)
{
assert(!i || vcpc[i].first > vcpc[i-1].first); // PRECONDITION: the value sequence must be strictly increasing
w = vcpc[i].second;
assert(w > 0); // PRECONDITION: all weights must be positive
cw += w;
assert(cw > w || !i); // No overflow? No loss of precision?
cwv += w * vcpc[i].first;
m_CumulValues.push_back(ValueCountPair(cwv, cw));
if (i < m_BufSize)
m_PrevSSM[i] = cwv * cwv / cw; // prepare SSM for first class. Last (k-1) values are omitted
}
}
double GetW(SizeT b, SizeT e)
// Gets sum of weighs for elements b..e.
{
assert(b<=e);
assert(e<m_M);
double res = m_CumulValues[e].second;
if (b) res -= m_CumulValues[b-1].second;
return res;
}
double GetWV(SizeT b, SizeT e)
// Gets sum of weighed values for elements b..e
{
assert(b<=e);
assert(e<m_M);
double res = m_CumulValues[e].first;
if (b) res -= m_CumulValues[b-1].first;
return res;
}
double GetSSM(SizeT b, SizeT e)
// Gets the Squared Mean for elements b..e, multiplied by weight.
// Note that n*mean^2 = sum^2/n when mean := sum/n
{
double res = GetWV(b,e);
return res * res / GetW(b,e);
}
SizeT FindMaxBreakIndex(SizeT i, SizeT bp, SizeT ep)
// finds CB[i+m_NrCompletedRows] given that
// the result is at least bp+(m_NrCompletedRows-1)
// and less than ep+(m_NrCompletedRows-1)
// Complexity: O(ep-bp) <= O(m)
{
assert(bp < ep);
assert(bp <= i);
assert(ep <= i+1);
assert(i < m_BufSize);
assert(ep <= m_BufSize);
double minSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows);
SizeT foundP = bp;
while (++bp < ep)
{
double currSSM = m_PrevSSM[bp] + GetSSM(bp+m_NrCompletedRows, i+m_NrCompletedRows);
if (currSSM > minSSM)
{
minSSM = currSSM;
foundP = bp;
}
}
m_CurrSSM[i] = minSSM;
return foundP;
}
void CalcRange(SizeT bi, SizeT ei, SizeT bp, SizeT ep)
// find CB[i+m_NrCompletedRows]
// for all i>=bi and i<ei given that
// the results are at least bp+(m_NrCompletedRows-1)
// and less than ep+(m_NrCompletedRows-1)
// Complexity: O(log(ei-bi)*Max((ei-bi),(ep-bp))) <= O(m*log(m))
{
assert(bi <= ei);
assert(ep <= ei);
assert(bp <= bi);
if (bi == ei)
return;
assert(bp < ep);
SizeT mi = (bi + ei)/2;
SizeT mp = FindMaxBreakIndex(mi, bp, Min<SizeT>(ep, mi+1));
assert(bp <= mp);
assert(mp < ep);
assert(mp <= mi);
// solve first half of the sub-problems with lower 'half' of possible outcomes
CalcRange(bi, mi, bp, Min<SizeT>(mi, mp+1));
m_CBPtr[ mi ] = mp; // store result for the middle element.
// solve second half of the sub-problems with upper 'half' of possible outcomes
CalcRange(mi+1, ei, mp, ep);
}
void CalcAll()
// complexity: O(m*log(m)*k)
{
if (m_K>=2)
{
m_CBPtr = m_CB.begin();
for (m_NrCompletedRows=1; m_NrCompletedRows<m_K-1; ++m_NrCompletedRows)
{
CalcRange(0, m_BufSize, 0, m_BufSize); // complexity: O(m*log(m))
m_PrevSSM.swap(m_CurrSSM);
m_CBPtr += m_BufSize;
}
}
}
SizeT m_M, m_K, m_BufSize;
ValueCountPairContainer m_CumulValues;
std::vector<double> m_PrevSSM;
std::vector<double> m_CurrSSM;
std::vector<SizeT> m_CB;
std::vector<SizeT>::iterator m_CBPtr;
SizeT m_NrCompletedRows;
};
// GetValueCountPairs
//
// GetValueCountPairs sorts chunks of values and then merges them in order to minimize extra memory and work when many values are equal.
// This is done recursively while retaining used intermediary buffers in order to minimize heap allocations.
const SizeT BUFFER_SIZE = 1024;
void GetCountsDirect(ValueCountPairContainer& vcpc, const double* values, SizeT size)
{
assert(size <= BUFFER_SIZE);
assert(size > 0);
assert(vcpc.empty());
double buffer[BUFFER_SIZE];
std::copy(values, values+size, buffer);
std::sort(buffer, buffer+size, DataCompare<double>());
double currValue = buffer[0];
SizeT currCount = 1;
for (SizeT index = 1; index != size; ++index)
{
if (currValue < buffer[index])
{
vcpc.push_back(ValueCountPair(currValue, currCount));
currValue = buffer[index];
currCount = 1;
}
else
++currCount;
}
vcpc.push_back(ValueCountPair(currValue, currCount));
}
struct CompareFirst
{
bool operator () (const ValueCountPair& lhs, const ValueCountPair& rhs)
{
return lhs.first < rhs.first;
}
};
void MergeToLeft(ValueCountPairContainer& vcpcLeft, const ValueCountPairContainer& vcpcRight, ValueCountPairContainer& vcpcDummy)
{
assert(vcpcDummy.empty());
vcpcDummy.swap(vcpcLeft);
vcpcLeft.resize(vcpcRight.size() + vcpcDummy.size());
std::merge(vcpcRight.begin(), vcpcRight.end(), vcpcDummy.begin(), vcpcDummy.end(), vcpcLeft.begin(), CompareFirst());
ValueCountPairContainer::iterator
currPair = vcpcLeft.begin(),
lastPair = vcpcLeft.end();
ValueCountPairContainer::iterator index = currPair+1;
while (index != lastPair && currPair->first < index->first)
{
currPair = index;
++index;
}
double currValue = currPair->first;
SizeT currCount = currPair->second;
for (; index != lastPair;++index)
{
if (currValue < index->first)
{
*currPair++ = ValueCountPair(currValue, currCount);
currValue = index->first;
currCount = index->second;
}
else
currCount += index->second;
}
*currPair++ = ValueCountPair(currValue, currCount);
vcpcLeft.erase(currPair, lastPair);
vcpcDummy.clear();
}
struct ValueCountPairContainerArray : std::vector<ValueCountPairContainer>
{
void resize(SizeT k)
{
assert(capacity() >= k);
while (size() < k)
{
push_back(ValueCountPairContainer());
back().reserve(BUFFER_SIZE);
}
}
void GetValueCountPairs(ValueCountPairContainer& vcpc, const double* values, SizeT size, unsigned int nrUsedContainers)
{
assert(vcpc.empty());
if (size <= BUFFER_SIZE)
GetCountsDirect(vcpc, values, size);
else
{
resize(nrUsedContainers+2);
unsigned int m = size/2;
GetValueCountPairs(vcpc, values, m, nrUsedContainers);
GetValueCountPairs(begin()[nrUsedContainers], values + m, size - m, nrUsedContainers+1);
MergeToLeft(vcpc, begin()[nrUsedContainers], begin()[nrUsedContainers+1]);
begin()[nrUsedContainers].clear();
}
assert(GetTotalCount(vcpc) == size);
}
};
void GetValueCountPairs(ValueCountPairContainer& vcpc, const double* values, SizeT n)
{
vcpc.clear();
if(n)
{
ValueCountPairContainerArray vcpca;
// max nr halving is log2(max cardinality / BUFFER_SIZE); max cardinality is SizeT(-1)
vcpca.reserve(3+8*sizeof(SizeT)-10);
vcpca.GetValueCountPairs(vcpc, values, n, 0);
assert(vcpc.size());
}
}
void ClassifyJenksFisherFromValueCountPairs(LimitsContainer& breaksArray, SizeT k, const ValueCountPairContainer& vcpc)
{
breaksArray.resize(k);
SizeT m = vcpc.size();
assert(k <= m); // PRECONDITION
if (!k)
return;
JenksFisher jf(vcpc, k);
if (k > 1)
{
jf.CalcAll();
SizeT lastClassBreakIndex = jf.FindMaxBreakIndex(jf.m_BufSize-1, 0, jf.m_BufSize);
while (--k)
{
breaksArray[k] = vcpc[lastClassBreakIndex+k].first;
assert(lastClassBreakIndex < jf.m_BufSize);
if (k > 1)
{
jf.m_CBPtr -= jf.m_BufSize;
lastClassBreakIndex = jf.m_CBPtr[lastClassBreakIndex];
}
}
assert(jf.m_CBPtr == jf.m_CB.begin());
}
assert( k == 0 );
breaksArray[0] = vcpc[0].first; // break for the first class is the minimum of the dataset.
}
// test code
#include <boost/random.hpp>
int main(int c, char** argv)
{
const double rangeMin = 0.0;
const double rangeMax = 10.0;
typedef boost::uniform_real<double> NumberDistribution;
typedef boost::mt19937 RandomNumberGenerator;
typedef boost::variate_generator<RandomNumberGenerator&, NumberDistribution> Generator;
NumberDistribution distribution(rangeMin, rangeMax);
RandomNumberGenerator generator;
generator.seed(0); // seed with the current time
Generator numberGenerator(generator, distribution);
const int n = 1000000;
const int k = 10;
std::cout << "Generating random numbers..." << std::endl;
std::vector<double> values;
values.reserve(n);
for (int i=0; i!=n; ++i)
{
double v = numberGenerator();
values.push_back(v*v); //populate a distribuiton slightly more interesting than uniform, with a lower density at higher values.
}
assert(values.size() == n);
std::cout << "Generating sortedUniqueValueCounts ..." << std::endl;
ValueCountPairContainer sortedUniqueValueCounts;
GetValueCountPairs(sortedUniqueValueCounts, &values[0], n);
std::cout << "Finding Jenks ClassBreaks..." << std::endl;
LimitsContainer resultingbreaksArray;
ClassifyJenksFisherFromValueCountPairs(resultingbreaksArray, k, sortedUniqueValueCounts);
std::cout << "Reporting results..." << std::endl;
for (double breakValue: resultingbreaksArray)
std::cout << breakValue << std::endl << std::endl;
std::cout << "Press a char and <enter> to terminate" << std::endl;
char ch;
std::cin >> ch; // wait for user to enter a key
} // main