dima_version.py
# Code by: Dima Pasechnik (@dimpase)
# Modified by: Olivia Guest
# Original code here:
# https://github.com/oliviaguest/pairwise_distance/issues/2#issuecomment-294340333
from __future__ import division, print_function
def pdistsum(X, w, dist, k=1):
    """
    Compute the sum and the weighted mean of the pairwise distances
    dist(X[i], X[j]) between elements of X, each weighted by w[i] * w[j],
    using k threads.
    """
    import threading
    from math import sqrt, ceil
    N = len(X)
    results = [None] * k
    def rsum(r0, r1, num):
        # Worker num: accumulate the weighted distances for rows r0..r1-1.
        s = 0.0  # partial sum
        c = 0  # partial count of pairs
        print("rsum instance:", str(num) +
              " from " + str(r0) + " to " + str(r1))
        for i in range(r0, r1):
            for j in range(i + 1, N):
                c += 1
                s += dist(X[i], X[j]) * w[i] * w[j]
        results[num] = (s, c)
        return
    # k+1 range boundaries: 0, N(1-sqrt((k-1)/k)), N(1-sqrt((k-2)/k)), ..., N
    r = [0] + [int(ceil(N * (1 - sqrt((k - i) / k)))) for i in range(1, k)] + [N]
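    # e.g. with N = 100 and k = 4 this gives r = [0, 14, 30, 50, 100]: earlier
    # rows pair with more columns, so earlier chunks are smaller to balance
    # the work across threads.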
    threads = []
    for i in range(k):
        t = threading.Thread(target=rsum, args=(r[i], r[i + 1], i,))
        threads.append(t)
        t.start()
    for i in range(k):
        threads[i].join()
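    # Note: under CPython's GIL a pure-Python dist() will not run truly in
    # parallel across threads, so the speedup here is likely to be modest;
    # a multiprocessing-based variant may scale better.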
    total_sum = 0
    for result in results:
        total_sum += result[0]
    N = w.sum()  # from here on N is the total weight, not the number of points
    total_mean = total_sum / (N * (N - 1.0) / 2.0)
    return total_sum, total_mean
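# Example usage (a minimal sketch; `points` and `weights` are hypothetical
# arrays supplied by the caller, e.g. an (n, 2) array of coordinates and a
# length-n array of per-point weights; dfun() is defined below):
#
#     total, mean = pdistsum(points, weights, dfun, k=4)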
### testing ####
def dfun(a, b):  # example dist(): great-circle distance between two points
    # from math import sqrt
    # return sqrt(abs(a - b))
    from geopy.distance import great_circle
    return great_circle(a, b).km
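# geopy's great_circle() takes (latitude, longitude) pairs in degrees and
# returns a Distance object, so .km gives kilometres; the 2-D blobs generated
# below are treated as such coordinate pairs purely for testing.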
if __name__ == "__main__":
    import sklearn.datasets
    import scipy.spatial.distance
    import numpy as np
    from time import time
    from scipy.spatial.distance import pdist
    # Generate some data:
    N = 783
    centers = [[0, 0], [1, 0], [0.5, np.sqrt(0.75)]]
    cluster_std = [0.3, 0.3, 0.3]
    n_clusters = len(centers)
    n_samples = int(0.75 * N)
    data, labels_true = sklearn.datasets.make_blobs(n_samples=n_samples,
                                                    centers=centers,
                                                    cluster_std=cluster_std)
    centers = [[0.5, np.sqrt(0.75)]]
    cluster_std = [0.3]
    n_clusters = len(centers)
    extra, labels_true = sklearn.datasets.make_blobs(n_samples=int(0.25 * N),
                                                     centers=centers,
                                                     cluster_std=cluster_std)
    X = np.concatenate((data, extra), axis=0)
    N = X.shape[0]
    # Pick some random floats for the counts/weights:
    counts = np.random.random_sample((N,)) * 10
    # counts = np.ones((N,))
    ##########################################################################
    # Parallel:
    t = time()
    parallel_sum, parallel_mean = pdistsum(
        X, counts, dfun, k=10)  # example with 10 threads
    print('parallel:\t{} s'.format(time() - t))
    ##########################################################################
    ##########################################################################
    # Serial:
    # Comment this out if you use a high N, as it will eat RAM!
    t = time()
    Y = pdist(X, metric=dfun)
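    # pdist() returns the condensed distance vector ordered (0,1), (0,2), ...,
    # (0,N-1), (1,2), ..., which matches the order of the weights built below.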
    weights = [counts[i] * counts[j]
               for i in range(N - 1) for j in range(i + 1, N)]
    serial_sum = np.sum(weights * Y)
    N = counts.sum()
    # Use the same denominator as pdistsum() so the two means are comparable:
    serial_mean = serial_sum / (N * (N - 1.0) / 2.0)
    print('serial:\t\t{} s'.format(time() - t))
    ##########################################################################
    # The two methods should agree up to floating-point rounding error:
    assert np.isclose(serial_sum, parallel_sum)
    print('sum = {}'.format(parallel_sum))
    assert np.isclose(serial_mean, parallel_mean)
    print('mean = {}'.format(parallel_mean))