-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummarystatistics.jl
191 lines (163 loc) · 7.31 KB
/
summarystatistics.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#TODO samplemean, samplequantile (this will have to be marginal quantiles), measures of multivariate skewness and kurtosis (https://www.jstor.org/stable/2334770). See what Gerber did.
"""
samplesize(Z::AbstractArray)
Computes the sample size of a set of independent realisations `Z`.
Note that this function is a wrapper around [`numberreplicates`](@ref), but this
function returns the number of replicates as the eltype of `Z`, rather than as an integer.
"""
samplesize(Z) = eltype(Z)(numberreplicates(Z))
"""
samplecovariance(Z::AbstractArray)
Computes the [sample covariance matrix](https://en.wikipedia.org/wiki/Sample_mean_and_covariance#Definition_of_sample_covariance),
Σ̂, and returns the vectorised lower triangle of Σ̂.
# Examples
```
# 5 independent replicates of a 3-dimensional vector
z = rand(3, 5)
samplecovariance(z)
```
"""
function samplecovariance(z::A) where {A <: AbstractArray{T, N}} where {T, N}
@assert size(z, N) > 1 "The number of replicates, which are stored in the final dimension of the input array, should be greater than 1"
z = Flux.flatten(z) # convert to matrix (allows for arbitrary sized data inputs)
d = size(z, 1)
Σ̂ = cov(z, dims = 2, corrected = false)
tril_idx = tril(trues(d, d))
return Σ̂[tril_idx]
end
samplecovariance(z::AbstractVector) = samplecovariance(reshape(z, :, 1))
"""
samplecorrelation(Z::AbstractArray)
Computes the sample correlation matrix,
R̂, and returns the vectorised strict lower triangle of R̂.
# Examples
```
# 5 independent replicates of a 3-dimensional vector
z = rand(3, 5)
samplecorrelation(z)
```
"""
function samplecorrelation(z::A) where {A <: AbstractArray{T, N}} where {T, N}
@assert size(z, N) > 1 "The number of replicates, which are stored in the final dimension of the input array, should be greater than 1"
z = Flux.flatten(z) # convert to matrix (allows for arbitrary sized data inputs)
d = size(z, 1)
Σ̂ = cor(z, dims = 2)
tril_idx = tril(trues(d, d), -1)
return Σ̂[tril_idx]
end
samplecorrelation(z::AbstractVector) = samplecorrelation(reshape(z, :, 1))
# NB I thought the following functions might be better on the GPU, but after
# some benchmarking it turns out the base implementation is better (at least
# when considering only a single data set at a time). Still, I will leave these
# functions here in case I want to implement something similar later.
# function samplecov(z::A) where {A <: AbstractArray{T, N}} where {T, N}
# @assert size(z, N) > 1 "The number of replicates, which are stored in the final dimension of the input array, should be greater than 1"
# z = Flux.flatten(z) # convert to matrix (allows for arbitrary sized data inputs)
# d, n = size(z)
# z̄ = mean(z, dims = 2)
# e = z .- z̄
# e = reshape(e, (size(e, 1), 1, n)) # 3D array for batched mul and transpose
# Σ̂ = sum(e ⊠ batched_transpose(e), dims = 3) / T(n)
# Σ̂ = reshape(Σ̂, d, d) # convert matrix (drop final singelton)
# tril_idx = tril(trues(d, d))
# return Σ̂[tril_idx]
# end
#
# function samplecor(z::A) where {A <: AbstractArray{T, N}} where {T, N}
# @assert size(z, N) > 1 "The number of replicates, which are stored in the final dimension of the input array, should be greater than 1"
# z = Flux.flatten(z) # convert to matrix (allows for arbitrary sized data inputs)
# d, n = size(z)
# z̄ = mean(z, dims = 2)
# e = z .- z̄
# e = reshape(e, (size(e, 1), 1, n)) # 3D array for batched mul and transpose
# Σ̂ = sum(e ⊠ batched_transpose(e), dims = 3) / T(n)
# Σ̂ = reshape(Σ̂, d, d) # convert matrix (drop final singelton)
# σ̂ = Σ̂[diagind(Σ̂)]
# D = Diagonal(1 ./ sqrt.(σ̂))
# Σ̂ = D * Σ̂ * D
# tril_idx = tril(trues(d, d), -1)
# return Σ̂[tril_idx]
# end
#
# using NeuralEstimators
# using Flux
# using BenchmarkTools
# using Statistics
# using LinearAlgebra
# z = rand(3, 4000) |> gpu
# @time samplecovariance(z)
# @time samplecov(z)
# @time samplecorrelation(z)
# @time samplecor(z)
#
# @btime samplecovariance(z);
# @btime samplecov(z);
# @btime samplecorrelation(z);
# @btime samplecor(z);
#TODO clean up this documentation (e.g., don't bother with the bin notation)
#TODO there is a more general structure that we could define, that has message(xi, xj, e) as a slot
@doc raw"""
NeighbourhoodVariogram(h_max, n_bins)
(l::NeighbourhoodVariogram)(g::GNNGraph)
Computes the empirical variogram,
```math
\hat{\gamma}(h \pm \delta) = \frac{1}{2|N(h \pm \delta)|} \sum_{(i,j) \in N(h \pm \delta)} (Z_i - Z_j)^2
```
where $N(h \pm \delta) \equiv \left\{(i,j) : \|\boldsymbol{s}_i - \boldsymbol{s}_j\| \in (h-\delta, h+\delta)\right\}$
is the set of pairs of locations separated by a distance within $(h-\delta, h+\delta)$, and $|\cdot|$ denotes set cardinality.
The distance bins are constructed to have constant width $2\delta$, chosen based on the maximum distance
`h_max` to be considered, and the specified number of bins `n_bins`.
The input type is a `GNNGraph`, and the empirical variogram is computed based on the corresponding graph structure.
Specifically, only locations that are considered neighbours will be used when computing the empirical variogram.
# Examples
```
using NeuralEstimators, Distances, LinearAlgebra
# Simulate Gaussian spatial data with exponential covariance function
θ = 0.1 # true range parameter
n = 250 # number of spatial locations
S = rand(n, 2) # spatial locations
D = pairwise(Euclidean(), S, dims = 1) # distance matrix
Σ = exp.(-D ./ θ) # covariance matrix
L = cholesky(Symmetric(Σ)).L # Cholesky factor
m = 5 # number of independent replicates
Z = L * randn(n, m) # simulated data
# Construct the spatial graph
r = 0.15 # radius of neighbourhood set
g = spatialgraph(S, Z, r = r)
# Construct the variogram object wth 10 bins
nv = NeighbourhoodVariogram(r, 10)
# Compute the empirical variogram
nv(g)
```
"""
struct NeighbourhoodVariogram{T} <: GNNLayer
h_cutoffs::T
# TODO inner constructor, add 0 into h_cutoffs if it is not already in there
end
function NeighbourhoodVariogram(h_max, n_bins::Integer)
h_cutoffs = range(0, stop= h_max, length = n_bins+1)
h_cutoffs = collect(h_cutoffs)
NeighbourhoodVariogram(h_cutoffs)
end
function (l::NeighbourhoodVariogram)(g::GNNGraph)
# NB in the case of a batched graph, see the comments in the method summarystatistics(d::DeepSet, Z::V) where {V <: AbstractVector{G}} where {G <: GNNGraph}
Z = g.ndata.Z
h = g.graph[3]
message(xi, xj, e) = (xi - xj).^2
z = apply_edges(message, g, Z, Z, h) # (Zⱼ - Zᵢ)², possibly replicated
z = mean(z, dims = 2) # average over the replicates
z = vec(z)
# Bin the distances
h_cutoffs = l.h_cutoffs
bins_upper = h_cutoffs[2:end] # upper bounds of the distance bins
bins_lower = h_cutoffs[1:end-1] # lower bounds of the distance bins
N = [bins_lower[i:i] .< h .<= bins_upper[i:i] for i in eachindex(bins_upper)] # NB avoid scalar indexing by i:i
N = reduce(hcat, N)
# Compute the average over each bin
N_card = sum(N, dims = 1) # number of occurences in each distance bin
N_card = N_card + (N_card .== 0) # prevent division by zero
Σ = sum(z .* N, dims = 1) # ∑(Zⱼ - Zᵢ)² in each bin
vec(Σ ./ 2N_card)
end
@layer NeighbourhoodVariogram
Flux.trainable(l::NeighbourhoodVariogram) = ()