-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathBuildTreeMetric_HighDim_V2.m
347 lines (267 loc) · 12.6 KB
/
BuildTreeMetric_HighDim_V2.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
function [TM, XX_VertexID] = BuildTreeMetric_HighDim_V2(XX, L, KC)
%%%%%%%%
% GOAL:
% Building tree metric for high-dimensional space
% Partition space by the farthest clustering of Gonzalez.
% Apply for discrete empirical measure \mu = \sum_i a_i \delta_{x_i}
% where weight a_i >=0, sum_i a_i = 1, and \delta_{\cdot} is a Dirac
% function.
% NOTE: heigh-level: from 0 --> L
%%%%%%%%
% INPUT:
% XX: cell of supports for N empirical measures
% Each element XX{ii} is a matrix of supports (N_i x dim) where N_i is the
% number of supports with dim dimensions.
% L: the predefined highest level of tree
% (as a stopping condition)
% KC: number of clusters (for the farthest clustering of Gonzalez)
% (when one partitions space by clustering)
%%%%%%%%
% OUTPUT:
% TM: Tree structure T
% Structure of TM
% ----TM.Vertex: set of vertices (4 components: parent, child, pos(ition), path)
% TM.nVertices : number of vertices
% TM.Vertex_ParentId
% TM.Vertex_ChildId
% TM.Vertex_Pos
% TM.Vertex_EdgeIdPath
%
% ----TM.Edge: set of Edges (3 components: lowNode, highNode, weight)
% TM.Edge_LowNode
% TM.Edge_HighNode
% TM.Edge_Weight
%
% TM.Level_sID: start ID (nodes) at each height level
% TM.Level_eID: ending ID (nodes) at each height level
%
% TM.LeavesIdArray: array of leaf IDs
% XX_VertexID: corresponding for XX in tree.
%%%%%%%%
% SUMMARY: Empirical measures
% XX: 1xm cell --> each cell: n_i x dim
% WW: 1xn cell --> each cell: n_i x 1 double
MAXNUM_TREE = KC^(L+1);
% maximum number of clusters for each height level
% At level \ell --> maximum number of clusters: KC^(\ell)
KCArray = KC.^(1:L)'; % L x 1
% number of empirical measures
N = length(XX);
% dimension of supports
dim = size(XX{1}, 2);
% index for each empirical measures (when gathering all supports together
% for building the tree)
nSupports = 0; %count the total number of supports in XX
sIDArray = zeros(N, 1); % starting index
eIDArray = zeros(N, 1); % ending index
% get start-end ID of each empirical measure
for ii = 1:N
sIDArray(ii) = nSupports + 1; %starting index
nSupports = nSupports + size(XX{ii}, 1);
eIDArray(ii) = nSupports; %ending index
end
% gathering ALL SUPPORTS
allXX = zeros(nSupports, dim);
for ii = 1:N
allXX(sIDArray(ii):eIDArray(ii), :) = XX{ii};
end
% allXX (all supports) for clustering
nXX = nSupports;
% Node at parent level (centroids of clusters) --> for finding children nodes
% kcCenterPP: set of current parent nodes
% ===<PP>: denotes for the Parent nodes (current leaf nodes)
% only the center for the first level (root node)
kcCenterPP = mean(allXX)'; % column vector (dim x 1)
numPP = 1; % number of Parent nodes at height level 0
% id of cluster for each empirical measure
% cluster ID starts from 0
% ===<ZZ: cluster ID for "supports" at parent level>
idZZPP = zeros(1, N); % all zeros (all supports in the same cluster id=0)
TM.nVertices = 0;
% Initialization (memory allocation)
TM.Vertex_ParentId = zeros(MAXNUM_TREE, 1);
TM.Vertex_ChildId = cell(MAXNUM_TREE, 1);
TM.Vertex_Pos = zeros(MAXNUM_TREE, dim);
TM.Vertex_EdgeIdPath = cell(MAXNUM_TREE, 1);
%--
TM.Edge_LowNode = zeros(MAXNUM_TREE, 1);
TM.Edge_HighNode = zeros(MAXNUM_TREE, 1);
TM.Edge_Weight = zeros(MAXNUM_TREE, 1);
%--
TM.Level_sID = zeros(L + 1, 1); % from level 0 --> level L
TM.Level_eID = zeros(L + 1, 1); % from level 0 --> level L
% add root into tree
TM.nVertices = 1;
% no parent --> id_parent = 0
TM.Vertex_Pos(1, :) = kcCenterPP; %column vector
TM.Vertex_EdgeIdPath{1} = []; %path is empty
% child nodes: ---> is processing inside the loop
TM.Level_sID(1) = 1;
TM.Level_eID(1) = 1;
% for each height level on the tree T
% (idLL ==> height level idLL + 1
for idLL = 1:L
% HIERARCHICAL-STRUCTURE
% initialize cluster_Id for child nodes
% % ===<LL>: denotes for the children nodes (will be obtained by clustering)
idZZLL = zeros(1, nXX); % cluster id for each support
% all centroids at height level idLL in kcCenterLL (children nodes)
% KCArray(idLL): number of clusters at level idLL
kcCenterLL = zeros(dim, KCArray(idLL)); % having at most KCArray(idLL)
nkcCenterLL = 0; %count the current real number of centroids at height level (idLL+1)
% weight for each edge --- child node (kcCenterLL) & its corresponding
% parent node (kcCenterPP)
% wLL = zeros(1, KCArray(idLL)); %row vector
% save/init info to TM
TM.Level_sID(idLL+1) = TM.Level_eID(idLL) + 1;%assume >= 1
TM.Level_eID(idLL+1) = TM.Level_eID(idLL); %will need to update for each idCCPP
% consider for each cluster at parent level (ID cluster: 0 --> numPP-1)
% then perform clustering to find children nodes for each parent node
% idCCPP: ID cluster at parent nodes
% ==<CC: cluster ID at children nodes>
for idCCPP = 0:(numPP-1)
% index of parent vertex
idVertexPP = TM.Level_sID(idLL) + idCCPP;
% First time for clustering or not?
% idZZ_idCCPP: cluster ID --idCCPP-- for "supports" (idZZ) at parent level.
if idLL == 1
% Don't need to find ID for this cluster
% only 1 cluster at height level 0
idZZ_idCCPP = 1:(nXX);
else
%extract 1 cluster from parent's clusters (Cluster ID: idCCPP)
idZZ_idCCPP = find(idZZPP == idCCPP);
end
% we have collected support data points on cluster ID --idCCPP--
% --> perform clustering to obtain children nodes
% If there is no support data points --> skip
% % if ~isempty(idZZ_idCCPP)
% ALSO SKIP if there is only 1 data point (using for small finite
% number of supports)
if length(idZZ_idCCPP) > 1
% extract supports for the considered cluster ID from parent's clusters
allZZ_idCCPP = allXX(idZZ_idCCPP, :); % supports_clusterID_idCCPP x dim
% % function [K, rx, clusterIndex, clusterCenter, numPoints, clusterRadii] =
% % ...figtreeKCenterClustering(d, N, x, K)
% % ===(THIRD-PARTY TOOLBOX: Gonzalez's farthest-point clustering algorithm)
% % Input
% % * d --> data dimensionality.
% % * N --> number of source points.
% % * x --> d x N matrix of N source points in d dimensions
% % * kMax --> maximum number of clusters.
% % Ouput
% % * K --> actual number of clusters (less than kMax if duplicate pts exist)
% % * rx --> maximum radius of the clusters.
% % * clusterIndex --> vector of length N where the i th element is the
% % cluster number to which the i th point belongs.
% % ClusterIndex[i] varies between 0 to K-1.
% % * clusterCenters --> d x K matrix of K cluster centers
% % * numPoints --> number of points in each cluster.
% % * clusterRadii --> radius of each cluster.
% perform clustering to find children nodes (children's clusters)
% rKCLL_idCCPP: ==<rKC(real #clusters)>
% idZZLL_idCCPP: idZZ("supports")LL("child level") from "idCCPP" clusterID at parent level
% kcCenterLL_idCCPP: kcCenter("centroids") ...
[rKCLL_idCCPP, ~, idZZLL_idCCPP, kcCenterLL_idCCPP, ~, ~] = ...
figtreeKCenterClustering(dim, length(idZZ_idCCPP), allZZ_idCCPP', KC);
% Compute weight --> update to the global weight (wLL)!
% at parent level, kcCenterPP(:, ii+1) is centroid (node) for cluster ID: ii
% rKCLL_idCCPP: number of children nodes
ppMM = repmat(kcCenterPP(:, idCCPP + 1), 1, rKCLL_idCCPP);
% Using Euclidean distance for edges
wLL_idCCPP = sqrt(sum((kcCenterLL_idCCPP - ppMM).^2, 1));
% wLL(nkcCenterLL + (1:rKCLL_idCCPP)) = wLL_idCCPP;
setID_0 = find(wLL_idCCPP == 0);
if length(setID_0) > 0 % exist 0-length
kcCenterLL_idCCPP(:, setID_0) = []; % delete 0-length-edge clusters
wLL_idCCPP(setID_0) = [];
% RELABEL CLUSTER-ID
% -1: 0-length-edge cluster
% 0 --> rKCLL_idCCPP - 1
% 0 --> rKCLL_idCCPP - length(setID_0) - 1
% set -1
clusterID_ZeroLength = setID_0 - 1;
allID_Zero = [];
for iiCC_Zero = 1:length(clusterID_ZeroLength)
tmp = find(idZZLL_idCCPP == clusterID_ZeroLength(iiCC_Zero));
allID_Zero = [allID_Zero; tmp'];
end
idZZLL_idCCPP(allID_Zero) = -1;
% RELABEL
clusterID_NonZero = 0:(rKCLL_idCCPP-1);
clusterID_NonZero(setID_0) = [];
for iiCC_NonZero = 1:length(clusterID_NonZero)
if clusterID_NonZero(iiCC_NonZero) ~= (iiCC_NonZero - 1)
idZZLL_idCCPP(idZZLL_idCCPP == clusterID_NonZero(iiCC_NonZero)) = iiCC_NonZero - 1;
end
end
rKCLL_idCCPP = rKCLL_idCCPP - length(setID_0); % reduce #clusters
end
% update to global index (idZZLL): id of supports at child level
% each cluster ID: idCCPP ---> have at most KC clusters
idZZLL(idZZ_idCCPP) = nkcCenterLL + idZZLL_idCCPP;
if length(setID_0) > 0 % exist 0-length
idZZLL(idZZ_idCCPP(allID_Zero)) = -1;
end
if rKCLL_idCCPP > 0
kcCenterLL(:, nkcCenterLL+(1:rKCLL_idCCPP)) = kcCenterLL_idCCPP;
% update current real number of centroids (children nodes)
nkcCenterLL = nkcCenterLL + rKCLL_idCCPP;
% save to tree metric TM
% rKCLL_idCCPP (number of new children nodes)
% ID for new nodes (vertices): TM.Level_eID(idLL+1 +
% [1:rKCLL_idCCPP]
% kcCenterLL_idCCPP: centroids (node pos)
% == Vertices
TM.nVertices = TM.nVertices + rKCLL_idCCPP;
idNewVertices = TM.Level_eID(idLL+1) + (1:rKCLL_idCCPP);
TM.Vertex_ParentId(idNewVertices) = idVertexPP;
TM.Vertex_ChildId{idVertexPP} = idNewVertices; % OK for parent node, later for children nodes
TM.Vertex_Pos(idNewVertices, :) = kcCenterLL_idCCPP';
% == Edges
idNewEdges = idNewVertices - 1;
TM.Edge_LowNode(idNewEdges) = idVertexPP;
TM.Edge_HighNode(idNewEdges) = idNewVertices;
TM.Edge_Weight(idNewEdges) = wLL_idCCPP;
% path --vertex
for ii = 1:rKCLL_idCCPP
% Path (Edge ID) from root to each node
TM.Vertex_EdgeIdPath{idNewVertices(ii)} = [TM.Vertex_EdgeIdPath{idVertexPP}, idNewEdges(ii)];
end
% == Level
TM.Level_eID(idLL+1) = TM.Level_eID(idLL+1) + rKCLL_idCCPP;
end
end
end
% UPDATE clusterPP at parent level by information from children level
% --> using for the next height level
idZZPP = idZZLL; %cluster ID for each point! (values starting from 0)
%update kcCenterPP (centroids) -- real centroids
kcCenterPP = kcCenterLL(:, 1:nkcCenterLL);
numPP = nkcCenterLL;
end
TM.LeavesIDArray = (TM.Level_sID(L+1):TM.Level_eID(L+1));
% TM.nVertices
% Initialization (memory allocation)
TM.Vertex_ParentId = TM.Vertex_ParentId(1:TM.nVertices); %zeros(MAXNUM_TREE, 1);
TM.Vertex_ChildId = TM.Vertex_ChildId(1:TM.nVertices); %cell(MAXNUM_TREE, 1);
TM.Vertex_Pos = TM.Vertex_Pos(1:TM.nVertices, :); %zeros(MAXNUM_TREE, dim);
TM.Vertex_EdgeIdPath = TM.Vertex_EdgeIdPath(1:TM.nVertices); %cell(MAXNUM_TREE, 1);
%--
TM.Edge_LowNode = TM.Edge_LowNode(1:(TM.nVertices - 1)); %zeros(MAXNUM_TREE, 1);
TM.Edge_HighNode = TM.Edge_HighNode(1:(TM.nVertices - 1)); %zeros(MAXNUM_TREE, 1);
TM.Edge_Weight = TM.Edge_Weight(1:(TM.nVertices - 1)); %zeros(MAXNUM_TREE, 1);
if nargout > 1
XX_VertexID = cell(1, N);
for ii = 1:N
% for each empirical measure
% sIDArray(ii) % starting index
% eIDArray(ii) % ending index
% idZZPP (starting from 0 ---> nkcCenterLL-1)
% --> corresponding to TM.Level_sID(L+1) --> TM.Level_eID(L+1)
% XX{ii}
%corresponding set of idVertex
XX_VertexID{ii} = TM.Level_sID(L+1) + idZZPP(sIDArray(ii):eIDArray(ii));
end
end
end