-
Notifications
You must be signed in to change notification settings - Fork 6
/
SOM.java
351 lines (318 loc) · 9.21 KB
/
SOM.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
//SOM.java
/**
* Fit a self-organizing map to a dataset.
*
* Copyright (C) 2016 David Shaub
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @author David Shaub
* @version 1.1.0
*
* */
import java.util.*;
public class SOM extends Grid
{
// X dimension of the map
private int xDim;
// Y dimension of the map
private int yDim;
// Number of trianing rounds
private int epochs;
// Ordered pairs for all the points on the map
private int [][] pairArray;
// Weights fitted during training
private double [][] weights;
// Final node assigned to each observation in training
private int finalNodes[];
// The disance from each data point to the final node
private double finalDistances[];
public SOM(double[][] matrix, int xDim, int yDim, int epochs)
{
super(matrix);
// Only allow positive xDim and yDim
if(xDim <= 0 || yDim <= 0)
{
throw new IllegalArgumentException();
}
this.xDim = xDim;
this.yDim = yDim;
this.epochs = epochs;
}
/**
* Getter method for xDim.
*
* @return the X dimension of the map.
*
* */
public int getXDim()
{
return this.xDim;
}
/**
* Getter method for yDim.
*
* @return the Y dimension of the map.
*
* */
public int getYDim()
{
return this.yDim;
}
/**
* Getter method for finalDistances.
*
* This method returns the final distances
* to the assigned node for each observation
* after training.
*
* @return An array with the distance
* to the assigned node for each observation
*
* */
public double [] getDistances()
{
return this.finalDistances;
}
/**
* Getter method for finalNodes.
*
* This method returns the final node
* labels assigned to each observation
* after training
*
* @return An array with the node labels
* for each observation
*
* */
public int [] getNodes()
{
return this.finalNodes;
}
/**
* Train the SOM to the data.
* This method runs the training
* algorithm to fit the self-organizing
* maps to the training data
*
* */
public void train()
{
this.init();
// Number of rows in the training data
int dataRows = this.gridData.length;
// Number of columns in the training data
int dataColumns = this.gridData[0].length;
// Number of rows (same number of columns) in the weights map
int weightsRows = this.weights.length;
// Number of columns in the weights map
int weightsColumns = this.weights[0].length;
// Number of rounds of training
int iterations = this.epochs * dataRows;
// Initial learning rate
double learningRate = 0.5;
// Initial neighborhood size
double neighborhood;
// Current row being processed
int currentObs;
// Nearest node to the current point
int nearest = 0;
// Smallest identified distance to a node
double nearestDistance;
// Temporary variables for the current distance to a node
double dist;
double tmp;
// "Unpack" the pair distances into a 1D array
double [] distPairs = new double[this.pairArray.length * this.pairArray.length];
int count = 0;
int currentX;
int currentY;
double xDist;
double yDist;
for(int i = 0; i < this.pairArray.length; i++)
{
// Set the reference point to the current row
currentX = this.pairArray[i][0];
currentY = this.pairArray[i][1];
for(int j = 0; j < this.pairArray.length; j++)
{
// Calculate the rectilinear distances from this point
// to the reference point
xDist = Math.abs(this.pairArray[j][0] - currentX);
yDist = Math.abs(this.pairArray[j][1] - currentY);
distPairs[count] = xDist + yDist;
count++;
}
}
// Set the neighborhood to capture approximately 2/3 of the nodes.
// This is approximately 1.75 * variance (See Chebychev's inequality)
// https://en.wikipedia.org/wiki/Chebyshev's_inequality
neighborhood = 1.75 * variance(distPairs);
double stepLR = learningRate/iterations;
//double stepNH = neighborhood/iterations;
//double initLR = learningRate;
double initNH = neighborhood;
// Adapted from the C code for VR_onlineSOM in the R "class" package
for(int i = 0; i < iterations; i++)
{
// Choose a random observation for fitting
currentObs = (int)(Math.random() * dataRows);
// Find its nearest node
// Start with the maximum distance possible
nearestDistance = Double.MAX_VALUE;
nearest = 0;
for(int j = 0; j < weightsRows; j++)
{
// Reset the distance to zero for the next training point
dist = 0;
for(int k = 0; k < dataColumns; k++)
{
// For the current random observation and the current column,
// find the difference
tmp = gridData[currentObs][k] - weights[j][k];
// dist^2 is the square of the sums of
// all the individual components, and
// minimizing distance^2 leads to the same
// node as minimizing distance.
dist += (tmp * tmp);
}
// New closest node found
if(dist < nearestDistance)
{
// Update the nearest node and distance
nearest = j;
nearestDistance = dist;
}
}
// Update learning rate and neighborhood distances
// Initially "pull" the map by large amounts and
// pull nodes that are farther away (but within the
// neighborhood as well); as training
// continues create smaller distortions and apply
// them within a smaller neighborhood.
// default decrease strategies: linear for learningRate, exponential for neighborhood
learningRate -= stepLR;
//neighborhood -= stepNH;
double exp = Math.exp(-3d*i/iterations);
//learningRate = initLR*exp;
neighborhood = initNH*exp;
// Rounding errors can lead to negative numbers towards the end.
// Stop then, else the algorithm acts wrongly, e.g., the BMU's weight will be moved away from the input vector.
if (learningRate <= 0 || neighborhood <= 0)
{
break;
}
// Apply the distortion to the map for nodes within
// the neighborhood
for(int l = 0; l < weightsRows; l++)
{
// Apply if the distance to the other node is within the neighborhood
if(distPairs[l + weightsRows * nearest] <= neighborhood)
{
// Apply to all columns in this row
for(int m = 0; m < dataColumns; m++)
{
tmp = gridData[currentObs][m] - weights[l][m];
weights[l][m] += (tmp * learningRate);
}
}
}
}
// Finally label the observations with the nearest node
// to complete the map training
finalNodes = new int[dataRows];
finalDistances = new double[dataRows];
double minDistance;
// Adapted from the C code for mapKohonen in the R "kohonen" package
// Now calculate the weights/data to map distance
double [][] distanceMatrix = new double [dataRows][weightsRows];
// Loop over all data points
for(int i = 0; i < dataRows; i++)
{
minDistance = Double.MAX_VALUE;
// Loop over all the map nodes
for(int j = 0; j < weightsRows; j++)
{
distanceMatrix[i][j] = 0;
// Loop over all the variable
for(int k = 0; k < weightsColumns && distanceMatrix[i][j] < minDistance; k++)
{
tmp = gridData[i][k] - weights[j][k];
distanceMatrix[i][j] += (tmp * tmp);
}
if(distanceMatrix[i][j] < minDistance)
{
minDistance = distanceMatrix[i][j];
finalNodes[i] = j;
}
}
finalDistances[i] = minDistance;
}
//for(int i = 0; i < finalNodes.length; i++)
//{
//System.out.println("node: " + finalNodes[i] + "\t distance: " + finalDistances[i]);
//}
}
/**
* Initialize the SOM object for training
* This method prepares the SOM for
* training by initializing the random
* weights with a bootstrap sample from
* the training data.
*
* */
private void init()
{
// Scale the Grid
this.scaleGrid();
// Prepare the array for
// calculating pair distances
pairArray = new int[this.xDim * this.yDim][2];
int count = 0;
for(int i = 0; i < this.xDim; i++)
{
for(int j = 0; j < this.yDim; j++)
{
this.pairArray[count][0] = i;
this.pairArray[count][1] = j;
count++;
}
}
// Useful variables
int pairRows = this.pairArray.length;
int dataRows = this.gridData.length;
// Sample from the data to determine
// which observations to use
// for initial weights
Set <Integer> samplePoints = new HashSet <>();
while(samplePoints.size() < pairRows)
{
samplePoints.add((int)(Math.random() * dataRows));
}
Integer [] sampleIndex = samplePoints.toArray(new Integer[0]);
// Use the selected rows to build the starting weights
weights = new double[pairRows][gridData[0].length];
int weightCount = 0;
// Select the rows from sampleIndex
for(Integer i : sampleIndex)
{
// Select all the columns in the row
for(int j = 0; j < gridData[0].length; j++)
{
weights[weightCount][j] = gridData[i][j];
}
weightCount++;
}
}
}