import func.nn.backprop.BackPropagationNetwork;
import func.nn.backprop.BackPropagationNetworkFactory;
import func.nn.backprop.BatchBackPropagationTrainer;
import func.nn.backprop.RPROPUpdateRule;
import opt.OptimizationAlgorithm;
import opt.RandomizedHillClimbing;
import opt.SimulatedAnnealing;
import opt.example.NeuralNetworkOptimizationProblem;
import opt.ga.StandardGeneticAlgorithm;
import shared.*;
import util.linalg.Vector;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.text.DecimalFormat;
import java.util.*;
public class HwSportsArticleDataAnalysis {
private static final String FILENAME = "sportsArticleModified.csv";
// TODO: How many examples you have
private static int num_examples = 1000;
// TODO: How many attributes you have. This is (number_of_columns - 1)
private static int num_attributes = 59;
/* Randomization */
/**
* Randomizing your rows
*
* If you enable the randomization, your rows will be shuffled.
* The seed value can be any arbitrary value. Having a seed ensures that each run of the
* script has the same randomized shuffle ordering as any other run.
*/
private static final boolean shouldRandomize = true;
private static final long SEED = 0xABCDEF;
/* Cross Validation and Testing params */
/**
* Set K for number of folds.
* Set PERCENT_TRAIN for the percentage to be used for training
*/
private static final int K = 10;
private static final double PERCENT_TRAIN = 0.7;
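// With the defaults above (num_examples = 1000, PERCENT_TRAIN = 0.7), makeTestTrainSets()
// produces a 700-instance train set and a 300-instance test set.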
/* Neural Network Params*/
/**
* Tweak the following as needed. The maxLearningRate and minLearningRate are the default
* hardcoded values in RPROPUpdateRule.java
*
* The accuracy threshold to stop backprop. Set this EXTREMELY low to ensure training ends only
* when absolutely sure network output is correct
*/
private static double initialLearningRate = 0.1;
private static double maxLearningRate = 50;
private static double minLearningRate = 0.000001;
private static double backprop_threshold = 1e-10;
/* Backprop Params */
/**
* If shouldFindNNParams is true, the script searches for the best params for your neural
* network. Since this process is lengthy, you don't want to repeat it often. Do it once,
* record the value, and store it in numberHiddenLayerNodes.
*/
private static final boolean shouldFindNNParams = false;
private static int numberHiddenLayerNodes = 30;
/* Simulated Annealing */
/**
* Same as above, if you've already run it once, just record the values and store them in
* best_temp and best_cooling. Otherwise, the temps[] and cooling[] values will be cycled
* through until the best param configuration for simulated annealing is found.
*/
// TODO: Set this to false if you retained the best SA params from a previous run
private static final boolean shouldFindSAParams = true;
// TODO: Modify these to try different possible temp and cooling params
//private static double[] temps = {1e5, 1E8, 1E10, 1E12, 1E15};
private static double[] temps = {125e3};
//private static double[] coolingRates = {0.9, 0.95, 0.99, 0.999};
private static double[] coolingRates = {1.2};
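// NOTE: ABAGAIL's SimulatedAnnealing multiplies the temperature by the cooling rate on each
// iteration, so a rate >= 1 (like the 1.2 here) never actually cools; use a value in (0, 1),
// such as the commented-out rates above, if decay is intended.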
// TODO: Place values here from previous run if you set shouldFindSAParams to false.
private static double best_temp = 1E12;
private static double best_cooling = 0.95;
/* Genetic Algorithms */
/**
* Same as above, if you've already run it once, just record the values and store them in
* populationSize, toMate, and toMutate. Otherwise, the ratios will be cycled
* through until the best param configuration for genetic algorithms is found.
*
* NOTE: min(populationRatios) >= max(mateRatios) + max(mutateRatios)
* This condition must be upheld or Exceptions will be thrown later in the script
*/
private static final boolean shouldFindGAParams = true;
//private static double[] populationRatios = {0.10, 0.15, 0.20, 0.25};
private static double[] populationRatios = {0.30};
//private static double[] mateRatios = {0.02, 0.04};
private static double[] mateRatios = {0.06};
//private static double[] mutateRatios = {0.02, 0.04};
private static double[] mutateRatios = {0.02};
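// Sanity check with the arrays above: min(populationRatios) = 0.30 >= 0.06 + 0.02 =
// max(mateRatios) + max(mutateRatios), so the NOTE's condition holds.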
// TODO: Place values here from previous run if you set shouldFindGAParams to false
private static double populationRatio = 0.25;
private static double toMateRatio = 0.02;
private static double toMutateRatio = 0.02;
/* Other global vars - Don't mess with these unless you have a reason to */
// Number of input nodes is the same as the number of attributes
private static int inputLayer = num_attributes;
// This is determined dynamically
private static int outputLayer;
// Determines how many possible classifications we can have
private static Set<String> unique_labels;
private static Instance[] allInstances;
private static Map<String, double[]> bitvector_mappings;
private static BackPropagationNetworkFactory factory = new BackPropagationNetworkFactory();
private static ErrorMeasure measure = new SumOfSquaresError();
private static DecimalFormat df = new DecimalFormat("0.000");
// Train and Test sets
private static DataSet trainSet;
private static DataSet testSet;
// Cross validation folds
private static List<Instance[]> folds;
/* Begin actual code */
/**
* Comment out anything you don't want to immediately run.
* Advice: Write loops to do certain numbers of iterations and output results for rand opt
* algorithms
* @param args ignored
*/
public static void main(String[] args) {
initializeInstances();
// Make the train/test split, then create K folds for cross-validation
makeTestTrainSets();
folds = kfolds(trainSet);
int trainIterations;
// Determine best NN params, which happen to just be the number of hidden layer nodes.
if (shouldFindNNParams) {
// TODO: Tweak this value for better results
// This is roughly how Weka sizes a hidden layer: (# of attributes + # of classes) / 2
int weka_technique = (num_attributes + outputLayer) / 2;
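// e.g., with num_attributes = 59 and a binary label, (59 + 2) / 2 = 30, which matches the
// default numberHiddenLayerNodes above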
int[] hiddenLayerNodes = {weka_technique, 5, 10, 20, 50, 100, 200};
trainIterations = 200;
determineNNParams(trainIterations, hiddenLayerNodes);
}
/* Backprop */
trainIterations = 1;
// Now determine the NN performance. Results are simply printed out.
runBackprop(trainIterations);
/* RHC */
trainIterations = 1;
// RHC has no params. Just run it directly
runRHC(trainIterations);
/* SA */
if (shouldFindSAParams) {
trainIterations = 1;
determineSAParams(trainIterations);
}
// Run actual SA with best params here
trainIterations = 1;
runSA(trainIterations, best_temp, best_cooling);
/* GA */
if (shouldFindGAParams) {
// TODO: Keep this very small
trainIterations = 1;
determineGAParams(trainIterations);
}
// Run actual GA with best params here
trainIterations = 25;
runGA(trainIterations);
}
/**
* Uses k-folds cross validation to determine the best number of hidden layer nodes for a
* neural network, among the list provided
* @param trainIterations the number of iterations to run
* @param hiddenLayerNodes an int[] of hidden layer nodes to try
*/
public static void determineNNParams(int trainIterations, int[] hiddenLayerNodes) {
System.out.println("===========Cross Validation for NN Params=========");
double[] errors = new double[hiddenLayerNodes.length];
for (int m = 0; m < hiddenLayerNodes.length; m++) {
int numHiddenNodes = hiddenLayerNodes[m];
BackPropagationNetwork[] nets = new BackPropagationNetwork[K];
double[] validationErrors = new double[K];
// Create and train networks
for (int i = 0; i < nets.length; i++) {
Instance[] validation = getValidationFold(folds, i);
Instance[] trainFolds = getTrainFolds(folds, i);
DataSet trnfoldsSet = new DataSet(trainFolds);
// Creates a network with specified number of hidden layer nodes
nets[i] = factory.createClassificationNetwork(
new int[]{inputLayer, numHiddenNodes, outputLayer});
BackPropagationNetwork backpropNet = nets[i];
ConvergenceTrainer trainer = new ConvergenceTrainer(
new BatchBackPropagationTrainer(trnfoldsSet, backpropNet, new SumOfSquaresError(),
new RPROPUpdateRule(initialLearningRate, maxLearningRate, minLearningRate)),
backprop_threshold, trainIterations);
trainer.train();
validationErrors[i] = evaluateNetwork(backpropNet, validation);
}
// Find the average error for this network configuration
double error = 0;
for (int j = 0; j < validationErrors.length; j++) {
error += validationErrors[j];
}
error /= validationErrors.length;
errors[m] = error;
System.out.printf("Nodes: %d\tError: %s%%%n", numHiddenNodes, df.format(errors[m]));
}
// Find the index with the min avg validation error
int best_index = 0;
double minError = Double.MAX_VALUE;
for (int j = 0; j < errors.length; j++) {
if (errors[j] < minError) {
minError = errors[j];
best_index = j;
}
}
int bestNumNodes = hiddenLayerNodes[best_index];
System.out.printf("%nBest Num Hidden Nodes: %d\tError: %s%%%n", bestNumNodes, df.format
(minError));
numberHiddenLayerNodes = bestNumNodes;
}
/**
* Trains a network with RPROP backpropagation on the full training set (the cross validation
* for network params happens in determineNNParams), then records the training and test errors
* of the resulting model.
* @param trainIterations the number of epochs/iterations
*/
public static void runBackprop(int trainIterations) {
System.out.println("===========Backpropagation=========");
double starttime = System.nanoTime();
double endtime;
BackPropagationNetwork backpropNet = factory.createClassificationNetwork(
new int[]{inputLayer, numberHiddenLayerNodes, outputLayer});
ConvergenceTrainer trainer = new ConvergenceTrainer(
new BatchBackPropagationTrainer(trainSet, backpropNet, new SumOfSquaresError(),
new RPROPUpdateRule(initialLearningRate, maxLearningRate, minLearningRate)),
backprop_threshold, trainIterations);
trainer.train();
double trainError = evaluateNetwork(backpropNet, trainSet.getInstances());
double testError = evaluateNetwork(backpropNet, testSet.getInstances());
System.out.printf("Train Error: %s%% %n", df.format(trainError));
System.out.printf("Test Error: %s%% %n", df.format(testError));
endtime = System.nanoTime();
double time_elapsed = endtime - starttime;
// Convert nanoseconds to seconds
time_elapsed /= 1e9;
System.out.printf("Time Elapsed: %s s %n", df.format(time_elapsed));
}
/**
* Determines optimal weights for configured neural network using Randomized Hill Climbing with
* Random Restarts and evaluates a neural networks performance on train and test sets with
* those weights
* @param trainIterations the number of iterations
*/
public static void runRHC(int trainIterations) {
System.out.println("===========Randomized Hill Climbing=========");
double starttime = System.nanoTime();
double endtime;
BackPropagationNetwork backpropNet = factory.createClassificationNetwork(
new int[]{inputLayer, numberHiddenLayerNodes, outputLayer});
NeuralNetworkOptimizationProblem nnop = new NeuralNetworkOptimizationProblem(trainSet,
backpropNet, measure);
OptimizationAlgorithm oa = new RandomizedHillClimbing(nnop);
// TODO: Vary the number of iterations as needed for your results
train(oa, backpropNet, trainIterations);
double trainError = evaluateNetwork(backpropNet, trainSet.getInstances());
double testError = evaluateNetwork(backpropNet, testSet.getInstances());
System.out.printf("Train Error: %s%% %n", df.format(trainError));
System.out.printf("Test Error: %s%% %n", df.format(testError));
endtime = System.nanoTime();
double time_elapsed = endtime - starttime;
// Convert nanoseconds to seconds
time_elapsed /= 1e9;
System.out.printf("Time Elapsed: %s s %n", df.format(time_elapsed));
}
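/**
* Uses k-folds cross validation to sweep every (temperature, cooling rate) pair from temps[]
* and coolingRates[], keeping the pair with the lowest average validation error.
* @param trainIterations the number of iterations to run per fold
*/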
public static void determineSAParams(int trainIterations) {
System.out.println("===========Determining Simulated Annealing Params=========");
double[][] errors = new double[temps.length][coolingRates.length];
for (int x = 0; x < temps.length; x++) {
double temp = temps[x];
for (int y = 0; y < coolingRates.length; y++) {
double cooling = coolingRates[y];
BackPropagationNetwork[] nets = new BackPropagationNetwork[K];
NeuralNetworkOptimizationProblem[] nnops = new NeuralNetworkOptimizationProblem[K];
OptimizationAlgorithm[] oas = new OptimizationAlgorithm[K];
double[] validationErrors = new double[K];
for (int i = 0; i < nets.length; i++) {
Instance[] validation = getValidationFold(folds, i);
Instance[] trainFolds = getTrainFolds(folds, i);
DataSet trnfoldsSet = new DataSet(trainFolds);
// Creates a network with specified number of hidden layer nodes
nets[i] = factory.createClassificationNetwork(
new int[]{inputLayer, numberHiddenLayerNodes, outputLayer});
BackPropagationNetwork backpropNet = nets[i];
nnops[i] = new NeuralNetworkOptimizationProblem(trnfoldsSet, backpropNet, measure);
oas[i] = new SimulatedAnnealing(temp, cooling, nnops[i]);
train(oas[i], nets[i], trainIterations);
validationErrors[i] = evaluateNetwork(backpropNet, validation);
}
// Find the average error for this network configuration
double error = 0;
for (int j = 0; j < validationErrors.length; j++) {
error += validationErrors[j];
}
error /= validationErrors.length;
errors[x][y] = error;
}
}
int best_temp_index = 0;
int best_cool_index = 0;
double minErr = Double.MAX_VALUE;
for (int x = 0; x < temps.length; x++) {
for (int y = 0; y < coolingRates.length; y++) {
if (minErr > errors[x][y]) {
best_temp_index = x;
best_cool_index = y;
minErr = errors[x][y];
}
}
}
double bestCooling = coolingRates[best_cool_index];
double bestTemp = temps[best_temp_index];
// Record the winners so the runSA() call in main() actually uses them
best_temp = bestTemp;
best_cooling = bestCooling;
System.out.printf("Best Cooling: %s%n", df.format(bestCooling));
System.out.printf("Best Temp: %s%n", df.format(bestTemp));
}
public static void runSA(int trainIterations, double temp, double cooling) {
System.out.println("===========Simulated Annealing=========");
double starttime = System.nanoTime();
double endtime;
BackPropagationNetwork backpropNet = factory.createClassificationNetwork(
new int[]{inputLayer, numberHiddenLayerNodes, outputLayer});
NeuralNetworkOptimizationProblem nnop = new NeuralNetworkOptimizationProblem(trainSet,
backpropNet, measure);
OptimizationAlgorithm oa = new SimulatedAnnealing(temp, cooling, nnop);
train(oa, backpropNet, trainIterations);
double trainError = evaluateNetwork(backpropNet, trainSet.getInstances());
double testError = evaluateNetwork(backpropNet, testSet.getInstances());
System.out.printf("Train Error: %s%% %n", df.format(trainError));
System.out.printf("Test Error: %s%% %n", df.format(testError));
endtime = System.nanoTime();
double time_elapsed = endtime - starttime;
// Convert nanoseconds to seconds
time_elapsed /= 1e9;
System.out.printf("Time Elapsed: %s s %n", df.format(time_elapsed));
}
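/**
* Uses k-folds cross validation to sweep every (population, mate, mutate) ratio combination,
* keeping the combination with the lowest average validation error. The winning ratios are
* stored in populationRatio, toMateRatio, and toMutateRatio.
* @param trainIterations the number of iterations to run per fold
*/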
public static void determineGAParams(int trainIterations) {
System.out.println("===========Determining Genetic Algorithms Params=========");
double[][][] errors =
new double[populationRatios.length][mateRatios.length][mutateRatios.length];
// Each training split is (K - 1) folds, i.e. (K - 1) times the size of one validation fold
int trainingPopulation = getValidationFold(folds, 0).length * (K - 1);
for (int x = 0; x < populationRatios.length; x++) {
int population = (int) (Math.max(populationRatios[x] * trainingPopulation, 10));
for (int y = 0; y < mateRatios.length; y++) {
int mate = (int) (Math.max(mateRatios[y] * trainingPopulation, 10));
for (int z = 0; z < mutateRatios.length; z++) {
int mutate = (int) (Math.max(mutateRatios[z] * trainingPopulation, 10));
BackPropagationNetwork[] nets = new BackPropagationNetwork[K];
NeuralNetworkOptimizationProblem[] nnops = new NeuralNetworkOptimizationProblem[K];
OptimizationAlgorithm[] oas = new OptimizationAlgorithm[K];
double[] validationErrors = new double[K];
for (int i = 0; i < nets.length; i++) {
Instance[] validation = getValidationFold(folds, i);
Instance[] trainFolds = getTrainFolds(folds, i);
DataSet trnfoldsSet = new DataSet(trainFolds);
nets[i] = factory.createClassificationNetwork(
new int[]{inputLayer, numberHiddenLayerNodes, outputLayer});
BackPropagationNetwork backpropNet = nets[i];
nnops[i] = new NeuralNetworkOptimizationProblem(trnfoldsSet, backpropNet, measure);
oas[i] = new StandardGeneticAlgorithm(population, mate, mutate, nnops[i]);
train(oas[i], backpropNet, trainIterations);
validationErrors[i] = evaluateNetwork(backpropNet, validation);
}
// Find the average error for this configuration
double error = 0;
for (int j = 0; j < validationErrors.length; j++) {
error += validationErrors[j];
}
error /= validationErrors.length;
errors[x][y][z] = error;
}
}
}
int best_pop = 0;
int best_mate = 0;
int best_mutate = 0;
double minErr = Double.MAX_VALUE;
for (int x = 0; x < populationRatios.length; x++) {
for (int y = 0; y < mateRatios.length; y++) {
for (int z = 0; z < mutateRatios.length; z++) {
if (errors[x][y][z] < minErr) {
best_pop = x;
best_mate = y;
best_mutate = z;
minErr = errors[x][y][z];
}
}
}
}
populationRatio = populationRatios[best_pop];
toMateRatio = mateRatios[best_mate];
toMutateRatio = mutateRatios[best_mutate];
System.out.printf("Population Ratio: %s%n", df.format(populationRatio));
System.out.printf("Mate Ratio: %s%n", df.format(toMateRatio));
System.out.printf("Mutate Ratio: %s%n", df.format(toMutateRatio));
System.out.printf("Error: %s%% %n", df.format(minErr));
}
/**
* Run genetic algorithms.
* @param trainIterations the iterations to run
*/
public static void runGA(int trainIterations) {
System.out.println("===========Genetic Algorithms=========");
double starttime = System.nanoTime();
double endtime;
int trainSetSize = trainSet.size();
int populationSize = (int) (trainSetSize * populationRatio);
int toMate = (int) (trainSetSize * toMateRatio);
int toMutate = (int) (trainSetSize * toMutateRatio);
BackPropagationNetwork backpropNet = factory.createClassificationNetwork(
new int[]{inputLayer, numberHiddenLayerNodes, outputLayer});
NeuralNetworkOptimizationProblem nnop = new NeuralNetworkOptimizationProblem(trainSet,
backpropNet, measure);
OptimizationAlgorithm oa = new StandardGeneticAlgorithm(populationSize, toMate, toMutate, nnop);
train(oa, backpropNet, trainIterations);
double trainError = evaluateNetwork(backpropNet, trainSet.getInstances());
double testError = evaluateNetwork(backpropNet, testSet.getInstances());
System.out.printf("Train Error: %s%% %n", df.format(trainError));
System.out.printf("Test Error: %s%% %n", df.format(testError));
endtime = System.nanoTime();
double time_elapsed = endtime - starttime;
// Convert nanoseconds to seconds
time_elapsed /= 1e9;
System.out.printf("Time Elapsed: %s s %n", df.format(time_elapsed));
}
/**
* Train a given optimization problem for a given number of iterations. Called by RHC, SA, and
* GA algorithms
* @param oa the optimization algorithm
* @param network the network that corresponds to the randomized optimization problem. The
* optimization algorithm will determine the best weights to try using with this
* network and assign those weights
* @param iterations the number of training iterations
*/
private static void train(OptimizationAlgorithm oa, BackPropagationNetwork network, int
iterations) {
for(int i = 0; i < iterations; i++) {
oa.train();
}
Instance optimalWeights = oa.getOptimal();
network.setWeights(optimalWeights.getData());
}
/**
* Given a network and instances, the output of the network is evaluated and a decimal value
* for error is given
* @param network the BackPropagationNetwork with weights already initialized
* @param data the instances to be evaluated against
* @return the classification error as a percentage in [0, 100]
*/
public static double evaluateNetwork(BackPropagationNetwork network, Instance[] data) {
double num_incorrect = 0;
for (int j = 0; j < data.length; j++) {
network.setInputValues(data[j].getData());
network.run();
Vector actual = data[j].getLabel().getData();
Vector predicted = network.getOutputValues();
boolean mismatch = ! isEqualOutputs(actual, predicted);
if (mismatch) {
num_incorrect += 1;
}
}
double error = num_incorrect / data.length * 100;
return error;
}
/**
* Compares the predicted output vector against the actual one-hot label vector. The predicted
* class is the index of the largest network output (argmax).
* @param actual the one-hot label vector, with 1.0 at the true class index
* @param predicted the raw network output values
* @return true if the argmax of predicted matches the hot index of actual
*/
private static boolean isEqualOutputs(Vector actual, Vector predicted) {
int max_at = 0;
double max = 0;
// Where the actual max should be
int actual_index = 0;
for (int i = 0; i < actual.size(); i++) {
double aVal = actual.get(i);
if (aVal == 1.0) {
actual_index = i;
}
double bVal = predicted.get(i);
if (bVal > max) {
max = bVal;
max_at = i;
}
}
return actual_index == max_at;
}
/**
* Reads a file formatted as CSV. Takes the labels and adds them to the set of labels (which
* later helps determine the length of bit vectors). Records real-valued attributes. Turns the
* attributes and labels into bit-vectors. Initializes a DataSet object with these instances.
*/
private static void initializeInstances() {
double[][] attributes = new double[num_examples][];
String[] labels = new String[num_examples];
unique_labels = new HashSet<>();
// Reading dataset
try (BufferedReader br = new BufferedReader(new FileReader(new File(FILENAME)))) {
// Skip the header row; it just holds the column labels
br.readLine();
for(int i = 0; i < attributes.length; i++) {
Scanner scan = new Scanner(br.readLine());
scan.useDelimiter(",");
attributes[i] = new double[num_attributes];
for(int j = 0; j < num_attributes; j++) {
attributes[i][j] = Double.parseDouble(scan.next());
}
// This last element is actually your classification, which is assumed to be a string
labels[i] = scan.next();
unique_labels.add(labels[i]);
}
}
catch(Exception e) {
e.printStackTrace();
}
// Creating a mapping of bitvectors. So "some classification" => [0, 1, 0, 0]
int distinct_labels = unique_labels.size();
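// The network needs one output node per distinct class label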
outputLayer = distinct_labels;
bitvector_mappings = new HashMap<>();
int index = 0;
for (String label : unique_labels) {
double[] bitvect = new double[distinct_labels];
// At index, set to 1 for a given string
bitvect[index] = 1.0;
// Increment which index will have a bit flipped in next classification
index++;
bitvector_mappings.put(label, bitvect);
}
// Replaces the label for each instance with the corresponding bit vector for that label
// This works even for binary classification
allInstances = new Instance[num_examples];
for (int i = 0; i < attributes.length; i++) {
double[] X = attributes[i];
String label = labels[i];
double[] bitvect = bitvector_mappings.get(label);
Instance instance = new Instance(X);
instance.setLabel(new Instance(bitvect));
allInstances[i] = instance;
}
}
/**
* Print the actual vs expected bit vector. Used for debugging purposes only.
* @param actual the example's actual bit vector
* @param expected the bit vector the network output
*/
public static void printVectors(Vector actual, Vector expected) {
System.out.print("Actual: [");
for (int i = 0; i < actual.size(); i++) {
System.out.printf(" %f", actual.get(i));
}
System.out.print(" ] \t Expected: [");
for (int i = 0; i < expected.size(); i++) {
System.out.printf(" %f", expected.get(i));
}
System.out.println(" ]");
}
/**
* Takes all instances, and randomly orders them. Then, the first PERCENT_TRAIN percentage of
* instances form the trainSet DataSet, and the remaining (1 - PERCENT_TRAIN) percentage of
* instances form the testSet DataSet.
*/
public static void makeTestTrainSets() {
List<Instance> instances = new ArrayList<>();
for (Instance instance: allInstances) {
instances.add(instance);
}
Random rand = new Random(SEED);
if (shouldRandomize) {
Collections.shuffle(instances, rand);
}
int cutoff = (int) (instances.size() * PERCENT_TRAIN);
List<Instance> trainInstances = instances.subList(0, cutoff);
List<Instance> testInstances = instances.subList(cutoff, instances.size());
Instance[] arr_trn = new Instance[trainInstances.size()];
trainSet = new DataSet(trainInstances.toArray(arr_trn));
// System.out.println("Train Set ")
Instance[] arr_tst = new Instance[testInstances.size()];
testSet = new DataSet(testInstances.toArray(arr_tst));
}
/**
* Given a DataSet of training data, separate the instances into K nearly-equal-sized
* partitions called folds for K-folds cross validation
* @param training the training DataSet
* @return a list of folds, where each fold is an Instance[]
*/
public static List<Instance[]> kfolds(DataSet training) {
Instance[] trainInstances = training.getInstances();
List<Instance> instances = new ArrayList<>();
for (Instance instance: trainInstances) {
instances.add(instance);
}
List<Instance[]> folds = new ArrayList<>();
// Number of values per fold
int per_fold = (int) Math.floor((double)(instances.size()) / K);
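// e.g., the default 700-instance train set with K = 10 gives 10 folds of 70 instances each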
int start = 0;
int end = per_fold;
while (start < instances.size()) {
List<Instance> foldList = null;
if (end > instances.size()) {
end = instances.size();
}
foldList = instances.subList(start, end);
Instance[] fold = new Instance[foldList.size()];
fold = foldList.toArray(fold);
folds.add(fold);
// subList's end index is exclusive, so the next fold starts exactly at end
start = end;
end = start + per_fold;
}
return folds;
}
/**
* Given a list of Instance[], this helper combines each array's contents into a single
* output array
* @param instanceList the list of Instance[]
* @return the combined array consisting of the contents of each Instance[] in instanceList
*/
public static Instance[] combineInstances(List<Instance[]> instanceList) {
List<Instance> combined = new ArrayList<>();
for (Instance[] fold: instanceList) {
for (Instance instance : fold) {
combined.add(instance);
}
}
Instance[] combinedArr = new Instance[combined.size()];
combinedArr = combined.toArray(combinedArr);
return combinedArr;
}
/**
* Given a list of folds and an index, it will provide an Instance[] with the combined
* instances from every fold except for the fold at the given index
* @param folds the K-folds, a list of Instance[] used as folds for cross-validation
* @param foldIndex the index of the fold to exclude. That fold is used as the validation set
* @return the training folds combined into one Instance[]
*/
public static Instance[] getTrainFolds(List<Instance[]> folds, int foldIndex) {
List<Instance[]> trainFolds = new ArrayList<>(folds);
trainFolds.remove(foldIndex);
Instance[] trnfolds = combineInstances(trainFolds);
return trnfolds;
}
/**
* Given a list of folds and an index, it will provide an Instance[] to serve as a validation
* set.
* @param folds the K-folds, a list of Instance[] used as folds for cross-validation
* @param foldIndex the index of the fold to use as the validation set
* @return the validation set
*/
public static Instance[] getValidationFold(List<Instance[]> folds, int foldIndex) {
return folds.get(foldIndex);
}
}