From dafebe2233aa925f3210ccf59b1ccd71774aed26 Mon Sep 17 00:00:00 2001
From: "Joseph K. Bradley"
Date: Mon, 18 Aug 2014 10:25:08 -0700
Subject: [PATCH] Bug fixes for examples SampledRDDs.scala and sampled_rdds.py: Check for division by 0 and for missing key in maps.

---
 examples/src/main/python/mllib/sampled_rdds.py    | 10 +++++++++-
 .../apache/spark/examples/mllib/SampledRDDs.scala | 15 +++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/examples/src/main/python/mllib/sampled_rdds.py b/examples/src/main/python/mllib/sampled_rdds.py
index 652043a71584f..ec64a5978c672 100755
--- a/examples/src/main/python/mllib/sampled_rdds.py
+++ b/examples/src/main/python/mllib/sampled_rdds.py
@@ -40,6 +40,9 @@
 
     examples = MLUtils.loadLibSVMFile(sc, datapath)
     numExamples = examples.count()
+    if numExamples == 0:
+        print >> sys.stderr, "Error: Data file had no samples to load."
+        exit(1)
     print 'Loaded data with %d examples from file: %s' % (numExamples, datapath)
 
     # Example: RDD.sample() and RDD.takeSample()
@@ -73,6 +76,11 @@
     print ' \tFractions of examples with key'
     print 'Key\tOrig\tSample'
     for k in sorted(keyCountsA.keys()):
-        print '%d\t%g\t%g' % (k, keyCountsA[k] / float(numExamples), keyCountsB[k] / float(sizeB))
+        fracA = keyCountsA[k] / float(numExamples)
+        if sizeB != 0:
+            fracB = keyCountsB.get(k, 0) / float(sizeB)
+        else:
+            fracB = 0
+        print '%d\t%g\t%g' % (k, fracA, fracB)
 
     sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala
index 42a906e0c296c..f01b8266e3fe3 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SampledRDDs.scala
@@ -66,6 +66,9 @@ object SampledRDDs {
 
     val examples = MLUtils.loadLibSVMFile(sc, params.input)
     val numExamples = examples.count()
+    if (numExamples == 0) {
+      throw new RuntimeException("Error: Data file had no samples to load.")
+    }
     println(s"Loaded data with $numExamples examples from file: ${params.input}")
 
     // Example: RDD.sample() and RDD.takeSample()
@@ -105,8 +108,16 @@ object SampledRDDs {
     println(s"Key\tOrig\tApprox Sample\tExact Sample")
     keyCounts.keys.toSeq.sorted.foreach { key =>
       val origFrac = keyCounts(key) / numExamples.toDouble
-      val approxFrac = keyCountsB(key) / sizeB.toDouble
-      val exactFrac = keyCountsBExact(key) / sizeBExact.toDouble
+      val approxFrac = if (sizeB != 0) {
+        keyCountsB.getOrElse(key, 0L) / sizeB.toDouble
+      } else {
+        0
+      }
+      val exactFrac = if (sizeBExact != 0) {
+        keyCountsBExact.getOrElse(key, 0L) / sizeBExact.toDouble
+      } else {
+        0
+      }
       println(s"$key\t$origFrac\t$approxFrac\t$exactFrac")
     }
 