Skip to content

Commit

Permalink
Small updates based on code review. Renamed statistical_summary.py to correlations.py.
Browse files Browse the repository at this point in the history
  • Loading branch information
jkbradley committed Aug 13, 2014
1 parent ab48f6e commit 0b7cec3
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#

"""
Statistical summarization using MLlib.
Correlations using MLlib.
"""

import sys
Expand All @@ -29,9 +29,9 @@

if __name__ == "__main__":
if len(sys.argv) not in [1,2]:
print >> sys.stderr, "Usage: statistical_summary (<file>)"
print >> sys.stderr, "Usage: correlations (<file>)"
exit(-1)
sc = SparkContext(appName="PythonStatisticalSummary")
sc = SparkContext(appName="PythonCorrelations")
if len(sys.argv) == 2:
filepath = sys.argv[1]
else:
Expand Down
18 changes: 9 additions & 9 deletions examples/src/main/python/mllib/random_and_sampled_rdds.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

if __name__ == "__main__":
if len(sys.argv) not in [1, 2]:
print >> sys.stderr, "Usage: logistic_regression <libsvm data file>"
print >> sys.stderr, "Usage: random_and_sampled_rdds <libsvm data file>"
exit(-1)
if len(sys.argv) == 2:
datapath = sys.argv[1]
Expand All @@ -45,22 +45,23 @@

# Example: RandomRDDGenerators
normalRDD = RandomRDDGenerators.normalRDD(sc, numExamples)
print 'Generated RDD of %d examples sampled from a unit normal distribution' % normalRDD.count()
print 'Generated RDD of %d examples sampled from the standard normal distribution'\
% normalRDD.count()
normalVectorRDD = RandomRDDGenerators.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()

print ''
print

# Example: RDD.sample() and RDD.takeSample()
exactSampleSize = int(numExamples * fraction)
expectedSampleSize = int(numExamples * fraction)
print 'Sampling RDD using fraction %g. Expected sample size = %d.' \
% (fraction, exactSampleSize)
% (fraction, expectedSampleSize)
sampledRDD = normalRDD.sample(withReplacement = True, fraction = fraction)
print ' RDD.sample(): sample has %d examples' % sampledRDD.count()
sampledArray = normalRDD.takeSample(withReplacement = True, num = exactSampleSize)
sampledArray = normalRDD.takeSample(withReplacement = True, num = expectedSampleSize)
print ' RDD.takeSample(): sample has %d examples' % len(sampledArray)

print ''
print

# Example: RDD.sampleByKey()
examples = MLUtils.loadLibSVMFile(sc, datapath)
Expand All @@ -74,8 +75,7 @@
fractions = {}
for k in keyCountsA.keys():
fractions[k] = fraction
sampledByKeyRDD = \
keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)#, exact = True)
sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)
keyCountsB = sampledByKeyRDD.countByKey()
sizeB = sum(keyCountsB.values())
print ' Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,30 +32,33 @@ import org.apache.spark.SparkContext._
* }}}
* If you use it as a template to create your own app, please use `spark-submit` to submit your app.
*/
object RandomAndSampledRDDs extends App {
object RandomAndSampledRDDs {

case class Params(input: String = "data/mllib/sample_binary_classification_data.txt")

val defaultParams = Params()
def main(args: Array[String]) {
val defaultParams = Params()

val parser = new OptionParser[Params]("RandomAndSampledRDDs") {
head("RandomAndSampledRDDs: an example app for randomly generated and sampled RDDs.")
opt[String]("input")
.text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
.action((x, c) => c.copy(input = x))
note(
"""
val parser = new OptionParser[Params]("RandomAndSampledRDDs") {
head("RandomAndSampledRDDs: an example app for randomly generated and sampled RDDs.")
opt[String]("input")
.text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
.action((x, c) => c.copy(input = x))
note(
"""
|For example, the following command runs this app:
|
| bin/spark-submit --class org.apache.spark.examples.mllib.RandomAndSampledRDDs \
| examples/target/scala-*/spark-examples-*.jar
""".stripMargin)
}
""".
stripMargin)
}

parser.parse(args, defaultParams).map { params =>
run(params)
} getOrElse {
sys.exit(1)
parser.parse(args, defaultParams).map { params =>
run(params)
} getOrElse {
sys.exit(1)
}
}

def run(params: Params) {
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,12 +161,12 @@ def squared_distance(self, other):
j += 1
return result

def toDense(self):
def toArray(self):
"""
Returns a copy of this SparseVector as a 1-dimensional NumPy array.
"""
arr = numpy.zeros(self.size)
for i in range(self.indices.size):
for i in xrange(self.indices.size):
arr[self.indices[i]] = self.values[i]
return arr

Expand Down

0 comments on commit 0b7cec3

Please sign in to comment.