Skip to content

Commit

Permalink
Updated based on code review
Browse files: browse the repository at this point in the history
  • Loading branch information
jkbradley committed May 22, 2015
1 parent f16bcec commit cd47f4b
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 17 deletions.
32 changes: 18 additions & 14 deletions docs/ml-features.md
Original file line number Diff line number Diff line change
Expand Up @@ -649,10 +649,13 @@ import org.apache.spark.mllib.util.MLUtils

val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
val dataFrame = sqlContext.createDataFrame(data)
val normalizer = new Normalizer().setInputCol("features").setOutputCol("normFeatures")

// Normalize each Vector using $L^2$ norm.
val l2NormData = normalizer.transform(dataFrame, normalizer.p -> 2)
// Normalize each Vector using $L^1$ norm.
val normalizer = new Normalizer()
.setInputCol("features")
.setOutputCol("normFeatures")
.setP(1.0)
val l1NormData = normalizer.transform(dataFrame)

// Normalize each Vector using $L^\infty$ norm.
val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
Expand All @@ -670,12 +673,13 @@ import org.apache.spark.sql.DataFrame;
JavaRDD<LabeledPoint> data =
MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD();
DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);

// Normalize each Vector using $L^1$ norm.
Normalizer normalizer = new Normalizer()
.setInputCol("features")
.setOutputCol("normFeatures");

// Normalize each Vector using $L^2$ norm.
DataFrame l2NormData = normalizer.transform(dataFrame, normalizer.p().w(2));
.setOutputCol("normFeatures")
.setP(1.0);
DataFrame l1NormData = normalizer.transform(dataFrame);

// Normalize each Vector using $L^\infty$ norm.
DataFrame lInfNormData =
Expand All @@ -690,13 +694,13 @@ from pyspark.ml.feature import Normalizer

data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
dataFrame = sqlContext.createDataFrame(data)
normalizer = Normalizer(inputCol="features", outputCol="normFeatures")

# Normalize each Vector using $L^2$ norm.
l2NormData = normalizer.transform(dataFrame, {normalizer.p:2.0})
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
l1NormData = normalizer.transform(dataFrame)

# Normalize each Vector using $L^\infty$ norm.
lInfNormData = normalizer.transform(dataFrame, {normalizer.p:float("inf")})
lInfNormData = normalizer.transform(dataFrame, {normalizer.p: float("inf")})
{% endhighlight %}
</div>
</div>
Expand Down Expand Up @@ -729,7 +733,7 @@ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
val dataFrame = sqlContext.createDataFrame(data)
val scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("normFeatures")
.setOutputCol("scaledFeatures")
.setWithStd(true)
.setWithMean(false)

Expand All @@ -754,7 +758,7 @@ JavaRDD<LabeledPoint> data =
DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
StandardScaler scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("normFeatures")
.setOutputCol("scaledFeatures")
.setWithStd(true)
.setWithMean(false);

Expand All @@ -773,7 +777,7 @@ from pyspark.ml.feature import StandardScaler

data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
dataFrame = sqlContext.createDataFrame(data)
scaler = StandardScaler(inputCol="features", outputCol="normFeatures",
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
withStd=True, withMean=False)

# Compute summary statistics by fitting the StandardScaler
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public void tearDown() {
}

@Test
public void regexTokenizer() {
public void normalizer() {
// The tests are to check Java compatibility.
List<VectorIndexerSuite.FeatureData> points = Lists.newArrayList(
new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)),
Expand All @@ -61,9 +61,11 @@ public void regexTokenizer() {

// Normalize each Vector using $L^2$ norm.
DataFrame l2NormData = normalizer.transform(dataFrame, normalizer.p().w(2));
l2NormData.count();

// Normalize each Vector using $L^\infty$ norm.
DataFrame lInfNormData =
normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
lInfNormData.count();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public void tearDown() {
}

@Test
public void regexTokenizer() {
public void standardScaler() {
// The tests are to check Java compatibility.
List<VectorIndexerSuite.FeatureData> points = Lists.newArrayList(
new VectorIndexerSuite.FeatureData(Vectors.dense(0.0, -2.0)),
Expand All @@ -57,7 +57,7 @@ public void regexTokenizer() {
VectorIndexerSuite.FeatureData.class);
StandardScaler scaler = new StandardScaler()
.setInputCol("features")
.setOutputCol("normFeatures")
.setOutputCol("scaledFeatures")
.setWithStd(true)
.setWithMean(false);

Expand All @@ -66,5 +66,6 @@ public void regexTokenizer() {

// Normalize each feature to have unit standard deviation.
DataFrame scaledData = scalerModel.transform(dataFrame);
scaledData.count();
}
}

0 comments on commit cd47f4b

Please sign in to comment.