diff --git a/core/src/main/scala/org/apache/spark/util/FileLogger.scala b/core/src/main/scala/org/apache/spark/util/FileLogger.scala
index 0e6d21b22023a..6a95dc06e155d 100644
--- a/core/src/main/scala/org/apache/spark/util/FileLogger.scala
+++ b/core/src/main/scala/org/apache/spark/util/FileLogger.scala
@@ -61,6 +61,14 @@ private[spark] class FileLogger(
// Only defined if the file system scheme is not local
private var hadoopDataStream: Option[FSDataOutputStream] = None
+ // The Hadoop APIs have changed over time, so we use reflection to figure out
+ // the correct method to use to flush a Hadoop data stream. See SPARK-1518
+ // for details.
+ private val hadoopFlushMethod = {
+ val cls = classOf[FSDataOutputStream]
+ scala.util.Try(cls.getMethod("hflush")).getOrElse(cls.getMethod("sync"))
+ }
+
private var writer: Option[PrintWriter] = None
/**
@@ -149,13 +157,13 @@ private[spark] class FileLogger(
/**
* Flush the writer to disk manually.
*
- * If the Hadoop FileSystem is used, the underlying FSDataOutputStream (r1.0.4) must be
- * sync()'ed manually as it does not support flush(), which is invoked by when higher
- * level streams are flushed.
+ * When using a Hadoop filesystem, we need to invoke the hflush or sync
+ * method. In HDFS, hflush guarantees that the data gets to all the
+ * DataNodes.
*/
def flush() {
writer.foreach(_.flush())
- hadoopDataStream.foreach(_.sync())
+ hadoopDataStream.foreach(hadoopFlushMethod.invoke(_))
}
/**
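
For reference, the reflective flush-method selection in the FileLogger change above can be exercised on its own. The sketch below is a minimal standalone version of the same pattern; it assumes Hadoop's `FSDataOutputStream` is on the classpath, and the object/method names are only illustrative:

```scala
import org.apache.hadoop.fs.FSDataOutputStream

object HadoopFlush {
  // Prefer hflush() (Hadoop 2.x); fall back to sync() on older releases (Hadoop 1.x).
  private val flushMethod = {
    val cls = classOf[FSDataOutputStream]
    scala.util.Try(cls.getMethod("hflush")).getOrElse(cls.getMethod("sync"))
  }

  // Flush a stream regardless of which Hadoop version is on the classpath.
  def flush(out: FSDataOutputStream): Unit = {
    flushMethod.invoke(out)
  }
}
```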
diff --git a/core/src/main/scala/org/apache/spark/util/SerializableHyperLogLog.scala b/core/src/main/scala/org/apache/spark/util/SerializableHyperLogLog.scala
deleted file mode 100644
index 21a88eea3bbc2..0000000000000
--- a/core/src/main/scala/org/apache/spark/util/SerializableHyperLogLog.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.util
-
-import java.io.{Externalizable, ObjectInput, ObjectOutput}
-
-import com.clearspring.analytics.stream.cardinality.{HyperLogLog, ICardinality}
-
-/**
- * A wrapper around [[com.clearspring.analytics.stream.cardinality.HyperLogLog]] that is
- * serializable.
- */
-private[spark]
-class SerializableHyperLogLog(var value: ICardinality) extends Externalizable {
-
- def this() = this(null) // For deserialization
-
- def merge(other: SerializableHyperLogLog) = new SerializableHyperLogLog(value.merge(other.value))
-
- def add[T](elem: T) = {
- this.value.offer(elem)
- this
- }
-
- def readExternal(in: ObjectInput) {
- val byteLength = in.readInt()
- val bytes = new Array[Byte](byteLength)
- in.readFully(bytes)
- value = HyperLogLog.Builder.build(bytes)
- }
-
- def writeExternal(out: ObjectOutput) {
- val bytes = value.getBytes()
- out.writeInt(bytes.length)
- out.write(bytes)
- }
-}
diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
index 170f09be21534..288badd3160f8 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala
@@ -20,6 +20,7 @@ package org.apache.spark.util.collection
import java.io.{InputStream, BufferedInputStream, FileInputStream, File, Serializable, EOFException}
import java.util.Comparator
+import scala.collection.BufferedIterator
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
@@ -231,7 +232,7 @@ class ExternalAppendOnlyMap[K, V, C](
// Input streams are derived both from the in-memory map and spilled maps on disk
// The in-memory map is sorted in place, while the spilled maps are already in sorted order
private val sortedMap = currentMap.destructiveSortedIterator(comparator)
- private val inputStreams = Seq(sortedMap) ++ spilledMaps
+ private val inputStreams = (Seq(sortedMap) ++ spilledMaps).map(it => it.buffered)
inputStreams.foreach { it =>
val kcPairs = getMorePairs(it)
@@ -246,13 +247,13 @@ class ExternalAppendOnlyMap[K, V, C](
* In the event of key hash collisions, this ensures no pairs are hidden from being merged.
* Assume the given iterator is in sorted order.
*/
- private def getMorePairs(it: Iterator[(K, C)]): ArrayBuffer[(K, C)] = {
+ private def getMorePairs(it: BufferedIterator[(K, C)]): ArrayBuffer[(K, C)] = {
val kcPairs = new ArrayBuffer[(K, C)]
if (it.hasNext) {
var kc = it.next()
kcPairs += kc
val minHash = kc._1.hashCode()
- while (it.hasNext && kc._1.hashCode() == minHash) {
+ while (it.hasNext && it.head._1.hashCode() == minHash) {
kc = it.next()
kcPairs += kc
}
@@ -325,7 +326,8 @@ class ExternalAppendOnlyMap[K, V, C](
*
* StreamBuffers are ordered by the minimum key hash found across all of their own pairs.
*/
- private case class StreamBuffer(iterator: Iterator[(K, C)], pairs: ArrayBuffer[(K, C)])
+ private class StreamBuffer(
+ val iterator: BufferedIterator[(K, C)], val pairs: ArrayBuffer[(K, C)])
extends Comparable[StreamBuffer] {
def isEmpty = pairs.length == 0
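
The switch to `BufferedIterator` above matters because `getMorePairs` must peek at the *next* pair's key hash before deciding whether to consume it; the old code compared against the pair it had already read. A minimal, self-contained illustration of the peek-then-consume pattern (the sample data here is hypothetical):

```scala
import scala.collection.BufferedIterator
import scala.collection.mutable.ArrayBuffer

val it: BufferedIterator[(String, Int)] =
  Iterator(("a", 1), ("a", 2), ("b", 3)).buffered

// Group consecutive pairs whose keys share the first key's hash code.
val minHash = it.head._1.hashCode()
val group = new ArrayBuffer[(String, Int)]
while (it.hasNext && it.head._1.hashCode() == minHash) {
  group += it.next()  // head() did not consume, so next() returns the pair we inspected
}
// group now holds ("a", 1) and ("a", 2); ("b", 3) is still pending in the iterator
```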
diff --git a/core/src/test/java/org/apache/spark/JavaAPISuite.java b/core/src/test/java/org/apache/spark/JavaAPISuite.java
index 7193223addf66..50a62129116f1 100644
--- a/core/src/test/java/org/apache/spark/JavaAPISuite.java
+++ b/core/src/test/java/org/apache/spark/JavaAPISuite.java
@@ -23,6 +23,7 @@
import scala.Tuple2;
import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.base.Optional;
import com.google.common.base.Charsets;
@@ -48,7 +49,6 @@
import org.apache.spark.partial.PartialResult;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.util.StatCounter;
-import org.apache.spark.util.Utils;
// The test suite itself is Serializable so that anonymous Function implementations can be
// serialized, as an alternative to converting these anonymous classes to static inner classes;
@@ -70,16 +70,6 @@ public void tearDown() {
sc = null;
}
- static class ReverseIntComparator implements Comparator<Integer>, Serializable {
-
- @Override
- public int compare(Integer a, Integer b) {
- if (a > b) return -1;
- else if (a < b) return 1;
- else return 0;
- }
- }
-
@SuppressWarnings("unchecked")
@Test
public void sparkContextUnion() {
@@ -124,7 +114,7 @@ public void intersection() {
JavaRDD intersections = s1.intersection(s2);
Assert.assertEquals(3, intersections.count());
- ArrayList list = new ArrayList();
+ List list = new ArrayList();
JavaRDD empty = sc.parallelize(list);
JavaRDD emptyIntersection = empty.intersection(s2);
Assert.assertEquals(0, emptyIntersection.count());
@@ -144,6 +134,28 @@ public void intersection() {
Assert.assertEquals(2, pIntersection.count());
}
+ @Test
+ public void sample() {
+ List ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+ JavaRDD rdd = sc.parallelize(ints);
+ JavaRDD sample20 = rdd.sample(true, 0.2, 11);
+ // expected 2, but of course the result varies randomly a bit
+ Assert.assertEquals(3, sample20.count());
+ JavaRDD sample20NoReplacement = rdd.sample(false, 0.2, 11);
+ Assert.assertEquals(2, sample20NoReplacement.count());
+ }
+
+ @Test
+ public void randomSplit() {
+ List ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
+ JavaRDD rdd = sc.parallelize(ints);
+ JavaRDD[] splits = rdd.randomSplit(new double[] { 0.4, 0.6, 1.0 }, 11);
+ Assert.assertEquals(3, splits.length);
+ Assert.assertEquals(2, splits[0].count());
+ Assert.assertEquals(3, splits[1].count());
+ Assert.assertEquals(5, splits[2].count());
+ }
+
@Test
public void sortByKey() {
List> pairs = new ArrayList>();
@@ -161,26 +173,24 @@ public void sortByKey() {
Assert.assertEquals(new Tuple2(3, 2), sortedPairs.get(2));
// Custom comparator
- sortedRDD = rdd.sortByKey(new ReverseIntComparator(), false);
+ sortedRDD = rdd.sortByKey(Collections.reverseOrder(), false);
Assert.assertEquals(new Tuple2(-1, 1), sortedRDD.first());
sortedPairs = sortedRDD.collect();
Assert.assertEquals(new Tuple2(0, 4), sortedPairs.get(1));
Assert.assertEquals(new Tuple2(3, 2), sortedPairs.get(2));
}
- static int foreachCalls = 0;
-
@Test
public void foreach() {
- foreachCalls = 0;
+ final Accumulator accum = sc.accumulator(0);
JavaRDD rdd = sc.parallelize(Arrays.asList("Hello", "World"));
rdd.foreach(new VoidFunction() {
@Override
- public void call(String s) {
- foreachCalls++;
+ public void call(String s) throws IOException {
+ accum.add(1);
}
});
- Assert.assertEquals(2, foreachCalls);
+ Assert.assertEquals(2, accum.value().intValue());
}
@Test
@@ -188,7 +198,7 @@ public void toLocalIterator() {
List correct = Arrays.asList(1, 2, 3, 4);
JavaRDD rdd = sc.parallelize(correct);
List result = Lists.newArrayList(rdd.toLocalIterator());
- Assert.assertTrue(correct.equals(result));
+ Assert.assertEquals(correct, result);
}
@Test
@@ -196,7 +206,7 @@ public void zipWithUniqueId() {
List dataArray = Arrays.asList(1, 2, 3, 4);
JavaPairRDD zip = sc.parallelize(dataArray).zipWithUniqueId();
JavaRDD indexes = zip.values();
- Assert.assertTrue(new HashSet(indexes.collect()).size() == 4);
+ Assert.assertEquals(4, new HashSet(indexes.collect()).size());
}
@Test
@@ -205,7 +215,7 @@ public void zipWithIndex() {
JavaPairRDD zip = sc.parallelize(dataArray).zipWithIndex();
JavaRDD indexes = zip.values();
List correctIndexes = Arrays.asList(0L, 1L, 2L, 3L);
- Assert.assertTrue(indexes.collect().equals(correctIndexes));
+ Assert.assertEquals(correctIndexes, indexes.collect());
}
@SuppressWarnings("unchecked")
@@ -252,8 +262,10 @@ public void cogroup() {
new Tuple2("Oranges", 2),
new Tuple2("Apples", 3)
));
- JavaPairRDD, Iterable>> cogrouped = categories.cogroup(prices);
- Assert.assertEquals("[Fruit, Citrus]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._1()));
+ JavaPairRDD, Iterable>> cogrouped =
+ categories.cogroup(prices);
+ Assert.assertEquals("[Fruit, Citrus]",
+ Iterables.toString(cogrouped.lookup("Oranges").get(0)._1()));
Assert.assertEquals("[2]", Iterables.toString(cogrouped.lookup("Oranges").get(0)._2()));
cogrouped.collect();
@@ -281,8 +293,7 @@ public void leftOuterJoin() {
rdd1.leftOuterJoin(rdd2).filter(
new Function>>, Boolean>() {
@Override
- public Boolean call(Tuple2>> tup)
- throws Exception {
+ public Boolean call(Tuple2>> tup) {
return !tup._2()._2().isPresent();
}
}).first();
@@ -356,8 +367,7 @@ public Integer call(Integer a, Integer b) {
Assert.assertEquals(2, localCounts.get(2).intValue());
Assert.assertEquals(3, localCounts.get(3).intValue());
- localCounts = rdd.reduceByKeyLocally(new Function2() {
+ localCounts = rdd.reduceByKeyLocally(new Function2() {
@Override
public Integer call(Integer a, Integer b) {
return a + b;
@@ -448,16 +458,17 @@ public void map() {
JavaDoubleRDD doubles = rdd.mapToDouble(new DoubleFunction() {
@Override
public double call(Integer x) {
- return 1.0 * x;
+ return x.doubleValue();
}
}).cache();
doubles.collect();
- JavaPairRDD pairs = rdd.mapToPair(new PairFunction() {
- @Override
- public Tuple2 call(Integer x) {
- return new Tuple2(x, x);
- }
- }).cache();
+ JavaPairRDD pairs = rdd.mapToPair(
+ new PairFunction() {
+ @Override
+ public Tuple2 call(Integer x) {
+ return new Tuple2(x, x);
+ }
+ }).cache();
pairs.collect();
JavaRDD strings = rdd.map(new Function() {
@Override
@@ -487,7 +498,9 @@ public Iterable call(String x) {
@Override
public Iterable> call(String s) {
List> pairs = new LinkedList>();
- for (String word : s.split(" ")) pairs.add(new Tuple2(word, word));
+ for (String word : s.split(" ")) {
+ pairs.add(new Tuple2(word, word));
+ }
return pairs;
}
}
@@ -499,7 +512,9 @@ public Iterable> call(String s) {
@Override
public Iterable call(String s) {
List lengths = new LinkedList();
- for (String word : s.split(" ")) lengths.add(word.length() * 1.0);
+ for (String word : s.split(" ")) {
+ lengths.add((double) word.length());
+ }
return lengths;
}
});
@@ -521,7 +536,7 @@ public void mapsFromPairsToPairs() {
JavaPairRDD swapped = pairRDD.flatMapToPair(
new PairFlatMapFunction, String, Integer>() {
@Override
- public Iterable> call(Tuple2 item) throws Exception {
+ public Iterable> call(Tuple2 item) {
return Collections.singletonList(item.swap());
}
});
@@ -530,7 +545,7 @@ public Iterable> call(Tuple2 item) thro
// There was never a bug here, but it's worth testing:
pairRDD.mapToPair(new PairFunction, String, Integer>() {
@Override
- public Tuple2 call(Tuple2 item) throws Exception {
+ public Tuple2 call(Tuple2 item) {
return item.swap();
}
}).collect();
@@ -631,14 +646,10 @@ public void wholeTextFiles() throws IOException {
byte[] content2 = "spark is also easy to use.\n".getBytes("utf-8");
String tempDirName = tempDir.getAbsolutePath();
- DataOutputStream ds = new DataOutputStream(new FileOutputStream(tempDirName + "/part-00000"));
- ds.write(content1);
- ds.close();
- ds = new DataOutputStream(new FileOutputStream(tempDirName + "/part-00001"));
- ds.write(content2);
- ds.close();
-
- HashMap container = new HashMap();
+ Files.write(content1, new File(tempDirName + "/part-00000"));
+ Files.write(content2, new File(tempDirName + "/part-00001"));
+
+ Map container = new HashMap();
container.put(tempDirName+"/part-00000", new Text(content1).toString());
container.put(tempDirName+"/part-00001", new Text(content2).toString());
@@ -844,7 +855,7 @@ public void zip() {
JavaDoubleRDD doubles = rdd.mapToDouble(new DoubleFunction() {
@Override
public double call(Integer x) {
- return 1.0 * x;
+ return x.doubleValue();
}
});
JavaPairRDD zipped = rdd.zip(doubles);
@@ -859,17 +870,7 @@ public void zipPartitions() {
new FlatMapFunction2, Iterator, Integer>() {
@Override
public Iterable call(Iterator i, Iterator s) {
- int sizeI = 0;
- int sizeS = 0;
- while (i.hasNext()) {
- sizeI += 1;
- i.next();
- }
- while (s.hasNext()) {
- sizeS += 1;
- s.next();
- }
- return Arrays.asList(sizeI, sizeS);
+ return Arrays.asList(Iterators.size(i), Iterators.size(s));
}
};
@@ -883,6 +884,7 @@ public void accumulators() {
final Accumulator intAccum = sc.intAccumulator(10);
rdd.foreach(new VoidFunction() {
+ @Override
public void call(Integer x) {
intAccum.add(x);
}
@@ -891,6 +893,7 @@ public void call(Integer x) {
final Accumulator doubleAccum = sc.doubleAccumulator(10.0);
rdd.foreach(new VoidFunction() {
+ @Override
public void call(Integer x) {
doubleAccum.add((double) x);
}
@@ -899,14 +902,17 @@ public void call(Integer x) {
// Try a custom accumulator type
AccumulatorParam floatAccumulatorParam = new AccumulatorParam() {
+ @Override
public Float addInPlace(Float r, Float t) {
return r + t;
}
+ @Override
public Float addAccumulator(Float r, Float t) {
return r + t;
}
+ @Override
public Float zero(Float initialValue) {
return 0.0f;
}
@@ -914,6 +920,7 @@ public Float zero(Float initialValue) {
final Accumulator floatAccum = sc.accumulator(10.0f, floatAccumulatorParam);
rdd.foreach(new VoidFunction() {
+ @Override
public void call(Integer x) {
floatAccum.add((float) x);
}
@@ -929,7 +936,8 @@ public void call(Integer x) {
public void keyBy() {
JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2));
List> s = rdd.keyBy(new Function() {
- public String call(Integer t) throws Exception {
+ @Override
+ public String call(Integer t) {
return t.toString();
}
}).collect();
@@ -941,10 +949,10 @@ public String call(Integer t) throws Exception {
public void checkpointAndComputation() {
JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
sc.setCheckpointDir(tempDir.getAbsolutePath());
- Assert.assertEquals(false, rdd.isCheckpointed());
+ Assert.assertFalse(rdd.isCheckpointed());
rdd.checkpoint();
rdd.count(); // Forces the DAG to cause a checkpoint
- Assert.assertEquals(true, rdd.isCheckpointed());
+ Assert.assertTrue(rdd.isCheckpointed());
Assert.assertEquals(Arrays.asList(1, 2, 3, 4, 5), rdd.collect());
}
@@ -952,10 +960,10 @@ public void checkpointAndComputation() {
public void checkpointAndRestore() {
JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
sc.setCheckpointDir(tempDir.getAbsolutePath());
- Assert.assertEquals(false, rdd.isCheckpointed());
+ Assert.assertFalse(rdd.isCheckpointed());
rdd.checkpoint();
rdd.count(); // Forces the DAG to cause a checkpoint
- Assert.assertEquals(true, rdd.isCheckpointed());
+ Assert.assertTrue(rdd.isCheckpointed());
Assert.assertTrue(rdd.getCheckpointFile().isPresent());
JavaRDD recovered = sc.checkpointFile(rdd.getCheckpointFile().get());
@@ -966,16 +974,17 @@ public void checkpointAndRestore() {
@Test
public void mapOnPairRDD() {
JavaRDD rdd1 = sc.parallelize(Arrays.asList(1,2,3,4));
- JavaPairRDD rdd2 = rdd1.mapToPair(new PairFunction() {
- @Override
- public Tuple2 call(Integer i) throws Exception {
- return new Tuple2(i, i % 2);
- }
- });
+ JavaPairRDD rdd2 = rdd1.mapToPair(
+ new PairFunction() {
+ @Override
+ public Tuple2 call(Integer i) {
+ return new Tuple2(i, i % 2);
+ }
+ });
JavaPairRDD rdd3 = rdd2.mapToPair(
new PairFunction, Integer, Integer>() {
@Override
- public Tuple2 call(Tuple2 in) throws Exception {
+ public Tuple2 call(Tuple2 in) {
return new Tuple2(in._2(), in._1());
}
});
@@ -992,14 +1001,15 @@ public Tuple2 call(Tuple2 in) throws Excepti
public void collectPartitions() {
JavaRDD rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7), 3);
- JavaPairRDD rdd2 = rdd1.mapToPair(new PairFunction() {
- @Override
- public Tuple2 call(Integer i) throws Exception {
- return new Tuple2(i, i % 2);
- }
- });
+ JavaPairRDD rdd2 = rdd1.mapToPair(
+ new PairFunction() {
+ @Override
+ public Tuple2 call(Integer i) {
+ return new Tuple2(i, i % 2);
+ }
+ });
- List[] parts = rdd1.collectPartitions(new int[] {0});
+ List[] parts = rdd1.collectPartitions(new int[] {0});
Assert.assertEquals(Arrays.asList(1, 2), parts[0]);
parts = rdd1.collectPartitions(new int[] {1, 2});
@@ -1010,14 +1020,14 @@ public Tuple2 call(Integer i) throws Exception {
new Tuple2(2, 0)),
rdd2.collectPartitions(new int[] {0})[0]);
- parts = rdd2.collectPartitions(new int[] {1, 2});
+ List>[] parts2 = rdd2.collectPartitions(new int[] {1, 2});
Assert.assertEquals(Arrays.asList(new Tuple2(3, 1),
new Tuple2(4, 0)),
- parts[0]);
+ parts2[0]);
Assert.assertEquals(Arrays.asList(new Tuple2(5, 1),
new Tuple2(6, 0),
new Tuple2(7, 1)),
- parts[1]);
+ parts2[1]);
}
@Test
@@ -1028,27 +1038,25 @@ public void countApproxDistinct() {
arrayData.add(i % size);
}
JavaRDD simpleRdd = sc.parallelize(arrayData, 10);
- Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.2) - size) / (size * 1.0)) < 0.2);
- Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.05) - size) / (size * 1.0)) <= 0.05);
- Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.01) - size) / (size * 1.0)) <= 0.01);
+ Assert.assertTrue(Math.abs((simpleRdd.countApproxDistinct(0.05) - size) / (size * 1.0)) <= 0.1);
}
@Test
public void countApproxDistinctByKey() {
- double relativeSD = 0.001;
-
List> arrayData = new ArrayList>();
- for (int i = 10; i < 100; i++)
- for (int j = 0; j < i; j++)
+ for (int i = 10; i < 100; i++) {
+ for (int j = 0; j < i; j++) {
arrayData.add(new Tuple2(i, j));
-
+ }
+ }
+ double relativeSD = 0.001;
JavaPairRDD pairRdd = sc.parallelizePairs(arrayData);
- List> res = pairRdd.countApproxDistinctByKey(relativeSD).collect();
+ List> res = pairRdd.countApproxDistinctByKey(8, 0).collect();
for (Tuple2 resItem : res) {
double count = (double)resItem._1();
Long resCount = (Long)resItem._2();
Double error = Math.abs((resCount - count) / count);
- Assert.assertTrue(error < relativeSD);
+ Assert.assertTrue(error < 0.1);
}
}
@@ -1057,12 +1065,13 @@ public void countApproxDistinctByKey() {
public void collectAsMapWithIntArrayValues() {
// Regression test for SPARK-1040
JavaRDD rdd = sc.parallelize(Arrays.asList(1));
- JavaPairRDD pairRDD = rdd.mapToPair(new PairFunction() {
- @Override
- public Tuple2 call(Integer x) throws Exception {
- return new Tuple2(x, new int[] { x });
- }
- });
+ JavaPairRDD pairRDD = rdd.mapToPair(
+ new PairFunction() {
+ @Override
+ public Tuple2 call(Integer x) {
+ return new Tuple2(x, new int[] { x });
+ }
+ });
pairRDD.collect(); // Works fine
pairRDD.collectAsMap(); // Used to crash with ClassCastException
}
diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
index 64933f4b1046d..f64f3c9036034 100644
--- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
+++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala
@@ -167,26 +167,28 @@ class CheckpointSuite extends FunSuite with LocalSparkContext with Logging {
})
}
- test("ZippedRDD") {
- testRDD(rdd => new ZippedRDD(sc, rdd, rdd.map(x => x)))
- testRDDPartitions(rdd => new ZippedRDD(sc, rdd, rdd.map(x => x)))
+ test("ZippedPartitionsRDD") {
+ testRDD(rdd => rdd.zip(rdd.map(x => x)))
+ testRDDPartitions(rdd => rdd.zip(rdd.map(x => x)))
- // Test that the ZippedPartition updates parent partitions
- // after the parent RDD has been checkpointed and parent partitions have been changed.
- // Note that this test is very specific to the current implementation of ZippedRDD.
+ // Test that ZippedPartitionsRDD updates parent partitions after parent RDDs have
+ // been checkpointed and parent partitions have been changed.
+ // Note that this test is very specific to the implementation of ZippedPartitionsRDD.
val rdd = generateFatRDD()
- val zippedRDD = new ZippedRDD(sc, rdd, rdd.map(x => x))
+ val zippedRDD = rdd.zip(rdd.map(x => x)).asInstanceOf[ZippedPartitionsRDD2[_, _, _]]
zippedRDD.rdd1.checkpoint()
zippedRDD.rdd2.checkpoint()
val partitionBeforeCheckpoint =
- serializeDeserialize(zippedRDD.partitions.head.asInstanceOf[ZippedPartition[_, _]])
+ serializeDeserialize(zippedRDD.partitions.head.asInstanceOf[ZippedPartitionsPartition])
zippedRDD.count()
val partitionAfterCheckpoint =
- serializeDeserialize(zippedRDD.partitions.head.asInstanceOf[ZippedPartition[_, _]])
+ serializeDeserialize(zippedRDD.partitions.head.asInstanceOf[ZippedPartitionsPartition])
assert(
- partitionAfterCheckpoint.partition1.getClass != partitionBeforeCheckpoint.partition1.getClass &&
- partitionAfterCheckpoint.partition2.getClass != partitionBeforeCheckpoint.partition2.getClass,
- "ZippedRDD.partition1 and ZippedRDD.partition2 not updated after parent RDD is checkpointed"
+ partitionAfterCheckpoint.partitions(0).getClass !=
+ partitionBeforeCheckpoint.partitions(0).getClass &&
+ partitionAfterCheckpoint.partitions(1).getClass !=
+ partitionBeforeCheckpoint.partitions(1).getClass,
+ "ZippedPartitionsRDD partition 0 (or 1) not updated after parent RDDs are checkpointed"
)
}
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 5a8310090890d..dc2db66df60e0 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -25,7 +25,7 @@ import scala.language.postfixOps
import scala.util.Random
import org.scalatest.{BeforeAndAfter, FunSuite}
-import org.scalatest.concurrent.Eventually
+import org.scalatest.concurrent.{PatienceConfiguration, Eventually}
import org.scalatest.concurrent.Eventually._
import org.scalatest.time.SpanSugar._
@@ -76,7 +76,7 @@ class ContextCleanerSuite extends FunSuite with BeforeAndAfter with LocalSparkCo
tester.assertCleanup()
// Verify that shuffles can be re-executed after cleaning up
- assert(rdd.collect().toList === collected)
+ assert(rdd.collect().toList.equals(collected))
}
test("cleanup broadcast") {
@@ -285,7 +285,7 @@ class CleanerTester(
sc.cleaner.get.attachListener(cleanerListener)
/** Assert that all the stuff has been cleaned up */
- def assertCleanup()(implicit waitTimeout: Eventually.Timeout) {
+ def assertCleanup()(implicit waitTimeout: PatienceConfiguration.Timeout) {
try {
eventually(waitTimeout, interval(100 millis)) {
assert(isAllCleanedUp)
diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala
index 1f2206b1f0379..070e974657860 100644
--- a/core/src/test/scala/org/apache/spark/FileSuite.scala
+++ b/core/src/test/scala/org/apache/spark/FileSuite.scala
@@ -230,6 +230,17 @@ class FileSuite extends FunSuite with LocalSparkContext {
}
}
+ test ("allow user to disable the output directory existence checking (old Hadoop API") {
+ val sf = new SparkConf()
+ sf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false")
+ sc = new SparkContext(sf)
+ val randomRDD = sc.parallelize(Array((1, "a"), (1, "a"), (2, "b"), (3, "c")), 1)
+ randomRDD.saveAsTextFile(tempDir.getPath + "/output")
+ assert(new File(tempDir.getPath + "/output/part-00000").exists() === true)
+ randomRDD.saveAsTextFile(tempDir.getPath + "/output")
+ assert(new File(tempDir.getPath + "/output/part-00000").exists() === true)
+ }
+
test ("prevent user from overwriting the empty directory (new Hadoop API)") {
sc = new SparkContext("local", "test")
val randomRDD = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
@@ -248,6 +259,17 @@ class FileSuite extends FunSuite with LocalSparkContext {
}
}
+ test ("allow user to disable the output directory existence checking (new Hadoop API") {
+ val sf = new SparkConf()
+ sf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false")
+ sc = new SparkContext(sf)
+ val randomRDD = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
+ randomRDD.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](tempDir.getPath + "/output")
+ assert(new File(tempDir.getPath + "/output/part-r-00000").exists() === true)
+ randomRDD.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](tempDir.getPath + "/output")
+ assert(new File(tempDir.getPath + "/output/part-r-00000").exists() === true)
+ }
+
test ("save Hadoop Dataset through old Hadoop API") {
sc = new SparkContext("local", "test")
val randomRDD = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
diff --git a/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala b/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala
index 29d428aa7dc41..47df00050c1e2 100644
--- a/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala
+++ b/core/src/test/scala/org/apache/spark/ShuffleNettySuite.scala
@@ -23,11 +23,11 @@ class ShuffleNettySuite extends ShuffleSuite with BeforeAndAfterAll {
// This test suite should run all tests in ShuffleSuite with Netty shuffle mode.
- override def beforeAll(configMap: Map[String, Any]) {
+ override def beforeAll() {
System.setProperty("spark.shuffle.use.netty", "true")
}
- override def afterAll(configMap: Map[String, Any]) {
+ override def afterAll() {
System.setProperty("spark.shuffle.use.netty", "false")
}
}
diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
index 1230565ea5b7e..9ddafc451878d 100644
--- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala
@@ -119,28 +119,30 @@ class PairRDDFunctionsSuite extends FunSuite with SharedSparkContext {
* relatively tight error bounds to check correctness of functionality rather than checking
* whether the approximation conforms with the requested bound.
*/
- val relativeSD = 0.001
+ val p = 20
+ val sp = 0
+ // When p = 20, the relative accuracy is about 0.001. So with high probability, the
+ // relative error should be smaller than the threshold 0.01 we use here.
+ val relativeSD = 0.01
// For each value i, there are i tuples with first element equal to i.
// Therefore, the expected count for key i would be i.
val stacked = (1 to 100).flatMap(i => (1 to i).map(j => (i, j)))
val rdd1 = sc.parallelize(stacked)
- val counted1 = rdd1.countApproxDistinctByKey(relativeSD).collect()
- counted1.foreach{
- case(k, count) => assert(error(count, k) < relativeSD)
- }
+ val counted1 = rdd1.countApproxDistinctByKey(p, sp).collect()
+ counted1.foreach { case (k, count) => assert(error(count, k) < relativeSD) }
- val rnd = new Random()
+ val rnd = new Random(42)
// The expected count for key num would be num
val randStacked = (1 to 100).flatMap { i =>
- val num = rnd.nextInt % 500
+ val num = rnd.nextInt() % 500
(1 to num).map(j => (num, j))
}
val rdd2 = sc.parallelize(randStacked)
- val counted2 = rdd2.countApproxDistinctByKey(relativeSD, 4).collect()
- counted2.foreach{
- case(k, count) => assert(error(count, k) < relativeSD)
+ val counted2 = rdd2.countApproxDistinctByKey(relativeSD).collect()
+ counted2.foreach { case (k, count) =>
+ assert(error(count, k) < relativeSD, s"${error(count, k)} < $relativeSD")
}
}
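
The `p` / `sp` parameters above control the HyperLogLog++ precision, and the "about 0.001 for p = 20" figure in the comment follows from the textbook HyperLogLog error bound of roughly 1.04 / sqrt(2^p). A quick back-of-the-envelope check (the 1.04 constant is quoted only as an approximation, not as Spark's exact implementation detail):

```scala
// Approximate relative standard deviation of a HyperLogLog sketch with 2^p registers
// (standard bound ~1.04 / sqrt(m), where m = 2^p).
def approxRelativeSD(p: Int): Double = 1.04 / math.sqrt(math.pow(2, p))

println(approxRelativeSD(20))  // ~0.001, well under the 0.01 threshold asserted above
println(approxRelativeSD(8))   // ~0.065
println(approxRelativeSD(4))   // ~0.26
```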
diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
index e686068f7a99a..55af1666df662 100644
--- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
+++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala
@@ -73,10 +73,8 @@ class RDDSuite extends FunSuite with SharedSparkContext {
val size = 100
val uniformDistro = for (i <- 1 to 100000) yield i % size
val simpleRdd = sc.makeRDD(uniformDistro)
- assert(error(simpleRdd.countApproxDistinct(0.2), size) < 0.2)
- assert(error(simpleRdd.countApproxDistinct(0.05), size) < 0.05)
- assert(error(simpleRdd.countApproxDistinct(0.01), size) < 0.01)
- assert(error(simpleRdd.countApproxDistinct(0.001), size) < 0.001)
+ assert(error(simpleRdd.countApproxDistinct(4, 0), size) < 0.4)
+ assert(error(simpleRdd.countApproxDistinct(8, 0), size) < 0.1)
}
test("SparkContext.union") {
@@ -268,8 +266,9 @@ class RDDSuite extends FunSuite with SharedSparkContext {
// we can optionally shuffle to keep the upstream parallel
val coalesced5 = data.coalesce(1, shuffle = true)
- assert(coalesced5.dependencies.head.rdd.dependencies.head.rdd.asInstanceOf[ShuffledRDD[_, _, _]] !=
- null)
+ val isEquals = coalesced5.dependencies.head.rdd.dependencies.head.rdd.
+ asInstanceOf[ShuffledRDD[_, _, _]] != null
+ assert(isEquals)
// when shuffling, we can increase the number of partitions
val coalesced6 = data.coalesce(20, shuffle = true)
@@ -352,6 +351,10 @@ class RDDSuite extends FunSuite with SharedSparkContext {
intercept[IllegalArgumentException] {
nums.zip(sc.parallelize(1 to 4, 1)).collect()
}
+
+ intercept[SparkException] {
+ nums.zip(sc.parallelize(1 to 5, 2)).collect()
+ }
}
test("partition pruning") {
diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
index 81e64c1846ed5..7506d56d7e26d 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala
@@ -23,7 +23,7 @@ import scala.language.reflectiveCalls
import akka.actor._
import akka.testkit.{ImplicitSender, TestKit, TestActorRef}
-import org.scalatest.{BeforeAndAfter, FunSuite}
+import org.scalatest.{BeforeAndAfter, FunSuiteLike}
import org.apache.spark._
import org.apache.spark.rdd.RDD
@@ -37,7 +37,7 @@ class BuggyDAGEventProcessActor extends Actor {
}
}
-class DAGSchedulerSuite extends TestKit(ActorSystem("DAGSchedulerSuite")) with FunSuite
+class DAGSchedulerSuite extends TestKit(ActorSystem("DAGSchedulerSuite")) with FunSuiteLike
with ImplicitSender with BeforeAndAfter with LocalSparkContext {
val conf = new SparkConf
diff --git a/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala b/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala
index 6a5653ed2fb54..c1c605cdb487b 100644
--- a/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/TimeStampedHashMapSuite.scala
@@ -105,7 +105,8 @@ class TimeStampedHashMapSuite extends FunSuite {
map("k1") = strongRef
map("k2") = "v2"
map("k3") = "v3"
- assert(map("k1") === strongRef)
+ val isEquals = map("k1") == strongRef
+ assert(isEquals)
// clear strong reference to "k1"
strongRef = null
diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
index cdebefb67510c..deb780953579d 100644
--- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala
@@ -277,6 +277,11 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
("pomatoes", "eructation") // 568647356
)
+ collisionPairs.foreach { case (w1, w2) =>
+ // String.hashCode is documented to use a specific algorithm, but check just in case
+ assert(w1.hashCode === w2.hashCode)
+ }
+
(1 to 100000).map(_.toString).foreach { i => map.insert(i, i) }
collisionPairs.foreach { case (w1, w2) =>
map.insert(w1, w2)
@@ -296,7 +301,32 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
assert(kv._2.equals(expectedValue))
count += 1
}
- assert(count == 100000 + collisionPairs.size * 2)
+ assert(count === 100000 + collisionPairs.size * 2)
+ }
+
+ test("spilling with many hash collisions") {
+ val conf = new SparkConf(true)
+ conf.set("spark.shuffle.memoryFraction", "0.0001")
+ sc = new SparkContext("local-cluster[1,1,512]", "test", conf)
+
+ val map = new ExternalAppendOnlyMap[FixedHashObject, Int, Int](_ => 1, _ + _, _ + _)
+
+ // Insert 10 copies each of lots of objects whose hash codes are either 0 or 1. This causes
+ // problems if the map fails to group together the objects with the same code (SPARK-2043).
+ for (i <- 1 to 10) {
+ for (j <- 1 to 10000) {
+ map.insert(FixedHashObject(j, j % 2), 1)
+ }
+ }
+
+ val it = map.iterator
+ var count = 0
+ while (it.hasNext) {
+ val kv = it.next()
+ assert(kv._2 === 10)
+ count += 1
+ }
+ assert(count === 10000)
}
test("spilling with hash collisions using the Int.MaxValue key") {
@@ -317,3 +347,10 @@ class ExternalAppendOnlyMapSuite extends FunSuite with LocalSparkContext {
}
}
}
+
+/**
+ * A dummy class that always returns the same hash code, to easily test hash collisions
+ */
+case class FixedHashObject(val v: Int, val h: Int) extends Serializable {
+ override def hashCode(): Int = h
+}
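
To see why the `FixedHashObject` helper above exercises SPARK-2043: distinct keys can share a hash code, so the merge step cannot rely on hash codes alone. A short sketch of the collision it manufactures (the values are arbitrary):

```scala
case class FixedHashObject(v: Int, h: Int) extends Serializable {
  override def hashCode(): Int = h
}

val a = FixedHashObject(1, 0)
val b = FixedHashObject(2, 0)
assert(a.hashCode == b.hashCode)  // same bucket in any hash-based structure
assert(a != b)                    // yet the keys are distinct and must not be merged
```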
diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py
index 7f744d5589ef7..ffb70096d6014 100755
--- a/dev/merge_spark_pr.py
+++ b/dev/merge_spark_pr.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one or more
@@ -128,8 +128,9 @@ def merge_pr(pr_num, target_ref):
merge_message_flags = []
- for p in [title, body]:
- merge_message_flags += ["-m", p]
+ merge_message_flags += ["-m", title]
+ if body is not None:
+ merge_message_flags += ["-m", body]
authors = "\n".join(["Author: %s" % a for a in distinct_authors])
diff --git a/dev/mima b/dev/mima
index d4099990254cc..ab6bd4469b0e8 100755
--- a/dev/mima
+++ b/dev/mima
@@ -31,4 +31,5 @@ if [ $ret_val != 0 ]; then
echo "NOTE: Exceptions to binary compatibility can be added in project/MimaExcludes.scala"
fi
+rm -f .generated-mima-excludes
exit $ret_val
diff --git a/docs/configuration.md b/docs/configuration.md
index 0697f7fc2fd91..71fafa573467f 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -487,6 +487,14 @@ Apart from these, the following properties are also available, and may be useful
this duration will be cleared as well.
+<tr>
+  <td><code>spark.hadoop.validateOutputSpecs</code></td>
+  <td>true</td>
+  <td>If set to true, validates the output specification (e.g. checking if the output directory already exists)
+    used in saveAsHadoopFile and other variants. This can be disabled to silence exceptions due to pre-existing
+    output directories. We recommend that users do not disable this except if trying to achieve compatibility with
+    previous versions of Spark. Simply use Hadoop's FileSystem API to delete output directories by hand.</td>
+</tr>
#### Networking
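
The new `spark.hadoop.validateOutputSpecs` property documented above is set through `SparkConf` like any other `spark.hadoop.*` option. The sketch below shows both the opt-out and the recommended manual-delete alternative mentioned in the doc text; the app name and paths are hypothetical:

```scala
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("validate-output-specs-example")       // hypothetical app name
  .setMaster("local")
  .set("spark.hadoop.validateOutputSpecs", "false")  // skip the existence check
val sc = new SparkContext(conf)

// Recommended alternative: keep validation on and delete the directory explicitly.
val output = new Path("/tmp/example-output")         // hypothetical path
val fs = FileSystem.get(sc.hadoopConfiguration)
if (fs.exists(output)) {
  fs.delete(output, true)                            // recursive delete
}
sc.parallelize(Seq((1, "a"), (2, "b"))).saveAsTextFile(output.toString)
```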
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 8b056f5ea734c..9d5748ba4bc23 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -83,7 +83,7 @@ def parse_args():
"between zones applies)")
parser.add_option("-a", "--ami", help="Amazon Machine Image ID to use")
parser.add_option(
- "-v", "--spark-version", default="0.9.1",
+ "-v", "--spark-version", default="1.0.0",
help="Version of Spark to use: 'X.Y.Z' or a specific git hash")
parser.add_option(
"--spark-git-repo",
@@ -191,7 +191,8 @@ def is_active(instance):
# Return correct versions of Spark and Shark, given the supplied Spark version
def get_spark_shark_version(opts):
spark_shark_map = {
- "0.7.3": "0.7.1", "0.8.0": "0.8.0", "0.8.1": "0.8.1", "0.9.0": "0.9.0", "0.9.1": "0.9.1"
+ "0.7.3": "0.7.1", "0.8.0": "0.8.0", "0.8.1": "0.8.1", "0.9.0": "0.9.0", "0.9.1": "0.9.1",
+ "1.0.0": "1.0.0"
}
version = opts.spark_version.replace("v", "")
if version not in spark_shark_map:
@@ -199,7 +200,6 @@ def get_spark_shark_version(opts):
sys.exit(1)
return (version, spark_shark_map[version])
-
# Attempt to resolve an appropriate AMI given the architecture and
# region of the request.
def get_spark_ami(opts):
@@ -230,7 +230,12 @@ def get_spark_ami(opts):
"c3.xlarge": "pvm",
"c3.2xlarge": "pvm",
"c3.4xlarge": "pvm",
- "c3.8xlarge": "pvm"
+ "c3.8xlarge": "pvm",
+ "r3.large": "hvm",
+ "r3.xlarge": "hvm",
+ "r3.2xlarge": "hvm",
+ "r3.4xlarge": "hvm",
+ "r3.8xlarge": "hvm"
}
if opts.instance_type in instance_types:
instance_type = instance_types[opts.instance_type]
@@ -538,7 +543,12 @@ def get_num_disks(instance_type):
"c3.xlarge": 2,
"c3.2xlarge": 2,
"c3.4xlarge": 2,
- "c3.8xlarge": 2
+ "c3.8xlarge": 2,
+ "r3.large": 1,
+ "r3.xlarge": 1,
+ "r3.2xlarge": 1,
+ "r3.4xlarge": 1,
+ "r3.8xlarge": 2
}
if instance_type in disks_by_instance:
return disks_by_instance[instance_type]
diff --git a/examples/pom.xml b/examples/pom.xml
index 874bcd7916f35..4f6d7fdb87d47 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -21,7 +21,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
    <relativePath>../pom.xml</relativePath>
diff --git a/examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala b/examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala
index b97cb8fb02823..e06f4dcd54442 100644
--- a/examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/bagel/PageRankUtils.scala
@@ -124,4 +124,6 @@ class CustomPartitioner(partitions: Int) extends Partitioner {
c.numPartitions == numPartitions
case _ => false
}
+
+ override def hashCode: Int = numPartitions
}
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
index 9832bec90d7ee..b3cc361154198 100644
--- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala
@@ -99,7 +99,7 @@ object DecisionTreeRunner {
val sc = new SparkContext(conf)
// Load training data and cache it.
- val examples = MLUtils.loadLabeledData(sc, params.input).cache()
+ val examples = MLUtils.loadLabeledPoints(sc, params.input).cache()
val splits = examples.randomSplit(Array(0.8, 0.2))
val training = splits(0).cache()
diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index 6aec215687fe0..c1f581967777b 100644
--- a/external/flume/pom.xml
+++ b/external/flume/pom.xml
@@ -21,7 +21,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml
index 979eb0ca624bd..d014a7aad0fca 100644
--- a/external/kafka/pom.xml
+++ b/external/kafka/pom.xml
@@ -21,7 +21,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml
index 7b2dc5ba1d7f9..4980208cba3b0 100644
--- a/external/mqtt/pom.xml
+++ b/external/mqtt/pom.xml
@@ -21,7 +21,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml
index 5766d3a0d44ec..7073bd4404d9c 100644
--- a/external/twitter/pom.xml
+++ b/external/twitter/pom.xml
@@ -21,7 +21,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml
index 4ed4196bd8662..cf306e0dca8bd 100644
--- a/external/zeromq/pom.xml
+++ b/external/zeromq/pom.xml
@@ -21,7 +21,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/extras/java8-tests/pom.xml b/extras/java8-tests/pom.xml
index 602f66f9c5cf1..955ec1a8c3033 100644
--- a/extras/java8-tests/pom.xml
+++ b/extras/java8-tests/pom.xml
@@ -20,7 +20,7 @@
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-parent</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <version>1.1.0-SNAPSHOT</version>
    <relativePath>../../pom.xml</relativePath>
diff --git a/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java b/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
index c366c10b15a20..729bc0459ce52 100644
--- a/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
+++ b/extras/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
@@ -99,16 +99,16 @@ public void groupBy() {
@Test
public void leftOuterJoin() {
JavaPairRDD rdd1 = sc.parallelizePairs(Arrays.asList(
- new Tuple2(1, 1),
- new Tuple2(1, 2),
- new Tuple2(2, 1),
- new Tuple2(3, 1)
+ new Tuple2<>(1, 1),
+ new Tuple2<>(1, 2),
+ new Tuple2<>(2, 1),
+ new Tuple2<>(3, 1)
));
JavaPairRDD rdd2 = sc.parallelizePairs(Arrays.asList(
- new Tuple2(1, 'x'),
- new Tuple2(2, 'y'),
- new Tuple2(2, 'z'),
- new Tuple2(4, 'w')
+ new Tuple2<>(1, 'x'),
+ new Tuple2<>(2, 'y'),
+ new Tuple2<>(2, 'z'),
+ new Tuple2<>(4, 'w')
));
List>>> joined =
rdd1.leftOuterJoin(rdd2).collect();
@@ -133,11 +133,11 @@ public void foldReduce() {
@Test
public void foldByKey() {
List> pairs = Arrays.asList(
- new Tuple2(2, 1),
- new Tuple2(2, 1),
- new Tuple2(1, 1),
- new Tuple2(3, 2),
- new Tuple2(3, 1)
+ new Tuple2<>(2, 1),
+ new Tuple2<>(2, 1),
+ new Tuple2<>(1, 1),
+ new Tuple2<>(3, 2),
+ new Tuple2<>(3, 1)
);
JavaPairRDD rdd = sc.parallelizePairs(pairs);
JavaPairRDD sums = rdd.foldByKey(0, (a, b) -> a + b);
@@ -149,11 +149,11 @@ public void foldByKey() {
@Test
public void reduceByKey() {
List> pairs = Arrays.asList(
- new Tuple2(2, 1),
- new Tuple2(2, 1),
- new Tuple2(1, 1),
- new Tuple2(3, 2),
- new Tuple2(3, 1)
+ new Tuple2<>(2, 1),
+ new Tuple2<>(2, 1),
+ new Tuple2<>(1, 1),
+ new Tuple2<>(3, 2),
+ new Tuple2<>(3, 1)
);
JavaPairRDD rdd = sc.parallelizePairs(pairs);
JavaPairRDD counts = rdd.reduceByKey((a, b) -> a + b);
@@ -177,7 +177,7 @@ public void map() {
JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
doubles.collect();
- JavaPairRDD pairs = rdd.mapToPair(x -> new Tuple2(x, x))
+ JavaPairRDD pairs = rdd.mapToPair(x -> new Tuple2<>(x, x))
.cache();
pairs.collect();
JavaRDD strings = rdd.map(x -> x.toString()).cache();
@@ -194,31 +194,31 @@ public void flatMap() {
Assert.assertEquals(11, words.count());
JavaPairRDD pairs = rdd.flatMapToPair(s -> {
- List> pairs2 = new LinkedList>();
- for (String word : s.split(" ")) pairs2.add(new Tuple2(word, word));
+ List> pairs2 = new LinkedList<>();
+ for (String word : s.split(" ")) pairs2.add(new Tuple2<>(word, word));
return pairs2;
});
- Assert.assertEquals(new Tuple2("Hello", "Hello"), pairs.first());
+ Assert.assertEquals(new Tuple2<>("Hello", "Hello"), pairs.first());
Assert.assertEquals(11, pairs.count());
JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> {
- List lengths = new LinkedList();
+ List lengths = new LinkedList<>();
for (String word : s.split(" ")) lengths.add(word.length() * 1.0);
return lengths;
});
Double x = doubles.first();
- Assert.assertEquals(5.0, doubles.first().doubleValue(), 0.01);
+ Assert.assertEquals(5.0, doubles.first(), 0.01);
Assert.assertEquals(11, pairs.count());
}
@Test
public void mapsFromPairsToPairs() {
List> pairs = Arrays.asList(
- new Tuple2(1, "a"),
- new Tuple2(2, "aa"),
- new Tuple2(3, "aaa")
+ new Tuple2<>(1, "a"),
+ new Tuple2<>(2, "aa"),
+ new Tuple2<>(3, "aaa")
);
JavaPairRDD pairRDD = sc.parallelizePairs(pairs);
@@ -251,19 +251,18 @@ public void sequenceFile() {
tempDir.deleteOnExit();
String outputDir = new File(tempDir, "output").getAbsolutePath();
List> pairs = Arrays.asList(
- new Tuple2(1, "a"),
- new Tuple2(2, "aa"),
- new Tuple2(3, "aaa")
+ new Tuple2<>(1, "a"),
+ new Tuple2<>(2, "aa"),
+ new Tuple2<>(3, "aaa")
);
JavaPairRDD rdd = sc.parallelizePairs(pairs);
- rdd.mapToPair(pair ->
- new Tuple2(new IntWritable(pair._1()), new Text(pair._2())))
+ rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
.saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
// Try reading the output back as an object file
JavaPairRDD readRDD = sc.sequenceFile(outputDir, IntWritable.class, Text.class)
- .mapToPair(pair -> new Tuple2(pair._1().get(), pair._2().toString()));
+ .mapToPair(pair -> new Tuple2<>(pair._1().get(), pair._2().toString()));
Assert.assertEquals(pairs, readRDD.collect());
Utils.deleteRecursively(tempDir);
}
@@ -325,7 +324,7 @@ public Float zero(Float initialValue) {
}
};
- final Accumulator floatAccum = sc.accumulator((Float) 10.0f, floatAccumulatorParam);
+ final Accumulator floatAccum = sc.accumulator(10.0f, floatAccumulatorParam);
rdd.foreach(x -> floatAccum.add((float) x));
Assert.assertEquals((Float) 25.0f, floatAccum.value());
@@ -338,22 +337,22 @@ public Float zero(Float initialValue) {
public void keyBy() {
JavaRDD rdd = sc.parallelize(Arrays.asList(1, 2));
List> s = rdd.keyBy(x -> x.toString()).collect();
- Assert.assertEquals(new Tuple2("1", 1), s.get(0));
- Assert.assertEquals(new Tuple2("2", 2), s.get(1));
+ Assert.assertEquals(new Tuple2<>("1", 1), s.get(0));
+ Assert.assertEquals(new Tuple2<>("2", 2), s.get(1));
}
@Test
public void mapOnPairRDD() {
JavaRDD rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4));
JavaPairRDD rdd2 =
- rdd1.mapToPair(i -> new Tuple2(i, i % 2));
+ rdd1.mapToPair(i -> new Tuple2<>(i, i % 2));
JavaPairRDD rdd3 =
- rdd2.mapToPair(in -> new Tuple2(in._2(), in._1()));
+ rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1()));
Assert.assertEquals(Arrays.asList(
new Tuple2(1, 1),
- new Tuple2(0, 2),
- new Tuple2(1, 3),
- new Tuple2(0, 4)), rdd3.collect());
+ new Tuple2<>(0, 2),
+ new Tuple2<>(1, 3),
+ new Tuple2<>(0, 4)), rdd3.collect());
}
@Test
@@ -361,7 +360,7 @@ public void collectPartitions() {
JavaRDD rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7), 3);
JavaPairRDD rdd2 =
- rdd1.mapToPair(i -> new Tuple2(i, i % 2));
+ rdd1.mapToPair(i -> new Tuple2<>(i, i % 2));
List[] parts = rdd1.collectPartitions(new int[]{0});
Assert.assertEquals(Arrays.asList(1, 2), parts[0]);
@@ -369,16 +368,13 @@ public void collectPartitions() {
Assert.assertEquals(Arrays.asList(3, 4), parts[0]);
Assert.assertEquals(Arrays.asList(5, 6, 7), parts[1]);
- Assert.assertEquals(Arrays.asList(new Tuple2(1, 1),
- new Tuple2(2, 0)),
+ Assert.assertEquals(Arrays.asList(new Tuple2<>(1, 1), new Tuple2<>(2, 0)),
rdd2.collectPartitions(new int[]{0})[0]);
parts = rdd2.collectPartitions(new int[]{1, 2});
- Assert.assertEquals(Arrays.asList(new Tuple2(3, 1),
- new Tuple2(4, 0)), parts[0]);
- Assert.assertEquals(Arrays.asList(new Tuple2(5, 1),
- new Tuple2(6, 0),
- new Tuple2(7, 1)), parts[1]);
+ Assert.assertEquals(Arrays.asList(new Tuple2<>(3, 1), new Tuple2<>(4, 0)), parts[0]);
+ Assert.assertEquals(Arrays.asList(new Tuple2<>(5, 1), new Tuple2<>(6, 0), new Tuple2<>(7, 1)),
+ parts[1]);
}
@Test
@@ -386,7 +382,7 @@ public void collectAsMapWithIntArrayValues() {
// Regression test for SPARK-1040
JavaRDD rdd = sc.parallelize(Arrays.asList(new Integer[]{1}));
JavaPairRDD pairRDD =
- rdd.mapToPair(x -> new Tuple2(x, new int[]{x}));
+ rdd.mapToPair(x -> new Tuple2<>(x, new int[]{x}));
pairRDD.collect(); // Works fine
Map map = pairRDD.collectAsMap(); // Used to crash with ClassCastException
}
diff --git a/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java b/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java
index 43df0dea614bc..73091cfe2c09e 100644
--- a/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java
+++ b/extras/java8-tests/src/test/java/org/apache/spark/streaming/Java8APISuite.java
@@ -39,6 +39,7 @@
* Most of these tests replicate org.apache.spark.streaming.JavaAPISuite using java 8
* lambda syntax.
*/
+@SuppressWarnings("unchecked")
public class Java8APISuite extends LocalJavaStreamingContext implements Serializable {
@Test
@@ -52,7 +53,7 @@ public void testMap() {
Arrays.asList(9, 4));
JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
- JavaDStream letterCount = stream.map(s -> s.length());
+ JavaDStream letterCount = stream.map(String::length);
JavaTestUtils.attachTestOutputStream(letterCount);
List> result = JavaTestUtils.runStreams(ssc, 2, 2);
@@ -63,7 +64,7 @@ public void testMap() {
public void testFilter() {
List> inputData = Arrays.asList(
Arrays.asList("giants", "dodgers"),
- Arrays.asList("yankees", "red socks"));
+ Arrays.asList("yankees", "red sox"));
List> expected = Arrays.asList(
Arrays.asList("giants"),
@@ -81,11 +82,11 @@ public void testFilter() {
public void testMapPartitions() {
List> inputData = Arrays.asList(
Arrays.asList("giants", "dodgers"),
- Arrays.asList("yankees", "red socks"));
+ Arrays.asList("yankees", "red sox"));
List> expected = Arrays.asList(
Arrays.asList("GIANTSDODGERS"),
- Arrays.asList("YANKEESRED SOCKS"));
+ Arrays.asList("YANKEESRED SOX"));
JavaDStream stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
JavaDStream