diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 5cf719bd65ae4..d18ef59a53c99 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -322,7 +322,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
- columnNameOfCorruptRecord=None, wholeFile=None):
+ columnNameOfCorruptRecord=None, multiLine=None):
"""Loads a CSV file and returns the result as a :class:`DataFrame`.
This function will go through the input once to determine the input schema if
@@ -396,7 +396,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
``spark.sql.columnNameOfCorruptRecord``. If None is set,
it uses the value specified in
``spark.sql.columnNameOfCorruptRecord``.
- :param wholeFile: parse records, which may span multiple lines. If None is
+ :param multiLine: parse records, which may span multiple lines. If None is
set, it uses the default value, ``false``.

>>> df = spark.read.csv('python/test_support/sql/ages.csv')
@@ -411,7 +411,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns,
maxCharsPerColumn=maxCharsPerColumn,
maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,
- columnNameOfCorruptRecord=columnNameOfCorruptRecord, wholeFile=wholeFile)
+ columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine)
if isinstance(path, basestring):
path = [path]
return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
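
A minimal usage sketch of the renamed keyword in the batch reader (assuming an
active SparkSession bound to `spark`; the fixture path is the one the test
suite uses):

    # The keyword is now multiLine; after this patch, wholeFile=True raises
    # TypeError because the parameter no longer exists.
    df = spark.read.csv("python/test_support/sql/ages_newlines.csv",
                        multiLine=True)
    df.show(truncate=False)
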
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 76e8c4f47d8ad..0a5adb4e8925a 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -532,7 +532,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
- columnNameOfCorruptRecord=None, wholeFile=None):
+ columnNameOfCorruptRecord=None, multiLine=None):
"""Loads a CSV file stream and returns the result as a :class:`DataFrame`.
This function will go through the input once to determine the input schema if
@@ -607,7 +607,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
``spark.sql.columnNameOfCorruptRecord``. If None is set,
it uses the value specified in
``spark.sql.columnNameOfCorruptRecord``.
- :param wholeFile: parse one record, which may span multiple lines. If None is
+ :param multiLine: parse one record, which may span multiple lines. If None is
set, it uses the default value, ``false``.

>>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)
@@ -624,7 +624,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns,
maxCharsPerColumn=maxCharsPerColumn,
maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,
- columnNameOfCorruptRecord=columnNameOfCorruptRecord, wholeFile=wholeFile)
+ columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine)
if isinstance(path, basestring):
return self._df(self._jreader.csv(path))
else:
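
The streaming reader takes the same keyword. A sketch, assuming the same
`spark` session; streaming CSV sources require an explicit schema, and the
input directory here is a placeholder:

    from pyspark.sql.types import StructType, StringType

    schema = StructType().add("name", StringType()).add("bio", StringType())
    # multiLine is applied per input file; quoted fields may contain newlines.
    csv_sdf = spark.readStream.csv("/tmp/csv_input", schema=schema,
                                   multiLine=True)
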
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 845e1c7619cc4..d00be7dd0b398 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -463,9 +463,9 @@ def test_wholefile_json(self):
wholeFile=True)
self.assertEqual(people1.collect(), people_array.collect())

- def test_wholefile_csv(self):
+ def test_multiline_csv(self):
ages_newlines = self.spark.read.csv(
- "python/test_support/sql/ages_newlines.csv", wholeFile=True)
+ "python/test_support/sql/ages_newlines.csv", multiLine=True)
expected = [Row(_c0=u'Joe', _c1=u'20', _c2=u'Hi,\nI am Jeo'),
Row(_c0=u'Tom', _c1=u'30', _c2=u'My name is Tom'),
Row(_c0=u'Hyukjin', _c1=u'25', _c2=u'I am Hyukjin\n\nI love Spark!')]
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 0f96e82cedf4e..89a5b1c632bfe 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -537,7 +537,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
*
* `columnNameOfCorruptRecord` (default is the value specified in
* `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
* created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.
- * `wholeFile` (default `false`): parse one record, which may span multiple lines.
+ * `multiLine` (default `false`): parse one record, which may span multiple lines.
*
* @since 2.0.0
*/
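
The string-keyed `.option(...)` form documented above behaves the same from
Python; note that after the rename the old key is silently ignored rather than
rejected, since CSVOptions only reads the keys it knows. A sketch:

    df = (spark.read
          .option("multiLine", True)   # was .option("wholeFile", True)
          .csv("python/test_support/sql/ages_newlines.csv"))
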
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 76f121c0c955f..bb30138737a43 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -111,7 +111,7 @@ abstract class CSVDataSource extends Serializable {
object CSVDataSource {
def apply(options: CSVOptions): CSVDataSource = {
- if (options.wholeFile) {
+ if (options.multiLine) {
WholeFileCSVDataSource
} else {
TextInputCSVDataSource
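
This dispatch is the entire behavioral switch: TextInputCSVDataSource
line-splits the input before parsing, so a quoted field containing a newline
breaks into two records, while the whole-file path feeds the parser unsplit
input. A sketch of the contrast (placeholder path; assumes local mode so a
driver-written file is readable):

    path = "/tmp/multiline_demo.csv"  # placeholder
    with open(path, "w") as f:
        f.write('Joe,20,"Hi,\nI am Joe"\n')

    spark.read.csv(path).count()                  # 2: split at the embedded newline
    spark.read.csv(path, multiLine=True).count()  # 1: quoted newline preserved
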
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 78c16b75ee684..a13a5a34b4a84 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -128,7 +128,7 @@ class CSVOptions(
FastDateFormat.getInstance(
parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), timeZone, Locale.US)
- val wholeFile = parameters.get("wholeFile").map(_.toBoolean).getOrElse(false)
+ val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)
val maxColumns = getInt("maxColumns", 20480)
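
Because the value goes through `_.toBoolean`, string forms work as well as
typed booleans, which matters when options arrive from an untyped config map;
a non-boolean string fails at read time with IllegalArgumentException. A
sketch from the Python side:

    path = "python/test_support/sql/ages_newlines.csv"
    spark.read.option("multiLine", "true").csv(path)  # string value
    spark.read.options(multiLine="true").csv(path)    # via options(**kwargs)
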
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index 766776230257d..7ee7b75f0efc9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -276,7 +276,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
* `columnNameOfCorruptRecord` (default is the value specified in
* `spark.sql.columnNameOfCorruptRecord`): allows renaming the new field having malformed string
* created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.
- * `wholeFile` (default `false`): parse one record, which may span multiple lines.
+ * `multiLine` (default `false`): parse one record, which may span multiple lines.
*
* @since 2.0.0
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 352dba79a4c08..89d9b69dec7ef 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -261,10 +261,10 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}

test("test for DROPMALFORMED parsing mode") {
- Seq(false, true).foreach { wholeFile =>
+ Seq(false, true).foreach { multiLine =>
val cars = spark.read
.format("csv")
- .option("wholeFile", wholeFile)
+ .option("multiLine", multiLine)
.options(Map("header" -> "true", "mode" -> "dropmalformed"))
.load(testFile(carsFile))
@@ -284,11 +284,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}

test("test for FAILFAST parsing mode") {
- Seq(false, true).foreach { wholeFile =>
+ Seq(false, true).foreach { multiLine =>
val exception = intercept[SparkException] {
spark.read
.format("csv")
- .option("wholeFile", wholeFile)
+ .option("multiLine", multiLine)
.options(Map("header" -> "true", "mode" -> "failfast"))
.load(testFile(carsFile)).collect()
}
@@ -990,13 +990,13 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}

test("SPARK-18699 put malformed records in a `columnNameOfCorruptRecord` field") {
- Seq(false, true).foreach { wholeFile =>
+ Seq(false, true).foreach { multiLine =>
val schema = new StructType().add("a", IntegerType).add("b", TimestampType)
// We use `PERMISSIVE` mode by default if invalid string is given.
val df1 = spark
.read
.option("mode", "abcd")
- .option("wholeFile", wholeFile)
+ .option("multiLine", multiLine)
.schema(schema)
.csv(testFile(valueMalformedFile))
checkAnswer(df1,
@@ -1011,7 +1011,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
.read
.option("mode", "Permissive")
.option("columnNameOfCorruptRecord", columnNameOfCorruptRecord)
- .option("wholeFile", wholeFile)
+ .option("multiLine", multiLine)
.schema(schemaWithCorrField1)
.csv(testFile(valueMalformedFile))
checkAnswer(df2,
@@ -1028,7 +1028,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
.read
.option("mode", "permissive")
.option("columnNameOfCorruptRecord", columnNameOfCorruptRecord)
- .option("wholeFile", wholeFile)
+ .option("multiLine", multiLine)
.schema(schemaWithCorrField2)
.csv(testFile(valueMalformedFile))
checkAnswer(df3,
@@ -1041,7 +1041,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
.read
.option("mode", "PERMISSIVE")
.option("columnNameOfCorruptRecord", columnNameOfCorruptRecord)
- .option("wholeFile", wholeFile)
+ .option("multiLine", multiLine)
.schema(schema.add(columnNameOfCorruptRecord, IntegerType))
.csv(testFile(valueMalformedFile))
.collect
@@ -1073,7 +1073,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
val df = spark.read
.option("header", true)
- .option("wholeFile", true)
+ .option("multiLine", true)
.csv(path.getAbsolutePath)

// Check if headers have new lines in the names.
@@ -1096,10 +1096,10 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
}

test("Empty file produces empty dataframe with empty schema") {
- Seq(false, true).foreach { wholeFile =>
+ Seq(false, true).foreach { multiLine =>
val df = spark.read.format("csv")
.option("header", true)
- .option("wholeFile", wholeFile)
+ .option("multiLine", multiLine)
.load(testFile(emptyFile))

assert(df.schema === spark.emptyDataFrame.schema)
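
To close the loop, the SPARK-18699 scenario above can be reproduced from
Python under both code paths; the input path and the corrupt-record column
name are placeholders:

    from pyspark.sql.types import (StructType, IntegerType, TimestampType,
                                   StringType)

    schema = (StructType()
              .add("a", IntegerType())
              .add("b", TimestampType())
              .add("_corrupt_record", StringType()))
    for multi_line in (False, True):
        df = (spark.read
              .option("mode", "PERMISSIVE")
              .option("columnNameOfCorruptRecord", "_corrupt_record")
              .option("multiLine", multi_line)
              .schema(schema)
              .csv("/tmp/value_malformed.csv"))  # placeholder input
        df.show(truncate=False)
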