-
Notifications
You must be signed in to change notification settings - Fork 28.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-14432][SQL] Add API to calculate the approximate quantiles for multiple columns #12207
Changes from all commits
a8f1b33
47d52b9
75edcb1
619660d
b64bd4e
89d4d3e
4309001
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -702,6 +702,15 @@ def test_approxQuantile(self): | |
self.assertEqual(len(aq), 3) | ||
self.assertTrue(all(isinstance(q, float) for q in aq)) | ||
|
||
aqs = df.stat.approxQuantile(["a", "a"], [0.1, 0.5, 0.9], 0.1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shall we add an assert that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added. |
||
self.assertEqual(len(aqs), 2) | ||
self.assertTrue(isinstance(aqs[0], list)) | ||
self.assertEqual(len(aqs[0]), 3) | ||
self.assertTrue(all(isinstance(q, float) for q in aqs[0])) | ||
self.assertTrue(isinstance(aqs[1], list)) | ||
self.assertEqual(len(aqs[1]), 3) | ||
self.assertTrue(all(isinstance(q, float) for q in aqs[1])) | ||
|
||
def test_corr(self): | ||
import math | ||
df = self.sc.parallelize([Row(a=i, b=math.sqrt(i)) for i in range(10)]).toDF() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,14 +52,14 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { | |
* The algorithm was first presented in [[http://dx.doi.org/10.1145/375663.375670 Space-efficient | ||
* Online Computation of Quantile Summaries]] by Greenwald and Khanna. | ||
* | ||
* @param col the name of the numerical column | ||
* @param col the name of the numerical column. | ||
* @param probabilities a list of quantile probabilities | ||
* Each number must belong to [0, 1]. | ||
* For example 0 is the minimum, 0.5 is the median, 1 is the maximum. | ||
* @param relativeError The relative target precision to achieve (>= 0). | ||
* If set to zero, the exact quantiles are computed, which could be very expensive. | ||
* Note that values greater than 1 are accepted but give the same result as 1. | ||
* @return the approximate quantiles at the given probabilities | ||
* @return the approximate quantiles at the given probabilities. | ||
* | ||
* @since 2.0.0 | ||
*/ | ||
|
@@ -70,6 +70,29 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { | |
StatFunctions.multipleApproxQuantiles(df, Seq(col), probabilities, relativeError).head.toArray | ||
} | ||
|
||
/** | ||
* Calculates the approximate quantiles of numerical columns of a DataFrame. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we don't have the full doc from the above method, we should perhaps provide an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. Updated it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've updated it with specified parameter types. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure this will actually show up in the generated Scaladoc HTML. @jkbradley @mengxr do you prefer to actually make links show up in the HTML API doc? If so, then it often doesn't look good in an IDE. But to do that something like this is needed: |
||
* @see #approxQuantile(String, Array[Double], Double) for detailed description. | ||
* | ||
* @param cols the names of the numerical columns. | ||
* @param probabilities a list of quantile probabilities | ||
* Each number must belong to [0, 1]. | ||
* For example 0 is the minimum, 0.5 is the median, 1 is the maximum. | ||
* @param relativeError The relative target precision to achieve (>= 0). | ||
* If set to zero, the exact quantiles are computed, which could be very expensive. | ||
* Note that values greater than 1 are accepted but give the same result as 1. | ||
* @return the approximate quantiles at the given probabilities for given columns. | ||
* | ||
* @since 2.0.0 | ||
*/ | ||
def approxQuantile( | ||
cols: Array[String], | ||
probabilities: Array[Double], | ||
relativeError: Double): Array[Array[Double]] = { | ||
// Delegates to StatFunctions.multipleApproxQuantiles with all requested columns in one call, | ||
// then converts each inner result and the outer collection to arrays. | ||
// NOTE(review): presumably the outer array is ordered like `cols` and each inner array like | ||
// `probabilities` -- confirm against StatFunctions.multipleApproxQuantiles. | ||
StatFunctions.multipleApproxQuantiles(df, cols, probabilities, relativeError) | ||
.map(_.toArray).toArray | ||
} | ||
|
||
/** | ||
* Python-friendly version of [[approxQuantile()]] | ||
*/ | ||
|
@@ -80,6 +103,18 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) { | |
approxQuantile(col, probabilities.toArray, relativeError).toList.asJava | ||
} | ||
|
||
/** | ||
* Python-friendly version of [[approxQuantile()]] that computes approximate quantiles | ||
* for multiple columns. | ||
*/ | ||
private[spark] def approxQuantile( | ||
cols: List[String], | ||
probabilities: List[Double], | ||
relativeError: Double): java.util.List[java.util.List[Double]] = { | ||
// Converts the List arguments to arrays and forwards to the Array-based approxQuantile | ||
// overload, then wraps each per-column result and the outer result as java.util.List | ||
// via asJava. | ||
approxQuantile(cols.toArray, probabilities.toArray, relativeError) | ||
.map(_.toList.asJava).toList.asJava | ||
} | ||
|
||
/** | ||
* Calculate the sample covariance of two numerical columns of a DataFrame. | ||
* @param col1 the name of the first column | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could consider verifying the contents of the list, as is done for probabilities right below (just a minor point and probably not as important, but if people pass in a list of expressions rather than strings, it would be nice to have a useful error message).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.