[SPARK-45065][PYTHON][PS] Support Pandas 2.1.0 #42793

Closed · wants to merge 33 commits

Changes from 6 commits

Commits (33)
bf79e7a
[SPARK-45065][PYTHON][PS] Support Pandas 2.1.0
itholic Sep 4, 2023
e81a97a
Fix tests
itholic Sep 5, 2023
246d2a0
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Sep 6, 2023
f874b85
Respect as_index=False when given funcs is a type of list
itholic Sep 6, 2023
49c5c5d
Apply the Pandas 2.1.0 changes
itholic Sep 7, 2023
7184a3b
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Sep 7, 2023
15c5aa7
Fix ordering for stack
itholic Sep 8, 2023
5dbf456
Added migration guide
itholic Sep 8, 2023
2a17d1d
Deprecate all features from Pandas 2.1.0.
itholic Sep 11, 2023
f48215c
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Sep 11, 2023
dc68af1
Fix linter
itholic Sep 12, 2023
4e89cf5
fix test
itholic Sep 12, 2023
a585fbe
Fix linter
itholic Sep 13, 2023
1831923
fix
zhengruifeng Sep 13, 2023
91b865c
resolve conflicts
itholic Sep 13, 2023
76433d0
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Sep 13, 2023
afecab9
Retrigger the CI
itholic Sep 14, 2023
6ff4df2
replace the import
itholic Sep 14, 2023
5a0fe26
revert unnecess change
itholic Sep 14, 2023
bba34a0
fix linter
itholic Sep 14, 2023
f66d824
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Sep 14, 2023
21a7dfe
Remove circular import
itholic Sep 15, 2023
2323237
resolve conflicts
itholic Sep 15, 2023
0c07f55
resolve conflicts
itholic Sep 15, 2023
5c054ec
resolve conflicts
itholic Sep 16, 2023
1cb9df4
do not call applymap
itholic Sep 16, 2023
0e8ea3b
fix linter
itholic Sep 16, 2023
46cd7dd
Recommend to use Pandas 2.0.0 and above
itholic Sep 16, 2023
357fbce
fix linter
itholic Sep 18, 2023
76f0720
resolve conflicts
itholic Sep 18, 2023
cf54c67
resolve conflicts
itholic Sep 18, 2023
0008089
Import
itholic Sep 18, 2023
5723b6c
Merge branch 'master' of https://github.com/apache/spark into pandas_…
itholic Sep 18, 2023
4 changes: 2 additions & 2 deletions dev/infra/Dockerfile
@@ -64,8 +64,8 @@ RUN Rscript -e "devtools::install_version('roxygen2', version='7.2.0', repos='ht
# See more in SPARK-39735
ENV R_LIBS_SITE "/usr/local/lib/R/site-library:${R_LIBS_SITE}:/usr/lib/R/library"

-RUN pypy3 -m pip install numpy 'pandas<=2.0.3' scipy coverage matplotlib
-RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.0.3' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'
+RUN pypy3 -m pip install numpy 'pandas<=2.1.0' scipy coverage matplotlib
+RUN python3.9 -m pip install numpy pyarrow 'pandas<=2.1.0' scipy unittest-xml-reporting plotly>=4.8 'mlflow>=2.3.1' coverage matplotlib openpyxl 'memory-profiler==0.60.0' 'scikit-learn==1.1.*'

# Add Python deps for Spark Connect.
RUN python3.9 -m pip install grpcio protobuf googleapis-common-protos grpcio-status
9 changes: 8 additions & 1 deletion python/pyspark/pandas/groupby.py
@@ -311,7 +311,14 @@ def aggregate(
                i for i, gkey in enumerate(self._groupkeys) if gkey._psdf is not self._psdf
            )
            if len(should_drop_index) > 0:
-                psdf = psdf.reset_index(level=should_drop_index, drop=True)
+                drop = not any(
+                    [
+                        isinstance(func_or_funcs[gkey.name], list)
+                        for gkey in self._groupkeys
+                        if gkey.name in func_or_funcs
+                    ]
+                )
+                psdf = psdf.reset_index(level=should_drop_index, drop=drop)
Comment on lines -308 to +315 (Contributor Author):
Bug fixed in Pandas: pandas-dev/pandas#52849.
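For reference, a minimal pandas-only sketch of the case this drop flag targets: an aggregation dict that maps a column to a list of functions, combined with as_index=False. The exact pre-/post-fix outputs are per pandas-dev/pandas#52849; treat the printed frames as illustrative.

import pandas as pd

pdf = pd.DataFrame({"A": [1, 1, 2], "B": [0.1, 0.2, 0.3]})

# A single func per column: as_index=False keeps "A" as a regular column.
print(pdf.groupby("A", as_index=False).agg({"B": "sum"}))

# A list of funcs per column: the case covered by pandas-dev/pandas#52849.
# Computing drop from the presence of list-valued funcs, as in the diff
# above, keeps the group key "A" as a column here as well.
print(pdf.groupby("A", as_index=False).agg({"B": ["sum", "min"]}))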

            if len(should_drop_index) < len(self._groupkeys):
                psdf = psdf.reset_index()

2 changes: 1 addition & 1 deletion python/pyspark/pandas/supported_api_gen.py
@@ -98,7 +98,7 @@ def generate_supported_api(output_rst_file_path: str) -> None:

    Write supported APIs documentation.
    """
-    pandas_latest_version = "2.0.3"
+    pandas_latest_version = "2.1.0"
    if LooseVersion(pd.__version__) != LooseVersion(pandas_latest_version):
        msg = (
            "Warning: Latest version of pandas (%s) is required to generate the documentation; "
5 changes: 1 addition & 4 deletions python/pyspark/pandas/tests/computation/test_corrwith.py
@@ -59,10 +59,7 @@ def _test_corrwith(self, psdf, psobj):
        # Therefore, we only test the pandas 1.5.0 in different way.
        # See https://github.com/pandas-dev/pandas/issues/48826 for the reported issue,
        # and https://github.com/pandas-dev/pandas/pull/46174 for the initial PR that causes.
-        if LooseVersion(pd.__version__) == LooseVersion("1.5.0") and isinstance(pobj, pd.Series):
-            methods = ["kendall"]
-        else:
-            methods = ["pearson", "spearman", "kendall"]
+        methods = ["pearson", "spearman", "kendall"]
        for method in methods:
            for drop in [True, False]:
                p_corr = pdf.corrwith(pobj, drop=drop, method=method)
5 changes: 3 additions & 2 deletions python/pyspark/pandas/tests/frame/test_reshaping.py
@@ -291,7 +291,8 @@ def test_stack(self):
        psdf_multi_level_cols2 = ps.from_pandas(pdf_multi_level_cols2)

        self.assert_eq(
-            psdf_multi_level_cols2.stack().sort_index(), pdf_multi_level_cols2.stack().sort_index()
+            psdf_multi_level_cols2.stack().sort_index()[["weight", "height"]],
+            pdf_multi_level_cols2.stack().sort_index()[["weight", "height"]],
        )

        pdf = pd.DataFrame(
@@ -304,7 +305,7 @@
        )
        psdf = ps.from_pandas(pdf)

-        self.assert_eq(psdf.stack().sort_index(), pdf.stack().sort_index())
+        self.assert_eq(psdf.stack().sort_index()[["x", "y"]], pdf.stack().sort_index()[["x", "y"]])
        self.assert_eq(psdf[[]].stack().sort_index(), pdf[[]].stack().sort_index(), almost=True)

    def test_unstack(self):
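The explicit column selections above pin a column order before comparison, since the column order of stack() output is not guaranteed to be identical across pandas versions. A hypothetical helper showing the same normalization pattern in plain pandas (assert_frames_equal_ignoring_order is illustrative, not part of this PR):

import pandas as pd

def assert_frames_equal_ignoring_order(left: pd.DataFrame, right: pd.DataFrame) -> None:
    # Sort rows by index and fix a deterministic column order on both sides,
    # so the comparison is insensitive to version-dependent ordering.
    cols = sorted(left.columns)
    pd.testing.assert_frame_equal(left.sort_index()[cols], right.sort_index()[cols])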
31 changes: 24 additions & 7 deletions python/pyspark/pandas/tests/test_stats.py
@@ -20,11 +20,6 @@
import numpy as np
import pandas as pd

-try:
-    from pandas._testing import makeMissingDataframe
-except ImportError:
-    from pandas.util.testing import makeMissingDataframe

from pyspark import pandas as ps
from pyspark.pandas.config import option_context
from pyspark.testing.pandasutils import PandasOnSparkTestCase, SPARK_CONF_ARROW_ENABLED
@@ -273,7 +268,18 @@ def test_skew_kurt_numerical_stability(self):
        self.assert_eq(psdf.kurt(), pdf.kurt(), almost=True)

    def test_dataframe_corr(self):
-        pdf = makeMissingDataframe(0.3, 42)
+        pdf = pd.DataFrame(
+            index=[
+                "".join(
+                    np.random.choice(
+                        list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
+                    )
+                )
+                for _ in range(30)
+            ],
+            columns=list("ABCD"),
+            dtype="float64",
+        )
Comment on lines -276 to +282 (Contributor Author):
The testing util makeMissingDataframe has been removed, so the frame is now constructed inline.
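A minimal local stand-in for the removed helper, assuming its old semantics were "random float frame with a fraction of entries masked to NaN" and that the (0.3, 42) arguments were a missing-density and a seed (both assumptions; the inline construction in the diff above is what the test actually uses):

import numpy as np
import pandas as pd

def make_missing_dataframe(density: float = 0.3, seed: int = 42) -> pd.DataFrame:
    # 30x4 float frame with a random 10-char string index, where roughly
    # `density` of the entries are set to NaN.
    rng = np.random.default_rng(seed)
    chars = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
    index = ["".join(rng.choice(chars, 10)) for _ in range(30)]
    values = rng.standard_normal((30, 4))
    values[rng.random((30, 4)) < density] = np.nan
    return pd.DataFrame(values, index=index, columns=list("ABCD"))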

        psdf = ps.from_pandas(pdf)

        with self.assertRaisesRegex(ValueError, "Invalid method"):
@@ -347,7 +353,18 @@ def test_dataframe_corr(self):
        )

    def test_series_corr(self):
-        pdf = makeMissingDataframe(0.3, 42)
+        pdf = pd.DataFrame(
+            index=[
+                "".join(
+                    np.random.choice(
+                        list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"), 10
+                    )
+                )
+                for _ in range(30)
+            ],
+            columns=list("ABCD"),
+            dtype="float64",
+        )
        pser1 = pdf.A
        pser2 = pdf.B
        psdf = ps.from_pandas(pdf)
15 changes: 9 additions & 6 deletions python/pyspark/pandas/typedef/typehints.py
@@ -487,23 +487,23 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
-    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
+    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64)]
Comment (Contributor Author):
The dtype of the categories is now included in CategoricalDtype's __repr__: pandas-dev/pandas#52179.
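A quick way to see the repr change locally (assuming pandas >= 2.1.0 is installed):

import pandas as pd

dtype = pd.CategoricalDtype(categories=[3, 4, 5], ordered=False)
# pandas < 2.1.0:  CategoricalDtype(categories=[3, 4, 5], ordered=False)
# pandas >= 2.1.0: CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64)
print(repr(dtype))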

    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
-    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
+    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64)]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
-    CategoricalDtype(categories=[3, 4, 5], ordered=False)
+    CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64)
    >>> inferred.spark_type
    LongType()

@@ -521,7 +521,8 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
-    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
+    [dtype('int64'), dtype('int64'),
+    CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
@@ -533,7 +534,8 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
-    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
+    [CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64),
+    dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
@@ -544,7 +546,8 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
-    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
+    [dtype('int64'), dtype('int64'),
+    CategoricalDtype(categories=[3, 4, 5], ordered=False, categories_dtype=int64)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
@@ -166,7 +166,7 @@ def check_apply_in_pandas_not_returning_pandas_dataframe(self):
            fn=lambda lft, rgt: lft.size + rgt.size,
            error_class=PythonException,
            error_message_regex="Return type of the user-defined function "
-            "should be pandas.DataFrame, but is int64.",
+            "should be pandas.DataFrame, but is int.",
        )

    def test_apply_in_pandas_returning_column_names(self):
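The regex update tracks a change in what lft.size + rgt.size stringifies to under pandas 2.1 ("int" instead of "int64"). A hypothetical mirror of the guard being tested (the real check lives inside PySpark and surfaces as a PythonException; this sketch only shows how the offending type's name lands in the message):

import pandas as pd

def _verify_pandas_result(result: object) -> None:
    # applyInPandas requires the UDF to return a pandas.DataFrame; anything
    # else is rejected with the actual type's name in the error message.
    if not isinstance(result, pd.DataFrame):
        raise TypeError(
            "Return type of the user-defined function should be "
            f"pandas.DataFrame, but is {type(result).__name__}."
        )

_verify_pandas_result(pd.DataFrame({"a": [1]}))  # passes silently
try:
    _verify_pandas_result(3)
except TypeError as e:
    print(e)  # ... should be pandas.DataFrame, but is int.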