[SPARK-47202][PYTHON][TESTS][FOLLOW-UP] Test timestamp with tzinfo in toPandas and createDataFrame with Arrow optimized

### What changes were proposed in this pull request?

This PR is a follow-up of #45301 that actually tests the change.
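
For context, a minimal sketch of the scenario the new test exercises (illustrative only, not part of the patch; it assumes a running SparkSession and uses the same timezone-aware input as the test):

```python
import datetime
from zoneinfo import ZoneInfo

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# A timezone-aware timestamp, as used in the new test case.
ts = datetime.datetime(2023, 1, 1, 0, 0, 0, tzinfo=ZoneInfo("America/Los_Angeles"))
df = spark.createDataFrame(pd.DataFrame({"a": [ts]}))

# The behavior under test: with Arrow either enabled or disabled,
# toPandas() drops the tzinfo and returns the value as a naive local timestamp.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
print(df.toPandas())
```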

### Why are the changes needed?

To prevent a regression.

### Does this PR introduce _any_ user-facing change?

No, test-only.

### How was this patch tested?

Manually ran the tests.
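
For reference, one way to run the new tests locally by name via unittest is sketched below; the test class names (`ArrowTests`, `ArrowParityTests`) are assumptions and may differ from the actual classes in the test modules.

```python
# A minimal sketch, assuming a working PySpark dev environment and that the new
# tests live in classes named ArrowTests / ArrowParityTests (names are assumptions).
import unittest

suite = unittest.defaultTestLoader.loadTestsFromNames(
    [
        "pyspark.sql.tests.test_arrow.ArrowTests.test_toPandas_timestmap_tzinfo",
        "pyspark.sql.tests.connect.test_parity_arrow.ArrowParityTests.test_toPandas_timestmap_tzinfo",
    ]
)
unittest.TextTestRunner(verbosity=2).run(suite)
```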

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #45308 from HyukjinKwon/SPARK-47202-followup.

Authored-by: Hyukjin Kwon <gurwls223@apache.org>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
(cherry picked from commit 721c2a4)
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
HyukjinKwon committed Feb 28, 2024
1 parent c0a4416 commit e6f3dd9
Showing 2 changed files with 30 additions and 0 deletions.
python/pyspark/sql/tests/connect/test_parity_arrow.py (3 additions, 0 deletions)
@@ -136,6 +136,9 @@ def test_createDataFrame_nested_timestamp(self):
     def test_toPandas_nested_timestamp(self):
         self.check_toPandas_nested_timestamp(True)
 
+    def test_toPandas_timestmap_tzinfo(self):
+        self.check_toPandas_timestmap_tzinfo(True)
+
     def test_createDataFrame_udt(self):
         self.check_createDataFrame_udt(True)
 
python/pyspark/sql/tests/test_arrow.py (27 additions, 0 deletions)
@@ -18,12 +18,14 @@
 import datetime
 import os
 import threading
+import calendar
 import time
 import unittest
 import warnings
 from distutils.version import LooseVersion
 from typing import cast
 from collections import namedtuple
+from zoneinfo import ZoneInfo
 
 from pyspark import SparkContext, SparkConf
 from pyspark.sql import Row, SparkSession
@@ -1090,6 +1092,31 @@ def check_createDataFrame_nested_timestamp(self, arrow_enabled):
 
         self.assertEqual(df.first(), expected)
 
+    def test_toPandas_timestmap_tzinfo(self):
+        for arrow_enabled in [True, False]:
+            with self.subTest(arrow_enabled=arrow_enabled):
+                self.check_toPandas_timestmap_tzinfo(arrow_enabled)
+
+    def check_toPandas_timestmap_tzinfo(self, arrow_enabled):
+        # SPARK-47202: Test timestamp with tzinfo in toPandas and createDataFrame
+        ts_tzinfo = datetime.datetime(2023, 1, 1, 0, 0, 0, tzinfo=ZoneInfo("America/Los_Angeles"))
+        data = pd.DataFrame({"a": [ts_tzinfo]})
+        df = self.spark.createDataFrame(data)
+
+        with self.sql_conf(
+            {
+                "spark.sql.execution.arrow.pyspark.enabled": arrow_enabled,
+            }
+        ):
+            pdf = df.toPandas()
+
+        expected = pd.DataFrame(
+            # Spark unsets tzinfo and converts them to localtimes.
+            {"a": [datetime.datetime.fromtimestamp(calendar.timegm(ts_tzinfo.utctimetuple()))]}
+        )
+
+        assert_frame_equal(pdf, expected)
+
     def test_toPandas_nested_timestamp(self):
         for arrow_enabled in [True, False]:
             with self.subTest(arrow_enabled=arrow_enabled):
