From fb67e8f8fd99925cc5e92d5edcc6df22e3221669 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 10:03:23 +0300 Subject: [PATCH 01/12] Bump pandas 1.4 and pyarrow 6 --- requirements/base.txt | 4 ++-- setup.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index c9b1baf3b97f8..5033a88798b09 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -185,7 +185,7 @@ packaging==21.3 # via # bleach # deprecation -pandas==1.3.4 +pandas==1.4.3 # via apache-superset parsedatetime==2.6 # via apache-superset @@ -197,7 +197,7 @@ prison==0.2.1 # via flask-appbuilder prompt-toolkit==3.0.28 # via click-repl -pyarrow==5.0.0 +pyarrow==6.0.1 # via apache-superset pycparser==2.20 # via cffi diff --git a/setup.py b/setup.py index ba16a2e58f67a..2537e60aae784 100644 --- a/setup.py +++ b/setup.py @@ -99,7 +99,7 @@ def get_git_sha() -> str: "markdown>=3.0", "msgpack>=1.0.0, <1.1", "numpy==1.22.1", - "pandas>=1.3.0, <1.4", + "pandas>=1.4.3, <1.5", "parsedatetime", "pgsanity", "polyline", @@ -107,7 +107,7 @@ def get_git_sha() -> str: "python-dateutil", "python-dotenv", "python-geohash", - "pyarrow>=5.0.0, <6.0", + "pyarrow>=6.0.1, <7", "pyyaml>=5.4", "PyJWT>=2.4.0, <3.0", "redis", From 78debe9d023f0f379e9b43bd37f3d4951d507763 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 10:17:55 +0300 Subject: [PATCH 02/12] Use engine="pyarrow" for pd.read_csv() --- superset/charts/post_processing.py | 2 +- superset/datasets/commands/importers/v1/utils.py | 2 +- superset/examples/countries.md | 4 ++-- superset/examples/country_map.py | 2 +- superset/examples/flights.py | 4 ++-- superset/examples/long_lat.py | 2 +- superset/views/database/views.py | 1 + tests/integration_tests/utils/csv_tests.py | 2 +- 8 files changed, 10 insertions(+), 9 deletions(-) diff --git a/superset/charts/post_processing.py b/superset/charts/post_processing.py index 81d2a09b04790..29e232db78342 100644 --- a/superset/charts/post_processing.py +++ b/superset/charts/post_processing.py @@ -334,7 +334,7 @@ def apply_post_process( if query["result_format"] == ChartDataResultFormat.JSON: df = pd.DataFrame.from_dict(query["data"]) elif query["result_format"] == ChartDataResultFormat.CSV: - df = pd.read_csv(StringIO(query["data"])) + df = pd.read_csv(StringIO(query["data"]), engine="pyarrow") processed_df = post_processor(df, form_data, datasource) diff --git a/superset/datasets/commands/importers/v1/utils.py b/superset/datasets/commands/importers/v1/utils.py index ba2b7df26174a..00d577fddd8b7 100644 --- a/superset/datasets/commands/importers/v1/utils.py +++ b/superset/datasets/commands/importers/v1/utils.py @@ -154,7 +154,7 @@ def load_data( data = request.urlopen(data_uri) # pylint: disable=consider-using-with if data_uri.endswith(".gz"): data = gzip.open(data) - df = pd.read_csv(data, encoding="utf-8") + df = pd.read_csv(data, encoding="utf-8", engine="pyarrow") dtype = get_dtype(df, dataset) # convert temporal columns diff --git a/superset/examples/countries.md b/superset/examples/countries.md index 042c7b3e891d1..05a75f3fc139e 100644 --- a/superset/examples/countries.md +++ b/superset/examples/countries.md @@ -22,12 +22,12 @@ This data was downloaded from the Here's the script that was used to massage the data: DIR = "" - df_country = pd.read_csv(DIR + '/HNP_Country.csv') + df_country = pd.read_csv(DIR + '/HNP_Country.csv', engine="pyarrow") df_country.columns = ['country_code'] + list(df_country.columns[1:]) df_country = df_country[['country_code', 
'Region']] df_country.columns = ['country_code', 'region'] - df = pd.read_csv(DIR + '/HNP_Data.csv') + df = pd.read_csv(DIR + '/HNP_Data.csv', engine="pyarrow") del df['Unnamed: 60'] df.columns = ['country_name', 'country_code'] + list(df.columns[2:]) ndf = df.merge(df_country, how='inner') diff --git a/superset/examples/country_map.py b/superset/examples/country_map.py index c959a92085fc0..2a423374b3873 100644 --- a/superset/examples/country_map.py +++ b/superset/examples/country_map.py @@ -47,7 +47,7 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> N csv_bytes = get_example_data( "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True ) - data = pd.read_csv(csv_bytes, encoding="utf-8") + data = pd.read_csv(csv_bytes, encoding="utf-8", engine="pyarrow") data["dttm"] = datetime.datetime.now().date() data.to_sql( tbl_name, diff --git a/superset/examples/flights.py b/superset/examples/flights.py index 46fdc5c1d07a1..05604494f1ba2 100644 --- a/superset/examples/flights.py +++ b/superset/examples/flights.py @@ -33,11 +33,11 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None: if not only_metadata and (not table_exists or force): data = get_example_data("flight_data.csv.gz", make_bytes=True) - pdf = pd.read_csv(data, encoding="latin-1") + pdf = pd.read_csv(data, encoding="latin-1", engine="pyarrow") # Loading airports info to join and get lat/long airports_bytes = get_example_data("airports.csv.gz", make_bytes=True) - airports = pd.read_csv(airports_bytes, encoding="latin-1") + airports = pd.read_csv(airports_bytes, encoding="latin-1", engine="pyarrow") airports = airports.set_index("IATA_CODE") pdf[ # pylint: disable=unsupported-assignment-operation,useless-suppression diff --git a/superset/examples/long_lat.py b/superset/examples/long_lat.py index ba9824bb43fea..ed0f551582a5f 100644 --- a/superset/examples/long_lat.py +++ b/superset/examples/long_lat.py @@ -45,7 +45,7 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None if not only_metadata and (not table_exists or force): data = get_example_data("san_francisco.csv.gz", make_bytes=True) - pdf = pd.read_csv(data, encoding="utf-8") + pdf = pd.read_csv(data, encoding="utf-8", engine="pyarrow") start = datetime.datetime.now().replace( hour=0, minute=0, second=0, microsecond=0 ) diff --git a/superset/views/database/views.py b/superset/views/database/views.py index bb2e018994e44..e887076a3227f 100644 --- a/superset/views/database/views.py +++ b/superset/views/database/views.py @@ -142,6 +142,7 @@ def form_post(self, form: CsvToDatabaseForm) -> Response: try: df = pd.concat( pd.read_csv( + engine="pyarrow", chunksize=1000, encoding="utf-8", filepath_or_buffer=form.csv_file.data, diff --git a/tests/integration_tests/utils/csv_tests.py b/tests/integration_tests/utils/csv_tests.py index e514efb1d2108..b6ae85970efb1 100644 --- a/tests/integration_tests/utils/csv_tests.py +++ b/tests/integration_tests/utils/csv_tests.py @@ -67,7 +67,7 @@ def test_df_to_escaped_csv(): ] csv_str = "\n".join([",".join(row) for row in csv_rows]) - df = pd.read_csv(io.StringIO(csv_str)) + df = pd.read_csv(io.StringIO(csv_str), engine="pyarrow") escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False) escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")] From 525551fee7bc97a9d07ff3de3208eac3cd49ca09 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 10:31:30 +0300 Subject: [PATCH 03/12] Refactoring --- setup.py | 
1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 2537e60aae784..81fbef906d9ed 100644 --- a/setup.py +++ b/setup.py @@ -182,5 +182,6 @@ def get_git_sha() -> str: classifiers=[ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", ], ) From ed580eefb2fb3e7b0cb95978c3de9481bb24aab8 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 10:45:44 +0300 Subject: [PATCH 04/12] Refactoring --- CONTRIBUTING.md | 2 +- Makefile | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 30403834689c1..3acf5c2da6be4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -420,7 +420,7 @@ Commits to `master` trigger a rebuild and redeploy of the documentation site. Su Make sure your machine meets the [OS dependencies](https://superset.apache.org/docs/installation/installing-superset-from-scratch#os-dependencies) before following these steps. You also need to install MySQL or [MariaDB](https://mariadb.com/downloads). -Ensure that you are using Python version 3.8 or 3.9, then proceed with: +Ensure that you are using Python version 3.8, 3.9 or 3.10, then proceed with: ```bash # Create a virtual environment and activate it (recommended) diff --git a/Makefile b/Makefile index 8304ccfb151b7..dfbe9d39f94af 100644 --- a/Makefile +++ b/Makefile @@ -15,8 +15,8 @@ # limitations under the License. # -# Python version installed; we need 3.8-3.9 -PYTHON=`command -v python3.9 || command -v python3.8` +# Python version installed; we need 3.8-3.10 +PYTHON=`command -v python3.10 || command -v python3.9 || command -v python3.8` .PHONY: install superset venv pre-commit @@ -70,7 +70,7 @@ update-js: venv: # Create a virtual environment and activate it (recommended) - if ! [ -x "${PYTHON}" ]; then echo "You need Python 3.8 or 3.9 installed"; exit 1; fi + if ! [ -x "${PYTHON}" ]; then echo "You need Python 3.8, 3.9 or 3.10 installed"; exit 1; fi test -d venv || ${PYTHON} -m venv venv # setup a python3 virtualenv . venv/bin/activate From 9eb34bfb363827f906c6c1ed0be778f50cac1dcd Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 11:09:16 +0300 Subject: [PATCH 05/12] Refactoring --- UPDATING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/UPDATING.md b/UPDATING.md index 6c6ffff64d59a..d53f7cae94792 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -26,6 +26,7 @@ assists people when migrating to a new version. - [20606](https://github.com/apache/superset/pull/20606): When user clicks on chart title or "Edit chart" button in Dashboard page, Explore opens in the same tab. Clicking while holding cmd/ctrl opens Explore in a new tab. To bring back the old behaviour (always opening Explore in a new tab), flip feature flag `DASHBOARD_EDIT_CHART_IN_NEW_TAB` to `True`. - [20799](https://github.com/apache/superset/pull/20799): Presto and Trino engine will now display tracking URL for running queries in SQL Lab. If for some reason you don't want to show the tracking URL (for example, when your data warehouse hasn't enable access for to Presto or Trino UI), update `TRACKING_URL_TRANSFORMER` in `config.py` to return `None`. +- [21002](https://github.com/apache/superset/pull/21002): Support Python 3.10 and bump pandas 1.4 and pyarrow 6. 
### Breaking Changes From 8f6519d61fb6ecbeb73310cff1c7569b508c7638 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 11:31:51 +0300 Subject: [PATCH 06/12] Use bytes in pd.read_json() --- superset/examples/bart_lines.py | 2 +- superset/examples/birth_names.py | 2 +- superset/examples/energy.py | 2 +- superset/examples/multiformat_time_series.py | 2 +- superset/examples/paris.py | 2 +- superset/examples/random_time_series.py | 2 +- superset/examples/sf_population_polygons.py | 2 +- superset/examples/world_bank.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/superset/examples/bart_lines.py b/superset/examples/bart_lines.py index bcdf3589a821e..e2db59eb1721a 100644 --- a/superset/examples/bart_lines.py +++ b/superset/examples/bart_lines.py @@ -34,7 +34,7 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None: table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - content = get_example_data("bart-lines.json.gz") + content = get_example_data("bart-lines.json.gz", make_bytes=True) df = pd.read_json(content, encoding="latin-1") df["path_json"] = df.path.map(json.dumps) df["polyline"] = df.path.map(polyline.encode) diff --git a/superset/examples/birth_names.py b/superset/examples/birth_names.py index b86f7a25c66b1..2afa04757c640 100644 --- a/superset/examples/birth_names.py +++ b/superset/examples/birth_names.py @@ -66,7 +66,7 @@ def gen_filter( def load_data(tbl_name: str, database: Database, sample: bool = False) -> None: - pdf = pd.read_json(get_example_data("birth_names2.json.gz")) + pdf = pd.read_json(get_example_data("birth_names2.json.gz", make_bytes=True)) # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="ms") diff --git a/superset/examples/energy.py b/superset/examples/energy.py index d88d693651d42..a91c15dc403fb 100644 --- a/superset/examples/energy.py +++ b/superset/examples/energy.py @@ -46,7 +46,7 @@ def load_energy( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("energy.json.gz") + data = get_example_data("energy.json.gz", make_bytes=True) pdf = pd.read_json(data) pdf = pdf.head(100) if sample else pdf pdf.to_sql( diff --git a/superset/examples/multiformat_time_series.py b/superset/examples/multiformat_time_series.py index 9b8bb22c98e89..22db949560e6d 100644 --- a/superset/examples/multiformat_time_series.py +++ b/superset/examples/multiformat_time_series.py @@ -44,7 +44,7 @@ def load_multiformat_time_series( # pylint: disable=too-many-locals table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("multiformat_time_series.json.gz") + data = get_example_data("multiformat_time_series.json.gz", make_bytes=True) pdf = pd.read_json(data) # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": diff --git a/superset/examples/paris.py b/superset/examples/paris.py index 264d80feeb695..7a5ae7b9b67cb 100644 --- a/superset/examples/paris.py +++ b/superset/examples/paris.py @@ -33,7 +33,7 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("paris_iris.json.gz") + data = get_example_data("paris_iris.json.gz", make_bytes=True) df = 
pd.read_json(data) df["features"] = df.features.map(json.dumps) diff --git a/superset/examples/random_time_series.py b/superset/examples/random_time_series.py index 152b63e1cc326..7edf6dac774dc 100644 --- a/superset/examples/random_time_series.py +++ b/superset/examples/random_time_series.py @@ -42,7 +42,7 @@ def load_random_time_series_data( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("random_time_series.json.gz") + data = get_example_data("random_time_series.json.gz", make_bytes=True) pdf = pd.read_json(data) if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="s") diff --git a/superset/examples/sf_population_polygons.py b/superset/examples/sf_population_polygons.py index 6e60d60121b51..aefc3c2f5f855 100644 --- a/superset/examples/sf_population_polygons.py +++ b/superset/examples/sf_population_polygons.py @@ -35,7 +35,7 @@ def load_sf_population_polygons( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("sf_population.json.gz") + data = get_example_data("sf_population.json.gz", make_bytes=True) df = pd.read_json(data) df["contour"] = df.contour.map(json.dumps) diff --git a/superset/examples/world_bank.py b/superset/examples/world_bank.py index 39b982aa52468..58e4e172eda26 100644 --- a/superset/examples/world_bank.py +++ b/superset/examples/world_bank.py @@ -56,7 +56,7 @@ def load_world_bank_health_n_pop( # pylint: disable=too-many-locals, too-many-s table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("countries.json.gz") + data = get_example_data("countries.json.gz", make_bytes=True) pdf = pd.read_json(data) pdf.columns = [col.replace(".", "_") for col in pdf.columns] if database.backend == "presto": From 8085e28fc6865cba80a3409d33956b34d6fc668a Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 11:50:37 +0300 Subject: [PATCH 07/12] Fix test_contribution --- tests/unit_tests/pandas_postprocessing/test_contribution.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests/pandas_postprocessing/test_contribution.py b/tests/unit_tests/pandas_postprocessing/test_contribution.py index 7eb34c4d13f7b..2c4949270e087 100644 --- a/tests/unit_tests/pandas_postprocessing/test_contribution.py +++ b/tests/unit_tests/pandas_postprocessing/test_contribution.py @@ -74,7 +74,7 @@ def test_contribution(): rename_columns=["pct_a"], ) assert processed_df.columns.tolist() == ["a", "b", "c", "pct_a"] - assert_array_equal(processed_df["a"].tolist(), [1, 3, nan]) - assert_array_equal(processed_df["b"].tolist(), [1, 9, nan]) - assert_array_equal(processed_df["c"].tolist(), [nan, nan, nan]) + assert_array_equal(processed_df["a"].tolist(), [1, 3, 0]) + assert_array_equal(processed_df["b"].tolist(), [1, 9, 0]) + assert_array_equal(processed_df["c"].tolist(), [0, 0, 0]) assert processed_df["pct_a"].tolist() == [0.25, 0.75, 0] From 85369fe6af8d4131e67fcff76452203202000078 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 13:13:15 +0300 Subject: [PATCH 08/12] Fix pandas issue when 'arrays' are empty but 'names' contain values --- superset/result_set.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/superset/result_set.py b/superset/result_set.py index 725bf1449cc79..b8a5abb6d192f 100644 --- a/superset/result_set.py +++ b/superset/result_set.py @@ -161,6 +161,9 @@ def __init__( # pylint: 
disable=too-many-locals except Exception as ex: # pylint: disable=broad-except logger.exception(ex) + if not pa_data: + column_names = [] + self.table = pa.Table.from_arrays(pa_data, names=column_names) self._type_dict: Dict[str, Any] = {} try: From f859ba4cc8523718decf4f3600b1ea97aa07bf9c Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Sun, 7 Aug 2022 14:21:21 +0300 Subject: [PATCH 09/12] fix: ValueError: For argument "ascending" expected type bool, received type NoneType. --- superset/viz.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/superset/viz.py b/superset/viz.py index 8da90987568f4..42de86497f9e2 100644 --- a/superset/viz.py +++ b/superset/viz.py @@ -2172,14 +2172,14 @@ def get_data(self, df: pd.DataFrame) -> VizData: if df is not None and not df.empty: if metric: df = df.sort_values( - utils.get_metric_name(metric), ascending=flt.get("asc") + utils.get_metric_name(metric), ascending=flt.get("asc", False) ) data[col] = [ {"id": row[0], "text": row[0], "metric": row[1]} for row in df.itertuples(index=False) ] else: - df = df.sort_values(col, ascending=flt.get("asc")) + df = df.sort_values(col, ascending=flt.get("asc", False)) data[col] = [ {"id": row[0], "text": row[0]} for row in df.itertuples(index=False) From 3d63498e988942fd8313dfa0ccfcd9fff3a1dbc6 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Tue, 9 Aug 2022 08:56:33 +0300 Subject: [PATCH 10/12] Remove engine="pyarrow" and convert bytes to string --- superset/charts/post_processing.py | 2 +- superset/datasets/commands/importers/v1/utils.py | 2 +- superset/examples/bart_lines.py | 2 +- superset/examples/birth_names.py | 2 +- superset/examples/countries.md | 4 ++-- superset/examples/country_map.py | 2 +- superset/examples/energy.py | 2 +- superset/examples/flights.py | 4 ++-- superset/examples/helpers.py | 4 ++-- superset/examples/long_lat.py | 2 +- superset/examples/multiformat_time_series.py | 2 +- superset/examples/paris.py | 2 +- superset/examples/random_time_series.py | 2 +- superset/examples/sf_population_polygons.py | 2 +- superset/examples/world_bank.py | 2 +- superset/views/database/views.py | 1 - tests/integration_tests/utils/csv_tests.py | 2 +- 17 files changed, 19 insertions(+), 20 deletions(-) diff --git a/superset/charts/post_processing.py b/superset/charts/post_processing.py index 29e232db78342..81d2a09b04790 100644 --- a/superset/charts/post_processing.py +++ b/superset/charts/post_processing.py @@ -334,7 +334,7 @@ def apply_post_process( if query["result_format"] == ChartDataResultFormat.JSON: df = pd.DataFrame.from_dict(query["data"]) elif query["result_format"] == ChartDataResultFormat.CSV: - df = pd.read_csv(StringIO(query["data"]), engine="pyarrow") + df = pd.read_csv(StringIO(query["data"])) processed_df = post_processor(df, form_data, datasource) diff --git a/superset/datasets/commands/importers/v1/utils.py b/superset/datasets/commands/importers/v1/utils.py index 00d577fddd8b7..ba2b7df26174a 100644 --- a/superset/datasets/commands/importers/v1/utils.py +++ b/superset/datasets/commands/importers/v1/utils.py @@ -154,7 +154,7 @@ def load_data( data = request.urlopen(data_uri) # pylint: disable=consider-using-with if data_uri.endswith(".gz"): data = gzip.open(data) - df = pd.read_csv(data, encoding="utf-8", engine="pyarrow") + df = pd.read_csv(data, encoding="utf-8") dtype = get_dtype(df, dataset) # convert temporal columns diff --git a/superset/examples/bart_lines.py b/superset/examples/bart_lines.py index e2db59eb1721a..5e542559e70cc 100644 --- a/superset/examples/bart_lines.py 
+++ b/superset/examples/bart_lines.py @@ -34,7 +34,7 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None: table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - content = get_example_data("bart-lines.json.gz", make_bytes=True) + content = get_example_data("bart-lines.json.gz").decode("utf-8") df = pd.read_json(content, encoding="latin-1") df["path_json"] = df.path.map(json.dumps) df["polyline"] = df.path.map(polyline.encode) diff --git a/superset/examples/birth_names.py b/superset/examples/birth_names.py index 2afa04757c640..44ee32675a6ef 100644 --- a/superset/examples/birth_names.py +++ b/superset/examples/birth_names.py @@ -66,7 +66,7 @@ def gen_filter( def load_data(tbl_name: str, database: Database, sample: bool = False) -> None: - pdf = pd.read_json(get_example_data("birth_names2.json.gz", make_bytes=True)) + pdf = pd.read_json(get_example_data("birth_names2.json.gz").decode("utf-8")) # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="ms") diff --git a/superset/examples/countries.md b/superset/examples/countries.md index 05a75f3fc139e..042c7b3e891d1 100644 --- a/superset/examples/countries.md +++ b/superset/examples/countries.md @@ -22,12 +22,12 @@ This data was downloaded from the Here's the script that was used to massage the data: DIR = "" - df_country = pd.read_csv(DIR + '/HNP_Country.csv', engine="pyarrow") + df_country = pd.read_csv(DIR + '/HNP_Country.csv') df_country.columns = ['country_code'] + list(df_country.columns[1:]) df_country = df_country[['country_code', 'Region']] df_country.columns = ['country_code', 'region'] - df = pd.read_csv(DIR + '/HNP_Data.csv', engine="pyarrow") + df = pd.read_csv(DIR + '/HNP_Data.csv') del df['Unnamed: 60'] df.columns = ['country_name', 'country_code'] + list(df.columns[2:]) ndf = df.merge(df_country, how='inner') diff --git a/superset/examples/country_map.py b/superset/examples/country_map.py index 2a423374b3873..c959a92085fc0 100644 --- a/superset/examples/country_map.py +++ b/superset/examples/country_map.py @@ -47,7 +47,7 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> N csv_bytes = get_example_data( "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True ) - data = pd.read_csv(csv_bytes, encoding="utf-8", engine="pyarrow") + data = pd.read_csv(csv_bytes, encoding="utf-8") data["dttm"] = datetime.datetime.now().date() data.to_sql( tbl_name, diff --git a/superset/examples/energy.py b/superset/examples/energy.py index a91c15dc403fb..78f194e966179 100644 --- a/superset/examples/energy.py +++ b/superset/examples/energy.py @@ -46,7 +46,7 @@ def load_energy( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("energy.json.gz", make_bytes=True) + data = get_example_data("energy.json.gz").decode("utf-8") pdf = pd.read_json(data) pdf = pdf.head(100) if sample else pdf pdf.to_sql( diff --git a/superset/examples/flights.py b/superset/examples/flights.py index 05604494f1ba2..46fdc5c1d07a1 100644 --- a/superset/examples/flights.py +++ b/superset/examples/flights.py @@ -33,11 +33,11 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None: if not only_metadata and (not table_exists or force): data = get_example_data("flight_data.csv.gz", make_bytes=True) - pdf = pd.read_csv(data, encoding="latin-1", engine="pyarrow") + pdf = 
pd.read_csv(data, encoding="latin-1") # Loading airports info to join and get lat/long airports_bytes = get_example_data("airports.csv.gz", make_bytes=True) - airports = pd.read_csv(airports_bytes, encoding="latin-1", engine="pyarrow") + airports = pd.read_csv(airports_bytes, encoding="latin-1") airports = airports.set_index("IATA_CODE") pdf[ # pylint: disable=unsupported-assignment-operation,useless-suppression diff --git a/superset/examples/helpers.py b/superset/examples/helpers.py index 8c2ad29f49102..2183a8d512f19 100644 --- a/superset/examples/helpers.py +++ b/superset/examples/helpers.py @@ -19,7 +19,7 @@ import os import zlib from io import BytesIO -from typing import Any, Dict, List, Set +from typing import Union, Any, Dict, List, Set from urllib import request from superset import app, db @@ -75,7 +75,7 @@ def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str: def get_example_data( filepath: str, is_gzip: bool = True, make_bytes: bool = False -) -> BytesIO: +) -> Union[bytes, BytesIO]: content = request.urlopen( # pylint: disable=consider-using-with f"{BASE_URL}{filepath}?raw=true" ).read() diff --git a/superset/examples/long_lat.py b/superset/examples/long_lat.py index ed0f551582a5f..ba9824bb43fea 100644 --- a/superset/examples/long_lat.py +++ b/superset/examples/long_lat.py @@ -45,7 +45,7 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None if not only_metadata and (not table_exists or force): data = get_example_data("san_francisco.csv.gz", make_bytes=True) - pdf = pd.read_csv(data, encoding="utf-8", engine="pyarrow") + pdf = pd.read_csv(data, encoding="utf-8") start = datetime.datetime.now().replace( hour=0, minute=0, second=0, microsecond=0 ) diff --git a/superset/examples/multiformat_time_series.py b/superset/examples/multiformat_time_series.py index 22db949560e6d..1209ff184941d 100644 --- a/superset/examples/multiformat_time_series.py +++ b/superset/examples/multiformat_time_series.py @@ -44,7 +44,7 @@ def load_multiformat_time_series( # pylint: disable=too-many-locals table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("multiformat_time_series.json.gz", make_bytes=True) + data = get_example_data("multiformat_time_series.json.gz").decode("utf-8") pdf = pd.read_json(data) # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": diff --git a/superset/examples/paris.py b/superset/examples/paris.py index 7a5ae7b9b67cb..9fa2fedb5e46e 100644 --- a/superset/examples/paris.py +++ b/superset/examples/paris.py @@ -33,7 +33,7 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("paris_iris.json.gz", make_bytes=True) + data = get_example_data("paris_iris.json.gz").decode("utf-8") df = pd.read_json(data) df["features"] = df.features.map(json.dumps) diff --git a/superset/examples/random_time_series.py b/superset/examples/random_time_series.py index 7edf6dac774dc..a7972e70d7f41 100644 --- a/superset/examples/random_time_series.py +++ b/superset/examples/random_time_series.py @@ -42,7 +42,7 @@ def load_random_time_series_data( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("random_time_series.json.gz", make_bytes=True) + data = 
get_example_data("random_time_series.json.gz").decode("utf-8") pdf = pd.read_json(data) if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="s") diff --git a/superset/examples/sf_population_polygons.py b/superset/examples/sf_population_polygons.py index aefc3c2f5f855..c5cdc0707c95f 100644 --- a/superset/examples/sf_population_polygons.py +++ b/superset/examples/sf_population_polygons.py @@ -35,7 +35,7 @@ def load_sf_population_polygons( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("sf_population.json.gz", make_bytes=True) + data = get_example_data("sf_population.json.gz").decode("utf-8") df = pd.read_json(data) df["contour"] = df.contour.map(json.dumps) diff --git a/superset/examples/world_bank.py b/superset/examples/world_bank.py index 58e4e172eda26..2531ba34640a7 100644 --- a/superset/examples/world_bank.py +++ b/superset/examples/world_bank.py @@ -56,7 +56,7 @@ def load_world_bank_health_n_pop( # pylint: disable=too-many-locals, too-many-s table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("countries.json.gz", make_bytes=True) + data = get_example_data("countries.json.gz").decode("utf-8") pdf = pd.read_json(data) pdf.columns = [col.replace(".", "_") for col in pdf.columns] if database.backend == "presto": diff --git a/superset/views/database/views.py b/superset/views/database/views.py index e887076a3227f..bb2e018994e44 100644 --- a/superset/views/database/views.py +++ b/superset/views/database/views.py @@ -142,7 +142,6 @@ def form_post(self, form: CsvToDatabaseForm) -> Response: try: df = pd.concat( pd.read_csv( - engine="pyarrow", chunksize=1000, encoding="utf-8", filepath_or_buffer=form.csv_file.data, diff --git a/tests/integration_tests/utils/csv_tests.py b/tests/integration_tests/utils/csv_tests.py index b6ae85970efb1..e514efb1d2108 100644 --- a/tests/integration_tests/utils/csv_tests.py +++ b/tests/integration_tests/utils/csv_tests.py @@ -67,7 +67,7 @@ def test_df_to_escaped_csv(): ] csv_str = "\n".join([",".join(row) for row in csv_rows]) - df = pd.read_csv(io.StringIO(csv_str), engine="pyarrow") + df = pd.read_csv(io.StringIO(csv_str)) escaped_csv_str = csv.df_to_escaped_csv(df, encoding="utf8", index=False) escaped_csv_rows = [row.split(",") for row in escaped_csv_str.strip().split("\n")] From c67d43ca4c24fb42ab1cfa29250bb44f03d252b9 Mon Sep 17 00:00:00 2001 From: Ville Brofeldt Date: Tue, 16 Aug 2022 08:06:04 +0300 Subject: [PATCH 11/12] make copy of selected df to fix regression --- superset/utils/pandas_postprocessing/contribution.py | 3 +++ tests/unit_tests/pandas_postprocessing/test_contribution.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/superset/utils/pandas_postprocessing/contribution.py b/superset/utils/pandas_postprocessing/contribution.py index 2bfc6f4be698f..86ef271397291 100644 --- a/superset/utils/pandas_postprocessing/contribution.py +++ b/superset/utils/pandas_postprocessing/contribution.py @@ -49,6 +49,9 @@ def contribution( """ contribution_df = df.copy() numeric_df = contribution_df.select_dtypes(include=["number", Decimal]) + # TODO: copy needed due to following regression in 1.4, remove if not needed: + # https://github.com/pandas-dev/pandas/issues/48090 + numeric_df = numeric_df.copy() numeric_df.fillna(0, inplace=True) # verify column selections if columns: diff --git a/tests/unit_tests/pandas_postprocessing/test_contribution.py 
b/tests/unit_tests/pandas_postprocessing/test_contribution.py index 2c4949270e087..7eb34c4d13f7b 100644 --- a/tests/unit_tests/pandas_postprocessing/test_contribution.py +++ b/tests/unit_tests/pandas_postprocessing/test_contribution.py @@ -74,7 +74,7 @@ def test_contribution(): rename_columns=["pct_a"], ) assert processed_df.columns.tolist() == ["a", "b", "c", "pct_a"] - assert_array_equal(processed_df["a"].tolist(), [1, 3, 0]) - assert_array_equal(processed_df["b"].tolist(), [1, 9, 0]) - assert_array_equal(processed_df["c"].tolist(), [0, 0, 0]) + assert_array_equal(processed_df["a"].tolist(), [1, 3, nan]) + assert_array_equal(processed_df["b"].tolist(), [1, 9, nan]) + assert_array_equal(processed_df["c"].tolist(), [nan, nan, nan]) assert processed_df["pct_a"].tolist() == [0.25, 0.75, 0] From 213bf7985a3db89e420218a0b4d536c5ca229d49 Mon Sep 17 00:00:00 2001 From: EugeneTorap Date: Tue, 16 Aug 2022 11:41:09 +0300 Subject: [PATCH 12/12] Simplify pd.read_json() and pd.read_csv() for example data --- superset/examples/bart_lines.py | 6 +++--- superset/examples/birth_names.py | 5 +++-- superset/examples/country_map.py | 8 +++----- superset/examples/energy.py | 6 +++--- superset/examples/flights.py | 10 +++++----- superset/examples/helpers.py | 18 +++--------------- superset/examples/long_lat.py | 6 +++--- superset/examples/multiformat_time_series.py | 6 +++--- superset/examples/paris.py | 6 +++--- superset/examples/random_time_series.py | 6 +++--- superset/examples/sf_population_polygons.py | 6 +++--- superset/examples/world_bank.py | 6 +++--- 12 files changed, 38 insertions(+), 51 deletions(-) diff --git a/superset/examples/bart_lines.py b/superset/examples/bart_lines.py index 5e542559e70cc..91257058be75a 100644 --- a/superset/examples/bart_lines.py +++ b/superset/examples/bart_lines.py @@ -23,7 +23,7 @@ from superset import db from ..utils.database import get_example_database -from .helpers import get_example_data, get_table_connector_registry +from .helpers import get_example_url, get_table_connector_registry def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None: @@ -34,8 +34,8 @@ def load_bart_lines(only_metadata: bool = False, force: bool = False) -> None: table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - content = get_example_data("bart-lines.json.gz").decode("utf-8") - df = pd.read_json(content, encoding="latin-1") + url = get_example_url("bart-lines.json.gz") + df = pd.read_json(url, encoding="latin-1", compression="gzip") df["path_json"] = df.path.map(json.dumps) df["polyline"] = df.path.map(polyline.encode) del df["path"] diff --git a/superset/examples/birth_names.py b/superset/examples/birth_names.py index 44ee32675a6ef..f8b8a8ecf7ca8 100644 --- a/superset/examples/birth_names.py +++ b/superset/examples/birth_names.py @@ -33,7 +33,7 @@ from ..utils.database import get_example_database from .helpers import ( - get_example_data, + get_example_url, get_slice_json, get_table_connector_registry, merge_slice, @@ -66,7 +66,8 @@ def gen_filter( def load_data(tbl_name: str, database: Database, sample: bool = False) -> None: - pdf = pd.read_json(get_example_data("birth_names2.json.gz").decode("utf-8")) + url = get_example_url("birth_names2.json.gz") + pdf = pd.read_json(url, compression="gzip") # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="ms") diff --git a/superset/examples/country_map.py 
b/superset/examples/country_map.py index c959a92085fc0..302b55180ea84 100644 --- a/superset/examples/country_map.py +++ b/superset/examples/country_map.py @@ -27,7 +27,7 @@ from superset.utils.core import DatasourceType from .helpers import ( - get_example_data, + get_example_url, get_slice_json, get_table_connector_registry, merge_slice, @@ -44,10 +44,8 @@ def load_country_map_data(only_metadata: bool = False, force: bool = False) -> N table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - csv_bytes = get_example_data( - "birth_france_data_for_country_map.csv", is_gzip=False, make_bytes=True - ) - data = pd.read_csv(csv_bytes, encoding="utf-8") + url = get_example_url("birth_france_data_for_country_map.csv") + data = pd.read_csv(url, encoding="utf-8") data["dttm"] = datetime.datetime.now().date() data.to_sql( tbl_name, diff --git a/superset/examples/energy.py b/superset/examples/energy.py index 78f194e966179..72b22525f2760 100644 --- a/superset/examples/energy.py +++ b/superset/examples/energy.py @@ -28,7 +28,7 @@ from superset.utils.core import DatasourceType from .helpers import ( - get_example_data, + get_example_url, get_table_connector_registry, merge_slice, misc_dash_slices, @@ -46,8 +46,8 @@ def load_energy( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("energy.json.gz").decode("utf-8") - pdf = pd.read_json(data) + url = get_example_url("energy.json.gz") + pdf = pd.read_json(url, compression="gzip") pdf = pdf.head(100) if sample else pdf pdf.to_sql( tbl_name, diff --git a/superset/examples/flights.py b/superset/examples/flights.py index 46fdc5c1d07a1..1389c65c9a901 100644 --- a/superset/examples/flights.py +++ b/superset/examples/flights.py @@ -20,7 +20,7 @@ import superset.utils.database as database_utils from superset import db -from .helpers import get_example_data, get_table_connector_registry +from .helpers import get_example_url, get_table_connector_registry def load_flights(only_metadata: bool = False, force: bool = False) -> None: @@ -32,12 +32,12 @@ def load_flights(only_metadata: bool = False, force: bool = False) -> None: table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("flight_data.csv.gz", make_bytes=True) - pdf = pd.read_csv(data, encoding="latin-1") + flight_data_url = get_example_url("flight_data.csv.gz") + pdf = pd.read_csv(flight_data_url, encoding="latin-1", compression="gzip") # Loading airports info to join and get lat/long - airports_bytes = get_example_data("airports.csv.gz", make_bytes=True) - airports = pd.read_csv(airports_bytes, encoding="latin-1") + airports_url = get_example_url("airports.csv.gz") + airports = pd.read_csv(airports_url, encoding="latin-1", compression="gzip") airports = airports.set_index("IATA_CODE") pdf[ # pylint: disable=unsupported-assignment-operation,useless-suppression diff --git a/superset/examples/helpers.py b/superset/examples/helpers.py index 2183a8d512f19..e26e05e49739a 100644 --- a/superset/examples/helpers.py +++ b/superset/examples/helpers.py @@ -17,10 +17,7 @@ """Loads datasets, dashboards and slices in a new superset instance""" import json import os -import zlib -from io import BytesIO -from typing import Union, Any, Dict, List, Set -from urllib import request +from typing import Any, Dict, List, Set from superset import app, db from superset.connectors.sqla.models import SqlaTable @@ -73,14 +70,5 @@ 
def get_slice_json(defaults: Dict[Any, Any], **kwargs: Any) -> str: return json.dumps(defaults_copy, indent=4, sort_keys=True) -def get_example_data( - filepath: str, is_gzip: bool = True, make_bytes: bool = False -) -> Union[bytes, BytesIO]: - content = request.urlopen( # pylint: disable=consider-using-with - f"{BASE_URL}{filepath}?raw=true" - ).read() - if is_gzip: - content = zlib.decompress(content, zlib.MAX_WBITS | 16) - if make_bytes: - content = BytesIO(content) - return content +def get_example_url(filepath: str) -> str: + return f"{BASE_URL}{filepath}?raw=true" diff --git a/superset/examples/long_lat.py b/superset/examples/long_lat.py index ba9824bb43fea..76f51a615951f 100644 --- a/superset/examples/long_lat.py +++ b/superset/examples/long_lat.py @@ -27,7 +27,7 @@ from superset.utils.core import DatasourceType from .helpers import ( - get_example_data, + get_example_url, get_slice_json, get_table_connector_registry, merge_slice, @@ -44,8 +44,8 @@ def load_long_lat_data(only_metadata: bool = False, force: bool = False) -> None table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("san_francisco.csv.gz", make_bytes=True) - pdf = pd.read_csv(data, encoding="utf-8") + url = get_example_url("san_francisco.csv.gz") + pdf = pd.read_csv(url, encoding="utf-8", compression="gzip") start = datetime.datetime.now().replace( hour=0, minute=0, second=0, microsecond=0 ) diff --git a/superset/examples/multiformat_time_series.py b/superset/examples/multiformat_time_series.py index 1209ff184941d..62e16d2cb0881 100644 --- a/superset/examples/multiformat_time_series.py +++ b/superset/examples/multiformat_time_series.py @@ -25,7 +25,7 @@ from ..utils.database import get_example_database from .helpers import ( - get_example_data, + get_example_url, get_slice_json, get_table_connector_registry, merge_slice, @@ -44,8 +44,8 @@ def load_multiformat_time_series( # pylint: disable=too-many-locals table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("multiformat_time_series.json.gz").decode("utf-8") - pdf = pd.read_json(data) + url = get_example_url("multiformat_time_series.json.gz") + pdf = pd.read_json(url, compression="gzip") # TODO(bkyryliuk): move load examples data into the pytest fixture if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="s") diff --git a/superset/examples/paris.py b/superset/examples/paris.py index 9fa2fedb5e46e..c323007028523 100644 --- a/superset/examples/paris.py +++ b/superset/examples/paris.py @@ -22,7 +22,7 @@ import superset.utils.database as database_utils from superset import db -from .helpers import get_example_data, get_table_connector_registry +from .helpers import get_example_url, get_table_connector_registry def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> None: @@ -33,8 +33,8 @@ def load_paris_iris_geojson(only_metadata: bool = False, force: bool = False) -> table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("paris_iris.json.gz").decode("utf-8") - df = pd.read_json(data) + url = get_example_url("paris_iris.json.gz") + df = pd.read_json(url, compression="gzip") df["features"] = df.features.map(json.dumps) df.to_sql( diff --git a/superset/examples/random_time_series.py b/superset/examples/random_time_series.py index a7972e70d7f41..4a2628df7a074 100644 --- 
a/superset/examples/random_time_series.py +++ b/superset/examples/random_time_series.py @@ -24,7 +24,7 @@ from superset.utils.core import DatasourceType from .helpers import ( - get_example_data, + get_example_url, get_slice_json, get_table_connector_registry, merge_slice, @@ -42,8 +42,8 @@ def load_random_time_series_data( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("random_time_series.json.gz").decode("utf-8") - pdf = pd.read_json(data) + url = get_example_url("random_time_series.json.gz") + pdf = pd.read_json(url, compression="gzip") if database.backend == "presto": pdf.ds = pd.to_datetime(pdf.ds, unit="s") pdf.ds = pdf.ds.dt.strftime("%Y-%m-%d %H:%M%:%S") diff --git a/superset/examples/sf_population_polygons.py b/superset/examples/sf_population_polygons.py index c5cdc0707c95f..71ba34401af92 100644 --- a/superset/examples/sf_population_polygons.py +++ b/superset/examples/sf_population_polygons.py @@ -22,7 +22,7 @@ import superset.utils.database as database_utils from superset import db -from .helpers import get_example_data, get_table_connector_registry +from .helpers import get_example_url, get_table_connector_registry def load_sf_population_polygons( @@ -35,8 +35,8 @@ def load_sf_population_polygons( table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("sf_population.json.gz").decode("utf-8") - df = pd.read_json(data) + url = get_example_url("sf_population.json.gz") + df = pd.read_json(url, compression="gzip") df["contour"] = df.contour.map(json.dumps) df.to_sql( diff --git a/superset/examples/world_bank.py b/superset/examples/world_bank.py index 2531ba34640a7..4a18f806eae56 100644 --- a/superset/examples/world_bank.py +++ b/superset/examples/world_bank.py @@ -33,7 +33,7 @@ from ..connectors.base.models import BaseDatasource from .helpers import ( - get_example_data, + get_example_url, get_examples_folder, get_slice_json, get_table_connector_registry, @@ -56,8 +56,8 @@ def load_world_bank_health_n_pop( # pylint: disable=too-many-locals, too-many-s table_exists = database.has_table_by_name(tbl_name) if not only_metadata and (not table_exists or force): - data = get_example_data("countries.json.gz").decode("utf-8") - pdf = pd.read_json(data) + url = get_example_url("countries.json.gz") + pdf = pd.read_json(url, compression="gzip") pdf.columns = [col.replace(".", "_") for col in pdf.columns] if database.backend == "presto": pdf.year = pd.to_datetime(pdf.year)
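
Note on the pattern adopted in [PATCH 12/12]: the example loaders stop downloading and decompressing data by hand (urlopen + zlib + BytesIO) and instead pass a URL straight to pandas, letting it handle gzip via compression="gzip". A minimal sketch of that pattern is below; the base URL is a placeholder, not taken from the patch (helpers.py defines the real BASE_URL), and the dataset name is only illustrative.

    # Sketch of the simplified example-data loading introduced in PATCH 12.
    # Assumes pandas >= 1.4, which can read a gzipped JSON file directly from a URL.
    import pandas as pd

    # Hypothetical base URL; superset/examples/helpers.py defines the actual one.
    BASE_URL = "https://example.com/examples-data/"

    def get_example_url(filepath: str) -> str:
        # Mirrors the helper added in PATCH 12: build the raw-file URL for a dataset.
        return f"{BASE_URL}{filepath}?raw=true"

    # pandas fetches the URL and decompresses the gzip stream on the fly,
    # replacing the old get_example_data() urlopen/zlib/BytesIO plumbing.
    url = get_example_url("energy.json.gz")
    pdf = pd.read_json(url, compression="gzip")
    print(pdf.head())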