From 8fdd2a43fb72a08f4e27b5f1229c1c3b8eb92af6 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Tue, 12 Nov 2019 12:03:55 -0800 Subject: [PATCH] fix(automl): fix uploading pandas dataframe to AutoML Tables (#9647) pandas.DataFrame.to_csv() by default exports the data index as a column with an empty column name. This causes uploading the exported CSV file to fail, because AutoML Tables does not allow empty column names. The data index is also not useful for training the model, so this PR fixes the problem by setting the index argument to False so that the index is not exported. --- automl/google/cloud/automl_v1beta1/tables/gcs_client.py | 7 ++++++- automl/tests/unit/gapic/v1beta1/test_gcs_client_v1beta1.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/automl/google/cloud/automl_v1beta1/tables/gcs_client.py b/automl/google/cloud/automl_v1beta1/tables/gcs_client.py index e5de17c3b0c9..99d40da2867c 100644 --- a/automl/google/cloud/automl_v1beta1/tables/gcs_client.py +++ b/automl/google/cloud/automl_v1beta1/tables/gcs_client.py @@ -132,7 +132,12 @@ def upload_pandas_dataframe(self, dataframe, uploaded_csv_name=None): uploaded_csv_name = "automl-tables-dataframe-{}.csv".format( int(time.time()) ) - csv_string = dataframe.to_csv() + + # Setting index to False to ignore exporting the data index: + # 1. The resulting column name for the index column is empty, AutoML + # Tables does not allow empty column name + # 2. 
The index is not an useful training information + csv_string = dataframe.to_csv(index=False) bucket = self.client.get_bucket(self.bucket_name) blob = bucket.blob(uploaded_csv_name) diff --git a/automl/tests/unit/gapic/v1beta1/test_gcs_client_v1beta1.py b/automl/tests/unit/gapic/v1beta1/test_gcs_client_v1beta1.py index f7a2e27ab7d8..222fca3244ee 100644 --- a/automl/tests/unit/gapic/v1beta1/test_gcs_client_v1beta1.py +++ b/automl/tests/unit/gapic/v1beta1/test_gcs_client_v1beta1.py @@ -139,7 +139,7 @@ def test_upload_pandas_dataframe(self): gcs_client.client.get_bucket.assert_called_with("my-bucket") mock_bucket.blob.assert_called_with("my-file.csv") - mock_blob.upload_from_string.assert_called_with(",col1,col2\n0,1,3\n1,2,4\n") + mock_blob.upload_from_string.assert_called_with("col1,col2\n1,3\n2,4\n") assert gcs_uri == "gs://my-bucket/my-file.csv" def test_upload_pandas_dataframe_no_csv_name(self): @@ -156,7 +156,7 @@ def test_upload_pandas_dataframe_no_csv_name(self): gcs_client.client.get_bucket.assert_called_with("my-bucket") mock_bucket.blob.assert_called_with(generated_csv_name) - mock_blob.upload_from_string.assert_called_with(",col1,col2\n0,1,3\n1,2,4\n") + mock_blob.upload_from_string.assert_called_with("col1,col2\n1,3\n2,4\n") assert re.match("^gs://my-bucket/automl-tables-dataframe-[0-9]*.csv$", gcs_uri) def test_upload_pandas_dataframe_not_type_dataframe(self):