Skip to content

Commit

Permalink
[data] Change fixture from shutdown_only to `ray_start_regular_shar…
Browse files Browse the repository at this point in the history
…ed` for `test_csv_read_filter_non_csv_file` (ray-project#47513)

## Why are these changes needed?
It seems that ray-project#47467 ended up
breaking some niche setup for this test; by changing the fixture from
`shutdown_only` to `ray_start_regular_shared`, we are able to get the
test passing again.

## Related issue number

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git
commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for
https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I
added a
method in Tune, I've added it in `doc/source/tune/api/` under the
           corresponding `.rst` file.
- [x] I've made sure the tests are passing. Note that there might be a
few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [x] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

Signed-off-by: Matthew Owen <mowen@anyscale.com>
Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
  • Loading branch information
omatthew98 authored and ujjawal-khare committed Oct 15, 2024
1 parent 9e7e855 commit 75c03f2
Showing 1 changed file with 47 additions and 47 deletions.
94 changes: 47 additions & 47 deletions python/ray/data/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,53 +745,7 @@ def test_csv_roundtrip(ray_start_regular_shared, fs, data_path):
BlockAccessor.for_block(ray.get(block)).size_bytes() == meta.size_bytes


# NOTE: The last test using the shared ray_start_regular_shared cluster must use the
# shutdown_only fixture so the shared cluster is shut down, otherwise the below
# test_write_datasink_ray_remote_args test, which uses a cluster_utils cluster, will
# fail with a double-init.
def test_csv_read_no_header(shutdown_only, tmp_path):
    """A headerless CSV read with explicit column names round-trips the data."""
    from pyarrow import csv

    path = os.path.join(tmp_path, "test.csv")
    expected = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    expected.to_csv(path, index=False, header=False)
    # Column names must be supplied via ReadOptions since the file has no header row.
    ds = ray.data.read_csv(
        path,
        read_options=csv.ReadOptions(column_names=["one", "two"]),
    )
    assert expected.equals(ds.to_pandas())


def test_csv_read_with_column_type_specified(shutdown_only, tmp_path):
    """Explicit column_types in ConvertOptions control how CSV values parse."""
    from pyarrow import csv

    path = os.path.join(tmp_path, "test.csv")
    frame = pd.DataFrame({"one": [1, 2, 3e1], "two": ["a", "b", "c"]})
    frame.to_csv(path, index=False)

    # Scientific notation is written out as a double, so forcing the column
    # to int64 must raise when the schema is resolved.
    with pytest.raises(ValueError):
        ray.data.read_csv(
            path,
            convert_options=csv.ConvertOptions(
                column_types={"one": "int64", "two": "string"}
            ),
        ).schema()

    # The same data parses cleanly when the column is declared float64.
    ds = ray.data.read_csv(
        path,
        convert_options=csv.ConvertOptions(
            column_types={"one": "float64", "two": "string"}
        ),
    )
    assert ds.to_pandas().equals(
        pd.DataFrame({"one": [1.0, 2.0, 30.0], "two": ["a", "b", "c"]})
    )


def test_csv_read_filter_non_csv_file(shutdown_only, tmp_path):
def test_csv_read_filter_non_csv_file(ray_start_regular_shared, tmp_path):
df = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})

# CSV file with .csv extension.
Expand Down Expand Up @@ -840,6 +794,52 @@ def test_csv_read_filter_non_csv_file(shutdown_only, tmp_path):
assert ds.to_pandas().equals(df)


# NOTE: The last test using the shared ray_start_regular_shared cluster must use the
# shutdown_only fixture so the shared cluster is shut down, otherwise the below
# test_write_datasink_ray_remote_args test, which uses a cluster_utils cluster, will
# fail with a double-init.
def test_csv_read_no_header(shutdown_only, tmp_path):
    """Reading a CSV that lacks a header row should yield the original frame."""
    from pyarrow import csv

    csv_path = os.path.join(tmp_path, "test.csv")
    source = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    source.to_csv(csv_path, index=False, header=False)
    # Supply the column names explicitly; the file itself carries none.
    read_opts = csv.ReadOptions(column_names=["one", "two"])
    result = ray.data.read_csv(csv_path, read_options=read_opts).to_pandas()
    assert source.equals(result)


def test_csv_read_with_column_type_specified(shutdown_only, tmp_path):
    """User-specified column types are honored (and enforced) during CSV reads."""
    from pyarrow import csv

    csv_path = os.path.join(tmp_path, "test.csv")
    source = pd.DataFrame({"one": [1, 2, 3e1], "two": ["a", "b", "c"]})
    source.to_csv(csv_path, index=False)

    # 3e1 is serialized in scientific notation, which PyArrow treats as a
    # double — declaring the column int64 must therefore fail.
    bad_opts = csv.ConvertOptions(column_types={"one": "int64", "two": "string"})
    with pytest.raises(ValueError):
        ray.data.read_csv(csv_path, convert_options=bad_opts).schema()

    # Declaring the column float64 parses the scientific notation correctly.
    good_opts = csv.ConvertOptions(column_types={"one": "float64", "two": "string"})
    ds = ray.data.read_csv(csv_path, convert_options=good_opts)
    expected = pd.DataFrame({"one": [1.0, 2.0, 30.0], "two": ["a", "b", "c"]})
    assert ds.to_pandas().equals(expected)


@pytest.mark.skipif(
Version(pa.__version__) < Version("7.0.0"),
reason="invalid_row_handler was added in pyarrow 7.0.0",
Expand Down

0 comments on commit 75c03f2

Please sign in to comment.