Skip to content

Commit

Permalink
[data] Change fixture from shutdown_only to `ray_start_regular_shar…
Browse files Browse the repository at this point in the history
…ed` for `test_csv_read_filter_non_csv_file` (ray-project#47513)

## Why are these changes needed?
It seems that ray-project#47467 ended up
breaking some niche setup for this test; by changing the fixture from
`shutdown_only` to `ray_start_regular_shared`, we are able to get the
test passing again.

## Related issue number

## Checks

- [x] I've signed off every commit (by using the -s flag, i.e., `git
commit -s`) in this PR.
- [x] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for
https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I
added a
method in Tune, I've added it in `doc/source/tune/api/` under the
           corresponding `.rst` file.
- [x] I've made sure the tests are passing. Note that there might be a
few flaky tests, see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
   - [x] Unit tests
   - [ ] Release tests
   - [ ] This PR is not tested :(

Signed-off-by: Matthew Owen <mowen@anyscale.com>
Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
  • Loading branch information
omatthew98 authored and ujjawal-khare committed Oct 15, 2024
1 parent 9e7e855 commit 75c03f2
Showing 1 changed file with 47 additions and 47 deletions.
94 changes: 47 additions & 47 deletions python/ray/data/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,53 +745,7 @@ def test_csv_roundtrip(ray_start_regular_shared, fs, data_path):
BlockAccessor.for_block(ray.get(block)).size_bytes() == meta.size_bytes


# NOTE: The last test using the shared ray_start_regular_shared cluster must use the
# shutdown_only fixture so the shared cluster is shut down, otherwise the below
# test_write_datasink_ray_remote_args test, which uses a cluster_utils cluster, will
# fail with a double-init.
def test_csv_read_no_header(shutdown_only, tmp_path):
    """A headerless CSV read with explicit column names round-trips the data."""
    from pyarrow import csv

    path = os.path.join(tmp_path, "test.csv")
    expected = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    expected.to_csv(path, index=False, header=False)
    # Column names must be supplied via ReadOptions since the file has no header row.
    ds = ray.data.read_csv(
        path,
        read_options=csv.ReadOptions(column_names=["one", "two"]),
    )
    assert expected.equals(ds.to_pandas())


def test_csv_read_with_column_type_specified(shutdown_only, tmp_path):
    """Explicit column_types in ConvertOptions control how CSV values parse."""
    from pyarrow import csv

    path = os.path.join(tmp_path, "test.csv")
    frame = pd.DataFrame({"one": [1, 2, 3e1], "two": ["a", "b", "c"]})
    frame.to_csv(path, index=False)

    # Scientific notation is written out as a double, so forcing the column
    # to int64 must raise when the schema is resolved.
    with pytest.raises(ValueError):
        ray.data.read_csv(
            path,
            convert_options=csv.ConvertOptions(
                column_types={"one": "int64", "two": "string"}
            ),
        ).schema()

    # The same data parses cleanly when the column is declared float64.
    ds = ray.data.read_csv(
        path,
        convert_options=csv.ConvertOptions(
            column_types={"one": "float64", "two": "string"}
        ),
    )
    assert ds.to_pandas().equals(
        pd.DataFrame({"one": [1.0, 2.0, 30.0], "two": ["a", "b", "c"]})
    )


def test_csv_read_filter_non_csv_file(shutdown_only, tmp_path):
def test_csv_read_filter_non_csv_file(ray_start_regular_shared, tmp_path):
df = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})

# CSV file with .csv extension.
Expand Down Expand Up @@ -840,6 +794,52 @@ def test_csv_read_filter_non_csv_file(shutdown_only, tmp_path):
assert ds.to_pandas().equals(df)


# NOTE: The last test using the shared ray_start_regular_shared cluster must use the
# shutdown_only fixture so the shared cluster is shut down, otherwise the below
# test_write_datasink_ray_remote_args test, which uses a cluster_utils cluster, will
# fail with a double-init.
def test_csv_read_no_header(shutdown_only, tmp_path):
    """Reading a CSV that lacks a header row should yield the original frame."""
    from pyarrow import csv

    csv_path = os.path.join(tmp_path, "test.csv")
    source = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    source.to_csv(csv_path, index=False, header=False)
    # Supply the column names explicitly; the file itself carries none.
    read_opts = csv.ReadOptions(column_names=["one", "two"])
    result = ray.data.read_csv(csv_path, read_options=read_opts).to_pandas()
    assert source.equals(result)


def test_csv_read_with_column_type_specified(shutdown_only, tmp_path):
    """User-specified column types are honored (and enforced) during CSV reads."""
    from pyarrow import csv

    csv_path = os.path.join(tmp_path, "test.csv")
    source = pd.DataFrame({"one": [1, 2, 3e1], "two": ["a", "b", "c"]})
    source.to_csv(csv_path, index=False)

    # 3e1 is serialized in scientific notation, which PyArrow treats as a
    # double — declaring the column int64 must therefore fail.
    bad_opts = csv.ConvertOptions(column_types={"one": "int64", "two": "string"})
    with pytest.raises(ValueError):
        ray.data.read_csv(csv_path, convert_options=bad_opts).schema()

    # Declaring the column float64 parses the scientific notation correctly.
    good_opts = csv.ConvertOptions(column_types={"one": "float64", "two": "string"})
    ds = ray.data.read_csv(csv_path, convert_options=good_opts)
    expected = pd.DataFrame({"one": [1.0, 2.0, 30.0], "two": ["a", "b", "c"]})
    assert ds.to_pandas().equals(expected)


@pytest.mark.skipif(
Version(pa.__version__) < Version("7.0.0"),
reason="invalid_row_handler was added in pyarrow 7.0.0",
Expand Down

0 comments on commit 75c03f2

Please sign in to comment.