Skip to content

Commit

Permalink
[data] Update the strict mode message to be less confusing (ray-proje…
Browse files Browse the repository at this point in the history
  • Loading branch information
ericl authored and architkulkarni committed May 16, 2023
1 parent 256c035 commit 6df3629
Show file tree
Hide file tree
Showing 7 changed files with 23 additions and 31 deletions.
7 changes: 2 additions & 5 deletions doc/source/data/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -287,17 +287,14 @@ Ray Data doesn't perform query optimization, so some manual performance
tuning may be necessary depending on your use case and data scale. Please see our
:ref:`performance tuning guide <data_performance_tips>` for more information.

What is strict mode?
====================
Migrating to strict mode
========================

In Ray 2.5, Ray Data requires data schemas by default, dropping support for
standalone Python objects. In addition to unification and simplicity benefits, this
aligns the Ray Data API more closely with industry-standard distributed data APIs like
Apache Spark and with emerging standards for machine learning datasets like HuggingFace.

Migrating to strict mode
~~~~~~~~~~~~~~~~~~~~~~~~

You can disable strict mode temporarily by setting the environment variable
``RAY_DATA_STRICT_MODE=0`` on all cluster processes. It will not be possible to
disable strict mode in future releases.
Expand Down
4 changes: 2 additions & 2 deletions python/ray/data/_internal/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ def __init__(
if legacy_min_size is not None or legacy_max_size is not None:
if ctx.strict_mode:
raise StrictModeError(
"In strict mode, ActorPoolStrategy requires min_size and "
"In Ray 2.5, ActorPoolStrategy requires min_size and "
"max_size to be explicit kwargs."
)
else:
Expand Down Expand Up @@ -503,7 +503,7 @@ def get_compute(compute_spec: Union[str, ComputeStrategy]) -> ComputeStrategy:
compute_spec, (TaskPoolStrategy, ActorPoolStrategy)
):
raise StrictModeError(
"In strict mode, the compute spec must be either "
"In Ray 2.5, the compute spec must be either "
f"TaskPoolStrategy or ActorPoolStategy, was: {compute_spec}."
)
elif not compute_spec or compute_spec == "tasks":
Expand Down
2 changes: 1 addition & 1 deletion python/ray/data/_internal/planner/map_rows.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def fn(
raise StrictModeError(
f"Error validating {_truncated_repr(item)}: "
"Standalone Python objects are not "
"allowed in strict mode. To return Python objects from map(), "
"allowed in Ray 2.5. To return Python objects from map(), "
"wrap them in a dict, e.g., "
"return `{'item': item}` instead of just `item`."
)
Expand Down
23 changes: 11 additions & 12 deletions python/ray/data/block.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,13 @@

STRICT_MODE_EXPLANATION = (
colorama.Fore.YELLOW
+ "[IMPORTANT]: Ray Data strict mode is on by default in Ray 2.5. When in strict "
"mode, data schemas are required, standalone Python "
"objects are no longer supported, and the default batch format changes to `numpy` "
"from `pandas`. To disable strict mode temporarily, set the environment variable "
"RAY_DATA_STRICT_MODE=0 on all cluster processes. Strict mode will not be "
"possible to disable in future releases.\n\n"
"Learn more here: https://docs.ray.io/en/master/data/faq.html#what-is-strict-mode"
+ colorama.Style.RESET_ALL
+ "Important: Ray Data requires schemas for all datasets in Ray 2.5. This means "
"that standalone Python objects are no longer supported. In addition, the default "
"batch format is fixed to NumPy. To revert to legacy behavior temporarily, "
"set the "
"environment variable RAY_DATA_STRICT_MODE=0 on all cluster processes.\n\n"
"Learn more here: https://docs.ray.io/en/master/data/faq.html#"
"migrating-to-strict-mode" + colorama.Style.RESET_ALL
)


Expand Down Expand Up @@ -92,7 +91,7 @@ def _validate_key_fn(
"schema '{}'.".format(key, schema)
)
elif ctx.strict_mode:
raise StrictModeError(f"In strict mode, the key must be a string, was: {key}")
raise StrictModeError(f"In Ray 2.5, the key must be a string, was: {key}")
elif key is None:
if not is_simple_format:
raise ValueError(
Expand Down Expand Up @@ -161,7 +160,7 @@ def _apply_strict_mode_batch_format(given_batch_format: Optional[str]) -> str:
if given_batch_format not in VALID_BATCH_FORMATS_STRICT_MODE:
raise StrictModeError(
f"The given batch format {given_batch_format} is not allowed "
f"in strict mode (must be one of {VALID_BATCH_FORMATS_STRICT_MODE})."
f"in Ray 2.5 (must be one of {VALID_BATCH_FORMATS_STRICT_MODE})."
)
return given_batch_format

Expand Down Expand Up @@ -424,7 +423,7 @@ def batch_to_block(batch: DataBatch) -> Block:
raise StrictModeError(
f"Error validating {_truncated_repr(batch)}: "
"Standalone numpy arrays are not "
"allowed in strict mode. Return a dict of field -> array, "
"allowed in Ray 2.5. Return a dict of field -> array, "
"e.g., `{'data': array}` instead of `array`."
)

Expand Down Expand Up @@ -472,7 +471,7 @@ def for_block(block: Block) -> "BlockAccessor[T]":
raise StrictModeError(
f"Error validating {_truncated_repr(block)}: "
"Standalone Python objects are not "
"allowed in strict mode. To use Python objects in a datastream, "
"allowed in Ray 2.5. To use Python objects in a datastream, "
"wrap them in a dict of numpy arrays, e.g., "
"return `{'item': np.array(batch)}` instead of just `batch`."
)
Expand Down
8 changes: 3 additions & 5 deletions python/ray/data/datastream.py
Original file line number Diff line number Diff line change
Expand Up @@ -2609,7 +2609,7 @@ def write_numpy(
context = DataContext.get_current()
if context.strict_mode and not column:
raise StrictModeError(
"In strict mode, the column must be specified "
"In Ray 2.5, the column must be specified "
"(e.g., `write_numpy(column='data')`)."
)
column = column or TENSOR_COLUMN_NAME
Expand Down Expand Up @@ -4116,9 +4116,7 @@ def _divide(self, block_idx: int) -> ("Datastream", "Datastream"):
def default_batch_format(self) -> Type:
context = DataContext.get_current()
if context.strict_mode:
raise StrictModeError(
"default_batch_format() is not allowed in strict mode"
)
raise StrictModeError("default_batch_format() is not allowed in Ray 2.5")

import pandas as pd
import pyarrow as pa
Expand All @@ -4138,7 +4136,7 @@ def default_batch_format(self) -> Type:
def dataset_format(self) -> BlockFormat:
context = DataContext.get_current()
if context.strict_mode:
raise StrictModeError("dataset_format() is not allowed in strict mode")
raise StrictModeError("dataset_format() is not allowed in Ray 2.5")

if context.use_streaming_executor:
raise DeprecationWarning(
Expand Down
4 changes: 1 addition & 3 deletions python/ray/data/read_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,9 +245,7 @@ def range(n: int, *, parallelism: int = -1) -> Datastream:
def range_table(n: int, *, parallelism: int = -1) -> Datastream:
ctx = ray.data.DataContext.get_current()
if ctx.strict_mode:
raise DeprecationWarning(
"In strict mode, use range() instead of range_table()."
)
raise DeprecationWarning("In Ray 2.5, use range() instead of range_table().")
return read_datasource(
RangeDatasource(),
parallelism=parallelism,
Expand Down
6 changes: 3 additions & 3 deletions python/ray/data/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@ def __init__(self):
self.infos = []

def warning(self, msg):
if "strict mode" in msg:
if "STRICT_MODE" in msg:
return
self.warnings.append(msg)
print("warning:", msg)

def info(self, msg):
if "strict mode" in msg:
if "STRICT_MODE" in msg:
return
self.infos.append(msg)
print("info:", msg)

def debug(self, msg):
if "strict mode" in msg:
if "STRICT_MODE" in msg:
return
print("debug:", msg)

Expand Down

0 comments on commit 6df3629

Please sign in to comment.