
Commit

Merge branch 'branch-24.12' of github.com:rapidsai/cudf into ci-conda-installs
jameslamb committed Oct 8, 2024
2 parents 278ecec + 553d8ec commit 911e9e4
Showing 17 changed files with 153 additions and 112 deletions.
88 changes: 50 additions & 38 deletions cpp/src/io/json/process_tokens.cu
@@ -22,6 +22,7 @@
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/detail/tokenize_json.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
@@ -87,38 +88,41 @@ void validate_token_stream(device_span<char const> d_input,
{
CUDF_FUNC_RANGE();
if (!options.is_strict_validation()) { return; }

rmm::device_uvector<bool> d_invalid = cudf::detail::make_zeroed_device_uvector_async<bool>(
tokens.size(), stream, cudf::get_current_device_resource_ref());

using token_t = cudf::io::json::token_t;
cudf::detail::optional_trie trie_na =
cudf::detail::create_serialized_trie(options.get_na_values(), stream);
auto trie_na_view = cudf::detail::make_trie_view(trie_na);
auto literals = options.get_na_values();
literals.emplace_back("null"); // added these too to single trie
literals.emplace_back("true");
literals.emplace_back("false");

cudf::detail::optional_trie trie_literals =
cudf::detail::create_serialized_trie(literals, stream);
cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie(
{"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream);

auto validate_values = cuda::proclaim_return_type<bool>(
[data = d_input.data(),
trie_na = trie_na_view,
trie_literals = cudf::detail::make_trie_view(trie_literals),
trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric),
allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
allow_nonnumeric =
options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
SymbolOffsetT end) -> bool {
// This validates an unquoted value. A value must match https://www.json.org/json-en.html
// but the leading and trailing whitespace should already have been removed, and is not
// a string
auto c = data[start];
auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
if (is_null_literal) {
return true;
} else if ('n' == c) {
return substr_eq(data, start, end, 4, "null");
} else if ('t' == c) {
return substr_eq(data, start, end, 4, "true");
} else if ('f' == c) {
return substr_eq(data, start, end, 5, "false");
} else if (allow_nonnumeric && c == 'N') {
return substr_eq(data, start, end, 3, "NaN");
} else if (allow_nonnumeric && c == 'I') {
return substr_eq(data, start, end, 8, "Infinity");
} else if (allow_nonnumeric && c == '+') {
return substr_eq(data, start, end, 4, "+INF") ||
substr_eq(data, start, end, 9, "+Infinity");
} else if ('-' == c || c <= '9' && 'c' >= '0') {
auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start});
if (is_literal) { return true; }
if (allow_nonnumeric) {
auto const is_nonnumeric =
serialized_trie_contains(trie_nonnumeric, {data + start, end - start});
if (is_nonnumeric) { return true; }
}
auto c = data[start];
if ('-' == c || c <= '9' && 'c' >= '0') {
// number
auto num_state = number_state::START;
for (auto at = start; at < end; at++) {
@@ -140,9 +144,6 @@
num_state = number_state::LEADING_ZERO;
} else if (c >= '1' && c <= '9') {
num_state = number_state::WHOLE;
} else if (allow_nonnumeric && 'I' == c) {
return substr_eq(data, start, end, 4, "-INF") ||
substr_eq(data, start, end, 9, "-Infinity");
} else {
return false;
}
@@ -273,33 +274,44 @@ void validate_token_stream(device_span<char const> d_input,

auto num_tokens = tokens.size();
auto count_it = thrust::make_counting_iterator(0);
auto predicate = [tokens = tokens.begin(),
token_indices = token_indices.begin(),
validate_values,
validate_strings] __device__(auto i) -> bool {
auto predicate = cuda::proclaim_return_type<bool>([tokens = tokens.begin(),
token_indices = token_indices.begin(),
validate_values,
validate_strings] __device__(auto i) -> bool {
if (tokens[i] == token_t::ValueEnd) {
return !validate_values(token_indices[i - 1], token_indices[i]);
} else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
return !validate_strings(token_indices[i - 1], token_indices[i]);
}
return false;
};
});

auto conditional_invalidout_it =
cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type<void>(
[d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void {
if (x) { d_invalid[i] = true; }
}));
thrust::transform(rmm::exec_policy_nosync(stream),
count_it,
count_it + num_tokens,
conditional_invalidout_it,
predicate);

using scan_type = write_if::scan_type;
auto conditional_write = write_if{tokens.begin(), num_tokens};
auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
auto transform_op = cuda::proclaim_return_type<scan_type>(
[predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
});
auto binary_op = cuda::proclaim_return_type<scan_type>(
auto binary_op = cuda::proclaim_return_type<scan_type>(
[] __device__(scan_type prev, scan_type curr) -> scan_type {
auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
return {(curr.second ? curr.first : op_result), prev.second | curr.second};
});
auto transform_op = cuda::proclaim_return_type<scan_type>(
[d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type {
if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
});

thrust::transform_inclusive_scan(rmm::exec_policy(stream),
thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream),
count_it,
count_it + num_tokens,
conditional_output_it,
17 changes: 17 additions & 0 deletions docs/cudf/source/developer_guide/testing.md
@@ -7,6 +7,23 @@ specifically the [`pytest-cov`](https://github.com/pytest-dev/pytest-cov) plugin
Code coverage reports are uploaded to [Codecov](https://app.codecov.io/gh/rapidsai/cudf).
Each PR also indicates whether it increases or decreases test coverage.

### Configuring pytest

Pytest accepts configuration in [multiple different
files](https://docs.pytest.org/en/stable/reference/customize.html),
with a specified discovery and precedence order. Note in particular
that there is no automatic "include" mechanism: as soon as a matching
configuration file is found, discovery stops.

So that all tool configuration lives in the same place, we prefer
`pyproject.toml`-based configuration. Test configuration for a given
package should live in that package's `pyproject.toml` file.

Where tests do not naturally belong to a project, for example the
`cudf.pandas` integration tests and the cuDF benchmarks, use a
`pytest.ini` file as close to the tests as possible.
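
As a sketch only, a package-level configuration of this kind might look like the
following (the option values mirror those used in the `pyproject.toml` files in
this repository; each package's own file remains the authoritative source):

```toml
# Sketch of package-level pytest configuration in pyproject.toml.
# Tests that live outside a package would carry the same options
# under a [pytest] section of a nearby pytest.ini instead.
[tool.pytest.ini_options]
addopts = "--tb=native --strict-config --strict-markers"
empty_parameter_set_mark = "fail_at_collect"
filterwarnings = [
    "error",  # escalate warnings to errors by default
]
xfail_strict = true
```

With `--strict-config` and `--strict-markers`, unrecognized configuration keys
and unregistered markers are reported as errors rather than being silently
ignored.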

## Test organization

How tests are organized depends on which of the following two groups they fall into:
19 changes: 0 additions & 19 deletions python/cudf/cudf/tests/pytest.ini

This file was deleted.

9 changes: 9 additions & 0 deletions python/cudf/cudf_pandas_tests/pytest.ini
@@ -0,0 +1,9 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

# Note, this config file overrides the default "cudf" test config in
# ../pyproject.toml. We do so deliberately because we have different
# treatment of markers and warnings.
[pytest]
addopts = --tb=native --strict-config --strict-markers
empty_parameter_set_mark = fail_at_collect
xfail_strict = true
41 changes: 24 additions & 17 deletions python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import collections
import contextlib
import copy
import datetime
import operator
@@ -21,10 +22,15 @@
import pyarrow as pa
import pytest
from nbconvert.preprocessors import ExecutePreprocessor
from numba import NumbaDeprecationWarning, vectorize
from numba import (
NumbaDeprecationWarning,
__version__ as numba_version,
vectorize,
)
from packaging import version
from pytz import utc

from cudf.core._compat import PANDAS_GE_220
from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220, PANDAS_VERSION
from cudf.pandas import LOADED, Profiler
from cudf.pandas.fast_slow_proxy import (
ProxyFallbackError,
@@ -52,8 +58,6 @@
get_calendar,
)

from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION

# Accelerated pandas has the real pandas and cudf modules as attributes
pd = xpd._fsproxy_slow
cudf = xpd._fsproxy_fast
@@ -622,10 +626,6 @@ def test_array_function_series_fallback(series):
tm.assert_equal(expect, got)


@pytest.mark.xfail(
PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
reason="Fails in older versions of pandas",
)
def test_timedeltaproperties(series):
psr, sr = series
psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]")
@@ -685,10 +685,6 @@ def test_maintain_container_subclasses(multiindex):
assert isinstance(got, xpd.core.indexes.frozen.FrozenList)


@pytest.mark.xfail(
PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
reason="Fails in older versions of pandas due to unsupported boxcar window type",
)
def test_rolling_win_type():
pdf = pd.DataFrame(range(5))
df = xpd.DataFrame(range(5))
@@ -697,8 +693,14 @@ def test_rolling_win_type():
tm.assert_equal(result, expected)


@pytest.mark.skip(
reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009"
@pytest.mark.skipif(
version.parse(numba_version) < version.parse("0.59"),
reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009",
)
@pytest.mark.xfail(
version.parse(numba_version) >= version.parse("0.59")
and PANDAS_VERSION < version.parse("2.1"),
reason="numba.generated_jit removed in 0.59, requires pandas >= 2.1",
)
def test_rolling_apply_numba_engine():
def weighted_mean(x):
@@ -709,7 +711,12 @@ def weighted_mean(x):
pdf = pd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]])
df = xpd.DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]])

with pytest.warns(NumbaDeprecationWarning):
ctx = (
contextlib.nullcontext()
if PANDAS_GE_210
else pytest.warns(NumbaDeprecationWarning)
)
with ctx:
expect = pdf.rolling(2, method="table", min_periods=0).apply(
weighted_mean, raw=True, engine="numba"
)
@@ -1305,7 +1312,7 @@ def max_times_two(self):


@pytest.mark.xfail(
PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
PANDAS_VERSION < version.parse("2.1"),
reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0",
)
def test_floordiv_array_vs_df():
@@ -1580,7 +1587,7 @@ def test_numpy_cupy_flatiter(series):


@pytest.mark.xfail(
PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
PANDAS_VERSION < version.parse("2.1"),
reason="pyarrow_numpy storage type was not supported in pandas-2.0.0",
)
def test_arrow_string_arrays():
21 changes: 21 additions & 0 deletions python/cudf/pyproject.toml
@@ -124,6 +124,27 @@ skip = [
"__init__.py",
]

[tool.pytest.ini_options]
addopts = "--tb=native --strict-config --strict-markers"
empty_parameter_set_mark = "fail_at_collect"
filterwarnings = [
"error",
"ignore:::.*xdist.*",
"ignore:::.*pytest.*",
# some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
"ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore",
# Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
"ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning",
# PerformanceWarning from cupy warming up the JIT cache
"ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning",
# Ignore numba PEP 456 warning specific to arm machines
"ignore:FNV hashing is not implemented in Numba.*:UserWarning"
]
markers = [
"spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`"
]
xfail_strict = true

[tool.rapids-build-backend]
build-backend = "scikit_build_core.build"
dependencies-file = "../../dependencies.yaml"
4 changes: 0 additions & 4 deletions python/cudf_kafka/cudf_kafka/tests/pytest.ini

This file was deleted.

3 changes: 3 additions & 0 deletions python/cudf_kafka/pyproject.toml
@@ -79,9 +79,12 @@ skip = [
]

[tool.pytest.ini_options]
addopts = "--tb=native --strict-config --strict-markers"
empty_parameter_set_mark = "fail_at_collect"
filterwarnings = [
"error"
]
xfail_strict = true

[tool.scikit-build]
build-dir = "build/{wheel_tag}"
5 changes: 5 additions & 0 deletions python/cudf_polars/pyproject.toml
@@ -50,6 +50,11 @@ license-files = ["LICENSE"]
version = {file = "cudf_polars/VERSION"}

[tool.pytest.ini_options]
addopts = "--tb=native --strict-config --strict-markers"
empty_parameter_set_mark = "fail_at_collect"
filterwarnings = [
"error"
]
xfail_strict = true

[tool.coverage.report]
4 changes: 0 additions & 4 deletions python/cudf_polars/tests/pytest.ini

This file was deleted.

4 changes: 0 additions & 4 deletions python/custreamz/custreamz/tests/pytest.ini

This file was deleted.

18 changes: 5 additions & 13 deletions python/custreamz/custreamz/tests/test_dataframes.py
@@ -377,24 +377,16 @@ def test_setitem_overwrites(stream):
[
({}, "sum"),
({}, "mean"),
pytest.param({}, "min"),
({}, "min"),
pytest.param(
{},
"median",
marks=pytest.mark.xfail(reason="Unavailable for rolling objects"),
),
pytest.param({}, "max"),
pytest.param(
{},
"var",
marks=pytest.mark.xfail(reason="Unavailable for rolling objects"),
),
pytest.param({}, "count"),
pytest.param(
{"ddof": 0},
"std",
marks=pytest.mark.xfail(reason="Unavailable for rolling objects"),
),
({}, "max"),
({}, "var"),
({}, "count"),
({"ddof": 0}, "std"),
pytest.param(
{"quantile": 0.5},
"quantile",