Fix for working with cudf 0.15 (#159)
cuDF 0.15 recently disabled iterating over the values of an index (rapidsai/cudf#5340),
which broke a number of ops in NVTabular.
Fix by using values_host instead.
benfred authored Jul 20, 2020
1 parent ff5bb4b commit 57ee6de
Showing 5 changed files with 25 additions and 26 deletions.
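Every hunk below applies the same one-line pattern: copy the index values to host memory before iterating. A minimal standalone sketch of that pattern, assuming a working cudf install (the series values and labels here are invented for illustration):

import cudf

stats = cudf.Series([1.5, 2.5], index=["x", "y"])

# cudf 0.15 stopped supporting direct iteration over an index
# (rapidsai/cudf#5340), so the old `for col in stats.index:` loop breaks.
# values_host copies the index into a host-side NumPy array, which
# iterates normally:
for col in stats.index.values_host:
    print(col, float(stats[col]))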
6 changes: 3 additions & 3 deletions nvtabular/ops.py
@@ -208,7 +208,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols):
 
     @annotate("MinMax_finalize", color="green", domain="nvt_python")
     def finalize(self, stats):
-        for col in stats["mins"].index:
+        for col in stats["mins"].index.values_host:
             self.mins[col] = stats["mins"][col]
             self.maxs[col] = stats["maxs"][col]
 
@@ -264,7 +264,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols):
 
     @annotate("Moments_finalize", color="green", domain="nvt_python")
     def finalize(self, dask_stats):
-        for col in dask_stats["count"].index:
+        for col in dask_stats["count"].index.values_host:
             self.counts[col] = float(dask_stats["count"][col])
             self.means[col] = float(dask_stats["mean"][col])
             self.stds[col] = float(dask_stats["std"][col])
@@ -317,7 +317,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols):
 
     @annotate("Median_finalize", color="green", domain="nvt_python")
     def finalize(self, dask_stats):
-        for col in dask_stats.index:
+        for col in dask_stats.index.values_host:
             self.medians[col] = float(dask_stats[col])
 
     def registered_stats(self):
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -151,6 +151,6 @@ def get_cats(processor, col, stat_name="categories"):
         filename = processor.stats[stat_name][col]
         gdf = cudf.read_parquet(filename)
         gdf.reset_index(drop=True, inplace=True)
-        return gdf[col].values_to_string()
+        return gdf[col].values_host
     else:
-        return processor.stats["encoders"][col].get_cats().values_to_string()
+        return processor.stats["encoders"][col].get_cats().values_host
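Note the return-type change here: values_to_string() produced a Python list of strings (with nulls rendered as the string "None"), while values_host hands back a host-side NumPy array, so callers convert with .tolist() and see nulls as real None. A hypothetical snippet showing the shape of the new return value, assuming a working cudf install:

import cudf

s = cudf.Series(["a", "b", "a"])
cats = s.unique().values_host  # NumPy array in host memory, not a Python list
print(cats.tolist())           # ['a', 'b'] -- converted before comparisons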
8 changes: 4 additions & 4 deletions tests/unit/test_ops.py
@@ -109,13 +109,13 @@ def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
     processor.update_stats(dataset)
 
     if engine == "parquet" and not op_columns:
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
-        assert cats0 == ["None"] + cats_expected0
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
 
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
 
 @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1])
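The reworked asserts call .tolist() on both sides because == on a NumPy array compares elementwise and returns an array, whose truth value is ambiguous in an assert; the expected null placeholder also becomes a real None instead of the string "None". A standalone illustration (not from the repository):

import numpy as np

cats = np.array(["x", "y"], dtype=object)
print(cats == ["x", "y"])           # array([ True,  True]) -- elementwise
assert cats.tolist() == ["x", "y"]  # list equality yields one plain bool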
8 changes: 4 additions & 4 deletions tests/unit/test_torch_dataloader.py
@@ -126,12 +126,12 @@ def get_norms(tar: cudf.Series):
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
25 changes: 12 additions & 13 deletions tests/unit/test_workflow.py
@@ -79,14 +79,14 @@ def get_norms(tar: cudf.Series):
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
         # adding the None entry as a string because of move from gpu
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
         # adding the None entry as a string because of move from gpu
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
@@ -155,14 +155,14 @@ def get_norms(tar: cudf.Series):
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
         # adding the None entry as a string because of move from gpu
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
         # adding the None entry as a string because of move from gpu
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
@@ -236,17 +236,16 @@ def get_norms(tar: cudf.Series):
     assert math.isclose(
         get_norms(df.y).std(), processor.stats["stds"]["y" + concat_ops], rel_tol=1e-1
     )
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
-        # adding the None entry as a string because of move from gpu
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
         # adding the None entry as a string because of move from gpu
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
