Fix for working with cudf 0.15 (#159)
cuDF 0.15 recently disabled iterating over the values of an index (rapidsai/cudf#5340),
which broke a number of ops in NVTabular.
Fix by using values_host instead.
benfred authored Jul 20, 2020
1 parent ff5bb4b commit 57ee6de
Showing 5 changed files with 25 additions and 26 deletions.
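Every hunk below applies the same one-line pattern: copy the index values to host memory before iterating. A minimal standalone sketch of that pattern, assuming a working cudf install (the series values and labels here are invented for illustration):

import cudf

stats = cudf.Series([1.5, 2.5], index=["x", "y"])

# cudf 0.15 stopped supporting direct iteration over an index
# (rapidsai/cudf#5340), so the old `for col in stats.index:` loop breaks.
# values_host copies the index into a host-side NumPy array, which
# iterates normally:
for col in stats.index.values_host:
    print(col, float(stats[col]))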
6 changes: 3 additions & 3 deletions nvtabular/ops.py
@@ -208,7 +208,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols):
 
     @annotate("MinMax_finalize", color="green", domain="nvt_python")
     def finalize(self, stats):
-        for col in stats["mins"].index:
+        for col in stats["mins"].index.values_host:
             self.mins[col] = stats["mins"][col]
             self.maxs[col] = stats["maxs"][col]
 
@@ -264,7 +264,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols):
 
     @annotate("Moments_finalize", color="green", domain="nvt_python")
     def finalize(self, dask_stats):
-        for col in dask_stats["count"].index:
+        for col in dask_stats["count"].index.values_host:
             self.counts[col] = float(dask_stats["count"][col])
             self.means[col] = float(dask_stats["mean"][col])
             self.stds[col] = float(dask_stats["std"][col])
@@ -317,7 +317,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols):
 
     @annotate("Median_finalize", color="green", domain="nvt_python")
     def finalize(self, dask_stats):
-        for col in dask_stats.index:
+        for col in dask_stats.index.values_host:
             self.medians[col] = float(dask_stats[col])
 
     def registered_stats(self):
4 changes: 2 additions & 2 deletions tests/conftest.py
@@ -151,6 +151,6 @@ def get_cats(processor, col, stat_name="categories"):
         filename = processor.stats[stat_name][col]
         gdf = cudf.read_parquet(filename)
         gdf.reset_index(drop=True, inplace=True)
-        return gdf[col].values_to_string()
+        return gdf[col].values_host
     else:
-        return processor.stats["encoders"][col].get_cats().values_to_string()
+        return processor.stats["encoders"][col].get_cats().values_host
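Note the return-type change here: values_to_string() produced a Python list of strings (with nulls rendered as the string "None"), while values_host hands back a host-side NumPy array, so callers convert with .tolist() and see nulls as real None. A hypothetical snippet showing the shape of the new return value, assuming a working cudf install:

import cudf

s = cudf.Series(["a", "b", "a"])
cats = s.unique().values_host  # NumPy array in host memory, not a Python list
print(cats.tolist())           # ['a', 'b'] -- converted before comparisons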
8 changes: 4 additions & 4 deletions tests/unit/test_ops.py
@@ -109,13 +109,13 @@ def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns):
     processor.update_stats(dataset)
 
     if engine == "parquet" and not op_columns:
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
-        assert cats0 == ["None"] + cats_expected0
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
 
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
 
 @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1])
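The reworked asserts call .tolist() on both sides because == on a NumPy array compares elementwise and returns an array, whose truth value is ambiguous in an assert; the expected null placeholder also becomes a real None instead of the string "None". A standalone illustration (not from the repository):

import numpy as np

cats = np.array(["x", "y"], dtype=object)
print(cats == ["x", "y"])           # array([ True,  True]) -- elementwise
assert cats.tolist() == ["x", "y"]  # list equality yields one plain bool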
8 changes: 4 additions & 4 deletions tests/unit/test_torch_dataloader.py
@@ -126,12 +126,12 @@ def get_norms(tar: cudf.Series):
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
25 changes: 12 additions & 13 deletions tests/unit/test_workflow.py
@@ -79,14 +79,14 @@ def get_norms(tar: cudf.Series):
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
         # adding the None entry as a string because of move from gpu
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
         # adding the None entry as a string because of move from gpu
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
@@ -155,14 +155,14 @@ def get_norms(tar: cudf.Series):
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
         # adding the None entry as a string because of move from gpu
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
         # adding the None entry as a string because of move from gpu
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
@@ -236,17 +236,16 @@ def get_norms(tar: cudf.Series):
     assert math.isclose(
         get_norms(df.y).std(), processor.stats["stds"]["y" + concat_ops], rel_tol=1e-1
     )
 
     # Check that categories match
     if engine == "parquet":
-        cats_expected0 = df["name-cat"].unique().values_to_string()
+        cats_expected0 = df["name-cat"].unique().values_host
         cats0 = get_cats(processor, "name-cat")
-        # adding the None entry as a string because of move from gpu
-        assert cats0 == ["None"] + cats_expected0
-        cats_expected1 = df["name-string"].unique().values_to_string()
+        assert cats0.tolist() == [None] + cats_expected0.tolist()
+        cats_expected1 = df["name-string"].unique().values_host
         cats1 = get_cats(processor, "name-string")
         # adding the None entry as a string because of move from gpu
-        assert cats1 == ["None"] + cats_expected1
+        assert cats1.tolist() == [None] + cats_expected1.tolist()
 
     # Write to new "shuffled" and "processed" dataset
     processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)
