Skip to content

Commit

Permalink
Merge pull request #5340 from galipremsagar/5322
Browse files Browse the repository at this point in the history
[REVIEW] Disable iteration in cudf objects and add support for `DataFrame` initialization with list of `Series`
  • Loading branch information
galipremsagar authored Jul 15, 2020
2 parents d994974 + dd40953 commit 89740c6
Show file tree
Hide file tree
Showing 31 changed files with 875 additions and 189 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@
- PR #5619 Fix issue related to typecasting when using a `CategoricalDtype`
- PR #5649 Fix issue when an empty Dataframe with an index is passed to `cudf.concat`
- PR #5644 Fix issue related to Dataframe init when passing in `columns`
- PR #5340 Disable iteration in cudf objects and add support for `DataFrame` initialization with list of `Series`
- PR #5663 Move Duration types under Timestamps in doxygen Modules page
- PR #5664 Update conda upload versions for new supported CUDA/Python
- PR #5656 Fix issue with incorrect docker image being used in local build script
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,8 @@ def read_csv(
if index_col is not None and index_col is not False:
if isinstance(index_col, int):
df = df.set_index(df._data.get_by_index(index_col).names[0])
if names is None:
df._index.name = index_col
else:
df = df.set_index(index_col)

Expand Down
14 changes: 14 additions & 0 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,20 @@ def to_arrow(self):
dictionary=self.categories.to_arrow(),
)

@property
def values_host(self):
    """
    Return the host-side values of the CategoricalColumn.

    The column is converted with ``to_pandas()`` and the ``.values``
    attribute of that pandas object is handed back unchanged.
    """
    host_series = self.to_pandas()
    return host_series.values

@property
def values(self):
    """
    CuPy representation of the CategoricalColumn (not available).

    Raises
    ------
    NotImplementedError
        Always; this accessor is not implemented for categorical columns.
    """
    message = "cudf.Categorical is not yet implemented"
    raise NotImplementedError(message)

def unique(self):
codes = self.as_numerical.unique()
return column.build_categorical_column(
Expand Down
23 changes: 23 additions & 0 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ def mask_array_view(self):
def __len__(self):
    """Return the number of elements, as reported by ``self.size``."""
    element_count = self.size
    return element_count

def __iter__(self):
    """
    Refuse iteration over the column.

    Delegates to ``cudf.utils.utils.raise_iteration_error`` with this
    object; that helper is expected to raise (its definition is not shown
    here).
    """
    reject_iteration = cudf.utils.utils.raise_iteration_error
    reject_iteration(obj=self)

def to_pandas(self):
arr = self.data_array_view
sr = pd.Series(arr.copy_to_host())
Expand All @@ -133,6 +136,26 @@ def to_pandas(self):
sr[~mask_bytes] = None
return sr

@property
def values_host(self):
    """
    Return a host-side (numpy) copy of the Column's data.

    The result is whatever ``data_array_view.copy_to_host()`` returns.
    """
    device_view = self.data_array_view
    return device_view.copy_to_host()

@property
def values(self):
    """
    Return a CuPy representation of the Column.

    An empty column yields an empty CuPy array of the column's dtype.

    Raises
    ------
    ValueError
        If the (non-empty) column contains nulls.
    """
    if len(self) > 0:
        # Nulls are rejected before the data view is wrapped.
        if self.has_nulls:
            raise ValueError("Column must have no nulls.")
        return cupy.asarray(self.data_array_view)

    return cupy.asarray([], dtype=self.dtype)

def clip(self, lo, hi):
if is_categorical_dtype(self):
input_col = self.astype(self.categories.dtype)
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4333,6 +4333,22 @@ def to_pandas(self, index=None):
pd_series.index = index
return pd_series

@property
def values_host(self):
    """
    Return a host-side (numpy) representation of the StringColumn.

    Obtained by converting with ``to_pandas()`` and returning the
    ``.values`` of the resulting pandas object.
    """
    host_series = self.to_pandas()
    return host_series.values

@property
def values(self):
    """
    CuPy representation of the StringColumn (not available).

    Raises
    ------
    NotImplementedError
        Always; this accessor is not implemented for string columns.
    """
    message = "String Arrays is not yet implemented in cudf"
    raise NotImplementedError(message)

def to_array(self, fillna=None):
"""Get a dense numpy array for the data.
Expand Down
146 changes: 144 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
is_list_like,
is_scalar,
is_string_dtype,
numeric_normalize_types,
)
from cudf.utils.utils import OrderedColumnDict

Expand Down Expand Up @@ -210,6 +211,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
else:
self._index = as_index(index)
if columns is not None:
if isinstance(columns, (cudf.Series, cudf.Index)):
columns = columns.to_pandas()

self._data = ColumnAccessor(
OrderedDict.fromkeys(
columns,
Expand Down Expand Up @@ -246,6 +250,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
self._data = new_df._data
self._index = new_df._index
self.columns = new_df.columns
elif len(data) > 0 and isinstance(data[0], cudf.Series):
self._init_from_series_list(
data=data, columns=columns, index=index
)
else:
self._init_from_list_like(
data, index=index, columns=columns
Expand All @@ -260,11 +268,109 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
if dtype:
self._data = self.astype(dtype)._data

def _init_from_series_list(self, data, columns, index):
    """
    Populate this DataFrame from a list of ``Series``, one Series per row.

    Parameters
    ----------
    data : list of Series
        Row data. Must be non-empty (``data[0]`` is inspected).
    columns : list-like or None
        When given and non-empty, the result is restricted to exactly
        these labels, in this order; labels that are missing from the
        data are added as all-null columns.
    index : list-like or None
        Row labels for the result. When ``None`` the labels are derived
        from the names of the Series in ``data``.

    Raises
    ------
    ValueError
        If ``len(index)`` is not a multiple of ``len(data)``, or if
        equal-length Series carry a non-unique index.
    """
    if index is None:
        # When `index` is `None`, the final index of
        # resulting dataframe will be union of
        # all Series's names.
        final_index = as_index(_get_union_of_series_names(data))
    else:
        # When an `index` is passed, the final index of
        # resulting dataframe will be whatever
        # index passed, but will need
        # shape validations - explained below
        data_length = len(data)
        index_length = len(index)
        if data_length != index_length:
            # If the passed `index` length doesn't match
            # the number of Series objects in `data`, we must
            # check if `data` can be duplicated/expanded
            # to match the length of index. For that we
            # check if the length of index is a multiple
            # of the length of data.
            #
            # 1. If yes, we repeat data
            #    until length of data is equal to length of index.
            # 2. If no, we throw an error stating the
            #    shape of resulting `data` and `index`

            # Simple example
            # >>> import pandas as pd
            # >>> s = pd.Series([1, 2, 3])
            # >>> pd.DataFrame([s], index=['a', 'b'])
            #    0  1  2
            # a  1  2  3
            # b  1  2  3
            # >>> pd.DataFrame([s], index=['a', 'b', 'c'])
            #    0  1  2
            # a  1  2  3
            # b  1  2  3
            # c  1  2  3
            if index_length % data_length == 0:
                # Build a new list so the caller's `data` is not mutated.
                data = list(data) * (index_length // data_length)
            else:
                raise ValueError(
                    f"Shape of passed values is "
                    f"{(data_length, len(data[0]))}, "
                    f"indices imply {(index_length, len(data[0]))}"
                )

        final_index = as_index(index)

    series_lengths = [len(series) for series in data]
    # NOTE(review): presumably casts the Series to a common numeric
    # dtype - confirm against `numeric_normalize_types`.
    data = numeric_normalize_types(*data)
    if series_lengths.count(series_lengths[0]) == len(series_lengths):
        # Calculating the final dataframe columns by
        # getting union of all `index` of the Series objects.
        final_columns = _get_union_of_indices([d.index for d in data])

        for idx, series in enumerate(data):
            if not series.index.is_unique:
                raise ValueError(
                    "Reindexing only valid with uniquely valued Index "
                    "objects"
                )
            if not series.index.equals(final_columns):
                series = series.reindex(final_columns)
            self._data[idx] = column.as_column(series._column)

        # Setting `final_columns` to self._index so
        # that the resulting `transpose` will have its
        # columns set to `final_columns`
        self._index = final_columns

        transpose = self.T
    else:
        concat_df = cudf.concat(data, axis=1)

        if concat_df.columns.dtype == "object":
            concat_df.columns = concat_df.columns.astype("str")

        transpose = concat_df.T

    transpose._index = final_index
    self._mimic_inplace(transpose, inplace=True)

    # If `columns` is passed, the resulting dataframe
    # contains only the specified `columns`, in the same order.
    #
    # The check is spelled out explicitly because `if columns:` raises
    # "truth value is ambiguous" for pandas Index / numpy array inputs.
    # Empty `columns` is still treated as "not passed".
    if columns is not None and len(columns) > 0:
        for col_name in columns:
            if col_name not in self._data:
                self._data[col_name] = column.column_empty(
                    row_count=len(self), dtype=None, masked=True
                )
        self._data = self._data.get_by_label(columns)

def _init_from_list_like(self, data, index=None, columns=None):
if index is None:
index = RangeIndex(start=0, stop=len(data))
else:
index = as_index(index)

self._index = as_index(index)
data = list(itertools.zip_longest(*data))

Expand Down Expand Up @@ -2266,7 +2372,11 @@ def columns(self, columns):
columns = pd.Index(range(len(self._data.columns)))
is_multiindex = isinstance(columns, pd.MultiIndex)

if not isinstance(columns, pd.Index):
if isinstance(
columns, (cudf.Series, cudf.Index, cudf.core.column.ColumnBase)
):
columns = pd.Index(columns.to_array(), tupleize_cols=is_multiindex)
elif not isinstance(columns, pd.Index):
columns = pd.Index(columns, tupleize_cols=is_multiindex)

if not len(columns) == len(self._data.names):
Expand Down Expand Up @@ -2568,7 +2678,7 @@ def take(self, positions, keep_index=True):
Examples
--------
>>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0],
'b': pd.Series(['a', 'b', 'c'])})
'b': cudf.Series(['a', 'b', 'c'])})
>>> a.take([0, 2, 2])
a b
0 1.0 a
Expand Down Expand Up @@ -6326,6 +6436,13 @@ def corr(self):
df.columns = self.columns
return df

def to_dict(self, orient="dict", into=dict):
    """
    Reject implicit conversion of the DataFrame to a host dictionary.

    ``orient`` and ``into`` are accepted but not used.

    Raises
    ------
    TypeError
        Always; the message points to ``.to_pandas().to_dict()`` as the
        explicit alternative.
    """
    message = (
        "Implicit conversion to a host memory via to_dict() is not "
        "allowed, To explicitly construct a dictionary object, "
        "consider using .to_pandas().to_dict()"
    )
    raise TypeError(message)

def append(
self, other, ignore_index=False, verify_integrity=False, sort=False
):
Expand Down Expand Up @@ -6616,3 +6733,28 @@ def extract_col(df, col):
):
return df.index._data.columns[0]
return df.index._data[col]


def _get_union_of_indices(indexes):
    """
    Combine several indexes into one de-duplicated, sorted index.

    A single-element ``indexes`` is returned as-is (no copy, no sort).
    Otherwise the indexes are concatenated, duplicates are dropped, and
    the result is reordered by the positions that ``sort_by_values``
    reports for its values.
    """
    if len(indexes) == 1:
        return indexes[0]

    combined = cudf.core.Index._concat(indexes).drop_duplicates()
    _, order = combined._values.sort_by_values()
    return combined.take(order)


def _get_union_of_series_names(series_list):
    """
    Derive row labels from the ``name`` of each Series in ``series_list``.

    Parameters
    ----------
    series_list : sequence
        Objects exposing a ``name`` attribute.

    Returns
    -------
    list
        One label per input, in order. A Series whose name is ``None``
        is labelled ``"Unnamed <k>"``, where ``k`` counts the unnamed
        Series seen so far. If every Series is unnamed (or the input is
        empty) the labels are the positions ``0..len(series_list) - 1``.
    """
    names_list = []
    unnamed_count = 0
    for series in series_list:
        if series.name is None:
            names_list.append(f"Unnamed {unnamed_count}")
            unnamed_count += 1
        else:
            names_list.append(series.name)

    # With no names at all, fall back to plain positional labels.
    if unnamed_count == len(series_list):
        return list(range(len(series_list)))

    return names_list
4 changes: 1 addition & 3 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,9 +743,7 @@ def where(self, cond, other=None, inplace=False):
if len(common_cols) > 0:
# If `self` and `cond` are having unequal index,
# then re-index `cond`.
if len(self.index) != len(cond.index) or any(
self.index != cond.index
):
if not self.index.equals(cond.index):
cond = cond.reindex(self.index)
else:
if cond.shape != self.shape:
Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def __getattribute__(self, key):

def __iter__(self):
    """
    Yield ``(name, group)`` pairs, one per group.

    ``self._grouped()`` supplies the group names, the offsets and the
    grouped values; the i-th group is the slice of the grouped values
    between ``offsets[i]`` and ``offsets[i + 1]``. Names held in a
    ``cudf.Index`` are converted with ``to_pandas()`` before being
    iterated.
    """
    names, bounds, values = self._grouped()
    if isinstance(names, cudf.Index):
        names = names.to_pandas()
    for position, label in enumerate(names):
        start, stop = bounds[position], bounds[position + 1]
        yield label, values[start:stop]

Expand Down
Loading

0 comments on commit 89740c6

Please sign in to comment.