Skip to content

Commit

Permalink
No public description
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 607566238
  • Loading branch information
blois authored and colaboratory-team committed Feb 16, 2024
1 parent 8cd001d commit 6d9c478
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 16 deletions.
112 changes: 112 additions & 0 deletions google/colab/_dataframe_summarizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Summarize properties of each column in a pandas DataFrame."""

import warnings
import numpy as np
import pandas as pd

_MAX_DATAFRAME_ROWS = 100000
_MAX_DATAFRAME_COLS = 20


def summarize_dataframe(df, variable_name):
    """Build a summary dictionary describing a dataframe.

    Args:
      df: The dataframe to describe.
      variable_name: Name reported under the "name" key of the summary.

    Returns:
      A dict with the keys "name" (the given variable name), "rows" (the
      result of `len(df)`) and "fields" (the per-column summaries produced
      by `_summarize_columns`).
    """
    field_summaries = _summarize_columns(df)
    summary = dict(
        name=variable_name,
        rows=len(df),
        fields=field_summaries,
    )
    return summary


def _check_type(dtype: str, value):
"""Cast value to right type to ensure it is JSON serializable."""
if np.isnan(value):
return None
if "float" in str(dtype):
return float(value)
elif "int" in str(dtype):
return int(value)
else:
return value


# Inspired by:
# https://github.com/microsoft/lida/blob/9bb26c0adb56cab2d7c5d49ad96bc14e204c87ec/lida/components/summarizer.py#L34
def _summarize_columns(df: pd.DataFrame, n_samples: int = 3):
"""Summarize properties of each column in a pandas DataFrame."""
properties_list = []
for column in df.columns:
dtype = df[column].dtype
properties = {}
if dtype in (int, float, complex):
properties["dtype"] = "number"
properties["std"] = _check_type(dtype, df[column].std())
properties["min"] = _check_type(dtype, df[column].min())
properties["max"] = _check_type(dtype, df[column].max())

elif dtype == bool:
properties["dtype"] = "boolean"
elif dtype == object:
# Check if the string column can be cast to a valid datetime
try:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
pd.to_datetime(df[column], errors="raise")
if (
not column.empty
and column.dtype.kind == "O"
and isinstance(column[0], str)
):
properties["dtype"] = "object"
else:
properties["dtype"] = "date"
except (TypeError, ValueError):
try:
# Check if the string column has a limited number of values
if df[column].nunique() / len(df[column]) < 0.5:
properties["dtype"] = "category"
else:
properties["dtype"] = "string"
except TypeError:
properties["dtype"] = str(dtype)
elif pd.api.types.is_categorical_dtype(df[column]):
properties["dtype"] = "category"
elif pd.api.types.is_datetime64_any_dtype(df[column]):
properties["dtype"] = "date"
else:
properties["dtype"] = str(dtype)

# add min max if dtype is date
if properties["dtype"] == "date":
try:
properties["min"] = df[column].min()
properties["max"] = df[column].max()
except TypeError:
cast_date_col = pd.to_datetime(df[column], errors="coerce")
properties["min"] = cast_date_col.min()
properties["max"] = cast_date_col.max()
# Add additional properties to the output dictionary
try:
nunique = df[column].nunique()
properties["num_unique_values"] = nunique
except TypeError:
pass
if "samples" not in properties:
try:
non_null_values = df[column][df[column].notnull()].unique()
n_samples = min(n_samples, len(non_null_values))
samples = (
pd.Series(non_null_values)
.sample(n_samples, random_state=42)
.tolist()
)
properties["samples"] = samples
except TypeError:
# Samples is optional here.
pass
properties["semantic_type"] = ""
properties["description"] = ""
properties_list.append({"column": column, "properties": properties})

return properties_list
21 changes: 5 additions & 16 deletions google/colab/_reprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,31 +189,20 @@ def _dataframe_intrinsic_repr(dataframe):

def _summarize_dataframe(df, variable_name):
    """Summarizes a dataframe as a JSON string.

    Args:
      df: The dataframe to summarize.
      variable_name: Name of the variable holding the dataframe; it is
        passed through to the summarizer.

    Returns:
      None when the dataframe has more than `_MAX_DATAFRAME_ROWS` rows or
      more than `_MAX_DATAFRAME_COLS` columns; otherwise the summary built by
      `_dataframe_summarizer.summarize_dataframe`, serialized as JSON.
    """
    from google.colab import _dataframe_summarizer

    if len(df) > _MAX_DATAFRAME_ROWS or len(df.columns) > _MAX_DATAFRAME_COLS:
        return None

    summary = _dataframe_summarizer.summarize_dataframe(df, variable_name)
    return json.dumps(
        summary,
        indent=2,
        # This is used for serializing any types unknown to Python's json
        # serialization.
        default=str,
        # NaN's are non-standard JSON and cannot be decoded by clients.
        allow_nan=False,
    )


Expand Down

0 comments on commit 6d9c478

Please sign in to comment.