Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement libmissing; untangles _libs dependencies #18357

Merged
merged 16 commits into from
Nov 22, 2017
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,7 @@ from libc.math cimport sqrt, fabs
# this is our util.pxd
from util cimport numeric, get_nat

cimport lib
from pandas._libs import lib
import missing

cdef int64_t iNaT = get_nat()

Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/algos_rank_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ascending=True,
nan_value = {{neg_nan_value}}

{{if dtype == 'object'}}
mask = lib.isnaobj(values)
mask = missing.isnaobj(values)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these could be cimports instead?

Copy link
Member Author

@jbrockmendel jbrockmendel Nov 19, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ATM these are not cdef

{{elif dtype == 'float64'}}
mask = np.isnan(values)
{{elif dtype == 'int64'}}
Expand Down Expand Up @@ -259,7 +259,7 @@ def rank_2d_{{dtype}}(object in_arr, axis=0, ties_method='average',
nan_value = {{neg_nan_value}}

{{if dtype == 'object'}}
mask = lib.isnaobj2d(values)
mask = missing.isnaobj2d(values)
{{elif dtype == 'float64'}}
mask = np.isnan(values)
{{elif dtype == 'int64'}}
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ cdef extern from "numpy/npy_math.h":
cimport cython
cimport numpy as cnp

from pandas._libs.lib import checknull
from missing cimport checknull

cnp.import_array()
cnp.import_ufunc()
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Template for each `dtype` helper function for hashtable
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
"""

from lib cimport is_null_datetimelike
from missing cimport is_null_datetimelike


#----------------------------------------------------------------------
Expand Down
1 change: 0 additions & 1 deletion pandas/_libs/lib.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# prototypes for sharing

cdef bint is_null_datetimelike(v)
cpdef bint is_period(val)
122 changes: 1 addition & 121 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ PyDateTime_IMPORT

from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value

from tslib cimport _check_all_nulls
from tslib import NaT, Timestamp, Timedelta, array_to_datetime
from interval import Interval
from missing cimport checknull

cdef int64_t NPY_NAT = util.get_nat()

Expand Down Expand Up @@ -112,54 +112,6 @@ def memory_usage_of_objects(ndarray[object, ndim=1] arr):


# ----------------------------------------------------------------------
# isnull / notnull related

# C-level copies of +/- infinity, used by the "old"-style null checks and the
# isposinf_scalar / isneginf_scalar helpers below.
cdef double INF = <double> np.inf
cdef double NEGINF = -INF


cpdef bint checknull(object val):
    """
    Return True if the scalar `val` is considered null.

    Floats and complex numbers are null only when they are NaN (infinities
    are not treated as null here); datetime64 / timedelta64 scalars are null
    when their integer value equals NPY_NAT; NaT is always null; arrays are
    never null.  Everything else is delegated to `_checknull`.
    """
    if util.is_float_object(val) or util.is_complex_object(val):
        # NaN is the only value that compares unequal to itself.
        return val != val
    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT
    if val is NaT:
        return True
    if util.is_timedelta64_object(val):
        return get_timedelta64_value(val) == NPY_NAT
    if is_array(val):
        return False
    return _checknull(val)


cpdef bint checknull_old(object val):
    """
    Return True if the scalar `val` is null under the "old" rules.

    Identical to `checknull` except that, for float and complex values,
    positive and negative infinity count as null in addition to NaN.
    """
    if util.is_float_object(val) or util.is_complex_object(val):
        # NaN (unequal to itself) or either infinity.
        return val != val or val == INF or val == NEGINF
    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT
    if val is NaT:
        return True
    if util.is_timedelta64_object(val):
        return get_timedelta64_value(val) == NPY_NAT
    if is_array(val):
        return False
    return _checknull(val)


cpdef bint isposinf_scalar(object val):
    """Return True only if `val` is a float object equal to +infinity."""
    if not util.is_float_object(val):
        return False
    return val == INF


cpdef bint isneginf_scalar(object val):
    """Return True only if `val` is a float object equal to -infinity."""
    if not util.is_float_object(val):
        return False
    return val == NEGINF


cpdef bint isscalar(object val):
Expand Down Expand Up @@ -212,78 +164,6 @@ def item_from_zerodim(object val):
return util.unbox_if_zerodim(val)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj(ndarray arr):
    """
    Build a boolean mask marking the null elements of a 1-D array.

    Each element is tested with `_check_all_nulls`.  An AssertionError is
    raised if `arr` is not one-dimensional.
    """
    cdef:
        Py_ssize_t idx, count
        object item
        ndarray[uint8_t] mask

    assert arr.ndim == 1, "'arr' must be 1-D."

    count = len(arr)
    # Every slot is written in the loop, so an uninitialised buffer is fine.
    mask = np.empty(count, dtype=np.uint8)
    for idx in range(count):
        item = arr[idx]
        mask[idx] = _check_all_nulls(item)
    # Reinterpret the uint8 buffer as booleans without copying.
    return mask.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj_old(ndarray arr):
    """
    Build a boolean mask of null elements of a 1-D array ("old" rules).

    An element is flagged when it is NaT or when `util._checknull_old`
    reports it as null.  An AssertionError is raised if `arr` is not
    one-dimensional.
    """
    cdef:
        Py_ssize_t idx, count
        object item
        ndarray[uint8_t] mask

    assert arr.ndim == 1, "'arr' must be 1-D."

    count = len(arr)
    mask = np.zeros(count, dtype=np.uint8)
    for idx in range(count):
        item = arr[idx]
        mask[idx] = item is NaT or util._checknull_old(item)
    # Reinterpret the uint8 buffer as booleans without copying.
    return mask.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj2d(ndarray arr):
    """
    Build a boolean mask marking the null elements of a 2-D array.

    Each element is tested with `checknull`.  An AssertionError is raised
    if `arr` is not two-dimensional.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols
        object item
        ndarray[uint8_t, ndim=2] mask

    assert arr.ndim == 2, "'arr' must be 2-D."

    nrows, ncols = (<object> arr).shape
    # Start from all-False; only null positions are switched on.
    mask = np.zeros((nrows, ncols), dtype=np.uint8)
    for row in range(nrows):
        for col in range(ncols):
            item = arr[row, col]
            if checknull(item):
                mask[row, col] = 1
    return mask.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj2d_old(ndarray arr):
    """
    Build a boolean mask of null elements of a 2-D array ("old" rules).

    Each element is tested with `checknull_old`.  An AssertionError is
    raised if `arr` is not two-dimensional.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols
        object item
        ndarray[uint8_t, ndim=2] mask

    assert arr.ndim == 2, "'arr' must be 2-D."

    nrows, ncols = (<object> arr).shape
    # Start from all-False; only null positions are switched on.
    mask = np.zeros((nrows, ncols), dtype=np.uint8)
    for row in range(nrows):
        for col in range(ncols):
            item = arr[row, col]
            if checknull_old(item):
                mask[row, col] = 1
    return mask.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] list_to_object_array(list obj):
Expand Down
6 changes: 6 additions & 0 deletions pandas/_libs/missing.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
# cython: profile=False

cdef bint is_null_datetimelike(v)
cpdef bint checknull(object val)
cpdef bint checknull_old(object val)
176 changes: 176 additions & 0 deletions pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# -*- coding: utf-8 -*-
# cython: profile=False

from cpython cimport PyFloat_Check, PyComplex_Check

cimport cython
from cython cimport Py_ssize_t

import numpy as np
cimport numpy as np
from numpy cimport ndarray, int64_t, uint8_t
np.import_array()

cimport util

from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value
from tslibs.nattype import NaT, iNaT
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of importing iNaT, just use NPY_NAT to avoid perf issues.


# C-level copies of +/- infinity, compared against float values by
# checknull_old and the isposinf_scalar / isneginf_scalar helpers.
cdef double INF = <double> np.inf
cdef double NEGINF = -INF

# Integer value obtained from util.get_nat(); the null checks in this module
# compare datetime64 / timedelta64 payloads against it.
cdef int64_t NPY_NAT = util.get_nat()


cdef inline bint is_null_datetimelike(v):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prob should rename this for consistency (checknull_datetimelike), can be TODO

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

type as object

# determine if we have a null for a timedelta/datetime (or integer
# versions)
if util._checknull(v):
return True
elif v is NaT:
return True
elif util.is_timedelta64_object(v):
return v.view('int64') == iNaT
elif util.is_datetime64_object(v):
return v.view('int64') == iNaT
elif util.is_integer_object(v):
return v == iNaT
return False


cdef inline bint _check_all_nulls(object val):
    """
    Report whether `val` is any of the null values recognised here.

    Recognised nulls: a NaN float or complex, NaT, None, and datetime64 /
    timedelta64 scalars whose integer value equals NPY_NAT.  Anything else
    is reported as not null.
    """
    if PyFloat_Check(val) or PyComplex_Check(val):
        # NaN is the only value that compares unequal to itself.
        return val != val
    if val is NaT or val is None:
        return True
    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT
    if util.is_timedelta64_object(val):
        return get_timedelta64_value(val) == NPY_NAT
    return False


cpdef bint checknull(object val):
if util.is_float_object(val) or util.is_complex_object(val):
return val != val # and val != INF and val != NEGINF
elif util.is_datetime64_object(val):
return get_datetime64_value(val) == NPY_NAT
elif val is NaT:
return True
elif util.is_timedelta64_object(val):
return get_timedelta64_value(val) == NPY_NAT
elif util.is_array(val):
return False
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a reason for not pulling in util._checknull here as well, as seems logical? (or just future PR)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean defining it here instead of in util? Or importing it into the namespace? I'd be +1 on the former, indifferent to the latter.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for sure should define it here. but then this puts missing as a dep of things like hashing.pyx. ok with it being a dep of any of the tslibs though.

(pandas) bash-3.2$ find pandas -name '*.pyx' | xargs grep _checknull
pandas/_libs/hashing.pyx:from util cimport _checknull
pandas/_libs/hashing.pyx:        elif _checknull(val):
pandas/_libs/interval.pyx:        if util._checknull(interval):
pandas/_libs/lib.pyx:from util cimport is_array, _checknull, _checknan
pandas/_libs/lib.pyx:        return _checknull(val)
pandas/_libs/lib.pyx:        return _checknull(val)
pandas/_libs/lib.pyx:        result[i] = val is NaT or util._checknull_old(val)
pandas/_libs/lib.pyx:                _checknull(x) and _checknull(y)):
pandas/_libs/lib.pyx:    if _checknull(val):
pandas/_libs/lib.pyx:        if _checknull(x):
pandas/_libs/lib.pyx:            if _checknull(x):
pandas/_libs/lib.pyx:            elif _checknull(y):
pandas/_libs/src/inference.pyx:        if util._checknull(val):
pandas/_libs/src/inference.pyx:        elif util._checknull(v):
pandas/_libs/src/inference.pyx:    if util._checknull(v):
pandas/_libs/src/inference.pyx:    if util._checknull(v):
pandas/_libs/src/inference.pyx:    if util._checknull(v):
pandas/_libs/src/inference.pyx:    if util._checknull(v):
pandas/_libs/src/inference.pyx:        return util._checknull(value)
pandas/_libs/src/inference.pyx:            bint is_generic_null = util._checknull(value)
pandas/_libs/tslib.pyx:from tslibs.nattype cimport _checknull_with_nat, NPY_NAT
pandas/_libs/tslib.pyx:        if _checknull_with_nat(val):
pandas/_libs/tslib.pyx:            if _checknull_with_nat(val):
pandas/_libs/tslib.pyx:        if _checknull_with_nat(val):
pandas/_libs/tslib.pyx:            if _checknull_with_nat(val):
pandas/_libs/tslib.pyx:            if _checknull_with_nat(val):
pandas/_libs/tslib.pyx:            if _checknull_with_nat(val):
pandas/_libs/tslibs/nattype.pyx:cdef inline bint _checknull_with_nat(object val):
pandas/_libs/tslibs/strptime.pyx:from nattype cimport _checknull_with_nat, NPY_NAT
pandas/_libs/tslibs/strptime.pyx:            if _checknull_with_nat(val):
pandas/_libs/tslibs/timedeltas.pyx:from nattype cimport _checknull_with_nat, NPY_NAT
pandas/_libs/tslibs/timedeltas.pyx:    if _checknull_with_nat(ts):
pandas/_libs/tslibs/timedeltas.pyx:    if _checknull_with_nat(other):
pandas/_libs/tslibs/timedeltas.pyx:        elif _checknull_with_nat(value):

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we're moving util._checknull anyway, I'd advocate renaming it to e.g. check_none_or_nan

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for sure should define it here. but then this puts missing as a dep of things like hashing.pyx. ok with it being a dep of any of the tslibs though.

I'll take a look and see which util funcs can be moved without messing with dependencies.

FWIW this PR already adds missing to the `pxdfiles` key of hashtable, which cimports missing.checknull. Previously it was an un-declared dependency on lib.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like util._checknull_old can be moved to missing (is used there once, nowhere else). Let's save util._checknull for later, since it is used in a bunch of places.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok with leaving these for later as well

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. Just pushed commit with docstrings.

return util._checknull(val)


cpdef bint checknull_old(object val):
    """
    Return True if the scalar `val` is null under the "old" rules.

    For float and complex values, positive and negative infinity count as
    null in addition to NaN.  datetime64 / timedelta64 scalars are null when
    their integer value equals NPY_NAT, NaT is always null, and arrays are
    never null.  Everything else is delegated to `util._checknull`.
    """
    if util.is_float_object(val) or util.is_complex_object(val):
        # NaN (unequal to itself) or either infinity.
        return val != val or val == INF or val == NEGINF
    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT
    if val is NaT:
        return True
    if util.is_timedelta64_object(val):
        return get_timedelta64_value(val) == NPY_NAT
    if util.is_array(val):
        return False
    return util._checknull(val)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj(ndarray arr):
cdef:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ideally add some doc-strings

Py_ssize_t i, n
object val
ndarray[uint8_t] result

assert arr.ndim == 1, "'arr' must be 1-D."

n = len(arr)
result = np.empty(n, dtype=np.uint8)
for i from 0 <= i < n:
val = arr[i]
result[i] = _check_all_nulls(val)
return result.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj_old(ndarray arr):
    """
    Build a boolean mask of null elements of a 1-D array ("old" rules).

    An element is flagged when it is NaT or when `util._checknull_old`
    reports it as null.  An AssertionError is raised if `arr` is not
    one-dimensional.
    """
    cdef:
        Py_ssize_t idx, count
        object item
        ndarray[uint8_t] mask

    assert arr.ndim == 1, "'arr' must be 1-D."

    count = len(arr)
    mask = np.zeros(count, dtype=np.uint8)
    for idx in range(count):
        item = arr[idx]
        mask[idx] = item is NaT or util._checknull_old(item)
    # Reinterpret the uint8 buffer as booleans without copying.
    return mask.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj2d(ndarray arr):
    """
    Build a boolean mask marking the null elements of a 2-D array.

    Each element is tested with `checknull`.  An AssertionError is raised
    if `arr` is not two-dimensional.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols
        object item
        ndarray[uint8_t, ndim=2] mask

    assert arr.ndim == 2, "'arr' must be 2-D."

    nrows, ncols = (<object> arr).shape
    # Start from all-False; only null positions are switched on.
    mask = np.zeros((nrows, ncols), dtype=np.uint8)
    for row in range(nrows):
        for col in range(ncols):
            item = arr[row, col]
            if checknull(item):
                mask[row, col] = 1
    return mask.view(np.bool_)


@cython.wraparound(False)
@cython.boundscheck(False)
def isnaobj2d_old(ndarray arr):
    """
    Build a boolean mask of null elements of a 2-D array ("old" rules).

    Each element is tested with `checknull_old`.  An AssertionError is
    raised if `arr` is not two-dimensional.
    """
    cdef:
        Py_ssize_t row, col, nrows, ncols
        object item
        ndarray[uint8_t, ndim=2] mask

    assert arr.ndim == 2, "'arr' must be 2-D."

    nrows, ncols = (<object> arr).shape
    # Start from all-False; only null positions are switched on.
    mask = np.zeros((nrows, ncols), dtype=np.uint8)
    for row in range(nrows):
        for col in range(ncols):
            item = arr[row, col]
            if checknull_old(item):
                mask[row, col] = 1
    return mask.view(np.bool_)


cpdef bint isposinf_scalar(object val):
    """Return True only if `val` is a float object equal to +infinity."""
    if not util.is_float_object(val):
        return False
    return val == INF


cpdef bint isneginf_scalar(object val):
    """Return True only if `val` is a float object equal to -infinity."""
    if not util.is_float_object(val):
        return False
    return val == NEGINF
2 changes: 1 addition & 1 deletion pandas/_libs/period.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ from tslibs.np_datetime cimport (pandas_datetimestruct,
cimport util
from util cimport is_period_object, is_string_object, INT32_MIN

from lib cimport is_null_datetimelike
from missing cimport is_null_datetimelike
from pandas._libs.tslib import Timestamp
from tslibs.timezones cimport (
is_utc, is_tzlocal, get_utcoffset, get_dst_info)
Expand Down
Loading