From 3f5d728f871e6709b9ce3520b38c0746b03ed62f Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Fri, 9 Dec 2016 11:30:51 +0000
Subject: [PATCH 01/10] astype method now takes dict mapping col names to
 datatypes #14761

Updating documentation to reflect change
---
 doc/source/basics.rst           |  4 +++-
 doc/source/whatsnew/v0.20.0.txt | 10 ++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index e5aa6b577270a..ada7877c4505c 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1755,12 +1755,14 @@ then the more *general* one will be used as the result of the operation.
     # conversion of dtypes
     df3.astype('float32').dtypes
 
+.. versionadded:: 0.20.0
+
 Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`
 
 .. ipython:: python
 
    dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]})
-   dft[['a','b']] = dft[['a','b']].astype(np.uint8)
+   dft = dft.astype({'a': np.float64, 'c': np.uint8})
    dft
    dft.dtypes
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 0bfd755aae40c..35543b3f025d3 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -43,6 +43,16 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files.
    pd.read_fwf(StringIO(data)).dtypes
    pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes
 
+You can now pass a dictionary mapping column names to the desired data types for
+those columns to :meth:`~DataFrame.astype`.
+
+.. ipython:: python
+
+   dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]})
+   dft = dft.astype({'a': np.float64, 'c': np.uint8})
+   dft
+   dft.dtypes
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements

From fddbb2ed0537cd32c3e37f61983972e1957399d4 Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Thu, 15 Dec 2016 15:39:18 +0000
Subject: [PATCH 02/10] Updates after review

DataFrame.astype now allows changing the dtype of a column by passing a dict
mapping column name to dtype.
---
 doc/source/basics.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index e7db814483905..58da24d889507 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1779,6 +1779,18 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`
    dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8)
    dft.dtypes
 
+
+.. versionadded:: 0.19.0
+
+Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFrame.astype`
+
+.. ipython:: python
+
+    dft1 = pd.DataFrame({'a': [1,0,1], 'b': [4,5,6], 'c': [7, 8, 9]})
+    dft1 = dft1.astype({'a': np.bool, 'c': np.float64})
+    dft1
+    dft1.dtypes
+
 .. _basics.object_conversion:
 
 object conversion

From a61ec51dfd32a6d434318a78da36822f531518e6 Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Thu, 15 Dec 2016 16:31:13 +0000
Subject: [PATCH 03/10] Corrections to docs after review feedback

DataFrame.astype now allows setting the type of columns by passing a dict
mapping column to dtype.
---
 doc/source/basics.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index e7db814483905..58da24d889507 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1779,6 +1779,18 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`
    dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8)
    dft.dtypes
 
+
+.. versionadded:: 0.19.0
+
+Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFrame.astype`
+
+.. ipython:: python
+
+    dft1 = pd.DataFrame({'a': [1,0,1], 'b': [4,5,6], 'c': [7, 8, 9]})
+    dft1 = dft1.astype({'a': np.bool, 'c': np.float64})
+    dft1
+    dft1.dtypes
+
 .. _basics.object_conversion:
 
 object conversion

From 0c9078562c771e139d2e8ea3291dd52f41a68719 Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Thu, 15 Dec 2016 17:22:08 +0000
Subject: [PATCH 04/10] Removing unneeded changes

Changes carried out in v0.19 were mistakenly documented under v0.20.
---
 doc/source/whatsnew/v0.20.0.txt | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 07e8154f557b3..2855cde95ac2a 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -43,15 +43,27 @@ fixed-width text files, and :func:`read_excel` for parsing Excel files.
    pd.read_fwf(StringIO(data)).dtypes
    pd.read_fwf(StringIO(data), dtype={'a':'float64', 'b':'object'}).dtypes
 
-You can now pass a dictionary mapping column names to the desired data types for
-those columns to :meth:`~DataFrame.astype`.
+.. _whatsnew_0200.enhancements.groupby_access:
+
+Groupby Enhancements
+^^^^^^^^^^^^^^^^^^^^
+
+Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now reference either column names or index level names (:issue:`5677`)
 
 .. ipython:: python
 
-   dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]})
-   dft = dft.astype({'a': np.float64, 'c': np.uint8})
-   dft
-   dft.dtypes
+   arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+             ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
+
+   index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
+
+   df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3],
+                      'B': np.arange(8)},
+                     index=index)
+   df
+
+   df.groupby(['second', 'A']).sum()
+
 
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements

From f51dbdf7ccc703326906935c0e7e890f52f64d76 Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Thu, 22 Dec 2016 17:44:15 +0000
Subject: [PATCH 05/10] removing stray orig file after merge

---
 doc/source/basics.rst.orig | 2008 ------------------------------------
 1 file changed, 2008 deletions(-)
 delete mode 100644 doc/source/basics.rst.orig

diff --git a/doc/source/basics.rst.orig b/doc/source/basics.rst.orig
index 2e8abe0a5c329..0000000000000
--- a/doc/source/basics.rst.orig
+++ /dev/null
@@ -1,2008 +0,0 @@
-.. currentmodule:: pandas
-
-.. ipython:: python
-   :suppress:
-
-   import numpy as np
-   import pandas as pd
-   np.set_printoptions(precision=4, suppress=True)
-   pd.options.display.max_rows = 15
-
-.. _basics:
-
-==============================
- Essential Basic Functionality
-==============================
-
-Here we discuss a lot of the essential functionality common to the pandas data
-structures. Here's how to create some of the objects used in the examples from
-the previous section:
-
-.. ipython:: python
-
-   index = pd.date_range('1/1/2000', periods=8)
-   s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
-   df = pd.DataFrame(np.random.randn(8, 3), index=index,
-                     columns=['A', 'B', 'C'])
-   wp = pd.Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'],
-                 major_axis=pd.date_range('1/1/2000', periods=5),
-                 minor_axis=['A', 'B', 'C', 'D'])
-
-.. 
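note::
-
-   If you want the randomly generated examples below to be reproducible, you
-   can seed NumPy's random number generator first; the seed value here is an
-   arbitrary choice:
-
-   .. ipython:: python
-
-      # fix the RNG state so that the random examples repeat across runs
-      np.random.seed(123456)
-
-.. 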
_basics.head_tail: - -Head and Tail -------------- - -To view a small sample of a Series or DataFrame object, use the -:meth:`~DataFrame.head` and :meth:`~DataFrame.tail` methods. The default number -of elements to display is five, but you may pass a custom number. - -.. ipython:: python - - long_series = pd.Series(np.random.randn(1000)) - long_series.head() - long_series.tail(3) - -.. _basics.attrs: - -Attributes and the raw ndarray(s) ---------------------------------- - -pandas objects have a number of attributes enabling you to access the metadata - - * **shape**: gives the axis dimensions of the object, consistent with ndarray - * Axis labels - - * **Series**: *index* (only axis) - * **DataFrame**: *index* (rows) and *columns* - * **Panel**: *items*, *major_axis*, and *minor_axis* - -Note, **these attributes can be safely assigned to**! - -.. ipython:: python - - df[:2] - df.columns = [x.lower() for x in df.columns] - df - -To get the actual data inside a data structure, one need only access the -**values** property: - -.. ipython:: python - - s.values - df.values - wp.values - -If a DataFrame or Panel contains homogeneously-typed data, the ndarray can -actually be modified in-place, and the changes will be reflected in the data -structure. For heterogeneous data (e.g. some of the DataFrame's columns are not -all the same dtype), this will not be the case. The values attribute itself, -unlike the axis labels, cannot be assigned to. - -.. note:: - - When working with heterogeneous data, the dtype of the resulting ndarray - will be chosen to accommodate all of the data involved. For example, if - strings are involved, the result will be of object dtype. If there are only - floats and integers, the resulting array will be of float dtype. - -.. _basics.accelerate: - -Accelerated operations ----------------------- - -pandas has support for accelerating certain types of binary numerical and boolean operations using -the ``numexpr`` library (starting in 0.11.0) and the ``bottleneck`` libraries. - -These libraries are especially useful when dealing with large data sets, and provide large -speedups. ``numexpr`` uses smart chunking, caching, and multiple cores. ``bottleneck`` is -a set of specialized cython routines that are especially fast when dealing with arrays that have -``nans``. - -Here is a sample (using 100 column x 100,000 row ``DataFrames``): - -.. csv-table:: - :header: "Operation", "0.11.0 (ms)", "Prior Version (ms)", "Ratio to Prior" - :widths: 25, 25, 25, 25 - :delim: ; - - ``df1 > df2``; 13.32; 125.35; 0.1063 - ``df1 * df2``; 21.71; 36.63; 0.5928 - ``df1 + df2``; 22.04; 36.50; 0.6039 - -You are highly encouraged to install both libraries. See the section -:ref:`Recommended Dependencies ` for more installation info. - -.. _basics.binop: - -Flexible binary operations --------------------------- - -With binary operations between pandas data structures, there are two key points -of interest: - - * Broadcasting behavior between higher- (e.g. DataFrame) and - lower-dimensional (e.g. Series) objects. - * Missing data in computations - -We will demonstrate how to manage these issues independently, though they can -be handled simultaneously. - -Matching / broadcasting behavior -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -DataFrame has the methods :meth:`~DataFrame.add`, :meth:`~DataFrame.sub`, -:meth:`~DataFrame.mul`, :meth:`~DataFrame.div` and related functions -:meth:`~DataFrame.radd`, :meth:`~DataFrame.rsub`, ... -for carrying out binary operations. 
For broadcasting behavior, -Series input is of primary interest. Using these functions, you can use to -either match on the *index* or *columns* via the **axis** keyword: - -.. ipython:: python - - df = pd.DataFrame({'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) - df - row = df.ix[1] - column = df['two'] - - df.sub(row, axis='columns') - df.sub(row, axis=1) - - df.sub(column, axis='index') - df.sub(column, axis=0) - -.. ipython:: python - :suppress: - - df_orig = df - -Furthermore you can align a level of a multi-indexed DataFrame with a Series. - -.. ipython:: python - - dfmi = df.copy() - dfmi.index = pd.MultiIndex.from_tuples([(1,'a'),(1,'b'),(1,'c'),(2,'a')], - names=['first','second']) - dfmi.sub(column, axis=0, level='second') - -With Panel, describing the matching behavior is a bit more difficult, so -the arithmetic methods instead (and perhaps confusingly?) give you the option -to specify the *broadcast axis*. For example, suppose we wished to demean the -data over a particular axis. This can be accomplished by taking the mean over -an axis and broadcasting over the same axis: - -.. ipython:: python - - major_mean = wp.mean(axis='major') - major_mean - wp.sub(major_mean, axis='major') - -And similarly for ``axis="items"`` and ``axis="minor"``. - -.. note:: - - I could be convinced to make the **axis** argument in the DataFrame methods - match the broadcasting behavior of Panel. Though it would require a - transition period so users can change their code... - -Series and Index also support the :func:`divmod` builtin. This function takes -the floor division and modulo operation at the same time returning a two-tuple -of the same type as the left hand side. For example: - -.. ipython:: python - - s = pd.Series(np.arange(10)) - s - div, rem = divmod(s, 3) - div - rem - - idx = pd.Index(np.arange(10)) - idx - div, rem = divmod(idx, 3) - div - rem - -We can also do elementwise :func:`divmod`: - -.. ipython:: python - - div, rem = divmod(s, [2, 2, 3, 3, 4, 4, 5, 5, 6, 6]) - div - rem - -Missing data / operations with fill values -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In Series and DataFrame (though not yet in Panel), the arithmetic functions -have the option of inputting a *fill_value*, namely a value to substitute when -at most one of the values at a location are missing. For example, when adding -two DataFrame objects, you may wish to treat NaN as 0 unless both DataFrames -are missing that value, in which case the result will be NaN (you can later -replace NaN with some other value using ``fillna`` if you wish). - -.. ipython:: python - :suppress: - - df2 = df.copy() - df2['three']['a'] = 1. - -.. ipython:: python - - df - df2 - df + df2 - df.add(df2, fill_value=0) - -.. _basics.compare: - -Flexible Comparisons -~~~~~~~~~~~~~~~~~~~~ - -Starting in v0.8, pandas introduced binary comparison methods eq, ne, lt, gt, -le, and ge to Series and DataFrame whose behavior is analogous to the binary -arithmetic operations described above: - -.. ipython:: python - - df.gt(df2) - df2.ne(df) - -These operations produce a pandas object the same type as the left-hand-side input -that if of dtype ``bool``. These ``boolean`` objects can be used in indexing operations, -see :ref:`here` - -.. 
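note::
-
-   A boolean result can in turn be used to mask the frame itself; entries
-   where the condition does not hold become ``NaN``. A minimal illustration,
-   reusing the ``df`` and ``df2`` objects from above:
-
-   .. ipython:: python
-
-      # keep the values where df exceeds df2, drop the rest to NaN
-      df[df.gt(df2)]
-
-.. 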
_basics.reductions: - -Boolean Reductions -~~~~~~~~~~~~~~~~~~ - -You can apply the reductions: :attr:`~DataFrame.empty`, :meth:`~DataFrame.any`, -:meth:`~DataFrame.all`, and :meth:`~DataFrame.bool` to provide a -way to summarize a boolean result. - -.. ipython:: python - - (df > 0).all() - (df > 0).any() - -You can reduce to a final boolean value. - -.. ipython:: python - - (df > 0).any().any() - -You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` property. - -.. ipython:: python - - df.empty - pd.DataFrame(columns=list('ABC')).empty - -To evaluate single-element pandas objects in a boolean context, use the method -:meth:`~DataFrame.bool`: - -.. ipython:: python - - pd.Series([True]).bool() - pd.Series([False]).bool() - pd.DataFrame([[True]]).bool() - pd.DataFrame([[False]]).bool() - -.. warning:: - - You might be tempted to do the following: - - .. code-block:: python - - >>> if df: - ... - - Or - - .. code-block:: python - - >>> df and df2 - - These both will raise as you are trying to compare multiple values. - - .. code-block:: python - - ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all(). - -See :ref:`gotchas` for a more detailed discussion. - -.. _basics.equals: - -Comparing if objects are equivalent -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Often you may find there is more than one way to compute the same -result. As a simple example, consider ``df+df`` and ``df*2``. To test -that these two computations produce the same result, given the tools -shown above, you might imagine using ``(df+df == df*2).all()``. But in -fact, this expression is False: - -.. ipython:: python - - df+df == df*2 - (df+df == df*2).all() - -Notice that the boolean DataFrame ``df+df == df*2`` contains some False values! -That is because NaNs do not compare as equals: - -.. ipython:: python - - np.nan == np.nan - -So, as of v0.13.1, NDFrames (such as Series, DataFrames, and Panels) -have an :meth:`~DataFrame.equals` method for testing equality, with NaNs in -corresponding locations treated as equal. - -.. ipython:: python - - (df+df).equals(df*2) - -Note that the Series or DataFrame index needs to be in the same order for -equality to be True: - -.. ipython:: python - - df1 = pd.DataFrame({'col':['foo', 0, np.nan]}) - df2 = pd.DataFrame({'col':[np.nan, 0, 'foo']}, index=[2,1,0]) - df1.equals(df2) - df1.equals(df2.sort_index()) - -Comparing array-like objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You can conveniently do element-wise comparisons when comparing a pandas -data structure with a scalar value: - -.. ipython:: python - - pd.Series(['foo', 'bar', 'baz']) == 'foo' - pd.Index(['foo', 'bar', 'baz']) == 'foo' - -Pandas also handles element-wise comparisons between different array-like -objects of the same length: - -.. ipython:: python - - pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux']) - pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux']) - -Trying to compare ``Index`` or ``Series`` objects of different lengths will -raise a ValueError: - -.. code-block:: ipython - - In [55]: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo', 'bar']) - ValueError: Series lengths must match to compare - - In [56]: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo']) - ValueError: Series lengths must match to compare - -Note that this is different from the numpy behavior where a comparison can -be broadcast: - -.. ipython:: python - - np.array([1, 2, 3]) == np.array([2]) - -or it can return False if broadcasting can not be done: - -.. 
ipython:: python - :okwarning: - - np.array([1, 2, 3]) == np.array([1, 2]) - -Combining overlapping data sets -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A problem occasionally arising is the combination of two similar data sets -where values in one are preferred over the other. An example would be two data -series representing a particular economic indicator where one is considered to -be of "higher quality". However, the lower quality series might extend further -back in history or have more complete data coverage. As such, we would like to -combine two DataFrame objects where missing values in one DataFrame are -conditionally filled with like-labeled values from the other DataFrame. The -function implementing this operation is :meth:`~DataFrame.combine_first`, -which we illustrate: - -.. ipython:: python - - df1 = pd.DataFrame({'A' : [1., np.nan, 3., 5., np.nan], - 'B' : [np.nan, 2., 3., np.nan, 6.]}) - df2 = pd.DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], - 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) - df1 - df2 - df1.combine_first(df2) - -General DataFrame Combine -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The :meth:`~DataFrame.combine_first` method above calls the more general -DataFrame method :meth:`~DataFrame.combine`. This method takes another DataFrame -and a combiner function, aligns the input DataFrame and then passes the combiner -function pairs of Series (i.e., columns whose names are the same). - -So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: - -.. ipython:: python - - combiner = lambda x, y: np.where(pd.isnull(x), y, x) - df1.combine(df2, combiner) - -.. _basics.stats: - -Descriptive statistics ----------------------- - -A large number of methods for computing descriptive statistics and other related -operations on :ref:`Series `, :ref:`DataFrame -`, and :ref:`Panel `. Most of these -are aggregations (hence producing a lower-dimensional result) like -:meth:`~DataFrame.sum`, :meth:`~DataFrame.mean`, and :meth:`~DataFrame.quantile`, -but some of them, like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod`, -produce an object of the same size. Generally speaking, these methods take an -**axis** argument, just like *ndarray.{sum, std, ...}*, but the axis can be -specified by name or integer: - - - **Series**: no axis argument needed - - **DataFrame**: "index" (axis=0, default), "columns" (axis=1) - - **Panel**: "items" (axis=0), "major" (axis=1, default), "minor" - (axis=2) - -For example: - -.. ipython:: python - - df - df.mean(0) - df.mean(1) - -All such methods have a ``skipna`` option signaling whether to exclude missing -data (``True`` by default): - -.. ipython:: python - - df.sum(0, skipna=False) - df.sum(axis=1, skipna=True) - -Combined with the broadcasting / arithmetic behavior, one can describe various -statistical procedures, like standardization (rendering data zero mean and -standard deviation 1), very concisely: - -.. ipython:: python - - ts_stand = (df - df.mean()) / df.std() - ts_stand.std() - xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0) - xs_stand.std(1) - -Note that methods like :meth:`~DataFrame.cumsum` and :meth:`~DataFrame.cumprod` -preserve the location of ``NaN`` values. This is somewhat different from -:meth:`~DataFrame.expanding` and :meth:`~DataFrame.rolling`. -For more details please see :ref:`this note `. - -.. ipython:: python - - df.cumsum() - -Here is a quick reference summary table of common functions. Each also takes an -optional ``level`` parameter which applies only if the object has a -:ref:`hierarchical index`. 
- -.. csv-table:: - :header: "Function", "Description" - :widths: 20, 80 - - ``count``, Number of non-null observations - ``sum``, Sum of values - ``mean``, Mean of values - ``mad``, Mean absolute deviation - ``median``, Arithmetic median of values - ``min``, Minimum - ``max``, Maximum - ``mode``, Mode - ``abs``, Absolute Value - ``prod``, Product of values - ``std``, Bessel-corrected sample standard deviation - ``var``, Unbiased variance - ``sem``, Standard error of the mean - ``skew``, Sample skewness (3rd moment) - ``kurt``, Sample kurtosis (4th moment) - ``quantile``, Sample quantile (value at %) - ``cumsum``, Cumulative sum - ``cumprod``, Cumulative product - ``cummax``, Cumulative maximum - ``cummin``, Cumulative minimum - -Note that by chance some NumPy methods, like ``mean``, ``std``, and ``sum``, -will exclude NAs on Series input by default: - -.. ipython:: python - - np.mean(df['one']) - np.mean(df['one'].values) - -``Series`` also has a method :meth:`~Series.nunique` which will return the -number of unique non-null values: - -.. ipython:: python - - series = pd.Series(np.random.randn(500)) - series[20:500] = np.nan - series[10:20] = 5 - series.nunique() - -.. _basics.describe: - -Summarizing data: describe -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There is a convenient :meth:`~DataFrame.describe` function which computes a variety of summary -statistics about a Series or the columns of a DataFrame (excluding NAs of -course): - -.. ipython:: python - - series = pd.Series(np.random.randn(1000)) - series[::2] = np.nan - series.describe() - frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) - frame.ix[::2] = np.nan - frame.describe() - -You can select specific percentiles to include in the output: - -.. ipython:: python - - series.describe(percentiles=[.05, .25, .75, .95]) - -By default, the median is always included. - -For a non-numerical Series object, :meth:`~Series.describe` will give a simple -summary of the number of unique values and most frequently occurring values: - -.. ipython:: python - - s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) - s.describe() - -Note that on a mixed-type DataFrame object, :meth:`~DataFrame.describe` will -restrict the summary to include only numerical columns or, if none are, only -categorical columns: - -.. ipython:: python - - frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) - frame.describe() - -This behaviour can be controlled by providing a list of types as ``include``/``exclude`` -arguments. The special value ``all`` can also be used: - -.. ipython:: python - - frame.describe(include=['object']) - frame.describe(include=['number']) - frame.describe(include='all') - -That feature relies on :ref:`select_dtypes `. Refer to -there for details about accepted inputs. - -.. _basics.idxmin: - -Index of Min/Max Values -~~~~~~~~~~~~~~~~~~~~~~~ - -The :meth:`~DataFrame.idxmin` and :meth:`~DataFrame.idxmax` functions on Series -and DataFrame compute the index labels with the minimum and maximum -corresponding values: - -.. ipython:: python - - s1 = pd.Series(np.random.randn(5)) - s1 - s1.idxmin(), s1.idxmax() - - df1 = pd.DataFrame(np.random.randn(5,3), columns=['A','B','C']) - df1 - df1.idxmin(axis=0) - df1.idxmax(axis=1) - -When there are multiple rows (or columns) matching the minimum or maximum -value, :meth:`~DataFrame.idxmin` and :meth:`~DataFrame.idxmax` return the first -matching index: - -.. 
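_basics.idxmin_first:
-
-For instance, in one dimension (the values below are hypothetical, chosen so
-that the minimum repeats):
-
-.. ipython:: python
-
-   # the minimum value 1 occurs at positions 1 and 2; the first label wins
-   pd.Series([2, 1, 1, 3]).idxmin()
-
-And likewise with a ``DataFrame``:
-
-.. 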
ipython:: python - - df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) - df3 - df3['A'].idxmin() - -.. note:: - - ``idxmin`` and ``idxmax`` are called ``argmin`` and ``argmax`` in NumPy. - -.. _basics.discretization: - -Value counts (histogramming) / Mode -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The :meth:`~Series.value_counts` Series method and top-level function computes a histogram -of a 1D array of values. It can also be used as a function on regular arrays: - -.. ipython:: python - - data = np.random.randint(0, 7, size=50) - data - s = pd.Series(data) - s.value_counts() - pd.value_counts(data) - -Similarly, you can get the most frequently occurring value(s) (the mode) of the values in a Series or DataFrame: - -.. ipython:: python - - s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) - s5.mode() - df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50), - "B": np.random.randint(-10, 15, size=50)}) - df5.mode() - - -Discretization and quantiling -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Continuous values can be discretized using the :func:`cut` (bins based on values) -and :func:`qcut` (bins based on sample quantiles) functions: - -.. ipython:: python - - arr = np.random.randn(20) - factor = pd.cut(arr, 4) - factor - - factor = pd.cut(arr, [-5, -1, 0, 1, 5]) - factor - -:func:`qcut` computes sample quantiles. For example, we could slice up some -normally distributed data into equal-size quartiles like so: - -.. ipython:: python - - arr = np.random.randn(30) - factor = pd.qcut(arr, [0, .25, .5, .75, 1]) - factor - pd.value_counts(factor) - -We can also pass infinite values to define the bins: - -.. ipython:: python - - arr = np.random.randn(20) - factor = pd.cut(arr, [-np.inf, 0, np.inf]) - factor - -.. _basics.apply: - -Function application --------------------- - -To apply your own or another library's functions to pandas objects, -you should be aware of the three methods below. The appropriate -method to use depends on whether your function expects to operate -on an entire ``DataFrame`` or ``Series``, row- or column-wise, or elementwise. - -1. `Tablewise Function Application`_: :meth:`~DataFrame.pipe` -2. `Row or Column-wise Function Application`_: :meth:`~DataFrame.apply` -3. Elementwise_ function application: :meth:`~DataFrame.applymap` - -.. _basics.pipe: - -Tablewise Function Application -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 0.16.2 - -``DataFrames`` and ``Series`` can of course just be passed into functions. -However, if the function needs to be called in a chain, consider using the :meth:`~DataFrame.pipe` method. -Compare the following - -.. code-block:: python - - # f, g, and h are functions taking and returning ``DataFrames`` - >>> f(g(h(df), arg1=1), arg2=2, arg3=3) - -with the equivalent - -.. code-block:: python - - >>> (df.pipe(h) - .pipe(g, arg1=1) - .pipe(f, arg2=2, arg3=3) - ) - -Pandas encourages the second style, which is known as method chaining. -``pipe`` makes it easy to use your own or another library's functions -in method chains, alongside pandas' methods. - -In the example above, the functions ``f``, ``g``, and ``h`` each expected the ``DataFrame`` as the first positional argument. -What if the function you wish to apply takes its data as, say, the second argument? -In this case, provide ``pipe`` with a tuple of ``(callable, data_keyword)``. -``.pipe`` will route the ``DataFrame`` to the argument specified in the tuple. - -For example, we can fit a regression using statsmodels. 
Their API expects a formula first and a ``DataFrame`` as the second argument, ``data``. We pass in the function, keyword pair ``(sm.poisson, 'data')`` to ``pipe``: - -.. ipython:: python - - import statsmodels.formula.api as sm - - bb = pd.read_csv('data/baseball.csv', index_col='id') - - (bb.query('h > 0') - .assign(ln_h = lambda df: np.log(df.h)) - .pipe((sm.poisson, 'data'), 'hr ~ ln_h + year + g + C(lg)') - .fit() - .summary() - ) - -The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which -have introduced the popular ``(%>%)`` (read pipe) operator for R_. -The implementation of ``pipe`` here is quite clean and feels right at home in python. -We encourage you to view the source code (``pd.DataFrame.pipe??`` in IPython). - -.. _dplyr: https://github.com/hadley/dplyr -.. _magrittr: https://github.com/smbache/magrittr -.. _R: http://www.r-project.org - - -Row or Column-wise Function Application -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Arbitrary functions can be applied along the axes of a DataFrame or Panel -using the :meth:`~DataFrame.apply` method, which, like the descriptive -statistics methods, take an optional ``axis`` argument: - -.. ipython:: python - - df.apply(np.mean) - df.apply(np.mean, axis=1) - df.apply(lambda x: x.max() - x.min()) - df.apply(np.cumsum) - df.apply(np.exp) - -Depending on the return type of the function passed to :meth:`~DataFrame.apply`, -the result will either be of lower dimension or the same dimension. - -:meth:`~DataFrame.apply` combined with some cleverness can be used to answer many questions -about a data set. For example, suppose we wanted to extract the date where the -maximum value for each column occurred: - -.. ipython:: python - - tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=1000)) - tsdf.apply(lambda x: x.idxmax()) - -You may also pass additional arguments and keyword arguments to the :meth:`~DataFrame.apply` -method. For instance, consider the following function you would like to apply: - -.. code-block:: python - - def subtract_and_divide(x, sub, divide=1): - return (x - sub) / divide - -You may then apply this function as follows: - -.. code-block:: python - - df.apply(subtract_and_divide, args=(5,), divide=3) - -Another useful feature is the ability to pass Series methods to carry out some -Series operation on each column or row: - -.. ipython:: python - :suppress: - - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) - tsdf.values[3:7] = np.nan - -.. ipython:: python - - tsdf - tsdf.apply(pd.Series.interpolate) - - -Finally, :meth:`~DataFrame.apply` takes an argument ``raw`` which is False by default, which -converts each row or column into a Series before applying the function. When -set to True, the passed function will instead receive an ndarray object, which -has positive performance implications if you do not need the indexing -functionality. - -.. seealso:: - - The section on :ref:`GroupBy ` demonstrates related, flexible - functionality for grouping by some criterion, applying, and combining the - results into a Series, DataFrame, etc. - -.. 
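_basics.apply_raw:
-
-As a minimal sketch of the ``raw`` argument described above: with ``raw=True``
-the passed function receives plain ndarrays rather than Series, which can be
-faster when the indexing functionality is not needed (``np.nansum`` is used
-here only so that the missing values above do not swamp the output):
-
-.. ipython:: python
-
-   # each column is handed to np.nansum as a raw ndarray
-   tsdf.apply(np.nansum, raw=True)
-
-.. 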
_Elementwise: - -Applying elementwise Python functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Since not all functions can be vectorized (accept NumPy arrays and return -another array or value), the methods :meth:`~DataFrame.applymap` on DataFrame -and analogously :meth:`~Series.map` on Series accept any Python function taking -a single value and returning a single value. For example: - -.. ipython:: python - :suppress: - - df4 = df_orig.copy() - -.. ipython:: python - - df4 - f = lambda x: len(str(x)) - df4['one'].map(f) - df4.applymap(f) - -:meth:`Series.map` has an additional feature which is that it can be used to easily -"link" or "map" values defined by a secondary series. This is closely related -to :ref:`merging/joining functionality `: - -.. ipython:: python - - s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], - index=['a', 'b', 'c', 'd', 'e']) - t = pd.Series({'six' : 6., 'seven' : 7.}) - s - s.map(t) - - -.. _basics.apply_panel: - -Applying with a Panel -~~~~~~~~~~~~~~~~~~~~~ - -Applying with a ``Panel`` will pass a ``Series`` to the applied function. If the applied -function returns a ``Series``, the result of the application will be a ``Panel``. If the applied function -reduces to a scalar, the result of the application will be a ``DataFrame``. - -.. note:: - - Prior to 0.13.1 ``apply`` on a ``Panel`` would only work on ``ufuncs`` (e.g. ``np.sum/np.max``). - -.. ipython:: python - - import pandas.util.testing as tm - panel = tm.makePanel(5) - panel - panel['ItemA'] - -A transformational apply. - -.. ipython:: python - - result = panel.apply(lambda x: x*2, axis='items') - result - result['ItemA'] - -A reduction operation. - -.. ipython:: python - - panel.apply(lambda x: x.dtype, axis='items') - -A similar reduction type operation - -.. ipython:: python - - panel.apply(lambda x: x.sum(), axis='major_axis') - -This last reduction is equivalent to - -.. ipython:: python - - panel.sum('major_axis') - -A transformation operation that returns a ``Panel``, but is computing -the z-score across the ``major_axis``. - -.. ipython:: python - - result = panel.apply( - lambda x: (x-x.mean())/x.std(), - axis='major_axis') - result - result['ItemA'] - -Apply can also accept multiple axes in the ``axis`` argument. This will pass a -``DataFrame`` of the cross-section to the applied function. - -.. ipython:: python - - f = lambda x: ((x.T-x.mean(1))/x.std(1)).T - - result = panel.apply(f, axis = ['items','major_axis']) - result - result.loc[:,:,'ItemA'] - -This is equivalent to the following - -.. ipython:: python - - result = pd.Panel(dict([ (ax, f(panel.loc[:,:,ax])) - for ax in panel.minor_axis ])) - result - result.loc[:,:,'ItemA'] - - -.. _basics.reindexing: - -Reindexing and altering labels ------------------------------- - -:meth:`~Series.reindex` is the fundamental data alignment method in pandas. -It is used to implement nearly all other features relying on label-alignment -functionality. To *reindex* means to conform the data to match a given set of -labels along a particular axis. This accomplishes several things: - - * Reorders the existing data to match a new set of labels - * Inserts missing value (NA) markers in label locations where no data for - that label existed - * If specified, **fill** data for missing labels using logic (highly relevant - to working with time series data) - -Here is a simple example: - -.. 
ipython:: python - - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - s - s.reindex(['e', 'b', 'f', 'd']) - -Here, the ``f`` label was not contained in the Series and hence appears as -``NaN`` in the result. - -With a DataFrame, you can simultaneously reindex the index and columns: - -.. ipython:: python - - df - df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) - -For convenience, you may utilize the :meth:`~Series.reindex_axis` method, which -takes the labels and a keyword ``axis`` parameter. - -Note that the ``Index`` objects containing the actual axis labels can be -**shared** between objects. So if we have a Series and a DataFrame, the -following can be done: - -.. ipython:: python - - rs = s.reindex(df.index) - rs - rs.index is df.index - -This means that the reindexed Series's index is the same Python object as the -DataFrame's index. - - -.. seealso:: - - :ref:`MultiIndex / Advanced Indexing ` is an even more concise way of - doing reindexing. - -.. note:: - - When writing performance-sensitive code, there is a good reason to spend - some time becoming a reindexing ninja: **many operations are faster on - pre-aligned data**. Adding two unaligned DataFrames internally triggers a - reindexing step. For exploratory analysis you will hardly notice the - difference (because ``reindex`` has been heavily optimized), but when CPU - cycles matter sprinkling a few explicit ``reindex`` calls here and there can - have an impact. - -.. _basics.reindex_like: - -Reindexing to align with another object -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You may wish to take an object and reindex its axes to be labeled the same as -another object. While the syntax for this is straightforward albeit verbose, it -is a common enough operation that the :meth:`~DataFrame.reindex_like` method is -available to make this simpler: - -.. ipython:: python - :suppress: - - df2 = df.reindex(['a', 'b', 'c'], columns=['one', 'two']) - df3 = df2 - df2.mean() - - -.. ipython:: python - - df2 - df3 - df.reindex_like(df2) - -.. _basics.align: - -Aligning objects with each other with ``align`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The :meth:`~Series.align` method is the fastest way to simultaneously align two objects. It -supports a ``join`` argument (related to :ref:`joining and merging `): - - - ``join='outer'``: take the union of the indexes (default) - - ``join='left'``: use the calling object's index - - ``join='right'``: use the passed object's index - - ``join='inner'``: intersect the indexes - -It returns a tuple with both of the reindexed Series: - -.. ipython:: python - - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - s1 = s[:4] - s2 = s[1:] - s1.align(s2) - s1.align(s2, join='inner') - s1.align(s2, join='left') - -.. _basics.df_join: - -For DataFrames, the join method will be applied to both the index and the -columns by default: - -.. ipython:: python - - df.align(df2, join='inner') - -You can also pass an ``axis`` option to only align on the specified axis: - -.. ipython:: python - - df.align(df2, join='inner', axis=0) - -.. _basics.align.frame.series: - -If you pass a Series to :meth:`DataFrame.align`, you can choose to align both -objects either on the DataFrame's index or columns using the ``axis`` argument: - -.. ipython:: python - - df.align(df2.ix[0], axis=1) - -.. 
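_basics.reindex_axis_example:
-
-As a small sketch of the :meth:`~DataFrame.reindex_axis` convenience method
-mentioned earlier, the same column reordering can be spelled with a list of
-labels plus an explicit ``axis`` keyword:
-
-.. ipython:: python
-
-   df.reindex_axis(['three', 'two', 'one'], axis=1)
-
-.. 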
_basics.reindex_fill: - -Filling while reindexing -~~~~~~~~~~~~~~~~~~~~~~~~ - -:meth:`~Series.reindex` takes an optional parameter ``method`` which is a -filling method chosen from the following table: - -.. csv-table:: - :header: "Method", "Action" - :widths: 30, 50 - - pad / ffill, Fill values forward - bfill / backfill, Fill values backward - nearest, Fill from the nearest index value - -We illustrate these fill methods on a simple Series: - -.. ipython:: python - - rng = pd.date_range('1/3/2000', periods=8) - ts = pd.Series(np.random.randn(8), index=rng) - ts2 = ts[[0, 3, 6]] - ts - ts2 - - ts2.reindex(ts.index) - ts2.reindex(ts.index, method='ffill') - ts2.reindex(ts.index, method='bfill') - ts2.reindex(ts.index, method='nearest') - -These methods require that the indexes are **ordered** increasing or -decreasing. - -Note that the same result could have been achieved using -:ref:`fillna ` (except for ``method='nearest'``) or -:ref:`interpolate `: - -.. ipython:: python - - ts2.reindex(ts.index).fillna(method='ffill') - -:meth:`~Series.reindex` will raise a ValueError if the index is not monotonic -increasing or decreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate` -will not make any checks on the order of the index. - -.. _basics.limits_on_reindex_fill: - -Limits on filling while reindexing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The ``limit`` and ``tolerance`` arguments provide additional control over -filling while reindexing. Limit specifies the maximum count of consecutive -matches: - -.. ipython:: python - - ts2.reindex(ts.index, method='ffill', limit=1) - -In contrast, tolerance specifies the maximum distance between the index and -indexer values: - -.. ipython:: python - - ts2.reindex(ts.index, method='ffill', tolerance='1 day') - -Notice that when used on a ``DatetimeIndex``, ``TimedeltaIndex`` or -``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. -This allows you to specify tolerance with appropriate strings. - -.. _basics.drop: - -Dropping labels from an axis -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A method closely related to ``reindex`` is the :meth:`~DataFrame.drop` function. -It removes a set of labels from an axis: - -.. ipython:: python - - df - df.drop(['a', 'd'], axis=0) - df.drop(['one'], axis=1) - -Note that the following also works, but is a bit less obvious / clean: - -.. ipython:: python - - df.reindex(df.index.difference(['a', 'd'])) - -.. _basics.rename: - -Renaming / mapping labels -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The :meth:`~DataFrame.rename` method allows you to relabel an axis based on some -mapping (a dict or Series) or an arbitrary function. - -.. ipython:: python - - s - s.rename(str.upper) - -If you pass a function, it must return a value when called with any of the -labels (and must produce a set of unique values). A dict or -Series can also be used: - -.. ipython:: python - - df.rename(columns={'one' : 'foo', 'two' : 'bar'}, - index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) - -If the mapping doesn't include a column/index label, it isn't renamed. Also -extra labels in the mapping don't throw an error. - -The :meth:`~DataFrame.rename` method also provides an ``inplace`` named -parameter that is by default ``False`` and copies the underlying data. Pass -``inplace=True`` to rename the data in place. - -.. versionadded:: 0.18.0 - -Finally, :meth:`~Series.rename` also accepts a scalar or list-like -for altering the ``Series.name`` attribute. - -.. ipython:: python - - s.rename("scalar-name") - -.. 
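_basics.rename_inplace:
-
-To illustrate the ``inplace`` flag discussed above, a throwaway copy is
-renamed here so that ``df`` itself is left untouched:
-
-.. ipython:: python
-
-   df_renamed = df.copy()
-   df_renamed.rename(columns={'one': 'foo'}, inplace=True)
-   df_renamed.columns
-
-.. 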
_basics.rename_axis: - -The Panel class has a related :meth:`~Panel.rename_axis` class which can rename -any of its three axes. - -.. _basics.iteration: - -Iteration ---------- - -The behavior of basic iteration over pandas objects depends on the type. -When iterating over a Series, it is regarded as array-like, and basic iteration -produces the values. Other data structures, like DataFrame and Panel, -follow the dict-like convention of iterating over the "keys" of the -objects. - -In short, basic iteration (``for i in object``) produces: - -* **Series**: values -* **DataFrame**: column labels -* **Panel**: item labels - -Thus, for example, iterating over a DataFrame gives you the column names: - -.. ipython:: - - In [0]: df = pd.DataFrame({'col1' : np.random.randn(3), 'col2' : np.random.randn(3)}, - ...: index=['a', 'b', 'c']) - - In [0]: for col in df: - ...: print(col) - ...: - -Pandas objects also have the dict-like :meth:`~DataFrame.iteritems` method to -iterate over the (key, value) pairs. - -To iterate over the rows of a DataFrame, you can use the following methods: - -* :meth:`~DataFrame.iterrows`: Iterate over the rows of a DataFrame as (index, Series) pairs. - This converts the rows to Series objects, which can change the dtypes and has some - performance implications. -* :meth:`~DataFrame.itertuples`: Iterate over the rows of a DataFrame - as namedtuples of the values. This is a lot faster than - :meth:`~DataFrame.iterrows`, and is in most cases preferable to use - to iterate over the values of a DataFrame. - -.. warning:: - - Iterating through pandas objects is generally **slow**. In many cases, - iterating manually over the rows is not needed and can be avoided with - one of the following approaches: - - * Look for a *vectorized* solution: many operations can be performed using - built-in methods or numpy functions, (boolean) indexing, ... - - * When you have a function that cannot work on the full DataFrame/Series - at once, it is better to use :meth:`~DataFrame.apply` instead of iterating - over the values. See the docs on :ref:`function application `. - - * If you need to do iterative manipulations on the values but performance is - important, consider writing the inner loop using e.g. cython or numba. - See the :ref:`enhancing performance ` section for some - examples of this approach. - -.. warning:: - - You should **never modify** something you are iterating over. - This is not guaranteed to work in all cases. Depending on the - data types, the iterator returns a copy and not a view, and writing - to it will have no effect! - - For example, in the following case setting the value has no effect: - - .. ipython:: python - - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}) - - for index, row in df.iterrows(): - row['a'] = 10 - - df - -iteritems -~~~~~~~~~ - -Consistent with the dict-like interface, :meth:`~DataFrame.iteritems` iterates -through key-value pairs: - -* **Series**: (index, scalar value) pairs -* **DataFrame**: (column, Series) pairs -* **Panel**: (item, DataFrame) pairs - -For example: - -.. ipython:: - - In [0]: for item, frame in wp.iteritems(): - ...: print(item) - ...: print(frame) - ...: - -.. _basics.iterrows: - -iterrows -~~~~~~~~ - -:meth:`~DataFrame.iterrows` allows you to iterate through the rows of a -DataFrame as Series objects. It returns an iterator yielding each -index value along with a Series containing the data in each row: - -.. 
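_basics.iterrows_peek:
-
-For a first taste you can pull a single row off the iterator; ``next`` below
-is just the standard Python builtin:
-
-.. ipython:: python
-
-   row_index, row = next(df.iterrows())
-   row_index
-   row
-
-For example:
-
-.. 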
ipython:: - - In [0]: for row_index, row in df.iterrows(): - ...: print('%s\n%s' % (row_index, row)) - ...: - -.. note:: - - Because :meth:`~DataFrame.iterrows` returns a Series for each row, - it does **not** preserve dtypes across the rows (dtypes are - preserved across columns for DataFrames). For example, - - .. ipython:: python - - df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) - df_orig.dtypes - row = next(df_orig.iterrows())[1] - row - - All values in ``row``, returned as a Series, are now upcasted - to floats, also the original integer value in column `x`: - - .. ipython:: python - - row['int'].dtype - df_orig['int'].dtype - - To preserve dtypes while iterating over the rows, it is better - to use :meth:`~DataFrame.itertuples` which returns namedtuples of the values - and which is generally much faster as ``iterrows``. - -For instance, a contrived way to transpose the DataFrame would be: - -.. ipython:: python - - df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) - print(df2) - print(df2.T) - - df2_t = pd.DataFrame(dict((idx,values) for idx, values in df2.iterrows())) - print(df2_t) - -itertuples -~~~~~~~~~~ - -The :meth:`~DataFrame.itertuples` method will return an iterator -yielding a namedtuple for each row in the DataFrame. The first element -of the tuple will be the row's corresponding index value, while the -remaining values are the row values. - -For instance, - -.. ipython:: python - - for row in df.itertuples(): - print(row) - -This method does not convert the row to a Series object but just -returns the values inside a namedtuple. Therefore, -:meth:`~DataFrame.itertuples` preserves the data type of the values -and is generally faster as :meth:`~DataFrame.iterrows`. - -.. note:: - - The column names will be renamed to positional names if they are - invalid Python identifiers, repeated, or start with an underscore. - With a large number of columns (>255), regular tuples are returned. - -.. _basics.dt_accessors: - -.dt accessor ------------- - -``Series`` has an accessor to succinctly return datetime like properties for the -*values* of the Series, if it is a datetime/period like Series. -This will return a Series, indexed like the existing Series. - -.. ipython:: python - - # datetime - s = pd.Series(pd.date_range('20130101 09:10:12', periods=4)) - s - s.dt.hour - s.dt.second - s.dt.day - -This enables nice expressions like this: - -.. ipython:: python - - s[s.dt.day==2] - -You can easily produces tz aware transformations: - -.. ipython:: python - - stz = s.dt.tz_localize('US/Eastern') - stz - stz.dt.tz - -You can also chain these types of operations: - -.. ipython:: python - - s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') - -You can also format datetime values as strings with :meth:`Series.dt.strftime` which -supports the same format as the standard :meth:`~datetime.datetime.strftime`. - -.. ipython:: python - - # DatetimeIndex - s = pd.Series(pd.date_range('20130101', periods=4)) - s - s.dt.strftime('%Y/%m/%d') - -.. ipython:: python - - # PeriodIndex - s = pd.Series(pd.period_range('20130101', periods=4)) - s - s.dt.strftime('%Y/%m/%d') - -The ``.dt`` accessor works for period and timedelta dtypes. - -.. ipython:: python - - # period - s = pd.Series(pd.period_range('20130101', periods=4, freq='D')) - s - s.dt.year - s.dt.day - -.. ipython:: python - - # timedelta - s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s')) - s - s.dt.days - s.dt.seconds - s.dt.components - -.. 
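_basics.dt_round:
-
-Datetime-like values can also be rounded to a fixed frequency with
-:meth:`~Series.dt.round`; the daily frequency below is just one possible
-choice:
-
-.. ipython:: python
-
-   # round each timestamp to the nearest day
-   s = pd.Series(pd.date_range('20130101 09:10:12', periods=4))
-   s.dt.round('D')
-
-.. 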
note:: - - ``Series.dt`` will raise a ``TypeError`` if you access with a non-datetimelike values - -Vectorized string methods -------------------------- - -Series is equipped with a set of string processing methods that make it easy to -operate on each element of the array. Perhaps most importantly, these methods -exclude missing/NA values automatically. These are accessed via the Series's -``str`` attribute and generally have names matching the equivalent (scalar) -built-in string methods. For example: - - .. ipython:: python - - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) - s.str.lower() - -Powerful pattern-matching methods are provided as well, but note that -pattern-matching generally uses `regular expressions -`__ by default (and in some cases -always uses them). - -Please see :ref:`Vectorized String Methods ` for a complete -description. - -.. _basics.sorting: - -Sorting -------- - -.. warning:: - - The sorting API is substantially changed in 0.17.0, see :ref:`here ` for these changes. - In particular, all sorting methods now return a new object by default, and **DO NOT** operate in-place (except by passing ``inplace=True``). - -There are two obvious kinds of sorting that you may be interested in: sorting -by label and sorting by actual values. - -By Index -~~~~~~~~ - -The primary method for sorting axis -labels (indexes) are the ``Series.sort_index()`` and the ``DataFrame.sort_index()`` methods. - -.. ipython:: python - - unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], - columns=['three', 'two', 'one']) - - # DataFrame - unsorted_df.sort_index() - unsorted_df.sort_index(ascending=False) - unsorted_df.sort_index(axis=1) - - # Series - unsorted_df['three'].sort_index() - -By Values -~~~~~~~~~ - -The :meth:`Series.sort_values` and :meth:`DataFrame.sort_values` are the entry points for **value** sorting (that is the values in a column or row). -:meth:`DataFrame.sort_values` can accept an optional ``by`` argument for ``axis=0`` -which will use an arbitrary vector or a column name of the DataFrame to -determine the sort order: - -.. ipython:: python - - df1 = pd.DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) - df1.sort_values(by='two') - -The ``by`` argument can take a list of column names, e.g.: - -.. ipython:: python - - df1[['one', 'two', 'three']].sort_values(by=['one','two']) - -These methods have special treatment of NA values via the ``na_position`` -argument: - -.. ipython:: python - - s[2] = np.nan - s.sort_values() - s.sort_values(na_position='first') - - -.. _basics.searchsorted: - -searchsorted -~~~~~~~~~~~~ - -Series has the :meth:`~Series.searchsorted` method, which works similar to -:meth:`numpy.ndarray.searchsorted`. - -.. ipython:: python - - ser = pd.Series([1, 2, 3]) - ser.searchsorted([0, 3]) - ser.searchsorted([0, 4]) - ser.searchsorted([1, 3], side='right') - ser.searchsorted([1, 3], side='left') - ser = pd.Series([3, 1, 2]) - ser.searchsorted([0, 3], sorter=np.argsort(ser)) - -.. _basics.nsorted: - -smallest / largest values -~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. versionadded:: 0.14.0 - -``Series`` has the :meth:`~Series.nsmallest` and :meth:`~Series.nlargest` methods which return the -smallest or largest :math:`n` values. For a large ``Series`` this can be much -faster than sorting the entire Series and calling ``head(n)`` on the result. - -.. ipython:: python - - s = pd.Series(np.random.permutation(10)) - s - s.sort_values() - s.nsmallest(3) - s.nlargest(3) - -.. 
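_basics.nlargest_sorted:
-
-``nlargest`` returns its result ordered from largest to smallest; one way to
-present the three largest values in ascending order instead is to chain
-``sort_values``:
-
-.. ipython:: python
-
-   s.nlargest(3).sort_values()
-
-.. 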
versionadded:: 0.17.0 - -``DataFrame`` also has the ``nlargest`` and ``nsmallest`` methods. - -.. ipython:: python - - df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1], - 'b': list('abdceff'), - 'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]}) - df.nlargest(3, 'a') - df.nlargest(5, ['a', 'c']) - df.nsmallest(3, 'a') - df.nsmallest(5, ['a', 'c']) - - -.. _basics.multi-index_sorting: - -Sorting by a multi-index column -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -You must be explicit about sorting when the column is a multi-index, and fully specify -all levels to ``by``. - -.. ipython:: python - - df1.columns = pd.MultiIndex.from_tuples([('a','one'),('a','two'),('b','three')]) - df1.sort_values(by=('a','two')) - - -Copying -------- - -The :meth:`~DataFrame.copy` method on pandas objects copies the underlying data (though not -the axis indexes, since they are immutable) and returns a new object. Note that -**it is seldom necessary to copy objects**. For example, there are only a -handful of ways to alter a DataFrame *in-place*: - - * Inserting, deleting, or modifying a column - * Assigning to the ``index`` or ``columns`` attributes - * For homogeneous data, directly modifying the values via the ``values`` - attribute or advanced indexing - -To be clear, no pandas methods have the side effect of modifying your data; -almost all methods return new objects, leaving the original object -untouched. If data is modified, it is because you did so explicitly. - -.. _basics.dtypes: - -dtypes ------- - -The main types stored in pandas objects are ``float``, ``int``, ``bool``, -``datetime64[ns]`` and ``datetime64[ns, tz]`` (in >= 0.17.0), ``timedelta[ns]``, ``category`` (in >= 0.15.0), and ``object``. In addition these dtypes -have item sizes, e.g. ``int64`` and ``int32``. See :ref:`Series with TZ ` for more detail on ``datetime64[ns, tz]`` dtypes. - -A convenient :attr:`~DataFrame.dtypes` attribute for DataFrames returns a Series with the data type of each column. - -.. ipython:: python - - dft = pd.DataFrame(dict(A = np.random.rand(3), - B = 1, - C = 'foo', - D = pd.Timestamp('20010102'), - E = pd.Series([1.0]*3).astype('float32'), - F = False, - G = pd.Series([1]*3,dtype='int8'))) - dft - dft.dtypes - -On a ``Series`` use the :attr:`~Series.dtype` attribute. - -.. ipython:: python - - dft['A'].dtype - -If a pandas object contains data multiple dtypes *IN A SINGLE COLUMN*, the dtype of the -column will be chosen to accommodate all of the data types (``object`` is the most -general). - -.. ipython:: python - - # these ints are coerced to floats - pd.Series([1, 2, 3, 4, 5, 6.]) - - # string data forces an ``object`` dtype - pd.Series([1, 2, 3, 6., 'foo']) - -The method :meth:`~DataFrame.get_dtype_counts` will return the number of columns of -each type in a ``DataFrame``: - -.. ipython:: python - - dft.get_dtype_counts() - -Numeric dtypes will propagate and can coexist in DataFrames (starting in v0.11.0). -If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, -or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, -different numeric dtypes will **NOT** be combined. The following example will give you a taste. - -.. 
ipython:: python - - df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') - df1 - df1.dtypes - df2 = pd.DataFrame(dict( A = pd.Series(np.random.randn(8), dtype='float16'), - B = pd.Series(np.random.randn(8)), - C = pd.Series(np.array(np.random.randn(8), dtype='uint8')) )) - df2 - df2.dtypes - -defaults -~~~~~~~~ - -By default integer types are ``int64`` and float types are ``float64``, -*REGARDLESS* of platform (32-bit or 64-bit). The following will all result in ``int64`` dtypes. - -.. ipython:: python - - pd.DataFrame([1, 2], columns=['a']).dtypes - pd.DataFrame({'a': [1, 2]}).dtypes - pd.DataFrame({'a': 1 }, index=list(range(2))).dtypes - -Numpy, however will choose *platform-dependent* types when creating arrays. -The following **WILL** result in ``int32`` on 32-bit platform. - -.. ipython:: python - - frame = pd.DataFrame(np.array([1, 2])) - - -upcasting -~~~~~~~~~ - -Types can potentially be *upcasted* when combined with other types, meaning they are promoted -from the current type (say ``int`` to ``float``) - -.. ipython:: python - - df3 = df1.reindex_like(df2).fillna(value=0.0) + df2 - df3 - df3.dtypes - -The ``values`` attribute on a DataFrame return the *lower-common-denominator* of the dtypes, meaning -the dtype that can accommodate **ALL** of the types in the resulting homogeneous dtyped numpy array. This can -force some *upcasting*. - -.. ipython:: python - - df3.values.dtype - -astype -~~~~~~ - -.. _basics.cast: - -You can use the :meth:`~DataFrame.astype` method to explicitly convert dtypes from one to another. These will by default return a copy, -even if the dtype was unchanged (pass ``copy=False`` to change this behavior). In addition, they will raise an -exception if the astype operation is invalid. - -Upcasting is always according to the **numpy** rules. If two different dtypes are involved in an operation, -then the more *general* one will be used as the result of the operation. - -.. ipython:: python - - df3 - df3.dtypes - - # conversion of dtypes - df3.astype('float32').dtypes - - -Convert a subset of columns to a specified type using :meth:`~DataFrame.astype` - -.. ipython:: python - - dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]}) - dft[['a','b']] = dft[['a','b']].astype(np.uint8) - dft - dft.dtypes - -.. versionadded:: 0.19.0 - -Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFrame.astype` - -.. ipython:: python - - dft1 = pd.DataFrame({'a': [1,0,1], 'b': [4,5,6], 'c': [7, 8, 9]}) - dft1 = dft1.astype({'a': np.bool, 'c': np.float64}) - dft1 - dft1.dtypes - -.. note:: - - When trying to convert a subset of columns to a specified type using :meth:`~DataFrame.astype` and :meth:`~DataFrame.loc`, upcasting occurs. - - :meth:`~DataFrame.loc` tries to fit in what we are assigning to the current dtypes, while ``[]`` will overwrite them taking the dtype from the right hand side. Therefore the following piece of code produces the unintended result. - - .. ipython:: python - - dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]}) - dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes - dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) - dft.dtypes - -.. _basics.object_conversion: - -object conversion -~~~~~~~~~~~~~~~~~ - -pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. -The following functions are available for one dimensional object arrays or scalars: - -- :meth:`~pandas.to_numeric` (conversion to numeric dtypes) - - .. 
-NumPy, however, will choose *platform-dependent* types when creating arrays.
-The following **WILL** result in ``int32`` on a 32-bit platform.
-
-.. ipython:: python
-
-   frame = pd.DataFrame(np.array([1, 2]))
-
-
-upcasting
-~~~~~~~~~
-
-Types can potentially be *upcasted* when combined with other types, meaning they
-are promoted from the current type (say ``int`` to ``float``).
-
-.. ipython:: python
-
-   df3 = df1.reindex_like(df2).fillna(value=0.0) + df2
-   df3
-   df3.dtypes
-
-The ``values`` attribute on a DataFrame returns the *lower-common-denominator* of
-the dtypes, meaning the dtype that can accommodate **ALL** of the types in the
-resulting homogeneous dtyped numpy array. This can force some *upcasting*.
-
-.. ipython:: python
-
-   df3.values.dtype
-
-astype
-~~~~~~
-
-.. _basics.cast:
-
-You can use the :meth:`~DataFrame.astype` method to explicitly convert dtypes from
-one to another. These will by default return a copy, even if the dtype was unchanged
-(pass ``copy=False`` to change this behavior). In addition, they will raise an
-exception if the astype operation is invalid.
-
-Upcasting is always according to the **numpy** rules. If two different dtypes are
-involved in an operation, then the more *general* one will be used as the result
-of the operation.
-
-.. ipython:: python
-
-   df3
-   df3.dtypes
-
-   # conversion of dtypes
-   df3.astype('float32').dtypes
-
-
-Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`.
-
-.. ipython:: python
-
-   dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
-   dft[['a', 'b']] = dft[['a', 'b']].astype(np.uint8)
-   dft
-   dft.dtypes
-
-.. versionadded:: 0.19.0
-
-Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFrame.astype`.
-
-.. ipython:: python
-
-   dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]})
-   dft1 = dft1.astype({'a': np.bool, 'c': np.float64})
-   dft1
-   dft1.dtypes
-
-.. note::
-
-    When trying to convert a subset of columns to a specified type using
-    :meth:`~DataFrame.astype` and :meth:`~DataFrame.loc`, upcasting occurs.
-
-    :meth:`~DataFrame.loc` tries to fit what we are assigning into the current
-    dtypes, while ``[]`` will overwrite them, taking the dtype from the right-hand
-    side. Therefore the following piece of code produces the unintended result.
-
-    .. ipython:: python
-
-       dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
-       dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes
-       dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8)
-       dft.dtypes
-
-.. _basics.object_conversion:
-
-object conversion
-~~~~~~~~~~~~~~~~~
-
-pandas offers various functions to try to force conversion of types from the
-``object`` dtype to other types. The following functions are available for
-one-dimensional object arrays or scalars:
-
-- :meth:`~pandas.to_numeric` (conversion to numeric dtypes)
-
-  .. ipython:: python
-
-     m = ['1.1', 2, 3]
-     pd.to_numeric(m)
-
-- :meth:`~pandas.to_datetime` (conversion to datetime objects)
-
-  .. ipython:: python
-
-     import datetime
-     m = ['2016-07-09', datetime.datetime(2016, 3, 2)]
-     pd.to_datetime(m)
-
-- :meth:`~pandas.to_timedelta` (conversion to timedelta objects)
-
-  .. ipython:: python
-
-     m = ['5us', pd.Timedelta('1day')]
-     pd.to_timedelta(m)
-
-To force a conversion, we can pass in an ``errors`` argument, which specifies how
-pandas should deal with elements that cannot be converted to the desired dtype or
-object. By default, ``errors='raise'``, meaning that any errors encountered will
-be raised during the conversion process. However, if ``errors='coerce'``, these
-errors will be ignored and pandas will convert problematic elements to ``pd.NaT``
-(for datetime and timedelta) or ``np.nan`` (for numeric). This might be useful if
-you are reading in data which is mostly of the desired dtype (e.g. numeric,
-datetime), but occasionally has non-conforming elements intermixed that you want
-to represent as missing:
-
-.. ipython:: python
-
-   import datetime
-   m = ['apple', datetime.datetime(2016, 3, 2)]
-   pd.to_datetime(m, errors='coerce')
-
-   m = ['apple', 2, 3]
-   pd.to_numeric(m, errors='coerce')
-
-   m = ['apple', pd.Timedelta('1day')]
-   pd.to_timedelta(m, errors='coerce')
-
-The ``errors`` parameter has a third option of ``errors='ignore'``, which will
-simply return the passed-in data if it encounters any errors with the conversion
-to a desired data type:
-
-.. ipython:: python
-
-   import datetime
-   m = ['apple', datetime.datetime(2016, 3, 2)]
-   pd.to_datetime(m, errors='ignore')
-
-   m = ['apple', 2, 3]
-   pd.to_numeric(m, errors='ignore')
-
-   m = ['apple', pd.Timedelta('1day')]
-   pd.to_timedelta(m, errors='ignore')
-
-In addition to object conversion, :meth:`~pandas.to_numeric` provides another
-argument ``downcast``, which gives the option of downcasting the newly (or
-already) numeric data to a smaller dtype, which can conserve memory:
-
-.. ipython:: python
-
-   m = ['1', 2, 3]
-   pd.to_numeric(m, downcast='integer')   # smallest signed int dtype
-   pd.to_numeric(m, downcast='signed')    # same as 'integer'
-   pd.to_numeric(m, downcast='unsigned')  # smallest unsigned int dtype
-   pd.to_numeric(m, downcast='float')     # smallest float dtype
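-The memory saving is easy to verify. A minimal illustrative sketch (exact sizes
-vary with the data; here 1000 small integers downcast from the default ``int64``):
-
-.. ipython:: python
-
-   s = pd.Series(range(1000))
-   s.nbytes
-   pd.to_numeric(s, downcast='unsigned').nbytes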
-As these methods apply only to one-dimensional arrays, lists, or scalars, they
-cannot be used directly on multi-dimensional objects such as DataFrames. However,
-with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each
-column efficiently:
-
-.. ipython:: python
-
-   import datetime
-   df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O')
-   df
-   df.apply(pd.to_datetime)
-
-   df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O')
-   df
-   df.apply(pd.to_numeric)
-
-   df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O')
-   df
-   df.apply(pd.to_timedelta)
-
-gotchas
-~~~~~~~
-
-Performing selection operations on ``integer`` type data can easily upcast the
-data to ``floating``. The dtype of the input data will be preserved in cases
-where ``nans`` are not introduced (starting in 0.11.0). See also
-:ref:`integer na gotchas <gotchas.intna>`.
-
-.. ipython:: python
-
-   dfi = df3.astype('int32')
-   dfi['E'] = 1
-   dfi
-   dfi.dtypes
-
-   casted = dfi[dfi > 0]
-   casted
-   casted.dtypes
-
-Float dtypes, by contrast, are unchanged.
-
-.. ipython:: python
-
-   dfa = df3.copy()
-   dfa['A'] = dfa['A'].astype('float32')
-   dfa.dtypes
-
-   casted = dfa[df2 > 0]
-   casted
-   casted.dtypes
-
-Selecting columns based on ``dtype``
-------------------------------------
-
-.. _basics.selectdtypes:
-
-.. versionadded:: 0.14.1
-
-The :meth:`~DataFrame.select_dtypes` method implements subsetting of columns
-based on their ``dtype``.
-
-First, let's create a :class:`DataFrame` with a slew of different
-dtypes:
-
-.. ipython:: python
-
-   df = pd.DataFrame({'string': list('abc'),
-                      'int64': list(range(1, 4)),
-                      'uint8': np.arange(3, 6).astype('u1'),
-                      'float64': np.arange(4.0, 7.0),
-                      'bool1': [True, False, True],
-                      'bool2': [False, True, False],
-                      'dates': pd.date_range('now', periods=3).values,
-                      'category': pd.Series(list("ABC")).astype('category')})
-   df['tdeltas'] = df.dates.diff()
-   df['uint64'] = np.arange(3, 6).astype('u8')
-   df['other_dates'] = pd.date_range('20130101', periods=3).values
-   df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern')
-   df
-
-And the dtypes:
-
-.. ipython:: python
-
-   df.dtypes
-
-:meth:`~DataFrame.select_dtypes` has two parameters ``include`` and ``exclude``
-that allow you to say "give me the columns WITH these dtypes" (``include``)
-and/or "give me the columns WITHOUT these dtypes" (``exclude``).
-
-For example, to select ``bool`` columns:
-
-.. ipython:: python
-
-   df.select_dtypes(include=[bool])
-
-You can also pass the name of a dtype in the `numpy dtype hierarchy
-<http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__:
-
-.. ipython:: python
-
-   df.select_dtypes(include=['bool'])
-
-:meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes.
-
-For example, to select all numeric and boolean columns while excluding unsigned
-integers:
-
-.. ipython:: python
-
-   df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger'])
-
-To select string columns you must use the ``object`` dtype:
-
-.. ipython:: python
-
-   df.select_dtypes(include=['object'])
-
-To see all the child dtypes of a generic ``dtype`` like ``numpy.number``, you
-can define a function that returns a tree of child dtypes:
-
-.. ipython:: python
-
-   def subdtypes(dtype):
-       subs = dtype.__subclasses__()
-       if not subs:
-           return dtype
-       return [dtype, [subdtypes(dt) for dt in subs]]
-
-All numpy dtypes are subclasses of ``numpy.generic``:
-
-.. ipython:: python
-
-   subdtypes(np.generic)
-
-.. note::
-
-    Pandas also defines the types ``category`` and ``datetime64[ns, tz]``, which
-    are not integrated into the normal numpy hierarchy and won't show up with the
-    above function.
-
-.. note::
-
-    The ``include`` and ``exclude`` parameters must be non-string sequences.
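
The behaviour introduced by the next patch can be summarised with a short
sketch. This is illustrative only (it assumes a build with the rest of this
series applied); note that with ``errors='ignore'`` a column that cannot be
cast is left unchanged rather than aborting the whole cast, as the tests in
the patches below confirm:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3], 'b': list('xyz')})

    # default errors='raise': casting the string column fails loudly
    # df.astype(np.int64)  # raises on column 'b'

    # errors='ignore': 'a' is cast, 'b' is left as object
    df.astype(np.int64, errors='ignore').dtypes

    # the old keyword still works, but now emits a FutureWarning
    df.astype(np.int64, raise_on_error=False)
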
From e2a3f325221c431efa01acb5b5dada557348aae0 Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Thu, 22 Dec 2016 20:28:03 +0000
Subject: [PATCH 06/10] Deprecate 'raise_on_error' kwarg in favor of 'errors' for DataFrame.astype

Valid arguments for the new 'errors' kwarg are 'ignore' or 'raise'; see #14878

---
 doc/source/whatsnew/v0.20.0.txt   |  2 +-
 pandas/core/generic.py            | 14 +++++++++-----
 pandas/core/internals.py          | 13 ++++++-------
 pandas/tests/frame/test_dtypes.py |  4 ++--
 pandas/tests/test_internals.py    |  2 +-
 5 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index e7b2fc5a6505d..4027edd6eb9eb 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -247,7 +247,7 @@ Deprecations
 - ``Index.repeat()`` and ``MultiIndex.repeat()`` have deprecated the ``n`` parameter in favor of ``repeats`` (:issue:`12662`)
 - ``Categorical.searchsorted()`` and ``Series.searchsorted()`` have deprecated the ``v`` parameter in favor of ``value`` (:issue:`12662`)
 - ``TimedeltaIndex.searchsorted()``, ``DatetimeIndex.searchsorted()``, and ``PeriodIndex.searchsorted()`` have deprecated the ``key`` parameter in favor of ``value`` (:issue:`12662`)
-
+- ``DataFrame.astype()`` has deprecated the ``raise_on_error`` parameter in favor of ``errors`` (:issue:`14878`)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3678168890444..d2dfb9c5992c1 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3061,7 +3061,9 @@ def blocks(self):
         """Internal property, property synonym for as_blocks()"""
         return self.as_blocks()

-    def astype(self, dtype, copy=True, raise_on_error=True, **kwargs):
+    @deprecate_kwarg(old_arg_name='raise_on_error', new_arg_name='errors',
+                     mapping={True: 'raise', False: 'ignore'})
+    def astype(self, dtype, copy=True, errors='raise', **kwargs):
         """
         Cast object to input numpy.dtype
         Return a copy when copy = True (be really careful with this!)
@@ -3073,7 +3075,9 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs):
             the same type. Alternatively, use {col: dtype, ...}, where col is a
             column label and dtype is a numpy.dtype or Python type to cast one
             or more of the DataFrame's columns to column-specific types.
-        raise_on_error : raise on invalid input
+        errors : {'raise', 'ignore'}, default 'raise'
+            - ``raise`` : allow exceptions to be raised on invalid input
+            - ``ignore`` : suppress raising exceptions on invalid input
         kwargs : keyword arguments to pass on to the constructor

         Returns
@@ -3086,7 +3090,7 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
                 raise KeyError('Only the Series name can be used for '
                                'the key in Series dtype mappings.')
             new_type = list(dtype.values())[0]
-            return self.astype(new_type, copy, raise_on_error, **kwargs)
+            return self.astype(new_type, copy, errors, **kwargs)
         elif self.ndim > 2:
             raise NotImplementedError(
                 'astype() only accepts a dtype arg of type dict when '
@@ -3107,8 +3111,8 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
             return concat(results, axis=1, copy=False)

         # else, only a single dtype is given
-        new_data = self._data.astype(dtype=dtype, copy=copy,
-                                     raise_on_error=raise_on_error, **kwargs)
+        new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
+                                     **kwargs)
         return self._constructor(new_data).__finalize__(self)

     def copy(self, deep=True):

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 05ac3356c1770..acc056c72c7c3 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -455,12 +455,11 @@ def downcast(self, dtypes=None, mgr=None):

         return blocks

-    def astype(self, dtype, copy=False, raise_on_error=True, values=None,
-               **kwargs):
-        return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
-                            values=values, **kwargs)
+    def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
+        return self._astype(dtype, copy=copy, errors=errors, values=values,
+                            **kwargs)

-    def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
+    def _astype(self, dtype, copy=False, errors='raise', values=None,
                 klass=None, mgr=None, **kwargs):
         """
         Coerce to the new type (if copy=True, return a new copy)
@@ -507,7 +506,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
             newb = make_block(values, placement=self.mgr_locs, dtype=dtype,
                               klass=klass)
         except:
-            if raise_on_error is True:
+            if errors == 'raise':
                 raise
             newb = self.copy() if copy else self
@@ -2147,7 +2146,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):

         return self.make_block_same_class(new_values, new_mgr_locs)

-    def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
+    def _astype(self, dtype, copy=False, errors='raise', values=None,
                 klass=None, mgr=None):
         """
         Coerce to the new type (if copy=True, return a new copy)

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 43a108e9acc80..f460d76827347 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -357,7 +357,7 @@ def test_astype_with_exclude_string(self):
         df = self.frame.copy()
         expected = self.frame.astype(int)
         df['string'] = 'foo'
-        casted = df.astype(int, raise_on_error=False)
+        casted = df.astype(int, errors='ignore')

         expected['string'] = 'foo'
         assert_frame_equal(casted, expected)
@@ -365,7 +365,7 @@ def test_astype_with_exclude_string(self):
         df = self.frame.copy()
         expected = self.frame.astype(np.int32)
         df['string'] = 'foo'
-        casted = df.astype(np.int32, raise_on_error=False)
+        casted = df.astype(np.int32, errors='ignore')

         expected['string'] = 'foo'
         assert_frame_equal(casted, expected)

diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index db1c8da4cae73..32e8f44e6f258 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -553,7 +553,7 @@ def test_astype(self):
                          'e: f4; f: f2; g: f8')
         for t in ['float16', 'float32', 'float64', 'int32', 'int64']:
             t = np.dtype(t)
-            tmgr = mgr.astype(t, raise_on_error=False)
+            tmgr = mgr.astype(t, errors='ignore')
             self.assertEqual(tmgr.get('c').dtype.type, t)
             self.assertEqual(tmgr.get('e').dtype.type, t)
             self.assertEqual(tmgr.get('f').dtype.type, t)

From 15bdcf48e37e2596639e6fda8e942cd62212bf1a Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Fri, 23 Dec 2016 22:03:19 +0000
Subject: [PATCH 07/10] Check & unit test added for validity of arguments to 'errors' kwarg.

Docstrings to astype clarified.

---
 pandas/core/generic.py         | 5 +++++
 pandas/core/internals.py       | 7 +++++++
 pandas/tests/test_internals.py | 7 +++++++
 3 files changed, 19 insertions(+)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index d2dfb9c5992c1..5ea7770ccf1d3 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3075,9 +3075,14 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
             the same type. Alternatively, use {col: dtype, ...}, where col is a
             column label and dtype is a numpy.dtype or Python type to cast one
             or more of the DataFrame's columns to column-specific types.
+        raise_on_error : raise on invalid input. DEPRECATED use ``errors``
+            instead
         errors : {'raise', 'ignore'}, default 'raise'
             - ``raise`` : allow exceptions to be raised on invalid input
             - ``ignore`` : suppress raising exceptions on invalid input
+
+            .. versionadded:: 0.20.0
+
         kwargs : keyword arguments to pass on to the constructor

         Returns

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index acc056c72c7c3..0e64c6d1993a7 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -465,6 +465,13 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
         Coerce to the new type (if copy=True, return a new copy)
         raise on an except if raise == True
         """
+        errors_legal_values = ('raise', 'ignore')
+
+        if errors not in errors_legal_values:
+            invalid_arg = "Expected value of kwarg 'errors' to be one of %s. "\
+                "Supplied value is '%s'" % (', '.join("'%s'" % arg for arg in
+                                            errors_legal_values), errors)
+            raise ValueError(invalid_arg)

         # may need to convert to categorical
         # this is only called for non-categoricals

diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py
index 32e8f44e6f258..0425849f10934 100644
--- a/pandas/tests/test_internals.py
+++ b/pandas/tests/test_internals.py
@@ -566,6 +566,13 @@ def test_astype(self):
             else:
                 self.assertEqual(tmgr.get('d').dtype.type, t)

+    def test_illegal_arg_for_errors_in_astype(self):
+        """ ValueError exception raised when illegal value used for errors """
+        mgr = create_mgr('a,b,c: i8')
+
+        with self.assertRaises(ValueError):
+            mgr.astype(np.float64, errors=True)
+
     def test_convert(self):
         def _compare(old_mgr, new_mgr):
             """ compare the blocks, numeric compare ==, object don't """

From 39314e631b0ce8ef2f57be8b8904a818780920ee Mon Sep 17 00:00:00 2001
From: Michael Charlton
Date: Tue, 3 Jan 2017 11:47:10 +0000
Subject: [PATCH 08/10] Update after code review #1478

Tests added for deprecated 'raise_on_error' kwarg & new 'errors' kwarg.
Clarified docstring for DataFrame.astype method

---
 pandas/core/generic.py            | 11 ++++++-----
 pandas/core/internals.py          |  6 +++---
 pandas/tests/frame/test_dtypes.py | 18 ++++++++++++++++++
 pandas/tests/test_internals.py    |  7 -------
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 5ea7770ccf1d3..3b008d84454e9 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3075,11 +3075,12 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
             the same type. Alternatively, use {col: dtype, ...}, where col is a
             column label and dtype is a numpy.dtype or Python type to cast one
             or more of the DataFrame's columns to column-specific types.
-        raise_on_error : raise on invalid input. DEPRECATED use ``errors``
-            instead
-        errors : {'raise', 'ignore'}, default 'raise'
-            - ``raise`` : allow exceptions to be raised on invalid input
-            - ``ignore`` : suppress raising exceptions on invalid input
+        raise_on_error : DEPRECATED use ``errors`` instead
+        errors : {'raise', 'ignore'}, default 'raise'.
+            Control raising of exceptions on invalid data for provided dtype.
+
+            - ``raise`` : allow exceptions to be raised
+            - ``ignore`` : suppress exceptions. On error return original object

             .. versionadded:: 0.20.0

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 0e64c6d1993a7..aa865ae430d4a 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -468,9 +468,9 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
         errors_legal_values = ('raise', 'ignore')

         if errors not in errors_legal_values:
-            invalid_arg = "Expected value of kwarg 'errors' to be one of %s. "\
-                "Supplied value is '%s'" % (', '.join("'%s'" % arg for arg in
-                                            errors_legal_values), errors)
+            invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. "
" + "Supplied value is '{}'".format( + list(errors_legal_values), errors)) raise ValueError(invalid_arg) # may need to convert to categorical diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index f460d76827347..dc55d31662c8f 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -523,6 +523,24 @@ def test_timedeltas(self): result = df.get_dtype_counts().sort_values() assert_series_equal(result, expected) + def test_illegal_arg_for_errors_in_astype(self): + # issue #14878 + + df = DataFrame([1, 2, 3]) + + with self.assertRaises(ValueError): + df.astype(np.float64, errors=True) + + def test_depr_kwarg_produces_future_warning(self): + # issue #14878 + + df = DataFrame([1, 2, 3]) + + with tm.assert_produces_warning(FutureWarning): + df.astype(np.int8, raise_on_error=False) + + df.astype(np.int8, errors='ignore') + class TestDataFrameDatetimeWithTZ(tm.TestCase, TestData): diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 0425849f10934..32e8f44e6f258 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -566,13 +566,6 @@ def test_astype(self): else: self.assertEqual(tmgr.get('d').dtype.type, t) - def test_illegal_arg_for_errors_in_astype(self): - """ ValueError exception raised when illegal value used for errors """ - mgr = create_mgr('a,b,c: i8') - - with self.assertRaises(ValueError): - mgr.astype(np.float64, errors=True) - def test_convert(self): def _compare(old_mgr, new_mgr): """ compare the blocks, numeric compare ==, object don't """ From 0ae855067cd8d5d42282ff0956fb774f86f80c8f Mon Sep 17 00:00:00 2001 From: Michael Charlton Date: Tue, 3 Jan 2017 15:14:39 +0000 Subject: [PATCH 09/10] Update after code review of unit tests #1478 Unit tests for DataFrame.astype merged. Dupliacted those tests for Series.astype. Both testing deprecation of 'raise_on_error' kwarg. 
---
 pandas/tests/frame/test_dtypes.py  |  7 +------
 pandas/tests/series/test_dtypes.py | 13 +++++++++++++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index dc55d31662c8f..95c5e7ea6e9fc 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -523,7 +523,7 @@ def test_timedeltas(self):
         result = df.get_dtype_counts().sort_values()
         assert_series_equal(result, expected)

-    def test_illegal_arg_for_errors_in_astype(self):
+    def test_arg_for_errors_in_astype(self):
         # issue #14878

         df = DataFrame([1, 2, 3])
@@ -531,11 +531,6 @@ def test_illegal_arg_for_errors_in_astype(self):
         with self.assertRaises(ValueError):
             df.astype(np.float64, errors=True)

-    def test_depr_kwarg_produces_future_warning(self):
-        # issue #14878
-
-        df = DataFrame([1, 2, 3])
-
         with tm.assert_produces_warning(FutureWarning):
             df.astype(np.int8, raise_on_error=False)

         df.astype(np.int8, errors='ignore')

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index 3eafbaf912797..bf9c64276b693 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -168,3 +168,16 @@ def test_complexx(self):
         b.real = np.arange(5) + 5
         tm.assert_numpy_array_equal(a + 5, b.real)
         tm.assert_numpy_array_equal(4 * a, b.imag)
+
+    def test_arg_for_errors_in_astype(self):
+        # issue #14878
+
+        sr = Series([1, 2, 3])
+
+        with self.assertRaises(ValueError):
+            sr.astype(np.float64, errors=False)
+
+        with tm.assert_produces_warning(FutureWarning):
+            sr.astype(np.int8, raise_on_error=True)
+
+        sr.astype(np.int8, errors='raise')

From b174e6f4dec2faa3eaf9124dc694c3fd92e1a890 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 3 Jan 2017 21:52:39 +0100
Subject: [PATCH 10/10] switch kwarg order in docstring

---
 pandas/core/generic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3b008d84454e9..cd4b95ad48e0d 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3075,7 +3075,6 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
             the same type. Alternatively, use {col: dtype, ...}, where col is a
             column label and dtype is a numpy.dtype or Python type to cast one
             or more of the DataFrame's columns to column-specific types.
-        raise_on_error : DEPRECATED use ``errors`` instead
         errors : {'raise', 'ignore'}, default 'raise'.
             Control raising of exceptions on invalid data for provided dtype.

@@ -3084,6 +3083,7 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):

             .. versionadded:: 0.20.0

+        raise_on_error : DEPRECATED use ``errors`` instead
         kwargs : keyword arguments to pass on to the constructor

         Returns