Source code for iris.pandas

# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Provide conversion to and from Pandas data structures.

See also: http://pandas.pydata.org/

"""

import datetime
from itertools import chain, combinations
import warnings

import cf_units
from cf_units import Unit
import cftime
import numpy as np
import numpy.ma as ma
import pandas

try:
    from pandas.core.indexes.datetimes import DatetimeIndex  # pandas >=0.20
except ImportError:
    from pandas.tseries.index import DatetimeIndex  # pandas <0.20

import iris
from iris._deprecation import warn_deprecated
from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, DimCoord
from iris.cube import Cube, CubeList


def _get_dimensional_metadata(name, values, calendar=None, dm_class=None):
    """
    Create a Coord or other dimensional metadata from a Pandas index or columns array.

    If no calendar is specified for a time series, Standard is assumed.

    """
    units = Unit("unknown")
    if calendar is None:
        calendar = cf_units.CALENDAR_STANDARD

    # Getting everything into a single datetime format is hard!

    # Convert out of NumPy's own datetime format.
    if np.issubdtype(values.dtype, np.datetime64):
        values = pandas.to_datetime(values)

    # Convert pandas datetime objects to python datetime objects.
    if isinstance(values, DatetimeIndex):
        values = np.array([i.to_pydatetime() for i in values])

    # Convert datetime objects to Iris' current datetime representation.
    if values.dtype == object:
        dt_types = (datetime.datetime, cftime.datetime)
        if all([isinstance(i, dt_types) for i in values]):
            units = Unit("hours since epoch", calendar=calendar)
            values = units.date2num(values)

    values = np.array(values)

    if dm_class is None:
        if np.issubdtype(values.dtype, np.number) and iris.util.monotonic(
            values, strict=True
        ):
            dm_class = DimCoord
        else:
            dm_class = AuxCoord

    instance = dm_class(values, units=units)
    if name is not None:
        # Use rename() to attempt standard_name but fall back on long_name.
        instance.rename(str(name))

    return instance

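# A minimal sketch (not part of the module) of how the helper above decides
# the coordinate class: strictly monotonic numeric values produce a DimCoord,
# anything else an AuxCoord, and datetime values are first encoded as
# "hours since epoch":
#
#     _get_dimensional_metadata("longitude", np.array([0, 10, 20]))
#     # --> DimCoord, standard_name="longitude", units="unknown"
#     _get_dimensional_metadata("station", np.array([3, 1, 2]))
#     # --> AuxCoord - the points are not strictly monotonic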

def _add_iris_coord(cube, name, points, dim, calendar=None):
    """
    Add a Coord or other dimensional metadata to a Cube from a Pandas index or columns array.
    """
    # Most functionality has been abstracted to _get_dimensional_metadata,
    #  allowing re-use in as_cube() and as_cubes().
    coord = _get_dimensional_metadata(name, points, calendar)

    if coord.__class__ == DimCoord:
        cube.add_dim_coord(coord, dim)
    else:
        cube.add_aux_coord(coord, dim)


def _series_index_unique(pandas_series: pandas.Series):
    """
    Find an index grouping of a :class:`pandas.Series` that has just one Series value per group.

    Iterates through groupings by single index levels, then combinations of
    2 levels, then 3, etcetera, until single :class:`~pandas.Series` values
    per group are found. Returns a ``tuple`` of the index levels that group
    to produce single values, as soon as one is found.

    Returns ``None`` if no index level combination produces single values.

    """
    unique_number = pandas_series.nunique()
    pandas_index = pandas_series.index
    levels_range = range(pandas_index.nlevels)
    if unique_number == 1:
        # Scalar - identical for all indices.
        result = ()
    else:
        result = None
        levels_combinations = chain(
            *[
                combinations(levels_range, levels + 1)
                for levels in levels_range
            ]
        )
        for lc in levels_combinations:
            if pandas_series.groupby(level=lc).nunique().max() == 1:
                result = lc
                # Escape as early as possible - heavy operation.
                break
    return result

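# A minimal sketch (not part of the module) of the grouping search above,
# for a Series whose values vary with only the first MultiIndex level:
#
#     index = pandas.MultiIndex.from_product(
#         [[0, 10], [25, 35]], names=["longitude", "latitude"]
#     )
#     series = pandas.Series([1, 1, 2, 2], index=index)
#     _series_index_unique(series)
#     # --> (0,) : grouping by "longitude" alone gives one value per group,
#     #     so the column spans only that dimension.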

def as_cube(
    pandas_array,
    copy=True,
    calendars=None,
):
    """
    Convert a Pandas Series/DataFrame into a 1D/2D Iris Cube.

    .. deprecated:: 3.3.0

        This function is scheduled for removal in a future release, being
        replaced by :func:`iris.pandas.as_cubes`, which offers richer
        dimensional intelligence.

    Parameters
    ----------
    pandas_array : :class:`pandas.Series` or :class:`pandas.DataFrame`
        The Pandas object to convert.
    copy : bool, default=True
        Whether to copy `pandas_array`, or to create array views where
        possible. Provided in case of memory limit concerns.
    calendars : dict, optional
        A dict mapping a dimension to a calendar. Required to convert
        datetime indices/columns.

    Notes
    -----
    This function will copy your data by default.

    Example usage::

        as_cube(series, calendars={0: cf_units.CALENDAR_360_DAY})
        as_cube(data_frame, calendars={1: cf_units.CALENDAR_STANDARD})

    """
    message = (
        "iris.pandas.as_cube has been deprecated, and will be removed in a "
        "future release. Please use iris.pandas.as_cubes instead."
    )
    warn_deprecated(message)

    calendars = calendars or {}
    if pandas_array.ndim not in [1, 2]:
        raise ValueError(
            "Only 1D or 2D Pandas arrays "
            "can currently be converted to Iris cubes."
        )

    # Make the copy work consistently across NumPy 1.6 and 1.7.
    # (When 1.7 takes a copy it preserves the C/Fortran ordering, but
    # 1.6 doesn't. Since we don't care about preserving the order we can
    # just force it back to C-order.)
    order = "C" if copy else "A"
    data = np.array(pandas_array, copy=copy, order=order)
    cube = Cube(np.ma.masked_invalid(data, copy=False))
    _add_iris_coord(
        cube, "index", pandas_array.index, 0, calendars.get(0, None)
    )
    if pandas_array.ndim == 2:
        _add_iris_coord(
            cube,
            "columns",
            pandas_array.columns.values,
            1,
            calendars.get(1, None),
        )
    return cube

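# Migration sketch (an assumption, not spelled out above): for a 1D Series,
# the closest as_cubes() equivalent of the deprecated call is
#
#     cube = as_cube(series)        # deprecated; masks NaNs automatically
#     cube = as_cubes(series)[0]    # replacement; NaNs are NOT masked -
#     #   apply np.ma.masked_invalid(cube.data) afterwards if needed.
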
def as_cubes(
    pandas_structure,
    copy=True,
    calendars=None,
    aux_coord_cols=None,
    cell_measure_cols=None,
    ancillary_variable_cols=None,
):
    """
    Convert a Pandas Series/DataFrame into n-dimensional Iris Cubes, including dimensional metadata.

    The index of `pandas_structure` will be used for generating the
    :class:`~iris.cube.Cube` dimension(s) and
    :class:`~iris.coords.DimCoord`\\ s. Other dimensional metadata may span
    multiple dimensions - based on how the column values vary with the index
    values.

    Parameters
    ----------
    pandas_structure : :class:`pandas.Series` or :class:`pandas.DataFrame`
        The Pandas object to convert.
    copy : bool, default=True
        Whether the Cube :attr:`~iris.cube.Cube.data` is a copy of the
        `pandas_structure` column, or a view of the same array. Arrays
        other than the data (coords etc.) are always copies. This option
        is provided to help with memory size concerns.
    calendars : dict, optional
        Calendar conversions for individual date-time coordinate
        columns/index-levels e.g.
        ``{"my_column": cf_units.CALENDAR_360_DAY}``.
    aux_coord_cols, cell_measure_cols, ancillary_variable_cols : list of str, optional
        Names of columns to be converted into :class:`~iris.coords.AuxCoord`,
        :class:`~iris.coords.CellMeasure` and
        :class:`~iris.coords.AncillaryVariable` objects.

    Returns
    -------
    :class:`~iris.cube.CubeList`
        One :class:`~iris.cube.Cube` for each column not referenced in
        `aux_coord_cols`/`cell_measure_cols`/`ancillary_variable_cols`.

    Notes
    -----
    A :class:`~pandas.DataFrame` using columns as a second data dimension
    will need to be 'melted' before conversion. See the Examples for how.

    Dask ``DataFrame``\\s are not supported.

    Examples
    --------
    >>> from iris.pandas import as_cubes
    >>> import numpy as np
    >>> from pandas import DataFrame, Series

    Converting a simple :class:`~pandas.Series` :

    >>> my_series = Series([300, 301, 302], name="air_temperature")
    >>> converted_cubes = as_cubes(my_series)
    >>> print(converted_cubes)
    0: air_temperature / (unknown)         (unknown: 3)
    >>> print(converted_cubes[0])
    air_temperature / (unknown)         (unknown: 3)
        Dimension coordinates:
            unknown                         x

    A :class:`~pandas.DataFrame`, with a custom index becoming the
    :class:`~iris.coords.DimCoord` :

    >>> my_df = DataFrame({
    ...     "air_temperature": [300, 301, 302],
    ...     "longitude": [30, 40, 50]
    ... })
    >>> my_df = my_df.set_index("longitude")
    >>> converted_cubes = as_cubes(my_df)
    >>> print(converted_cubes[0])
    air_temperature / (unknown)         (longitude: 3)
        Dimension coordinates:
            longitude                       x

    A :class:`~pandas.DataFrame` representing two 3-dimensional datasets,
    including a 2-dimensional :class:`~iris.coords.AuxCoord` :

    >>> my_df = DataFrame({
    ...     "air_temperature": np.arange(300, 312, 1),
    ...     "air_pressure": np.arange(1000, 1012, 1),
    ...     "longitude": [0, 10] * 6,
    ...     "latitude": [25, 25, 35, 35] * 3,
    ...     "height": ([0] * 4) + ([100] * 4) + ([200] * 4),
    ...     "in_region": [True, False, False, False] * 3
    ... })
    >>> print(my_df)
        air_temperature  air_pressure  longitude  latitude  height  in_region
    0               300          1000          0        25       0       True
    1               301          1001         10        25       0      False
    2               302          1002          0        35       0      False
    3               303          1003         10        35       0      False
    4               304          1004          0        25     100       True
    5               305          1005         10        25     100      False
    6               306          1006          0        35     100      False
    7               307          1007         10        35     100      False
    8               308          1008          0        25     200       True
    9               309          1009         10        25     200      False
    10              310          1010          0        35     200      False
    11              311          1011         10        35     200      False
    >>> my_df = my_df.set_index(["longitude", "latitude", "height"])
    >>> my_df = my_df.sort_index()
    >>> converted_cubes = as_cubes(my_df, aux_coord_cols=["in_region"])
    >>> print(converted_cubes)
    0: air_temperature / (unknown)         (longitude: 2; latitude: 2; height: 3)
    1: air_pressure / (unknown)            (longitude: 2; latitude: 2; height: 3)
    >>> print(converted_cubes[0])
    air_temperature / (unknown)         (longitude: 2; latitude: 2; height: 3)
        Dimension coordinates:
            longitude                       x            -          -
            latitude                        -            x          -
            height                          -            -          x
        Auxiliary coordinates:
            in_region                       x            x          -

    Pandas uses ``NaN`` rather than masking data. Converted
    :class:`~iris.cube.Cube`\\s can be masked in downstream user code :

    >>> my_series = Series([300, np.NaN, 302], name="air_temperature")
    >>> converted_cube = as_cubes(my_series)[0]
    >>> print(converted_cube.data)
    [300.  nan 302.]
    >>> converted_cube.data = np.ma.masked_invalid(converted_cube.data)
    >>> print(converted_cube.data)
    [300.0 -- 302.0]

    If the :class:`~pandas.DataFrame` uses columns as a second dimension,
    :func:`pandas.melt` should be used to convert the data to the expected
    n-dimensional format :

    >>> my_df = DataFrame({
    ...     "latitude": [35, 25],
    ...     0: [300, 301],
    ...     10: [302, 303],
    ... })
    >>> print(my_df)
       latitude    0   10
    0        35  300  302
    1        25  301  303
    >>> my_df = my_df.melt(
    ...     id_vars=["latitude"],
    ...     value_vars=[0, 10],
    ...     var_name="longitude",
    ...     value_name="air_temperature"
    ... )
    >>> print(my_df)
       latitude longitude  air_temperature
    0        35         0              300
    1        25         0              301
    2        35        10              302
    3        25        10              303
    >>> my_df = my_df.set_index(["latitude", "longitude"])
    >>> my_df = my_df.sort_index()
    >>> converted_cube = as_cubes(my_df)[0]
    >>> print(converted_cube)
    air_temperature / (unknown)         (latitude: 2; longitude: 2)
        Dimension coordinates:
            latitude                        x             -
            longitude                       -             x

    """
    if pandas_structure.empty:
        return CubeList()

    calendars = calendars or {}
    aux_coord_cols = aux_coord_cols or []
    cell_measure_cols = cell_measure_cols or []
    ancillary_variable_cols = ancillary_variable_cols or []

    is_series = isinstance(pandas_structure, pandas.Series)

    if copy:
        pandas_structure = pandas_structure.copy()

    pandas_index = pandas_structure.index
    if not pandas_index.is_unique:
        message = (
            f"DataFrame index ({pandas_index.names}) is not unique per "
            "row; cannot be used for DimCoords."
        )
        raise ValueError(message)

    if not pandas_index.is_monotonic:
        # Need monotonic index for use in DimCoord(s).
        # This function doesn't sort_index itself since that breaks the
        #  option to return a data view instead of a copy.
        message = (
            "Pandas index is not monotonic. Consider using the "
            "sort_index() method before passing in."
        )
        raise ValueError(message)

    cube_shape = getattr(pandas_index, "levshape", (pandas_index.nunique(),))
    n_rows = len(pandas_structure)
    if np.product(cube_shape) > n_rows:
        message = (
            f"Not all index values have a corresponding row - {n_rows} rows "
            f"cannot be reshaped into {cube_shape}. Consider padding with "
            "NaN rows where needed."
        )
        raise ValueError(message)

    cube_kwargs = {}

    def format_dimensional_metadata(dm_class_, values_, name_, dimensions_):
        # Common convenience to get the right DM in the right format for
        #  Cube creation.
        calendar = calendars.get(name_)
        instance = _get_dimensional_metadata(
            name_, values_, calendar, dm_class_
        )
        return (instance, dimensions_)

    # DimCoords.
    dim_coord_kwarg = []
    for ix, dim_name in enumerate(pandas_index.names):
        if hasattr(pandas_index, "levels"):
            coord_points = pandas_index.levels[ix]
        else:
            coord_points = pandas_index
        new_dim_coord = format_dimensional_metadata(
            DimCoord, coord_points, dim_name, ix
        )
        dim_coord_kwarg.append(new_dim_coord)
    cube_kwargs["dim_coords_and_dims"] = dim_coord_kwarg

    # Other dimensional metadata.
    class_arg_mapping = [
        (AuxCoord, aux_coord_cols, "aux_coords_and_dims"),
        (CellMeasure, cell_measure_cols, "cell_measures_and_dims"),
        (
            AncillaryVariable,
            ancillary_variable_cols,
            "ancillary_variables_and_dims",
        ),
    ]

    if is_series:
        columns_ignored = any([len(t[1]) > 0 for t in class_arg_mapping])
        if columns_ignored:
            ignored_args = ", ".join([t[2] for t in class_arg_mapping])
            message = (
                f"The input pandas_structure is a Series; ignoring "
                f"arguments: {ignored_args}."
            )
            warnings.warn(message)
        class_arg_mapping = []

    non_data_names = []
    for dm_class, column_names, kwarg in class_arg_mapping:
        class_kwarg = []
        non_data_names.extend(column_names)
        for column_name in column_names:
            column = pandas_structure[column_name]

            # Should be impossible for None to be returned - would require
            #  a non-unique index, which we protect against.
            dimensions = _series_index_unique(column)

            content = column.to_numpy()
            # Remove duplicate entries to get down to the correct
            #  dimensions for this object. _series_index_unique should have
            #  ensured that we are indeed removing the duplicates.
            shaped = content.reshape(cube_shape)
            indices = [0] * len(cube_shape)
            for dim in dimensions:
                indices[dim] = slice(None)
            collapsed = shaped[tuple(indices)]

            new_dm = format_dimensional_metadata(
                dm_class, collapsed, column_name, dimensions
            )
            class_kwarg.append(new_dm)

        cube_kwargs[kwarg] = class_kwarg

    # Cube creation.
    if is_series:
        data_series_list = [pandas_structure]
    else:
        data_series_list = [
            pandas_structure[column_name]
            for column_name in pandas_structure.columns
            if column_name not in non_data_names
        ]
    cubes = CubeList()
    for data_series in data_series_list:
        cube_data = data_series.to_numpy().reshape(cube_shape)
        new_cube = Cube(cube_data, **cube_kwargs)
        if data_series.name is not None:
            # Use rename() to attempt standard_name but fall back on
            #  long_name.
            new_cube.rename(str(data_series.name))
        cubes.append(new_cube)

    return cubes

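# Usage sketch for the ``calendars`` argument (an assumption - the docstring
# above shows the mapping but not a full call): a named datetime index is
# converted via "hours since epoch" in the requested calendar.
#
#     times = pandas.date_range("2000-01-01", periods=3, name="time")
#     series = pandas.Series([1.0, 2.0, 3.0], index=times, name="x")
#     cube = as_cubes(
#         series, calendars={"time": cf_units.CALENDAR_STANDARD}
#     )[0]
#     # cube.coord("time").units
#     # --> Unit("hours since epoch", calendar="standard")
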
def _as_pandas_coord(coord):
    """Convert an Iris Coord into a Pandas index or columns array."""
    index = coord.points
    if coord.units.is_time_reference():
        index = coord.units.num2date(index)
    return index


def _assert_shared(np_obj, pandas_obj):
    """Ensure the pandas object shares memory."""
    values = pandas_obj.values

    def _get_base(array):
        # Chase the stack of NumPy `base` references back to the original
        #  array.
        while array.base is not None:
            array = array.base
        return array

    base = _get_base(values)
    np_base = _get_base(np_obj)
    if base is not np_base:
        msg = "Pandas {} does not share memory".format(
            type(pandas_obj).__name__
        )
        raise AssertionError(msg)

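# A minimal sketch (not part of the module) of why _assert_shared() walks
# the `base` chain: a Pandas object built without copying holds a view of
# the NumPy array, and only the root of the view chain owns the memory.
# (Behaviour assumes a pandas version without copy-on-write semantics.)
#
#     a = np.arange(3.0)
#     s = pandas.Series(a, copy=False)
#     _assert_shared(a, s)                          # passes - shared memory
#     _assert_shared(a, pandas.Series(a.copy()))    # raises AssertionError
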
def as_series(cube, copy=True):
    """
    Convert a 1D cube to a Pandas Series.

    Args:

    * cube - The cube to convert to a Pandas Series.

    Kwargs:

    * copy - Whether to make a copy of the data.
             Defaults to True. Must be True for masked data.

    .. note::

        This function will copy your data by default.
        If you have a large array that cannot be copied,
        make sure it is not masked and use copy=False.

    """
    data = cube.data
    if ma.isMaskedArray(data):
        if not copy:
            raise ValueError("Masked arrays must always be copied.")
        data = data.astype("f").filled(np.nan)
    elif copy:
        data = data.copy()

    index = None
    if cube.dim_coords:
        index = _as_pandas_coord(cube.dim_coords[0])

    series = pandas.Series(data, index)
    if not copy:
        _assert_shared(data, series)
    return series

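# Usage sketch (an assumption, not from the docstring above):
#
#     cube = Cube(np.arange(3.0), long_name="air_temperature")
#     series = as_series(cube, copy=False)   # a view onto cube.data;
#     # requires unmasked data, otherwise ValueError is raised.
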
def as_data_frame(cube, copy=True):
    """
    Convert a 2D cube to a Pandas DataFrame.

    Args:

    * cube - The cube to convert to a Pandas DataFrame.

    Kwargs:

    * copy - Whether to make a copy of the data.
             Defaults to True. Must be True for masked data
             and some data types (see notes below).

    .. note::

        This function will copy your data by default.
        If you have a large array that cannot be copied,
        make sure it is not masked and use copy=False.

    .. note::

        Pandas will sometimes make a copy of the array,
        for example when creating from an int32 array.
        Iris will detect this and raise an exception if copy=False.

    """
    data = cube.data
    if ma.isMaskedArray(data):
        if not copy:
            raise ValueError("Masked arrays must always be copied.")
        data = data.astype("f").filled(np.nan)
    elif copy:
        data = data.copy()

    index = columns = None
    if cube.coords(dimensions=[0]):
        index = _as_pandas_coord(cube.coord(dimensions=[0]))
    if cube.coords(dimensions=[1]):
        columns = _as_pandas_coord(cube.coord(dimensions=[1]))

    data_frame = pandas.DataFrame(data, index, columns)
    if not copy:
        _assert_shared(data, data_frame)
    return data_frame

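# Usage sketch (an assumption, not from the docstring above):
#
#     cube = Cube(np.arange(6.0).reshape(2, 3), long_name="air_temperature")
#     df = as_data_frame(cube)   # copy=True (the default) is safest -
#     # as noted above, Pandas may itself copy some dtypes (e.g. int32),
#     # which makes copy=False raise an AssertionError.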