diff --git a/docs/src/conf.py b/docs/src/conf.py index 082d7eb384..7f0c517e05 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -225,6 +225,7 @@ def _dotv(version): "numpy": ("https://numpy.org/doc/stable/", None), "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/", None), + "pandas": ("https://pandas.pydata.org/docs/", None), } # The name of the Pygments (syntax highlighting) style to use. diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index c111110bd1..70e2d1a4a2 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -89,6 +89,14 @@ This document explains the changes made to Iris for this release #. `@stephenworsley`_ updated to the latest CF Standard Names Table ``v79`` (19 March 2022). (:pull:`4910`) +#. `@trexfeathers`_ and `@lbdreyer`_ (reviewer) added + :func:`iris.pandas.as_cubes`, which provides richer conversion from + Pandas :class:`~pandas.Series` / :class:`~pandas.DataFrame`\s to one or more + :class:`~iris.cube.Cube`\s. This includes: n-dimensional datasets, + :class:`~iris.coords.AuxCoord`\s, :class:`~iris.coords.CellMeasure`\s, + :class:`~iris.coords.AncillaryVariable`\s, and multi-dimensional + coordinates. (:pull:`4890`) + 🐛 Bugs Fixed ============= @@ -190,7 +198,10 @@ This document explains the changes made to Iris for this release 🔥 Deprecations =============== -#. N/A +#. `@trexfeathers`_ and `@lbdreyer`_ (reviewer) deprecated + :func:`iris.pandas.as_cube` in favour of the new + :func:`iris.pandas.as_cubes` - see `✨ Features`_ for more details. + (:pull:`4890`) 🔗 Dependencies diff --git a/lib/iris/pandas.py b/lib/iris/pandas.py index 6b35a1d1cd..b00eb3f117 100644 --- a/lib/iris/pandas.py +++ b/lib/iris/pandas.py @@ -11,6 +11,8 @@ """ import datetime +from itertools import chain, combinations +import warnings import cf_units from cf_units import Unit @@ -25,13 +27,14 @@ from pandas.tseries.index import DatetimeIndex # pandas <0.20 import iris -from iris.coords import AuxCoord, DimCoord -from iris.cube import Cube +from iris._deprecation import warn_deprecated +from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, DimCoord +from iris.cube import Cube, CubeList -def _add_iris_coord(cube, name, points, dim, calendar=None): +def _get_dimensional_metadata(name, values, calendar=None, dm_class=None): """ - Add a Coord to a Cube from a Pandas index or columns array. + Create a Coord or other dimensional metadata from a Pandas index or columns array. If no calendar is specified for a time series, Standard is assumed. @@ -40,54 +43,130 @@ def _add_iris_coord(cube, name, points, dim, calendar=None): if calendar is None: calendar = cf_units.CALENDAR_STANDARD - # Convert pandas datetime objects to python datetime obejcts. - if isinstance(points, DatetimeIndex): - points = np.array([i.to_pydatetime() for i in points]) + # Getting everything into a single datetime format is hard! + + # Convert out of NumPy's own datetime format. + if np.issubdtype(values.dtype, np.datetime64): + values = pandas.to_datetime(values) + + # Convert pandas datetime objects to python datetime objects. + if isinstance(values, DatetimeIndex): + values = np.array([i.to_pydatetime() for i in values]) # Convert datetime objects to Iris' current datetime representation. - if points.dtype == object: + if values.dtype == object: dt_types = (datetime.datetime, cftime.datetime) - if all([isinstance(i, dt_types) for i in points]): + if all([isinstance(i, dt_types) for i in values]): units = Unit("hours since epoch", calendar=calendar) - points = units.date2num(points) - - points = np.array(points) - if np.issubdtype(points.dtype, np.number) and iris.util.monotonic( - points, strict=True - ): - coord = DimCoord(points, units=units) - coord.rename(name) + values = units.date2num(values) + + values = np.array(values) + + if dm_class is None: + if np.issubdtype(values.dtype, np.number) and iris.util.monotonic( + values, strict=True + ): + dm_class = DimCoord + else: + dm_class = AuxCoord + + instance = dm_class(values, units=units) + if name is not None: + # Use rename() to attempt standard_name but fall back on long_name. + instance.rename(str(name)) + + return instance + + +def _add_iris_coord(cube, name, points, dim, calendar=None): + """ + Add a Coord or other dimensional metadata to a Cube from a Pandas index or columns array. + """ + # Most functionality has been abstracted to _get_dimensional_metadata, + # allowing re-use in as_cube() and as_cubes(). + coord = _get_dimensional_metadata(name, points, calendar) + + if coord.__class__ == DimCoord: cube.add_dim_coord(coord, dim) else: - coord = AuxCoord(points, units=units) - coord.rename(name) cube.add_aux_coord(coord, dim) -def as_cube(pandas_array, copy=True, calendars=None): +def _series_index_unique(pandas_series: pandas.Series): """ - Convert a Pandas array into an Iris cube. + Find an index grouping of a :class:`pandas.Series` that has just one Series value per group. - Args: + Iterates through grouping single index levels, then combinations of 2 + levels, then 3 etcetera, until single :class:`~pandas.Series` values per + group are found. Returns a ``tuple`` of the index levels that group to + produce single values, as soon as one is found. - * pandas_array - A Pandas Series or DataFrame. + Returns ``None`` if no index level combination produces single values. - Kwargs: + """ + unique_number = pandas_series.nunique() + pandas_index = pandas_series.index + levels_range = range(pandas_index.nlevels) + if unique_number == 1: + # Scalar - identical for all indices. + result = () + else: + result = None + levels_combinations = chain( + *[ + combinations(levels_range, levels + 1) + for levels in levels_range + ] + ) + for lc in levels_combinations: + if pandas_series.groupby(level=lc).nunique().max() == 1: + result = lc + # Escape as early as possible - heavy operation. + break + return result + + +def as_cube( + pandas_array, + copy=True, + calendars=None, +): + """ + Convert a Pandas Series/DataFrame into a 1D/2D Iris Cube. + + .. deprecated:: 3.3.0 + + This function is scheduled for removal in a future release, being + replaced by :func:`iris.pandas.as_cubes`, which offers richer + dimensional intelligence. - * copy - Whether to make a copy of the data. - Defaults to True. + Parameters + ---------- + pandas_array : :class:`pandas.Series` or :class:`pandas.DataFrame` + The Pandas object to convert + copy : bool, default=True + Whether to copy `pandas_array`, or to create array views where + possible. Provided in case of memory limit concerns. + calendars : dict, optional + A dict mapping a dimension to a calendar. Required to convert datetime + indices/columns. - * calendars - A dict mapping a dimension to a calendar. - Required to convert datetime indices/columns. + Notes + ----- + This function will copy your data by default. Example usage:: as_cube(series, calendars={0: cf_units.CALENDAR_360_DAY}) as_cube(data_frame, calendars={1: cf_units.CALENDAR_STANDARD}) - .. note:: This function will copy your data by default. - """ + message = ( + "iris.pandas.as_cube has been deprecated, and will be removed in a " + "future release. Please use iris.pandas.as_cubes instead." + ) + warn_deprecated(message) + calendars = calendars or {} if pandas_array.ndim not in [1, 2]: raise ValueError( @@ -116,6 +195,302 @@ def as_cube(pandas_array, copy=True, calendars=None): return cube +def as_cubes( + pandas_structure, + copy=True, + calendars=None, + aux_coord_cols=None, + cell_measure_cols=None, + ancillary_variable_cols=None, +): + """ + Convert a Pandas Series/DataFrame into n-dimensional Iris Cubes, including dimensional metadata. + + The index of `pandas_structure` will be used for generating the + :class:`~iris.cube.Cube` dimension(s) and :class:`~iris.coords.DimCoord`\\ s. + Other dimensional metadata may span multiple dimensions - based on how the + column values vary with the index values. + + Parameters + ---------- + pandas_structure : :class:`pandas.Series` or :class:`pandas.DataFrame` + The Pandas object to convert + copy : bool, default=True + Whether the Cube :attr:`~iris.cube.Cube.data` is a copy of the + `pandas_structure` column, or a view of the same array. Arrays other than + the data (coords etc.) are always copies. This option is provided to + help with memory size concerns. + calendars : dict, optional + Calendar conversions for individual date-time coordinate + columns/index-levels e.g. ``{"my_column": cf_units.CALENDAR_360_DAY}``. + aux_coord_cols, cell_measure_cols, ancillary_variable_cols : list of str, optional + Names of columns to be converted into :class:`~iris.coords.AuxCoord`, + :class:`~iris.coords.CellMeasure` and + :class:`~iris.coords.AncillaryVariable` objects. + + Returns + -------- + :class:`~iris.cube.CubeList` + One :class:`~iris.cube.Cube` for each column not referenced in + `aux_coord_cols`/`cell_measure_cols`/`ancillary_variable_cols`. + + Notes + ----- + A :class:`~pandas.DataFrame` using columns as a second data dimension will + need to be 'melted' before conversion. See the Examples for how. + + Dask ``DataFrame``\\s are not supported. + + Examples + -------- + >>> from iris.pandas import as_cubes + >>> import numpy as np + >>> from pandas import DataFrame, Series + + Converting a simple :class:`~pandas.Series` : + + >>> my_series = Series([300, 301, 302], name="air_temperature") + >>> converted_cubes = as_cubes(my_series) + >>> print(converted_cubes) + 0: air_temperature / (unknown) (unknown: 3) + >>> print(converted_cubes[0]) + air_temperature / (unknown) (unknown: 3) + Dimension coordinates: + unknown x + + A :class:`~pandas.DataFrame`, with a custom index becoming the + :class:`~iris.coords.DimCoord` : + + >>> my_df = DataFrame({ + ... "air_temperature": [300, 301, 302], + ... "longitude": [30, 40, 50] + ... }) + >>> my_df = my_df.set_index("longitude") + >>> converted_cubes = as_cubes(my_df) + >>> print(converted_cubes[0]) + air_temperature / (unknown) (longitude: 3) + Dimension coordinates: + longitude x + + A :class:`~pandas.DataFrame` representing two 3-dimensional datasets, + including a 2-dimensional :class:`~iris.coords.AuxCoord` : + + >>> my_df = DataFrame({ + ... "air_temperature": np.arange(300, 312, 1), + ... "air_pressure": np.arange(1000, 1012, 1), + ... "longitude": [0, 10] * 6, + ... "latitude": [25, 25, 35, 35] * 3, + ... "height": ([0] * 4) + ([100] * 4) + ([200] * 4), + ... "in_region": [True, False, False, False] * 3 + ... }) + >>> print(my_df) + air_temperature air_pressure longitude latitude height in_region + 0 300 1000 0 25 0 True + 1 301 1001 10 25 0 False + 2 302 1002 0 35 0 False + 3 303 1003 10 35 0 False + 4 304 1004 0 25 100 True + 5 305 1005 10 25 100 False + 6 306 1006 0 35 100 False + 7 307 1007 10 35 100 False + 8 308 1008 0 25 200 True + 9 309 1009 10 25 200 False + 10 310 1010 0 35 200 False + 11 311 1011 10 35 200 False + >>> my_df = my_df.set_index(["longitude", "latitude", "height"]) + >>> my_df = my_df.sort_index() + >>> converted_cubes = as_cubes(my_df, aux_coord_cols=["in_region"]) + >>> print(converted_cubes) + 0: air_temperature / (unknown) (longitude: 2; latitude: 2; height: 3) + 1: air_pressure / (unknown) (longitude: 2; latitude: 2; height: 3) + >>> print(converted_cubes[0]) + air_temperature / (unknown) (longitude: 2; latitude: 2; height: 3) + Dimension coordinates: + longitude x - - + latitude - x - + height - - x + Auxiliary coordinates: + in_region x x - + + Pandas uses ``NaN`` rather than masking data. Converted + :class:`~iris.cube.Cube`\\s can be masked in downstream user code : + + >>> my_series = Series([300, np.NaN, 302], name="air_temperature") + >>> converted_cube = as_cubes(my_series)[0] + >>> print(converted_cube.data) + [300. nan 302.] + >>> converted_cube.data = np.ma.masked_invalid(converted_cube.data) + >>> print(converted_cube.data) + [300.0 -- 302.0] + + If the :class:`~pandas.DataFrame` uses columns as a second dimension, + :func:`pandas.melt` should be used to convert the data to the expected + n-dimensional format : + + >>> my_df = DataFrame({ + ... "latitude": [35, 25], + ... 0: [300, 301], + ... 10: [302, 303], + ... }) + >>> print(my_df) + latitude 0 10 + 0 35 300 302 + 1 25 301 303 + >>> my_df = my_df.melt( + ... id_vars=["latitude"], + ... value_vars=[0, 10], + ... var_name="longitude", + ... value_name="air_temperature" + ... ) + >>> print(my_df) + latitude longitude air_temperature + 0 35 0 300 + 1 25 0 301 + 2 35 10 302 + 3 25 10 303 + >>> my_df = my_df.set_index(["latitude", "longitude"]) + >>> my_df = my_df.sort_index() + >>> converted_cube = as_cubes(my_df)[0] + >>> print(converted_cube) + air_temperature / (unknown) (latitude: 2; longitude: 2) + Dimension coordinates: + latitude x - + longitude - x + + """ + if pandas_structure.empty: + return CubeList() + + calendars = calendars or {} + aux_coord_cols = aux_coord_cols or [] + cell_measure_cols = cell_measure_cols or [] + ancillary_variable_cols = ancillary_variable_cols or [] + + is_series = isinstance(pandas_structure, pandas.Series) + + if copy: + pandas_structure = pandas_structure.copy() + + pandas_index = pandas_structure.index + if not pandas_index.is_unique: + message = ( + f"DataFrame index ({pandas_index.names}) is not unique per " + "row; cannot be used for DimCoords." + ) + raise ValueError(message) + + if not pandas_index.is_monotonic: + # Need monotonic index for use in DimCoord(s). + # This function doesn't sort_index itself since that breaks the + # option to return a data view instead of a copy. + message = ( + "Pandas index is not monotonic. Consider using the " + "sort_index() method before passing in." + ) + raise ValueError(message) + + cube_shape = getattr(pandas_index, "levshape", (pandas_index.nunique(),)) + n_rows = len(pandas_structure) + if np.product(cube_shape) > n_rows: + message = ( + f"Not all index values have a corresponding row - {n_rows} rows " + f"cannot be reshaped into {cube_shape}. Consider padding with NaN " + "rows where needed." + ) + raise ValueError(message) + + cube_kwargs = {} + + def format_dimensional_metadata(dm_class_, values_, name_, dimensions_): + # Common convenience to get the right DM in the right format for + # Cube creation. + calendar = calendars.get(name_) + instance = _get_dimensional_metadata( + name_, values_, calendar, dm_class_ + ) + return (instance, dimensions_) + + # DimCoords. + dim_coord_kwarg = [] + for ix, dim_name in enumerate(pandas_index.names): + if hasattr(pandas_index, "levels"): + coord_points = pandas_index.levels[ix] + else: + coord_points = pandas_index + new_dim_coord = format_dimensional_metadata( + DimCoord, coord_points, dim_name, ix + ) + dim_coord_kwarg.append(new_dim_coord) + cube_kwargs["dim_coords_and_dims"] = dim_coord_kwarg + + # Other dimensional metadata. + class_arg_mapping = [ + (AuxCoord, aux_coord_cols, "aux_coords_and_dims"), + (CellMeasure, cell_measure_cols, "cell_measures_and_dims"), + ( + AncillaryVariable, + ancillary_variable_cols, + "ancillary_variables_and_dims", + ), + ] + + if is_series: + columns_ignored = any([len(t[1]) > 0 for t in class_arg_mapping]) + if columns_ignored: + ignored_args = ", ".join([t[2] for t in class_arg_mapping]) + message = f"The input pandas_structure is a Series; ignoring arguments: {ignored_args} ." + warnings.warn(message) + class_arg_mapping = [] + + non_data_names = [] + for dm_class, column_names, kwarg in class_arg_mapping: + class_kwarg = [] + non_data_names.extend(column_names) + for column_name in column_names: + column = pandas_structure[column_name] + + # Should be impossible for None to be returned - would require a + # non-unique index, which we protect against. + dimensions = _series_index_unique(column) + + content = column.to_numpy() + # Remove duplicate entries to get down to the correct dimensions + # for this object. _series_index_unique should have ensured + # that we are indeed removing the duplicates. + shaped = content.reshape(cube_shape) + indices = [0] * len(cube_shape) + for dim in dimensions: + indices[dim] = slice(None) + collapsed = shaped[tuple(indices)] + + new_dm = format_dimensional_metadata( + dm_class, collapsed, column_name, dimensions + ) + class_kwarg.append(new_dm) + + cube_kwargs[kwarg] = class_kwarg + + # Cube creation. + if is_series: + data_series_list = [pandas_structure] + else: + data_series_list = [ + pandas_structure[column_name] + for column_name in pandas_structure.columns + if column_name not in non_data_names + ] + cubes = CubeList() + for data_series in data_series_list: + cube_data = data_series.to_numpy().reshape(cube_shape) + new_cube = Cube(cube_data, **cube_kwargs) + if data_series.name is not None: + # Use rename() to attempt standard_name but fall back on long_name. + new_cube.rename(str(data_series.name)) + cubes.append(new_cube) + + return cubes + + def _as_pandas_coord(coord): """Convert an Iris Coord into a Pandas index or columns array.""" index = coord.points diff --git a/lib/iris/tests/test_pandas.py b/lib/iris/tests/test_pandas.py index 208f7b944e..f47df75def 100644 --- a/lib/iris/tests/test_pandas.py +++ b/lib/iris/tests/test_pandas.py @@ -10,12 +10,16 @@ import copy import datetime -import unittest +from termios import IXOFF # noqa: F401 import cf_units import cftime import matplotlib.units import numpy as np +import pytest + +import iris +from iris._deprecation import IrisDeprecation # Importing pandas has the side-effect of messing with the formatters # used by matplotlib for handling dates. @@ -27,13 +31,14 @@ pandas = None matplotlib.units.registry = default_units_registry -skip_pandas = unittest.skipIf( - pandas is None, 'Test(s) require "pandas", ' "which is not available." +skip_pandas = pytest.mark.skipif( + pandas is None, + reason='Test(s) require "pandas", ' "which is not available.", ) if pandas is not None: - from iris.coords import DimCoord - from iris.cube import Cube + from iris.coords import AncillaryVariable, AuxCoord, CellMeasure, DimCoord + from iris.cube import Cube, CubeList import iris.pandas @@ -80,7 +85,7 @@ def test_time_standard(self): ] series = iris.pandas.as_series(cube) self.assertArrayEqual(series, cube.data) - self.assertListEqual(list(series.index), expected_index) + assert list(series.index) == expected_index def test_time_360(self): cube = Cube(np.array([0, 1, 2, 3, 4]), long_name="ts") @@ -107,37 +112,37 @@ def test_copy_true(self): cube = Cube(np.array([0, 1, 2, 3, 4]), long_name="foo") series = iris.pandas.as_series(cube) series[0] = 99 - self.assertEqual(cube.data[0], 0) + assert cube.data[0] == 0 def test_copy_int32_false(self): cube = Cube(np.array([0, 1, 2, 3, 4], dtype=np.int32), long_name="foo") series = iris.pandas.as_series(cube, copy=False) series[0] = 99 - self.assertEqual(cube.data[0], 99) + assert cube.data[0] == 99 def test_copy_int64_false(self): cube = Cube(np.array([0, 1, 2, 3, 4], dtype=np.int32), long_name="foo") series = iris.pandas.as_series(cube, copy=False) series[0] = 99 - self.assertEqual(cube.data[0], 99) + assert cube.data[0] == 99 def test_copy_float_false(self): cube = Cube(np.array([0, 1, 2, 3.3, 4]), long_name="foo") series = iris.pandas.as_series(cube, copy=False) series[0] = 99 - self.assertEqual(cube.data[0], 99) + assert cube.data[0] == 99 def test_copy_masked_true(self): data = np.ma.MaskedArray([0, 1, 2, 3, 4], mask=[0, 1, 0, 1, 0]) cube = Cube(data, long_name="foo") series = iris.pandas.as_series(cube) series[0] = 99 - self.assertEqual(cube.data[0], 0) + assert cube.data[0] == 0 def test_copy_masked_false(self): data = np.ma.MaskedArray([0, 1, 2, 3, 4], mask=[0, 1, 0, 1, 0]) cube = Cube(data, long_name="foo") - with self.assertRaises(ValueError): + with pytest.raises(ValueError): _ = iris.pandas.as_series(cube, copy=False) @@ -230,8 +235,8 @@ def test_time_standard(self): ) for day_offset in day_offsets ] - self.assertTrue(all(data_frame.columns == timestamps)) - self.assertTrue(all(data_frame.index == [0, 1])) + assert all(data_frame.columns == timestamps) + assert all(data_frame.index == [0, 1]) def test_time_360(self): cube = Cube( @@ -261,7 +266,7 @@ def test_copy_true(self): ) data_frame = iris.pandas.as_data_frame(cube) data_frame[0][0] = 99 - self.assertEqual(cube.data[0, 0], 0) + assert cube.data[0, 0] == 0 def test_copy_int32_false(self): cube = Cube( @@ -270,7 +275,7 @@ def test_copy_int32_false(self): ) data_frame = iris.pandas.as_data_frame(cube, copy=False) data_frame[0][0] = 99 - self.assertEqual(cube.data[0, 0], 99) + assert cube.data[0, 0] == 99 def test_copy_int64_false(self): cube = Cube( @@ -279,7 +284,7 @@ def test_copy_int64_false(self): ) data_frame = iris.pandas.as_data_frame(cube, copy=False) data_frame[0][0] = 99 - self.assertEqual(cube.data[0, 0], 99) + assert cube.data[0, 0] == 99 def test_copy_float_false(self): cube = Cube( @@ -287,7 +292,7 @@ def test_copy_float_false(self): ) data_frame = iris.pandas.as_data_frame(cube, copy=False) data_frame[0][0] = 99 - self.assertEqual(cube.data[0, 0], 99) + assert cube.data[0, 0] == 99 def test_copy_masked_true(self): data = np.ma.MaskedArray( @@ -297,7 +302,7 @@ def test_copy_masked_true(self): cube = Cube(data, long_name="foo") data_frame = iris.pandas.as_data_frame(cube) data_frame[0][0] = 99 - self.assertEqual(cube.data[0, 0], 0) + assert cube.data[0, 0] == 0 def test_copy_masked_false(self): data = np.ma.MaskedArray( @@ -305,7 +310,7 @@ def test_copy_masked_false(self): mask=[[0, 1, 0, 1, 0], [1, 0, 1, 0, 1]], ) cube = Cube(data, long_name="foo") - with self.assertRaises(ValueError): + with pytest.raises(ValueError): _ = iris.pandas.as_data_frame(cube, copy=False) def test_copy_false_with_cube_view(self): @@ -313,10 +318,13 @@ def test_copy_false_with_cube_view(self): cube = Cube(data[:], long_name="foo") data_frame = iris.pandas.as_data_frame(cube, copy=False) data_frame[0][0] = 99 - self.assertEqual(cube.data[0, 0], 99) + assert cube.data[0, 0] == 99 @skip_pandas +@pytest.mark.filterwarnings( + "ignore:.*as_cube has been deprecated.*:iris._deprecation.IrisDeprecation" +) class TestSeriesAsCube(tests.IrisTest): def test_series_simple(self): series = pandas.Series([0, 1, 2, 3, 4], index=[5, 6, 7, 8, 9]) @@ -390,16 +398,19 @@ def test_copy_true(self): series = pandas.Series([0, 1, 2, 3, 4], index=[5, 6, 7, 8, 9]) cube = iris.pandas.as_cube(series) cube.data[0] = 99 - self.assertEqual(series[5], 0) + assert series[5] == 0 def test_copy_false(self): series = pandas.Series([0, 1, 2, 3, 4], index=[5, 6, 7, 8, 9]) cube = iris.pandas.as_cube(series, copy=False) cube.data[0] = 99 - self.assertEqual(series[5], 99) + assert series[5] == 99 @skip_pandas +@pytest.mark.filterwarnings( + "ignore:.*as_cube has been deprecated.*:iris._deprecation.IrisDeprecation" +) class TestDataFrameAsCube(tests.IrisTest): def test_data_frame_simple(self): data_frame = pandas.DataFrame( @@ -491,13 +502,461 @@ def test_copy_true(self): data_frame = pandas.DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) cube = iris.pandas.as_cube(data_frame) cube.data[0, 0] = 99 - self.assertEqual(data_frame[0][0], 0) + assert data_frame[0][0] == 0 def test_copy_false(self): data_frame = pandas.DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) cube = iris.pandas.as_cube(data_frame, copy=False) cube.data[0, 0] = 99 - self.assertEqual(data_frame[0][0], 99) + assert data_frame[0][0] == 99 + + +@skip_pandas +class TestFutureAndDeprecation(tests.IrisTest): + def test_deprecation_warning(self): + data_frame = pandas.DataFrame([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]) + with pytest.warns( + IrisDeprecation, match="as_cube has been deprecated" + ): + _ = iris.pandas.as_cube(data_frame) + + # Tests for FUTURE are expected when as_dataframe() is made n-dimensional. + + +@skip_pandas +class TestPandasAsCubes(tests.IrisTest): + @staticmethod + def _create_pandas(index_levels=0, is_series=False): + index_length = 3 + + index_names = [f"index_{i}" for i in range(index_levels)] + index_values = [ + np.arange(index_length) * 10 * (i + 1) for i in range(index_levels) + ] + + if index_levels == 1: + index = pandas.Index(index_values[0], name=index_names[0]) + data_length = index_length + elif index_levels > 1: + index = pandas.MultiIndex.from_product( + index_values, names=index_names + ) + data_length = index.nunique() + else: + index = None + data_length = index_length + + data = np.arange(data_length) * 10 + + if is_series: + class_ = pandas.Series + else: + class_ = pandas.DataFrame + + return class_(data, index=index) + + def test_1d_no_index(self): + df = self._create_pandas() + result = iris.pandas.as_cubes(df) + + expected_coord = DimCoord(df.index.values) + expected_cube = Cube( + data=df[0].values, + long_name=str(df[0].name), + dim_coords_and_dims=[(expected_coord, 0)], + ) + assert result == [expected_cube] + + def test_1d_with_index(self): + df = self._create_pandas(index_levels=1) + result = iris.pandas.as_cubes(df) + + expected_coord = DimCoord(df.index.values, long_name=df.index.name) + (result_cube,) = result + assert result_cube.dim_coords == (expected_coord,) + + def test_1d_series_no_index(self): + series = self._create_pandas(is_series=True) + result = iris.pandas.as_cubes(series) + + expected_coord = DimCoord(series.index.values) + expected_cube = Cube( + data=series.values, dim_coords_and_dims=[(expected_coord, 0)] + ) + assert result == [expected_cube] + + def test_1d_series_with_index(self): + series = self._create_pandas(index_levels=1, is_series=True) + result = iris.pandas.as_cubes(series) + + expected_coord = DimCoord( + series.index.values, long_name=series.index.name + ) + (result_cube,) = result + assert result_cube.dim_coords == (expected_coord,) + + def test_3d(self): + df = self._create_pandas(index_levels=3) + result = iris.pandas.as_cubes(df) + + expected_coords = [ + DimCoord(level.values, long_name=level.name) + for level in df.index.levels + ] + (result_cube,) = result + assert result_cube.dim_coords == tuple(expected_coords) + + def test_3d_series(self): + series = self._create_pandas(index_levels=3, is_series=True) + result = iris.pandas.as_cubes(series) + + expected_coords = [ + DimCoord(level.values, long_name=level.name) + for level in series.index.levels + ] + (result_cube,) = result + assert result_cube.dim_coords == tuple(expected_coords) + + def test_non_unique_index(self): + df = self._create_pandas(index_levels=1) + new_index = df.index.values + new_index[1] = new_index[0] + df.set_index(new_index) + + with pytest.raises(ValueError, match="not unique per row"): + _ = iris.pandas.as_cubes(df) + + def test_non_monotonic_index(self): + df = self._create_pandas(index_levels=1) + new_index = df.index.values + new_index[:2] = new_index[1::-1] + df.set_index(new_index) + + with pytest.raises(ValueError, match="not monotonic"): + _ = iris.pandas.as_cubes(df) + + def test_missing_rows(self): + df = self._create_pandas(index_levels=2) + df = df[:-1] + + with pytest.raises( + ValueError, match="Not all index values have a corresponding row" + ): + _ = iris.pandas.as_cubes(df) + + def test_aux_coord(self): + df = self._create_pandas() + coord_name = "foo" + df[coord_name] = df.index.values + result = iris.pandas.as_cubes(df, aux_coord_cols=[coord_name]) + + expected_aux_coord = AuxCoord( + df[coord_name].values, long_name=coord_name + ) + (result_cube,) = result + assert result_cube.aux_coords == (expected_aux_coord,) + + def test_cell_measure(self): + df = self._create_pandas() + coord_name = "foo" + df[coord_name] = df.index.values + result = iris.pandas.as_cubes(df, cell_measure_cols=[coord_name]) + + expected_cm = CellMeasure(df[coord_name].values, long_name=coord_name) + (result_cube,) = result + assert result_cube.cell_measures() == [expected_cm] + + def test_ancillary_variable(self): + df = self._create_pandas() + coord_name = "foo" + df[coord_name] = df.index.values + result = iris.pandas.as_cubes(df, ancillary_variable_cols=[coord_name]) + + expected_av = AncillaryVariable( + df[coord_name].values, long_name=coord_name + ) + (result_cube,) = result + assert result_cube.ancillary_variables() == [expected_av] + + def test_3d_with_2d_coord(self): + df = self._create_pandas(index_levels=3) + coord_shape = df.index.levshape[:2] + coord_values = np.arange(np.product(coord_shape)) + coord_name = "foo" + df[coord_name] = coord_values.repeat(df.index.levshape[-1]) + result = iris.pandas.as_cubes(df, aux_coord_cols=[coord_name]) + + expected_points = coord_values.reshape(coord_shape) + (result_cube,) = result + result_coord = result_cube.coord(coord_name) + self.assertArrayEqual(result_coord.points, expected_points) + assert result_coord.cube_dims(result_cube) == (0, 1) + + def test_coord_varies_all_indices(self): + df = self._create_pandas(index_levels=3) + coord_shape = df.index.levshape + coord_values = np.arange(np.product(coord_shape)) + coord_name = "foo" + df[coord_name] = coord_values + result = iris.pandas.as_cubes(df, aux_coord_cols=[coord_name]) + + expected_points = coord_values.reshape(coord_shape) + (result_cube,) = result + result_coord = result_cube.coord(coord_name) + self.assertArrayEqual(result_coord.points, expected_points) + assert result_coord.cube_dims(result_cube) == (0, 1, 2) + + def test_category_coord(self): + # Something that varies on a dimension, but doesn't change with every + # increment. + df = self._create_pandas(index_levels=2) + coord_shape = df.index.levshape + coord_values = np.arange(np.product(coord_shape)) + coord_name = "foo" + + # Create a repeating value along a dimension. + step = coord_shape[-1] + coord_values[1::step] = coord_values[::step] + + df[coord_name] = coord_values + result = iris.pandas.as_cubes(df, aux_coord_cols=[coord_name]) + + expected_points = coord_values.reshape(coord_shape) + (result_cube,) = result + result_coord = result_cube.coord(coord_name) + self.assertArrayEqual(result_coord.points, expected_points) + assert result_coord.cube_dims(result_cube) == (0, 1) + + def test_scalar_coord(self): + df = self._create_pandas() + coord_values = np.ones(len(df)) + coord_name = "foo" + df[coord_name] = coord_values + result = iris.pandas.as_cubes(df, aux_coord_cols=[coord_name]) + + expected_points = np.unique(coord_values) + (result_cube,) = result + result_coord = result_cube.coord(coord_name) + self.assertArrayEqual(result_coord.points, expected_points) + assert result_coord.cube_dims(result_cube) == tuple() + + def test_multi_phenom(self): + df = self._create_pandas() + new_name = "new_phenom" + df[new_name] = df[0] + result = iris.pandas.as_cubes(df) + + # Note the shared coord object between both Cubes. + expected_coord = DimCoord(df.index.values) + expected_cube_kwargs = dict(dim_coords_and_dims=[(expected_coord, 0)]) + + expected_cube_0 = Cube( + data=df[0].values, + long_name=str(df[0].name), + **expected_cube_kwargs, + ) + expected_cube_1 = Cube( + data=df[new_name].values, + long_name=new_name, + **expected_cube_kwargs, + ) + assert result == [expected_cube_0, expected_cube_1] + + def test_empty_series(self): + series = pandas.Series(dtype=object) + result = iris.pandas.as_cubes(series) + + assert result == CubeList() + + def test_empty_dataframe(self): + df = pandas.DataFrame() + result = iris.pandas.as_cubes(df) + + assert result == CubeList() + + def test_no_phenom(self): + df = self._create_pandas() + # Specify the only column as an AuxCoord. + result = iris.pandas.as_cubes(df, aux_coord_cols=[0]) + + assert result == CubeList() + + def test_standard_name_phenom(self): + # long_name behaviour is tested in test_1d_no_index. + df = self._create_pandas() + new_name = "air_temperature" + df = df.rename(columns={0: new_name}) + result = iris.pandas.as_cubes(df) + + (result_cube,) = result + assert result_cube.standard_name == new_name + + def test_standard_name_coord(self): + # long_name behaviour is tested in test_1d_with_index. + df = self._create_pandas() + new_name = "longitude" + df.index.names = [new_name] + result = iris.pandas.as_cubes(df) + + (result_cube,) = result + result_coord = result_cube.coord(dim_coords=True) + assert result_coord.standard_name == new_name + + def test_dtype_preserved_phenom(self): + df = self._create_pandas() + df = df.astype("int32") + result = iris.pandas.as_cubes(df) + + (result_cube,) = result + assert result_cube.dtype == np.int32 + + def test_preserve_dim_order(self): + new_order = ["index_1", "index_0", "index_2"] + + df = self._create_pandas(index_levels=3) + df = df.reset_index() + df = df.set_index(new_order) + df = df.sort_index() + result = iris.pandas.as_cubes(df) + + (result_cube,) = result + dim_order = [c.name() for c in result_cube.dim_coords] + assert dim_order == new_order + + def test_dtype_preserved_coord(self): + df = self._create_pandas() + new_index = df.index.astype("float64") + df.index = new_index + result = iris.pandas.as_cubes(df) + + (result_cube,) = result + result_coord = result_cube.coord(dim_coords=True) + assert result_coord.dtype == np.float64 + + def test_string_phenom(self): + # Strings can be uniquely troublesome. + df = self._create_pandas() + new_values = [str(v) for v in df[0]] + df[0] = new_values + result = iris.pandas.as_cubes(df) + + (result_cube,) = result + self.assertArrayEqual(result_cube.data, new_values) + + def test_string_coord(self): + # Strings can be uniquely troublesome. + # Must test using an AuxCoord since strings cannot be DimCoords. + df = self._create_pandas() + new_points = [str(v) for v in df.index.values] + coord_name = "foo" + df[coord_name] = new_points + result = iris.pandas.as_cubes(df, aux_coord_cols=[coord_name]) + + (result_cube,) = result + result_coord = result_cube.coord(coord_name) + self.assertArrayEqual(result_coord.points, new_points) + + def test_series_with_col_args(self): + series = self._create_pandas(is_series=True) + with pytest.warns(Warning, match="is a Series; ignoring"): + _ = iris.pandas.as_cubes(series, aux_coord_cols=["some_column"]) + + def test_phenom_view(self): + df = self._create_pandas() + result = iris.pandas.as_cubes(df, copy=False) + + # Modify AFTER creating the Cube(s). + df[0][0] += 1 + + (result_cube,) = result + assert result_cube.data[0] == df[0][0] + + def test_phenom_copy(self): + df = self._create_pandas() + result = iris.pandas.as_cubes(df) + + # Modify AFTER creating the Cube(s). + df[0][0] += 1 + + (result_cube,) = result + assert result_cube.data[0] != df[0][0] + + def test_coord_never_view(self): + # Using AuxCoord - DimCoords and Pandas indices are immutable. + df = self._create_pandas() + coord_name = "foo" + df[coord_name] = df.index.values + result = iris.pandas.as_cubes( + df, copy=False, aux_coord_cols=[coord_name] + ) + + # Modify AFTER creating the Cube(s). + df[coord_name][0] += 1 + + (result_cube,) = result + result_coord = result_cube.coord(coord_name) + assert result_coord.points[0] != df[coord_name][0] + + def _test_dates_common(self, mode=None, alt_calendar=False): + df = self._create_pandas() + kwargs = dict(pandas_structure=df) + coord_name = "dates" + + if alt_calendar: + calendar = cf_units.CALENDAR_360_DAY + # Only pass this when non-default. + kwargs["calendars"] = {coord_name: calendar} + expected_points = [8640, 8641, 8642] + else: + calendar = cf_units.CALENDAR_STANDARD + expected_points = [8760, 8761, 8762] + expected_units = cf_units.Unit( + "hours since 1970-01-01 00:00:00", calendar=calendar + ) + + datetime_args = [(1971, 1, 1, i, 0, 0) for i in df.index.values] + if mode == "index": + values = [datetime.datetime(*a) for a in datetime_args] + df.index = pandas.Index(values, name=coord_name) + elif mode == "numpy": + values = [datetime.datetime(*a) for a in datetime_args] + df[coord_name] = values + kwargs["aux_coord_cols"] = [coord_name] + elif mode == "cftime": + values = [ + cftime.datetime(*a, calendar=calendar) for a in datetime_args + ] + df[coord_name] = values + kwargs["aux_coord_cols"] = [coord_name] + else: + raise ValueError("mode needs to be set") + + result = iris.pandas.as_cubes(**kwargs) + + (result_cube,) = result + result_coord = result_cube.coord(coord_name) + assert result_coord.units == expected_units + self.assertArrayEqual(result_coord.points, expected_points) + + def test_datetime_index(self): + self._test_dates_common(mode="index") + + def test_datetime_index_calendar(self): + self._test_dates_common(mode="index", alt_calendar=True) + + def test_numpy_datetime_coord(self): + # NumPy format is what happens if a Python datetime is assigned to a + # Pandas column. + self._test_dates_common(mode="numpy") + + def test_numpy_datetime_coord_calendar(self): + self._test_dates_common(mode="numpy", alt_calendar=True) + + def test_cftime_coord(self): + self._test_dates_common(mode="cftime") + + def test_cftime_coord_calendar(self): + self._test_dates_common(mode="cftime", alt_calendar=True) if __name__ == "__main__":