diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 1135613c911..df68378d8d3 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -33,7 +33,9 @@ Breaking changes
 
 Enhancements
 ~~~~~~~~~~~~
-
+- Added ability to open netCDF4/HDF5 file-like objects with ``open_dataset``.
+  Requires h5netcdf>0.7 and h5py>2.9.0. (:issue:`2781`)
+  By `Scott Henderson <https://github.com/scottyhq>`_
 - Internal plotting now supports ``cftime.datetime`` objects as time series.
   (:issue:`2164`)
   By `Julius Busecke <https://github.com/jbusecke>`_ and
@@ -81,8 +83,8 @@ Enhancements
   :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence
   serialization warnings raised if dates from a standard calendar are found to
   be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By
-  `Spencer Clark <https://github.com/spencerkclark>`_.
-
+  `Spencer Clark <https://github.com/spencerkclark>`_.
+
 - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
   By `Kevin Squire <https://github.com/kmsquire>`_.
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 36baa9071c0..e280d715180 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -75,6 +75,34 @@ def _get_default_engine_netcdf():
     return engine
 
 
+def _get_engine_from_magic_number(filename_or_obj):
+    # check byte header to determine file type
+    if isinstance(filename_or_obj, bytes):
+        magic_number = filename_or_obj[:8]
+    else:
+        if filename_or_obj.tell() != 0:
+            raise ValueError("file-like object read/write pointer not at "
+                             "zero; please close and reopen, or use a "
+                             "context manager")
+        magic_number = filename_or_obj.read(8)
+        filename_or_obj.seek(0)
+
+    if magic_number.startswith(b'CDF'):
+        engine = 'scipy'
+    elif magic_number.startswith(b'\211HDF\r\n\032\n'):
+        engine = 'h5netcdf'
+        if isinstance(filename_or_obj, bytes):
+            raise ValueError("can't open netCDF4/HDF5 as bytes; "
+                             "try passing a path or file-like object")
+    else:
+        if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
+            filename_or_obj = filename_or_obj[:80] + b'...'
+        raise ValueError('{} is not a valid netCDF file; '
+                         'did you mean to pass a string for a path instead?'
+                         .format(filename_or_obj))
+    return engine
+
+
 def _get_default_engine(path, allow_remote=False):
     if allow_remote and is_remote_uri(path):
         engine = _get_default_engine_remote_uri()
@@ -170,8 +198,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
         ends with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-        with scipy.io.netcdf (only netCDF3 supported).
+        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
     group : str, optional
         Path to the netCDF4 group in the given file to open (only works for
         netCDF4 files).
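Aside: the magic-number dispatch above works because both on-disk formats are
self-identifying. Classic netCDF3 files begin with the ASCII bytes ``CDF``,
while netCDF4 files are HDF5 containers and start with the fixed 8-byte HDF5
signature. A minimal stdlib-only sketch of the same check (the
``sniff_engine`` helper and the demo buffers are illustrative, not part of
the patch):

    import io

    # 8-byte signature that every HDF5 (and hence netCDF4) file starts with
    HDF5_SIGNATURE = b'\211HDF\r\n\032\n'

    def sniff_engine(fileobj):
        """Pick a backend name from the first 8 bytes of a stream."""
        magic = fileobj.read(8)
        fileobj.seek(0)  # rewind so the backend can re-read the whole stream
        if magic.startswith(b'CDF'):
            return 'scipy'       # netCDF3
        if magic.startswith(HDF5_SIGNATURE):
            return 'h5netcdf'    # netCDF4/HDF5
        raise ValueError('not a netCDF file')

    assert sniff_engine(io.BytesIO(b'CDF\x01' + b'\0' * 4)) == 'scipy'
    assert sniff_engine(io.BytesIO(HDF5_SIGNATURE + b'\0' * 8)) == 'h5netcdf'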
@@ -258,6 +286,13 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
     --------
     open_mfdataset
     """
+    engines = [None, 'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio',
+               'cfgrib', 'pseudonetcdf']
+    if engine not in engines:
+        raise ValueError('unrecognized engine for open_dataset: {}\n'
+                         'must be one of: {}'
+                         .format(engine, engines))
+
     if autoclose is not None:
         warnings.warn(
             'The autoclose argument is no longer used by '
@@ -316,18 +351,9 @@ def maybe_decode_store(store, lock=False):
 
     if isinstance(filename_or_obj, backends.AbstractDataStore):
         store = filename_or_obj
-        ds = maybe_decode_store(store)
-    elif isinstance(filename_or_obj, str):
-        if (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'\x89HDF')):
-            raise ValueError('cannot read netCDF4/HDF5 file images')
-        elif (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'CDF')):
-            # netCDF3 file images are handled by scipy
-            pass
-        elif isinstance(filename_or_obj, str):
-            filename_or_obj = _normalize_path(filename_or_obj)
+    elif isinstance(filename_or_obj, str):
+        filename_or_obj = _normalize_path(filename_or_obj)
 
         if engine is None:
             engine = _get_default_engine(filename_or_obj,
@@ -352,18 +378,19 @@ def maybe_decode_store(store, lock=False):
         elif engine == 'cfgrib':
             store = backends.CfGribDataStore(
                 filename_or_obj, lock=lock, **backend_kwargs)
-        else:
-            raise ValueError('unrecognized engine for open_dataset: %r'
-                             % engine)
-
-        with close_on_error(store):
-            ds = maybe_decode_store(store)
     else:
-        if engine is not None and engine != 'scipy':
-            raise ValueError('can only read file-like objects with '
-                             "default engine or engine='scipy'")
-        # assume filename_or_obj is a file-like object
-        store = backends.ScipyDataStore(filename_or_obj)
+        if engine not in [None, 'scipy', 'h5netcdf']:
+            raise ValueError("can only read bytes or file-like objects "
+                             "with engine='scipy' or 'h5netcdf'")
+        engine = _get_engine_from_magic_number(filename_or_obj)
+        if engine == 'scipy':
+            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
+        elif engine == 'h5netcdf':
+            store = backends.H5NetCDFStore(filename_or_obj, group=group,
+                                           lock=lock, **backend_kwargs)
+
+    with close_on_error(store):
         ds = maybe_decode_store(store)
 
     # Ensure source filename always stored in dataset object (GH issue #2550)
@@ -390,8 +417,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
         Strings and Paths are interpreted as a path to a netCDF file or an
         OpenDAP URL and opened with python-netCDF4, unless the filename ends
         with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-        with scipy.io.netcdf (only netCDF3 supported).
+        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
     group : str, optional
         Path to the netCDF4 group in the given file to open (only works for
         netCDF4 files).
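For reference, the user-facing effect of the dispatch above: in-memory netCDF
can now round-trip through ``open_dataset`` without naming an engine. A usage
sketch, assuming h5netcdf>0.7 and h5py>2.9.0 are installed and using a
hypothetical scratch path ``fileobj_demo.nc``:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'x': ('t', np.arange(4))})

    # netCDF3: to_netcdf() with no target returns the file as bytes (scipy
    # backend); open_dataset accepts them directly, as they start with b'CDF'.
    netcdf3_bytes = ds.to_netcdf()
    with xr.open_dataset(netcdf3_bytes) as restored:
        print(restored['x'].values)

    # netCDF4/HDF5: write to disk, then read back through a file-like object;
    # the magic-number check routes the stream to the h5netcdf backend.
    ds.to_netcdf('fileobj_demo.nc', engine='h5netcdf')
    with open('fileobj_demo.nc', 'rb') as f:
        with xr.open_dataset(f) as restored:  # engine inferred from header
            print(restored['x'].values)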
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 281fc662197..4ebcc29a61e 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -77,6 +77,12 @@ def LooseVersion(vstring):
 has_cfgrib, requires_cfgrib = _importorskip('cfgrib')
 
 # some special cases
+has_h5netcdf07, requires_h5netcdf07 = _importorskip('h5netcdf',
+                                                    minversion='0.7')
+has_h5py29, requires_h5py29 = _importorskip('h5py', minversion='2.9.0')
+has_h5fileobj = has_h5netcdf07 and has_h5py29
+requires_h5fileobj = pytest.mark.skipif(
+    not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7')
 has_scipy_or_netCDF4 = has_scipy or has_netCDF4
 requires_scipy_or_netCDF4 = pytest.mark.skipif(
     not has_scipy_or_netCDF4, reason='requires scipy or netCDF4')
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index c6ddb8fae58..a20ba2df229 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -35,7 +35,7 @@
     requires_cftime, requires_dask, requires_h5netcdf, requires_netCDF4,
     requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio,
     requires_rasterio, requires_scipy, requires_scipy_or_netCDF4,
-    requires_zarr)
+    requires_zarr, requires_h5fileobj)
 from .test_coding_times import (_STANDARD_CALENDARS, _NON_STANDARD_CALENDARS,
                                 _ALL_CALENDARS)
 from .test_dataset import create_test_data
@@ -1770,7 +1770,7 @@ def test_engine(self):
             open_dataset(tmp_file, engine='foobar')
 
         netcdf_bytes = data.to_netcdf()
-        with raises_regex(ValueError, 'can only read'):
+        with raises_regex(ValueError, 'unrecognized engine'):
             open_dataset(BytesIO(netcdf_bytes), engine='foobar')
 
     def test_cross_engine_read_write_netcdf3(self):
@@ -1955,6 +1955,52 @@ def test_dump_encodings_h5py(self):
             assert actual.x.encoding['compression_opts'] is None
 
 
+@requires_h5fileobj
+class TestH5NetCDFFileObject(TestH5NetCDFData):
+    engine = 'h5netcdf'
+
+    def test_open_badbytes(self):
+        with raises_regex(ValueError, "HDF5 as bytes"):
+            with open_dataset(b'\211HDF\r\n\032\n', engine='h5netcdf'):
+                pass
+        with raises_regex(ValueError, "not a valid netCDF"):
+            with open_dataset(b'garbage'):
+                pass
+        with raises_regex(ValueError, "can only read bytes"):
+            with open_dataset(b'garbage', engine='netcdf4'):
+                pass
+        with raises_regex(ValueError, "not a valid netCDF"):
+            with open_dataset(BytesIO(b'garbage'), engine='h5netcdf'):
+                pass
+
+    def test_open_twice(self):
+        expected = create_test_data()
+        expected.attrs['foo'] = 'bar'
+        with raises_regex(ValueError, 'read/write pointer not at zero'):
+            with create_tmp_file() as tmp_file:
+                expected.to_netcdf(tmp_file, engine='h5netcdf')
+                with open(tmp_file, 'rb') as f:
+                    with open_dataset(f, engine='h5netcdf'):
+                        with open_dataset(f, engine='h5netcdf'):
+                            pass
+
+    def test_open_fileobj(self):
+        # open in-memory datasets instead of local file paths
+        expected = create_test_data().drop('dim3')
+        expected.attrs['foo'] = 'bar'
+        with create_tmp_file() as tmp_file:
+            expected.to_netcdf(tmp_file, engine='h5netcdf')
+
+            with open(tmp_file, 'rb') as f:
+                with open_dataset(f, engine='h5netcdf') as actual:
+                    assert_identical(expected, actual)
+
+                f.seek(0)
+                with BytesIO(f.read()) as bio:
+                    with open_dataset(bio, engine='h5netcdf') as actual:
+                        assert_identical(expected, actual)
+
+
 @requires_h5netcdf
 @requires_dask
 @pytest.mark.filterwarnings('ignore:deallocating CachingFileManager')
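The ``test_open_twice`` case above pins down the sharp edge of the pointer
check: a handle whose read pointer has already advanced is rejected outright
rather than misparsed. A sketch of the failure mode and the rewind
workaround, reusing the hypothetical ``fileobj_demo.nc`` file from the
earlier example:

    import xarray as xr

    with open('fileobj_demo.nc', 'rb') as f:
        with xr.open_dataset(f, engine='h5netcdf'):
            pass  # first open consumes the stream; f.tell() is now nonzero

        try:
            xr.open_dataset(f, engine='h5netcdf')
        except ValueError as err:
            print(err)  # "file-like object read/write pointer not at zero ..."

        f.seek(0)  # rewinding the handle makes it acceptable again
        with xr.open_dataset(f, engine='h5netcdf') as ds:
            print(ds)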