mpiannucci · mpiannucci · Jul 7, 2023 · Jul 6, 2023 · Jul 6, 2023 · Jul 6, 2023
diff --git a/python/examples/kerchunk_gefs_wave.ipynb b/python/examples/kerchunk_gefs_wave.ipynb
diff --git a/python/examples/requirements.txt b/python/examples/requirements.txt
@@ -1,4 +1,10 @@
 numpy 
 matplotlib
 xarray
-cf_xarray
+cf_xarray
+kerchunk
+zarr
+numcodecs
+fsspec
+ujson
+dask
diff --git a/python/examples/xarray_usage.ipynb b/python/examples/xarray_usage.ipynb
diff --git a/python/gribberish/gribberish_backend.py b/python/gribberish/gribberish_backend.py
@@ -1,10 +1,11 @@
 import os
-import xarray as xr
+
 import numpy as np
+import xarray as xr
+from xarray.backends.common import BackendEntrypoint, BackendArray
+from xarray.core import indexing
 
 from .gribberishpy import parse_grid_dataset, build_grib_array
-from xarray.backends.common import BackendEntrypoint, BackendArray, AbstractDataStore
-from xarray.core import indexing
 
 
 def read_binary_data(filename: str):
@@ -58,8 +59,9 @@ def guess_can_open(self, filename_or_obj):
 
 class GribberishBackendArray(BackendArray):
     '''
-    Custom backend array for xarray to support lazy loading of gribberish datasets
+    Custom backend array to support lazy loading of gribberish datasets
     '''
+
     def __init__(
         self,
         filename_or_obj,

diff --git a/python/gribberish/kerchunk/__init__.py b/python/gribberish/kerchunk/__init__.py
@@ -0,0 +1,3 @@
+# module
+from .codec import *
+from .mapper import *
diff --git a/python/gribberish/kerchunk/codec.py b/python/gribberish/kerchunk/codec.py
@@ -0,0 +1,50 @@
+import numcodecs
+
+from ..gribberishpy import parse_grib_data
+
+
+class GribberishCodec(numcodecs.abc.Codec):
+    """
+    Read GRIB stream of bytes as a message using gribberish
+
+    Adapted from https://github.com/fsspec/kerchunk/blob/main/kerchunk/codecs.py
+
+    Parameters
+    ----------
+
+    var: str
+        Variable name, stored on the codec but not used when decoding
+    shape: list(int)
+        Shape passed to ``parse_grib_data`` when decoding
+    dtype: str, optional
+        Data type of the decoded array when no ``out`` buffer is given
+    """
+
+    codec_id = "gribberish"
+
+    def __init__(self, var, shape, dtype=None):
+        self.var = var
+        self.shape = shape
+        self.dtype = dtype
+
+    def encode(self, buf):
+        # on encode, pass through
+        return buf
+
+    def decode(self, buf, out=None):
+        """
+        Parse the GRIB message held in ``buf`` and return its values
+
+        If ``out`` is given the values are copied into it with
+        ``numcodecs.compat.ndarray_copy``, otherwise the parsed array is
+        returned converted to ``self.dtype``.
+        """
+        data = parse_grib_data(buf, 0, self.shape)
+        if out is not None:
+            return numcodecs.compat.ndarray_copy(data, out)
+        # copy=False hands back the freshly parsed array as is when it
+        # already has the requested dtype instead of duplicating it
+        return data.astype(self.dtype, copy=False)
+
+
+numcodecs.register_codec(GribberishCodec, "gribberish")
diff --git a/python/gribberish/kerchunk/mapper.py b/python/gribberish/kerchunk/mapper.py
@@ -0,0 +1,200 @@
+import base64
+import fsspec
+import zarr
+import numpy as np
+
+from kerchunk.utils import class_factory, _encode_for_JSON
+from .codec import GribberishCodec
+from ..gribberishpy import parse_grid_dataset
+
+
+def _chunk_key(var, shape):
+    """Return the store key of the single chunk that holds array ``var``."""
+    # One "0" index per dimension; a 0-d array still uses the key "0"
+    return f"{var}/" + (".".join(["0"] * len(shape)) or "0")
+
+
+def _split_file(f, skip=0):
+    """
+    Yield ``(offset, size, data)`` for each GRIB2 message read from ``f``
+
+    Parameters
+    ----------
+
+    f: file-like
+        Seekable binary file object
+    skip: int
+        If non-zero, stop after this many messages
+    """
+    if hasattr(f, "size"):
+        size = f.size
+    else:
+        f.seek(0, 2)
+        size = f.tell()
+        f.seek(0)
+    part = 0
+
+    while f.tell() < size:
+        start = f.tell()
+        head = f.read(16)
+        marker = head[:4]
+        if not marker:
+            break  # EOF
+        assert marker == b"GRIB", "Bad grib message start marker"
+        # GRIB2 section 0 holds the total message length in octets 9-16
+        part_size = int.from_bytes(head[8:16], "big")
+        if len(head) < 16 or part_size == 0:
+            # a zero length would never advance the file position
+            raise ValueError("Truncated or empty grib message")
+        f.seek(start)
+        yield start, part_size, f.read(part_size)
+        part += 1
+        if skip and part >= skip:
+            break
+
+
+def _store_array_inline(store, z, data, var, attr):
+    """
+    Create array ``var`` in group ``z`` and embed its bytes in ``store``
+
+    The bytes are kept as text when they are pure ASCII and written as a
+    ``base64:`` prefixed string otherwise. ``attr`` is merged into the
+    attributes of the new array.
+    """
+    shape = tuple(data.shape or ())
+    d = z.create_dataset(
+        name=var,
+        shape=shape,
+        chunks=shape,
+        dtype=data.dtype,
+        fill_value=None,
+        compressor=False,
+    )
+    if hasattr(data, "tobytes"):
+        b = data.tobytes()
+    else:
+        b = data.build_array().tobytes()
+    try:
+        # easiest way to test if data is ascii
+        b.decode("ascii")
+    except UnicodeDecodeError:
+        # encode the serialized bytes, not ``data`` itself, which may not
+        # expose a buffer (e.g. numpy datetime64 arrays)
+        b = b"base64:" + base64.b64encode(b)
+    store[_chunk_key(var, shape)] = b.decode("ascii")
+    d.attrs.update(attr)
+
+
+def _store_array_ref(
+    store,
+    z,
+    data_shape,
+    var,
+    offset,
+    size,
+    attr
+):
+    """
+    Create array ``var`` in group ``z`` backed by a byte range reference
+
+    The single chunk points at ``size`` bytes starting at ``offset`` of the
+    ``{{u}}`` url template and is decoded through ``GribberishCodec``.
+    ``attr`` is merged into the attributes of the new array.
+    """
+    shape = tuple(data_shape or ())
+    data_type = np.dtype('float64')
+    d = z.create_dataset(
+        name=var,
+        shape=shape,
+        chunks=shape,
+        dtype=data_type,
+        filters=[GribberishCodec(var=var, dtype=str(data_type), shape=list(shape))],
+        compressor=False,
+        fill_value=None,
+        overwrite=True,
+    )
+    store[_chunk_key(var, shape)] = ["{{u}}", offset, size]
+    d.attrs.update(attr)
+
+
+def scan_gribberish(
+    url,
+    common=None,
+    storage_options=None,
+    skip=0,
+    only_vars=None,
+):
+    """
+    Generate references for a GRIB2 file using gribberish
+
+    Parameters
+    ----------
+
+    url: str
+        File location
+    common: (depr, do not use)
+    storage_options: dict
+        For accessing the data, passed to filesystem
+    skip: int
+        If non-zero, stop processing the file after this many messages
+    only_vars: list(str)
+        If given, only store these variables
+
+    Returns
+    -------
+
+    list(dict): references dicts in Version 1 format, one per message
+    """
+    storage_options = storage_options or {}
+
+    out = []
+    with fsspec.open(url, "rb", **storage_options) as f:
+        for offset, size, data in _split_file(f, skip=skip):
+            dataset = parse_grid_dataset(data)
+
+            # Only reading one variable from each data chunk (1 message)
+            var_name, var_data = next(iter(dataset['data_vars'].items()))
+            if only_vars and var_name not in only_vars:
+                continue
+
+            store = {}
+            z = zarr.open_group(store)
+            z.attrs.update(dataset['attrs'])
+
+            _store_array_ref(
+                store,
+                z,
+                var_data['values']['shape'],
+                var_name,
+                offset,
+                size,
+                var_data['attrs']
+            )
+
+            # Coords
+            dims = var_data['dims']
+            z[var_name].attrs["_ARRAY_DIMENSIONS"] = dims
+
+            for coord_name, coord_data in dataset['coords'].items():
+                # TODO: Prob dont store inline for non regular grids
+                coord_array = np.array(coord_data['values'])
+                _store_array_inline(
+                    store,
+                    z,
+                    coord_array,
+                    coord_name,
+                    coord_data['attrs']
+                )
+                z[coord_name].attrs["_ARRAY_DIMENSIONS"] = coord_data['dims']
+
+            out.append(
+                {
+                    "version": 1,
+                    "refs": _encode_for_JSON(store),
+                    "templates": {"u": url},
+                }
+            )
+    return out
+
+
+GribberishToZarr = class_factory(scan_gribberish)
diff --git a/python/setup.py b/python/setup.py
@@ -9,6 +9,12 @@
     include_package_data=True,
     # rust extensions are not zip safe, just like C-extensions.
     zip_safe=False,
+    # install_requires (not the metadata-only ``requires``) is what pip installs
+    install_requires=["numpy"],
+    extras_require={
+        "xarray": ["xarray"],
+        "kerchunk": ["kerchunk", "zarr", "numcodecs", "fsspec"],
+    },
     entry_points={
       "xarray.backends": ["gribberish=gribberish.gribberish_backend:GribberishBackend"],
     },

diff --git a/python/src/dataset.rs b/python/src/dataset.rs
@@ -6,11 +6,11 @@ use gribberish::{
 };
 use numpy::{
     ndarray::{Dim, IxDynImpl},
-    PyArray,
+    PyArray, PyArray1, datetime::{Datetime, units::Seconds},
 };
 use pyo3::{
     prelude::*,
-    types::{PyDateTime, PyDict, PyList},
+    types::{PyDict, PyList},
 };
 
 #[pyfunction]
@@ -198,8 +198,9 @@ pub fn parse_grid_dataset<'py>(
 
         let times = times
             .iter()
-            .map(|d| PyDateTime::from_timestamp(py, d.timestamp() as f64, None).unwrap())
+            .map(|d| Datetime::<Seconds>::from(d.timestamp()))
             .collect::<Vec<_>>();
+        let times = PyArray1::from_slice(py, &times);
 
         time_dim_map[var].iter().for_each(|v: &String| {
             var_dims.get_mut(v).unwrap().push(name.clone());
@@ -397,12 +398,12 @@ pub fn parse_grid_dataset<'py>(
     if first.2.is_regular_grid {
         latitude.set_item("dims", vec!["latitude"]).unwrap();
         latitude
-            .set_item("values", PyArray::from_slice(py, &first.2.lat()))
+            .set_item("values", PyArray1::from_slice(py, &first.2.lat()))
             .unwrap();
 
         longitude.set_item("dims", vec!["longitude"]).unwrap();
         longitude
-            .set_item("values", PyArray::from_slice(py, &first.2.lng()))
+            .set_item("values", PyArray1::from_slice(py, &first.2.lng()))
             .unwrap();
 
         var_dims.iter_mut().for_each(|(_, v)| {
@@ -449,6 +450,9 @@ pub fn parse_grid_dataset<'py>(
         var_metadata
             .set_item("coordinates", "latitude longitude")
             .unwrap();
+        var_metadata
+            .set_item("reference_date", first.2.reference_date.to_rfc3339())
+            .unwrap();
         var_metadata
             .set_item("generating_process", first.2.generating_process.to_string())
             .unwrap();