Commit 298e551 (parent 9d6363c)

Exogenous data caching is now done on the head node, after first checking for existing cache files. Every node uses the same exogenous data, so it doesn't make sense to have each node try to cache that data itself.
File tree: 14 files changed, +110 -49 lines


.github/workflows/codecov.yml (+2 -1)

@@ -2,7 +2,8 @@ name: Codecov
 
 on:
   push:
-    branches: [main, master]
+    branches: [main]
+  workflow_dispatch:
 
 jobs:
   run:

.github/workflows/gh_pages.yml (+2 -1)

@@ -2,7 +2,8 @@ name: Documentation
 
 on:
   push:
-    branches: [main, master]
+    branches: [main]
+  workflow_dispatch:
 
 jobs:
   make-pages:

.github/workflows/release_drafter.yml (+1 -1)

@@ -2,7 +2,7 @@ name: Release Drafter
 
 on:
   push:
-    branches: [main, master]
+    branches: [main]
 
 jobs:
   update_release_draft:

docs/source/conf.py (+1)

@@ -62,6 +62,7 @@
     "sphinx.ext.napoleon",
     "sphinx_autosummary_accessors",
     "sphinx_copybutton",
+    "pygments_lexer"
 ]
 
 intersphinx_mapping = {

sup3r/bias/bias_transforms.py (+12)

@@ -350,6 +350,7 @@ def monthly_local_linear_bc(
     temporal_avg=True,
     out_range=None,
     smoothing=0,
+    range_kwargs=None
 ):
     """Bias correct data using a simple monthly *scalar +adder method on a
     site-by-site basis.
@@ -396,6 +397,9 @@ def monthly_local_linear_bc(
         effect of extreme values within aggregations over large number of
         pixels. This value is the standard deviation for the gaussian_filter
         kernel.
+    range_kwargs : dict | None
+        Dictionary of ranges for scalar and adder values. e.g.
+        ``{'scalar': (0, 3), 'adder': (-2, 2)}``
 
     Returns
     -------
@@ -450,6 +454,14 @@ def monthly_local_linear_bc(
             adder[..., idt], smoothing, mode='nearest'
         )
 
+    if range_kwargs is not None:
+        scalar_range = range_kwargs.get('scalar', (-np.inf, np.inf))
+        adder_range = range_kwargs.get('adder', (-np.inf, np.inf))
+        scalar = np.minimum(scalar, np.max(scalar_range))
+        scalar = np.maximum(scalar, np.min(scalar_range))
+        adder = np.minimum(adder, np.max(adder_range))
+        adder = np.maximum(adder, np.min(adder_range))
+
     out = data * scalar + adder
     if out_range is not None:
         out = np.maximum(out, np.min(out_range))
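To make the new clipping behavior concrete, here is a minimal standalone sketch of what the ``range_kwargs`` handling does, using toy 1D values (the shapes and numbers are hypothetical; the real ``scalar``/``adder`` arrays are per-site, per-month):

```python
import numpy as np

# Toy 1D stand-ins; the real scalar/adder arrays are multi-dimensional.
scalar = np.array([-1.0, 0.5, 4.2])
adder = np.array([-3.0, 0.0, 2.5])

# Mirrors the diff: missing keys fall back to (-inf, inf), i.e. no clipping.
range_kwargs = {'scalar': (0, 3), 'adder': (-2, 2)}
scalar_range = range_kwargs.get('scalar', (-np.inf, np.inf))
adder_range = range_kwargs.get('adder', (-np.inf, np.inf))

# The paired np.minimum/np.maximum calls in the diff are equivalent to np.clip.
scalar = np.clip(scalar, np.min(scalar_range), np.max(scalar_range))
adder = np.clip(adder, np.min(adder_range), np.max(adder_range))

print(scalar)  # [0.  0.5 3. ]
print(adder)   # [-2.  0.  2.]
```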

sup3r/pipeline/slicer.py (+3 -2)

@@ -25,8 +25,9 @@ class ForwardPassSlicer:
     time_steps : int
         Number of time steps for full temporal domain of low res data. This
        is used to construct a dummy_time_index from np.arange(time_steps)
-    time_slice : slice
-        Slice to use to extract range from time_index
+    time_slice : slice | list
+        Slice to use to extract range from time_index. Can be a ``slice(start,
+        stop, step)`` or a list ``[start, stop, step]``
     chunk_shape : tuple
         Max shape (spatial_1, spatial_2, temporal) of an unpadded coarse
         chunk to use for a forward pass. The number of nodes that the
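Since ``time_slice`` may now arrive as a list, a minimal sketch of the normalization one would expect (this helper is illustrative, not part of the diff):

```python
def _parse_time_slice(time_slice):
    """Normalize a [start, stop, step] list to a builtin slice."""
    if isinstance(time_slice, list):
        return slice(*time_slice)
    return time_slice if time_slice is not None else slice(None)

assert _parse_time_slice([0, 100, 2]) == slice(0, 100, 2)
assert _parse_time_slice(slice(10, 20)) == slice(10, 20)
```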

sup3r/pipeline/strategy.py (+42 -15)

@@ -10,6 +10,7 @@
 from dataclasses import dataclass
 from functools import cached_property
 from typing import Dict, Optional, Tuple, Union
+from warnings import warn
 
 import dask.array as da
 import numpy as np
@@ -228,6 +229,18 @@ def __post_init__(self):
         )
         self.n_chunks = self.fwp_slicer.n_chunks
 
+        msg = (
+            'The same exogenous data is used by all nodes, so it will be '
+            'cached on the head_node. This can take a long time and might be '
+            'worth doing as an independent preprocessing step instead.'
+        )
+        if self.head_node and not all(
+            os.path.exists(fp) for fp in self.get_exo_cache_files(model)
+        ):
+            logger.warning(msg)
+            warn(msg)
+            _ = self.timer(self.load_exo_data, log=True)(model)
+
         if not self.head_node:
             hr_shape = self.hr_lat_lon.shape[:-1]
             self.gids = np.arange(np.prod(hr_shape)).reshape(hr_shape)
@@ -532,19 +545,9 @@ def init_chunk(self, chunk_index=0):
             index=chunk_index,
         )
 
-    def load_exo_data(self, model):
-        """Extract exogenous data for each exo feature and store data in
-        dictionary with key for each exo feature
-
-        Returns
-        -------
-        exo_data : ExoData
-            :class:`ExoData` object composed of multiple
-            :class:`SingleExoDataStep` objects. This is the exo data for the
-            full spatiotemporal extent.
-        """
-        data = {}
-        exo_data = None
+    def get_exo_kwargs(self, model):
+        """Get list of exo kwargs for all exo features."""
+        exo_kwargs_list = []
         if self.exo_handler_kwargs:
             for feature in self.exo_features:
                 exo_kwargs = copy.deepcopy(self.exo_handler_kwargs[feature])
@@ -558,8 +561,32 @@ def load_exo_data(self, model):
                 _ = input_handler_kwargs.pop('time_slice', None)
                 exo_kwargs['input_handler_kwargs'] = input_handler_kwargs
                 exo_kwargs = get_class_kwargs(ExoDataHandler, exo_kwargs)
-                data.update(ExoDataHandler(**exo_kwargs).data)
-            exo_data = ExoData(data)
+                exo_kwargs_list.append(exo_kwargs)
+        return exo_kwargs_list
+
+    def get_exo_cache_files(self, model):
+        """Get list of exo cache files so we can check if they exist or not."""
+        cache_files = []
+        for exo_kwargs in self.get_exo_kwargs(model):
+            cache_files.extend(ExoDataHandler(**exo_kwargs).cache_files)
+        return cache_files
+
+    def load_exo_data(self, model):
+        """Extract exogenous data for each exo feature and store data in
+        dictionary with key for each exo feature
+
+        Returns
+        -------
+        exo_data : ExoData
+            :class:`ExoData` object composed of multiple
+            :class:`SingleExoDataStep` objects. This is the exo data for the
+            full spatiotemporal extent.
+        """
+        data = {}
+        exo_data = None
+        for exo_kwargs in self.get_exo_kwargs(model):
+            data.update(ExoDataHandler(**exo_kwargs).data)
+            exo_data = ExoData(data)
         return exo_data
 
     @cached_property
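The shape of the new head-node flow, pulled out of ``__post_init__`` for readability: only the head node checks for and builds the exo cache, and only when at least one cache file is missing. A simplified sketch (``strategy`` and ``model`` are assumed in scope; this is an illustration, not the actual method):

```python
import os

def ensure_exo_cache(strategy, model):
    """Build the exo cache once, on the head node only."""
    cache_files = strategy.get_exo_cache_files(model)
    missing = [fp for fp in cache_files if not os.path.exists(fp)]
    if strategy.head_node and missing:
        # All nodes consume the same exogenous data, so caching it once
        # here avoids N nodes redundantly rasterizing (and racing on)
        # the same files.
        strategy.load_exo_data(model)
```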

sup3r/preprocessing/batch_queues/abstract.py (+3 -5)

@@ -12,9 +12,9 @@
 import time
 from abc import ABC, abstractmethod
 from collections import namedtuple
+from concurrent.futures import ThreadPoolExecutor
 from typing import TYPE_CHECKING, List, Optional, Union
 
-import dask
 import numpy as np
 import tensorflow as tf
 
@@ -244,15 +244,13 @@ def enqueue_batches(self) -> None:
             if needed == 1 or self.max_workers == 1:
                 self.enqueue_batch()
             elif needed > 0:
-                tasks = [
-                    dask.delayed(self.enqueue_batch)() for _ in range(needed)
-                ]
+                with ThreadPoolExecutor(self.max_workers) as exe:
+                    _ = [exe.submit(self.enqueue_batch) for _ in range(needed)]
                 logger.debug(
                     'Added %s enqueue futures to %s queue.',
                     needed,
                     self._thread_name,
                 )
-                dask.compute(*tasks)
             if time.time() > log_time + 10:
                 logger.debug(self.log_queue_info())
                 log_time = time.time()
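The replacement keeps the same blocking fan-out semantics: exiting the ``ThreadPoolExecutor`` context joins all submitted work, just as ``dask.compute(*tasks)`` blocked on the delayed task list. A self-contained sketch of the pattern:

```python
from concurrent.futures import ThreadPoolExecutor
import time

def enqueue_batch(i):
    """Stand-in for the queue's real enqueue_batch method."""
    time.sleep(0.05)
    return i

needed, max_workers = 4, 2
with ThreadPoolExecutor(max_workers) as exe:
    futures = [exe.submit(enqueue_batch, i) for i in range(needed)]
# The with-block exit waits for all futures, mirroring dask.compute.
print([f.result() for f in futures])  # [0, 1, 2, 3]
```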

sup3r/preprocessing/cachers/base.py (+13 -6)

@@ -59,10 +59,15 @@ def __init__(
             of dictionaries for each feature (or a single dictionary to use
             for all features). e.g.
             .. code-block:: JSON
-                {'cache_pattern': ...,
-                 'chunks': {
-                     'u_10m': {'time': 20, 'south_north': 100, 'west_east': 100}}
-                }
+                {'cache_pattern': ...,
+                 'chunks': {
+                     'u_10m': {
+                         'time': 20,
+                         'south_north': 100,
+                         'west_east': 100
+                     }
+                 }
+                }
 
         Note
         ----
@@ -414,8 +419,10 @@ def write_netcdf(
         features : str | list
             Names of feature(s) to write to file.
         chunks : dict | None
-            Chunk sizes for coordinate dimensions. e.g. ``{'windspeed':
-            {'south_north': 100, 'west_east': 100, 'time': 10}}``
+            Chunk sizes for coordinate dimensions. e.g. ``{'south_north': 100,
+            'west_east': 100, 'time': 10}``. Can also include dataset-specific
+            values. e.g. ``{'windspeed': {'south_north': 100, 'west_east': 100,
+            'time': 10}}``
         max_workers : int | None
             Number of workers to use for parallel writing of chunks
         mode : str
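A hedged sketch of how a flat vs. feature-keyed ``chunks`` spec could be resolved for a single feature (the helper name is illustrative; the actual resolution logic is not shown in this diff):

```python
def resolve_chunks(chunks, feature):
    """Pick the chunk mapping for one feature from a flat or nested spec."""
    if chunks is None:
        return None
    # Nested, dataset-specific form: {'windspeed': {'south_north': 100, ...}}
    if isinstance(chunks.get(feature), dict):
        return chunks[feature]
    # Flat form applies to every feature: {'south_north': 100, ...}
    return chunks

flat = {'south_north': 100, 'west_east': 100, 'time': 10}
nested = {'windspeed': {'south_north': 50, 'west_east': 50, 'time': 20}}
assert resolve_chunks(flat, 'windspeed') == flat
assert resolve_chunks(nested, 'windspeed') == nested['windspeed']
```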

sup3r/preprocessing/data_handlers/exo.py (+19 -5)

@@ -250,7 +250,9 @@ def get_chunk(self, lr_slices):
                 if k == 'data':
                     # last dimension is feature channel, so we use only the
                     # spatial slices if data is 2d and all slices otherwise
-                    chunk_step[k] = v[tuple(exo_slices)[:len(v.shape) - 1]]
+                    chunk_step[k] = v[
+                        tuple(exo_slices)[: len(v.shape) - 1]
+                    ]
                 else:
                     chunk_step[k] = v
             exo_chunk[feature]['steps'].append(chunk_step)
@@ -380,9 +382,8 @@ def get_exo_steps(cls, feature, models):
                 steps.append({'model': i, 'combine_type': 'output'})
         return steps
 
-    def get_single_step_data(self, s_enhance, t_enhance):
-        """Get exo data for a single model step, with specific enhancement
-        factors."""
+    def get_exo_rasterizer(self, s_enhance, t_enhance):
+        """Get exo rasterizer instance for given enhancement factors"""
         return ExoRasterizer(
             file_paths=self.file_paths,
             source_file=self.source_file,
@@ -394,7 +395,20 @@ def get_single_step_data(self, s_enhance, t_enhance):
             cache_dir=self.cache_dir,
             chunks=self.chunks,
             distance_upper_bound=self.distance_upper_bound,
-        ).data
+        )
+
+    def get_single_step_data(self, s_enhance, t_enhance):
+        """Get exo data for a single model step, with specific enhancement
+        factors."""
+        return self.get_exo_rasterizer(s_enhance, t_enhance).data
+
+    @property
+    def cache_files(self):
+        """Get exo data cache file for all enhancement factors"""
+        return [
+            self.get_exo_rasterizer(s_en, t_en).cache_file
+            for s_en, t_en in zip(self.s_enhancements, self.t_enhancements)
+        ]
 
     def get_all_step_data(self):
         """Get exo data for each model step."""

sup3r/preprocessing/loaders/base.py (+2 -2)

@@ -8,7 +8,6 @@
 from typing import Callable
 
 import numpy as np
-import xarray as xr
 
 from sup3r.preprocessing.base import Container
 from sup3r.preprocessing.names import FEATURE_NAMES
@@ -17,6 +16,7 @@
     log_args,
     ordered_dims,
 )
+from sup3r.utilities.utilities import xr_open_mfdataset
 
 from .utilities import (
     lower_names,
@@ -35,7 +35,7 @@ class BaseLoader(Container, ABC):
     by :class:`~sup3r.preprocessing.rasterizers.Rasterizer` objects to derive /
     extract specific features / regions / time_periods."""
 
-    BASE_LOADER: Callable = xr.open_mfdataset
+    BASE_LOADER: Callable = xr_open_mfdataset
 
     @log_args
     def __init__(

sup3r/preprocessing/rasterizers/exo.py (+5 -8)

@@ -143,22 +143,19 @@ def source_handler(self):
         )
         return self._source_handler
 
-    def get_cache_file(self, feature):
+    @property
+    def cache_file(self):
         """Get cache file name
 
-        Parameters
-        ----------
-        feature : str
-            Name of feature to get cache file for
-
         Returns
         -------
         cache_fp : str
             Name of cache file. This is a netcdf file which will be saved with
             :class:`~sup3r.preprocessing.cachers.Cacher` and loaded with
             :class:`~sup3r.preprocessing.loaders.Loader`
         """
-        fn = f'exo_{feature}_{"_".join(map(str, self.input_handler.target))}_'
+        fn = f'exo_{self.feature}_'
+        fn += f'{"_".join(map(str, self.input_handler.target))}_'
         fn += f'{"x".join(map(str, self.input_handler.grid_shape))}_'
 
         if len(self.source_data.shape) == 3:
@@ -278,8 +275,8 @@ def data(self):
         """Get a raster of source values corresponding to the
         high-resolution grid (the file_paths input grid * s_enhance *
         t_enhance). The shape is (lats, lons, temporal, 1)"""
-        cache_fp = self.get_cache_file(feature=self.feature)
 
+        cache_fp = self.cache_file
         if os.path.exists(cache_fp):
             data = Loader(cache_fp)
         else:
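Moving from ``get_cache_file(feature)`` to a ``cache_file`` property works because each ``ExoRasterizer`` already knows its own feature. A rough, illustrative reconstruction of the naming scheme (the enhancement-factor suffix and file extension built later in the full source are omitted here):

```python
def exo_cache_name(feature, target, grid_shape):
    """Illustrative sketch of the deterministic cache file name prefix."""
    fn = f'exo_{feature}_'
    fn += f'{"_".join(map(str, target))}_'
    fn += f'{"x".join(map(str, grid_shape))}_'
    return fn

print(exo_cache_name('topography', (39.0, -105.0), (20, 20)))
# exo_topography_39.0_-105.0_20x20_
```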

sup3r/preprocessing/samplers/base.py (+3 -3)

@@ -15,7 +15,7 @@
     uniform_box_sampler,
     uniform_time_sampler,
 )
-from sup3r.preprocessing.utilities import log_args, lowered
+from sup3r.preprocessing.utilities import compute_if_dask, log_args, lowered
 
 logger = logging.getLogger(__name__)
 
@@ -195,9 +195,9 @@ def _reshape_samples(self, samples):
             new_shape[-1],
         ]
         # (lats, lons, batch_size, times, feats)
-        out = samples.reshape(new_shape)
+        out = np.reshape(samples, new_shape)
         # (batch_size, lats, lons, times, feats)
-        return np.asarray(out.transpose((2, 0, 1, 3, 4)))
+        return compute_if_dask(np.transpose(out, axes=(2, 0, 1, 3, 4)))
 
     def _stack_samples(self, samples):
         """Used to build batch arrays in the case of independent time samples

sup3r/utilities/utilities.py (+2)

@@ -58,6 +58,8 @@ def xr_open_mfdataset(files, **kwargs):
     """Wrapper for xr.open_mfdataset with default opening options."""
     default_kwargs = {'engine': 'netcdf4'}
     default_kwargs.update(kwargs)
+    if isinstance(files, str):
+        files = [files]
     try:
         return xr.open_mfdataset(files, **default_kwargs)
     except Exception as e:
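With the new guard, a bare path string is normalized to a one-element list before reaching ``xr.open_mfdataset``, so single-file and multi-file calls go down the same code path. Hedged usage (the file path is hypothetical):

```python
from sup3r.utilities.utilities import xr_open_mfdataset

# Both forms are now handled identically inside the wrapper.
ds_a = xr_open_mfdataset('/tmp/era5_2020.nc')    # hypothetical path
ds_b = xr_open_mfdataset(['/tmp/era5_2020.nc'])
```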
