From af18c67e076545d09103b196f15b078771859036 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 2 Jul 2025 15:46:42 -0400 Subject: [PATCH 1/5] Best guess netcdfs compare --- ndsl/stencils/testing/best_guess_diff.py | 194 +++++++++++++++++++++++ setup.py | 1 + 2 files changed, 195 insertions(+) create mode 100644 ndsl/stencils/testing/best_guess_diff.py diff --git a/ndsl/stencils/testing/best_guess_diff.py b/ndsl/stencils/testing/best_guess_diff.py new file mode 100644 index 00000000..edbf25f6 --- /dev/null +++ b/ndsl/stencils/testing/best_guess_diff.py @@ -0,0 +1,194 @@ +import argparse + +import xarray as xr +import yaml +import numpy as np +import pathlib + + +def get_parser(): + parser = argparse.ArgumentParser( + "Attempt to diff two NetCDFs with similar data." + "Differences that can be reconcialed are strict domain vs halo, variable name mapping." + "They program will report on assumptions taken." + ) + parser.add_argument( + "netcdf_A", + type=str, + help="path of NetCDFs, named A in the logs.", + ) + parser.add_argument( + "netcdf_B", + type=str, + help="path of NetCDFs, named B in the logs.", + ) + parser.add_argument( + "-nm", + "--name_mapping", + type=str, + help="[Optional] Yaml file describing the mapping of the names.", + ) + parser.add_argument( + "-ha", + "--halo", + type=int, + default=3, + help="[Optional] Halo size if any, default to 3.", + ) + return parser + + +def main( + netcdf_A: str, + netcdf_B: str, + name_mapping: str | None = None, + halo: int = 3, +): + A = xr.open_dataset(netcdf_A) + B = xr.open_dataset(netcdf_B) + name_map = {} + if name_mapping is not None: + with open(name_mapping) as f: + name_map = yaml.safe_load(f) + + dataset = {} + for name_A, data_A in A.items(): + print(f"Best guess for {name_A} from A:") + # Resolve name + resolved_name_B = None + if name_A not in B.keys(): + if name_A not in name_map.keys(): + print(" [Failed] name can't be found in B nor in name mapping") + else: + resolved_name_B = name_map[name_A] + else: + resolved_name_B = name_A + + if resolved_name_B is None: + continue + print(f" [Hyp] use {resolved_name_B} for B") + + # Resolve domain size + data_B = B[resolved_name_B] + if len(data_A.shape) >= 5: + print( + " [Hyp] A data dims are >= 5, assuming savepoints/rank are the firt two and going A[0, 0, ::]" + ) + data_A = data_A[0, 0, ::] + if len(data_B.shape) >= 5: + print( + " [Hyp] B data dims are >= 5, assuming savepoints/rank are the firt two and going B[0, 0, ::]" + ) + data_B = data_B[0, 0, ::] + + if len(data_A.shape) != len(data_B.shape): + print( + f" [Failed] A is shape {len(data_A.shape)}, B is shape {len(data_B.shape)}: can't reconcile." + ) + continue + + # - Assume we have 0 == I, 1 == J now + resolved_I = None + A_uses_halo = False + B_uses_halo = False + if data_A.shape[0] != data_B.shape[0]: + if data_A.shape[0] < data_B.shape[0]: + if data_B.shape[0] - 2 * halo != data_A.shape[0]: + print( + f" [Failed] B in dim I is too big, even with halo substracted {data_B.shape[0]} (halo: {halo})" + ) + else: + B_uses_halo = True + resolved_I = data_A.shape[0] + else: + if data_A.shape[0] - 2 * halo != data_B.shape[0]: + print( + f" [Failed] A in dim I is too big, even with halo substracted {data_A.shape[0]} (halo: {halo})" + ) + else: + A_uses_halo = True + resolved_I = data_B.shape[0] + else: + resolved_I = data_A.shape[0] + + if resolved_I is None: + continue + + print(f" [Hyp] Using {resolved_I} as I dim size") + + resolved_J = None + if data_A.shape[1] != data_B.shape[1]: + if data_A.shape[1] < data_B.shape[1]: + if data_B.shape[1] - 2 * halo != data_A.shape[1]: + print( + f" [Failed] B in dim J is too big, even with halo substracted {data_B.shape[1]} (halo: {halo})" + ) + else: + resolved_J = data_A.shape[1] + else: + if data_A.shape[1] - 2 * halo != data_B.shape[1]: + print( + f" [Failed] A in dim J is too big, even with halo substracted {data_A.shape[1]} (halo: {halo})" + ) + else: + resolved_J = data_B.shape[1] + else: + resolved_J = data_A.shape[1] + + if resolved_J is None: + continue + + print(f" [Hyp] Using {resolved_J} as J dim size") + + # - Assume 2 == K + if data_A.shape[2] != data_B.shape[2]: + print( + f" [Failed] Can't reconcile K dim: A ({data_A.shape[2]}) != B ({data_B.shape[2]})" + ) + continue + resolved_K = data_A.shape[2] + + print(f" [Hyp] Using {resolved_K} as K dim size") + + # We should now be ready to diff + if A_uses_halo: + data_A = data_A[halo:-halo, halo:-halo, ::] + if B_uses_halo: + data_B = data_B[halo:-halo, halo:-halo, ::] + + dims = [f"D{i}_{s}" for i, s in enumerate(data_A.shape)] + absolute_diff = data_A.data - data_B.data + dataset[name_A] = xr.DataArray( + absolute_diff, + dims=dims, + ) + + # ULP diffs + max_values = np.maximum( + np.absolute(data_A.data.flatten()), np.absolute(data_B.data.flatten()) + ) + ulp_diff = np.divide(np.abs(absolute_diff.flatten()), np.spacing(max_values)) + ulp_diff = np.sort(ulp_diff) + dataset[f"ulp_{name_A}"] = xr.DataArray( + ulp_diff, + dims=[f"GP_{ulp_diff.shape[0]}"], + ) + + print(" [Success]") + + xr.Dataset(dataset).to_netcdf(f"best_guest_diff_{pathlib.Path(netcdf_A).stem}.nc4") + + +def entry_point(): + parser = get_parser() + args = parser.parse_args() + main( + netcdf_A=args.netcdf_A, + netcdf_B=args.netcdf_B, + name_mapping=args.name_mapping, + halo=args.halo, + ) + + +if __name__ == "__main__": + entry_point() diff --git a/setup.py b/setup.py index 453c4e07..43ecd501 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,7 @@ def local_pkg(name: str, relative_path: str) -> str: entry_points={ "console_scripts": [ "ndsl-serialbox_to_netcdf = ndsl.stencils.testing.serialbox_to_netcdf:entry_point", + "best_guess_diff = ndsl.stencils.testing.best_guess_diff:entry_point", ] }, ) From ab25952117cab5f6b2e252c89ef240ddae82a98c Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 2 Jul 2025 15:47:05 -0400 Subject: [PATCH 2/5] Add FieldBundle to debugger --- ndsl/__init__.py | 2 +- ndsl/debug/debugger.py | 3 +++ ndsl/quantity/quantity.py | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ndsl/__init__.py b/ndsl/__init__.py index f6bb6117..95400558 100644 --- a/ndsl/__init__.py +++ b/ndsl/__init__.py @@ -1,4 +1,5 @@ from . import dsl # isort:skip +from .logging import ndsl_log # isort:skip from .comm.communicator import CubedSphereCommunicator, TileCommunicator from .comm.local_comm import LocalComm from .comm.mpi import MPIComm @@ -22,7 +23,6 @@ from .halo.updater import HaloUpdater, HaloUpdateRequest, VectorInterfaceHaloUpdater from .initialization.allocator import QuantityFactory from .initialization.sizer import GridSizer, SubtileGridSizer -from .logging import ndsl_log from .monitor.netcdf_monitor import NetCDFMonitor from .namelist import Namelist from .performance.collector import NullPerformanceCollector, PerformanceCollector diff --git a/ndsl/debug/debugger.py b/ndsl/debug/debugger.py index 7e1f60fe..40199bf5 100644 --- a/ndsl/debug/debugger.py +++ b/ndsl/debug/debugger.py @@ -8,6 +8,7 @@ from ndsl.logging import ndsl_log from ndsl.quantity import Quantity +from ndsl.quantity.field_bundle import FieldBundle @dataclasses.dataclass @@ -86,6 +87,8 @@ def save_as_dataset(self, data_as_dict, savename, is_in) -> None: data_arrays[f"{name}.{field.name}"] = self._to_xarray( getattr(data, field.name), field.name ) + elif isinstance(data, FieldBundle): + data_arrays[name] = data.quantity.field_as_xarray else: data_arrays[name] = self._to_xarray(data, name) diff --git a/ndsl/quantity/quantity.py b/ndsl/quantity/quantity.py index a4f7bd52..d1dffa03 100644 --- a/ndsl/quantity/quantity.py +++ b/ndsl/quantity/quantity.py @@ -162,6 +162,8 @@ def from_data_array( def to_netcdf(self, path: str, name="var", rank: int = -1, all_data=False) -> None: if rank < 0 or MPI.COMM_WORLD.Get_rank() == rank: + if rank < 0: + rank = MPI.COMM_WORLD.Get_rank() if all_data: self.data_as_xarray.to_dataset(name=name).to_netcdf( f"{path}__r{rank}.nc4" From b3978cf49f6c17db1a90fec4403e67c4ec4d342f Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Wed, 2 Jul 2025 15:59:18 -0400 Subject: [PATCH 3/5] lint --- ndsl/stencils/testing/best_guess_diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ndsl/stencils/testing/best_guess_diff.py b/ndsl/stencils/testing/best_guess_diff.py index edbf25f6..155ac4e4 100644 --- a/ndsl/stencils/testing/best_guess_diff.py +++ b/ndsl/stencils/testing/best_guess_diff.py @@ -1,9 +1,9 @@ import argparse +import pathlib +import numpy as np import xarray as xr import yaml -import numpy as np -import pathlib def get_parser(): From bc5053f28852672a762a73c092764f8fbe4a640e Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 25 Nov 2025 08:38:50 -0500 Subject: [PATCH 4/5] Move executable to `pyproject` --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index a562bfb5..44173844 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ zarr = ["zarr<3"] [project.scripts] ndsl-serialbox_to_netcdf = "ndsl.stencils.testing.serialbox_to_netcdf:entry_point" +best_guess_diff = "ndsl.stencils.testing.best_guess_diff:entry_point" [project.urls] Repository = "https://github.com/NOAA-GFDL/NDSL" From c50f51d60151d2d6aa24e00a709faaf8dd315ef2 Mon Sep 17 00:00:00 2001 From: Florian Deconinck Date: Tue, 25 Nov 2025 08:40:51 -0500 Subject: [PATCH 5/5] Lint --- ndsl/stencils/testing/best_guess_diff.py | 14 +++++++------- pyproject.toml | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ndsl/stencils/testing/best_guess_diff.py b/ndsl/stencils/testing/best_guess_diff.py index 155ac4e4..7bb0ca82 100644 --- a/ndsl/stencils/testing/best_guess_diff.py +++ b/ndsl/stencils/testing/best_guess_diff.py @@ -6,21 +6,21 @@ import yaml -def get_parser(): +def get_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( "Attempt to diff two NetCDFs with similar data." - "Differences that can be reconcialed are strict domain vs halo, variable name mapping." - "They program will report on assumptions taken." + "Differences that can be reconciled are strict domain vs halo, variable name mapping." + "The program will report on assumptions taken." ) parser.add_argument( "netcdf_A", type=str, - help="path of NetCDFs, named A in the logs.", + help="path of first NetCDFs, named A in the logs.", ) parser.add_argument( "netcdf_B", type=str, - help="path of NetCDFs, named B in the logs.", + help="path of second NetCDFs, named B in the logs.", ) parser.add_argument( "-nm", @@ -43,7 +43,7 @@ def main( netcdf_B: str, name_mapping: str | None = None, halo: int = 3, -): +) -> None: A = xr.open_dataset(netcdf_A) B = xr.open_dataset(netcdf_B) name_map = {} @@ -179,7 +179,7 @@ def main( xr.Dataset(dataset).to_netcdf(f"best_guest_diff_{pathlib.Path(netcdf_A).stem}.nc4") -def entry_point(): +def entry_point() -> None: parser = get_parser() args = parser.parse_args() main( diff --git a/pyproject.toml b/pyproject.toml index 44173844..8bf3ab7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,8 +29,8 @@ test = ["pytest", "coverage"] zarr = ["zarr<3"] [project.scripts] -ndsl-serialbox_to_netcdf = "ndsl.stencils.testing.serialbox_to_netcdf:entry_point" best_guess_diff = "ndsl.stencils.testing.best_guess_diff:entry_point" +ndsl-serialbox_to_netcdf = "ndsl.stencils.testing.serialbox_to_netcdf:entry_point" [project.urls] Repository = "https://github.com/NOAA-GFDL/NDSL"