diff --git a/docs/specification.md b/docs/specification.md index 3c92436e..5aa48bd6 100644 --- a/docs/specification.md +++ b/docs/specification.md @@ -42,7 +42,7 @@ The optional `extra` object is a free-form dictionary that can hold any addition ## The `nodes` group The nodes group will contain an `ids` array and optionally a `props` group. ### The `ids` array -The `nodes\ids` array is a 1D array of node IDs of length `N` >= 0, where `N` is the number of nodes in the graph. Node ids must be unique. Node IDs can have any type supported by zarr (except floats), but we recommend integer dtypes. For large graphs, `uint64` might be necessary to provide enough range for every node to have a unique ID. In the minimal case of an empty graph, the `ids` array will be present but empty. +The `nodes\ids` array is a 1D array of node IDs of length `N` >= 0, where `N` is the number of nodes in the graph. Node ids must be unique. Node IDs must have an unsigned integer dtype. For large graphs, `uint64` might be necessary to provide enough range for every node to have a unique ID. In the minimal case of an empty graph, the `ids` array will be present but empty. 
### The `props` group and `node property` groups diff --git a/justfile b/justfile index 6e95c97a..20fe6afa 100644 --- a/justfile +++ b/justfile @@ -8,7 +8,7 @@ test-cov: # run benchmarks benchmark: - uv run --group bench pytest tests/bench.py + uv run --group bench pytest tests/test_bench.py # build wheel and sdist build: diff --git a/src/geff/_graph_libs/_api_wrapper.py b/src/geff/_graph_libs/_api_wrapper.py index dcfdec52..27410d16 100644 --- a/src/geff/_graph_libs/_api_wrapper.py +++ b/src/geff/_graph_libs/_api_wrapper.py @@ -16,6 +16,7 @@ from geff._typing import PropDictNpArray from geff.metadata._schema import GeffMetadata + from geff.validate.data import ValidationConfig SupportedBackend = Literal["networkx", "rustworkx", "spatial-graph"] @@ -107,30 +108,33 @@ def get_construct_func(backend: SupportedBackend) -> ConstructFunc[Any]: @overload def read( store: StoreLike, - validate: bool = True, + structure_validation: bool = True, node_props: list[str] | None = None, edge_props: list[str] | None = None, backend: Literal["networkx"] = "networkx", + data_validation: ValidationConfig | None = None, ) -> tuple[nx.Graph | nx.DiGraph, GeffMetadata]: ... @overload def read( store: StoreLike, - validate: bool, + structure_validation: bool, node_props: list[str] | None, edge_props: list[str] | None, backend: Literal["rustworkx"], + data_validation: ValidationConfig | None = None, ) -> tuple[rx.PyGraph | rx.PyDiGraph, GeffMetadata]: ... @overload def read( store: StoreLike, - validate: bool, + structure_validation: bool, node_props: list[str] | None, edge_props: list[str] | None, backend: Literal["spatial-graph"], + data_validation: ValidationConfig | None = None, *, position_attr: str = "position", ) -> tuple[sg.SpatialGraph | sg.SpatialDiGraph, GeffMetadata]: ... 
@@ -138,10 +142,11 @@ def read( def read( store: StoreLike, - validate: bool = True, + structure_validation: bool = True, node_props: list[str] | None = None, edge_props: list[str] | None = None, backend: SupportedBackend = "networkx", + data_validation: ValidationConfig | None = None, **backend_kwargs: Any, ) -> tuple[Any, GeffMetadata]: """ @@ -150,7 +155,7 @@ def read( Args: store (StoreLike): The path or zarr store to the root of the geff zarr, where the .attrs contains the geff metadata. - validate (bool, optional): Flag indicating whether to perform validation on the + structure_validation (bool, optional): Flag indicating whether to perform validation on the geff file before loading into memory. If set to False and there are format issues, will likely fail with a cryptic error. Defaults to True. node_props (list of str, optional): The names of the node properties to load, @@ -159,6 +164,8 @@ def read( if None all properties will be loaded, defaults to None. backend ({"networkx", "rustworkx", "spatial-graph"}): Flag for the chosen backend, default is "networkx". + data_validation (ValidationConfig, optional): Optional configuration for which + optional types of data to validate. Each option defaults to False. backend_kwargs (Any): Additional kwargs that may be accepted by the backend when reading the data. @@ -166,7 +173,9 @@ def read( tuple[Any, GeffMetadata]: Graph object of the chosen backend, and the GEFF metadata. 
""" construct_func = get_construct_func(backend) - in_memory_geff = read_to_memory(store, validate, node_props, edge_props) + in_memory_geff = read_to_memory( + store, structure_validation, node_props, edge_props, data_validation + ) return ( construct_func(**in_memory_geff, **backend_kwargs), in_memory_geff["metadata"], diff --git a/src/geff/_graph_libs/_networkx.py b/src/geff/_graph_libs/_networkx.py index c4f610df..023232c4 100644 --- a/src/geff/_graph_libs/_networkx.py +++ b/src/geff/_graph_libs/_networkx.py @@ -18,6 +18,7 @@ from zarr.storage import StoreLike from geff._typing import PropDictNpArray + from geff.validate.data import ValidationConfig import logging @@ -190,9 +191,10 @@ def construct_nx( def read_nx( store: StoreLike, - validate: bool = True, + structure_validation: bool = True, node_props: list[str] | None = None, edge_props: list[str] | None = None, + data_validation: ValidationConfig | None = None, ) -> tuple[nx.Graph, GeffMetadata]: """Read a geff file into a networkx graph. Metadata properties will be stored in the graph properties, accessed via `G.graph[key]` where G is a networkx graph. @@ -200,18 +202,22 @@ def read_nx( Args: store (str | Path | zarr store): The path/str to the geff zarr, or the store itself. Opens in append mode, so will only overwrite geff-controlled groups. - validate (bool, optional): Flag indicating whether to perform validation on the + structure_validation (bool, optional): Flag indicating whether to perform validation on the geff file before loading into memory. If set to False and there are format issues, will likely fail with a cryptic error. Defaults to True. node_props (list of str, optional): The names of the node properties to load, if None all properties will be loaded, defaults to None. edge_props (list of str, optional): The names of the edge properties to load, if None all properties will be loaded, defaults to None. 
+ data_validation (ValidationConfig, optional): Optional configuration for which + optional types of data to validate. Each option defaults to False. Returns: A networkx graph containing the graph that was stored in the geff file format """ - in_memory_geff = read_to_memory(store, validate, node_props, edge_props) + in_memory_geff = read_to_memory( + store, structure_validation, node_props, edge_props, data_validation + ) graph = construct_nx(**in_memory_geff) return graph, in_memory_geff["metadata"] diff --git a/src/geff/_graph_libs/_rustworkx.py b/src/geff/_graph_libs/_rustworkx.py index d99fd11e..9690b962 100644 --- a/src/geff/_graph_libs/_rustworkx.py +++ b/src/geff/_graph_libs/_rustworkx.py @@ -28,6 +28,7 @@ from zarr.storage import StoreLike from geff._typing import PropDictNpArray + from geff.validate.data import ValidationConfig def get_roi_rx( @@ -250,9 +251,10 @@ def construct_rx( def read_rx( store: StoreLike, - validate: bool = True, + structure_validation: bool = True, node_props: list[str] | None = None, edge_props: list[str] | None = None, + data_validation: ValidationConfig | None = None, ) -> tuple[rx.PyGraph | rx.PyDiGraph, GeffMetadata]: """Read a geff file into a rustworkx graph. Metadata properties will be stored in the graph.attrs dict @@ -264,16 +266,20 @@ def read_rx( Args: store: The path/str to the geff zarr, or the store itself. - validate: Whether to validate the geff file. + structure_validation: Whether to validate the geff file. node_props: The names of the node properties to load, if None all properties will be loaded, defaults to None. edge_props: The names of the edge properties to load, if None all properties will be loaded, defaults to None. + data_validation (ValidationConfig, optional): Optional configuration for which + optional types of data to validate. Each option defaults to False. Returns: A tuple containing the rustworkx graph and the metadata. 
""" - graph_dict = read_to_memory(store, validate, node_props, edge_props) + graph_dict = read_to_memory( + store, structure_validation, node_props, edge_props, data_validation + ) graph = construct_rx(**graph_dict) return graph, graph_dict["metadata"] diff --git a/src/geff/_graph_libs/_spatial_graph.py b/src/geff/_graph_libs/_spatial_graph.py index 3b11d605..61e87874 100644 --- a/src/geff/_graph_libs/_spatial_graph.py +++ b/src/geff/_graph_libs/_spatial_graph.py @@ -21,6 +21,7 @@ from zarr.storage import StoreLike from geff._typing import PropDictNpArray + from geff.validate.data import ValidationConfig import geff from geff.core_io import write_arrays @@ -116,10 +117,11 @@ def write_sg( def read_sg( store: StoreLike, - validate: bool = True, + structure_validation: bool = True, position_attr: str = "position", node_props: list[str] | None = None, edge_props: list[str] | None = None, + data_validation: ValidationConfig | None = None, ) -> tuple[sg.SpatialGraph | sg.SpatialDiGraph, GeffMetadata]: """Read a geff file into a SpatialGraph. @@ -129,37 +131,31 @@ def read_sg( Args: store (Path | str | zarr store): - The path to the root of the geff zarr, where the .attrs contains the geff metadata. - - validate (bool, optional): - + structure_validation (bool, optional): Flag indicating whether to perform validation on the geff file before loading into memory. If set to False and there are format issues, will likely fail with a cryptic error. Defaults to True. - position_attr (str, optional): - How to call the position attribute in the returned SpatialGraph. Defaults to "position". - node_props (list of str, optional): - The names of the node properties to load, if None all properties will be loaded, defaults to None. - edge_props (list of str, optional): - The names of the edge properties to load, if None all properties will be loaded, defaults to None. 
+ data_validation (ValidationConfig, optional): Optional configuration for which + optional types of data to validate. Each option defaults to False. Returns: - A tuple containing the spatial_graph graph and the metadata. """ - in_memory_geff = read_to_memory(store, validate, node_props, edge_props) + in_memory_geff = read_to_memory( + store, structure_validation, node_props, edge_props, data_validation + ) graph = construct_sg(**in_memory_geff, position_attr=position_attr) return graph, in_memory_geff["metadata"] diff --git a/src/geff/core_io/_base_read.py b/src/geff/core_io/_base_read.py index 644074f8..88fcabba 100644 --- a/src/geff/core_io/_base_read.py +++ b/src/geff/core_io/_base_read.py @@ -8,6 +8,7 @@ from geff import _path from geff.core_io import _utils from geff.metadata._schema import GeffMetadata +from geff.validate.data import ValidationConfig, validate_data from geff.validate.structure import validate_structure if TYPE_CHECKING: @@ -214,9 +215,10 @@ def build( # added to this function to select between them. def read_to_memory( source: StoreLike, - validate: bool = True, + structure_validation: bool = True, node_props: Iterable[str] | None = None, edge_props: Iterable[str] | None = None, + data_validation: ValidationConfig | None = None, ) -> InMemoryGeff: """ Read a GEFF zarr file to into memory as a series of numpy arrays in a dictionary. @@ -227,9 +229,11 @@ def read_to_memory( Args: source (str | Path | zarr store): Either a path to the root of the geff zarr (where the .attrs contains the geff metadata), or a zarr store object - validate (bool, optional): Flag indicating whether to perform validation on the - geff file before loading into memory. If set to False and there are - format issues, will likely fail with a cryptic error. Defaults to True. + structure_validation (bool, optional): Flag indicating whether to perform metadata/structure + validation on the geff file before loading into memory. 
If set to False and + there are format issues, will likely fail with a cryptic error. Defaults to True. + data_validation (ValidationConfig, optional): Optional configuration for which + optional types of data to validate. Each option defaults to False. node_props (iterable of str, optional): The names of the node properties to load, if None all properties will be loaded, defaults to None. edge_props (iterable of str, optional): The names of the edge properties to load, @@ -240,10 +244,14 @@ def read_to_memory( (metadata, node_ids, edge_ids, node_props, edge_props) """ - file_reader = GeffReader(source, validate) + file_reader = GeffReader(source, structure_validation) file_reader.read_node_props(node_props) file_reader.read_edge_props(edge_props) in_memory_geff = file_reader.build() + + if data_validation is not None: + validate_data(config=data_validation, memory_geff=in_memory_geff) + return in_memory_geff diff --git a/src/geff/testing/data.py b/src/geff/testing/data.py index c319a5ae..a2b54086 100644 --- a/src/geff/testing/data.py +++ b/src/geff/testing/data.py @@ -70,8 +70,9 @@ if TYPE_CHECKING: from numpy.typing import NDArray + DTypeStr = Literal["double", "int", "int8", "uint8", "int16", "uint16", "float32", "float64", "str"] -NodeIdDTypeStr = Literal["int", "int8", "uint8", "int16", "uint16"] +NodeIdDTypeStr = Literal["uint", "uint8", "uint16", "uint32", "uint64"] Axes = Literal["t", "z", "y", "x"] @@ -509,7 +510,7 @@ def create_simple_2d_geff( >>> # graph is a networkx Graph with 2D spatial data (x, y, t) """ return create_memory_mock_geff( - node_id_dtype="int", + node_id_dtype="uint", node_axis_dtypes={"position": "float64", "time": "float64"}, directed=directed, num_nodes=num_nodes, @@ -568,7 +569,7 @@ def create_simple_3d_geff( >>> x, y, z, t = node_data['x'], node_data['y'], node_data['z'], node_data['t'] """ return create_memory_mock_geff( - node_id_dtype="int", + node_id_dtype="uint", node_axis_dtypes={"position": "float64", "time": "float64"}, 
directed=directed, num_nodes=num_nodes, @@ -616,7 +617,7 @@ def create_simple_temporal_geff( >>> # Each node has only 't' coordinate, no x, y, z """ return create_memory_mock_geff( - node_id_dtype="int", + node_id_dtype="uint", node_axis_dtypes={"position": "float64", "time": "float64"}, directed=directed, num_nodes=num_nodes, diff --git a/src/geff/validate/data.py b/src/geff/validate/data.py new file mode 100644 index 00000000..f2558b75 --- /dev/null +++ b/src/geff/validate/data.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +from pydantic import BaseModel + +from geff.validate.graph import ( + validate_no_repeated_edges, + validate_no_self_edges, + validate_nodes_for_edges, + validate_unique_node_ids, +) +from geff.validate.shapes import validate_ellipsoid, validate_sphere +from geff.validate.tracks import ( + validate_lineages, + validate_tracklets, +) + +if TYPE_CHECKING: + from geff._typing import InMemoryGeff + + +class ValidationConfig(BaseModel): + graph: bool = False + sphere: bool = False + ellipsoid: bool = False + lineage: bool = False + tracklet: bool = False + + +def validate_data(memory_geff: InMemoryGeff, config: ValidationConfig) -> None: + """Validate the data of a geff based on the options selected in ValidationConfig + + Args: + memory_geff (InMemoryGeff): An InMemoryGeff which contains metadata and + dictionaries of node/edge property arrays + config (ValidationConfig): Configuration for which validation to run + """ + meta = memory_geff["metadata"] + + if config.graph: + node_ids = memory_geff["node_ids"] + edge_ids = memory_geff["edge_ids"] + + valid, nonunique_nodes = validate_unique_node_ids(node_ids) + if not valid: + raise ValueError(f"Some node ids are not unique:\n{nonunique_nodes}") + + valid, invalid_edges = validate_nodes_for_edges(node_ids, edge_ids) + if not valid: + raise ValueError(f"Some edges are missing nodes:\n{invalid_edges}") + + valid, invalid_edges = 
validate_no_self_edges(edge_ids) + if not valid: + raise ValueError(f"Self edges found in data:\n{invalid_edges}") + + valid, invalid_edges = validate_no_repeated_edges(edge_ids) + if not valid: + raise ValueError(f"Repeated edges found in data:\n{invalid_edges}") + + if config.sphere and meta.sphere is not None: + radius = memory_geff["node_props"][meta.sphere]["values"] + validate_sphere(radius) + + if config.ellipsoid and meta.ellipsoid is not None: + covariance = memory_geff["node_props"][meta.ellipsoid]["values"] + validate_ellipsoid(covariance, memory_geff["metadata"].axes) + + if meta.track_node_props is not None: + if config.tracklet and "tracklet" in meta.track_node_props: + node_ids = memory_geff["node_ids"] + edge_ids = memory_geff["edge_ids"] + tracklet_key = meta.track_node_props["tracklet"] + tracklet_ids = memory_geff["node_props"][tracklet_key]["values"] + valid, errors = validate_tracklets(node_ids, edge_ids, tracklet_ids) + if not valid: + raise ValueError("Found invalid tracklets:\n", "\n".join(errors)) + + if config.lineage and "lineage" in meta.track_node_props: + node_ids = memory_geff["node_ids"] + edge_ids = memory_geff["edge_ids"] + lineage_key = meta.track_node_props["lineage"] + lineage_ids = memory_geff["node_props"][lineage_key]["values"] + valid, errors = validate_lineages(node_ids, edge_ids, lineage_ids) + if not valid: + raise ValueError("Found invalid lineages:\n", "\n".join(errors)) diff --git a/src/geff/validate/graph.py b/src/geff/validate/graph.py index 2aacf724..da6ecb05 100644 --- a/src/geff/validate/graph.py +++ b/src/geff/validate/graph.py @@ -8,9 +8,26 @@ from numpy.typing import ArrayLike -def validate_nodes_for_edges( - node_ids: ArrayLike, edge_ids: ArrayLike -) -> tuple[bool, list[tuple[int, int]]]: +def validate_unique_node_ids(node_ids: ArrayLike) -> tuple[bool, np.ndarray]: + """Validates that all node ids are unique + + Args: + node_ids (ArrayLike): 1D arraylike of node ids + + Returns: + tuple[bool, np.ndarray]: + 
- valid (bool): True if all node ids are unique + - errors (np.ndarray): Array of any non-unique node ids + """ + node_ids = np.asarray(node_ids) + + unique_ids, counts = np.unique(node_ids, return_counts=True) + if any(counts > 1): + return False, unique_ids[counts > 1] + return True, np.array([]) + + +def validate_nodes_for_edges(node_ids: ArrayLike, edge_ids: ArrayLike) -> tuple[bool, np.ndarray]: """ Validates that all edges in `edge_ids` reference node IDs present in `node_ids`. @@ -24,9 +41,9 @@ def validate_nodes_for_edges( (source, target). Returns: - tuple[bool, list[tuple[int, int]]]: + tuple[bool, np.ndarray]: - all_edges_valid (bool): True if all edges reference valid node IDs. - - invalid_edges (list of tuple[int, int]): List of (source, target) pairs for + - invalid_edges (np.ndarray): Array of (source, target) pairs for invalid edges. """ @@ -39,8 +56,8 @@ def validate_nodes_for_edges( mask = valid_src & valid_tgt # Find invalid edges - invalid_edges = [tuple(edge) for edge in edge_ids[~mask]] - all_edges_valid = not invalid_edges + invalid_edges = edge_ids[~mask] + all_edges_valid = len(invalid_edges) == 0 return all_edges_valid, invalid_edges diff --git a/src/geff/validate/shapes.py b/src/geff/validate/shapes.py new file mode 100644 index 00000000..a4177395 --- /dev/null +++ b/src/geff/validate/shapes.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np + +if TYPE_CHECKING: + from geff.metadata._schema import Axis + + +def validate_ellipsoid(covariance: np.ndarray, axes: list[Axis] | None) -> None: + """Validate that ellipsoid data has a valid covariance matrix + + The first axis of the covariance array corresponds to the number of nodes. The + remaining axes correspond to the number of spatial axes. 
+ + Args: + covariance (np.ndarray): Covariance array stored as values for an ellipsoid property + axes (list[Axis]): List of Axis metadata + + Raises: + ValueError: Must define space axes in order to have ellipsoid data + ValueError: Ellipsoid covariance matrix must have 1 + number of spatial dimensions + ValueError: Spatial dimensions of covariance matrix must be equal + ValueError: Ellipsoid covariance matrices must be symmetric + ValueError: Ellipsoid covariance matrices must be positive-definite + """ + bad_axes = True + spatial_dim = 0 + # Axes need to exist and contain spatial data + if axes is not None: + for ax in axes: + if ax.type == "space": + spatial_dim += 1 + + if spatial_dim > 0: + bad_axes = False + + if bad_axes: + raise ValueError("Must define space axes in order to have ellipsoid data") + + if covariance.ndim != (exp_dim := spatial_dim + 1): + raise ValueError( + f"Ellipsoid covariance matrix must have {exp_dim} dimensions, got {covariance.ndim}" + ) + + if covariance.shape[1] != covariance.shape[2]: + raise ValueError( + f"Spatial dimensions of covariance matrix must be equal, got {covariance.shape[1:]}" + ) + + transpose = [0, *list(range(covariance.ndim - 1, 0, -1))] + if not np.allclose(covariance, np.transpose(covariance, axes=transpose)): + raise ValueError("Ellipsoid covariance matrices must be symmetric") + + if not np.all(np.linalg.eigvals(covariance) > 0): + raise ValueError("Ellipsoid covariance matrices must be positive-definite") + + +def validate_sphere(radius: np.ndarray) -> None: + """Validate that sphere data has nonzero radii and is 1d + + Args: + radius (np.ndarray): Values array of a sphere property + + Raises: + ValueError: Sphere radius values must be non-negative + ValueError: Sphere radius values must be 1D + """ + if radius.ndim != 1: + raise ValueError(f"Sphere radius values must be 1D, got {radius.ndim} dimensions") + + if np.any(radius < 0): + raise ValueError("Sphere radius values must be non-negative.") diff --git 
a/src/geff/validate/structure.py b/src/geff/validate/structure.py index ae69cb63..e75e0a65 100644 --- a/src/geff/validate/structure.py +++ b/src/geff/validate/structure.py @@ -9,12 +9,10 @@ from geff.core_io._utils import expect_array, expect_group, open_storelike if TYPE_CHECKING: - from collections.abc import Mapping - from zarr.storage import StoreLike -from geff.metadata import GeffMetadata, PropMetadata +from geff.metadata import GeffMetadata def validate_structure(store: StoreLike) -> None: @@ -37,47 +35,36 @@ def validate_structure(store: StoreLike) -> None: nodes_group = expect_group(graph_group, _path.NODES) _validate_nodes_group(nodes_group, metadata) - # TODO: Do we want to prevent missing values on spatialtemporal properties - if _path.EDGES in graph_group.keys(): - edges_group = expect_group(graph_group, _path.EDGES) - _validate_edges_group(edges_group, metadata) + edges_group = expect_group(graph_group, _path.EDGES) + _validate_edges_group(edges_group, metadata) + # Metadata based validation + if metadata.axes is not None: + _validate_axes_structure(graph_group, metadata) -def _validate_props_metadata( - props_metadata_dict: Mapping[str, PropMetadata], - component_props: zarr.Group, - component_type: str, -) -> None: - """Validate that properties described in metadata are compatible with the data in zarr arrays. 
- Args: - props_metadata_dict (dict): Dictionary of property metadata with identifier keys - and PropMetadata values - component_props (zarr.Group): Zarr group containing the component properties (nodes - or edges) - component_type (str): Component type for error messages ("Node" or "Edge") +def _validate_axes_structure(graph: zarr.Group, meta: GeffMetadata) -> None: + """Verify that any metadata regarding axes is actually present in the data - Raises: - AssertionError: If properties in metadata don't match zarr arrays - """ - for prop in props_metadata_dict.values(): - prop_id = prop.identifier - # Properties described in metadata should be present in zarr arrays - if not isinstance(props_group := component_props.get(prop_id), zarr.Group): - raise ValueError( - f"{component_type} property {prop_id} described in metadata is not present " - f"in props arrays" - ) + - Property exists with name matching Axis name + - Data is 1D + - Missing values not allowed - # dtype in metadata should match dtype in zarr arrays - values_array = expect_array(props_group, _path.VALUES, component_type) - array_dtype = values_array.dtype - prop_dtype = np.dtype(prop.dtype).type - if array_dtype != prop_dtype: - raise ValueError( - f"{component_type} property {prop_id} with dtype {array_dtype} does not match " - f"metadata dtype {prop_dtype}" + Args: + graph (zarr.Group): The zarr group containing the geff metadata + meta (GeffMetadata): Metadata from geff + """ + if meta.axes is not None: + node_prop_group = expect_group(graph, "nodes/props") + for ax in meta.axes: + # Array must be present without missing values + assert f"{ax.name}/values" in node_prop_group, f"Axis {ax.name} data is missing" + assert f"{ax.name}/missing" not in node_prop_group, ( + f"Axis {ax.name} has missing values which are not allowed" ) + # Only 1d data allowed, already checked length of first axis + ndim = len(expect_array(node_prop_group, f"{ax.name}/values").shape) + assert ndim == 1, f"Axis property 
{ax.name} has {ndim} dimensions, must be 1D" def _validate_props_group( @@ -108,25 +95,33 @@ def _validate_props_group( ) if _path.MISSING in arrays: - miss_len = cast("zarr.Array", prop_group[_path.MISSING]).shape[0] + missing_arr = cast("zarr.Array", prop_group[_path.MISSING]) + miss_len = missing_arr.shape[0] if miss_len != expected_len: raise ValueError( f"{parent_key} property {prop_name!r} {_path.MISSING} mask has length " f"{miss_len}, which does not match id length {expected_len}" ) + if not np.issubdtype(missing_arr.dtype, np.bool_): + raise ValueError( + f"{parent_key} property {prop_name!r} {_path.MISSING} must be boolean" + ) + def _validate_nodes_group(nodes_group: zarr.Group, metadata: GeffMetadata) -> None: """Validate the structure of a nodes group in a GEFF zarr store.""" node_ids = expect_array(nodes_group, _path.IDS, _path.NODES) + + # Node ids must be int dtype + # TODO: enforce uint + if not np.issubdtype(np.dtype(node_ids.dtype), np.integer): + raise ValueError("Node ids must have an integer dtype") + id_len = node_ids.shape[0] node_props = expect_group(nodes_group, _path.PROPS, _path.NODES) _validate_props_group(node_props, id_len, "Node") - # Node properties metadata validation - if metadata.node_props_metadata is not None: - _validate_props_metadata(metadata.node_props_metadata, node_props, "Node") - def _validate_edges_group(edges_group: zarr.Group, metadata: GeffMetadata) -> None: """Validate the structure of an edges group in a GEFF zarr store.""" @@ -147,6 +142,3 @@ def _validate_edges_group(edges_group: zarr.Group, metadata: GeffMetadata) -> No f"{_path.EDGES!r} group must contain a {_path.PROPS!r} group. 
Got {type(edge_props)}" ) _validate_props_group(edge_props, edge_id_len, "Edge") - # Edge properties metadata validation - if metadata.edge_props_metadata is not None: - _validate_props_metadata(metadata.edge_props_metadata, edge_props, "Edge") diff --git a/tests/test_bench.py b/tests/test_bench.py index c1f377a4..a5eaf37d 100644 --- a/tests/test_bench.py +++ b/tests/test_bench.py @@ -80,9 +80,9 @@ def graph_file_path(num_nodes: int) -> Path: # ########################### TESTS ################################## READ_PATH: Mapping[Callable, Callable[[Path], tuple[Any, Any]]] = { - geff.read_nx: lambda path: geff.read_nx(path, validate=False), - geff.read_rx: lambda path: geff.read_rx(path, validate=False), - geff.read_sg: lambda path: geff.read_sg(path, validate=False), + geff.read_nx: lambda path: geff.read_nx(path, structure_validation=False), + geff.read_rx: lambda path: geff.read_rx(path, structure_validation=False), + geff.read_sg: lambda path: geff.read_sg(path, structure_validation=False), } @@ -111,4 +111,4 @@ def test_bench_validate(benchmark: BenchmarkFixture, nodes: int) -> None: @pytest.mark.parametrize("read_func", [geff.read_nx, geff.read_rx, geff.read_sg]) def test_bench_read(read_func: Callable, benchmark: BenchmarkFixture, nodes: int) -> None: graph_path = graph_file_path(nodes) - benchmark(read_func, graph_path, validate=False) + benchmark(read_func, graph_path, structure_validation=False) diff --git a/tests/test_core_io/test_base_read.py b/tests/test_core_io/test_base_read.py index a39d931d..2197dfd1 100644 --- a/tests/test_core_io/test_base_read.py +++ b/tests/test_core_io/test_base_read.py @@ -3,7 +3,9 @@ from geff import GeffReader from geff._graph_libs._networkx import construct_nx -from geff.testing.data import create_memory_mock_geff +from geff.core_io._base_read import read_to_memory +from geff.testing.data import create_memory_mock_geff, create_simple_2d_geff +from geff.validate.data import ValidationConfig node_id_dtypes = ["int8", 
"uint8", "int16", "uint16"] node_axis_dtypes = [ @@ -180,3 +182,10 @@ def test_read_edge_props() -> None: ) _ = construct_nx(**in_memory_geff) + + +def test_read_to_memory(): + # Mostly testing that conditionals run correctly since functionality is tested elsewhere + store, attrs = create_simple_2d_geff() + + read_to_memory(store, structure_validation=True, data_validation=ValidationConfig()) diff --git a/tests/test_testing/test_data.py b/tests/test_testing/test_testing_data.py similarity index 100% rename from tests/test_testing/test_data.py rename to tests/test_testing/test_testing_data.py diff --git a/tests/test_validate/test_graph.py b/tests/test_validate/test_graph.py index bc5a22aa..da082e24 100644 --- a/tests/test_validate/test_graph.py +++ b/tests/test_validate/test_graph.py @@ -4,6 +4,7 @@ validate_no_repeated_edges, validate_no_self_edges, validate_nodes_for_edges, + validate_unique_node_ids, ) @@ -72,3 +73,10 @@ def test_detects_repeated_edges() -> None: assert not is_valid, "Validator should detect repeated edges." assert [0, 1] in repeated_edges.tolist(), "Edge [0, 1] should be reported as repeated." assert len(repeated_edges) == 1, "There should be exactly one unique repeated edge." 
+ + +def test_validate_unique_node_ids() -> None: + node_ids = np.array([0, 0, 1, 2, 3]) + valid, nonunique = validate_unique_node_ids(node_ids) + assert not valid + assert nonunique == np.array([0]) diff --git a/tests/test_validate/test_segmentation.py b/tests/test_validate/test_segmentation.py index 1f344c0b..8a45a715 100644 --- a/tests/test_validate/test_segmentation.py +++ b/tests/test_validate/test_segmentation.py @@ -14,7 +14,7 @@ @pytest.fixture def valid_store_and_attrs(): store, graphattrs = create_memory_mock_geff( - node_id_dtype="int", + node_id_dtype="uint", node_axis_dtypes={"position": "float64", "time": "float64"}, directed=True, num_nodes=5, @@ -31,7 +31,7 @@ def valid_store_and_attrs(): @pytest.fixture def invalid_store_and_attrs(): store, graphattrs = create_memory_mock_geff( - node_id_dtype="int", + node_id_dtype="uint8", node_axis_dtypes={"position": "float64", "time": "float64"}, directed=True, num_nodes=5, diff --git a/tests/test_validate/test_shapes.py b/tests/test_validate/test_shapes.py new file mode 100644 index 00000000..ce4adaea --- /dev/null +++ b/tests/test_validate/test_shapes.py @@ -0,0 +1,66 @@ +from typing import ClassVar + +import numpy as np +import pytest + +from geff.metadata._schema import Axis +from geff.validate.shapes import validate_ellipsoid, validate_sphere + + +class Test_validate_ellipsoid: + axes_2d: ClassVar[list[Axis]] = [Axis(name="x", type="space"), Axis(name="y", type="space")] + axes_3d: ClassVar[list[Axis]] = [ + Axis(name="x", type="space"), + Axis(name="y", type="space"), + Axis(name="z", type="space"), + ] + + def test_axes(self): + arr = np.ones((10, 2, 2)) + # Must provided axes + with pytest.raises( + ValueError, match="Must define space axes in order to have ellipsoid data" + ): + validate_ellipsoid(arr, None) + + # Axes must be spatial + axes = [Axis(name="t", type="time"), Axis(name="c", type="channel")] + with pytest.raises( + ValueError, match="Must define space axes in order to have ellipsoid data" 
+ ): + validate_ellipsoid(arr, axes) + + def test_square_matrix(self): + arr = np.ones((10, 2, 5)) + with pytest.raises( + ValueError, match="Spatial dimensions of covariance matrix must be equal" + ): + validate_ellipsoid(arr, self.axes_2d) + + def test_ndim(self): + arr = np.ones((10, 2, 2)) + with pytest.raises(ValueError, match="Ellipsoid covariance matrix must have .* dimensions"): + validate_ellipsoid(arr, self.axes_3d) + + def test_symmetric(self): + arr = np.ones((10, 2, 2)) + arr[:, 0, 1] = 0 + with pytest.raises(ValueError, match="Ellipsoid covariance matrices must be symmetric"): + validate_ellipsoid(arr, self.axes_2d) + + def test_pos_def(self): + arr = np.ones((10, 2, 2)) + with pytest.raises( + ValueError, match="Ellipsoid covariance matrices must be positive-definite" + ): + validate_ellipsoid(arr, self.axes_2d) + + +def test_validate_sphere(): + # Not 1d + with pytest.raises(ValueError, match="Sphere radius values must be 1D"): + validate_sphere(np.ones((2, 2, 2))) + + # Not positive + with pytest.raises(ValueError, match="Sphere radius values must be non-negative."): + validate_sphere(np.full((2), fill_value=-1)) diff --git a/tests/test_validate/test_structure.py b/tests/test_validate/test_structure.py index 19c29986..559ac3f2 100644 --- a/tests/test_validate/test_structure.py +++ b/tests/test_validate/test_structure.py @@ -1,218 +1,185 @@ from __future__ import annotations import copy -import re -from typing import TYPE_CHECKING import numpy as np import pytest import zarr import zarr.storage -from geff import validate_structure +from geff import _path, validate_structure from geff.core_io._base_read import read_to_memory from geff.core_io._base_write import write_arrays -from geff.core_io._utils import open_storelike +from geff.core_io._utils import expect_group, open_storelike +from geff.metadata._schema import GeffMetadata from geff.testing._utils import check_equiv_geff -from geff.testing.data import create_simple_2d_geff - -if TYPE_CHECKING: 
- from pathlib import Path - - -def test_validate_structure(tmp_path: Path) -> None: - # Does not exist - with pytest.raises(FileNotFoundError, match=r"Path does not exist: does-not-exist"): - validate_structure("does-not-exist") - - # remote zarr path does not raise existence error - remote_path = "https://blah.com/test.zarr" - with pytest.raises(ValueError, match=r"store must be a zarr StoreLike"): - validate_structure(remote_path) - - # Path exists but is not a zarr store - non_zarr_path = tmp_path / "not-a-zarr" - non_zarr_path.mkdir() - with pytest.raises(ValueError, match=r"store must be a zarr StoreLike"): - validate_structure(non_zarr_path) - - zpath = tmp_path / "test.zarr" - z = zarr.open_group(zpath) - - # Missing metadata - with pytest.raises(ValueError, match="No geff key found in"): - validate_structure(zpath) - z.attrs["geff"] = { - "geff_version": "0.0.1", - "directed": True, - "roi_min": [0, 0], - "roi_max": [100, 100], - } - - # No nodes - with pytest.raises(ValueError, match="'graph' group must contain a group named 'nodes'"): - validate_structure(zpath) - nodes = z.create_group("nodes") - - # Nodes missing ids - with pytest.raises(ValueError, match="'nodes' group must contain an 'ids' array"): - validate_structure(zpath) - n_node = 10 - z["nodes/ids"] = np.zeros(n_node) - - # Nodes must have a props group - with pytest.raises(ValueError, match="'nodes' group must contain a group named 'props'"): - validate_structure(zpath) - nodes.create_group("props") - - # Subgroups in props must have values - nodes.create_group("props/score") - with pytest.raises(ValueError, match="Node property group 'score' must have a 'values' array"): - validate_structure(zpath) - z["nodes/props/score/values"] = np.zeros(n_node) - validate_structure(zpath) - - # Property shape mismatch - z["nodes/props/badshape/values"] = np.zeros(n_node * 2) - with pytest.raises( - ValueError, - match=( - f"Node property 'badshape' values has length {n_node * 2}, " - f"which does not 
match id length {n_node}" - ), - ): - validate_structure(zpath) - - del z["nodes/props"]["badshape"] - # Property missing shape mismatch - z["nodes/props/badshape/values"] = np.zeros(shape=(n_node)) - z["nodes/props/badshape/missing"] = np.zeros(shape=(n_node * 2)) - with pytest.raises( - ValueError, - match=( - f"Node property 'badshape' missing mask has length {n_node * 2}, " - f"which does not match id length {n_node}" - ), - ): - validate_structure(zpath) - del z["nodes/props"]["badshape"] - - # No edge group is okay, if the graph has no edges - z.create_group("edges") - - # Missing edge ids - with pytest.raises(ValueError, match="'edges' group must contain an 'ids' array"): - validate_structure(zpath) - - # ids array must have last dim size 2 - n_edges = 5 - badshape = (n_edges, 3) - z["edges/ids"] = np.zeros(badshape) - with pytest.raises( - ValueError, - match=re.escape( - f"edges ids must have a last dimension of size 2, received shape {badshape}" - ), - ): - validate_structure(zpath) - del z["edges"]["ids"] - z["edges/ids"] = np.zeros((n_edges, 2)) - - # Property values shape mismatch - z["edges/props/badshape/values"] = np.zeros((n_edges * 2, 2)) - with pytest.raises( - ValueError, - match=( - f"Edge property 'badshape' values has length {n_edges * 2}, " - f"which does not match id length {n_edges}" - ), - ): - validate_structure(zpath) - del z["edges/props/badshape"]["values"] - - # Property missing shape mismatch - z["edges/props/badshape/values"] = np.zeros((n_edges, 2)) - z["edges/props/badshape/missing"] = np.zeros((n_edges * 2, 2)) - with pytest.raises( - ValueError, - match=( - f"Edge property 'badshape' missing mask has length {n_edges * 2}, " - f"which does not match id length {n_edges}" - ), - ): - validate_structure(zpath) - del z["edges/props/badshape"]["missing"] - - # Nodes: property metadata has no matching data - geff_attrs = z.attrs["geff"] - geff_attrs["node_props_metadata"] = { - "prop1": {"identifier": "prop1", "dtype": "float32"}, - 
"prop2": {"identifier": "prop2", "dtype": "int"}, - } - z.attrs["geff"] = geff_attrs - with pytest.raises( - ValueError, - match="Node property prop1 described in metadata is not present in props arrays", - ): - validate_structure(zpath) - - # Nodes: inconsistent property metadata dtype - z["nodes/props/prop1/values"] = np.zeros(n_node, dtype=np.float32) - z["nodes/props/prop2/values"] = np.zeros(n_node, dtype=np.float32) - with pytest.raises( - ValueError, - match=( - "Node property prop2 with dtype float32 does not match " - "metadata dtype " - ), - ): - validate_structure(zpath) - # Another type of dtype mismatch - z["nodes/props/prop2/values"] = np.zeros(n_node, dtype="int16") - with pytest.raises( - ValueError, - match=( - "Node property prop2 with dtype int16 does not match " - "metadata dtype " - ), - ): - validate_structure(zpath) - z["nodes/props/prop2/values"] = np.zeros(n_node, dtype="int") # clean up - - # Edges: property metadata has no matching data - geff_attrs["edge_props_metadata"] = { - "prop3": {"identifier": "prop3", "dtype": "bool"}, - } - z.attrs["geff"] = geff_attrs - with pytest.raises( - ValueError, - match="Edge property prop3 described in metadata is not present in props arrays", - ): - validate_structure(zpath) - - # Edges: inconsistent property metadata dtype - z["edges/props/prop3/values"] = np.zeros(n_edges, dtype=np.float32) - with pytest.raises( - ValueError, - match=( - r"Edge property prop3 with dtype float32 does not match " - r"metadata dtype " - ), - ): - validate_structure(zpath) - z["edges/props/prop3/values"] = np.zeros(n_edges, dtype="bool") # clean up - - # No error raised when property with no matching prop metadata - z["nodes/props/prop4/values"] = np.zeros(n_node, dtype="bool") - z["edges/props/prop4/values"] = np.zeros(n_edges, dtype="uint8") - - # No error when identical property identifiers across node and edge props - geff_attrs["node_props_metadata"] = {"prop4": {"identifier": "prop4", "dtype": "bool"}} - 
geff_attrs["edge_props_metadata"] = {"prop4": {"identifier": "prop4", "dtype": "uint8"}} - z.attrs["geff"] = geff_attrs - - # Everything passes - validate_structure(zpath) +from geff.testing.data import ( + create_memory_mock_geff, + create_simple_2d_geff, +) +from geff.validate.structure import ( + _validate_axes_structure, + _validate_edges_group, + _validate_nodes_group, + _validate_props_group, +) + + +@pytest.fixture +def z() -> zarr.Group: + store, attrs = create_memory_mock_geff( + node_id_dtype="uint", + node_axis_dtypes={"position": "float64", "time": "float64"}, + directed=False, + num_nodes=10, + num_edges=15, + extra_node_props={"score": "float64"}, + extra_edge_props={"score": "float64", "color": "int"}, + include_t=True, + include_z=True, # 3D includes z + include_y=True, + include_x=True, + ) + return zarr.open_group(store) + + +@pytest.fixture +def meta(z) -> GeffMetadata: + return GeffMetadata.read(z.store) + + +@pytest.fixture +def node_group(z) -> zarr.Group: + return expect_group(z, _path.NODES) + + +@pytest.fixture +def edge_group(z) -> zarr.Group: + return expect_group(z, _path.EDGES) + + +class TestValidateStructure: + def test_valid_geff(self, z): + validate_structure(z.store) + + def test_missing_metadata(self, z): + del z.attrs["geff"] + + # Missing metadata + with pytest.raises(ValueError, match="No geff key found in"): + validate_structure(z.store) + + def test_no_nodes_group(self, z): + del z[_path.NODES] + with pytest.raises( + ValueError, match=f"'graph' group must contain a group named '{_path.NODES}'" + ): + validate_structure(z.store) + + def test_no_edges(self, z): + del z["edges"] + with pytest.raises( + ValueError, match=f"'graph' group must contain a group named '{_path.EDGES}'" + ): + validate_structure(z.store) + + +class Test_validate_nodes_group: + def test_no_node_ids(self, node_group, meta): + del node_group[_path.IDS] + with pytest.raises( + ValueError, match=f"'{_path.NODES}' group must contain an '{_path.IDS}' array" + 
): + _validate_nodes_group(node_group, meta) + + def test_no_node_props_group(self, node_group, meta): + del node_group[_path.PROPS] + # Nodes must have a props group + with pytest.raises( + ValueError, match=f"'{_path.NODES}' group must contain a group named '{_path.PROPS}'" + ): + _validate_nodes_group(node_group, meta) + + def test_ids_not_int(self, node_group, meta): + node_group[_path.IDS] = node_group[_path.IDS][:].astype("float") + with pytest.raises(ValueError, match="Node ids must have an integer dtype"): + _validate_nodes_group(node_group, meta) + + # TODO: Must be positive integers + # node_group[_path.IDS] = node_group[_path.IDS][:] * -1 + # with pytest.raises(ValueError, match="Node ids must have an integer dtype"): + # _validate_nodes_group(node_group, meta) + + # Other cases are caught in tests for _validate_props_group + + +class Test_validate_edges_group: + def test_no_edge_ids(self, edge_group, meta): + del edge_group[_path.IDS] + with pytest.raises( + ValueError, match=f"'{_path.EDGES}' group must contain an '{_path.IDS}' array" + ): + _validate_edges_group(edge_group, meta) + + def test_edge_ids_bad_shape(self, edge_group, meta): + edge_group[_path.IDS] = np.zeros((3, 3)) + with pytest.raises( + ValueError, + match="edges ids must have a last dimension of size 2, received shape .*", + ): + _validate_edges_group(edge_group, meta) + + # Other cases are caught in tests for _validate_props_group + + +class Test_validate_props_group: + def test_node_prop_no_values(self, node_group): + # Subgroups in props must have values + key = "t" + del node_group[_path.PROPS][key][_path.VALUES] + id_len = node_group[_path.IDS].shape[0] + with pytest.raises( + ValueError, match=f"Node property group '{key}' must have a '{_path.VALUES}' array" + ): + _validate_props_group(node_group[_path.PROPS], id_len, "Node") + + def test_node_prop_shape_mismatch(self, node_group): + # Property shape mismatch + key = "badshape" + node_group[f"{_path.PROPS}/{key}/{_path.VALUES}"] 
= np.zeros(1) + id_len = node_group[_path.IDS].shape[0] + with pytest.raises( + ValueError, + match=( + f"Node property '{key}' values has length {1}, which does not match id length .*" + ), + ): + _validate_props_group(node_group[_path.PROPS], id_len, "Node") + + def test_node_prop_missing_mismatch(self, node_group): + # Property missing shape mismatch + key = "t" + node_group[f"{_path.PROPS}/{key}/{_path.MISSING}"] = np.zeros(shape=(1)) + id_len = node_group[_path.IDS].shape[0] + with pytest.raises( + ValueError, + match=( + f"Node property '{key}' missing mask has length 1, " + "which does not match id length .*" + ), + ): + _validate_props_group(node_group[_path.PROPS], id_len, "Node") + + def test_missing_dtype(self, node_group): + # missing arrays must be boolean + key = "score" + node_group[f"{_path.PROPS}/{key}/{_path.MISSING}"] = np.zeros( + node_group[f"{_path.PROPS}/{key}/{_path.VALUES}"].shape, dtype="float" + ) + id_len = node_group[_path.IDS].shape[0] + + with pytest.raises(ValueError, match=f"Node property '{key}' missing must be boolean"): + _validate_props_group(node_group[_path.PROPS], id_len, "Node") def test_open_storelike(tmp_path): @@ -237,57 +204,77 @@ def test_open_storelike(tmp_path): open_storelike(group) -def test_check_equiv_geff(): - def _write_new_store(in_mem): +class Test_check_equiv_geff: + store, attrs = create_simple_2d_geff(num_nodes=10, num_edges=15) + in_mem = read_to_memory(store) + + def _write_new_store(self, in_mem): store = zarr.storage.MemoryStore() write_arrays(store, **in_mem) return store - store, attrs = create_simple_2d_geff(num_nodes=10, num_edges=15) - - # Check that two exactly same geffs pass - check_equiv_geff(store, store) - - # Create in memory version to mess with - in_mem = read_to_memory(store) - - # Id shape mismatch - bad_store, attrs = create_simple_2d_geff(num_nodes=5) - with pytest.raises(ValueError, match=r".* ids shape: .* does not match .*"): - check_equiv_geff(store, bad_store) - - # Missing props 
- bad_mem = copy.deepcopy(in_mem) - bad_mem["node_props"] = {} - bad_store = _write_new_store(bad_mem) - with pytest.raises(ValueError, match=".* properties: a .* does not match b .*"): - check_equiv_geff(store, bad_store) - - # Warn if one has missing but other doesn't - bad_mem = copy.deepcopy(in_mem) - bad_mem["edge_props"]["score"]["missing"] = np.zeros( - bad_mem["edge_props"]["score"]["values"].shape, dtype=np.bool_ - ) - bad_store = _write_new_store(bad_mem) - with pytest.raises(UserWarning, match=".* contains missing but the other does not"): - check_equiv_geff(bad_store, store) - - # Values shape mismatch - bad_mem = copy.deepcopy(in_mem) - # Add extra dimension to an edge prop - bad_mem["edge_props"]["score"]["values"] = bad_mem["edge_props"]["score"]["values"][ - ..., np.newaxis - ] - bad_store = _write_new_store(bad_mem) - with pytest.raises(ValueError, match=r".* shape: .* does not match b .*"): - check_equiv_geff(store, bad_store) - - # Values dtype mismatch - bad_mem = copy.deepcopy(in_mem) - # Change dtype - bad_mem["edge_props"]["score"]["values"] = ( - bad_mem["edge_props"]["score"]["values"].astype("int").squeeze() - ) - bad_store = _write_new_store(bad_mem) - with pytest.raises(ValueError, match=r".* dtype: .* does not match b .*"): - check_equiv_geff(store, bad_store) + def test_same_geff(self): + # Check that two exactly same geffs pass + check_equiv_geff(self.store, self.store) + + def test_id_shape_mismatch(self): + # Id shape mismatch + bad_store, attrs = create_simple_2d_geff(num_nodes=5) + with pytest.raises(ValueError, match=r".* ids shape: .* does not match .*"): + check_equiv_geff(self.store, bad_store) + + def test_props_mismatch(self): + bad_mem = copy.deepcopy(self.in_mem) + bad_mem["node_props"]["new prop"] = bad_mem["node_props"]["t"] + bad_store = self._write_new_store(bad_mem) + with pytest.raises(ValueError, match=".* properties: a .* does not match b .*"): + check_equiv_geff(self.store, bad_store) + + def 
test_only_one_with_missing(self): + bad_mem = copy.deepcopy(self.in_mem) + bad_mem["edge_props"]["score"]["missing"] = np.zeros( + bad_mem["edge_props"]["score"]["values"].shape, dtype=np.bool_ + ) + bad_store = self._write_new_store(bad_mem) + with pytest.raises(UserWarning, match=".* contains missing but the other does not"): + check_equiv_geff(bad_store, self.store) + + def test_value_shape_mismatch(self): + bad_mem = copy.deepcopy(self.in_mem) + # Add extra dimension to an edge prop + bad_mem["edge_props"]["score"]["values"] = bad_mem["edge_props"]["score"]["values"][ + ..., np.newaxis + ] + bad_store = self._write_new_store(bad_mem) + with pytest.raises(ValueError, match=r".* shape: .* does not match b .*"): + check_equiv_geff(self.store, bad_store) + + def test_value_dtype_mismatch(self): + # Values dtype mismatch + bad_mem = copy.deepcopy(self.in_mem) + # Change dtype + bad_mem["edge_props"]["score"]["values"] = ( + bad_mem["edge_props"]["score"]["values"].astype("int").squeeze() + ) + bad_store = self._write_new_store(bad_mem) + with pytest.raises(ValueError, match=r".* dtype: .* does not match b .*"): + check_equiv_geff(self.store, bad_store) + + +class Test_validate_axes_structure: + def test_missing_axes_prop(self, z, meta): + key = "x" + del z[_path.NODE_PROPS][key] + with pytest.raises(AssertionError, match=f"Axis {key} data is missing"): + _validate_axes_structure(z, meta) + + def test_must_be_1d(self, z, meta): + z[f"{_path.NODE_PROPS}/x/{_path.VALUES}"] = np.zeros((10, 2)) + with pytest.raises(AssertionError, match="Axis property x has 2 dimensions, must be 1D"): + _validate_axes_structure(z, meta) + + def test_no_missing_values(self, z, meta): + z[f"{_path.NODE_PROPS}/x/{_path.VALUES}"] = np.zeros((10,)) + z[f"{_path.NODE_PROPS}/x/{_path.MISSING}"] = np.zeros((10,)) + with pytest.raises(AssertionError, match="Axis x has missing values which are not allowed"): + _validate_axes_structure(z, meta) diff --git a/tests/test_validate/test_validate_data.py 
b/tests/test_validate/test_validate_data.py new file mode 100644 index 00000000..6e46f755 --- /dev/null +++ b/tests/test_validate/test_validate_data.py @@ -0,0 +1,142 @@ +import numpy as np +import pytest + +import geff.validate.data +from geff.core_io._base_read import read_to_memory +from geff.testing.data import create_memory_mock_geff, create_simple_2d_geff +from geff.validate.data import ValidationConfig, validate_data + + +class Test_validate_data: + store, attrs = create_simple_2d_geff() + memory_geff = read_to_memory(store) + + def test_valid_graph(self): + # test valid + validate_data(self.memory_geff, ValidationConfig(graph=True)) + + def test_nodes_for_edges(self, monkeypatch): + # error on validate_nodes_for_edges + monkeypatch.setattr( + geff.validate.data, "validate_nodes_for_edges", lambda node_ids, edge_ids: (False, []) + ) + with pytest.raises(ValueError, match="Some edges are missing nodes"): + validate_data(self.memory_geff, ValidationConfig(graph=True)) + + def test_no_self_edges(self, monkeypatch): + monkeypatch.setattr( + geff.validate.data, "validate_no_self_edges", lambda edge_ids: (False, []) + ) + with pytest.raises(ValueError, match="Self edges found in data"): + validate_data(self.memory_geff, ValidationConfig(graph=True)) + + def test_no_repeated_edges(self, monkeypatch): + monkeypatch.setattr( + geff.validate.data, "validate_no_repeated_edges", lambda edge_ids: (False, []) + ) + with pytest.raises(ValueError, match="Repeated edges found in data"): + validate_data(self.memory_geff, ValidationConfig(graph=True)) + + def test_unique_node_ids(self, monkeypatch): + monkeypatch.setattr( + geff.validate.data, "validate_unique_node_ids", lambda node_ids: (False, []) + ) + with pytest.raises(ValueError, match="Some node ids are not unique"): + validate_data(self.memory_geff, ValidationConfig(graph=True)) + + def test_sphere(self): + # Invalid spheres are tested in test_shapes + # Only need to test a valid case + store, _ = 
create_memory_mock_geff( + node_id_dtype="int", + node_axis_dtypes={"position": "float64", "time": "float64"}, + directed=True, + num_nodes=10, + num_edges=10, + extra_node_props={"radius": "int"}, + include_t=True, + include_z=False, # 2D only + include_y=True, + include_x=True, + ) + memory_geff = read_to_memory(store) + # Add sphere metadata + memory_geff["metadata"].sphere = "radius" + + validate_data(config=ValidationConfig(sphere=True), memory_geff=memory_geff) + + def test_ellipsoid(self): + # Invalid ellipsoids are tested in test_shapes + # Only need to test a valid case + store, _ = create_memory_mock_geff( + node_id_dtype="int", + node_axis_dtypes={"position": "float64", "time": "float64"}, + directed=True, + num_nodes=10, + num_edges=10, + extra_node_props={"covariance2d": "float64"}, + include_t=True, + include_z=False, # 2D only + include_y=True, + include_x=True, + ) + memory_geff = read_to_memory(store) + # Add ellipsoid metadata + memory_geff["metadata"].ellipsoid = "covariance2d" + # Overwrite ellipsoid values + covar = np.ones((10, 2, 2)) + covar[:, 0, 0] = 2 + covar[:, 1, 1] = 2 + memory_geff["node_props"]["covariance2d"]["values"] = covar + + validate_data(config=ValidationConfig(ellipsoid=True), memory_geff=memory_geff) + + def test_tracklet(self, monkeypatch): + # validate_tracklets is tested in test_graph + # Just need to trigger the value error + store, _ = create_memory_mock_geff( + node_id_dtype="int", + node_axis_dtypes={"position": "float64", "time": "float64"}, + directed=True, + num_nodes=10, + num_edges=10, + extra_node_props={"tracklet": "int"}, + include_t=True, + include_z=False, # 2D only + include_y=True, + include_x=True, + ) + memory_geff = read_to_memory(store) + # Add tracklet metadata + memory_geff["metadata"].track_node_props = {"tracklet": "tracklet"} + + # Monkeypatch validate tracklets to return false + monkeypatch.setattr(geff.validate.data, "validate_tracklets", lambda x, y, z: (False, [])) + + with 
pytest.raises(ValueError, match="Found invalid tracklets"):
+            validate_data(config=ValidationConfig(tracklet=True), memory_geff=memory_geff)
+
+    def test_lineages(self, monkeypatch):
+        # validate_lineages is tested in test_graph
+        # Just need to trigger the value error
+        store, _ = create_memory_mock_geff(
+            node_id_dtype="int",
+            node_axis_dtypes={"position": "float64", "time": "float64"},
+            directed=True,
+            num_nodes=10,
+            num_edges=10,
+            extra_node_props={"lineage": "int"},
+            include_t=True,
+            include_z=False,  # 2D only
+            include_y=True,
+            include_x=True,
+        )
+        memory_geff = read_to_memory(store)
+        # Add lineage metadata
+        memory_geff["metadata"].track_node_props = {"lineage": "lineage"}
+
+        # Monkeypatch validate_lineages to return false
+        monkeypatch.setattr(geff.validate.data, "validate_lineages", lambda x, y, z: (False, []))
+
+        with pytest.raises(ValueError, match="Found invalid lineages"):
+            validate_data(config=ValidationConfig(lineage=True), memory_geff=memory_geff)