Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved input syntax for frequency_differencing operation #1106

Merged
merged 4 commits into from
Aug 31, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 21 additions & 143 deletions echopype/mask/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from ..utils.io import validate_source_ds_da
from ..utils.prov import add_processing_level, echopype_prov_attrs, insert_input_processing_level
from .freq_diff import _check_freq_diff_source_Sv, _parse_freq_diff_eq

# lookup table with key string operator and value as corresponding Python operator
str2ops = {
Expand Down Expand Up @@ -366,128 +367,11 @@ def get_ch_shape(da):
return output_ds


def _check_freq_diff_non_data_inputs(
freqAB: Optional[List[float]] = None,
chanAB: Optional[List[str]] = None,
operator: str = ">",
diff: Union[float, int] = None,
) -> None:
"""
Checks that the non-data related inputs of ``frequency_differencing`` (i.e. ``freqAB``,
``chanAB``, ``operator``, ``diff``) were correctly provided.

Parameters
----------
freqAB: list of float, optional
The pair of nominal frequencies to be used for frequency-differencing, where
the first element corresponds to ``freqA`` and the second element corresponds
to ``freqB``
chanAB: list of float, optional
The pair of channels that will be used to select the nominal frequencies to be
used for frequency-differencing, where the first element corresponds to ``freqA``
and the second element corresponds to ``freqB``
operator: {">", "<", "<=", ">=", "=="}
The operator for the frequency-differencing
diff: float or int
The threshold of Sv difference between frequencies
"""

# check that either freqAB or chanAB are provided and they are a list of length 2
if (freqAB is None) and (chanAB is None):
raise ValueError("Either freqAB or chanAB must be given!")
elif (freqAB is not None) and (chanAB is not None):
raise ValueError("Only freqAB or chanAB must be given, but not both!")
elif freqAB is not None:
if not isinstance(freqAB, list):
raise TypeError("freqAB must be a list!")
elif len(set(freqAB)) != 2:
raise ValueError("freqAB must be a list of length 2 with unique elements!")
else:
if not isinstance(chanAB, list):
raise TypeError("chanAB must be a list!")
elif len(set(chanAB)) != 2:
raise ValueError("chanAB must be a list of length 2 with unique elements!")

# check that operator is a string and a valid operator
if not isinstance(operator, str):
raise TypeError("operator must be a string!")
else:
if operator not in [">", "<", "<=", ">=", "=="]:
raise ValueError("Invalid operator!")

# ensure that diff is a float or an int
if not isinstance(diff, (float, int)):
raise TypeError("diff must be a float or int!")


def _check_source_Sv_freq_diff(
    source_Sv: xr.Dataset,
    freqAB: Optional[List[float]] = None,
    chanAB: Optional[List[str]] = None,
) -> None:
    """
    Validate ``source_Sv`` against the frequency-differencing selection.

    The Dataset must expose ``channel`` as a coordinate and ``frequency_nominal``
    as a variable, neither of them may hold repeated values, and every element of
    the provided list (``freqAB`` or ``chanAB``) must be present in
    ``frequency_nominal`` or ``channel``, respectively.

    Parameters
    ----------
    source_Sv: xr.Dataset
        A Dataset that contains the Sv data to create a mask for
    freqAB: list of float, optional
        The pair of nominal frequencies to be used for frequency-differencing, where
        the first element corresponds to ``freqA`` and the second element corresponds
        to ``freqB``
    chanAB: list of str, optional
        The pair of channels that will be used to select the nominal frequencies to be
        used for frequency-differencing, where the first element corresponds to ``freqA``
        and the second element corresponds to ``freqB``

    Raises
    ------
    ValueError
        If any of the conditions described above is violated
    """

    # both pieces of metadata have to exist before anything else can be verified
    if "channel" not in source_Sv.coords:
        raise ValueError("The Dataset defined by source_Sv must have channel as a coordinate!")
    if "frequency_nominal" not in source_Sv.variables:
        raise ValueError(
            "The Dataset defined by source_Sv must have frequency_nominal as a variable!"
        )

    # duplicates would make the channel <-> frequency selection ambiguous
    n_unique_channels = len(set(source_Sv.channel.values))
    if source_Sv.channel.size > n_unique_channels:
        raise ValueError(
            "The provided source_Sv contains repeated channel values, this is not allowed!"
        )

    n_unique_freqs = len(set(source_Sv.frequency_nominal.values))
    if source_Sv.frequency_nominal.size > n_unique_freqs:
        raise ValueError(
            "The provided source_Sv contains repeated frequency_nominal "
            "values, this is not allowed!"
        )

    # every requested frequency has to be available in frequency_nominal
    if freqAB is not None:
        unknown_freqs = [freq for freq in freqAB if freq not in source_Sv.frequency_nominal]
        if unknown_freqs:
            raise ValueError(
                "The provided list input freqAB contains values that "
                "are not in the frequency_nominal variable!"
            )

    # every requested channel has to be available in channel
    if chanAB is not None:
        unknown_chans = [chan for chan in chanAB if chan not in source_Sv.channel]
        if unknown_chans:
            raise ValueError(
                "The provided list input chanAB contains values that are "
                "not in the channel coordinate!"
            )


def frequency_differencing(
source_Sv: Union[xr.Dataset, str, pathlib.Path],
storage_options: Optional[dict] = {},
freqAB: Optional[List[float]] = None,
chanAB: Optional[List[str]] = None,
operator: str = ">",
diff: Union[float, int] = None,
freqABEq: Optional[str] = None,
chanABEq: Optional[str] = None,
) -> xr.DataArray:
"""
Create a mask based on the differences of Sv values using a pair of
Expand All @@ -504,19 +388,13 @@ def frequency_differencing(
storage_options: dict, optional
Any additional parameters for the storage backend, corresponding to the
path provided for ``source_Sv``
freqAB: list of float, optional
The pair of nominal frequencies to be used for frequency-differencing, where
the first element corresponds to ``freqA`` and the second element corresponds
to ``freqB``. Only one of ``freqAB`` and ``chanAB`` should be provided, and not both.
chanAB: list of strings, optional
The pair of channels that will be used to select the nominal frequencies to be
used for frequency-differencing, where the first element corresponds to ``freqA``
and the second element corresponds to ``freqB``. Only one of ``freqAB`` and ``chanAB``
freqABEq: string, optional
The frequency differencing criteria.
Only one of ``freqABEq`` and ``chanABEq`` should be provided, and not both.
chanABEq: string, optional
The frequency differencing criteria in terms of channel names where channel names
in the criteria are enclosed in double quotes. Only one of ``freqABEq`` and ``chanABEq``
should be provided, and not both.
operator: {">", "<", "<=", ">=", "=="}
The operator for the frequency-differencing
diff: float or int
The threshold of Sv difference between frequencies

Returns
-------
Expand All @@ -527,24 +405,24 @@ def frequency_differencing(
Raises
------
ValueError
If neither ``freqAB`` or ``chanAB`` are given
If neither ``freqABEq`` or ``chanABEq`` are given
ValueError
If both ``freqAB`` and ``chanAB`` are given
If both ``freqABEq`` and ``chanABEq`` are given
TypeError
If any input is not of the correct type
ValueError
If either ``freqAB`` or ``chanAB`` are provided and the list
does not contain 2 distinct elements
If either ``freqABEq`` or ``chanABEq`` are provided and the extracted
``freqAB`` or ``chanAB`` does not contain 2 distinct elements
ValueError
If ``freqAB`` contains values that are not contained in ``frequency_nominal``
If ``freqABEq`` contains values that are not contained in ``frequency_nominal``
ValueError
If ``chanAB`` contains values that not contained in ``channel``
If ``chanABEq`` contains values that not contained in ``channel``
ValueError
If ``operator`` is not one of the following: ``">", "<", "<=", ">=", "=="``
ValueError
If the path provided for ``source_Sv`` is not a valid path
ValueError
If ``freqAB`` or ``chanAB`` is provided and the Dataset produced by ``source_Sv``
If ``freqABEq`` or ``chanABEq`` is provided and the Dataset produced by ``source_Sv``
does not contain the coordinate ``channel`` and variable ``frequency_nominal``

Notes
Expand Down Expand Up @@ -573,9 +451,8 @@ def frequency_differencing(
>>> Sv_ds = xr.Dataset(data_vars={"Sv": Sv_da, "frequency_nominal": freq_nom})
...
>>> # compute frequency-differencing mask using channel names
>>> echopype.mask.frequency_differencing(source_Sv=mock_Sv_ds, storage_options={}, freqAB=None,
... chanAB = ['chan1', 'chan2'],
... operator = ">=", diff=10.0)
>>> echopype.mask.frequency_differencing(source_Sv=mock_Sv_ds, storage_options={},
... freqABEq=None, chanABEq = '"chan1" - "chan2">=10.0dB')
<xarray.DataArray 'mask' (ping_time: 5, range_sample: 5)>
array([[False, False, False, False, False],
[False, False, False, False, False],
Expand All @@ -588,7 +465,8 @@ def frequency_differencing(
"""

# check that non-data related inputs were correctly provided
_check_freq_diff_non_data_inputs(freqAB, chanAB, operator, diff)
# _check_freq_diff_non_data_inputs(freqAB, chanAB, operator, diff)
freqAB, chanAB, operator, diff = _parse_freq_diff_eq(freqABEq, chanABEq)

# validate the source_Sv type or path (if it is provided)
source_Sv, file_type = validate_source_ds_da(source_Sv, storage_options)
Expand All @@ -598,7 +476,7 @@ def frequency_differencing(
source_Sv = xr.open_dataset(source_Sv, engine=file_type, chunks={}, **storage_options)

# check the source_Sv with respect to channel and frequency_nominal
_check_source_Sv_freq_diff(source_Sv, freqAB, chanAB)
_check_freq_diff_source_Sv(source_Sv, freqAB, chanAB)

# determine chanA and chanB
if freqAB is not None:
Expand Down
149 changes: 149 additions & 0 deletions echopype/mask/freq_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import re
from typing import List, Optional, Union

import xarray as xr


def _parse_freq_diff_eq(
freqABEq: Optional[str] = None,
chanABEq: Optional[str] = None,
) -> List[Union[List[float], List[str], str, Union[float, int]]]:
"""
Checks if either `freqABEq` or `chanABEq` is provided and parse the arguments accordingly
from the frequency diffrencing criteria.

Parameters
----------
freqABEq : str, optional
The equation for frequency-differencing using frequency values.
chanABEq : str, optional
The equation for frequency-differencing using channel names.

Returns
-------
List[Union[List[float], List[str], str, Union[float, int]]]
A list containing the parsed arguments for frequency-differencing, where the first element
corresponds to `freqAB`, the second element corresponds to `chanAB`, the third element
corresponds to `operator`, the fourth element corresponds to `diff`.

Raises
------
ValueError
If `operator` is not a valid operator.
If both `freqABEq` and `chanABEq` are provided.
If neither `freqABEq` nor `chanABEq` is provided.
If `freqAB` or `chanAB` is not a list of length 2 with unique elements.
TypeError
If `diff` is not a float or an int.
If `freqABEq` or `chanABEq` is not a valid equation.
"""

if (freqABEq is None) and (chanABEq is None):
raise ValueError("Either freqAB or chanAB must be given!")
elif (freqABEq is not None) and (chanABEq is not None):
raise ValueError("Only one of freqAB or chanAB should be given, but not both!")
elif freqABEq is not None:
freqAPattern = r"(?P<freqA>\d*\.\d+)\s*(?P<unitA>\w?)Hz"
freqBPattern = r"(?P<freqB>\d*\.\d+)\s*(?P<unitB>\w?)Hz"
operatorPattern = r"\s*(?P<cmp>\S*?)\s*"
rhsPattern = r"(?P<db>\d*\.?\d+)\s*dB"
diffMatcher = re.compile(
freqAPattern + r"\s*-\s*" + freqBPattern + operatorPattern + rhsPattern
)
eqMatched = diffMatcher.match(freqABEq)
if eqMatched is None:
raise TypeError("Invalid freqAB Equation!")
operator = eqMatched["cmp"]
if operator not in [">", "<", "<=", ">=", "=="]:
raise ValueError("Invalid operator!")
freqMultiplier = {"": 1, "k": 1e3, "M": 1e6, "G": 1e9}
freqA = float(eqMatched["freqA"]) * freqMultiplier[eqMatched["unitA"]]
freqB = float(eqMatched["freqB"]) * freqMultiplier[eqMatched["unitB"]]
freqAB = [freqA, freqB]
if len(set(freqAB)) != 2:
raise ValueError("freqAB must be a list of length 2 with unique elements!")
diff = float(eqMatched["db"])
return [freqAB, None, operator, diff]
elif chanABEq is not None:
chanAPattern = r"(?P<chanA>\".+\")\s*"
chanBPattern = r"(?P<chanB>\".+\")\s*"
operatorPattern = r"\s*(?P<cmp>\S*?)\s*"
rhsPattern = r"(?P<db>\d*\.?\d+)\s*dB"
diffMatcher = re.compile(
chanAPattern + r"\s*-\s*" + chanBPattern + operatorPattern + rhsPattern
)
eqMatched = diffMatcher.match(chanABEq)
if eqMatched is None:
raise TypeError("Invalid chanAB Equation!")
operator = eqMatched["cmp"]
if operator not in [">", "<", "<=", ">=", "=="]:
raise ValueError("Invalid operator!")
chanAB = [eqMatched["chanA"][1:-1], eqMatched["chanB"][1:-1]]
if len(set(chanAB)) != 2:
raise ValueError("chanAB must be a list of length 2 with unique elements!")
diff = float(eqMatched["db"])
return [None, chanAB, operator, diff]


def _check_freq_diff_source_Sv(
    source_Sv: xr.Dataset,
    freqAB: Optional[List[float]] = None,
    chanAB: Optional[List[str]] = None,
) -> None:
    """
    Ensures that ``source_Sv`` contains ``channel`` as a coordinate and
    ``frequency_nominal`` as a variable, and that the provided list input
    (``freqAB`` or ``chanAB``) is contained in the variable ``frequency_nominal``
    or the coordinate ``channel``, respectively. Repeated values are only
    rejected for the quantity that is actually used for the selection, i.e.
    ``channel`` when ``chanAB`` is given and ``frequency_nominal`` when
    ``freqAB`` is given.

    Parameters
    ----------
    source_Sv: xr.Dataset
        A Dataset that contains the Sv data to create a mask for
    freqAB: list of float, optional
        The pair of nominal frequencies to be used for frequency-differencing, where
        the first element corresponds to ``freqA`` and the second element corresponds
        to ``freqB``
    chanAB: list of str, optional
        The pair of channels that will be used to select the nominal frequencies to be
        used for frequency-differencing, where the first element corresponds to ``freqA``
        and the second element corresponds to ``freqB``

    Raises
    ------
    ValueError
        If ``source_Sv`` lacks the ``channel`` coordinate or the ``frequency_nominal``
        variable, contains repeated values for the quantity being selected on, or
        does not contain all elements of the provided list input
    """

    # check that channel and frequency nominal are in source_Sv
    if "channel" not in source_Sv.coords:
        raise ValueError("The Dataset defined by source_Sv must have channel as a coordinate!")
    elif "frequency_nominal" not in source_Sv.variables:
        raise ValueError(
            "The Dataset defined by source_Sv must have frequency_nominal as a variable!"
        )

    # make sure that the channel values are not repeated in source_Sv and
    # elements of chanAB are in channel
    if chanAB is not None:
        if len(set(source_Sv.channel.values)) < source_Sv.channel.size:
            raise ValueError(
                "The provided source_Sv contains repeated channel values, this is not allowed!"
            )
        if not all([chan in source_Sv.channel for chan in chanAB]):
            raise ValueError(
                "The provided list input chanAB contains values that are "
                "not in the channel coordinate!"
            )

    # make sure that the frequency_nominal values are not repeated in source_Sv and
    # elements of freqAB are in frequency_nominal
    if freqAB is not None:
        if len(set(source_Sv.frequency_nominal.values)) < source_Sv.frequency_nominal.size:
            raise ValueError(
                "The provided source_Sv contains repeated "
                "frequency_nominal values, this is not allowed!"
            )

        if not all([freq in source_Sv.frequency_nominal for freq in freqAB]):
            raise ValueError(
                "The provided list input freqAB contains values that "
                "are not in the frequency_nominal variable!"
            )
Loading