diff --git a/mokapot/_nnls.py b/mokapot/_nnls.py
deleted file mode 100644
index 8321549..0000000
--- a/mokapot/_nnls.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Note: this is a patched copy of
-# scipy/optimize/_nnls.py
-# Commit: https://github.com/scipy/scipy/commit/3cae6dc85d2288519cce3d3112e1c30e1f470cad
-# Branch: main (#18570)
-# Tags: v1.12.0 v1.12.0rc2 v1.12.0rc1
-# Reason was a bug in the original version line 139 (here 151) marked by the
-# comment "# C.1". In the paper the algorithm was based on, there is a
-# comparison to 0 in step C.1, but here was a comparison with `tol`, which
-# sometimes led to non-convergent iterations and a subsequent RuntimeError.
-# This was fixed in this file by changing
-# `<= tol` to `< 0`, which is the correct form to check for a constraint
-# violation. If this is patched upstream in scipy, we can remove this file
-# and the corresponding import in peps.py.
-
-import numpy as np
-from scipy.linalg import solve
-
-
-__all__ = ["nnls"]
-
-
-def nnls(A, b, maxiter=None, *, atol=None):
-    """
-    Solve ``argmin_x || Ax - b ||_2`` for ``x>=0``.
-
-    This problem, often called as NonNegative Least Squares, is a convex
-    optimization problem with convex constraints. It typically arises when
-    the ``x`` models quantities for which only nonnegative values are
-    attainable; weight of ingredients, component costs and so on.
-
-    Parameters
-    ----------
-    A : (m, n) ndarray
-        Coefficient array
-    b : (m,) ndarray, float
-        Right-hand side vector.
-    maxiter: int, optional
-        Maximum number of iterations, optional. Default value is ``3 * n``.
-    atol: float
-        Tolerance value used in the algorithm to assess closeness to zero in
-        the projected residual ``(A.T @ (A x - b)`` entries. Increasing this
-        value relaxes the solution constraints. A typical relaxation value can
-        be selected as ``max(m, n) * np.linalg.norm(a, 1) * np.spacing(1.)``.
-        This value is not set as default since the norm operation becomes
-        expensive for large problems hence can be used only when necessary.
-
-    Returns
-    -------
-    x : ndarray
-        Solution vector.
-    rnorm : float
-        The 2-norm of the residual, ``|| Ax-b ||_2``.
-
-    See Also
-    --------
-    lsq_linear : Linear least squares with bounds on the variables
-
-    Notes
-    -----
-    The code is based on [2]_ which is an improved version of the classical
-    algorithm of [1]_. It utilizes an active set method and solves the KKT
-    (Karush-Kuhn-Tucker) conditions for the non-negative least squares problem.
-
-    References
-    ----------
-    .. [1] : Lawson C., Hanson R.J., "Solving Least Squares Problems", SIAM,
-       1995, :doi:`10.1137/1.9781611971217`
-    .. [2] : Bro, Rasmus and de Jong, Sijmen, "A Fast Non-Negativity-
-       Constrained Least Squares Algorithm", Journal Of Chemometrics, 1997,
-       :doi:`10.1002/(SICI)1099-128X(199709/10)11:5<393::AID-CEM483>3.0.CO;2-L`
-
-    Examples
-    --------
-    >>> import numpy as np
-    >>> from scipy.optimize import nnls
-    >>> A = np.array([[1, 0], [1, 0], [0, 1]])
-    >>> b = np.array([2, 1, 1])
-    >>> nnls(A, b)
-    (array([1.5, 1. ]), 0.7071067811865475)
-
-    >>> b = np.array([-1, -1, -1])
-    >>> nnls(A, b)
-    (array([0., 0.]), 1.7320508075688772)
-
-    """
-
-    A = np.asarray_chkfinite(A)
-    b = np.asarray_chkfinite(b)
-
-    if len(A.shape) != 2:
-        raise ValueError(
-            "Expected a two-dimensional array (matrix)"
-            + f", but the shape of A is {A.shape}"
-        )
-    if len(b.shape) != 1:
-        raise ValueError(
-            "Expected a one-dimensional array (vector)"
-            + f", but the shape of b is {b.shape}"
-        )
-
-    m, n = A.shape
-
-    if m != b.shape[0]:
-        raise ValueError(
-            "Incompatible dimensions. The first dimension of "
-            + f"A is {m}, while the shape of b is {(b.shape[0],)}"
-        )
-
-    x, rnorm, mode = _nnls(A, b, maxiter, tol=atol)
-    if mode != 1:
-        raise RuntimeError("Maximum number of iterations reached.")
-
-    return x, rnorm
-
-
-def _nnls(A, b, maxiter=None, tol=None):
-    """
-    This is a single RHS algorithm from ref [2] above. For multiple RHS
-    support, the algorithm is given in :doi:`10.1002/cem.889`
-    """
-    m, n = A.shape
-
-    AtA = A.T @ A
-    Atb = b @ A  # Result is 1D - let NumPy figure it out
-
-    if not maxiter:
-        maxiter = 3 * n
-    if tol is None:
-        tol = 10 * max(m, n) * np.spacing(1.0)
-
-    # Initialize vars
-    x = np.zeros(n, dtype=np.float64)
-
-    # Inactive constraint switches
-    P = np.zeros(n, dtype=bool)
-
-    # Projected residual
-    resid = Atb.copy().astype(np.float64)  # x=0. Skip (-AtA @ x) term
-
-    # Overall iteration counter
-    # Outer loop is not counted, inner iter is counted across outer spins
-    iter = 0
-
-    while (not P.all()) and (resid[~P] > tol).any():  # B
-        # Get the "most" active coeff index and move to inactive set
-        resid[P] = -np.inf
-        k = np.argmax(resid)  # B.2
-        P[k] = True  # B.3
-
-        # Iteration solution
-        s = np.zeros(n, dtype=np.float64)
-        P_ind = P.nonzero()[0]
-        s[P] = solve(
-            AtA[P_ind[:, None], P_ind[None, :]],
-            Atb[P],
-            assume_a="sym",
-            check_finite=False,
-        )  # B.4
-
-        # Inner loop
-        while (iter < maxiter) and (s[P].min() < 0):  # C.1
-            alpha_ind = ((s < 0) & P).nonzero()
-            alpha = (x[alpha_ind] / (x[alpha_ind] - s[alpha_ind])).min()  # C.2
-            x *= 1 - alpha
-            x += alpha * s
-            P[x <= 0] = False
-            s[P] = solve(
-                AtA[np.ix_(P, P)], Atb[P], assume_a="sym", check_finite=False
-            )
-            s[~P] = 0  # C.6
-            iter += 1
-
-        x[:] = s[:]
-        resid = Atb - AtA @ x
-
-    if iter == maxiter:
-        # Typically following line should return
-        # return x, np.linalg.norm(A@x - b), -1
-        # however at the top level, -1 raises an exception wasting norm
-        # Instead return dummy number 0.
-        return x, 0.0, -1
-
-    return x, np.linalg.norm(A @ x - b), 1
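
For context on the deletion above: the vendored file's only functional deviation from upstream scipy was in step C.1, where `<= tol` was replaced by `< 0` as the constraint-violation check. Removing the file and raising the scipy floor to 1.13.0 (see the pyproject.toml hunk below) implies an equivalent fix has landed upstream, making `scipy.optimize.nnls` a drop-in replacement. A minimal sketch of the call, reusing the values from the deleted docstring's doctest:

    import numpy as np
    from scipy.optimize import nnls  # requires scipy>=1.13.0 per pyproject.toml

    # Solve argmin_x ||A @ x - b||_2 subject to x >= 0.
    A = np.array([[1.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    b = np.array([2.0, 1.0, 1.0])
    x, rnorm = nnls(A, b)
    print(x, rnorm)  # [1.5 1. ] 0.7071067811865475

    # An all-negative right-hand side drives the solution to the
    # nonnegativity boundary:
    x, rnorm = nnls(A, np.array([-1.0, -1.0, -1.0]))
    print(x, rnorm)  # [0. 0.] 1.7320508075688772
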
diff --git a/mokapot/confidence_writer.py b/mokapot/confidence_writer.py
index 1a5e85a..8faf8af 100644
--- a/mokapot/confidence_writer.py
+++ b/mokapot/confidence_writer.py
@@ -80,13 +80,18 @@
     data_iterator : Iterator[pd.DataFrame]
         An iterator that yields chunks of data as pandas DataFrames.
     q_value_iterator : Iterable[np.array]
-        A iterator that yields numpy arrays containing the q-values for each data chunk.
+        An iterator that yields numpy arrays containing the q-values for each
+        data chunk.
     pep_iterator : Iterable[np.array]
-        A iterator that yields numpy arrays containing the posterior error probabilities for each data chunk.
+        An iterator that yields numpy arrays containing the posterior error
+        probabilities for each data chunk.
     target_iterator : Iterable[np.array]
-        A iterator that yields numpy arrays indicating whether each data point is a target or decoy for each data chunk.
+        An iterator that yields numpy arrays indicating whether each data
+        point is a target or decoy for each data chunk.
     out_paths : list[Path]
-        A list of output file paths where the confidence data will be written. The first element contains the path for the targets and the second those for the decoys.
+        A list of output file paths where the confidence data will be written.
+        The first element contains the path for the targets and the second
+        those for the decoys.
     decoys : bool
         A boolean flag indicating whether to include decoy data in the output.
     level : str
@@ -96,7 +101,8 @@
     qvalue_column : str, optional
         The name of the column to store the q-values. Default is 'q_value'.
     pep_column : str, optional
-        The name of the column to store the posterior error probabilities. Default is 'posterior_error_prob'.
+        The name of the column to store the posterior error probabilities.
+        Default is 'posterior_error_prob'.
 
     Returns
     -------
diff --git a/mokapot/config.py b/mokapot/config.py
index 60df3ef..ca44052 100644
--- a/mokapot/config.py
+++ b/mokapot/config.py
@@ -390,9 +390,9 @@
         "--sqlite_db_path",
         default=None,
         type=Path,
-        help="Optionally, sets a path to an MSAID sqlite result database for writing "
-        "outputs to. If not set (None), results are written in the standard TSV "
-        "format.",
+        help="Optionally, sets a path to an MSAID sqlite result database "
+        "for writing outputs to. If not set (None), results are "
+        "written in the standard TSV format.",
     )
 
     return parser
diff --git a/mokapot/peps.py b/mokapot/peps.py
index 72f10db..9aace66 100644
--- a/mokapot/peps.py
+++ b/mokapot/peps.py
@@ -3,11 +3,7 @@
 import matplotlib.pyplot as plt
 from triqler import qvality
 
-# TODO: Remove the next and uncomment the 2nd next line when
-# scipy.optimize.nnls is fixed (see _nnls.py for explanation)
-from ._nnls import nnls
-
-# from scipy.optimize import nnls
+from scipy.optimize import nnls
 
 PEP_ALGORITHM = {
     "qvality": lambda scores, targets: peps_from_scores_qvality(
diff --git a/mokapot/streaming.py b/mokapot/streaming.py
index 3341e58..b2fdcae 100644
--- a/mokapot/streaming.py
+++ b/mokapot/streaming.py
@@ -151,10 +151,10 @@ def get_chunked_data_iterator(
 @typechecked
 class MergedTabularDataReader(TabularDataReader):
     """
-    Merges data from multiple tabular data sources vertically into a single data
-    source, ordering the rows (one by one) by the value of a priority column.
-    I.e. for each output row, the row of the input readers with the highest
-    value of the priority column is picked.
+    Merges data from multiple tabular data sources vertically into a single
+    data source, ordering the rows (one by one) by the value of a priority
+    column. I.e. for each output row, the row of the input readers with the
+    highest value of the priority column is picked.
 
     Attributes:
     -----------
diff --git a/mokapot/tabular_data.py b/mokapot/tabular_data.py
index ced01df..5ced745 100644
--- a/mokapot/tabular_data.py
+++ b/mokapot/tabular_data.py
@@ -53,8 +53,8 @@ def get_score_column_type(suffix):
 @typechecked
 class TabularDataReader(ABC):
     """
-    An abstract class that represents a source for tabular data that can be read
-    in either completely or chunk-wise.
+    An abstract class that represents a source for tabular data that can be
+    read in either completely or chunk-wise.
 
     Implementations can be classes that either read from files, from memory
     (e.g. data frames), combine or modify other readers or represent computed
@@ -118,7 +118,8 @@ class ColumnMappedReader(TabularDataReader):
     reader : TabularDataReader
         The underlying reader for the original data.
     column_map : dict[str, str]
-        A dictionary that maps the original column names to the new column names.
+        A dictionary that maps the original column names to the new
+        column names.
     """
     def __init__(self, reader: TabularDataReader, column_map: dict[str, str]):
         self.reader = reader
@@ -437,8 +438,9 @@ def auto_finalize(writers: list[TabularDataWriter]):
 @typechecked
 class BufferedWriter(TabularDataWriter):
     """
-    This class represents a buffered writer for tabular data. It allows writing data to a tabular data writer in
-    batches, reducing the number of write operations.
+    This class represents a buffered writer for tabular data. It allows
+    writing data to a tabular data writer in batches, reducing the
+    number of write operations.
 
     Attributes:
     -----------
@@ -447,9 +449,11 @@ class BufferedWriter(TabularDataWriter):
     buffer_size : int
         The number of records to buffer before writing to the writer.
     buffer_type : TableType
-        The type of buffer being used. Can be one of TableType.DataFrame, TableType.Dicts, or TableType.Records.
+        The type of buffer being used. Can be one of TableType.DataFrame,
+        TableType.Dicts, or TableType.Records.
     buffer : pd.DataFrame or list of dictionaries or np.recarray or None
-        The buffer containing the tabular data to be written. The buffer type depends on the buffer_type attribute.
+        The buffer containing the tabular data to be written.
+        The buffer type depends on the buffer_type attribute.
     """
     writer: TabularDataWriter
     buffer_size: int
@@ -554,7 +558,8 @@ class CSVFileWriter(TabularDataWriter):
         The file path where the CSV file will be written.
 
     sep : str, optional
-        The separator string used to separate fields in the CSV file. Default is tab character ("\t").
+        The separator string used to separate fields in the CSV file.
+        Default is tab character ("\t").
     """
 
     file_name: Path
diff --git a/pyproject.toml b/pyproject.toml
index e21843d..d972113 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "importlib-metadata>=5.1.0",
     "typeguard>=4.1.5",
     "pyarrow>=15.0.0",
+    "scipy>=1.13.0",
 ]
 
 dynamic = ["version"]
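
For context on the streaming.py hunk above: the MergedTabularDataReader docstring describes a k-way merge keyed on a priority column. A minimal, hypothetical sketch of that idea (not mokapot's actual implementation), assuming each input reader already yields rows sorted by the priority column in descending order:

    import heapq

    def merge_by_priority(readers, priority_column):
        # At each step, heapq.merge emits the head row with the highest
        # priority value across all descending-sorted input readers.
        return heapq.merge(
            *readers, key=lambda row: row[priority_column], reverse=True
        )

    chunk_a = [{"score": 0.9}, {"score": 0.4}]
    chunk_b = [{"score": 0.7}, {"score": 0.1}]
    merged = [r["score"] for r in merge_by_priority([chunk_a, chunk_b], "score")]
    assert merged == [0.9, 0.7, 0.4, 0.1]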
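
Similarly, the BufferedWriter docstring in the tabular_data.py hunks describes collecting rows and writing them in batches to reduce the number of write operations. A sketch of the buffering pattern only; the class and method names here are illustrative, not mokapot's API:

    import pandas as pd

    class MiniBufferedWriter:
        # Accumulates rows and forwards them to an underlying writer in
        # batches, so one write call covers up to buffer_size rows.
        def __init__(self, writer, buffer_size=1000):
            self.writer = writer  # assumed to accept one DataFrame per call
            self.buffer_size = buffer_size
            self.buffer = []

        def append_data(self, row):
            self.buffer.append(row)
            if len(self.buffer) >= self.buffer_size:
                self.flush()

        def flush(self):
            if self.buffer:
                self.writer.append_data(pd.DataFrame(self.buffer))
                self.buffer = []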