Commit

Prepare release v0.8.0 (#45)
wfondrie authored Mar 11, 2022
1 parent 1c76a7e commit 8ef79af
Showing 9 changed files with 72 additions and 45 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/tests.yml
@@ -20,10 +20,10 @@ jobs:

steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: "3.8"
python-version: "3.x"

- name: Install dependencies
run: |
@@ -38,7 +38,7 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Run unit and system tests
run: |
pytest --cov=mokapot tests/
pytest -v --cov=mokapot tests/
- name: Upload coverage to codecov
uses: codecov/codecov-action@v1
with:
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/psf/black
rev: 19.10b0 # Replace by any tag/version: https://github.com/psf/black/tags
rev: 22.1.0 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
language_version: python3 # Should be a command that runs python3.6+
16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,21 @@
# Changelog for mokapot

## [0.8.0] - 2022-03-11

Thanks to @sambenfredj, @gessulat, @tkschmidt, and @MatthewThe for
PR #44, which made these things happen!

### Added
- A new command line argument, `--max_workers`. This allows the
cross-validation folds to be computed in parallel.
- The `PercolatorModel` class now has an `n_jobs` parameter, which
controls parallelization of the grid search.

### Changed
- Improved speed by using multiple jobs for grid search by default.
- Parallelization within `mokapot.brew()` now uses `joblib`
instead of `concurrent.futures`.

## [0.7.4] - 2021-09-03
### Changed
- Improved documentation and added warnings for `--subset_max_train`. Thanks
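For context, here is a minimal usage sketch of the features listed in the changelog entry above. The input file name and the `read_pin` loading step are illustrative placeholders rather than part of this commit, and the return-value handling follows mokapot's documented API as an assumption:

```python
# CLI: run mokapot with two parallel cross-validation workers (new flag).
#   mokapot psms.pin --max_workers 2

# Python API: a rough equivalent, using the new n_jobs parameter on
# PercolatorModel to control grid-search parallelism within each fold.
import mokapot
from mokapot import PercolatorModel

psms = mokapot.read_pin("psms.pin")  # placeholder input file
model = PercolatorModel(n_jobs=1)    # one grid-search job per fold
results, models = mokapot.brew(psms, model, test_fdr=0.01, max_workers=2)
results.to_txt()
```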
32 changes: 11 additions & 21 deletions mokapot/brew.py
@@ -3,10 +3,10 @@
"""
import logging
import copy
from concurrent.futures import ProcessPoolExecutor

import pandas as pd
import numpy as np
from joblib import Parallel, delayed

from .model import PercolatorModel

@@ -47,7 +47,8 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):
The number of processes to use for model training. More workers
will require more memory, but will typically decrease the total
run time. An integer exceeding the number of folds will have
no additional effect.
no additional effect. Note that logging messages will be garbled
if more than one worker is enabled.
Returns
-------
@@ -81,25 +82,14 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):
LOGGER.info("Splitting PSMs into %i folds...", folds)
test_idx = [p._split(folds) for p in psms]
train_sets = _make_train_sets(psms, test_idx)

# Create args for map:
map_args = [
_fit_model,
train_sets,
[copy.deepcopy(model) for _ in range(folds)],
range(folds),
]

# Train models optionally in parallel
with ProcessPoolExecutor(max_workers=max_workers) as prc:
if max_workers == 1:
map_fun = map
else:
map_args[1] = list(map_args[1])
map_args[3] = list(map_args[3])
map_fun = prc.map

models = list(map_fun(*map_args))
if max_workers != 1:
# train_sets can't be a generator for joblib :(
train_sets = list(train_sets)

models = Parallel(n_jobs=max_workers, require="sharedmem")(
delayed(_fit_model)(d, copy.deepcopy(model), f)
for f, d in enumerate(train_sets)
)

# Determine if the models need to be reset:
reset = any([m[1] for m in models])
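To make the new parallelization pattern concrete, the following self-contained sketch shows the same `joblib` idiom used in `brew()` above. The fold-fitting function and toy data are stand-ins for mokapot's `_fit_model` and `_make_train_sets`, not the real implementations:

```python
import copy

from joblib import Parallel, delayed


def fit_fold(train_set, model, fold):
    """Stand-in for _fit_model: 'train' one cross-validation fold."""
    model.append((fold, sum(train_set)))
    return model


max_workers = 2
folds = 3
base_model = []  # stand-in for the mokapot Model copied for each fold
train_sets = (list(range(f + 1)) for f in range(folds))  # a generator

if max_workers != 1:
    # As in brew(): joblib needs a concrete list, not a generator.
    train_sets = list(train_sets)

models = Parallel(n_jobs=max_workers, require="sharedmem")(
    delayed(fit_fold)(d, copy.deepcopy(base_model), f)
    for f, d in enumerate(train_sets)
)
print(models)  # one 'trained' copy of the model per fold
```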
6 changes: 5 additions & 1 deletion mokapot/config.py
@@ -70,7 +70,11 @@ def _parser():
"--max_workers",
default=1,
type=int,
help="The number of processes to use for model training.",
help=(
"The number of processes to use for model training. Note that "
"using more than one worker will result in garbled logging "
"messages."
),
)

parser.add_argument(
26 changes: 12 additions & 14 deletions mokapot/model.py
@@ -127,17 +127,6 @@ def __init__(
shuffle=True,
):
"""Initialize a Model object"""
if estimator is None:
warnings.warn(
"The estimator will need to be specified in future "
"versions. Use the PercolatorModel class instead.",
DeprecationWarning,
)
svm_model = LinearSVC(dual=False)
estimator = GridSearchCV(
svm_model, param_grid=PERC_GRID, refit=False, cv=3
)

self.estimator = clone(estimator)
self.features = None
self.is_trained = False
@@ -391,6 +380,8 @@ class PercolatorModel(Model):
shuffle : bool, optional
Should the order of PSMs be randomized for training? For deterministic
algorithms, this will have no effect.
n_jobs : int, optional
The number of jobs used to parallelize the hyperparameter grid search.
Attributes
----------
@@ -416,8 +407,9 @@ class PercolatorModel(Model):
the model still be used?
subset_max_train : int or None
The number of PSMs for training.
shuffle : bool
Is the order of PSMs shuffled for training?
n_jobs : int
The number of jobs to use for parallelizing the hyperparameter
grid search.
"""

def __init__(
@@ -428,11 +420,17 @@ def __init__(
direction=None,
override=False,
subset_max_train=None,
n_jobs=-1,
):
"""Initialize a PercolatorModel"""
self.n_jobs = n_jobs
svm_model = LinearSVC(dual=False)
estimator = GridSearchCV(
svm_model, param_grid=PERC_GRID, refit=False, cv=3, n_jobs=-1
svm_model,
param_grid=PERC_GRID,
refit=False,
cv=3,
n_jobs=n_jobs,
)

super().__init__(
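A minimal sketch of the grid-search construction that `PercolatorModel` now parameterizes with `n_jobs`. The parameter grid below is a placeholder for mokapot's `PERC_GRID`, which is defined elsewhere in model.py and not shown in this diff:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# Placeholder grid; mokapot's actual PERC_GRID is not shown in this diff.
param_grid = {"C": [0.01, 0.1, 1.0, 10.0]}

n_jobs = -1  # the PercolatorModel default: use all available cores
svm_model = LinearSVC(dual=False)
estimator = GridSearchCV(
    svm_model,
    param_grid=param_grid,
    refit=False,
    cv=3,
    n_jobs=n_jobs,
)

# The search would then be run on a feature matrix X and labels y:
# estimator.fit(X, y)
```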
1 change: 1 addition & 0 deletions setup.cfg
@@ -28,6 +28,7 @@ install_requires =
matplotlib>=3.1.3
lxml>=4.6.2
triqler>=0.6.2
joblib>=1.1.0

[options.extras_require]
docs =
4 changes: 2 additions & 2 deletions tests/unit_tests/test_brew.py
@@ -2,7 +2,7 @@
import pytest
import numpy as np
import mokapot
from mokapot import LinearPsmDataset, PercolatorModel
from mokapot import PercolatorModel

np.random.seed(42)

@@ -47,4 +47,4 @@ def test_brew_test_fdr_error(psms, svm):
# @pytest.mark.skip(reason="Not currently working, at least on MacOS.")
def test_brew_multiprocess(psms, svm):
"""Test that multiprocessing doesn't yield an error"""
mokapot.brew(psms, svm, test_fdr=0.05, max_workers=3)
mokapot.brew(psms, svm, test_fdr=0.05, max_workers=2)
24 changes: 21 additions & 3 deletions tests/unit_tests/test_model.py
@@ -1,6 +1,8 @@
"""Test that models work as expected"""
import pytest
import mokapot
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
@@ -37,6 +39,8 @@ def test_model_init():
model = mokapot.Model(LogisticRegression())
assert isinstance(model.scaler, StandardScaler)

print(model)


def test_perc_init():
"""Test the initialization of a PercolatorModel"""
@@ -71,6 +75,14 @@ def test_model_fit(psms):
assert isinstance(model.estimator, LogisticRegression)
assert model.is_trained

no_targets = pd.DataFrame({"targets": [False] * 100})
with pytest.raises(ValueError):
model.fit(no_targets)

no_decoys = pd.DataFrame({"targets": [True] * 100})
with pytest.raises(ValueError):
model.fit(no_decoys)


def test_model_fit_large_subset(psms):
model = mokapot.Model(
@@ -101,10 +113,8 @@ def test_model_predict(psms):
# The case where a model is trained on a dataset with different features:
psms._data["blah"] = np.random.randn(len(psms))
psms._feature_columns = ("score", "blah")
try:
with pytest.raises(ValueError):
model.predict(psms)
except ValueError:
pass


def test_model_persistance(tmp_path):
@@ -116,3 +126,11 @@ def test_model_persistance(tmp_path):
loaded = mokapot.load_model(model_file)

assert isinstance(loaded, mokapot.Model)


def test_dummy_scaler():
"""Test the DummyScaler class"""
data = np.random.default_rng(42).normal(0, 1, (20, 10))
scaler = mokapot.model.DummyScaler()
assert (data == scaler.fit_transform(data)).all()
assert (data == scaler.transform(data)).all()
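The new `test_dummy_scaler` test above exercises a pass-through scaler. A rough sketch of what such a class could look like is shown below; mokapot's actual `DummyScaler` implementation is not part of this diff, so the class here is illustrative only:

```python
import numpy as np


class DummyScaler:
    """A no-op scaler exposing the scikit-learn transformer interface (sketch)."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)


# Mirrors the assertions in test_dummy_scaler:
data = np.random.default_rng(42).normal(0, 1, (20, 10))
scaler = DummyScaler()
assert (data == scaler.fit_transform(data)).all()
assert (data == scaler.transform(data)).all()
```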
