From 8581cbc7e9d5bb4fabd90d0f6c010bb273598bb7 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 2 Jul 2024 13:16:08 +0200 Subject: [PATCH 1/2] #170: apply ruff formatting --- .pre-commit-config.yaml | 8 +- docs/conf.py | 42 +- peptdeep/_modidx.py | 3 +- peptdeep/cli.py | 130 ++- peptdeep/cli_argparse.py | 72 +- peptdeep/gui.py | 20 +- peptdeep/legacy/thermo_raw/pyrawfilereader.py | 373 ++++--- peptdeep/mass_spec/__init__.py | 4 +- peptdeep/mass_spec/mass_calibration.py | 41 +- peptdeep/mass_spec/match.py | 305 +++--- peptdeep/mass_spec/ms_reader.py | 180 ++-- peptdeep/model/__init__.py | 13 +- peptdeep/model/building_block.py | 689 ++++++------ peptdeep/model/ccs.py | 135 +-- peptdeep/model/charge.py | 151 +-- peptdeep/model/featurize.py | 98 +- peptdeep/model/generic_property_prediction.py | 288 +++-- peptdeep/model/model_interface.py | 603 +++++------ peptdeep/model/ms2.py | 710 +++++++------ peptdeep/model/rt.py | 126 +-- peptdeep/pretrained_models.py | 811 +++++++------- peptdeep/protein/fasta.py | 84 +- peptdeep/psm_frag_reader/__init__.py | 3 +- peptdeep/rescore/__init__.py | 4 +- peptdeep/rescore/fdr.py | 54 +- peptdeep/rescore/feature_extractor.py | 999 +++++++++--------- peptdeep/rescore/percolator.py | 310 +++--- peptdeep/settings.py | 82 +- peptdeep/spec_lib/library_factory.py | 270 ++--- peptdeep/spec_lib/predict_lib.py | 130 ++- peptdeep/utils/__init__.py | 55 +- peptdeep/utils/_pyinstaller_hooks.py | 4 +- peptdeep/utils/device_utils.py | 59 +- peptdeep/utils/logger.py | 41 +- peptdeep/utils/regression.py | 75 +- peptdeep/webui/library_ui.py | 513 +++++---- peptdeep/webui/main_ui.py | 30 +- peptdeep/webui/model_ui.py | 94 +- peptdeep/webui/rescore_ui.py | 33 +- peptdeep/webui/server.py | 50 +- peptdeep/webui/server_ui.py | 23 +- peptdeep/webui/settings_ui.py | 170 +-- peptdeep/webui/startpage.py | 7 +- peptdeep/webui/transfer_ui.py | 363 ++++--- peptdeep/webui/ui_utils.py | 76 +- .../pyinstaller/peptdeep_cli_pyinstaller.py | 2 + release/pyinstaller/peptdeep_pyinstaller.py | 2 + setup.py | 4 +- 48 files changed, 4360 insertions(+), 3979 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e2a1b5cc..c61d8b2a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,10 +7,10 @@ repos: - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace -# - repo: https://github.com/astral-sh/ruff-pre-commit -# rev: v0.4.0 -# hooks: -# - id: ruff-format +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.5.0 + hooks: + - id: ruff-format # - id: ruff exclude: .bumpversion.cfg diff --git a/docs/conf.py b/docs/conf.py index 4f7aaf18..6631a6bc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,14 +14,15 @@ import sys import importlib import inspect -sys.path.insert(0, os.path.abspath('..')) + +sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- -project = 'peptdeep' -copyright = '2022, Mann Labs, MPIB' -author = 'Mann Labs, MPIB' +project = "peptdeep" +copyright = "2022, Mann Labs, MPIB" +author = "Mann Labs, MPIB" release = "1.2.1" @@ -31,34 +32,33 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.napoleon', + "sphinx.ext.napoleon", "sphinx.ext.intersphinx", "sphinx.ext.linkcode", - 'sphinx.ext.viewcode', - 'autodocsumm', - 'nbsphinx', - 'myst_parser', + "sphinx.ext.viewcode", + "autodocsumm", + "nbsphinx", + "myst_parser", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [ - '_build', 'Thumbs.db', '.DS_Store', - '_modidx,py' -] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "_modidx,py"] code_url = f"https://github.com/mannlabs/alphapeptdeep/blob/main" + def linkcode_resolve(domain, info): # Non-linkable objects from the starter kit in the tutorial. if domain == "js" or info["module"] == "connect4": return - if domain != "py": return + if domain != "py": + return mod = importlib.import_module(info["module"]) if "." in info["fullname"]: @@ -93,16 +93,16 @@ def linkcode_resolve(domain, info): # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'furo' -html_logo = '_static/peptdeep.png' -html_favicon = '_static/peptdeep.png' +html_theme = "furo" +html_logo = "_static/peptdeep.png" +html_favicon = "_static/peptdeep.png" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] autodoc_default_options = { - 'autosummary': True, - 'special-members': '__init__', # Include __init__ methods. + "autosummary": True, + "special-members": "__init__", # Include __init__ methods. } diff --git a/peptdeep/_modidx.py b/peptdeep/_modidx.py index b12b4534..53ce3258 100644 --- a/peptdeep/_modidx.py +++ b/peptdeep/_modidx.py @@ -1,2 +1 @@ -d = { 'settings': { }, - 'syms': { }} +d = {"settings": {}, "syms": {}} diff --git a/peptdeep/cli.py b/peptdeep/cli.py index 51b5dab0..3cec28bf 100644 --- a/peptdeep/cli.py +++ b/peptdeep/cli.py @@ -13,17 +13,18 @@ from peptdeep.settings import global_settings, load_global_settings from .cli_argparse import get_parser, parse_args_to_global_settings + @click.group( context_settings=dict( - help_option_names=['-h', '--help'], + help_option_names=["-h", "--help"], ), - invoke_without_command=True + invoke_without_command=True, ) @click.pass_context @click.version_option(peptdeep.__version__, "-v", "--version") def run(ctx, **kwargs): click.echo( -r''' + r""" ____ __ ____ / __ \___ ____ / /_/ __ \___ ___ ____ / /_/ / _ \/ __ \/ __/ / / / _ \/ _ \/ __ \ @@ -35,48 +36,63 @@ def run(ctx, **kwargs): .{url}. .{license}. .................................................... -'''.format( - version=peptdeep.__version__.center(50), - url=peptdeep.__github__.center(50), - license=peptdeep.__license__.center(50), +""".format( + version=peptdeep.__version__.center(50), + url=peptdeep.__github__.center(50), + license=peptdeep.__license__.center(50), + ) ) -) if ctx.invoked_subcommand is None: click.echo(run.get_help(ctx)) + @run.command("gui", help="Start graphical user interface.") -@click.option("--port", default=10077, type=int, - show_default=True, help="The web server port." +@click.option( + "--port", default=10077, type=int, show_default=True, help="The web server port." ) -@click.option("--settings_yaml", default='', type=str, - show_default=True, help="Load default settings yaml file." +@click.option( + "--settings_yaml", + default="", + type=str, + show_default=True, + help="Load default settings yaml file.", ) def _gui(port, settings_yaml): import peptdeep.gui from peptdeep.webui.server import _server + if os.path.isfile(settings_yaml): load_global_settings(settings_yaml) # start the server to run tasks _server.start() peptdeep.gui.run(port) + @run.command("install-models", help="Install or update peptdeep pre-trained models.") -@click.option("--model-file", default=None, type=str, - show_default=True, help="The model .zip file to install. " - "If not set, peptdeep will download the model file from GitHub." +@click.option( + "--model-file", + default=None, + type=str, + show_default=True, + help="The model .zip file to install. " + "If not set, peptdeep will download the model file from GitHub.", ) -@click.option("--overwrite", default=True, type=bool, - show_default=True, help="If overwrite existing model file." +@click.option( + "--overwrite", + default=True, + type=bool, + show_default=True, + help="If overwrite existing model file.", ) def _install_model(model_file, overwrite): - from peptdeep.pretrained_models import ( - download_models, model_url - ) + from peptdeep.pretrained_models import download_models, model_url + if not model_file: download_models(model_url, overwrite=overwrite) else: download_models(model_file, overwrite=overwrite) + _help_str = ( "\n\nTo get the settings_yaml file," " you can either export from the GUI," @@ -94,41 +110,49 @@ def _install_model(model_file, overwrite): # load_global_settings(settings_yaml) # rescore() -@run.command("library", help= - "Predict library for DIA search."+_help_str -) + +@run.command("library", help="Predict library for DIA search." + _help_str) @click.argument("settings_yaml", type=str) -def _library(settings_yaml:str): +def _library(settings_yaml: str): from peptdeep.pipeline_api import generate_library + load_global_settings(settings_yaml) generate_library() -@run.command("transfer", help= - "Transfer learning for different data types."+_help_str -) + +@run.command("transfer", help="Transfer learning for different data types." + _help_str) @click.argument("settings_yaml", type=str) -def _transfer(settings_yaml:str): +def _transfer(settings_yaml: str): from peptdeep.pipeline_api import transfer_learn + load_global_settings(settings_yaml) transfer_learn() -@run.command("export-settings", help="Export the default settings to a yaml file. It can be used as the template setting.") + +@run.command( + "export-settings", + help="Export the default settings to a yaml file. It can be used as the template setting.", +) @click.argument("yaml_file", type=str) -def _export_settings(yaml_file:str): +def _export_settings(yaml_file: str): save_yaml(yaml_file, global_settings) + class ParserHelper(click.Command): def format_help(self, ctx: Context, formatter: HelpFormatter) -> None: parser = get_parser() formatter.write(parser.format_help()) -@run.command("cmd-flow", + +@run.command( + "cmd-flow", help="Using command line arguments to control the settings", cls=ParserHelper, context_settings=dict( - ignore_unknown_options=True, - allow_extra_args=True, -)) + ignore_unknown_options=True, + allow_extra_args=True, + ), +) @click.pass_context def _cmd_flow(ctx): parser = get_parser() @@ -138,25 +162,39 @@ def _cmd_flow(ctx): parse_args_to_global_settings(parser, ctx.args) if "train" in global_settings["task_workflow"]: from peptdeep.pipeline_api import transfer_learn + transfer_learn() - if os.path.isfile(os.path.join( - global_settings["model_mgr"]["transfer"]["model_output_folder"], "ms2.pth" - )): + if os.path.isfile( + os.path.join( + global_settings["model_mgr"]["transfer"]["model_output_folder"], + "ms2.pth", + ) + ): global_settings["model_mgr"]["external_ms2_model"] = os.path.join( - global_settings["model_mgr"]["transfer"]["model_output_folder"], "ms2.pth" + global_settings["model_mgr"]["transfer"]["model_output_folder"], + "ms2.pth", ) - if os.path.isfile(os.path.join( - global_settings["model_mgr"]["transfer"]["model_output_folder"], "rt.pth" - )): + if os.path.isfile( + os.path.join( + global_settings["model_mgr"]["transfer"]["model_output_folder"], + "rt.pth", + ) + ): global_settings["model_mgr"]["external_rt_model"] = os.path.join( - global_settings["model_mgr"]["transfer"]["model_output_folder"], "rt.pth" + global_settings["model_mgr"]["transfer"]["model_output_folder"], + "rt.pth", + ) + if os.path.isfile( + os.path.join( + global_settings["model_mgr"]["transfer"]["model_output_folder"], + "ccs.pth", ) - if os.path.isfile(os.path.join( - global_settings["model_mgr"]["transfer"]["model_output_folder"], "ccs.pth" - )): + ): global_settings["model_mgr"]["external_ccs_model"] = os.path.join( - global_settings["model_mgr"]["transfer"]["model_output_folder"], "ccs.pth" + global_settings["model_mgr"]["transfer"]["model_output_folder"], + "ccs.pth", ) if "library" in global_settings["task_workflow"]: from peptdeep.pipeline_api import generate_library + generate_library() diff --git a/peptdeep/cli_argparse.py b/peptdeep/cli_argparse.py index bff8ba07..05056a1c 100644 --- a/peptdeep/cli_argparse.py +++ b/peptdeep/cli_argparse.py @@ -4,14 +4,16 @@ import argparse from peptdeep.settings import ( - global_settings, load_global_settings, - _refine_global_settings + global_settings, + load_global_settings, + _refine_global_settings, ) -__argparse_dict_level_sep="--" # do not change +__argparse_dict_level_sep = "--" # do not change + def convert_dict_to_argparse( - settings:dict, + settings: dict, prefix_key="", dict_level_sep=__argparse_dict_level_sep, ): @@ -26,56 +28,73 @@ def convert_dict_to_argparse( "user_defined_modifications", "instrument_group", ]: - ret += [(prefix_key+dict_level_sep+key, val)] + ret += [(prefix_key + dict_level_sep + key, val)] else: ret += convert_dict_to_argparse( - val, prefix_key=(prefix_key+dict_level_sep+key) if prefix_key else key + val, + prefix_key=(prefix_key + dict_level_sep + key) + if prefix_key + else key, ) return ret else: return [(prefix_key, settings)] + def _set_dict_val(_dict, keys, val): - if len(keys) < 1: return + if len(keys) < 1: + return elif keys[0] == "labeling_channels": - def _get(x:str): + + def _get(x: str): i = x.find(":") - k,v = x[:i], x[i+1:] + k, v = x[:i], x[i + 1 :] k = int(k) if k.isdigit() else k v = v.split(";") - return k,v + return k, v + _dict[keys[0]].update(dict([_get(s) for s in val])) elif keys[0] == "psm_modification_mapping": + def _get(x): i = x.find(":", x.find("@")) - k,v = x[:i], x[i+1:] + k, v = x[:i], x[i + 1 :] return k, v.split(";") + _dict[keys[0]].update(dict([_get(s) for s in val])) elif keys[0] == "user_defined_modifications": + def _get(x): i = x.find(":", x.find("@")) - k,v = x[:i], x[i+1:] + k, v = x[:i], x[i + 1 :] items = v.split(";") if len(items) == 1: - return k, {"composition":items[0]} + return k, {"composition": items[0]} else: return k, {"composition": items[0], "modloss_composition": items[1]} + _dict[keys[0]].update(dict([_get(s) for s in val])) elif len(keys) == 1: _dict[keys[0]] = val - else: _set_dict_val(_dict[keys[0]], keys[1:], val) + else: + _set_dict_val(_dict[keys[0]], keys[1:], val) + def get_parser(): parser = argparse.ArgumentParser() parser.add_argument( - "--settings_yaml", type=str, default="", - help="The yaml file for saved settings (default: %(default)s)" + "--settings_yaml", + type=str, + default="", + help="The yaml file for saved settings (default: %(default)s)", ) arg_settings = convert_dict_to_argparse(global_settings) for arg, val in arg_settings: - arg = "--"+arg - if isinstance(val, (list,dict,set)): - parser.add_argument(arg, nargs="*", default=val, help="(default: %(default)s)") + arg = "--" + arg + if isinstance(val, (list, dict, set)): + parser.add_argument( + arg, nargs="*", default=val, help="(default: %(default)s)" + ) else: if isinstance(val, bool): _type = bool @@ -89,25 +108,24 @@ def get_parser(): else: _type = str _dt = "s" - parser.add_argument(arg, type=_type, default=val, help=f"(default: %(default){_dt})") + parser.add_argument( + arg, type=_type, default=val, help=f"(default: %(default){_dt})" + ) return parser + def parse_args_to_global_settings(parser, args): args_dict = vars(parser.parse_known_args(args)[0]) if "settings_yaml" in args_dict: - if os.path.isfile( - args_dict["settings_yaml"] - ): - load_global_settings( - args_dict["settings_yaml"] - ) + if os.path.isfile(args_dict["settings_yaml"]): + load_global_settings(args_dict["settings_yaml"]) else: print(f"Settings.yaml `{args_dict['settings_yaml']}` does not exist.") args_dict.pop("settings_yaml") used_args = {} for arg in args: if arg.startswith("--"): - arg = arg[2:].replace("--","__") + arg = arg[2:].replace("--", "__") if arg in args_dict: used_args[arg] = args_dict[arg] diff --git a/peptdeep/gui.py b/peptdeep/gui.py index 600fb233..9da8140a 100644 --- a/peptdeep/gui.py +++ b/peptdeep/gui.py @@ -1,13 +1,14 @@ #!python import os + def run(port=10077): - print(f'Starting PeptDeep Web Server on port {port} ...') + print(f"Starting PeptDeep Web Server on port {port} ...") _this_file = __file__ _this_directory = os.path.dirname(_this_file) - file_path = os.path.join(_this_directory, 'webui', 'main_ui.py') + file_path = os.path.join(_this_directory, "webui", "main_ui.py") HOME = os.path.expanduser("~") @@ -16,14 +17,13 @@ def run(port=10077): if not os.path.isdir(ST_PATH): os.mkdir(ST_PATH) - #Check if streamlit credentials exists - ST_CREDENTIALS = os.path.join(ST_PATH, 'credentials.toml') + # Check if streamlit credentials exists + ST_CREDENTIALS = os.path.join(ST_PATH, "credentials.toml") if not os.path.isfile(ST_CREDENTIALS): - with open(ST_CREDENTIALS, 'w') as file: + with open(ST_CREDENTIALS, "w") as file: file.write("[general]\n") file.write('\nemail = ""') - import sys from streamlit.web import cli as stcli @@ -36,11 +36,13 @@ def run(port=10077): theme.append("--theme.primaryColor=#18212b") args = [ - "streamlit", "run", - file_path, "--global.developmentMode=false", + "streamlit", + "run", + file_path, + "--global.developmentMode=false", f"--server.port={port}", "--browser.gatherUsageStats=False", - "--logger.level=error" + "--logger.level=error", ] args.extend(theme) diff --git a/peptdeep/legacy/thermo_raw/pyrawfilereader.py b/peptdeep/legacy/thermo_raw/pyrawfilereader.py index 80bbabfe..7f71ba3c 100644 --- a/peptdeep/legacy/thermo_raw/pyrawfilereader.py +++ b/peptdeep/legacy/thermo_raw/pyrawfilereader.py @@ -3,13 +3,14 @@ # require pythonnet, pip install pythonnet on Windows import clr -clr.AddReference('System') + +clr.AddReference("System") import System from System.Threading import Thread from System.Globalization import CultureInfo -de_fr = CultureInfo('fr-FR') -other = CultureInfo('en-US') +de_fr = CultureInfo("fr-FR") +other = CultureInfo("en-US") Thread.CurrentThread.CurrentCulture = other Thread.CurrentThread.CurrentUICulture = other @@ -20,15 +21,15 @@ import ThermoFisher from ThermoFisher.CommonCore.Data.Interfaces import IScanEventBase, IScanEvent -'''C# code to read Raw data +"""C# code to read Raw data rawFile = ThermoFisher.CommonCore.RawFileReader.RawFileReaderAdapter.FileFactory(raw_filename) var scanStatistics = rawFile.GetScanStatsForScanNumber(1); var seg = rawFile.GetSegmentedScanFromScanNumber(1, scanStatistics); var scanEvent = rawFile.GetScanEventForScanNumber(1); var trailerData = rawFile.GetTrailerExtraInformation(1); -''' +""" -''' +""" APIs to access Thermo's Raw Files > This implementation is based on [pythonnet](http://pythonnet.github.io) and ThermoFisher's `RawFileReader` project. @@ -43,7 +44,7 @@ > 3. "export PKG_CONFIG_PATH=/usr/local/lib/pkgconfig:/usr/lib/pkgconfig:/Library/Frameworks/Mono.framework/Versions/Current/lib/pkgconfig:$PKG_CONFIG_PATH"; > `or` add these PKG_CONFIG_PATH into ~./bash_profile, and run "source ~/bash_profile". 6.12.0 is my mono version > 4. pip install pythonnet -''' +""" # see https://github.com/mobiusklein/ms_deisotope/blob/90b817d4b5ae7823cfe4ad61c57119d62a6e3d9d/ms_deisotope/data_source/thermo_raw_net.py#L217 # from System.Runtime.InteropServices import Marshal @@ -71,136 +72,157 @@ from System.Runtime.InteropServices import GCHandle, GCHandleType import ctypes + + def DotNetArrayToNPArray(src, dtype=None): - ''' + """ See https://mail.python.org/pipermail/pythondotnet/2014-May/001527.html - ''' + """ if src is None: return np.array([], dtype=np.float64) src_hndl = GCHandle.Alloc(src, GCHandleType.Pinned) try: src_ptr = src_hndl.AddrOfPinnedObject().ToInt64() - bufType = ctypes.c_double*len(src) + bufType = ctypes.c_double * len(src) cbuf = bufType.from_address(src_ptr) dest = np.frombuffer(cbuf, dtype=cbuf._type_).copy() finally: - if src_hndl.IsAllocated: src_hndl.Free() + if src_hndl.IsAllocated: + src_hndl.Free() return dest -''' + +""" APIs are similar to [pymsfilereader](https://github.com/frallain/pymsfilereader), but some APIs have not been implemented yet." -''' +""" + + class RawFileReader(object): # static class members - sampleType = {0: 'Unknown', - 1: 'Blank', - 2: 'QC', - 3: 'Standard Clear (None)', - 4: 'Standard Update (None)', - 5: 'Standard Bracket (Open)', - 6: 'Standard Bracket Start (multiple brackets)', - 7: 'Standard Bracket End (multiple brackets)'} - - controllerType = {-1: 'No device', - 0: 'MS', - 1: 'Analog', - 2: 'A/D card', - 3: 'PDA', - 4: 'UV', - 'No device': -1, - 'MS': 0, - 'Analog': 1, - 'A/D card': 2, - 'PDA': 3, - 'UV': 4} - - massAnalyzerType = {'ITMS': 0, - 'TQMS': 1, - 'SQMS': 2, - 'TOFMS': 3, - 'FTMS': 4, - 'Sector': 5, - 0: 'ITMS', - 1: 'TQMS', - 2: 'SQMS', - 3: 'TOFMS', - 4: 'FTMS', - 5: 'Sector'} - activationType = {'CID': 0, - 'MPD': 1, - 'ECD': 2, - 'PQD': 3, - 'ETD': 4, - 'HCD': 5, - 'Any activation type': 6, - 'SA': 7, - 'PTR': 8, - 'NETD': 9, - 'NPTR': 10, - 'UVPD': 11, - 'ETHCD': 12, # not Thermo's build-in activation types - 'ETCID': 13, # not Thermo's build-in activation types - 0: 'CID', - 1: 'MPD', - 2: 'ECD', - 3: 'PQD', - 4: 'ETD', - 5: 'HCD', - 6: 'Any activation type', - 7: 'SA', - 8: 'PTR', - 9: 'NETD', - 10: 'NPTR', - 11: 'UVPD', - 12: 'ETHCD', # not Thermo's build-in activation types - 13: 'ETCID', # not Thermo's build-in activation types - } - - detectorType = {'Valid': 0, - 'Any': 1, - 'NotValid': 2, - 0: 'Valid', - 1: 'Any', - 2: 'NotValid', - } - - scanDataType = {'Centroid': 0, - 'Profile': 1, - 'Any': 2, - 0: 'Centroid', - 1: 'Profile', - 2: 'Any', - } - - scanType = {'Full': 0, - 'Zoom': 1, - 'SIM': 2, - 'SRM': 3, - 'CRM': 4, - 'Any': 5, - 'Q1MS': 6, - 'Q3MS': 7, - 0: 'Full', - 1: 'SIM', - 2: 'Zoom', - 3: 'SRM', - 4: 'CRM', - 5: 'Any', - 6: 'Q1MS', - 7: 'Q3MS', - } + sampleType = { + 0: "Unknown", + 1: "Blank", + 2: "QC", + 3: "Standard Clear (None)", + 4: "Standard Update (None)", + 5: "Standard Bracket (Open)", + 6: "Standard Bracket Start (multiple brackets)", + 7: "Standard Bracket End (multiple brackets)", + } + + controllerType = { + -1: "No device", + 0: "MS", + 1: "Analog", + 2: "A/D card", + 3: "PDA", + 4: "UV", + "No device": -1, + "MS": 0, + "Analog": 1, + "A/D card": 2, + "PDA": 3, + "UV": 4, + } + + massAnalyzerType = { + "ITMS": 0, + "TQMS": 1, + "SQMS": 2, + "TOFMS": 3, + "FTMS": 4, + "Sector": 5, + 0: "ITMS", + 1: "TQMS", + 2: "SQMS", + 3: "TOFMS", + 4: "FTMS", + 5: "Sector", + } + activationType = { + "CID": 0, + "MPD": 1, + "ECD": 2, + "PQD": 3, + "ETD": 4, + "HCD": 5, + "Any activation type": 6, + "SA": 7, + "PTR": 8, + "NETD": 9, + "NPTR": 10, + "UVPD": 11, + "ETHCD": 12, # not Thermo's build-in activation types + "ETCID": 13, # not Thermo's build-in activation types + 0: "CID", + 1: "MPD", + 2: "ECD", + 3: "PQD", + 4: "ETD", + 5: "HCD", + 6: "Any activation type", + 7: "SA", + 8: "PTR", + 9: "NETD", + 10: "NPTR", + 11: "UVPD", + 12: "ETHCD", # not Thermo's build-in activation types + 13: "ETCID", # not Thermo's build-in activation types + } + + detectorType = { + "Valid": 0, + "Any": 1, + "NotValid": 2, + 0: "Valid", + 1: "Any", + 2: "NotValid", + } + + scanDataType = { + "Centroid": 0, + "Profile": 1, + "Any": 2, + 0: "Centroid", + 1: "Profile", + 2: "Any", + } + + scanType = { + "Full": 0, + "Zoom": 1, + "SIM": 2, + "SRM": 3, + "CRM": 4, + "Any": 5, + "Q1MS": 6, + "Q3MS": 7, + 0: "Full", + 1: "SIM", + 2: "Zoom", + 3: "SRM", + 4: "CRM", + 5: "Any", + 6: "Q1MS", + 7: "Q3MS", + } def __init__(self, filename, **kwargs): - self.filename = os.path.abspath(filename) self.filename = os.path.normpath(self.filename) - self.source = ThermoFisher.CommonCore.RawFileReader.RawFileReaderAdapter.FileFactory(self.filename) + self.source = ( + ThermoFisher.CommonCore.RawFileReader.RawFileReaderAdapter.FileFactory( + self.filename + ) + ) if not self.source.IsOpen: raise IOError( "RAWfile '{0}' could not be opened, is the file accessible ?".format( - self.filename)) + self.filename + ) + ) self.source.SelectInstrument(ThermoFisher.CommonCore.Data.Business.Device.MS, 1) try: @@ -213,7 +235,7 @@ def __init__(self, filename, **kwargs): self.MassResolution = self.GetMassResolution() self.NumSpectra = self.GetNumSpectra() except Exception as e: - raise IOError(f'{e}') + raise IOError(f"{e}") def Close(self): """Closes a raw file and frees the associated memory.""" @@ -232,7 +254,7 @@ def GetCreationDate(self): """Returns the file creation date in DATE format.""" # https://msdn.microsoft.com/en-us/library/82ab7w69.aspx # The DATE type is implemented using an 8-byte floating-point number - return self.source.CreationDate.ToString('o') + return self.source.CreationDate.ToString("o") def GetStatusLogForRetentionTime(self, rt): logEntry = self.source.GetStatusLogForRetentionTime(rt) @@ -305,7 +327,10 @@ def GetFilters(self): # INSTRUMENT BEGIN def GetInstName(self): """Returns the instrument name, if available, for the current controller.""" - return System.String.Join(" -> ", self.source.GetAllInstrumentNamesFromInstrumentMethod()) + return System.String.Join( + " -> ", self.source.GetAllInstrumentNamesFromInstrumentMethod() + ) + # INSTRUMENT END def GetScanEventStringForScanNum(self, scanNumber): @@ -321,36 +346,48 @@ def GetStatusLogForScanNum(self, scan): def GetNumberOfMassRangesFromScanNum(self, scanNumber): """This function gets the number of MassRange data items in the scan.""" - return IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).MassRangeCount + return IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).MassRangeCount def GetMassRangeFromScanNum(self, scanNumber, massRangeIndex): """This function retrieves information about the mass range data of a scan (high and low masses). You can find the count of mass ranges for the scan by calling GetNumberOfMassRangesFromScanNum().""" - range = IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).GetMassRange(massRangeIndex) + range = IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).GetMassRange(massRangeIndex) return range.Low, range.High def GetNumberOfSourceFragmentsFromScanNum(self, scanNumber): """This function gets the number of source fragments (or compensation voltages) in the scan.""" - return IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).SourceFragmentationInfoCount + return IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).SourceFragmentationInfoCount def GetSourceFragmentValueFromScanNum(self, scanNumber, sourceFragmentIndex): """This function retrieves information about one of the source fragment values of a scan. It is also the same value as the compensation voltage. You can find the count of source fragments for the scan by calling GetNumberOfSourceFragmentsFromScanNum ().""" - return IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).GetSourceFragmentationInfo(sourceFragmentIndex) + return IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).GetSourceFragmentationInfo(sourceFragmentIndex) - def GetIsolationWidthForScanNum(self, scanNumber, MSOrder = 0): + def GetIsolationWidthForScanNum(self, scanNumber, MSOrder=0): """This function returns the isolation width for the scan specified by scanNumber and the transition specified by MSOrder (0 for MS1?) from the scan event structure in the raw file.""" - return IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).GetIsolationWidth(MSOrder) + return IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).GetIsolationWidth(MSOrder) - def GetCollisionEnergyForScanNum(self, scanNumber, MSOrder = 0): + def GetCollisionEnergyForScanNum(self, scanNumber, MSOrder=0): """This function returns the collision energy for the scan specified by scanNumber and the - transition specified by MSOrder (0 for MS1?) from the scan event structure in the raw file. """ - return IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).GetEnergy(MSOrder) + transition specified by MSOrder (0 for MS1?) from the scan event structure in the raw file.""" + return IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).GetEnergy(MSOrder) - def GetActivationTypeForScanNum(self, scanNumber, MSOrder = 0): + def GetActivationTypeForScanNum(self, scanNumber, MSOrder=0): """This function returns the activation type for the scan specified by scanNumber and the transition specified by MSOrder from the scan event structure in the RAW file. The value returned in the pnActivationType variable is one of the following: @@ -366,7 +403,11 @@ def GetActivationTypeForScanNum(self, scanNumber, MSOrder = 0): NETD 9 NPTR 10 UVPD 11""" - return RawFileReader.activationType[IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).GetActivation(MSOrder)] + return RawFileReader.activationType[ + IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).GetActivation(MSOrder) + ] def GetMassAnalyzerTypeForScanNum(self, scanNumber): """This function returns the mass analyzer type for the scan specified by scanNumber from the @@ -379,17 +420,23 @@ def GetMassAnalyzerTypeForScanNum(self, scanNumber): def GetDetectorTypeForScanNum(self, scanNumber): """This function returns the detector type for the scan specified by scanNumber from the scan event structure in the RAW file.""" - return RawFileReader.detectorType[IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).Detector] + return RawFileReader.detectorType[ + IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).Detector + ] def GetNumberOfMassCalibratorsFromScanNum(self, scanNumber): """This function gets the number of mass calibrators (each of which is a double) in the scan.""" - return IScanEvent(self.source.GetScanEventForScanNumber(scanNumber)).MassCalibratorCount + return IScanEvent( + self.source.GetScanEventForScanNumber(scanNumber) + ).MassCalibratorCount def GetMassCalibrationValueFromScanNum(self, scanNumber, massCalibrationIndex): """This function retrieves information about one of the mass calibration data values of a scan. You can find the count of mass calibrations for the scan by calling GetNumberOfMassCalibratorsFromScanNum().""" - return IScanEvent(self.source.GetScanEventForScanNumber(scanNumber)).GetMassCalibrator(massCalibrationIndex) + return IScanEvent( + self.source.GetScanEventForScanNumber(scanNumber) + ).GetMassCalibrator(massCalibrationIndex) def GetMassResolution(self): """Gets the mass resolution value recorded for the current controller. The value is returned as one @@ -442,7 +489,7 @@ def ScanNumFromRTInSeconds(self, RTInSeconds): For non-scanning devices, such as UV, the closest reading number is returned. The value of RT must be within the acquisition run time for the current controller. The acquisition run time for the current controller may be obtained by calling GetStartTime and GetEndTime.""" - return self.ScanNumFromRT(RTInSeconds/60) + return self.ScanNumFromRT(RTInSeconds / 60) def RTFromScanNum(self, scanNumber): """Returns the closest matching run time or retention time that corresponds to scanNumber for @@ -454,7 +501,7 @@ def RTInSecondsFromScanNum(self, scanNumber): """Returns the closest matching run time or retention time that corresponds to scanNumber for the current controller. For non-scanning devices, such as UV, the scanNumber is the reading number.""" - return self.RTFromScanNum(scanNumber)*60 + return self.RTFromScanNum(scanNumber) * 60 def IsProfileScanForScanNum(self, scanNumber): """Returns TRUE if the scan specified by scanNumber is a profile scan, FALSE if the scan is a @@ -490,14 +537,20 @@ def GetNumberOfMSOrdersFromScanNum(self, scanNumber): """This function gets the number of MS reaction data items in the scan event for the scan specified by scanNumber and the transition specified by MSOrder from the scan event structure in the raw file.""" - return IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).MassCount + return IScanEventBase( + self.source.GetScanEventForScanNumber(scanNumber) + ).MassCount - def GetPrecursorMassForScanNum(self, scanNumber, MSOrder = 0): + def GetPrecursorMassForScanNum(self, scanNumber, MSOrder=0): """This function returns the precursor mass for the scan specified by scanNumber and the transition specified by MSOrder (0 for precursor in MS1?) from the scan event structure in the RAW file.""" - return IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)).GetReaction(MSOrder).PrecursorMass + return ( + IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)) + .GetReaction(MSOrder) + .PrecursorMass + ) - def GetPrecursorRangeForScanNum(self, scanNumber, MSOrder = 0): + def GetPrecursorRangeForScanNum(self, scanNumber, MSOrder=0): """This function returns the first and last precursor mass values of the range and whether they are valid for the scan specified by scanNumber and the transition specified by MSOrder (0 for precursor in MS1?) from the scan event structure in the raw file.""" scanEvent = IScanEventBase(self.source.GetScanEventForScanNumber(scanNumber)) @@ -521,8 +574,8 @@ def GetTrailerExtraForScanNum(self, scanNumber): def GetMS2MonoMzAndChargeFromScanNum(self, scanNumber): trailerData = self.GetTrailerExtraForScanNum(scanNumber) - mono = float(trailerData['Monoisotopic M/Z:'].strip()) - charge = int(trailerData['Charge State:'].strip()) + mono = float(trailerData["Monoisotopic M/Z:"].strip()) + charge = int(trailerData["Charge State:"].strip()) if mono < 1: mono = self.GetPrecursorMassForScanNum(scanNumber) if mono < 1: @@ -531,20 +584,48 @@ def GetMS2MonoMzAndChargeFromScanNum(self, scanNumber): def GetProfileMassListFromScanNum(self, scanNumber): scanStatistics = self.source.GetScanStatsForScanNumber(scanNumber) - segmentedScan = self.source.GetSegmentedScanFromScanNumber(scanNumber, scanStatistics) - return np.array([DotNetArrayToNPArray(segmentedScan.Positions, float), DotNetArrayToNPArray(segmentedScan.Intensities, float)]) + segmentedScan = self.source.GetSegmentedScanFromScanNumber( + scanNumber, scanStatistics + ) + return np.array( + [ + DotNetArrayToNPArray(segmentedScan.Positions, float), + DotNetArrayToNPArray(segmentedScan.Intensities, float), + ] + ) def GetCentroidMassListFromScanNum(self, scanNumber): scanStatistics = self.source.GetScanStatsForScanNumber(scanNumber) if scanStatistics.IsCentroidScan: - segmentedScan = self.source.GetSegmentedScanFromScanNumber(scanNumber, scanStatistics) - return np.array([DotNetArrayToNPArray(segmentedScan.Positions, float), DotNetArrayToNPArray(segmentedScan.Intensities, float)]) + segmentedScan = self.source.GetSegmentedScanFromScanNumber( + scanNumber, scanStatistics + ) + return np.array( + [ + DotNetArrayToNPArray(segmentedScan.Positions, float), + DotNetArrayToNPArray(segmentedScan.Intensities, float), + ] + ) else: - scan = ThermoFisher.CommonCore.Data.Business.Scan.FromFile(self.source, scanNumber) + scan = ThermoFisher.CommonCore.Data.Business.Scan.FromFile( + self.source, scanNumber + ) if scan.HasCentroidStream: stream = self.source.GetCentroidStream(scanNumber, False) - return np.array([DotNetArrayToNPArray(stream.Masses, float), DotNetArrayToNPArray(stream.Intensities, float)]) + return np.array( + [ + DotNetArrayToNPArray(stream.Masses, float), + DotNetArrayToNPArray(stream.Intensities, float), + ] + ) else: print("Profile scan {0} cannot be centroided!".format(scanNumber)) - segmentedScan = self.source.GetSegmentedScanFromScanNumber(scanNumber, scanStatistics) - return np.array([DotNetArrayToNPArray(segmentedScan.Positions, float), DotNetArrayToNPArray(segmentedScan.Intensities, float)]) + segmentedScan = self.source.GetSegmentedScanFromScanNumber( + scanNumber, scanStatistics + ) + return np.array( + [ + DotNetArrayToNPArray(segmentedScan.Positions, float), + DotNetArrayToNPArray(segmentedScan.Intensities, float), + ] + ) diff --git a/peptdeep/mass_spec/__init__.py b/peptdeep/mass_spec/__init__.py index eeecbe8d..702487cd 100644 --- a/peptdeep/mass_spec/__init__.py +++ b/peptdeep/mass_spec/__init__.py @@ -1,3 +1 @@ -from peptdeep.mass_spec import ( - match, ms_reader -) +from peptdeep.mass_spec import match, ms_reader diff --git a/peptdeep/mass_spec/mass_calibration.py b/peptdeep/mass_spec/mass_calibration.py index 488fd195..361e16b5 100644 --- a/peptdeep/mass_spec/mass_calibration.py +++ b/peptdeep/mass_spec/mass_calibration.py @@ -2,37 +2,44 @@ import pandas as pd import numpy as np -def get_fragment_median(start_end_idxes:tuple, frag_df:pd.DataFrame): + +def get_fragment_median(start_end_idxes: tuple, frag_df: pd.DataFrame): start_idx, end_idx = start_end_idxes ret = np.nanmedian(frag_df.values[start_idx:end_idx]) - if np.isnan(ret): return 0.0 - else: return ret + if np.isnan(ret): + return 0.0 + else: + return ret + def calibrate_one(start_end_shift, frag_df): start_idx, end_idx, mass_shift = start_end_shift - frag_df.values[int(start_idx):int(end_idx)] -= mass_shift + frag_df.values[int(start_idx) : int(end_idx)] -= mass_shift + class MassCalibratorForRT_KNN: - """Using KNN to calibrate measured m/z across RT. - """ + """Using KNN to calibrate measured m/z across RT.""" + def __init__(self, n_neighbors=5): self._n_neighbors = n_neighbors self.model = KNeighborsRegressor(n_neighbors) - def fit(self, psm_df:pd.DataFrame, mass_error_df:pd.DataFrame): + def fit(self, psm_df: pd.DataFrame, mass_error_df: pd.DataFrame): mass_error_df = mass_error_df.replace(np.inf, np.nan) - mean_merrs = psm_df[['frag_start_idx','frag_stop_idx']].apply( - get_fragment_median, axis=1, frag_df=mass_error_df - ).values - self.model.fit(psm_df.rt.values.reshape((-1,1)), mean_merrs.reshape(-1,1)) + mean_merrs = ( + psm_df[["frag_start_idx", "frag_stop_idx"]] + .apply(get_fragment_median, axis=1, frag_df=mass_error_df) + .values + ) + self.model.fit(psm_df.rt.values.reshape((-1, 1)), mean_merrs.reshape(-1, 1)) - def calibrate(self, - psm_df:pd.DataFrame, mass_error_df:pd.DataFrame - )->pd.DataFrame: - psm_df['frag_mass_shift'] = self.model.predict( - psm_df.rt.values.reshape((-1,1)) + def calibrate( + self, psm_df: pd.DataFrame, mass_error_df: pd.DataFrame + ) -> pd.DataFrame: + psm_df["frag_mass_shift"] = self.model.predict( + psm_df.rt.values.reshape((-1, 1)) ).reshape(-1) - psm_df[['frag_start_idx','frag_stop_idx','frag_mass_shift']].apply( + psm_df[["frag_start_idx", "frag_stop_idx", "frag_mass_shift"]].apply( calibrate_one, axis=1, frag_df=mass_error_df ).values return mass_error_df diff --git a/peptdeep/mass_spec/match.py b/peptdeep/mass_spec/match.py index d7137866..7e9f8041 100644 --- a/peptdeep/mass_spec/match.py +++ b/peptdeep/mass_spec/match.py @@ -5,18 +5,17 @@ from alphabase.peptide.fragment import ( create_fragment_mz_dataframe, - get_charged_frag_types -) -from peptdeep.mass_spec.ms_reader import ( - ms2_reader_provider, MSReaderBase + get_charged_frag_types, ) +from peptdeep.mass_spec.ms_reader import ms2_reader_provider, MSReaderBase + @numba.njit def match_centroid_mz( - spec_mzs:np.ndarray, - query_mzs:np.ndarray, - spec_mz_tols:np.ndarray, -)->np.ndarray: + spec_mzs: np.ndarray, + query_mzs: np.ndarray, + spec_mz_tols: np.ndarray, +) -> np.ndarray: """ Matching query masses against sorted MS2/spec centroid masses, only closest (minimal abs mass error) peaks are returned. @@ -42,13 +41,13 @@ def match_centroid_mz( idxes = np.searchsorted(spec_mzs, query_mzs) ret_indices = np.empty_like(query_mzs, dtype=np.int32) # ret_indices[:] = -1 - for i,idx in np.ndenumerate(idxes): - min_merr = abs(spec_mzs[idx-1]-query_mzs[i]) + for i, idx in np.ndenumerate(idxes): + min_merr = abs(spec_mzs[idx - 1] - query_mzs[i]) min_idx = -1 - if min_merr <= spec_mz_tols[idx-1]: - min_idx = idx-1 + if min_merr <= spec_mz_tols[idx - 1]: + min_idx = idx - 1 if idx < len(spec_mzs): - merr = abs(spec_mzs[idx]-query_mzs[i]) + merr = abs(spec_mzs[idx] - query_mzs[i]) if merr <= spec_mz_tols[idx] and merr < min_merr: min_idx = idx ret_indices[i] = min_idx @@ -57,11 +56,11 @@ def match_centroid_mz( @numba.njit def match_profile_mz( - spec_mzs:np.ndarray, - query_mzs:np.ndarray, - spec_mz_tols:np.ndarray, - spec_intens:np.ndarray, -)->np.ndarray: + spec_mzs: np.ndarray, + query_mzs: np.ndarray, + spec_mz_tols: np.ndarray, + spec_intens: np.ndarray, +) -> np.ndarray: """ Matching query masses against sorted MS2/spec profile masses, only highest peaks are returned. @@ -86,21 +85,21 @@ def match_profile_mz( """ idxes = np.searchsorted(spec_mzs, query_mzs) ret_indices = np.empty_like(query_mzs, dtype=np.int32) - for i,idx in np.ndenumerate(idxes): + for i, idx in np.ndenumerate(idxes): if idx == len(spec_mzs): - idx = idx-1 + idx = idx - 1 highest = 0 highest_idx = -1 for _idx in range(idx, -1, -1): - if abs(spec_mzs[_idx]-query_mzs[i])>spec_mz_tols[_idx]: + if abs(spec_mzs[_idx] - query_mzs[i]) > spec_mz_tols[_idx]: break if highest < spec_intens[_idx]: highest = spec_intens[_idx] highest_idx = _idx - for _idx in range(idx+1, len(spec_mzs)): - if abs(spec_mzs[_idx]-query_mzs[i])>spec_mz_tols[_idx]: + for _idx in range(idx + 1, len(spec_mzs)): + if abs(spec_mzs[_idx] - query_mzs[i]) > spec_mz_tols[_idx]: break if highest < spec_intens[_idx]: highest = spec_intens[_idx] @@ -108,12 +107,13 @@ def match_profile_mz( ret_indices[i] = highest_idx return ret_indices + @numba.njit def match_first_last_profile_mz( - spec_mzs:np.ndarray, - query_mzs:np.ndarray, - spec_mz_tols:np.ndarray, -)->np.ndarray: + spec_mzs: np.ndarray, + query_mzs: np.ndarray, + spec_mz_tols: np.ndarray, +) -> np.ndarray: """ Matching query masses against sorted MS2/spec profile masses, both first and last m/z values are returned. @@ -137,24 +137,20 @@ def match_first_last_profile_mz( -1 means no peaks are matched for the query mz """ idxes = np.searchsorted(spec_mzs, query_mzs) - first_indices = np.empty_like( - query_mzs, dtype=np.int32 - ) - last_indices = np.empty_like( - query_mzs, dtype=np.int32 - ) + first_indices = np.empty_like(query_mzs, dtype=np.int32) + last_indices = np.empty_like(query_mzs, dtype=np.int32) first_indices[:] = -1 last_indices[:] = -1 - for i,idx in np.ndenumerate(idxes): + for i, idx in np.ndenumerate(idxes): if idx == len(spec_mzs): - idx = idx-1 + idx = idx - 1 for _idx in range(idx, -1, -1): - if spec_mzs[_idx]spec_mz_tols[_idx]: + if abs(spec_mzs[_idx] - query_mzs[i]) > spec_mz_tols[_idx]: break else: last_indices[i] = _idx @@ -163,12 +159,18 @@ def match_first_last_profile_mz( @numba.njit def match_one_raw_with_numba( - spec_idxes, frag_start_idxes, frag_stop_idxes, + spec_idxes, + frag_start_idxes, + frag_stop_idxes, all_frag_mzs, - all_spec_mzs, all_spec_intensities, - peak_start_idxes, peak_end_idxes, - matched_intensities, matched_mz_errs, - ppm, tol, + all_spec_mzs, + all_spec_intensities, + peak_start_idxes, + peak_end_idxes, + matched_intensities, + matched_mz_errs, + ppm, + tol, ): """ Internel function to match fragment mz values to spectrum mz values. @@ -179,47 +181,45 @@ def match_one_raw_with_numba( ): peak_start = peak_start_idxes[spec_idx] peak_end = peak_end_idxes[spec_idx] - if peak_end == peak_start: continue + if peak_end == peak_start: + continue spec_mzs = all_spec_mzs[peak_start:peak_end] spec_intens = all_spec_intensities[peak_start:peak_end] if ppm: - spec_mz_tols = spec_mzs*tol*1e-6 + spec_mz_tols = spec_mzs * tol * 1e-6 else: spec_mz_tols = np.full_like(spec_mzs, tol) - frag_mzs = all_frag_mzs[frag_start:frag_end,:].copy() + frag_mzs = all_frag_mzs[frag_start:frag_end, :].copy() - matched_idxes = match_centroid_mz( - spec_mzs, frag_mzs, spec_mz_tols - ).reshape(-1) + matched_idxes = match_centroid_mz(spec_mzs, frag_mzs, spec_mz_tols).reshape(-1) matched_intens = spec_intens[matched_idxes] - matched_intens[matched_idxes==-1] = 0 + matched_intens[matched_idxes == -1] = 0 matched_mass_errs = np.abs( - spec_mzs[ - matched_idxes.reshape(-1) - ]-frag_mzs.reshape(-1) + spec_mzs[matched_idxes.reshape(-1)] - frag_mzs.reshape(-1) ) - matched_mass_errs[matched_idxes==-1] = np.inf + matched_mass_errs[matched_idxes == -1] = np.inf - matched_intensities[ - frag_start:frag_end,: - ] = matched_intens.reshape(frag_mzs.shape) + matched_intensities[frag_start:frag_end, :] = matched_intens.reshape( + frag_mzs.shape + ) - matched_mz_errs[ - frag_start:frag_end,: - ] = matched_mass_errs.reshape(frag_mzs.shape) + matched_mz_errs[frag_start:frag_end, :] = matched_mass_errs.reshape( + frag_mzs.shape + ) class PepSpecMatch(object): """Main entry for peptide-spectrum matching""" - def __init__(self, - charged_frag_types = get_charged_frag_types( - ['b','y','b_modloss','y_modloss'], - 2 - ) + + def __init__( + self, + charged_frag_types=get_charged_frag_types( + ["b", "y", "b_modloss", "y_modloss"], 2 + ), ): self.charged_frag_types = charged_frag_types @@ -227,16 +227,16 @@ def _preprocess_psms(self, psm_df): pass def get_fragment_mz_df(self, psm_df): - return create_fragment_mz_dataframe( - psm_df, self.charged_frag_types - ) + return create_fragment_mz_dataframe(psm_df, self.charged_frag_types) - def match_ms2_one_raw(self, + def match_ms2_one_raw( + self, psm_df_one_raw: pd.DataFrame, - ms2_file:str, - ms2_file_type:str='alphapept', - ppm:bool=True, tol:float=20.0, - )->tuple: + ms2_file: str, + ms2_file_type: str = "alphapept", + ppm: bool = True, + tol: float = 20.0, + ) -> tuple: """Matching psm_df_one_raw against ms2_file Parameters @@ -276,119 +276,99 @@ def match_ms2_one_raw(self, if isinstance(ms2_file, MSReaderBase): ms2_reader = ms2_file else: - ms2_reader = ms2_reader_provider.get_reader( - ms2_file_type - ) + ms2_reader = ms2_reader_provider.get_reader(ms2_file_type) ms2_reader.load(ms2_file) add_spec_info_list = [] - if 'rt_norm' not in psm_df.columns: - add_spec_info_list.append('rt') + if "rt_norm" not in psm_df.columns: + add_spec_info_list.append("rt") if ( - 'mobility' not in psm_df.columns and - 'mobility' in ms2_reader.spectrum_df.columns + "mobility" not in psm_df.columns + and "mobility" in ms2_reader.spectrum_df.columns ): - add_spec_info_list.append('mobility') + add_spec_info_list.append("mobility") - if ( - 'nce' not in psm_df.columns and - 'nce' in ms2_reader.spectrum_df.columns - ): - add_spec_info_list.append('nce') + if "nce" not in psm_df.columns and "nce" in ms2_reader.spectrum_df.columns: + add_spec_info_list.append("nce") if len(add_spec_info_list) > 0: # pfind does not report RT in the result file - psm_df = psm_df.reset_index().merge( - ms2_reader.spectrum_df[['spec_idx']+add_spec_info_list], - how='left', - on='spec_idx', - ).set_index('index') + psm_df = ( + psm_df.reset_index() + .merge( + ms2_reader.spectrum_df[["spec_idx"] + add_spec_info_list], + how="left", + on="spec_idx", + ) + .set_index("index") + ) - if 'rt' in add_spec_info_list: - psm_df['rt_norm'] = psm_df.rt/ms2_reader.spectrum_df.rt.max() + if "rt" in add_spec_info_list: + psm_df["rt_norm"] = psm_df.rt / ms2_reader.spectrum_df.rt.max() fragment_mz_df = self.get_fragment_mz_df(psm_df) matched_intensity_df = pd.DataFrame( - np.zeros_like( - fragment_mz_df.values, dtype=np.float64 - ), - columns=fragment_mz_df.columns + np.zeros_like(fragment_mz_df.values, dtype=np.float64), + columns=fragment_mz_df.columns, ) matched_mz_err_df = pd.DataFrame( - np.full_like( - fragment_mz_df.values, np.inf, dtype=np.float64 - ), - columns=fragment_mz_df.columns + np.full_like(fragment_mz_df.values, np.inf, dtype=np.float64), + columns=fragment_mz_df.columns, ) - for ( - spec_idx, frag_start_idx, frag_stop_idx - ) in psm_df[[ - 'spec_idx', 'frag_start_idx', - 'frag_stop_idx' - ]].values: - ( - spec_mzs, spec_intens - ) = ms2_reader.get_peaks(spec_idx) - if len(spec_mzs)==0: continue + for spec_idx, frag_start_idx, frag_stop_idx in psm_df[ + ["spec_idx", "frag_start_idx", "frag_stop_idx"] + ].values: + (spec_mzs, spec_intens) = ms2_reader.get_peaks(spec_idx) + if len(spec_mzs) == 0: + continue if ppm: - mz_tols = spec_mzs*tol*1e-6 + mz_tols = spec_mzs * tol * 1e-6 else: mz_tols = np.full_like(spec_mzs, tol) - frag_mzs = fragment_mz_df.values[ - frag_start_idx:frag_stop_idx,: - ] + frag_mzs = fragment_mz_df.values[frag_start_idx:frag_stop_idx, :] - matched_idxes = match_centroid_mz( - spec_mzs, frag_mzs, mz_tols - ) + matched_idxes = match_centroid_mz(spec_mzs, frag_mzs, mz_tols) matched_intens = spec_intens[matched_idxes] - matched_intens[matched_idxes==-1] = 0 + matched_intens[matched_idxes == -1] = 0 - matched_mz_errs = np.abs( - spec_mzs[matched_idxes]-frag_mzs - ) - matched_mz_errs[matched_idxes==-1] = np.inf + matched_mz_errs = np.abs(spec_mzs[matched_idxes] - frag_mzs) + matched_mz_errs[matched_idxes == -1] = np.inf - matched_intensity_df.values[ - frag_start_idx:frag_stop_idx,: - ] = matched_intens + matched_intensity_df.values[frag_start_idx:frag_stop_idx, :] = ( + matched_intens + ) - matched_mz_err_df.values[ - frag_start_idx:frag_stop_idx,: - ] = matched_mz_errs + matched_mz_err_df.values[frag_start_idx:frag_stop_idx, :] = matched_mz_errs - return ( - psm_df, fragment_mz_df, - matched_intensity_df, matched_mz_err_df - ) + return (psm_df, fragment_mz_df, matched_intensity_df, matched_mz_err_df) def _match_ms2_centroid_one_raw(self, raw_name, df_group): if raw_name in self._ms2_file_dict: if isinstance(self._ms2_file_dict[raw_name], MSReaderBase): ms2_reader = self._ms2_file_dict[raw_name] else: - ms2_reader = ms2_reader_provider.get_reader( - self._ms2_file_type - ) + ms2_reader = ms2_reader_provider.get_reader(self._ms2_file_type) ms2_reader.load(self._ms2_file_dict[raw_name]) if self.rt_not_in_df: # pfind does not report RT in the result file - _df = df_group.reset_index().merge( - ms2_reader.spectrum_df[['spec_idx','rt']], - how='left', - on='spec_idx', - ).set_index('index') + _df = ( + df_group.reset_index() + .merge( + ms2_reader.spectrum_df[["spec_idx", "rt"]], + how="left", + on="spec_idx", + ) + .set_index("index") + ) - _df['rt_norm'] = _df.rt/ms2_reader.spectrum_df.rt.max() - self.psm_df.loc[ - _df.index, ['rt','rt_norm'] - ] = _df[['rt','rt_norm']] + _df["rt_norm"] = _df.rt / ms2_reader.spectrum_df.rt.max() + self.psm_df.loc[_df.index, ["rt", "rt_norm"]] = _df[["rt", "rt_norm"]] match_one_raw_with_numba( df_group.spec_idx.values, @@ -401,14 +381,17 @@ def _match_ms2_centroid_one_raw(self, raw_name, df_group): ms2_reader.spectrum_df.peak_end_idx.values, self.matched_intensity_df.values, self.matched_mz_err_df.values, - self.ppm, self.tol + self.ppm, + self.tol, ) - def match_ms2_centroid(self, + def match_ms2_centroid( + self, psm_df: pd.DataFrame, - ms2_file_dict: dict, #raw_name: ms2_file_path or ms_reader object - ms2_file_type:str = 'alphapept', # or 'mgf', or 'thermo' - ppm=True, tol=20.0, + ms2_file_dict: dict, # raw_name: ms2_file_path or ms_reader object + ms2_file_type: str = "alphapept", # or 'mgf', or 'thermo' + ppm=True, + tol=20.0, ): """Matching PSM dataframe against the ms2 files in ms2_file_dict This method will store matched values as attributes: @@ -439,24 +422,20 @@ def match_ms2_centroid(self, self._preprocess_psms(psm_df) self.psm_df = psm_df - if 'frag_start_idx' in self.psm_df.columns: - del self.psm_df['frag_start_idx'] - del self.psm_df['frag_stop_idx'] + if "frag_start_idx" in self.psm_df.columns: + del self.psm_df["frag_start_idx"] + del self.psm_df["frag_stop_idx"] self.fragment_mz_df = self.get_fragment_mz_df(self.psm_df) self.matched_intensity_df = pd.DataFrame( - np.zeros_like( - self.fragment_mz_df.values, dtype=np.float64 - ), - columns=self.fragment_mz_df.columns + np.zeros_like(self.fragment_mz_df.values, dtype=np.float64), + columns=self.fragment_mz_df.columns, ) self.matched_mz_err_df = pd.DataFrame( - np.full_like( - self.fragment_mz_df.values, np.inf, dtype=np.float64 - ), - columns=self.fragment_mz_df.columns + np.full_like(self.fragment_mz_df.values, np.inf, dtype=np.float64), + columns=self.fragment_mz_df.columns, ) self._ms2_file_dict = ms2_file_dict @@ -464,11 +443,9 @@ def match_ms2_centroid(self, self.ppm = ppm self.tol = tol - if 'rt_norm' not in self.psm_df.columns: + if "rt_norm" not in self.psm_df.columns: self.rt_not_in_df = True else: self.rt_not_in_df = False - for raw_name, df_group in tqdm.tqdm( - self.psm_df.groupby('raw_name') - ): + for raw_name, df_group in tqdm.tqdm(self.psm_df.groupby("raw_name")): self._match_ms2_centroid_one_raw(raw_name, df_group) diff --git a/peptdeep/mass_spec/ms_reader.py b/peptdeep/mass_spec/ms_reader.py index c42941a0..2010da80 100644 --- a/peptdeep/mass_spec/ms_reader.py +++ b/peptdeep/mass_spec/ms_reader.py @@ -9,27 +9,31 @@ try: # should be replaced by AlphaRaw in the near future from peptdeep.legacy.thermo_raw.pyrawfilereader import RawFileReader -except (ImportError,AttributeError,RuntimeError) as e: +except (ImportError, AttributeError, RuntimeError) as e: frameinfo = getframeinfo(currentframe()) - logging.warn(f"{frameinfo.filename}#L{frameinfo.lineno}: Cannot import `RawFileReader`, check if PythonNet is installed. See https://github.com/MannLabs/alphapeptdeep#pip") + logging.warn( + f"{frameinfo.filename}#L{frameinfo.lineno}: Cannot import `RawFileReader`, check if PythonNet is installed. See https://github.com/MannLabs/alphapeptdeep#pip" + ) RawFileReader = None + class MSReaderBase: def __init__(self): - self.spectrum_df:pd.DataFrame = pd.DataFrame() - self.peak_df:pd.DataFrame = pd.DataFrame() + self.spectrum_df: pd.DataFrame = pd.DataFrame() + self.peak_df: pd.DataFrame = pd.DataFrame() # self.mzs: np.ndarray = np.array([]) # self.intensities: np.ndarray = np.array([]) def load(self, file_path): - raise NotImplementedError('load()') - - def build_spectrum_df(self, - scan_list:list, - scan_indices:np.ndarray, - rt_list:list, - mobility_list:list = None, - nce_list:list = None + raise NotImplementedError("load()") + + def build_spectrum_df( + self, + scan_list: list, + scan_indices: np.ndarray, + rt_list: list, + mobility_list: list = None, + nce_list: list = None, ): """Build spectrum_df by the given information @@ -49,6 +53,7 @@ def build_spectrum_df(self, mobility for each scan. Defaults to None. """ + def set_col(col, indexes, values, dtype, na_value): self.spectrum_df.loc[indexes, col] = values self.spectrum_df[col].fillna(na_value, inplace=True) @@ -58,18 +63,18 @@ def set_col(col, indexes, values, dtype, na_value): if scan_list.min() > 0: # thermo scan >= 1 scan_list -= 1 - idx_len = np.max(scan_list)+1 + idx_len = np.max(scan_list) + 1 self.spectrum_df = pd.DataFrame(index=np.arange(idx_len, dtype=np.int64)) - self.spectrum_df['spec_idx'] = self.spectrum_df.index.values - set_col('peak_start_idx', scan_list, scan_indices[:-1], np.int64, -1) - set_col('peak_end_idx', scan_list, scan_indices[1:], np.int64, -1) - set_col('rt', scan_list, rt_list, np.float64, np.nan) + self.spectrum_df["spec_idx"] = self.spectrum_df.index.values + set_col("peak_start_idx", scan_list, scan_indices[:-1], np.int64, -1) + set_col("peak_end_idx", scan_list, scan_indices[1:], np.int64, -1) + set_col("rt", scan_list, rt_list, np.float64, np.nan) if mobility_list is not None: - set_col('mobility', scan_list, mobility_list, np.float64, np.nan) + set_col("mobility", scan_list, mobility_list, np.float64, np.nan) if nce_list is not None: - set_col('nce', scan_list, nce_list, np.float64, np.nan) + set_col("nce", scan_list, nce_list, np.float64, np.nan) - def get_peaks(self, spec_idx:int): + def get_peaks(self, spec_idx: int): """Get peak (mz and intensity) values by `spec_idx` Parameters @@ -90,14 +95,14 @@ def get_peaks(self, spec_idx:int): if spec_idx not in self.spectrum_df.index: return None, None start_idx, end_idx = self.spectrum_df.loc[ - spec_idx, ['peak_start_idx','peak_end_idx'] + spec_idx, ["peak_start_idx", "peak_end_idx"] ].values.astype(np.int64) return ( self.peak_df.mz.values[start_idx:end_idx], - self.peak_df.intensity.values[start_idx:end_idx] + self.peak_df.intensity.values[start_idx:end_idx], ) - def get_peaks_by_scan_num(self, scan_num:int): + def get_peaks_by_scan_num(self, scan_num: int): """Get peak (mz and intensity) values by `spec_idx` Parameters @@ -114,29 +119,34 @@ def get_peaks_by_scan_num(self, scan_num:int): intensity values for the given spec_idx """ - return self.get_peaks(scan_num-1) + return self.get_peaks(scan_num - 1) + class AlphaPept_HDF_MS1_Reader(MSReaderBase): """MS1 from AlphaPept HDF""" + def load(self, file_path): hdf = HDF_File(file_path) - self.peak_df['mz'] = hdf.Raw.MS1_scans.mass_list_ms1.values - self.peak_df['intensity'] = hdf.Raw.MS1_scans.int_list_ms1.values + self.peak_df["mz"] = hdf.Raw.MS1_scans.mass_list_ms1.values + self.peak_df["intensity"] = hdf.Raw.MS1_scans.int_list_ms1.values self.build_spectrum_df( scan_list=hdf.Raw.MS1_scans.scan_list_ms1.values, scan_indices=hdf.Raw.MS1_scans.indices_ms1.values, rt_list=hdf.Raw.MS1_scans.rt_list_ms1.values, mobility_list=hdf.Raw.MS1_scans.mobility.values - if hasattr(hdf.Raw.MS1_scans, 'mobility') else None, + if hasattr(hdf.Raw.MS1_scans, "mobility") + else None, ) + class AlphaPept_HDF_MS2_Reader(MSReaderBase): """MS2 from AlphaPept HDF""" + def load(self, file_path): hdf = HDF_File(file_path) - self.peak_df['mz'] = hdf.Raw.MS2_scans.mass_list_ms2.values - self.peak_df['intensity'] = hdf.Raw.MS2_scans.int_list_ms2.values - if hasattr(hdf.Raw.MS2_scans, 'mobility2'): + self.peak_df["mz"] = hdf.Raw.MS2_scans.mass_list_ms2.values + self.peak_df["intensity"] = hdf.Raw.MS2_scans.int_list_ms2.values + if hasattr(hdf.Raw.MS2_scans, "mobility2"): scan_list = np.arange(len(hdf.Raw.MS2_scans.rt_list_ms2)) else: scan_list = hdf.Raw.MS2_scans.scan_list_ms2.values @@ -145,9 +155,11 @@ def load(self, file_path): scan_indices=hdf.Raw.MS2_scans.indices_ms2.values, rt_list=hdf.Raw.MS2_scans.rt_list_ms2.values, mobility_list=hdf.Raw.MS2_scans.mobility2.values - if hasattr(hdf.Raw.MS2_scans, 'mobility2') else None, + if hasattr(hdf.Raw.MS2_scans, "mobility2") + else None, ) + class MZMLReader(MSReaderBase): def load(self, mzmlF): if isinstance(mzmlF, str): @@ -161,12 +173,12 @@ def load(self, mzmlF): rt_list = [] nce_list = [] for entry in f: - if entry["ms level"] != 2: #only care about MS2 scans + if entry["ms level"] != 2: # only care about MS2 scans continue scan = int(entry["id"].split("scan=")[1]) if scan in scanset: continue - #accept only hcd and cid + # accept only hcd and cid filter_string = entry["scanList"]["scan"][0]["filter string"] if "@hcd" in filter_string: nce_list.append(filter_string.split("@hcd")[1].split(" ")[0]) @@ -185,38 +197,41 @@ def load(self, mzmlF): f.close() self.build_spectrum_df( - scan_list, - index_ragged_list(masses_list), - rt_list, - nce_list=nce_list + scan_list, index_ragged_list(masses_list), rt_list, nce_list=nce_list ) - self.peak_df['mz'] = np.concatenate(masses_list) - self.peak_df['intensity'] = np.concatenate(intens_list) + self.peak_df["mz"] = np.concatenate(masses_list) + self.peak_df["intensity"] = np.concatenate(intens_list) + def read_until(file, until): lines = [] while True: line = file.readline().strip() - if line == "": break + if line == "": + break elif line.startswith(until): break else: lines.append(line) return lines + def find_line(lines, start): for line in lines: if line.startswith(start): return line return None + def parse_pfind_scan_from_TITLE(pfind_title): - return int(pfind_title.split('.')[-4]) + return int(pfind_title.split(".")[-4]) + def is_pfind_mgf(mgf): - return mgf.upper().endswith('_HCDFT.MGF') + return mgf.upper().endswith("_HCDFT.MGF") + -def index_ragged_list(ragged_list: list) -> np.ndarray: +def index_ragged_list(ragged_list: list) -> np.ndarray: """Create lookup indices for a list of arrays for concatenation. Parameters @@ -236,8 +251,10 @@ def index_ragged_list(ragged_list: list) -> np.ndarray: return indices + class MGFReader(MSReaderBase): """MGF Reader (MS2)""" + def load(self, mgf): if isinstance(mgf, str): f = open(mgf) @@ -250,26 +267,28 @@ def load(self, mgf): rt_list = [] while True: line = f.readline() - if not line: break - if line.startswith('BEGIN IONS'): - lines = read_until(f, 'END IONS') + if not line: + break + if line.startswith("BEGIN IONS"): + lines = read_until(f, "END IONS") masses = [] intens = [] scan = None RT = 0 for line in lines: if line[0].isdigit(): - mass,inten = [float(i) for i in line.strip().split()] + mass, inten = [float(i) for i in line.strip().split()] masses.append(mass) intens.append(inten) - elif line.startswith('SCAN='): - scan = int(line.split('=')[1]) - elif line.startswith('RTINSECOND'): - RT = float(line.split('=')[1])/60 + elif line.startswith("SCAN="): + scan = int(line.split("=")[1]) + elif line.startswith("RTINSECOND"): + RT = float(line.split("=")[1]) / 60 if not scan: - title = find_line(lines, 'TITLE=') + title = find_line(lines, "TITLE=") scan = parse_pfind_scan_from_TITLE(title) - if scan in scanset: continue + if scan in scanset: + continue scanset.add(scan) scan_list.append(scan) rt_list.append(RT) @@ -277,39 +296,43 @@ def load(self, mgf): intens_list.append(np.array(intens)) if isinstance(mgf, str): f.close() - self.build_spectrum_df( - scan_list, - index_ragged_list(masses_list), - rt_list - ) - self.peak_df['mz'] = np.concatenate(masses_list) - self.peak_df['intensity'] = np.concatenate(intens_list) + self.build_spectrum_df(scan_list, index_ragged_list(masses_list), rt_list) + self.peak_df["mz"] = np.concatenate(masses_list) + self.peak_df["intensity"] = np.concatenate(intens_list) + class MSReaderProvider: """Factory class to register and get MS Readers""" + def __init__(self): self.reader_dict = {} + def register_reader(self, ms2_type, reader_class): self.reader_dict[ms2_type.lower()] = reader_class - def get_reader(self, file_type)->MSReaderBase: + def get_reader(self, file_type) -> MSReaderBase: if file_type not in self.reader_dict: frameinfo = getframeinfo(currentframe()) - logging.warn(f'{frameinfo.filename}#L{frameinfo.lineno}: "{file_type}" is not registered in `MSReaderProvider` yet.') + logging.warn( + f'{frameinfo.filename}#L{frameinfo.lineno}: "{file_type}" is not registered in `MSReaderProvider` yet.' + ) return None - else: return self.reader_dict[file_type.lower()]() + else: + return self.reader_dict[file_type.lower()]() + ms2_reader_provider = MSReaderProvider() -ms2_reader_provider.register_reader('mgf', MGFReader) -ms2_reader_provider.register_reader('alphapept', AlphaPept_HDF_MS2_Reader) -ms2_reader_provider.register_reader('alphapept_hdf', AlphaPept_HDF_MS2_Reader) -ms2_reader_provider.register_reader('mzml', MZMLReader) +ms2_reader_provider.register_reader("mgf", MGFReader) +ms2_reader_provider.register_reader("alphapept", AlphaPept_HDF_MS2_Reader) +ms2_reader_provider.register_reader("alphapept_hdf", AlphaPept_HDF_MS2_Reader) +ms2_reader_provider.register_reader("mzml", MZMLReader) ms1_reader_provider = MSReaderProvider() -ms1_reader_provider.register_reader('alphapept', AlphaPept_HDF_MS1_Reader) -ms1_reader_provider.register_reader('alphapept_hdf', AlphaPept_HDF_MS1_Reader) +ms1_reader_provider.register_reader("alphapept", AlphaPept_HDF_MS1_Reader) +ms1_reader_provider.register_reader("alphapept_hdf", AlphaPept_HDF_MS1_Reader) if RawFileReader is None: + class ThermoRawMS1Reader: def __init__(self): raise NotImplementedError("RawFileReader is not available") @@ -318,8 +341,10 @@ class ThermoRawMS2Reader: def __init__(self): raise NotImplementedError("RawFileReader is not available") else: + class ThermoRawMS1Reader(MSReaderBase): """Thermo Raw MS1 Reader""" + def __init__(self): super().__init__() self.profile_mode = False @@ -360,12 +385,13 @@ def load(self, raw_path): index_ragged_list(masses_list), rt_list, ) - self.peak_df['mz'] = np.concatenate(masses_list) - self.peak_df['intensity'] = np.concatenate(intens_list) + self.peak_df["mz"] = np.concatenate(masses_list) + self.peak_df["intensity"] = np.concatenate(intens_list) rawfile.Close() class ThermoRawMS2Reader(MSReaderBase): """Thermo RAW MS2 Reader""" + def __init__(self): super().__init__() self.profile_mode = False @@ -406,11 +432,11 @@ def load(self, raw_path): index_ragged_list(masses_list), rt_list, ) - self.peak_df['mz'] = np.concatenate(masses_list) - self.peak_df['intensity'] = np.concatenate(intens_list) + self.peak_df["mz"] = np.concatenate(masses_list) + self.peak_df["intensity"] = np.concatenate(intens_list) rawfile.Close() - ms2_reader_provider.register_reader('thermo', ThermoRawMS2Reader) - ms2_reader_provider.register_reader('thermo_raw', ThermoRawMS2Reader) - ms1_reader_provider.register_reader('thermo', ThermoRawMS1Reader) - ms1_reader_provider.register_reader('thermo_raw', ThermoRawMS1Reader) + ms2_reader_provider.register_reader("thermo", ThermoRawMS2Reader) + ms2_reader_provider.register_reader("thermo_raw", ThermoRawMS2Reader) + ms1_reader_provider.register_reader("thermo", ThermoRawMS1Reader) + ms1_reader_provider.register_reader("thermo_raw", ThermoRawMS1Reader) diff --git a/peptdeep/model/__init__.py b/peptdeep/model/__init__.py index af980f39..9a3c08be 100644 --- a/peptdeep/model/__init__.py +++ b/peptdeep/model/__init__.py @@ -1,6 +1,11 @@ from peptdeep.model import ( - model_interface, base, building_block, - ccs, rt, ms2, - generic_property_prediction, model_shop, - featurize + model_interface, + base, + building_block, + ccs, + rt, + ms2, + generic_property_prediction, + model_shop, + featurize, ) diff --git a/peptdeep/model/building_block.py b/peptdeep/model/building_block.py index e651bc83..58685a7b 100644 --- a/peptdeep/model/building_block.py +++ b/peptdeep/model/building_block.py @@ -1,6 +1,7 @@ import torch import numpy as np -#BERT from huggingface + +# BERT from huggingface from transformers.models.bert.modeling_bert import BertEncoder from peptdeep.settings import model_const @@ -8,40 +9,44 @@ torch.set_num_threads(2) -mod_feature_size = len(model_const['mod_elements']) -max_instrument_num = model_const['max_instrument_num'] -frag_types = settings['model']['frag_types'] -max_frag_charge = settings['model']['max_frag_charge'] -num_ion_types = len(frag_types)*max_frag_charge -aa_embedding_size = model_const['aa_embedding_size'] +mod_feature_size = len(model_const["mod_elements"]) +max_instrument_num = model_const["max_instrument_num"] +frag_types = settings["model"]["frag_types"] +max_frag_charge = settings["model"]["max_frag_charge"] +num_ion_types = len(frag_types) * max_frag_charge +aa_embedding_size = model_const["aa_embedding_size"] + def aa_embedding(hidden_size): return torch.nn.Embedding(aa_embedding_size, hidden_size, padding_idx=0) + def ascii_embedding(hidden_size): return torch.nn.Embedding(128, hidden_size, padding_idx=0) + def aa_one_hot(aa_indices, *cat_others): - aa_x = torch.nn.functional.one_hot( - aa_indices, aa_embedding_size - ) + aa_x = torch.nn.functional.one_hot(aa_indices, aa_embedding_size) return torch.cat((aa_x, *cat_others), 2) + def instrument_embedding(hidden_size): return torch.nn.Embedding(max_instrument_num, hidden_size) + def zero_param(*shape): return torch.nn.Parameter(torch.zeros(shape), requires_grad=False) + def xavier_param(*shape): x = torch.nn.Parameter(torch.empty(shape), requires_grad=False) torch.nn.init.xavier_uniform_(x) return x + def invert_attention_mask( - encoder_attention_mask:torch.Tensor, - dtype=torch.float32 -)->torch.FloatTensor: + encoder_attention_mask: torch.Tensor, dtype=torch.float32 +) -> torch.FloatTensor: """ See `invert_attention_mask()` in https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L737. Invert an attention mask (e.g., switches 0. and 1.). @@ -54,21 +59,27 @@ def invert_attention_mask( encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=dtype) # fp16 compatibility - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(dtype).min + encoder_extended_attention_mask = encoder_extended_attention_mask.to( + dtype=dtype + ) # fp16 compatibility + encoder_extended_attention_mask = ( + 1.0 - encoder_extended_attention_mask + ) * torch.finfo(dtype).min return encoder_extended_attention_mask init_state = xavier_param + class SeqCNN_MultiKernel(torch.nn.Module): """ Extract sequence features using `torch.nn.Conv1D` with different kernel sizes (1(residue connection),3,5,7), and then concatenate the outputs of these Conv1Ds. """ - def __init__(self, out_features:int): + + def __init__(self, out_features: int): """ Parameters ---------- @@ -83,51 +94,41 @@ def __init__(self, out_features:int): """ super().__init__() - hidden = out_features//4 - if hidden*4 != out_features: - raise ValueError('out_features must be divided by 4') + hidden = out_features // 4 + if hidden * 4 != out_features: + raise ValueError("out_features must be divided by 4") - self.cnn_short = torch.nn.Conv1d( - hidden, hidden, - kernel_size=3, padding=1 - ) - self.cnn_medium = torch.nn.Conv1d( - hidden, hidden, - kernel_size=5, padding=2 - ) - self.cnn_long = torch.nn.Conv1d( - hidden, hidden, - kernel_size=7, padding=3 - ) + self.cnn_short = torch.nn.Conv1d(hidden, hidden, kernel_size=3, padding=1) + self.cnn_medium = torch.nn.Conv1d(hidden, hidden, kernel_size=5, padding=2) + self.cnn_long = torch.nn.Conv1d(hidden, hidden, kernel_size=7, padding=3) def forward(self, x): x = x.transpose(1, 2) x1 = self.cnn_short(x) x2 = self.cnn_medium(x) x3 = self.cnn_long(x) - return torch.cat((x, x1, x2, x3), dim=1).transpose(1,2) + return torch.cat((x, x1, x2, x3), dim=1).transpose(1, 2) -#legacy + +# legacy class SeqCNN(torch.nn.Module): """ Extract sequence features using `torch.nn.Conv1D` with different kernel sizes (1(residue connection),3,5,7), and then concatenate the outputs of these Conv1Ds. The Output dim is 4*embedding_hidden. """ + def __init__(self, embedding_hidden): super().__init__() self.cnn_short = torch.nn.Conv1d( - embedding_hidden, embedding_hidden, - kernel_size=3, padding=1 + embedding_hidden, embedding_hidden, kernel_size=3, padding=1 ) self.cnn_medium = torch.nn.Conv1d( - embedding_hidden, embedding_hidden, - kernel_size=5, padding=2 + embedding_hidden, embedding_hidden, kernel_size=5, padding=2 ) self.cnn_long = torch.nn.Conv1d( - embedding_hidden, embedding_hidden, - kernel_size=7, padding=3 + embedding_hidden, embedding_hidden, kernel_size=7, padding=3 ) def forward(self, x): @@ -135,50 +136,47 @@ def forward(self, x): x1 = self.cnn_short(x) x2 = self.cnn_medium(x) x3 = self.cnn_long(x) - return torch.cat((x, x1, x2, x3), dim=1).transpose(1,2) + return torch.cat((x, x1, x2, x3), dim=1).transpose(1, 2) class Seq_Transformer(torch.nn.Module): """ Using PyTorch built-in Transformer layers """ - def __init__(self, - in_features, - hidden_features, - nheads=8, - nlayers=2, - dropout=0.1 - ): + + def __init__(self, in_features, hidden_features, nheads=8, nlayers=2, dropout=0.1): super().__init__() encoder_layers = torch.nn.TransformerEncoderLayer( in_features, nheads, hidden_features, dropout ) - self.transformer_encoder = torch.nn.TransformerEncoder( - encoder_layers, nlayers - ) + self.transformer_encoder = torch.nn.TransformerEncoder(encoder_layers, nlayers) def forward(self, x): - return self.transformer_encoder(x.permute(1,0,2)).permute(1,0,2) + return self.transformer_encoder(x.permute(1, 0, 2)).permute(1, 0, 2) class Hidden_Transformer(torch.nn.Module): """ Transformer NN based on pytorch's built-in TransformerLayer class """ - def __init__(self, - hidden, hidden_expand=4, - nheads=8, nlayers=4, dropout=0.1 - ): + + def __init__(self, hidden, hidden_expand=4, nheads=8, nlayers=4, dropout=0.1): super().__init__() self.transormer = Seq_Transformer( - hidden, hidden*hidden_expand, nheads=nheads, - nlayers=nlayers, dropout=dropout + hidden, + hidden * hidden_expand, + nheads=nheads, + nlayers=nlayers, + dropout=dropout, ) + def forward(self, x): return self.transormer(x) + class _Pseudo_Bert_Config: - def __init__(self, + def __init__( + self, hidden_dim=256, intermediate_size=1024, num_attention_heads=8, @@ -201,31 +199,40 @@ def __init__(self, self.num_attention_heads = num_attention_heads self.num_hidden_layers = num_bert_layers self.output_attentions = output_attentions - self._attn_implementation = "eager" # Add this for transformers-4.41.0 + self._attn_implementation = "eager" # Add this for transformers-4.41.0 + class Hidden_HFace_Transformer(torch.nn.Module): """ Transformer NN based on HuggingFace's BertEncoder class """ - def __init__(self, - hidden_dim, hidden_expand=4, - nheads=8, nlayers=4, dropout=0.1, + + def __init__( + self, + hidden_dim, + hidden_expand=4, + nheads=8, + nlayers=4, + dropout=0.1, output_attentions=False, ): super().__init__() self.config = _Pseudo_Bert_Config( hidden_dim=hidden_dim, - intermediate_size=hidden_dim*hidden_expand, + intermediate_size=hidden_dim * hidden_expand, num_attention_heads=nheads, num_bert_layers=nlayers, dropout=dropout, - output_attentions=False + output_attentions=False, ) self.output_attentions = output_attentions self.bert = BertEncoder(self.config) - def forward(self, x:torch.Tensor, - attention_mask:torch.Tensor=None, - )->tuple: + + def forward( + self, + x: torch.Tensor, + attention_mask: torch.Tensor = None, + ) -> tuple: """ Parameters ---------- @@ -242,18 +249,19 @@ def forward(self, x:torch.Tensor, if self.output_attentions==True """ if attention_mask is not None: - attention_mask = invert_attention_mask( - attention_mask, dtype=x.dtype - ) + attention_mask = invert_attention_mask(attention_mask, dtype=x.dtype) return self.bert( x, attention_mask=attention_mask, output_attentions=self.output_attentions, - return_dict=False + return_dict=False, ) -#legacy + + +# legacy HiddenBert = Hidden_HFace_Transformer + class HFace_Transformer_with_PositionalEncoder(torch.nn.Module): """ HuggingFace transformer with a positional encoder in front. @@ -281,23 +289,33 @@ class HFace_Transformer_with_PositionalEncoder(torch.nn.Module): max_len : int, optional Max input sequence length. Defaults to 200. """ - def __init__(self, - hidden_dim:int, hidden_expand=4, - nheads=8, nlayers=4, dropout=0.1, + + def __init__( + self, + hidden_dim: int, + hidden_expand=4, + nheads=8, + nlayers=4, + dropout=0.1, output_attentions=False, max_len=200, ): super().__init__() self.pos_encoder = PositionalEncoding(hidden_dim, max_len=max_len) self.bert = Hidden_HFace_Transformer( - hidden_dim=hidden_dim, hidden_expand=hidden_expand, - nheads=nheads, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions + hidden_dim=hidden_dim, + hidden_expand=hidden_expand, + nheads=nheads, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) - def forward(self, - x:torch.Tensor, - attention_mask:torch.Tensor=None, - )->tuple: + + def forward( + self, + x: torch.Tensor, + attention_mask: torch.Tensor = None, + ) -> tuple: """ Parameters ---------- @@ -313,83 +331,76 @@ def forward(self, x = self.pos_encoder(x) return self.bert(x, attention_mask) + class SeqLSTM(torch.nn.Module): """ returns LSTM applied on sequence input """ - def __init__(self, in_features, out_features, - rnn_layer=2, bidirectional=True - ): + + def __init__(self, in_features, out_features, rnn_layer=2, bidirectional=True): super().__init__() if bidirectional: - if out_features%2 != 0: + if out_features % 2 != 0: raise ValueError("'out_features' must be able to be divided by 2") - hidden = out_features//2 + hidden = out_features // 2 else: hidden = out_features - self.rnn_h0 = init_state( - rnn_layer+rnn_layer*bidirectional, - 1, hidden - ) - self.rnn_c0 = init_state( - rnn_layer+rnn_layer*bidirectional, - 1, hidden - ) + self.rnn_h0 = init_state(rnn_layer + rnn_layer * bidirectional, 1, hidden) + self.rnn_c0 = init_state(rnn_layer + rnn_layer * bidirectional, 1, hidden) self.rnn = torch.nn.LSTM( - input_size = in_features, - hidden_size = hidden, - num_layers = rnn_layer, - batch_first = True, - bidirectional = bidirectional, + input_size=in_features, + hidden_size=hidden, + num_layers=rnn_layer, + batch_first=True, + bidirectional=bidirectional, ) - def forward(self, x:torch.Tensor): + def forward(self, x: torch.Tensor): h0 = self.rnn_h0.repeat(1, x.size(0), 1) c0 = self.rnn_c0.repeat(1, x.size(0), 1) - x, _ = self.rnn(x, (h0,c0)) + x, _ = self.rnn(x, (h0, c0)) return x + class SeqGRU(torch.nn.Module): """ returns GRU applied on sequence input """ - def __init__(self, in_features, out_features, - rnn_layer=2, bidirectional=True - ): + + def __init__(self, in_features, out_features, rnn_layer=2, bidirectional=True): super().__init__() if bidirectional: - if out_features%2 != 0: + if out_features % 2 != 0: raise ValueError("'out_features' must be able to be divided by 2") # to make sure that output dim is out_features # as `bidirectional` will cat forward and reverse RNNs - hidden = out_features//2 + hidden = out_features // 2 else: hidden = out_features - self.rnn_h0 = init_state( - rnn_layer+rnn_layer*bidirectional, - 1, hidden - ) + self.rnn_h0 = init_state(rnn_layer + rnn_layer * bidirectional, 1, hidden) self.rnn = torch.nn.GRU( - input_size = in_features, - hidden_size = hidden, - num_layers = rnn_layer, - batch_first = True, - bidirectional = bidirectional, + input_size=in_features, + hidden_size=hidden, + num_layers=rnn_layer, + batch_first=True, + bidirectional=bidirectional, ) - def forward(self, x:torch.Tensor): + def forward(self, x: torch.Tensor): h0 = self.rnn_h0.repeat(1, x.size(0), 1) x, _ = self.rnn(x, h0) return x + class SeqAttentionSum(torch.nn.Module): """ apply linear transformation and tensor rescaling with softmax """ + def __init__(self, in_features): super().__init__() self.attn = torch.nn.Sequential( @@ -401,210 +412,212 @@ def forward(self, x): attn = self.attn(x) return torch.sum(torch.mul(x, attn), dim=1) + class PositionalEncoding(torch.nn.Module): """ transform sequence input into a positional representation """ - def __init__(self, out_features=128, max_len = 200): + + def __init__(self, out_features=128, max_len=200): super().__init__() position = torch.arange(max_len).unsqueeze(1) div_term = torch.exp( - torch.arange( - 0, out_features, 2 - ) * (-np.log(max_len) / out_features) + torch.arange(0, out_features, 2) * (-np.log(max_len) / out_features) ) pe = torch.zeros(1, max_len, out_features) pe[0, :, 0::2] = torch.sin(position * div_term) pe[0, :, 1::2] = torch.cos(position * div_term) - self.register_buffer('pe', pe) + self.register_buffer("pe", pe) def forward(self, x): - return x + self.pe[:,:x.size(1),:] + return x + self.pe[:, : x.size(1), :] + class PositionalEmbedding(torch.nn.Module): """ transform sequence with the standard embedding function """ + def __init__(self, out_features=128, max_len=200): super().__init__() - self.pos_emb = torch.nn.Embedding( - max_len, out_features + self.pos_emb = torch.nn.Embedding(max_len, out_features) + + def forward(self, x: torch.Tensor): + return x + self.pos_emb( + torch.arange(x.size(1), dtype=torch.long, device=x.device).unsqueeze(0) ) - def forward(self, x:torch.Tensor): - return x + self.pos_emb(torch.arange( - x.size(1), dtype=torch.long, device=x.device - ).unsqueeze(0)) class Meta_Embedding(torch.nn.Module): """Encodes Charge state, Normalized Collision Energy (NCE) and Instrument for a given spectrum into a 'meta' single layer network """ - def __init__(self, + + def __init__( + self, out_features, ): super().__init__() - self.nn = torch.nn.Linear( - max_instrument_num+1, out_features-1 - ) + self.nn = torch.nn.Linear(max_instrument_num + 1, out_features - 1) - def forward(self, - charges, NCEs, instrument_indices, + def forward( + self, + charges, + NCEs, + instrument_indices, ): - inst_x = torch.nn.functional.one_hot( - instrument_indices, max_instrument_num - ) + inst_x = torch.nn.functional.one_hot(instrument_indices, max_instrument_num) meta_x = self.nn(torch.cat((inst_x, NCEs), 1)) meta_x = torch.cat((meta_x, charges), 1) return meta_x -#legacy + + +# legacy InputMetaNet = Meta_Embedding + class Mod_Embedding_FixFirstK(torch.nn.Module): """ Encodes the modification vector in a single layer feed forward network, but not transforming the first k features """ - def __init__(self, + + def __init__( + self, out_features, ): super().__init__() self.k = 6 self.nn = torch.nn.Linear( - mod_feature_size-self.k, out_features-self.k, - bias=False + mod_feature_size - self.k, out_features - self.k, bias=False ) - def forward(self, + def forward( + self, mod_x, ): - return torch.cat(( - mod_x[:,:,:self.k], - self.nn(mod_x[:,:,self.k:]) - ), 2) -#legacy + return torch.cat((mod_x[:, :, : self.k], self.nn(mod_x[:, :, self.k :])), 2) + + +# legacy InputModNetFixFirstK = Mod_Embedding_FixFirstK + class AA_Mod_Embedding(torch.nn.Module): """ Concatenates the AA (128 ASCII codes) embedding with the modifcation vector """ - def __init__(self, + + def __init__( + self, out_features, - mod_feature_size = 8, + mod_feature_size=8, ): super().__init__() - self.mod_embedding = Mod_Embedding_FixFirstK( - mod_feature_size - ) - self.aa_embedding = ascii_embedding( - out_features-mod_feature_size - ) + self.mod_embedding = Mod_Embedding_FixFirstK(mod_feature_size) + self.aa_embedding = ascii_embedding(out_features - mod_feature_size) + def forward(self, aa_indices, mod_x): mod_x = self.mod_embedding(mod_x) aa_x = self.aa_embedding(aa_indices) return torch.cat((aa_x, mod_x), 2) -#legacy + + +# legacy InputAAEmbedding = AA_Mod_Embedding + class Mod_Embedding(torch.nn.Module): """ Encodes the modification vector in a single layer feed forward network """ - def __init__(self, + + def __init__( + self, out_features, ): super().__init__() - self.nn = torch.nn.Linear( - mod_feature_size, out_features, - bias=False - ) + self.nn = torch.nn.Linear(mod_feature_size, out_features, bias=False) - def forward(self, + def forward( + self, mod_x, ): return self.nn(mod_x) -#legacy + + +# legacy InputModNet = Mod_Embedding + class Input_26AA_Mod_PositionalEncoding(torch.nn.Module): """ Encodes AA (26 AA letters) and modification vector """ + def __init__(self, out_features, max_len=200): super().__init__() mod_hidden = 8 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) - self.aa_emb = aa_embedding( - out_features-mod_hidden - ) - self.pos_encoder = PositionalEncoding( - out_features, max_len - ) + self.aa_emb = aa_embedding(out_features - mod_hidden) + self.pos_encoder = PositionalEncoding(out_features, max_len) - def forward(self, - aa_indices, mod_x - ): + def forward(self, aa_indices, mod_x): mod_x = self.mod_nn(mod_x) x = self.aa_emb(aa_indices) return self.pos_encoder(torch.cat((x, mod_x), 2)) -#legacy + + +# legacy AATransformerEncoding = Input_26AA_Mod_PositionalEncoding + class Input_AA_Mod_PositionalEncoding(torch.nn.Module): """ Encodes AA (ASCII codes) and modification vector """ + def __init__(self, out_features, max_len=200): super().__init__() mod_hidden = 8 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) - self.aa_emb = ascii_embedding( - out_features-mod_hidden - ) - self.pos_encoder = PositionalEncoding( - out_features, max_len - ) + self.aa_emb = ascii_embedding(out_features - mod_hidden) + self.pos_encoder = PositionalEncoding(out_features, max_len) - def forward(self, - aa_indices, mod_x - ): + def forward(self, aa_indices, mod_x): mod_x = self.mod_nn(mod_x) x = self.aa_emb(aa_indices) return self.pos_encoder(torch.cat((x, mod_x), 2)) + class Input_AA_Mod_Charge_PositionalEncoding(torch.nn.Module): """ Embed AA (128 ASCII codes), modification, and charge state """ + def __init__(self, out_features, max_len=200): super().__init__() mod_hidden = 8 self.charge_dim = 2 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) - self.aa_emb = ascii_embedding( - out_features-mod_hidden-self.charge_dim - ) - self.pos_encoder = PositionalEncoding( - out_features, max_len - ) + self.aa_emb = ascii_embedding(out_features - mod_hidden - self.charge_dim) + self.pos_encoder = PositionalEncoding(out_features, max_len) - def forward(self, - aa_indices, mod_x, charges - ): + def forward(self, aa_indices, mod_x, charges): mod_x = self.mod_nn(mod_x) x = self.aa_emb(aa_indices) - charge_x = charges[:, None, None].repeat( - 1, mod_x.size(1), self.charge_dim - ) + charge_x = charges[:, None, None].repeat(1, mod_x.size(1), self.charge_dim) return self.pos_encoder(torch.cat((x, mod_x, charge_x), 2)) + class Input_26AA_Mod_LSTM(torch.nn.Module): """ Applies an LSTM network to a AA (26 AA letters) sequence & modifications """ - def __init__(self, + + def __init__( + self, out_features, n_lstm_layers=1, ): @@ -612,16 +625,19 @@ def __init__(self, mod_hidden = 8 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) self.lstm = SeqLSTM( - aa_embedding_size+mod_hidden, + aa_embedding_size + mod_hidden, out_features, n_lstm_layers=n_lstm_layers, - bidirectional=True + bidirectional=True, ) + def forward(self, aa_indices, mod_x): mod_x = self.mod_nn(mod_x) x = aa_one_hot(aa_indices, mod_x) return self.lstm(x) -#legacy + + +# legacy InputAALSTM = Input_26AA_Mod_LSTM @@ -630,7 +646,9 @@ class Input_26AA_Mod_Meta_LSTM(torch.nn.Module): Applies a LSTM network to a AA (26 AA letters) sequence and modifications, and concatenates with 'meta' information (charge, nce, instrument_indices) """ - def __init__(self, + + def __init__( + self, out_features, ): super().__init__() @@ -639,22 +657,25 @@ def __init__(self, self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) self.meta_nn = Meta_Embedding(meta_dim) self.nn = SeqLSTM( - aa_embedding_size+mod_hidden, - out_features-meta_dim, - rnn_layer=1, bidirectional=True + aa_embedding_size + mod_hidden, + out_features - meta_dim, + rnn_layer=1, + bidirectional=True, ) - def forward(self, - aa_indices, mod_x, charges, NCEs, instrument_indices - ): + def forward(self, aa_indices, mod_x, charges, NCEs, instrument_indices): mod_x = self.mod_nn(mod_x) x = aa_one_hot(aa_indices, mod_x) x = self.nn(x) - meta_x = self.meta_nn( - charges, NCEs, instrument_indices - ).unsqueeze(1).repeat(1, mod_x.size(1), 1) + meta_x = ( + self.meta_nn(charges, NCEs, instrument_indices) + .unsqueeze(1) + .repeat(1, mod_x.size(1), 1) + ) return torch.cat((x, meta_x), 2) -#legacy + + +# legacy InputAALSTM_cat_Meta = Input_26AA_Mod_Meta_LSTM @@ -663,7 +684,9 @@ class Input_26AA_Mod_Charge_LSTM(torch.nn.Module): Applies a LSTM network to a AA (26 AA letters) sequence and modifications, and concatenates with charge state information """ - def __init__(self, + + def __init__( + self, out_features, ): super().__init__() @@ -671,20 +694,21 @@ def __init__(self, mod_hidden = 8 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) self.nn = SeqLSTM( - aa_embedding_size+mod_hidden, - out_features-self.charge_dim, - rnn_layer=1, bidirectional=True + aa_embedding_size + mod_hidden, + out_features - self.charge_dim, + rnn_layer=1, + bidirectional=True, ) def forward(self, aa_indices, mod_x, charges): mod_x = self.mod_nn(mod_x) x = aa_one_hot(aa_indices, mod_x) x = self.nn(x) - charge_x = charges.unsqueeze(1).repeat( - 1, mod_x.size(1), self.charge_dim - ) + charge_x = charges.unsqueeze(1).repeat(1, mod_x.size(1), self.charge_dim) return torch.cat((x, charge_x), 2) -#legacy + + +# legacy InputAALSTM_cat_Charge = Input_26AA_Mod_Charge_LSTM @@ -693,7 +717,9 @@ class Seq_Meta_LSTM(torch.nn.Module): Takes a hidden layer which processes the hidden tensor as well as the 'meta' information of NCE, Instrument, Charge """ - def __init__(self, + + def __init__( + self, in_features, out_features, ): @@ -701,63 +727,68 @@ def __init__(self, meta_dim = 4 self.meta_nn = Meta_Embedding(meta_dim) self.nn = SeqLSTM( - in_features+meta_dim, - out_features, - rnn_layer=1, bidirectional=False + in_features + meta_dim, out_features, rnn_layer=1, bidirectional=False ) def forward(self, x, charges, NCEs, instrument_indices): - meta_x = self.meta_nn( - charges, NCEs, instrument_indices - ).unsqueeze(1).repeat(1, x.size(1), 1) + meta_x = ( + self.meta_nn(charges, NCEs, instrument_indices) + .unsqueeze(1) + .repeat(1, x.size(1), 1) + ) return self.nn(torch.cat((x, meta_x), 2)) -#legacy + + +# legacy OutputLSTM_cat_Meta = Seq_Meta_LSTM + class Seq_Meta_Linear(torch.nn.Module): """ takes a hidden linear which processed the 'meta' information of NCE, Instrument, Charge """ - def __init__(self, + + def __init__( + self, in_features, out_features, ): super().__init__() meta_dim = 4 self.meta_nn = Meta_Embedding(meta_dim) - self.nn = torch.nn.Linear( - in_features+meta_dim, - out_features, - bias=False - ) + self.nn = torch.nn.Linear(in_features + meta_dim, out_features, bias=False) def forward(self, x, charges, NCEs, instrument_indices): - meta_x = self.meta_nn( - charges, NCEs, instrument_indices - ).unsqueeze(1).repeat(1, x.size(1), 1) + meta_x = ( + self.meta_nn(charges, NCEs, instrument_indices) + .unsqueeze(1) + .repeat(1, x.size(1), 1) + ) return self.nn(torch.cat((x, meta_x), 2)) -#legacy + + +# legacy OutputLinear_cat_Meta = Seq_Meta_Linear + class Encoder_26AA_Mod_LSTM(torch.nn.Module): """ Two LSTM layers on AA (26 AA letters) and modifications. """ + def __init__(self, out_features, n_lstm_layers=1): super().__init__() self.input_nn = Input_26AA_Mod_LSTM(out_features) - self.nn = SeqLSTM( - out_features, out_features, - rnn_layer=n_lstm_layers - ) + self.nn = SeqLSTM(out_features, out_features, rnn_layer=n_lstm_layers) def forward(self, aa_indices, mod_x): x = self.input_nn(aa_indices, mod_x) x = self.nn(x) return x -#legacy + +# legacy Input_AA_LSTM_Encoder = Encoder_26AA_Mod_LSTM @@ -765,17 +796,17 @@ class Encoder_26AA_Mod_CNN_LSTM(torch.nn.Module): """ Encode AAs (26 AA letters) and modifications by CNN and LSTM layers """ + def __init__(self, out_features, n_lstm_layers=1): super().__init__() mod_hidden = 8 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) - input_dim = aa_embedding_size+mod_hidden + input_dim = aa_embedding_size + mod_hidden self.input_cnn = SeqCNN(input_dim) self.hidden_nn = SeqLSTM( - input_dim*4, out_features, - rnn_layer=n_lstm_layers - ) #SeqCNN outputs 4*input_dim + input_dim * 4, out_features, rnn_layer=n_lstm_layers + ) # SeqCNN outputs 4*input_dim def forward(self, aa_indices, mod_x): mod_x = self.mod_nn(mod_x) @@ -784,26 +815,28 @@ def forward(self, aa_indices, mod_x): x = self.hidden_nn(x) return x -#legacy + +# legacy Input_AA_CNN_Encoder = Encoder_26AA_Mod_CNN_LSTM + class Encoder_26AA_Mod_CNN_LSTM_AttnSum(torch.nn.Module): """ Encode AAs (26 AA letters) and modifications by CNN and LSTM layers, then by 'SeqAttentionSum'. """ + def __init__(self, out_features, n_lstm_layers=2): super().__init__() mod_hidden = 8 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) - input_dim = aa_embedding_size+mod_hidden + input_dim = aa_embedding_size + mod_hidden self.input_cnn = SeqCNN(input_dim) self.hidden_nn = SeqLSTM( - input_dim*4, out_features, - rnn_layer=n_lstm_layers - ) #SeqCNN outputs 4*input_dim + input_dim * 4, out_features, rnn_layer=n_lstm_layers + ) # SeqCNN outputs 4*input_dim self.attn_sum = SeqAttentionSum(out_features) def forward(self, aa_indices, mod_x): @@ -813,27 +846,28 @@ def forward(self, aa_indices, mod_x): x = self.hidden_nn(x) x = self.attn_sum(x) return x -#legacy + + +# legacy Input_AA_CNN_LSTM_Encoder = Encoder_26AA_Mod_CNN_LSTM_AttnSum + class Encoder_AA_Mod_CNN_LSTM_AttnSum(torch.nn.Module): """ Encode AAs (128 ASCII codes) and modifications by CNN and LSTM layers, and then by 'SeqAttentionSum'. """ + def __init__(self, out_features, n_lstm_layers=2): super().__init__() mod_hidden = 8 - input_dim = out_features//4 - self.aa_mod_embedding = AA_Mod_Embedding( - input_dim, mod_feature_size=mod_hidden - ) + input_dim = out_features // 4 + self.aa_mod_embedding = AA_Mod_Embedding(input_dim, mod_feature_size=mod_hidden) self.input_cnn = SeqCNN(input_dim) self.hidden_nn = SeqLSTM( - input_dim*4, out_features, - rnn_layer=n_lstm_layers - ) #SeqCNN outputs 4*input_dim + input_dim * 4, out_features, rnn_layer=n_lstm_layers + ) # SeqCNN outputs 4*input_dim self.attn_sum = SeqAttentionSum(out_features) def forward(self, aa_indices, mod_x): @@ -849,11 +883,8 @@ class Encoder_AA_Mod_Transformer(torch.nn.Module): AAs (128 ASCII codes) and modifications embedded by Bert, then encoded by 'SeqAttentionSum'. """ - def __init__(self,out_features, - dropout=0.1, - nlayers=4, - output_attentions=False - ): + + def __init__(self, out_features, dropout=0.1, nlayers=4, output_attentions=False): super().__init__() self.dropout = torch.nn.Dropout(dropout) @@ -862,15 +893,14 @@ def __init__(self,out_features, self.output_attentions = output_attentions self.encoder = Hidden_HFace_Transformer( - out_features, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions - ) - def forward(self, aa_indices, mod_x, - attention_mask=None - ): - x = self.input_nn( - aa_indices, mod_x + out_features, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) + + def forward(self, aa_indices, mod_x, attention_mask=None): + x = self.input_nn(aa_indices, mod_x) x = self.dropout(x) x = self.encoder(x, attention_mask) @@ -880,22 +910,22 @@ def forward(self, aa_indices, mod_x, self.attentions = None return x[0] + class Encoder_AA_Mod_Transformer_AttnSum(torch.nn.Module): """ Encode AAs (128 ASCII codes) and modifications by transformers. """ - def __init__(self,out_features, - dropout=0.1, - nlayers=4, - output_attentions=False - ): + + def __init__(self, out_features, dropout=0.1, nlayers=4, output_attentions=False): super().__init__() self.dropout = torch.nn.Dropout(dropout) self.encoder_nn = Encoder_AA_Mod_Transformer( - out_features, dropout=dropout, nlayers=nlayers, - output_attentions=output_attentions + out_features, + dropout=dropout, + nlayers=nlayers, + output_attentions=output_attentions, ) self.attn_sum = SeqAttentionSum(out_features) @@ -903,15 +933,13 @@ def forward(self, aa_indices, mod_x): x = self.encoder_nn(aa_indices, mod_x) return self.dropout(self.attn_sum(x)) + class Encoder_AA_Mod_Charge_Transformer(torch.nn.Module): """ Encode AAs (128 ASCII codes), modifications and charge by transformers. """ - def __init__(self,out_features, - dropout=0.1, - nlayers=4, - output_attentions=False - ): + + def __init__(self, out_features, dropout=0.1, nlayers=4, output_attentions=False): super().__init__() self.dropout = torch.nn.Dropout(dropout) @@ -919,15 +947,20 @@ def __init__(self,out_features, self.output_attentions = output_attentions self.encoder = Hidden_HFace_Transformer( - out_features, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions + out_features, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) - def forward(self, aa_indices, mod_x, charges, + + def forward( + self, + aa_indices, + mod_x, + charges, attention_mask=None, ): - x = self.input_nn( - aa_indices, mod_x, charges - ) + x = self.input_nn(aa_indices, mod_x, charges) x = self.dropout(x) x = self.encoder(x, attention_mask) @@ -937,59 +970,62 @@ def forward(self, aa_indices, mod_x, charges, self.attentions = None return x[0] + class Encoder_AA_Mod_Charge_Transformer_AttnSum(torch.nn.Module): """ Encode AAs (128 ASCII codes), modifications and charge by transformers, and then by 'SeqAttentionSum' """ - def __init__(self,out_features, - dropout=0.1, - nlayers=4, - output_attentions=False - ): + + def __init__(self, out_features, dropout=0.1, nlayers=4, output_attentions=False): super().__init__() self.dropout = torch.nn.Dropout(dropout) self.encoder_nn = Encoder_AA_Mod_Charge_Transformer( - out_features, dropout=dropout, nlayers=nlayers, - output_attentions=output_attentions + out_features, + dropout=dropout, + nlayers=nlayers, + output_attentions=output_attentions, ) self.attn_sum = SeqAttentionSum(out_features) + def forward(self, aa_indices, mod_x, charges): x = self.encoder_nn(aa_indices, mod_x, charges) return self.dropout(self.attn_sum(x)) + class Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum(torch.nn.Module): """ Encode AAs (26 AA letters), modifications and charge by transformers, and then by 'SeqAttentionSum' """ + def __init__(self, out_features): super().__init__() mod_hidden = 8 self.mod_nn = Mod_Embedding_FixFirstK(mod_hidden) - input_dim = aa_embedding_size+mod_hidden+1 + input_dim = aa_embedding_size + mod_hidden + 1 self.input_cnn = SeqCNN(input_dim) self.hidden_nn = SeqLSTM( - input_dim*4, out_features, rnn_layer=2 - ) #SeqCNN outputs 4*input_dim + input_dim * 4, out_features, rnn_layer=2 + ) # SeqCNN outputs 4*input_dim self.attn_sum = SeqAttentionSum(out_features) def forward(self, aa_indices, mod_x, charges): mod_x = self.mod_nn(mod_x) x = aa_one_hot( - aa_indices, mod_x, - charges.unsqueeze(1).repeat(1,mod_x.size(1),1) + aa_indices, mod_x, charges.unsqueeze(1).repeat(1, mod_x.size(1), 1) ) x = self.input_cnn(x) x = self.hidden_nn(x) x = self.attn_sum(x) return x -#legacy + +# legacy Input_AA_CNN_LSTM_cat_Charge_Encoder = Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum @@ -997,58 +1033,63 @@ class Decoder_LSTM(torch.nn.Module): """ Decode with LSTM """ + def __init__(self, in_features, out_features): super().__init__() hidden = 128 self.rnn = SeqLSTM( - in_features, out_features, - rnn_layer=1, bidirectional=False, + in_features, + out_features, + rnn_layer=1, + bidirectional=False, ) - self.output_nn = torch.nn.Linear( - hidden, out_features, bias=False - ) + self.output_nn = torch.nn.Linear(hidden, out_features, bias=False) - def forward(self, x:torch.tensor, output_len): - x = self.rnn( - x.unsqueeze(1).repeat(1,output_len,1) - ) + def forward(self, x: torch.tensor, output_len): + x = self.rnn(x.unsqueeze(1).repeat(1, output_len, 1)) x = self.output_nn(x) return x -#legacy + + +# legacy SeqLSTMDecoder = Decoder_LSTM + class Decoder_GRU(torch.nn.Module): """ Decode with GRU """ + def __init__(self, in_features, out_features): super().__init__() hidden = 128 self.rnn = SeqGRU( - in_features, out_features, - rnn_layer=1, bidirectional=False, + in_features, + out_features, + rnn_layer=1, + bidirectional=False, ) - self.output_nn = torch.nn.Linear( - hidden, out_features, bias=False - ) + self.output_nn = torch.nn.Linear(hidden, out_features, bias=False) - def forward(self, x:torch.tensor, output_len): - x = self.rnn( - x.unsqueeze(1).repeat(1,output_len,1) - ) + def forward(self, x: torch.tensor, output_len): + x = self.rnn(x.unsqueeze(1).repeat(1, output_len, 1)) x = self.output_nn(x) return x -#legacy + + +# legacy SeqGRUDecoder = Decoder_GRU + class Decoder_Linear(torch.nn.Module): """ Decode w linear NN """ + def __init__(self, in_features, out_features): super().__init__() @@ -1060,5 +1101,7 @@ def __init__(self, in_features, out_features): def forward(self, x): return self.nn(x) -#legacy + + +# legacy LinearDecoder = Decoder_Linear diff --git a/peptdeep/model/ccs.py b/peptdeep/model/ccs.py index 0e3ac282..16940823 100644 --- a/peptdeep/model/ccs.py +++ b/peptdeep/model/ccs.py @@ -3,10 +3,7 @@ import typing -from alphabase.peptide.mobility import ( - ccs_to_mobility_for_df, - mobility_to_ccs_for_df -) +from alphabase.peptide.mobility import ccs_to_mobility_for_df, mobility_to_ccs_for_df import peptdeep.model.base as model_base from peptdeep.utils import evaluate_linear_regression @@ -16,10 +13,12 @@ class Model_CCS_Bert(torch.nn.Module): """ Transformer model for CCS prediction """ - def __init__(self, - dropout = 0.1, - nlayers = 4, - hidden = 128, + + def __init__( + self, + dropout=0.1, + nlayers=4, + hidden=128, output_attentions=False, **kwargs, ): @@ -27,13 +26,15 @@ def __init__(self, self.dropout = torch.nn.Dropout(dropout) - self.input_nn = model_base.AATransformerEncoding(hidden-2) + self.input_nn = model_base.AATransformerEncoding(hidden - 2) self._output_attentions = output_attentions self.hidden_nn = model_base.Hidden_HFace_Transformer( - hidden, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions + hidden, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) self.output_nn = torch.nn.Sequential( @@ -48,133 +49,113 @@ def output_attentions(self): return self._output_attentions @output_attentions.setter - def output_attentions(self, val:bool): + def output_attentions(self, val: bool): self._output_attentions = val self.hidden_nn.output_attentions = val - def forward(self, + def forward( + self, aa_indices, mod_x, - charges:torch.Tensor, + charges: torch.Tensor, ): - x = self.dropout(self.input_nn( - aa_indices, mod_x - )) - charges = charges.unsqueeze(1).repeat(1,x.size(1),2) - x = torch.cat((x, charges),2) + x = self.dropout(self.input_nn(aa_indices, mod_x)) + charges = charges.unsqueeze(1).repeat(1, x.size(1), 2) + x = torch.cat((x, charges), 2) hidden_x = self.hidden_nn(x) if self.output_attentions: self.attentions = hidden_x[1] else: self.attentions = None - x = self.dropout(hidden_x[0]+x*0.2) + x = self.dropout(hidden_x[0] + x * 0.2) return self.output_nn(x).squeeze(1) + class Model_CCS_LSTM(torch.nn.Module): """LSTM model for CCS prediction""" - def __init__(self, - dropout=0.1 - ): + + def __init__(self, dropout=0.1): super().__init__() self.dropout = torch.nn.Dropout(dropout) hidden = 256 - self.ccs_encoder = ( - model_base.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum( - hidden - ) - ) + self.ccs_encoder = model_base.Encoder_26AA_Mod_Charge_CNN_LSTM_AttnSum(hidden) - self.ccs_decoder = model_base.Decoder_Linear( - hidden+1, 1 - ) + self.ccs_decoder = model_base.Decoder_Linear(hidden + 1, 1) - def forward(self, + def forward( + self, aa_indices, mod_x, charges, ): x = self.ccs_encoder(aa_indices, mod_x, charges) x = self.dropout(x) - x = torch.cat((x, charges),1) + x = torch.cat((x, charges), 1) return self.ccs_decoder(x).squeeze(1) -def ccs_to_mobility_pred_df( - precursor_df:pd.DataFrame -)->pd.DataFrame: - """ Add 'mobility_pred' into precursor_df inplace """ - precursor_df[ - 'mobility_pred' - ] = ccs_to_mobility_for_df( - precursor_df, 'ccs_pred' - ) + +def ccs_to_mobility_pred_df(precursor_df: pd.DataFrame) -> pd.DataFrame: + """Add 'mobility_pred' into precursor_df inplace""" + precursor_df["mobility_pred"] = ccs_to_mobility_for_df(precursor_df, "ccs_pred") return precursor_df -def mobility_to_ccs_df_( - precursor_df:pd.DataFrame -)->pd.DataFrame: - """ Add 'ccs' into precursor_df inplace """ - precursor_df[ - 'ccs' - ] = mobility_to_ccs_for_df( - precursor_df, 'mobility' - ) + +def mobility_to_ccs_df_(precursor_df: pd.DataFrame) -> pd.DataFrame: + """Add 'ccs' into precursor_df inplace""" + precursor_df["ccs"] = mobility_to_ccs_for_df(precursor_df, "mobility") return precursor_df + class AlphaCCSModel(model_base.ModelInterface): """ `ModelInterface` for `Model_CCS_LSTM` or `Model_CCS_Bert` """ - def __init__(self, + + def __init__( + self, dropout=0.1, - model_class:torch.nn.Module=Model_CCS_LSTM, - device:str='gpu', + model_class: torch.nn.Module = Model_CCS_LSTM, + device: str = "gpu", **kwargs, ): super().__init__(device=device) - self.model:Model_CCS_LSTM = None - self.build( - model_class, - dropout=dropout, - **kwargs - ) + self.model: Model_CCS_LSTM = None + self.build(model_class, dropout=dropout, **kwargs) self.charge_factor = 0.1 - self.target_column_to_predict = 'ccs_pred' - self.target_column_to_train = 'ccs' + self.target_column_to_predict = "ccs_pred" + self.target_column_to_train = "ccs" - def test(self, + def test( + self, precursor_df: pd.DataFrame, *, - x:typing.Literal["ccs_pred","mobility_pred"]="ccs_pred", - y:typing.Literal["ccs","mobility"]="ccs", - batch_size:int = 1024, + x: typing.Literal["ccs_pred", "mobility_pred"] = "ccs_pred", + y: typing.Literal["ccs", "mobility"] = "ccs", + batch_size: int = 1024, ): return evaluate_linear_regression( - self.predict( - precursor_df, batch_size=batch_size - ), - x=x, y=y + self.predict(precursor_df, batch_size=batch_size), x=x, y=y ) - def _get_features_from_batch_df(self, + def _get_features_from_batch_df( + self, batch_df: pd.DataFrame, ): aa_indices = self._get_26aa_indice_features(batch_df) mod_x = self._get_mod_features(batch_df) - charges = self._as_tensor( - batch_df['charge'].values - ).unsqueeze(1)*self.charge_factor + charges = ( + self._as_tensor(batch_df["charge"].values).unsqueeze(1) * self.charge_factor + ) return aa_indices, mod_x, charges - def ccs_to_mobility_pred(self, - precursor_df:pd.DataFrame - )->pd.DataFrame: + def ccs_to_mobility_pred(self, precursor_df: pd.DataFrame) -> pd.DataFrame: return ccs_to_mobility_pred_df(precursor_df) diff --git a/peptdeep/model/charge.py b/peptdeep/model/charge.py index 183918dc..430f3a98 100644 --- a/peptdeep/model/charge.py +++ b/peptdeep/model/charge.py @@ -8,38 +8,46 @@ Model_for_Generic_ModAASeq_BinaryClassification_Transformer, ) + class _ChargeModelInterface: def __init__(self, *args, **kwargs): raise TypeError("Interface class cannot be instantiated.") - def predict_charges_as_prob(self, - pep_df:pd.DataFrame, - min_precursor_charge:int, - max_precursor_charge:int, + + def predict_charges_as_prob( + self, + pep_df: pd.DataFrame, + min_precursor_charge: int, + max_precursor_charge: int, ): df = self.predict_mp( pep_df.copy(), batch_size=self.predict_batch_size, ) - df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) - df["charge"] = [self.charge_range[ - min_precursor_charge-self.min_predict_charge: - max_precursor_charge-self.min_predict_charge+1 - ]]*len(df) + df.rename(columns={"charge_probs": "charge_prob"}, inplace=True) + df["charge"] = [ + self.charge_range[ + min_precursor_charge - self.min_predict_charge : max_precursor_charge + - self.min_predict_charge + + 1 + ] + ] * len(df) df["charge_prob"] = df.charge_prob.apply( lambda x: x[ - min_precursor_charge-self.min_predict_charge: - max_precursor_charge-self.min_predict_charge+1 + min_precursor_charge - self.min_predict_charge : max_precursor_charge + - self.min_predict_charge + + 1 ] ) - df = df.explode( - ["charge","charge_prob"], ignore_index=True - ).dropna(subset=["charge"]) + df = df.explode(["charge", "charge_prob"], ignore_index=True).dropna( + subset=["charge"] + ) df["charge"] = df.charge.astype(np.int8) df["charge_prob"] = df.charge_prob.astype(np.float32) return df - def predict_prob_for_charge(self, - precursor_df:pd.DataFrame, + def predict_prob_for_charge( + self, + precursor_df: pd.DataFrame, ): if "charge" not in precursor_df.columns: raise KeyError("precursor_df must contain `charge` column") @@ -47,32 +55,31 @@ def predict_prob_for_charge(self, precursor_df, batch_size=self.predict_batch_size, ) - precursor_df["charge_prob"] = precursor_df[["charge_probs","charge"]].apply( - lambda x: x.iloc[0][x.iloc[1]-self.min_predict_charge], axis=1 - ).astype(np.float32) + precursor_df["charge_prob"] = ( + precursor_df[["charge_probs", "charge"]] + .apply(lambda x: x.iloc[0][x.iloc[1] - self.min_predict_charge], axis=1) + .astype(np.float32) + ) precursor_df.drop(columns="charge_probs", inplace=True) return precursor_df - def predict_and_clip_charges(self, - pep_df:pd.DataFrame, - min_precursor_charge:int, - max_precursor_charge:int, - charge_prob_cutoff:float, + def predict_and_clip_charges( + self, + pep_df: pd.DataFrame, + min_precursor_charge: int, + max_precursor_charge: int, + charge_prob_cutoff: float, ): df = self.predict_mp( pep_df.copy(), batch_size=self.predict_batch_size, ) - df.rename(columns={"charge_probs":"charge_prob"}, inplace=True) + df.rename(columns={"charge_probs": "charge_prob"}, inplace=True) df["charge"] = df.charge_prob.apply( - lambda x: self.charge_range[x>charge_prob_cutoff] + lambda x: self.charge_range[x > charge_prob_cutoff] ) - df["charge_prob"] = df.charge_prob.apply( - lambda x: x[x>charge_prob_cutoff] - ) - df = df.explode( - ["charge","charge_prob"] - ).dropna(subset=["charge"]) + df["charge_prob"] = df.charge_prob.apply(lambda x: x[x > charge_prob_cutoff]) + df = df.explode(["charge", "charge_prob"]).dropna(subset=["charge"]) df["charge"] = df.charge.astype(np.int8) df = df.query( f"charge>={min_precursor_charge} and charge<={max_precursor_charge}" @@ -80,9 +87,9 @@ def predict_and_clip_charges(self, df["charge_prob"] = df.charge_prob.astype(np.float32) return df + class ChargeModelForModAASeq( - ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, - _ChargeModelInterface + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification, _ChargeModelInterface ): """ ModelInterface for charge prediction for modified peptides @@ -96,25 +103,27 @@ class ChargeModelForModAASeq( device : str, optional Device to use for training and prediction, by default "gpu" """ - def __init__(self, min_charge:int=1, max_charge:int=6, device:str="gpu"): + + def __init__(self, min_charge: int = 1, max_charge: int = 6, device: str = "gpu"): super().__init__( - num_target_values=max_charge-min_charge+1, + num_target_values=max_charge - min_charge + 1, model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, - nlayers=4, hidden_dim=128, dropout=0.1, device=device + nlayers=4, + hidden_dim=128, + dropout=0.1, + device=device, ) self.target_column_to_predict = "charge_probs" self.target_column_to_train = "charge_indicators" self.min_predict_charge = min_charge self.max_predict_charge = max_charge - self.charge_range = np.arange( - min_charge, max_charge+1, dtype=np.int8 - ) + self.charge_range = np.arange(min_charge, max_charge + 1, dtype=np.int8) self.predict_batch_size = 1024 + class ChargeModelForAASeq( - ModelInterface_for_Generic_AASeq_MultiLabelClassification, - _ChargeModelInterface + ModelInterface_for_Generic_AASeq_MultiLabelClassification, _ChargeModelInterface ): """ ModelInterface for charge prediction for amino acid sequence @@ -128,52 +137,66 @@ class ChargeModelForAASeq( device : str, optional Device to use for training and prediction, by default "gpu" """ - def __init__(self, min_charge:int=1, max_charge:int=6,device:str="gpu"): + + def __init__(self, min_charge: int = 1, max_charge: int = 6, device: str = "gpu"): super().__init__( - num_target_values=max_charge-min_charge+1, + num_target_values=max_charge - min_charge + 1, model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer, - nlayers=4, hidden_dim=128, dropout=0.1, device=device + nlayers=4, + hidden_dim=128, + dropout=0.1, + device=device, ) self.target_column_to_predict = "charge_probs" self.target_column_to_train = "charge_indicators" self.min_predict_charge = min_charge self.max_predict_charge = max_charge - self.charge_range = np.arange( - min_charge, max_charge+1, dtype=np.int8 - ) + self.charge_range = np.arange(min_charge, max_charge + 1, dtype=np.int8) self.predict_batch_size = 1024 + def group_psm_df_by_sequence( psm_df: pd.DataFrame, - min_charge:int, - max_charge:int, + min_charge: int, + max_charge: int, ): - return psm_df.groupby("sequence")["charge"].apply( - lambda x: get_charge_indicators(set(x), - min_charge=min_charge, max_charge=max_charge + return ( + psm_df.groupby("sequence")["charge"] + .apply( + lambda x: get_charge_indicators( + set(x), min_charge=min_charge, max_charge=max_charge + ) ) - ).reset_index(drop=False).rename(columns={"charge":"charge_indicators"}) + .reset_index(drop=False) + .rename(columns={"charge": "charge_indicators"}) + ) def group_psm_df_by_modseq( psm_df: pd.DataFrame, - min_charge:int, - max_charge:int, + min_charge: int, + max_charge: int, ): - return psm_df.groupby(["sequence","mods","mod_sites"])["charge"].apply( - lambda x: get_charge_indicators(set(x), - min_charge=min_charge, max_charge=max_charge + return ( + psm_df.groupby(["sequence", "mods", "mod_sites"])["charge"] + .apply( + lambda x: get_charge_indicators( + set(x), min_charge=min_charge, max_charge=max_charge + ) ) - ).reset_index(drop=False).rename(columns={"charge":"charge_indicators"}) + .reset_index(drop=False) + .rename(columns={"charge": "charge_indicators"}) + ) + def get_charge_indicators( charge_list, - min_charge:int, - max_charge:int, + min_charge: int, + max_charge: int, ): - charge_indicators = np.zeros(max_charge-min_charge+1) + charge_indicators = np.zeros(max_charge - min_charge + 1) for charge in charge_list: if charge <= max_charge and charge >= min_charge: - charge_indicators[charge-min_charge] = 1.0 + charge_indicators[charge - min_charge] = 1.0 return charge_indicators diff --git a/peptdeep/model/featurize.py b/peptdeep/model/featurize.py index a8b7f1ce..9e33945b 100644 --- a/peptdeep/model/featurize.py +++ b/peptdeep/model/featurize.py @@ -3,17 +3,20 @@ from typing import List, Union from peptdeep.settings import ( - model_const, mod_feature_size, MOD_TO_FEATURE, - mod_elements, mod_elem_to_idx, - _parse_mod_formula, update_all_mod_features, + model_const, + mod_feature_size, + MOD_TO_FEATURE, + mod_elements, + mod_elem_to_idx, + _parse_mod_formula, + update_all_mod_features, ) + def parse_mod_feature( - nAA:int, - mod_names:List[str], - mod_sites:List[int] -)->np.ndarray: - ''' + nAA: int, mod_names: List[str], mod_sites: List[int] +) -> np.ndarray: + """ Get modification feature of a given peptide (len=nAA). Note that `site=0` is for peptide N-term modification, `site=1` is for peptide C-term modification, and @@ -36,18 +39,17 @@ def parse_mod_feature( np.ndarray 2-D feature array with shape `(nAA+2,mod_feature_size)` - ''' - mod_x = np.zeros((nAA+2,mod_feature_size)) + """ + mod_x = np.zeros((nAA + 2, mod_feature_size)) if len(mod_names) > 0: for site, mod in zip(mod_sites, mod_names): mod_x[site] += MOD_TO_FEATURE[mod] # mod_x[mod_sites] = [MOD_TO_FEATURE[mod] for mod in mod_names] return mod_x -def get_batch_mod_feature( - batch_df: pd.DataFrame -)->np.ndarray: - ''' + +def get_batch_mod_feature(batch_df: pd.DataFrame) -> np.ndarray: + """ Parameters ---------- batch_df : pd.DataFrame @@ -58,37 +60,28 @@ def get_batch_mod_feature( ------- np.ndarray 3-D tensor with shape (batch_size, nAA+2, mod_feature_size) - ''' + """ - mod_features_list = batch_df.mods.str.split(';').apply( - lambda mod_names: [ - MOD_TO_FEATURE[mod] for mod in mod_names - if len(mod)>0 - ] + mod_features_list = batch_df.mods.str.split(";").apply( + lambda mod_names: [MOD_TO_FEATURE[mod] for mod in mod_names if len(mod) > 0] ) - mod_sites_list = batch_df.mod_sites.str.split(';').apply( - lambda mod_sites:[ - int(site) for site in mod_sites - if len(site)>0 - ] + mod_sites_list = batch_df.mod_sites.str.split(";").apply( + lambda mod_sites: [int(site) for site in mod_sites if len(site) > 0] ) mod_x_batch = np.zeros( - (len(batch_df), batch_df.nAA.values[0]+2, mod_feature_size) + (len(batch_df), batch_df.nAA.values[0] + 2, mod_feature_size) ) - for i, (mod_feats, mod_sites) in enumerate( - zip(mod_features_list, mod_sites_list) - ): - if len(mod_sites)>0: + for i, (mod_feats, mod_sites) in enumerate(zip(mod_features_list, mod_sites_list)): + if len(mod_sites) > 0: for site, feat in zip(mod_sites, mod_feats): # Process multiple mods on one site - mod_x_batch[i,site,:] += feat + mod_x_batch[i, site, :] += feat # mod_x_batch[i,mod_sites,:] = mod_feats return mod_x_batch -def get_batch_aa_indices( - seq_array: Union[List, np.ndarray] -)->np.ndarray: - ''' + +def get_batch_aa_indices(seq_array: Union[List, np.ndarray]) -> np.ndarray: + """ Convert peptide sequences into AA ID array. ID=0 is reserved for masking, so ID of 'A' is 1, ID of 'B' is 2, ..., ID of 'Z' is 26 (maximum). Zeros are padded into the N- and C-term for each sequence. @@ -105,17 +98,14 @@ def get_batch_aa_indices( `(len(seq_array), len(seq_array[0])+2)`. Zeros is padded into the N- and C-term of each sequence, so the 1st-D is `len(seq_array[0])+2`. - ''' - x = np.array(seq_array).view(np.int32).reshape( - len(seq_array), -1 - )-ord('A')+1 + """ + x = np.array(seq_array).view(np.int32).reshape(len(seq_array), -1) - ord("A") + 1 # padding zeros at the N- and C-term - return np.pad(x, [(0,0)]*(len(x.shape)-1)+[(1,1)]) + return np.pad(x, [(0, 0)] * (len(x.shape) - 1) + [(1, 1)]) + -def get_ascii_indices( - seq_array: Union[List, np.ndarray] -)->np.ndarray: - ''' +def get_ascii_indices(seq_array: Union[List, np.ndarray]) -> np.ndarray: + """ Convert peptide sequences into ASCII code array. The values are from 0 to 127. Zeros are padded into the N- and C-term for each sequence. @@ -133,25 +123,25 @@ def get_ascii_indices( For the the sequence whose length is shorter than max seq length, zeros are padded to the missing values. - ''' + """ + + x = np.array(seq_array).view(np.int32).reshape(len(seq_array), -1) + return np.pad(x, [(0, 0)] * (len(x.shape) - 1) + [(1, 1)]) - x = np.array(seq_array).view(np.int32).reshape( - len(seq_array), -1 - ) - return np.pad(x, [(0,0)]*(len(x.shape)-1)+[(1,1)]) instrument_dict = dict( zip( - [inst.upper() for inst in model_const['instruments']], - range(len(model_const['instruments'])) + [inst.upper() for inst in model_const["instruments"]], + range(len(model_const["instruments"])), ) ) -unknown_inst_index = model_const['max_instrument_num']-1 +unknown_inst_index = model_const["max_instrument_num"] - 1 + def parse_instrument_indices(instrument_list): instrument_list = [inst.upper() for inst in instrument_list] instrument_list = [inst for inst in instrument_list] return [ - instrument_dict[inst] if inst in instrument_dict - else unknown_inst_index for inst in instrument_list + instrument_dict[inst] if inst in instrument_dict else unknown_inst_index + for inst in instrument_list ] diff --git a/peptdeep/model/generic_property_prediction.py b/peptdeep/model/generic_property_prediction.py index 7a80f925..859b3a99 100644 --- a/peptdeep/model/generic_property_prediction.py +++ b/peptdeep/model/generic_property_prediction.py @@ -6,11 +6,14 @@ from peptdeep.model.model_interface import ModelInterface from alphabase.peptide.precursor import is_precursor_refined -ASCII_NUM=128 +ASCII_NUM = 128 + class Model_for_Generic_AASeq_Regression_LSTM(torch.nn.Module): """Generic LSTM regression model for AA sequence""" - def __init__(self, + + def __init__( + self, *, hidden_dim=256, output_dim=1, @@ -22,31 +25,32 @@ def __init__(self, self.dropout = torch.nn.Dropout(dropout) self.nn = torch.nn.Sequential( - building_block.ascii_embedding(hidden_dim//4), - building_block.SeqCNN(hidden_dim//4), + building_block.ascii_embedding(hidden_dim // 4), + building_block.SeqCNN(hidden_dim // 4), self.dropout, - building_block.SeqLSTM( - hidden_dim, hidden_dim, - rnn_layer=nlayers - ), + building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=nlayers), building_block.SeqAttentionSum(hidden_dim), self.dropout, - torch.nn.Linear(hidden_dim,64), + torch.nn.Linear(hidden_dim, 64), torch.nn.GELU(), torch.nn.Linear(64, output_dim), ) + def forward(self, aa_x): return self.nn(aa_x).squeeze(-1) + class Model_for_Generic_AASeq_Regression_Transformer(torch.nn.Module): - """Generic transformer regression model for AA sequence """ - def __init__(self, + """Generic transformer regression model for AA sequence""" + + def __init__( + self, *, - hidden_dim = 256, - output_dim = 1, - nlayers = 4, + hidden_dim=256, + output_dim=1, + nlayers=4, output_attentions=False, - dropout = 0.1, + dropout=0.1, **kwargs, ): super().__init__() @@ -58,8 +62,10 @@ def __init__(self, self.output_attentions = output_attentions self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder( - hidden_dim, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions + hidden_dim, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) self.output_nn = torch.nn.Sequential( @@ -70,11 +76,11 @@ def __init__(self, ) @property - def output_attentions(self)->bool: + def output_attentions(self) -> bool: return self._output_attentions @output_attentions.setter - def output_attentions(self, val:bool): + def output_attentions(self, val: bool): self._output_attentions = val def forward(self, aa_x): @@ -94,10 +100,12 @@ class ModelInterface_for_Generic_AASeq_Regression(ModelInterface): """ `ModelInterface` for Generic_AASeq_Regression models """ - def __init__(self, - model_class:torch.nn.Module=Model_for_Generic_AASeq_Regression_LSTM, + + def __init__( + self, + model_class: torch.nn.Module = Model_for_Generic_AASeq_Regression_LSTM, dropout=0.1, - device:str='gpu', + device: str = "gpu", hidden_dim=256, output_dim=1, nlayers=4, @@ -110,16 +118,19 @@ def __init__(self, hidden_dim=hidden_dim, output_dim=output_dim, nlayers=nlayers, - **kwargs + **kwargs, ) - self.loss_func = torch.nn.L1Loss() # for regression + self.loss_func = torch.nn.L1Loss() # for regression + + self.target_column_to_predict = "predicted_property" + self.target_column_to_train = "detected_property" - self.target_column_to_predict = 'predicted_property' - self.target_column_to_train = 'detected_property' class Model_for_Generic_ModAASeq_Regression_LSTM(torch.nn.Module): """Generic LSTM regression model for modified sequence""" - def __init__(self, + + def __init__( + self, *, hidden_dim=256, output_dim=1, @@ -136,23 +147,27 @@ def __init__(self, ) self.output_nn = torch.nn.Sequential( self.dropout, - torch.nn.Linear(hidden_dim,64), + torch.nn.Linear(hidden_dim, 64), torch.nn.GELU(), torch.nn.Linear(64, output_dim), ) + def forward(self, aa_x, mod_x): x = self.encoder_nn(aa_x, mod_x) return self.output_nn(x).squeeze(-1) + class Model_for_Generic_ModAASeq_Regression_Transformer(torch.nn.Module): """Generic transformer regression model for modified sequence""" - def __init__(self, + + def __init__( + self, *, - hidden_dim = 256, - output_dim = 1, - nlayers = 4, + hidden_dim=256, + output_dim=1, + nlayers=4, output_attentions=False, - dropout = 0.1, + dropout=0.1, **kwargs, ): super().__init__() @@ -164,8 +179,10 @@ def __init__(self, self._output_attentions = output_attentions self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder( - hidden_dim, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions + hidden_dim, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) self.output_nn = torch.nn.Sequential( @@ -176,38 +193,40 @@ def __init__(self, ) @property - def output_attentions(self)->bool: + def output_attentions(self) -> bool: return self._output_attentions @output_attentions.setter - def output_attentions(self, val:bool): + def output_attentions(self, val: bool): self._output_attentions = val - def forward(self, + def forward( + self, aa_indices, mod_x, ): - x = self.dropout(self.input_nn( - aa_indices, mod_x - )) + x = self.dropout(self.input_nn(aa_indices, mod_x)) hidden_x = self.hidden_nn(x) if self.output_attentions: self.attentions = hidden_x[1] else: self.attentions = None - x = self.dropout(hidden_x[0]+x*0.2) + x = self.dropout(hidden_x[0] + x * 0.2) return self.output_nn(x).squeeze(1) + class ModelInterface_for_Generic_ModAASeq_Regression(ModelInterface): """ `ModelInterface` for all Generic_ModAASeq_Regression models """ - def __init__(self, - model_class:torch.nn.Module=Model_for_Generic_ModAASeq_Regression_LSTM, + + def __init__( + self, + model_class: torch.nn.Module = Model_for_Generic_ModAASeq_Regression_LSTM, dropout=0.1, - device:str='gpu', + device: str = "gpu", hidden_dim=256, output_dim=1, nlayers=4, @@ -220,24 +239,28 @@ def __init__(self, hidden_dim=hidden_dim, output_dim=output_dim, nlayers=nlayers, - **kwargs + **kwargs, ) - self.loss_func = torch.nn.L1Loss() # for regression + self.loss_func = torch.nn.L1Loss() # for regression - self.target_column_to_predict = 'predicted_property' - self.target_column_to_train = 'detected_property' + self.target_column_to_predict = "predicted_property" + self.target_column_to_train = "detected_property" - def _get_features_from_batch_df(self, + def _get_features_from_batch_df( + self, batch_df: pd.DataFrame, **kwargs, ): return self._get_aa_mod_features(batch_df) + class Model_for_Generic_AASeq_BinaryClassification_LSTM( Model_for_Generic_AASeq_Regression_LSTM ): """Generic LSTM classification model for AA sequence""" - def __init__(self, + + def __init__( + self, *, hidden_dim=256, output_dim=1, @@ -256,17 +279,20 @@ def forward(self, aa_x): x = super().forward(aa_x) return torch.sigmoid(x) + class Model_for_Generic_AASeq_BinaryClassification_Transformer( Model_for_Generic_AASeq_Regression_Transformer ): """Generic transformer classification model for AA sequence""" - def __init__(self, + + def __init__( + self, *, - hidden_dim = 256, - output_dim = 1, - nlayers = 4, + hidden_dim=256, + output_dim=1, + nlayers=4, output_attentions=False, - dropout = 0.1, + dropout=0.1, **kwargs, ): """ @@ -286,14 +312,17 @@ def forward(self, aa_x): x = super().forward(aa_x) return torch.sigmoid(x) + class ModelInterface_for_Generic_AASeq_BinaryClassification(ModelInterface): """ `ModelInterface` for all Generic_AASeq_BinaryClassification models """ - def __init__(self, - model_class:torch.nn.Module=Model_for_Generic_AASeq_BinaryClassification_LSTM, + + def __init__( + self, + model_class: torch.nn.Module = Model_for_Generic_AASeq_BinaryClassification_LSTM, dropout=0.1, - device:str='gpu', + device: str = "gpu", hidden_dim=256, output_dim=1, nlayers=4, @@ -309,17 +338,20 @@ def __init__(self, hidden_dim=hidden_dim, output_dim=output_dim, nlayers=nlayers, - **kwargs + **kwargs, ) - self.loss_func = torch.nn.BCELoss() # for binary classification - self.target_column_to_predict = 'predicted_prob' - self.target_column_to_train = 'detected_prob' + self.loss_func = torch.nn.BCELoss() # for binary classification + self.target_column_to_predict = "predicted_prob" + self.target_column_to_train = "detected_prob" + class Model_for_Generic_ModAASeq_BinaryClassification_LSTM( Model_for_Generic_ModAASeq_Regression_LSTM ): """Generic LSTM classification model for modified sequence""" - def __init__(self, + + def __init__( + self, *, hidden_dim=256, output_dim=1, @@ -344,13 +376,15 @@ class Model_for_Generic_ModAASeq_BinaryClassification_Transformer( Model_for_Generic_ModAASeq_Regression_Transformer ): """Generic transformer classification model for modified sequence""" - def __init__(self, + + def __init__( + self, *, - hidden_dim = 256, - output_dim = 1, - nlayers = 4, + hidden_dim=256, + output_dim=1, + nlayers=4, output_attentions=False, - dropout = 0.1, + dropout=0.1, **kwargs, ): super().__init__( @@ -359,18 +393,19 @@ def __init__(self, output_dim=output_dim, output_attentions=output_attentions, dropout=dropout, - **kwargs + **kwargs, ) @property - def output_attentions(self)->bool: + def output_attentions(self) -> bool: return self._output_attentions @output_attentions.setter - def output_attentions(self, val:bool): + def output_attentions(self, val: bool): self._output_attentions = val - def forward(self, + def forward( + self, aa_indices, mod_x, ): @@ -382,10 +417,12 @@ class ModelInterface_for_Generic_ModAASeq_BinaryClassification(ModelInterface): """ `ModelInterface` for Generic_ModAASeq_BinaryClassification """ - def __init__(self, - model_class:torch.nn.Module=Model_for_Generic_ModAASeq_BinaryClassification_LSTM, + + def __init__( + self, + model_class: torch.nn.Module = Model_for_Generic_ModAASeq_BinaryClassification_LSTM, dropout=0.1, - device:str='gpu', + device: str = "gpu", hidden_dim=256, output_dim=1, nlayers=4, @@ -398,14 +435,15 @@ def __init__(self, output_dim=output_dim, nlayers=nlayers, dropout=dropout, - **kwargs + **kwargs, ) - self.loss_func = torch.nn.BCELoss() # for classification + self.loss_func = torch.nn.BCELoss() # for classification - self.target_column_to_predict = 'predicted_prob' - self.target_column_to_train = 'detected_prob' + self.target_column_to_predict = "predicted_prob" + self.target_column_to_train = "detected_prob" - def _get_features_from_batch_df(self, + def _get_features_from_batch_df( + self, batch_df: pd.DataFrame, ): return self._get_aa_mod_features(batch_df) @@ -414,30 +452,35 @@ def _get_features_from_batch_df(self, class ModelInterface_for_Generic_AASeq_MultiLabelClassification( ModelInterface_for_Generic_AASeq_BinaryClassification ): - def __init__(self, - num_target_values:int=6, - model_class:torch.nn.Module=Model_for_Generic_AASeq_BinaryClassification_Transformer, - nlayers=4, hidden_dim=256, device='gpu', - dropout=0.1, **kwargs, + def __init__( + self, + num_target_values: int = 6, + model_class: torch.nn.Module = Model_for_Generic_AASeq_BinaryClassification_Transformer, + nlayers=4, + hidden_dim=256, + device="gpu", + dropout=0.1, + **kwargs, ): self.num_target_values = num_target_values super().__init__( model_class=model_class, output_dim=self.num_target_values, - nlayers=nlayers, hidden_dim=hidden_dim, - device=device, dropout=dropout, - **kwargs + nlayers=nlayers, + hidden_dim=hidden_dim, + device=device, + dropout=dropout, + **kwargs, ) - self.target_column_to_train = 'target_probs' - self.target_column_to_predict = 'target_probs_pred' + self.target_column_to_train = "target_probs" + self.target_column_to_predict = "target_probs_pred" def _get_targets_from_batch_df(self, batch_df, **kwargs): return self._as_tensor( - np.stack(batch_df[self.target_column_to_train].values), - dtype=torch.float32 + np.stack(batch_df[self.target_column_to_train].values), dtype=torch.float32 ) - def _check_predict_in_order(self, precursor_df:pd.DataFrame): + def _check_predict_in_order(self, precursor_df: pd.DataFrame): if not is_precursor_refined(precursor_df): # multilabel prediction can only predict in order precursor_df.sort_values("nAA", inplace=True) @@ -445,42 +488,48 @@ def _check_predict_in_order(self, precursor_df:pd.DataFrame): def _prepare_predict_data_df(self, precursor_df, **kwargs): precursor_df[self.target_column_to_predict] = [ - [0]*self.num_target_values - ]*len(precursor_df) + [0] * self.num_target_values + ] * len(precursor_df) self.predict_df = precursor_df def _set_batch_predict_data(self, batch_df, predict_values, **kwargs): - self.predict_df.loc[:,self.target_column_to_predict].values[ - batch_df.index.values[0]:batch_df.index.values[-1]+1 + self.predict_df.loc[:, self.target_column_to_predict].values[ + batch_df.index.values[0] : batch_df.index.values[-1] + 1 ] = list(predict_values) + class ModelInterface_for_Generic_ModAASeq_MultiLabelClassification( ModelInterface_for_Generic_ModAASeq_BinaryClassification ): - def __init__(self, - num_target_values:int=6, - model_class:torch.nn.Module=Model_for_Generic_ModAASeq_BinaryClassification_Transformer, - nlayers=4, hidden_dim=256, device='gpu', - dropout=0.1, **kwargs, + def __init__( + self, + num_target_values: int = 6, + model_class: torch.nn.Module = Model_for_Generic_ModAASeq_BinaryClassification_Transformer, + nlayers=4, + hidden_dim=256, + device="gpu", + dropout=0.1, + **kwargs, ): self.num_target_values = num_target_values super().__init__( model_class=model_class, output_dim=self.num_target_values, - nlayers=nlayers, hidden_dim=hidden_dim, - device=device, dropout=dropout, - **kwargs + nlayers=nlayers, + hidden_dim=hidden_dim, + device=device, + dropout=dropout, + **kwargs, ) - self.target_column_to_train = 'target_probs' - self.target_column_to_predict = 'target_probs_pred' + self.target_column_to_train = "target_probs" + self.target_column_to_predict = "target_probs_pred" def _get_targets_from_batch_df(self, batch_df, **kwargs): return self._as_tensor( - np.stack(batch_df[self.target_column_to_train].values), - dtype=torch.float32 + np.stack(batch_df[self.target_column_to_train].values), dtype=torch.float32 ) - def _check_predict_in_order(self, precursor_df:pd.DataFrame): + def _check_predict_in_order(self, precursor_df: pd.DataFrame): if not is_precursor_refined(precursor_df): # multilabel prediction can only predict in order precursor_df.sort_values("nAA", inplace=True) @@ -488,15 +537,20 @@ def _check_predict_in_order(self, precursor_df:pd.DataFrame): def _prepare_predict_data_df(self, precursor_df, **kwargs): precursor_df[self.target_column_to_predict] = [ - [0]*self.num_target_values - ]*len(precursor_df) + [0] * self.num_target_values + ] * len(precursor_df) self.predict_df = precursor_df def _set_batch_predict_data(self, batch_df, predict_values, **kwargs): - self.predict_df.loc[:,self.target_column_to_predict].values[ - batch_df.index.values[0]:batch_df.index.values[-1]+1 + self.predict_df.loc[:, self.target_column_to_predict].values[ + batch_df.index.values[0] : batch_df.index.values[-1] + 1 ] = list(predict_values) + # alias -ModelInterface_for_Generic_AASeq_MultiTargetClassification = ModelInterface_for_Generic_AASeq_MultiLabelClassification -ModelInterface_for_Generic_ModAASeq_MultiTargetClassification = ModelInterface_for_Generic_ModAASeq_MultiLabelClassification +ModelInterface_for_Generic_AASeq_MultiTargetClassification = ( + ModelInterface_for_Generic_AASeq_MultiLabelClassification +) +ModelInterface_for_Generic_ModAASeq_MultiTargetClassification = ( + ModelInterface_for_Generic_ModAASeq_MultiLabelClassification +) diff --git a/peptdeep/model/model_interface.py b/peptdeep/model/model_interface.py index 4ff167f2..993373ec 100644 --- a/peptdeep/model/model_interface.py +++ b/peptdeep/model/model_interface.py @@ -21,22 +21,21 @@ from alphabase.peptide.precursor import is_precursor_refined from peptdeep.settings import model_const -from peptdeep.utils import ( - logging, process_bar, get_device, - get_available_device -) +from peptdeep.utils import logging, process_bar, get_device, get_available_device from peptdeep.settings import global_settings from peptdeep.model.featurize import ( - get_ascii_indices, get_batch_aa_indices, - get_batch_mod_feature + get_ascii_indices, + get_batch_aa_indices, + get_batch_mod_feature, ) + class LR_SchedulerInterface(object): - def __init__(self, optimizer:torch.optim.Optimizer, **kwargs): + def __init__(self, optimizer: torch.optim.Optimizer, **kwargs): raise NotImplementedError - def step(self, epoch:int, loss:float): + def step(self, epoch: int, loss: float): """ This method must be implemented in the sub-class. It will be called to get the learning rate for the next epoch. While the one we are using here does not need the loss value, this is left in case of using something like the ReduceLROnPlateau scheduler. @@ -50,7 +49,7 @@ def step(self, epoch:int, loss:float): """ raise NotImplementedError - def get_last_lr(self)->float: + def get_last_lr(self) -> float: """ Get the last learning rate. @@ -61,24 +60,26 @@ def get_last_lr(self)->float: """ raise NotImplementedError + class WarmupLR_Scheduler(LR_SchedulerInterface): """ A learning rate scheduler that includes a warmup phase and then a cosine annealing phase. """ - def __init__(self, - optimizer:torch.optim.Optimizer, - num_warmup_steps:int, - num_training_steps:int, - num_cycles:float=0.5, - last_epoch:int=-1 + def __init__( + self, + optimizer: torch.optim.Optimizer, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, ): self.optimizer = optimizer self.lambda_lr = self.get_cosine_schedule_with_warmup( optimizer, num_warmup_steps, num_training_steps, num_cycles, last_epoch ) - def step(self, epoch:int, loss:float): + def step(self, epoch: int, loss: float): """ Get the learning rate for the next epoch. @@ -92,7 +93,7 @@ def step(self, epoch:int, loss:float): """ return self.lambda_lr.step(epoch) - def get_last_lr(self)->float: + def get_last_lr(self) -> float: """ Get the last learning rate. @@ -103,23 +104,34 @@ def get_last_lr(self)->float: """ return self.lambda_lr.get_last_lr() - # `transformers.optimization.get_cosine_schedule_with_warmup` will import tensorflow, # resulting in some package version issues. # Here we copy the code from transformers.optimization - def _get_cosine_schedule_with_warmup_lr_lambda(self, - current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float + def _get_cosine_schedule_with_warmup_lr_lambda( + self, + current_step: int, + *, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float, ): if current_step < num_warmup_steps: - return float(current_step+1) / float(max(1, num_warmup_steps)) - - progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) - return max(1e-10, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) - + return float(current_step + 1) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float( + max(1, num_training_steps - num_warmup_steps) + ) + return max( + 1e-10, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)) + ) - def get_cosine_schedule_with_warmup( self, - optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1 + def get_cosine_schedule_with_warmup( + self, + optimizer, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, ): """ Create a schedule with a learning rate that decreases following the values of the cosine function between the @@ -151,13 +163,15 @@ def get_cosine_schedule_with_warmup( self, ) return LambdaLR(optimizer, lr_lambda, last_epoch) + class CallbackHandler: """ A CallbackHandler class that can be used to add callbacks to the training process for both epoch-level and batch-level events. To have more control over the training process, you can create a subclass of this class and override the methods you need. """ - def epoch_callback(self, epoch:int, epoch_loss:float) -> bool: + + def epoch_callback(self, epoch: int, epoch_loss: float) -> bool: """ This method will be called at the end of each epoch. The callback can also be used to stop the training by returning False. If the return value is None, or True, the training @@ -177,7 +191,8 @@ def epoch_callback(self, epoch:int, epoch_loss:float) -> bool: """ continue_training = True return continue_training - def batch_callback(self, batch:int, batch_loss:float): + + def batch_callback(self, batch: int, batch_loss: float): """ This method will be called at the end of each batch. @@ -191,16 +206,18 @@ def batch_callback(self, batch:int, batch_loss:float): """ pass + def append_nAA_column_if_missing(precursor_df): """ Append a column containing the number of Amino Acids """ - if 'nAA' not in precursor_df.columns: - precursor_df['nAA'] = precursor_df.sequence.str.len() - precursor_df.sort_values('nAA', inplace=True) - precursor_df.reset_index(drop=True,inplace=True) + if "nAA" not in precursor_df.columns: + precursor_df["nAA"] = precursor_df.sequence.str.len() + precursor_df.sort_values("nAA", inplace=True) + precursor_df.reset_index(drop=True, inplace=True) return precursor_df + class ModelInterface(object): """ Provides standardized methods to interact @@ -208,11 +225,12 @@ class ModelInterface(object): the abstract (i.e. not implemented) methods. """ - def __init__(self, - device:str='gpu', - fixed_sequence_len:int = 0, - min_pred_value:float = 0.0, - **kwargs + def __init__( + self, + device: str = "gpu", + fixed_sequence_len: int = 0, + min_pred_value: float = 0.0, + **kwargs, ): """ Parameters @@ -227,9 +245,9 @@ def __init__(self, min_pred_value : float, optional See :attr:`min_pred_value`, defaults to 0.0. """ - self.model:torch.nn.Module = None + self.model: torch.nn.Module = None self.optimizer = None - self.model_params:dict = {} + self.model_params: dict = {} self.set_device(device) self.fixed_sequence_len = fixed_sequence_len self.min_pred_value = min_pred_value @@ -237,7 +255,7 @@ def __init__(self, self.callback_handler = CallbackHandler() @property - def fixed_sequence_len(self)->int: + def fixed_sequence_len(self) -> int: """ This attribute controls how to train and infer for variable-length sequences: @@ -248,54 +266,54 @@ def fixed_sequence_len(self)->int: return self._fixed_sequence_len @fixed_sequence_len.setter - def fixed_sequence_len(self, seq_len:int): + def fixed_sequence_len(self, seq_len: int): self._fixed_sequence_len = seq_len - self.model_params['fixed_sequence_len'] = seq_len + self.model_params["fixed_sequence_len"] = seq_len @property - def min_pred_value(self)->float: + def min_pred_value(self) -> float: """ The predicted values cannot be smaller than this value. """ return self._min_pred_value @min_pred_value.setter - def min_pred_value(self, val:float): + def min_pred_value(self, val: float): self._min_pred_value = val - self.model_params['min_pred_value'] = val + self.model_params["min_pred_value"] = val @property - def device_type(self)->str: + def device_type(self) -> str: """Read-only""" return self._device_type @property - def device(self)->torch.device: + def device(self) -> torch.device: """Read-only""" return self._device @property - def device_ids(self)->list: + def device_ids(self) -> list: """Read-only""" return self._device_ids @property - def target_column_to_predict(self)->str: + def target_column_to_predict(self) -> str: return self._target_column_to_predict @target_column_to_predict.setter - def target_column_to_predict(self, column:str): + def target_column_to_predict(self, column: str): self._target_column_to_predict = column @property - def target_column_to_train(self)->str: + def target_column_to_train(self) -> str: return self._target_column_to_train @target_column_to_train.setter - def target_column_to_train(self, column:str): + def target_column_to_train(self, column: str): self._target_column_to_train = column - def set_lr_scheduler_class(self, lr_scheduler_class:LR_SchedulerInterface) -> None: + def set_lr_scheduler_class(self, lr_scheduler_class: LR_SchedulerInterface) -> None: """ Set the learning rate scheduler class. We require the user pass a class that is a subclass of LR_SchedulerInterface because the current implementation will create an instance of it within this class. @@ -313,7 +331,8 @@ def set_lr_scheduler_class(self, lr_scheduler_class:LR_SchedulerInterface) -> No ) else: self.lr_scheduler_class = lr_scheduler_class - def set_callback_handler(self, callback_handler:CallbackHandler) -> None: + + def set_callback_handler(self, callback_handler: CallbackHandler) -> None: """ Set the callback handler. It has to be a subclass of CallbackHandler. """ @@ -323,10 +342,8 @@ def set_callback_handler(self, callback_handler:CallbackHandler) -> None: raise ValueError( "The callback handler passed must be a subclass of model_interface.CallbackHandler" ) - def set_device(self, - device_type:str = 'gpu', - device_ids:list = [] - ): + + def set_device(self, device_type: str = "gpu", device_ids: list = []): """ Set the device (e.g. gpu (cuda), mps, cpu, ...) to be used for the model. @@ -345,12 +362,10 @@ def set_device(self, """ self._device_ids = device_ids - if device_type == 'get_available': + if device_type == "get_available": self._device, self._device_type = get_available_device() else: - self._device, self._device_type = get_device( - device_type, device_ids - ) + self._device, self._device_type = get_device(device_type, device_ids) self._model_to_device() @@ -361,24 +376,18 @@ def _model_to_device(self): TODO It is better to use torch.nn.parallel.DistributedDataParallel, but this may need more setups for models and optimizers. """ - if self.model is None: return - if self.device_type != 'cuda': + if self.model is None: + return + if self.device_type != "cuda": self.model.to(self.device) else: - if ( - self.device_ids and len(self.device_ids) > 1 - ): + if self.device_ids and len(self.device_ids) > 1: self.model = torch.nn.DataParallel(self.model, self.device_ids) - elif ( - not self.device_ids and torch.cuda.device_count()>1 - ): + elif not self.device_ids and torch.cuda.device_count() > 1: self.model = torch.nn.DataParallel(self.model) self.model.to(self.device) - def build(self, - model_class: torch.nn.Module, - **kwargs - ): + def build(self, model_class: torch.nn.Module, **kwargs): """ Builds the model by specifying the PyTorch module, the parameters, the device, the loss function ... @@ -388,27 +397,29 @@ def build(self, self._model_to_device() self._init_for_training() - def set_bert_trainable(self, + def set_bert_trainable( + self, bert_layer_name="hidden_nn", - bert_layer_idxes=[1,2], # [0,1,2,3] in ms2 model + bert_layer_idxes=[1, 2], # [0,1,2,3] in ms2 model trainable=True, ): self.set_layer_trainable( layer_names=[ - f"{bert_layer_name}.bert.layer.{layer}" - for layer in bert_layer_idxes + f"{bert_layer_name}.bert.layer.{layer}" for layer in bert_layer_idxes ], trainable=trainable, ) - def set_layer_trainable(self, + def set_layer_trainable( + self, layer_names=[], trainable=True, ): for layer in layer_names: self.model.get_submodule(layer).requires_grad_(trainable) - def train_with_warmup(self, + def train_with_warmup( + self, precursor_df: pd.DataFrame, *, batch_size=1024, @@ -417,7 +428,7 @@ def train_with_warmup(self, lr=1e-4, verbose=False, verbose_each_epoch=False, - **kwargs + **kwargs, ): """ Train the model according to specifications. Includes a warumup @@ -425,28 +436,23 @@ def train_with_warmup(self, """ self._prepare_training(precursor_df, lr, **kwargs) - lr_scheduler = self._get_lr_schedule_with_warmup( - warmup_epoch, epoch - ) + lr_scheduler = self._get_lr_schedule_with_warmup(warmup_epoch, epoch) for epoch in range(epoch): if self.fixed_sequence_len == 0: batch_cost = self._train_one_epoch( - precursor_df, epoch, - batch_size, verbose_each_epoch, - **kwargs + precursor_df, epoch, batch_size, verbose_each_epoch, **kwargs ) else: batch_cost = self._train_one_epoch_by_padding_zeros( - precursor_df, epoch, - batch_size, verbose_each_epoch, - **kwargs + precursor_df, epoch, batch_size, verbose_each_epoch, **kwargs ) lr_scheduler.step(epoch=epoch, loss=np.mean(batch_cost)) - if verbose: print( - f'[Training] Epoch={epoch+1}, lr={lr_scheduler.get_last_lr()[0]}, loss={np.mean(batch_cost)}' - ) + if verbose: + print( + f"[Training] Epoch={epoch+1}, lr={lr_scheduler.get_last_lr()[0]}, loss={np.mean(batch_cost)}" + ) continue_training = self.callback_handler.epoch_callback( epoch=epoch, epoch_loss=np.mean(batch_cost) ) @@ -455,24 +461,26 @@ def train_with_warmup(self, break torch.cuda.empty_cache() - def train(self, + def train( + self, precursor_df: pd.DataFrame, *, batch_size=1024, epoch=10, - warmup_epoch:int=0, + warmup_epoch: int = 0, lr=1e-4, verbose=False, verbose_each_epoch=False, - **kwargs + **kwargs, ): """ Train the model according to specifications. """ - if verbose: logging.info( - f"Training with fixed sequence length: {self.fixed_sequence_len}" - ) + if verbose: + logging.info( + f"Training with fixed sequence length: {self.fixed_sequence_len}" + ) if warmup_epoch > 0: self.train_with_warmup( @@ -483,7 +491,7 @@ def train(self, lr=lr, verbose=verbose, verbose_each_epoch=verbose_each_epoch, - **kwargs + **kwargs, ) else: self._prepare_training(precursor_df, lr, **kwargs) @@ -491,33 +499,33 @@ def train(self, for epoch in range(epoch): if self.fixed_sequence_len == 0: batch_cost = self._train_one_epoch( - precursor_df, epoch, - batch_size, verbose_each_epoch, - **kwargs + precursor_df, epoch, batch_size, verbose_each_epoch, **kwargs ) else: batch_cost = self._train_one_epoch_by_padding_zeros( - precursor_df, epoch, - batch_size, verbose_each_epoch, - **kwargs + precursor_df, epoch, batch_size, verbose_each_epoch, **kwargs + ) + if verbose: + print( + f"[Training] Epoch={epoch+1}, Mean Loss={np.mean(batch_cost)}" ) - if verbose: print(f'[Training] Epoch={epoch+1}, Mean Loss={np.mean(batch_cost)}') continue_training = self.callback_handler.epoch_callback( epoch=epoch, epoch_loss=np.mean(batch_cost) - ) + ) if not continue_training: print(f"Training stopped at epoch {epoch}") break torch.cuda.empty_cache() - def predict(self, - precursor_df:pd.DataFrame, + def predict( + self, + precursor_df: pd.DataFrame, *, - batch_size:int=1024, - verbose:bool=False, - **kwargs - )->pd.DataFrame: + batch_size: int = 1024, + verbose: bool = False, + **kwargs, + ) -> pd.DataFrame: """ The model predicts the properties based on the inputs it has been trained for. Returns the ouput as a pandas dataframe. @@ -525,10 +533,10 @@ def predict(self, precursor_df = append_nAA_column_if_missing(precursor_df) self._pad_zeros_if_fixed_len(precursor_df) self._check_predict_in_order(precursor_df) - self._prepare_predict_data_df(precursor_df,**kwargs) + self._prepare_predict_data_df(precursor_df, **kwargs) self.model.eval() - _grouped = precursor_df.groupby('nAA') + _grouped = precursor_df.groupby("nAA") if verbose: batch_tqdm = tqdm(_grouped) else: @@ -536,35 +544,31 @@ def predict(self, with _inference_mode(): for nAA, df_group in batch_tqdm: for i in range(0, len(df_group), batch_size): - batch_end = i+batch_size + batch_end = i + batch_size - batch_df = df_group.iloc[i:batch_end,:] + batch_df = df_group.iloc[i:batch_end, :] - features = self._get_features_from_batch_df( - batch_df, **kwargs - ) + features = self._get_features_from_batch_df(batch_df, **kwargs) if isinstance(features, tuple): predicts = self._predict_one_batch(*features) else: predicts = self._predict_one_batch(features) - self._set_batch_predict_data( - batch_df, predicts, - **kwargs - ) + self._set_batch_predict_data(batch_df, predicts, **kwargs) torch.cuda.empty_cache() return self.predict_df - def predict_mp(self, - precursor_df:pd.DataFrame, + def predict_mp( + self, + precursor_df: pd.DataFrame, *, - batch_size:int=1024, - mp_batch_size:int=100000, - process_num:int=global_settings['thread_num'], - **kwargs - )->pd.DataFrame: + batch_size: int = 1024, + mp_batch_size: int = 100000, + process_num: int = global_settings["thread_num"], + **kwargs, + ) -> pd.DataFrame: """ Predicting with multiprocessing is no GPUs are availible. Note this multiprocessing method only works for models those predict @@ -572,33 +576,32 @@ def predict_mp(self, """ precursor_df = append_nAA_column_if_missing(precursor_df) - if self.device_type != 'cpu': + if self.device_type != "cpu": return self.predict( - precursor_df, - batch_size=batch_size, - verbose=True, - **kwargs + precursor_df, batch_size=batch_size, verbose=True, **kwargs ) - _predict_func = functools.partial(self.predict, - batch_size=batch_size, verbose=False, **kwargs + _predict_func = functools.partial( + self.predict, batch_size=batch_size, verbose=False, **kwargs ) def batch_df_gen(precursor_df, mp_batch_size): for i in range(0, len(precursor_df), mp_batch_size): - yield precursor_df.iloc[i:i+mp_batch_size] + yield precursor_df.iloc[i : i + mp_batch_size] self._check_predict_in_order(precursor_df) - self._prepare_predict_data_df(precursor_df,**kwargs) + self._prepare_predict_data_df(precursor_df, **kwargs) print("Predicting with multiprocessing ...") self.model.share_memory() df_list = [] - with mp.get_context('spawn').Pool(process_num) as p: - for ret_df in process_bar(p.imap( + with mp.get_context("spawn").Pool(process_num) as p: + for ret_df in process_bar( + p.imap( _predict_func, batch_df_gen(precursor_df, mp_batch_size), - ), len(precursor_df)//mp_batch_size+1 + ), + len(precursor_df) // mp_batch_size + 1, ): df_list.append(ret_df) @@ -607,33 +610,31 @@ def batch_df_gen(precursor_df, mp_batch_size): return self.predict_df - def save(self, filename:str): + def save(self, filename: str): """ Save the model state, the constants used, the code defining the model and the model parameters. """ # TODO save tf.keras.Model dir = os.path.dirname(filename) - if not dir: dir = './' - if not os.path.exists(dir): os.makedirs(dir) + if not dir: + dir = "./" + if not os.path.exists(dir): + os.makedirs(dir) torch.save(self.model.state_dict(), filename) - with open(filename+'.txt','w') as f: f.write(str(self.model)) - save_yaml(filename+'.model_const.yaml', model_const) - self._save_codes(filename+'.model.py') - save_yaml(filename+'.param.yaml', self.model_params) + with open(filename + ".txt", "w") as f: + f.write(str(self.model)) + save_yaml(filename + ".model_const.yaml", model_const) + self._save_codes(filename + ".model.py") + save_yaml(filename + ".param.yaml", self.model_params) - def load( - self, - model_file: Tuple[str, IO], - model_path_in_zip: str = None, - **kwargs - ): + def load(self, model_file: Tuple[str, IO], model_path_in_zip: str = None, **kwargs): """ Load a model specified in a zip file, a text file or a file stream. """ # TODO load tf.keras.Model if isinstance(model_file, str): # We may release all models (msms, rt, ccs, ...) in a single zip file - if model_file.lower().endswith('.zip'): + if model_file.lower().endswith(".zip"): self._load_model_from_zipfile(model_file, model_path_in_zip) else: self._load_model_from_pytorchfile(model_file) @@ -646,41 +647,37 @@ def get_parameter_num(self): """ return np.sum([p.numel() for p in self.model.parameters()]) - def build_from_py_codes(self, - model_code_file_or_zip:str, - code_file_in_zip:str=None, - include_model_params_yaml:bool=True, - **kwargs + def build_from_py_codes( + self, + model_code_file_or_zip: str, + code_file_in_zip: str = None, + include_model_params_yaml: bool = True, + **kwargs, ): """ Build the model based on a python file. Must contain a PyTorch model implemented as 'class Model(...' """ - if model_code_file_or_zip.lower().endswith('.zip'): - with ZipFile(model_code_file_or_zip, 'r') as model_zip: - with model_zip.open(code_file_in_zip,'r') as f: + if model_code_file_or_zip.lower().endswith(".zip"): + with ZipFile(model_code_file_or_zip, "r") as model_zip: + with model_zip.open(code_file_in_zip, "r") as f: codes = f.read() if include_model_params_yaml: with model_zip.open( - code_file_in_zip[:-len('model.py')]+'param.yaml', - 'r' + code_file_in_zip[: -len("model.py")] + "param.yaml", "r" ) as f: params = yaml.load(f, yaml.FullLoader) else: - with open(model_code_file_or_zip, 'r') as f: + with open(model_code_file_or_zip, "r") as f: codes = f.read() if include_model_params_yaml: params = load_yaml( - model_code_file_or_zip[:-len('model.py')]+'param.yaml' + model_code_file_or_zip[: -len("model.py")] + "param.yaml" ) - compiled_codes = compile( - codes, - filename='model_file_py', - mode='exec' - ) - _module = ModuleType('_apd_nn_codes') - #codes must contains torch model codes 'class Model(...' + compiled_codes = compile(codes, filename="model_file_py", mode="exec") + _module = ModuleType("_apd_nn_codes") + # codes must contains torch model codes 'class Model(...' exec(compiled_codes, _module.__dict__) if include_model_params_yaml: @@ -700,10 +697,9 @@ def _init_for_training(self): """ self.loss_func = torch.nn.L1Loss() - def _as_tensor(self, - data:np.ndarray, - dtype:torch.dtype=torch.float32 - )->torch.Tensor: + def _as_tensor( + self, data: np.ndarray, dtype: torch.dtype = torch.float32 + ) -> torch.Tensor: """Convert numerical np.array to pytorch tensor. The tensor will be stored in self.device @@ -725,40 +721,40 @@ def _as_tensor(self, def _load_model_from_zipfile(self, model_file, model_path_in_zip): with ZipFile(model_file) as model_zip: - with model_zip.open(model_path_in_zip,'r') as pt_file: + with model_zip.open(model_path_in_zip, "r") as pt_file: self._load_model_from_stream(pt_file) def _load_model_from_pytorchfile(self, model_file): - with open(model_file,'rb') as pt_file: + with open(model_file, "rb") as pt_file: self._load_model_from_stream(pt_file) def _load_model_from_stream(self, stream): - ( - missing_keys, unexpect_keys - ) = self.model.load_state_dict(torch.load( - stream, map_location=self.device), - strict=False + (missing_keys, unexpect_keys) = self.model.load_state_dict( + torch.load(stream, map_location=self.device), strict=False ) if len(missing_keys) > 0: - logging.warn(f"nn parameters {missing_keys} are MISSING while loading models in {self.__class__}") + logging.warn( + f"nn parameters {missing_keys} are MISSING while loading models in {self.__class__}" + ) if len(unexpect_keys) > 0: - logging.warn(f"nn parameters {unexpect_keys} are UNEXPECTED while loading models in {self.__class__}") + logging.warn( + f"nn parameters {unexpect_keys} are UNEXPECTED while loading models in {self.__class__}" + ) def _save_codes(self, save_as): try: - code = '''import torch\n''' - code += '''import peptdeep.model.building_block as building_block\n''' - code += '''from peptdeep.model.model_shop import *\n''' + code = """import torch\n""" + code += """import peptdeep.model.building_block as building_block\n""" + code += """from peptdeep.model.model_shop import *\n""" class_code = inspect.getsource(self.model.__class__) - code += 'class Model' + class_code[class_code.find('('):] - with open(save_as, 'w') as f: + code += "class Model" + class_code[class_code.find("(") :] + with open(save_as, "w") as f: f.write(code) except (TypeError, ValueError, KeyError) as e: - logging.info(f'Cannot save model source codes: {str(e)}') + logging.info(f"Cannot save model source codes: {str(e)}") - def _train_one_epoch_by_padding_zeros(self, - precursor_df, epoch, batch_size, verbose_each_epoch, - **kwargs + def _train_one_epoch_by_padding_zeros( + self, precursor_df, epoch, batch_size, verbose_each_epoch, **kwargs ): """Training for an epoch by padding zeros""" batch_cost = [] @@ -768,37 +764,28 @@ def _train_one_epoch_by_padding_zeros(self, else: batch_tqdm = range(0, len(rnd_df), batch_size) for i in batch_tqdm: - batch_end = i+batch_size + batch_end = i + batch_size - batch_df = rnd_df.iloc[i:batch_end,:] - targets = self._get_targets_from_batch_df( - batch_df, **kwargs - ) - features = self._get_features_from_batch_df( - batch_df, **kwargs - ) + batch_df = rnd_df.iloc[i:batch_end, :] + targets = self._get_targets_from_batch_df(batch_df, **kwargs) + features = self._get_features_from_batch_df(batch_df, **kwargs) if isinstance(features, tuple): - batch_cost.append( - self._train_one_batch(targets, *features) - ) + batch_cost.append(self._train_one_batch(targets, *features)) else: - batch_cost.append( - self._train_one_batch(targets, features) - ) - self.callback_handler.batch_callback(i//batch_size, batch_cost[-1]) + batch_cost.append(self._train_one_batch(targets, features)) + self.callback_handler.batch_callback(i // batch_size, batch_cost[-1]) if verbose_each_epoch: batch_tqdm.set_description( - f'Epoch={epoch+1}, batch={len(batch_cost)}, loss={batch_cost[-1]:.4f}' + f"Epoch={epoch+1}, batch={len(batch_cost)}, loss={batch_cost[-1]:.4f}" ) return batch_cost - def _train_one_epoch(self, - precursor_df, epoch, batch_size, verbose_each_epoch, - **kwargs + def _train_one_epoch( + self, precursor_df, epoch, batch_size, verbose_each_epoch, **kwargs ): """Training for an epoch""" batch_cost = [] - _grouped = list(precursor_df.sample(frac=1).groupby('nAA')) + _grouped = list(precursor_df.sample(frac=1).groupby("nAA")) rnd_nAA = np.random.permutation(len(_grouped)) if verbose_each_epoch: batch_tqdm = tqdm(rnd_nAA) @@ -808,33 +795,25 @@ def _train_one_epoch(self, nAA, df_group = _grouped[i_group] # df_group = df_group.reset_index(drop=True) for i in range(0, len(df_group), batch_size): - batch_end = i+batch_size + batch_end = i + batch_size - batch_df = df_group.iloc[i:batch_end,:] - targets = self._get_targets_from_batch_df( - batch_df, **kwargs - ) - features = self._get_features_from_batch_df( - batch_df, **kwargs - ) + batch_df = df_group.iloc[i:batch_end, :] + targets = self._get_targets_from_batch_df(batch_df, **kwargs) + features = self._get_features_from_batch_df(batch_df, **kwargs) if isinstance(features, tuple): - batch_cost.append( - self._train_one_batch(targets, *features) - ) + batch_cost.append(self._train_one_batch(targets, *features)) else: - batch_cost.append( - self._train_one_batch(targets, features) - ) - self.callback_handler.batch_callback(i//batch_size, batch_cost[-1]) + batch_cost.append(self._train_one_batch(targets, features)) + self.callback_handler.batch_callback(i // batch_size, batch_cost[-1]) if verbose_each_epoch: batch_tqdm.set_description( - f'Epoch={epoch+1}, nAA={nAA}, batch={len(batch_cost)}, loss={batch_cost[-1]:.4f}' + f"Epoch={epoch+1}, nAA={nAA}, batch={len(batch_cost)}, loss={batch_cost[-1]:.4f}" ) return batch_cost def _train_one_batch( self, - targets:torch.Tensor, + targets: torch.Tensor, *features, ): """Training for a mini batch""" @@ -846,17 +825,15 @@ def _train_one_batch( self.optimizer.step() return cost.item() - def _predict_one_batch(self, - *features - ): + def _predict_one_batch(self, *features): """Predicting for a mini batch""" - return self.model( - *features - ).cpu().detach().numpy() + return self.model(*features).cpu().detach().numpy() - def _get_targets_from_batch_df(self, - batch_df:pd.DataFrame, **kwargs, - )->torch.Tensor: + def _get_targets_from_batch_df( + self, + batch_df: pd.DataFrame, + **kwargs, + ) -> torch.Tensor: """Tell the `train()` method how to get target values from the `batch_df`. All sub-classes must re-implement this method. Use torch.tensor(np.array, dtype=..., device=self.device) to convert tensor. @@ -872,13 +849,12 @@ def _get_targets_from_batch_df(self, Target value tensor """ return self._as_tensor( - batch_df[self.target_column_to_train].values, - dtype=torch.float32 + batch_df[self.target_column_to_train].values, dtype=torch.float32 ) def _get_aa_indice_features_padding_zeros( - self, batch_df:pd.DataFrame - )->torch.LongTensor: + self, batch_df: pd.DataFrame + ) -> torch.LongTensor: """ Get indices values of variable length sequences using 128 ascii codes @@ -889,44 +865,37 @@ def _get_aa_indice_features_padding_zeros( max_len = self.fixed_sequence_len return self._as_tensor( get_ascii_indices( - batch_df['sequence'].apply( - lambda seq: seq + chr(0)*(max_len-len(seq)) - ).values.astype('U') + batch_df["sequence"] + .apply(lambda seq: seq + chr(0) * (max_len - len(seq))) + .values.astype("U") ), - dtype=torch.long + dtype=torch.long, ) - def _get_aa_indice_features( - self, batch_df:pd.DataFrame - )->torch.LongTensor: + def _get_aa_indice_features(self, batch_df: pd.DataFrame) -> torch.LongTensor: """ Get indices values for fixed length sequences with 128 ascii codes. """ return self._as_tensor( - get_ascii_indices( - batch_df['sequence'].values.astype('U') - ), - dtype=torch.long + get_ascii_indices(batch_df["sequence"].values.astype("U")), dtype=torch.long ) - def _get_26aa_indice_features( - self, batch_df:pd.DataFrame - )->torch.LongTensor: + def _get_26aa_indice_features(self, batch_df: pd.DataFrame) -> torch.LongTensor: """ Get indices values for 26 upper-case letters (amino acids), from 1 to 26. 0 is used for padding. """ return self._as_tensor( - get_batch_aa_indices( - batch_df['sequence'].values.astype('U') - ), - dtype=torch.long + get_batch_aa_indices(batch_df["sequence"].values.astype("U")), + dtype=torch.long, ) - def _get_features_from_batch_df(self, - batch_df:pd.DataFrame, **kwargs, - )->Union[torch.LongTensor, Tuple[torch.Tensor]]: + def _get_features_from_batch_df( + self, + batch_df: pd.DataFrame, + **kwargs, + ) -> Union[torch.LongTensor, Tuple[torch.Tensor]]: """ Any sub-class must re-implement this method: @@ -946,30 +915,23 @@ def _get_features_from_batch_df(self, """ return self._get_aa_features(batch_df) - def _get_aa_mod_features(self, - batch_df:pd.DataFrame, **kwargs, - )->Tuple[torch.Tensor]: - return ( - self._get_aa_features(batch_df), - self._get_mod_features(batch_df) - ) + def _get_aa_mod_features( + self, + batch_df: pd.DataFrame, + **kwargs, + ) -> Tuple[torch.Tensor]: + return (self._get_aa_features(batch_df), self._get_mod_features(batch_df)) - def _get_mod_features( - self, batch_df:pd.DataFrame - )->torch.Tensor: + def _get_mod_features(self, batch_df: pd.DataFrame) -> torch.Tensor: """ Get modification features. """ if self.fixed_sequence_len < 0: batch_df = batch_df.copy() - batch_df['nAA'] = batch_df.nAA.max() - return self._as_tensor( - get_batch_mod_feature(batch_df) - ) + batch_df["nAA"] = batch_df.nAA.max() + return self._as_tensor(get_batch_mod_feature(batch_df)) - def _get_aa_features(self, - batch_df:pd.DataFrame - )->torch.LongTensor: + def _get_aa_features(self, batch_df: pd.DataFrame) -> torch.LongTensor: """ Get AA indices """ @@ -978,10 +940,7 @@ def _get_aa_features(self, else: return self._get_aa_indice_features_padding_zeros(batch_df) - def _prepare_predict_data_df(self, - precursor_df:pd.DataFrame, - **kwargs - ): + def _prepare_predict_data_df(self, precursor_df: pd.DataFrame, **kwargs): """ This methods fills 0s in the column of `self.target_column_to_predict` in `precursor_df`, @@ -990,10 +949,7 @@ def _prepare_predict_data_df(self, precursor_df[self.target_column_to_predict] = 0.0 self.predict_df = precursor_df - def _prepare_train_data_df(self, - precursor_df:pd.DataFrame, - **kwargs - ): + def _prepare_train_data_df(self, precursor_df: pd.DataFrame, **kwargs): """Changes to the training dataframe can be implemented here. Parameters @@ -1003,10 +959,8 @@ def _prepare_train_data_df(self, """ pass - def _set_batch_predict_data(self, - batch_df:pd.DataFrame, - predict_values:np.ndarray, - **kwargs + def _set_batch_predict_data( + self, batch_df: pd.DataFrame, predict_values: np.ndarray, **kwargs ): """Set predicted values into `self.predict_df`. @@ -1018,72 +972,63 @@ def _set_batch_predict_data(self, predict_values : np.array Predicted values """ - predict_values[predict_values epoch: - warmup_epoch = epoch//2 + warmup_epoch = epoch // 2 return self.lr_scheduler_class( - self.optimizer, - num_warmup_steps=warmup_epoch, - num_training_steps=epoch + self.optimizer, num_warmup_steps=warmup_epoch, num_training_steps=epoch ) - def _pad_zeros_if_fixed_len(self, precursor_df:pd.DataFrame): + def _pad_zeros_if_fixed_len(self, precursor_df: pd.DataFrame): if self.fixed_sequence_len > 0: precursor_df.drop( - index=precursor_df[ - precursor_df.nAA>self.fixed_sequence_len - ].index, + index=precursor_df[precursor_df.nAA > self.fixed_sequence_len].index, inplace=True, ) precursor_df.reset_index(drop=True, inplace=True) - precursor_df['nAA'] = self.fixed_sequence_len + precursor_df["nAA"] = self.fixed_sequence_len - def _prepare_training(self, - precursor_df:pd.DataFrame, - lr:float, - **kwargs - ): - if 'nAA' not in precursor_df.columns: - precursor_df['nAA'] = precursor_df.sequence.str.len() + def _prepare_training(self, precursor_df: pd.DataFrame, lr: float, **kwargs): + if "nAA" not in precursor_df.columns: + precursor_df["nAA"] = precursor_df.sequence.str.len() self._pad_zeros_if_fixed_len(precursor_df) self._prepare_train_data_df(precursor_df, **kwargs) self.model.train() self.set_lr(lr) - def _check_predict_in_order(self, precursor_df:pd.DataFrame): + def _check_predict_in_order(self, precursor_df: pd.DataFrame): if is_precursor_refined(precursor_df): self._predict_in_order = True else: self._predict_in_order = False + def _inference_mode(): # torch.inference_mode() only available in torch>=1.9.0 - if float(torch.__version__[:torch.__version__.rfind(".")]) >= 1.9: + if float(torch.__version__[: torch.__version__.rfind(".")]) >= 1.9: return torch.inference_mode() else: return torch.no_grad() diff --git a/peptdeep/model/ms2.py b/peptdeep/model/ms2.py index 235824a6..fa7b1035 100644 --- a/peptdeep/model/ms2.py +++ b/peptdeep/model/ms2.py @@ -11,24 +11,23 @@ init_fragment_by_precursor_dataframe, update_sliced_fragment_dataframe, get_sliced_fragment_dataframe, - get_charged_frag_types + get_charged_frag_types, ) from peptdeep.utils import get_available_device from peptdeep.model.featurize import ( - get_batch_aa_indices, parse_instrument_indices, - get_batch_mod_feature + get_batch_aa_indices, + parse_instrument_indices, + get_batch_mod_feature, ) -from peptdeep.settings import ( - global_settings as settings, - model_const -) +from peptdeep.settings import global_settings as settings, model_const import peptdeep.model.model_interface as model_interface import peptdeep.model.building_block as building_block + class ModelMS2Transformer(torch.nn.Module): """Transformer model for MS2 prediction @@ -52,13 +51,15 @@ class ModelMS2Transformer(torch.nn.Module): hidden : int, optional Hidden layer size, by default 256 """ - def __init__(self, - num_frag_types:int, - num_modloss_types:int=0, - mask_modloss:bool=True, - dropout:float=0.1, - nlayers:int=4, - hidden:int=256, + + def __init__( + self, + num_frag_types: int, + num_modloss_types: int = 0, + mask_modloss: bool = True, + dropout: float = 0.1, + nlayers: int = 4, + hidden: int = 256, **kwargs, ): super().__init__() @@ -66,13 +67,15 @@ def __init__(self, self.dropout = torch.nn.Dropout(dropout) self._num_modloss_types = num_modloss_types - self._num_non_modloss = num_frag_types-num_modloss_types + self._num_non_modloss = num_frag_types - num_modloss_types self._mask_modloss = mask_modloss if num_modloss_types == 0: self._mask_modloss = True meta_dim = 8 - self.input_nn = building_block.Input_26AA_Mod_PositionalEncoding(hidden-meta_dim) + self.input_nn = building_block.Input_26AA_Mod_PositionalEncoding( + hidden - meta_dim + ) self.meta_nn = building_block.Meta_Embedding(meta_dim) @@ -87,62 +90,67 @@ def __init__(self, if num_modloss_types > 0: # for transfer learning of modloss frags - self.modloss_nn = torch.nn.ModuleList([ - building_block.Hidden_Transformer( - hidden, nlayers=1, dropout=dropout - ), - building_block.Decoder_Linear( - hidden, num_modloss_types, - ), - ]) + self.modloss_nn = torch.nn.ModuleList( + [ + building_block.Hidden_Transformer( + hidden, nlayers=1, dropout=dropout + ), + building_block.Decoder_Linear( + hidden, + num_modloss_types, + ), + ] + ) else: self.modloss_nn = None - def forward(self, + def forward( + self, aa_indices, mod_x, - charges:torch.Tensor, - NCEs:torch.Tensor, + charges: torch.Tensor, + NCEs: torch.Tensor, instrument_indices, ): - - in_x = self.dropout(self.input_nn( - aa_indices, mod_x - )) - meta_x = self.meta_nn( - charges, NCEs, instrument_indices - ).unsqueeze(1).repeat(1,in_x.size(1),1) - in_x = torch.cat((in_x, meta_x),2) + in_x = self.dropout(self.input_nn(aa_indices, mod_x)) + meta_x = ( + self.meta_nn(charges, NCEs, instrument_indices) + .unsqueeze(1) + .repeat(1, in_x.size(1), 1) + ) + in_x = torch.cat((in_x, meta_x), 2) hidden_x = self.hidden_nn(in_x) - hidden_x = self.dropout(hidden_x+in_x*0.2) + hidden_x = self.dropout(hidden_x + in_x * 0.2) - out_x = self.output_nn( - hidden_x - ) + out_x = self.output_nn(hidden_x) if self._num_modloss_types > 0: if self._mask_modloss: - out_x = torch.cat((out_x, torch.zeros( - *out_x.size()[:2],self._num_modloss_types, - device=in_x.device - )), 2) - else: - modloss_x = self.modloss_nn[0]( - in_x - ) + hidden_x - modloss_x = self.modloss_nn[-1]( - modloss_x + out_x = torch.cat( + ( + out_x, + torch.zeros( + *out_x.size()[:2], + self._num_modloss_types, + device=in_x.device, + ), + ), + 2, ) - out_x = torch.cat(( - out_x, modloss_x - ),2) + else: + modloss_x = self.modloss_nn[0](in_x) + hidden_x + modloss_x = self.modloss_nn[-1](modloss_x) + out_x = torch.cat((out_x, modloss_x), 2) + + return out_x[:, 3:, :] - return out_x[:,3:,:] class ModelMS2Bert(torch.nn.Module): """Using HuggingFace's BertEncoder for MS2 prediction""" - def __init__(self, + + def __init__( + self, num_frag_types, num_modloss_types=0, mask_modloss=True, @@ -157,20 +165,24 @@ def __init__(self, self.dropout = torch.nn.Dropout(dropout) self._num_modloss_types = num_modloss_types - self._num_non_modloss = num_frag_types-num_modloss_types + self._num_non_modloss = num_frag_types - num_modloss_types self._mask_modloss = mask_modloss if num_modloss_types == 0: self._mask_modloss = True meta_dim = 8 - self.input_nn = building_block.Input_26AA_Mod_PositionalEncoding(hidden-meta_dim) + self.input_nn = building_block.Input_26AA_Mod_PositionalEncoding( + hidden - meta_dim + ) self.meta_nn = building_block.Meta_Embedding(meta_dim) self._output_attentions = output_attentions self.hidden_nn = building_block.Hidden_HFace_Transformer( - hidden, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions + hidden, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) self.output_nn = building_block.Decoder_Linear( @@ -180,15 +192,20 @@ def __init__(self, if num_modloss_types > 0: # for transfer learning of modloss frags - self.modloss_nn = torch.nn.ModuleList([ - building_block.Hidden_HFace_Transformer( - hidden, nlayers=1, dropout=dropout, - output_attentions=output_attentions - ), - building_block.Decoder_Linear( - hidden, num_modloss_types, - ), - ]) + self.modloss_nn = torch.nn.ModuleList( + [ + building_block.Hidden_HFace_Transformer( + hidden, + nlayers=1, + dropout=dropout, + output_attentions=output_attentions, + ), + building_block.Decoder_Linear( + hidden, + num_modloss_types, + ), + ] + ) else: self.modloss_nn = None @@ -197,63 +214,66 @@ def output_attentions(self): return self._output_attentions @output_attentions.setter - def output_attentions(self, val:bool): + def output_attentions(self, val: bool): self._output_attentions = val self.hidden_nn.output_attentions = val self.modloss_nn[0].output_attentions = val - def forward(self, + def forward( + self, aa_indices, mod_x, - charges:torch.Tensor, - NCEs:torch.Tensor, + charges: torch.Tensor, + NCEs: torch.Tensor, instrument_indices, ): - in_x = self.dropout(self.input_nn( - aa_indices, mod_x - )) - meta_x = self.meta_nn( - charges, NCEs, instrument_indices - ).unsqueeze(1).repeat(1,in_x.size(1),1) - in_x = torch.cat((in_x, meta_x),2) + in_x = self.dropout(self.input_nn(aa_indices, mod_x)) + meta_x = ( + self.meta_nn(charges, NCEs, instrument_indices) + .unsqueeze(1) + .repeat(1, in_x.size(1), 1) + ) + in_x = torch.cat((in_x, meta_x), 2) hidden_x = self.hidden_nn(in_x) if self.output_attentions: self.attentions = hidden_x[1] else: self.attentions = None - hidden_x = self.dropout(hidden_x[0]+in_x*0.2) + hidden_x = self.dropout(hidden_x[0] + in_x * 0.2) - out_x = self.output_nn( - hidden_x - ) + out_x = self.output_nn(hidden_x) self.modloss_attentions = None if self._num_modloss_types > 0: if self._mask_modloss: - out_x = torch.cat((out_x, torch.zeros( - *out_x.size()[:2],self._num_modloss_types, - device=in_x.device - )), 2) - else: - modloss_x = self.modloss_nn[0]( - in_x + out_x = torch.cat( + ( + out_x, + torch.zeros( + *out_x.size()[:2], + self._num_modloss_types, + device=in_x.device, + ), + ), + 2, ) + else: + modloss_x = self.modloss_nn[0](in_x) if self.output_attentions: self.modloss_attentions = modloss_x[-1] modloss_x = modloss_x[0] + hidden_x - modloss_x = self.modloss_nn[-1]( - modloss_x - ) - out_x = torch.cat(( - out_x, modloss_x - ),2) + modloss_x = self.modloss_nn[-1](modloss_x) + out_x = torch.cat((out_x, modloss_x), 2) + + return out_x[:, 3:, :] - return out_x[:,3:,:] class ModelMS2pDeep(torch.nn.Module): """LSTM model for MS2 prediction similar to pDeep series""" - def __init__(self, + + def __init__( + self, num_frag_types, num_modloss_types=0, mask_modloss=True, @@ -265,20 +285,19 @@ def __init__(self, self.dropout = torch.nn.Dropout(dropout) self._num_modloss_types = num_modloss_types - self._num_non_modloss = num_frag_types-num_modloss_types + self._num_non_modloss = num_frag_types - num_modloss_types self._mask_modloss = mask_modloss if num_modloss_types == 0: self._mask_modloss = True BiRNN = True - hidden=512 - hidden_rnn_layer=2 + hidden = 512 + hidden_rnn_layer = 2 self.input_nn = building_block.InputAALSTM_cat_Meta(hidden) self.hidden_nn = building_block.SeqLSTM( - hidden, hidden, rnn_layer=hidden_rnn_layer, - bidirectional=BiRNN + hidden, hidden, rnn_layer=hidden_rnn_layer, bidirectional=BiRNN ) self.output_nn = building_block.OutputLSTM_cat_Meta( @@ -288,92 +307,89 @@ def __init__(self, if num_modloss_types: # for transfer learning of modloss frags - self.modloss_nn = torch.nn.ModuleList([ - building_block.SeqLSTM( - hidden, hidden, - rnn_layer=1, bidirectional=BiRNN - ), - building_block.SeqLSTM( - hidden, num_modloss_types, - rnn_layer=1, bidirectional=False - ), - ]) + self.modloss_nn = torch.nn.ModuleList( + [ + building_block.SeqLSTM( + hidden, hidden, rnn_layer=1, bidirectional=BiRNN + ), + building_block.SeqLSTM( + hidden, num_modloss_types, rnn_layer=1, bidirectional=False + ), + ] + ) else: self.modloss_nn = None - def forward(self, + def forward( + self, aa_indices, mod_x, - charges:torch.Tensor, - NCEs:torch.Tensor, + charges: torch.Tensor, + NCEs: torch.Tensor, instrument_indices, ): - - in_x = self.input_nn( - aa_indices, mod_x, - charges, NCEs, instrument_indices - ) + in_x = self.input_nn(aa_indices, mod_x, charges, NCEs, instrument_indices) in_x = self.dropout(in_x) hidden_x = self.hidden_nn(in_x) hidden_x = self.dropout(hidden_x) - out_x = self.output_nn( - hidden_x, - charges, NCEs, instrument_indices - ) + out_x = self.output_nn(hidden_x, charges, NCEs, instrument_indices) # modloss is mainly only for Phospho@S/T if self._num_modloss_types > 0: if self._mask_modloss: - out_x = torch.cat((out_x, torch.zeros( - *out_x.size()[:2],self._num_modloss_types, - device=in_x.device - )), 2) - else: - modloss_x = self.modloss_nn[0]( - in_x - ) + hidden_x - modloss_x = self.modloss_nn[-1]( - modloss_x + out_x = torch.cat( + ( + out_x, + torch.zeros( + *out_x.size()[:2], + self._num_modloss_types, + device=in_x.device, + ), + ), + 2, ) - out_x = torch.cat(( - out_x, modloss_x - ),2) + else: + modloss_x = self.modloss_nn[0](in_x) + hidden_x + modloss_x = self.modloss_nn[-1](modloss_x) + out_x = torch.cat((out_x, modloss_x), 2) + + return out_x[:, 3:, :] - return out_x[:,3:,:] class IntenAwareLoss(torch.nn.Module): """Loss weighted by intensity for MS2 models""" + def __init__(self, base_weight=0.2): super().__init__() self.w = base_weight def forward(self, pred, target): - return torch.mean( - (target+self.w)*torch.abs(target-pred) - ) + return torch.mean((target + self.w) * torch.abs(target - pred)) + + +mod_feature_size = len(model_const["mod_elements"]) +max_instrument_num = model_const["max_instrument_num"] +frag_types = settings["model"]["frag_types"] +max_frag_charge = settings["model"]["max_frag_charge"] +num_ion_types = len(frag_types) * max_frag_charge -mod_feature_size = len(model_const['mod_elements']) -max_instrument_num = model_const['max_instrument_num'] -frag_types = settings['model']['frag_types'] -max_frag_charge = settings['model']['max_frag_charge'] -num_ion_types = len(frag_types)*max_frag_charge class pDeepModel(model_interface.ModelInterface): """ `ModelInterface` for MS2 prediction models """ - def __init__(self, - charged_frag_types = get_charged_frag_types( - frag_types, max_frag_charge - ), + + def __init__( + self, + charged_frag_types=get_charged_frag_types(frag_types, max_frag_charge), dropout=0.1, mask_modloss=True, - modloss_type='modloss', - model_class:torch.nn.Module=ModelMS2Bert, - device:str='gpu', - **kwargs, #model params + modloss_type="modloss", + model_class: torch.nn.Module = ModelMS2Bert, + device: str = "gpu", + **kwargs, # model params ): super().__init__(device=device) self.charged_frag_types = charged_frag_types @@ -381,28 +397,29 @@ def __init__(self, self.charge_factor = 0.1 self.NCE_factor = 0.01 - self.model:ModelMS2Bert = None + self.model: ModelMS2Bert = None self.build( model_class, - num_frag_types = len(self.charged_frag_types), - num_modloss_types = len(self._modloss_frag_types), + num_frag_types=len(self.charged_frag_types), + num_modloss_types=len(self._modloss_frag_types), mask_modloss=mask_modloss, dropout=dropout, - **kwargs, # other model params + **kwargs, # other model params ) self.loss_func = torch.nn.L1Loss() self.min_inten = 1e-4 - def _get_modloss_frags(self, modloss='modloss'): + def _get_modloss_frags(self, modloss="modloss"): self._modloss_frag_types = [] - for i,frag in enumerate(self.charged_frag_types): + for i, frag in enumerate(self.charged_frag_types): if modloss in frag: self._modloss_frag_types.append(i) - def _prepare_train_data_df(self, - precursor_df:pd.DataFrame, - fragment_intensity_df:pd.DataFrame=None, + def _prepare_train_data_df( + self, + precursor_df: pd.DataFrame, + fragment_intensity_df: pd.DataFrame = None, ): self.frag_inten_df = fragment_intensity_df[self.charged_frag_types] # if np.all(precursor_df['nce'].values > 1): @@ -411,31 +428,33 @@ def _prepare_train_data_df(self, def _check_predict_in_order(self, precursor_df: pd.DataFrame): pass - def _prepare_predict_data_df(self, - precursor_df:pd.DataFrame, - reference_frag_df:pd.DataFrame=None, + def _prepare_predict_data_df( + self, + precursor_df: pd.DataFrame, + reference_frag_df: pd.DataFrame = None, ): if reference_frag_df is None and precursor_df.nAA.is_monotonic_increasing: self._predict_in_order = True - if 'frag_start_idx' in precursor_df.columns: + if "frag_start_idx" in precursor_df.columns: precursor_df.drop( - columns=['frag_start_idx','frag_stop_idx'], - inplace=True + columns=["frag_start_idx", "frag_stop_idx"], inplace=True ) else: self._predict_in_order = False self.predict_df = init_fragment_by_precursor_dataframe( - precursor_df, self.charged_frag_types, + precursor_df, + self.charged_frag_types, reference_fragment_df=reference_frag_df, - dtype=np.float32 + dtype=np.float32, ) # if np.all(precursor_df['nce'].values > 1): # precursor_df['nce'] = precursor_df['nce']*self.NCE_factor - def _get_features_from_batch_df(self, + def _get_features_from_batch_df( + self, batch_df: pd.DataFrame, **kwargs, ) -> Tuple[torch.Tensor]: @@ -443,75 +462,61 @@ def _get_features_from_batch_df(self, mod_x = self._get_mod_features(batch_df) - charges = self._as_tensor( - batch_df['charge'].values - ).unsqueeze(1)*self.charge_factor + charges = ( + self._as_tensor(batch_df["charge"].values).unsqueeze(1) * self.charge_factor + ) - nces = self._as_tensor( - batch_df['nce'].values - ).unsqueeze(1)*self.NCE_factor + nces = self._as_tensor(batch_df["nce"].values).unsqueeze(1) * self.NCE_factor instrument_indices = self._as_tensor( - parse_instrument_indices(batch_df['instrument']), - dtype=torch.long + parse_instrument_indices(batch_df["instrument"]), dtype=torch.long ) return aa_indices, mod_x, charges, nces, instrument_indices - def _get_targets_from_batch_df(self, - batch_df: pd.DataFrame, - fragment_intensity_df:pd.DataFrame=None + def _get_targets_from_batch_df( + self, batch_df: pd.DataFrame, fragment_intensity_df: pd.DataFrame = None ) -> torch.Tensor: return self._as_tensor( get_sliced_fragment_dataframe( fragment_intensity_df, - batch_df[ - ['frag_start_idx','frag_stop_idx'] - ].values + batch_df[["frag_start_idx", "frag_stop_idx"]].values, ).values - ).view(-1, - batch_df.nAA.values[0]-1, - len(self.charged_frag_types) - ) + ).view(-1, batch_df.nAA.values[0] - 1, len(self.charged_frag_types)) - def _set_batch_predict_data(self, + def _set_batch_predict_data( + self, batch_df: pd.DataFrame, - predicts:np.ndarray, + predicts: np.ndarray, **kwargs, ): apex_intens = predicts.reshape((len(batch_df), -1)).max(axis=1) - apex_intens[apex_intens<=0] = 1 - predicts /= apex_intens.reshape((-1,1,1)) - predicts[predictspd.DataFrame: + def test( + self, + precursor_df: pd.DataFrame, + fragment_intensity_df: pd.DataFrame, + default_instrument: str = "Lumos", + default_nce: float = 30.0, + ) -> pd.DataFrame: if "instrument" not in precursor_df.columns: precursor_df["instrument"] = default_instrument if "nce" not in precursor_df.columns: @@ -541,23 +547,24 @@ def test(self, ) return calc_ms2_similarity( precursor_df, - self.predict( - precursor_df, reference_frag_df=fragment_intensity_df - )[columns], - fragment_intensity_df=fragment_intensity_df[columns] + self.predict(precursor_df, reference_frag_df=fragment_intensity_df)[ + columns + ], + fragment_intensity_df=fragment_intensity_df[columns], )[-1] - def train(self, + def train( + self, precursor_df: pd.DataFrame, fragment_intensity_df, *, batch_size=1024, epoch=20, warmup_epoch=0, - lr = 1e-5, + lr=1e-5, verbose=False, verbose_each_epoch=False, - **kwargs + **kwargs, ): return super().train( precursor_df, @@ -568,96 +575,107 @@ def train(self, lr=lr, verbose=verbose, verbose_each_epoch=verbose_each_epoch, - **kwargs + **kwargs, ) - def predict(self, + def predict( + self, precursor_df: pd.DataFrame, *, batch_size=1024, verbose=False, reference_frag_df=None, - **kwargs + **kwargs, ) -> pd.DataFrame: return super().predict( precursor_df, batch_size=batch_size, verbose=verbose, reference_frag_df=reference_frag_df, - **kwargs + **kwargs, ) - def predict_mp(self, - **kwargs - ) -> pd.DataFrame: + def predict_mp(self, **kwargs) -> pd.DataFrame: warnings.warn( "Please use pretrained_models.ModelManager::predict_all() " "for MS2 prediction with multiprocessing" ) - def bootstrap_nce_search(self, - psm_df:pd.DataFrame, - fragment_intensity_df:pd.DataFrame, - nce_first=15, nce_last=45, nce_step=3, - instrument = 'Lumos', - charged_frag_types:List = None, - metric = 'PCC>0.9', # or 'median PCC' - max_psm_subset = 3000, - n_bootstrap = 3, - callback = None + def bootstrap_nce_search( + self, + psm_df: pd.DataFrame, + fragment_intensity_df: pd.DataFrame, + nce_first=15, + nce_last=45, + nce_step=3, + instrument="Lumos", + charged_frag_types: List = None, + metric="PCC>0.9", # or 'median PCC' + max_psm_subset=3000, + n_bootstrap=3, + callback=None, ): nce_list = [] for i in range(n_bootstrap): nce, instrument = self.grid_nce_search( - psm_df, fragment_intensity_df, - nce_first, nce_last, nce_step, + psm_df, + fragment_intensity_df, + nce_first, + nce_last, + nce_step, [instrument], charged_frag_types, - metric, max_psm_subset, n_bootstrap, - callback + metric, + max_psm_subset, + n_bootstrap, + callback, ) nce_list.append(nce) return np.median(nce_list), instrument - def grid_nce_search(self, - psm_df:pd.DataFrame, - fragment_intensity_df:pd.DataFrame, - nce_first=15, nce_last=45, nce_step=3, - search_instruments = ['Lumos'], - charged_frag_types:List = None, - metric = 'PCC>0.9', # or 'median PCC' - max_psm_subset = 1000000, - callback = None + def grid_nce_search( + self, + psm_df: pd.DataFrame, + fragment_intensity_df: pd.DataFrame, + nce_first=15, + nce_last=45, + nce_step=3, + search_instruments=["Lumos"], + charged_frag_types: List = None, + metric="PCC>0.9", # or 'median PCC' + max_psm_subset=1000000, + callback=None, ): if len(psm_df) > max_psm_subset: psm_df = psm_df.sample(max_psm_subset).copy() best_pcc = -1 - best_nce = 0. + best_nce = 0.0 best_instrument = None - if 'median' in metric: - metric_row = '50%' + if "median" in metric: + metric_row = "50%" else: - metric_row = '>0.90' - search_instruments = set([ - settings['model_mgr']['instrument_group'][inst] - for inst in search_instruments - ]) + metric_row = ">0.90" + search_instruments = set( + [ + settings["model_mgr"]["instrument_group"][inst] + for inst in search_instruments + ] + ) for inst in search_instruments: - for nce in np.arange(nce_first, nce_last+nce_step, nce_step): - psm_df['nce'] = nce - psm_df['instrument'] = inst + for nce in np.arange(nce_first, nce_last + nce_step, nce_step): + psm_df["nce"] = nce + psm_df["instrument"] = inst predict_inten_df = self.predict( - psm_df, - reference_frag_df=fragment_intensity_df + psm_df, reference_frag_df=fragment_intensity_df ) df, metrics = calc_ms2_similarity( psm_df, predict_inten_df, fragment_intensity_df, charged_frag_types=charged_frag_types, - metrics=['PCC'] + metrics=["PCC"], ) - pcc = metrics.loc[metric_row, 'PCC'] + pcc = metrics.loc[metric_row, "PCC"] if pcc > best_pcc: best_pcc = pcc best_nce = nce @@ -666,8 +684,7 @@ def grid_nce_search(self, def normalize_fragment_intensities( - psm_df:pd.DataFrame, - frag_intensity_df:pd.DataFrame + psm_df: pd.DataFrame, frag_intensity_df: pd.DataFrame ): """Normalize the intensities to 0-1 values inplace @@ -682,16 +699,17 @@ def normalize_fragment_intensities( """ frag_intensity_df_np = frag_intensity_df.to_numpy() for i, (frag_start_idx, frag_stop_idx) in enumerate( - psm_df[['frag_start_idx','frag_stop_idx']].values + psm_df[["frag_start_idx", "frag_stop_idx"]].values ): intens = frag_intensity_df_np[frag_start_idx:frag_stop_idx] max_inten = np.max(intens) if max_inten > 0: intens /= max_inten - frag_intensity_df_np[frag_start_idx:frag_stop_idx,:] = intens + frag_intensity_df_np[frag_start_idx:frag_stop_idx, :] = intens frag_intensity_df.loc[:] = frag_intensity_df_np -def pearson_correlation(x:torch.Tensor, y:torch.Tensor): + +def pearson_correlation(x: torch.Tensor, y: torch.Tensor): """Compute pearson correlation between 2 batches of 1-D tensors Parameters @@ -704,33 +722,36 @@ def pearson_correlation(x:torch.Tensor, y:torch.Tensor): """ return torch.cosine_similarity( - x-x.mean(dim=1, keepdim=True), - y-y.mean(dim=1, keepdim=True), - dim = 1 + x - x.mean(dim=1, keepdim=True), y - y.mean(dim=1, keepdim=True), dim=1 ) -#legacy -pearson=pearson_correlation + +# legacy +pearson = pearson_correlation + def spectral_angle(cos): - cos[cos>1] = 1 + cos[cos > 1] = 1 return 1 - 2 * torch.arccos(cos) / np.pi + def _get_ranks(x: torch.Tensor, device) -> torch.Tensor: sorted_idx = x.argsort(dim=1) flat_idx = ( - sorted_idx+torch.arange( - x.size(0), device=device - ).unsqueeze(1)*x.size(1) + sorted_idx + torch.arange(x.size(0), device=device).unsqueeze(1) * x.size(1) ).flatten() ranks = torch.zeros_like(flat_idx) - ranks[flat_idx] = torch.arange( - x.size(1), device=device - ).unsqueeze(0).repeat(x.size(0),1).flatten() + ranks[flat_idx] = ( + torch.arange(x.size(1), device=device) + .unsqueeze(0) + .repeat(x.size(0), 1) + .flatten() + ) ranks = ranks.reshape(x.size()) - ranks[x==0] = 0 + ranks[x == 0] = 0 return ranks + def spearman_correlation(x: torch.Tensor, y: torch.Tensor, device): """Compute spearman correlation between 2 batches of 1-D tensors @@ -748,41 +769,44 @@ def spearman_correlation(x: torch.Tensor, y: torch.Tensor, device): n = x.size(1) upper = 6 * torch.sum((x_rank - y_rank).pow(2), dim=1) - down = n * (n ** 2 - 1.0) + down = n * (n**2 - 1.0) return 1.0 - (upper / down) -#legacy + +# legacy spearman = spearman_correlation -def add_cutoff_metric( - metrics_describ, metrics_df, thres=0.9 -): + +def add_cutoff_metric(metrics_describ, metrics_df, thres=0.9): vals = [] for col in metrics_describ.columns.values: - vals.append(metrics_df.loc[metrics_df[col]>thres, col].count()/len(metrics_df)) - metrics_describ.loc[f'>{thres:.2f}'] = vals + vals.append( + metrics_df.loc[metrics_df[col] > thres, col].count() / len(metrics_df) + ) + metrics_describ.loc[f">{thres:.2f}"] = vals return metrics_describ + def calc_ms2_similarity( psm_df: pd.DataFrame, predict_intensity_df: pd.DataFrame, fragment_intensity_df: pd.DataFrame, - charged_frag_types: List=None, - metrics = ['PCC','COS','SA','SPC'], - GPU = True, + charged_frag_types: List = None, + metrics=["PCC", "COS", "SA", "SPC"], + GPU=True, batch_size=10240, verbose=False, spc_top_k=0, -)->Tuple[pd.DataFrame, pd.DataFrame]: +) -> Tuple[pd.DataFrame, pd.DataFrame]: if GPU: device, _ = get_available_device() else: - device = torch.device('cpu') + device = torch.device("cpu") - if charged_frag_types is None or len(charged_frag_types)==0: + if charged_frag_types is None or len(charged_frag_types) == 0: charged_frag_types = fragment_intensity_df.columns.values - _grouped = psm_df.groupby('nAA') + _grouped = psm_df.groupby("nAA") if verbose: batch_tqdm = tqdm(_grouped) @@ -794,73 +818,65 @@ def calc_ms2_similarity( for nAA, df_group in batch_tqdm: for i in range(0, len(df_group), batch_size): - batch_end = i+batch_size - batch_df = df_group.iloc[i:batch_end,:] + batch_end = i + batch_size + batch_df = df_group.iloc[i:batch_end, :] pred_intens = torch.tensor( get_sliced_fragment_dataframe( predict_intensity_df, - batch_df[ - ['frag_start_idx','frag_stop_idx'] - ].values, - charged_frag_types + batch_df[["frag_start_idx", "frag_stop_idx"]].values, + charged_frag_types, ).values, - dtype=torch.float32, device=device - ).reshape( - -1, (nAA-1)*len(charged_frag_types) - ) + dtype=torch.float32, + device=device, + ).reshape(-1, (nAA - 1) * len(charged_frag_types)) frag_intens = torch.tensor( get_sliced_fragment_dataframe( fragment_intensity_df, - batch_df[ - ['frag_start_idx','frag_stop_idx'] - ].values, - charged_frag_types + batch_df[["frag_start_idx", "frag_stop_idx"]].values, + charged_frag_types, ).values, - dtype=torch.float32, device=device - ).reshape( - -1, (nAA-1)*len(charged_frag_types) - ) + dtype=torch.float32, + device=device, + ).reshape(-1, (nAA - 1) * len(charged_frag_types)) - if 'PCC' in metrics: - psm_df.loc[batch_df.index,'PCC'] = pearson_correlation( - pred_intens, frag_intens - ).cpu().detach().numpy() - - if 'COS' in metrics or 'SA' in metrics: - cos = torch.cosine_similarity( - pred_intens, frag_intens, dim=1 + if "PCC" in metrics: + psm_df.loc[batch_df.index, "PCC"] = ( + pearson_correlation(pred_intens, frag_intens).cpu().detach().numpy() ) - psm_df.loc[ - batch_df.index,'COS' - ] = cos.cpu().detach().numpy() - - if 'SA' in metrics: - psm_df.loc[ - batch_df.index,'SA' - ] = spectral_angle( - cos - ).cpu().detach().numpy() - - if 'SPC' in metrics: + + if "COS" in metrics or "SA" in metrics: + cos = torch.cosine_similarity(pred_intens, frag_intens, dim=1) + psm_df.loc[batch_df.index, "COS"] = cos.cpu().detach().numpy() + + if "SA" in metrics: + psm_df.loc[batch_df.index, "SA"] = ( + spectral_angle(cos).cpu().detach().numpy() + ) + + if "SPC" in metrics: if spc_top_k > 1 and spc_top_k < frag_intens.size(1): sorted_idx = frag_intens.argsort(dim=1, descending=True) flat_idx = ( - sorted_idx[:,:spc_top_k]+torch.arange( - frag_intens.size(0), dtype=torch.int, - device=device - ).unsqueeze(1)*frag_intens.size(1) + sorted_idx[:, :spc_top_k] + + torch.arange( + frag_intens.size(0), dtype=torch.int, device=device + ).unsqueeze(1) + * frag_intens.size(1) ).flatten() pred_intens = pred_intens.flatten()[flat_idx].reshape( - sorted_idx.size(0),-1 + sorted_idx.size(0), -1 ) frag_intens = frag_intens.flatten()[flat_idx].reshape( - sorted_idx.size(0),-1 + sorted_idx.size(0), -1 ) - psm_df.loc[batch_df.index,'SPC'] = spearman_correlation( - pred_intens, frag_intens, device - ).cpu().detach().numpy() + psm_df.loc[batch_df.index, "SPC"] = ( + spearman_correlation(pred_intens, frag_intens, device) + .cpu() + .detach() + .numpy() + ) metrics_describ = psm_df[metrics].describe() add_cutoff_metric(metrics_describ, psm_df, thres=0.9) diff --git a/peptdeep/model/rt.py b/peptdeep/model/rt.py index 8c1ff9ca..66a72840 100644 --- a/peptdeep/model/rt.py +++ b/peptdeep/model/rt.py @@ -2,10 +2,7 @@ import pandas as pd import numpy as np -from peptdeep.model.featurize import ( - get_batch_aa_indices, - get_batch_mod_feature -) +from peptdeep.model.featurize import get_batch_aa_indices, get_batch_mod_feature from peptdeep.settings import model_const @@ -13,35 +10,39 @@ import peptdeep.model.building_block as building_block from peptdeep.utils import evaluate_linear_regression -mod_feature_size = len(model_const['mod_elements']) +mod_feature_size = len(model_const["mod_elements"]) IRT_PEPTIDE_DF = pd.DataFrame( - [['LGGNEQVTR', 'RT-pep a', -24.92, '', ''], - ['GAGSSEPVTGLDAK', 'RT-pep b', 0.00, '', ''], - ['VEATFGVDESNAK', 'RT-pep c', 12.39, '', ''], - ['YILAGVENSK', 'RT-pep d', 19.79, '', ''], - ['TPVISGGPYEYR', 'RT-pep e', 28.71, '', ''], - ['TPVITGAPYEYR', 'RT-pep f', 33.38, '', ''], - ['DGLDAASYYAPVR', 'RT-pep g', 42.26, '', ''], - ['ADVTPADFSEWSK', 'RT-pep h', 54.62, '', ''], - ['GTFIIDPGGVIR', 'RT-pep i', 70.52, '', ''], - ['GTFIIDPAAVIR', 'RT-pep k', 87.23, '', ''], - ['LFLQFGAQGSPFLK', 'RT-pep l', 100.00, '', '']], - columns=['sequence','pep_name','irt', 'mods', 'mod_sites'] + [ + ["LGGNEQVTR", "RT-pep a", -24.92, "", ""], + ["GAGSSEPVTGLDAK", "RT-pep b", 0.00, "", ""], + ["VEATFGVDESNAK", "RT-pep c", 12.39, "", ""], + ["YILAGVENSK", "RT-pep d", 19.79, "", ""], + ["TPVISGGPYEYR", "RT-pep e", 28.71, "", ""], + ["TPVITGAPYEYR", "RT-pep f", 33.38, "", ""], + ["DGLDAASYYAPVR", "RT-pep g", 42.26, "", ""], + ["ADVTPADFSEWSK", "RT-pep h", 54.62, "", ""], + ["GTFIIDPGGVIR", "RT-pep i", 70.52, "", ""], + ["GTFIIDPAAVIR", "RT-pep k", 87.23, "", ""], + ["LFLQFGAQGSPFLK", "RT-pep l", 100.00, "", ""], + ], + columns=["sequence", "pep_name", "irt", "mods", "mod_sites"], ) -IRT_PEPTIDE_DF['nAA'] = IRT_PEPTIDE_DF.sequence.str.len() +IRT_PEPTIDE_DF["nAA"] = IRT_PEPTIDE_DF.sequence.str.len() -#legacy +# legacy irt_pep = IRT_PEPTIDE_DF class Model_RT_Bert(torch.nn.Module): """Transformer model for RT prediction""" - def __init__(self, - dropout = 0.1, - nlayers = 4, - hidden = 128, + + def __init__( + self, + dropout=0.1, + nlayers=4, + hidden=128, output_attentions=False, **kwargs, ): @@ -54,8 +55,10 @@ def __init__(self, self._output_attentions = output_attentions self.hidden_nn = building_block.Hidden_HFace_Transformer( - hidden, nlayers=nlayers, dropout=dropout, - output_attentions=output_attentions + hidden, + nlayers=nlayers, + dropout=dropout, + output_attentions=output_attentions, ) self.output_nn = torch.nn.Sequential( @@ -70,31 +73,32 @@ def output_attentions(self): return self._output_attentions @output_attentions.setter - def output_attentions(self, val:bool): + def output_attentions(self, val: bool): self._output_attentions = val self.hidden_nn.output_attentions = val - def forward(self, + def forward( + self, aa_indices, mod_x, ): - x = self.dropout(self.input_nn( - aa_indices, mod_x - )) + x = self.dropout(self.input_nn(aa_indices, mod_x)) hidden_x = self.hidden_nn(x) if self.output_attentions: self.attentions = hidden_x[1] else: self.attentions = None - x = self.dropout(hidden_x[0]+x*0.2) + x = self.dropout(hidden_x[0] + x * 0.2) return self.output_nn(x).squeeze(1) class Model_RT_LSTM_CNN(torch.nn.Module): """CNN+LSTM model for RT prediction""" - def __init__(self, + + def __init__( + self, dropout=0.2, ): super().__init__() @@ -102,15 +106,12 @@ def __init__(self, self.dropout = torch.nn.Dropout(dropout) hidden = 256 - self.rt_encoder = building_block.Encoder_26AA_Mod_CNN_LSTM_AttnSum( - hidden - ) + self.rt_encoder = building_block.Encoder_26AA_Mod_CNN_LSTM_AttnSum(hidden) - self.rt_decoder = building_block.Decoder_Linear( - hidden, 1 - ) + self.rt_decoder = building_block.Decoder_Linear(hidden, 1) - def forward(self, + def forward( + self, aa_indices, mod_x, ): @@ -118,52 +119,53 @@ def forward(self, x = self.dropout(x) return self.rt_decoder(x).squeeze(1) -#legacy + + +# legacy Model_RT_LSTM = Model_RT_LSTM_CNN + class AlphaRTModel(model_interface.ModelInterface): """ `ModelInterface` for RT models """ - def __init__(self, + + def __init__( + self, dropout=0.1, - model_class:torch.nn.Module=Model_RT_LSTM_CNN, #model defined above - device:str='gpu', + model_class: torch.nn.Module = Model_RT_LSTM_CNN, # model defined above + device: str = "gpu", **kwargs, ): super().__init__(device=device) - self.model:Model_RT_LSTM_CNN = None - self.build( - model_class, - dropout=dropout, - **kwargs - ) - self.target_column_to_predict = 'rt_pred' - self.target_column_to_train = 'rt_norm' + self.model: Model_RT_LSTM_CNN = None + self.build(model_class, dropout=dropout, **kwargs) + self.target_column_to_predict = "rt_pred" + self.target_column_to_train = "rt_norm" - def test(self, + def test( + self, precursor_df: pd.DataFrame, *, - batch_size:int = 1024, + batch_size: int = 1024, ): return evaluate_linear_regression( - self.predict( - precursor_df, batch_size=batch_size - ), - x="rt_pred", y="rt_norm" + self.predict(precursor_df, batch_size=batch_size), x="rt_pred", y="rt_norm" ) - def _get_features_from_batch_df(self, + def _get_features_from_batch_df( + self, batch_df: pd.DataFrame, ): return ( self._get_26aa_indice_features(batch_df), - self._get_mod_features(batch_df) + self._get_mod_features(batch_df), ) - def add_irt_column_to_precursor_df(self, + def add_irt_column_to_precursor_df( + self, precursor_df: pd.DataFrame, - irt_pep_df:pd.DataFrame = None, + irt_pep_df: pd.DataFrame = None, ): if irt_pep_df is None: irt_pep_df = IRT_PEPTIDE_DF @@ -184,5 +186,5 @@ def add_irt_column_to_precursor_df(self, # end linear regression slope = eval_df.slope.values[0] intercept = eval_df.intercept.values[0] - precursor_df['irt_pred'] = precursor_df.rt_pred*slope + intercept + precursor_df["irt_pred"] = precursor_df.rt_pred * slope + intercept return precursor_df diff --git a/peptdeep/pretrained_models.py b/peptdeep/pretrained_models.py index 72f8e981..1e82a240 100644 --- a/peptdeep/pretrained_models.py +++ b/peptdeep/pretrained_models.py @@ -12,6 +12,7 @@ import typing from pickle import UnpicklingError import torch.multiprocessing as mp + if sys.platform.lower().startswith("linux"): # to prevent `too many open files` bug on Linux mp.set_sharing_strategy("file_system") @@ -22,60 +23,48 @@ from alphabase.peptide.fragment import ( create_fragment_mz_dataframe, - concat_precursor_fragment_dataframes -) -from alphabase.peptide.precursor import ( - refine_precursor_df, - update_precursor_mz -) -from alphabase.peptide.mobility import ( - mobility_to_ccs_for_df, - ccs_to_mobility_for_df + concat_precursor_fragment_dataframes, ) +from alphabase.peptide.precursor import refine_precursor_df, update_precursor_mz +from alphabase.peptide.mobility import mobility_to_ccs_for_df, ccs_to_mobility_for_df from peptdeep.settings import global_settings, add_user_defined_modifications from peptdeep.utils import logging, process_bar from peptdeep.settings import global_settings from peptdeep.model.ms2 import ( - pDeepModel, normalize_fragment_intensities, - calc_ms2_similarity + pDeepModel, + normalize_fragment_intensities, + calc_ms2_similarity, ) from peptdeep.model.rt import AlphaRTModel from peptdeep.model.ccs import AlphaCCSModel from peptdeep.model.charge import ChargeModelForAASeq, ChargeModelForModAASeq -from peptdeep.utils import ( - uniform_sampling, evaluate_linear_regression -) +from peptdeep.utils import uniform_sampling, evaluate_linear_regression from peptdeep.settings import global_settings, update_global_settings pretrain_dir = os.path.join( os.path.join( - os.path.expanduser( - global_settings['PEPTDEEP_HOME'] - ), - "pretrained_models" + os.path.expanduser(global_settings["PEPTDEEP_HOME"]), "pretrained_models" ) ) if not os.path.exists(pretrain_dir): os.makedirs(pretrain_dir) -model_zip_name = global_settings['local_model_zip_name'] -model_url = global_settings['model_url'] +model_zip_name = global_settings["local_model_zip_name"] +model_url = global_settings["model_url"] + +model_zip = os.path.join(pretrain_dir, model_zip_name) -model_zip = os.path.join( - pretrain_dir, model_zip_name -) def is_model_zip(downloaded_zip): with ZipFile(downloaded_zip) as zip: - return any(x=='generic/ms2.pth' for x in zip.namelist()) + return any(x == "generic/ms2.pth" for x in zip.namelist()) -def download_models( - url:str=model_url, overwrite=True -): + +def download_models(url: str = model_url, overwrite=True): """ Parameters ---------- @@ -93,97 +82,92 @@ def download_models( If remote url is not accessible. """ if not os.path.isfile(url): - logging.info(f'Downloading {model_zip_name} ...') + logging.info(f"Downloading {model_zip_name} ...") try: context = ssl._create_unverified_context() requests = urllib.request.urlopen(url, context=context, timeout=10) - with open(model_zip, 'wb') as f: + with open(model_zip, "wb") as f: f.write(requests.read()) - except ( - socket.timeout, - urllib.error.URLError, - urllib.error.HTTPError - ) as e: + except (socket.timeout, urllib.error.URLError, urllib.error.HTTPError) as e: raise FileNotFoundError( - 'Downloading model failed! Please download the ' + "Downloading model failed! Please download the " f'zip or tar file by yourself from "{url}",' - ' and use \n' + " and use \n" f'"peptdeep --install-model /path/to/{model_zip_name}.zip"\n' - ' to install the models' + " to install the models" ) else: - shutil.copy( - url, model_zip - ) - logging.info(f'The pretrained models had been downloaded in {model_zip}') + shutil.copy(url, model_zip) + logging.info(f"The pretrained models had been downloaded in {model_zip}") + if not os.path.exists(model_zip): download_models() -model_mgr_settings = global_settings['model_mgr'] +model_mgr_settings = global_settings["model_mgr"] -def count_mods(psm_df)->pd.DataFrame: - mods = psm_df[ - psm_df.mods.str.len()>0 - ].mods.apply(lambda x: x.split(';')) + +def count_mods(psm_df) -> pd.DataFrame: + mods = psm_df[psm_df.mods.str.len() > 0].mods.apply(lambda x: x.split(";")) mod_dict = {} - mod_dict['mutation'] = {} - mod_dict['mutation']['spec_count'] = 0 + mod_dict["mutation"] = {} + mod_dict["mutation"]["spec_count"] = 0 for one_mods in mods.values: for mod in set(one_mods): - items = mod.split('->') - if ( - len(items)==2 - and len(items[0])==3 - and len(items[1])==5 - ): - mod_dict['mutation']['spec_count'] += 1 + items = mod.split("->") + if len(items) == 2 and len(items[0]) == 3 and len(items[1]) == 5: + mod_dict["mutation"]["spec_count"] += 1 elif mod not in mod_dict: mod_dict[mod] = {} - mod_dict[mod]['spec_count'] = 1 + mod_dict[mod]["spec_count"] = 1 else: - mod_dict[mod]['spec_count'] += 1 - return pd.DataFrame().from_dict( - mod_dict, orient='index' - ).reset_index(drop=False).rename( - columns={'index':'mod'} - ).sort_values( - 'spec_count',ascending=False - ).reset_index(drop=True) + mod_dict[mod]["spec_count"] += 1 + return ( + pd.DataFrame() + .from_dict(mod_dict, orient="index") + .reset_index(drop=False) + .rename(columns={"index": "mod"}) + .sort_values("spec_count", ascending=False) + .reset_index(drop=True) + ) + def psm_sampling_with_important_mods( - psm_df, n_sample, - top_n_mods = 10, - n_sample_each_mod = 0, - uniform_sampling_column = None, + psm_df, + n_sample, + top_n_mods=10, + n_sample_each_mod=0, + uniform_sampling_column=None, random_state=1337, ): psm_df_list = [] if uniform_sampling_column is None: + def _sample(psm_df, n): if n < len(psm_df): - return psm_df.sample( - n, replace=False, - random_state=random_state - ).copy() + return psm_df.sample(n, replace=False, random_state=random_state).copy() else: return psm_df.copy() else: + def _sample(psm_df, n): - if len(psm_df) == 0: return psm_df + if len(psm_df) == 0: + return psm_df return uniform_sampling( - psm_df, target=uniform_sampling_column, - n_train = n, random_state=random_state + psm_df, + target=uniform_sampling_column, + n_train=n, + random_state=random_state, ) psm_df_list.append(_sample(psm_df, n_sample)) if n_sample_each_mod > 0: mod_df = count_mods(psm_df) - mod_df = mod_df[mod_df['mod']!='mutation'] + mod_df = mod_df[mod_df["mod"] != "mutation"] if len(mod_df) > top_n_mods: - mod_df = mod_df.iloc[:top_n_mods,:] - for mod in mod_df['mod'].values: + mod_df = mod_df.iloc[:top_n_mods, :] + for mod in mod_df["mod"].values: psm_df_list.append( _sample( psm_df[psm_df.mods.str.contains(mod, regex=False)], @@ -195,43 +179,43 @@ def _sample(psm_df, n): else: return pd.DataFrame() + def load_phos_models(mask_modloss=True): ms2_model = pDeepModel(mask_modloss=mask_modloss) - ms2_model.load(model_zip, model_path_in_zip='phospho/ms2_phos.pth') + ms2_model.load(model_zip, model_path_in_zip="phospho/ms2_phos.pth") rt_model = AlphaRTModel() - rt_model.load(model_zip, model_path_in_zip='phospho/rt_phos.pth') + rt_model.load(model_zip, model_path_in_zip="phospho/rt_phos.pth") ccs_model = AlphaCCSModel() - ccs_model.load(model_zip, model_path_in_zip='generic/ccs.pth') + ccs_model.load(model_zip, model_path_in_zip="generic/ccs.pth") return ms2_model, rt_model, ccs_model + def load_models(mask_modloss=True): ms2_model = pDeepModel(mask_modloss=mask_modloss) - ms2_model.load(model_zip, model_path_in_zip='generic/ms2.pth') + ms2_model.load(model_zip, model_path_in_zip="generic/ms2.pth") rt_model = AlphaRTModel() - rt_model.load(model_zip, model_path_in_zip='generic/rt.pth') + rt_model.load(model_zip, model_path_in_zip="generic/rt.pth") ccs_model = AlphaCCSModel() - ccs_model.load(model_zip, model_path_in_zip='generic/ccs.pth') + ccs_model.load(model_zip, model_path_in_zip="generic/ccs.pth") return ms2_model, rt_model, ccs_model -def load_models_by_model_type_in_zip(model_type_in_zip:str, mask_modloss=True): + +def load_models_by_model_type_in_zip(model_type_in_zip: str, mask_modloss=True): ms2_model = pDeepModel(mask_modloss=mask_modloss) - ms2_model.load(model_zip, model_path_in_zip=f'{model_type_in_zip}/ms2.pth') + ms2_model.load(model_zip, model_path_in_zip=f"{model_type_in_zip}/ms2.pth") rt_model = AlphaRTModel() - rt_model.load(model_zip, model_path_in_zip=f'{model_type_in_zip}/rt.pth') + rt_model.load(model_zip, model_path_in_zip=f"{model_type_in_zip}/rt.pth") ccs_model = AlphaCCSModel() - ccs_model.load(model_zip, model_path_in_zip=f'{model_type_in_zip}/ccs.pth') + ccs_model.load(model_zip, model_path_in_zip=f"{model_type_in_zip}/ccs.pth") return ms2_model, rt_model, ccs_model -def clear_error_modloss_intensities( - fragment_mz_df, fragment_intensity_df -): +def clear_error_modloss_intensities(fragment_mz_df, fragment_intensity_df): # clear error modloss intensities for col in fragment_mz_df.columns.values: - if 'modloss' in col: - fragment_intensity_df.loc[ - fragment_mz_df[col]==0,col - ] = 0 + if "modloss" in col: + fragment_intensity_df.loc[fragment_mz_df[col] == 0, col] = 0 + class ModelManager(object): """ @@ -277,9 +261,11 @@ class ModelManager(object): NCE and instrument type. This will change `self.nce` and `self.instrument` values. Defaults to global_settings['model_mgr']['transfer']['grid_nce_search']. """ - def __init__(self, - mask_modloss:bool=False, - device:str="gpu", + + def __init__( + self, + mask_modloss: bool = False, + device: str = "gpu", ): """ Parameters @@ -296,143 +282,122 @@ def __init__(self, """ self._train_psm_logging = True - self.ms2_model:pDeepModel = pDeepModel(mask_modloss=mask_modloss, device=device) - self.rt_model:AlphaRTModel = AlphaRTModel(device=device) - self.ccs_model:AlphaCCSModel = AlphaCCSModel(device=device) + self.ms2_model: pDeepModel = pDeepModel( + mask_modloss=mask_modloss, device=device + ) + self.rt_model: AlphaRTModel = AlphaRTModel(device=device) + self.ccs_model: AlphaCCSModel = AlphaCCSModel(device=device) self.load_installed_models() - self.charge_model:typing.Union[ChargeModelForAASeq,ChargeModelForModAASeq] = None + self.charge_model: typing.Union[ChargeModelForAASeq, ChargeModelForModAASeq] = ( + None + ) self.reset_by_global_settings(reload_models=False) - def reset_by_global_settings(self, + def reset_by_global_settings( + self, reload_models=True, ): - mgr_settings = global_settings['model_mgr'] + mgr_settings = global_settings["model_mgr"] - if os.path.isfile(mgr_settings['charge_model_file']): - if mgr_settings['charge_model_type'] == 'modseq': + if os.path.isfile(mgr_settings["charge_model_file"]): + if mgr_settings["charge_model_type"] == "modseq": self.charge_model = ChargeModelForModAASeq() else: self.charge_model = ChargeModelForAASeq() - self.charge_model.load(mgr_settings['charge_model_file']) - self.charge_model.predict_batch_size = mgr_settings['predict']['batch_size_charge'] - self.charge_prob_cutoff = mgr_settings['charge_prob_cutoff'] - self.use_predicted_charge_in_speclib = mgr_settings['use_predicted_charge_in_speclib'] + self.charge_model.load(mgr_settings["charge_model_file"]) + self.charge_model.predict_batch_size = mgr_settings["predict"][ + "batch_size_charge" + ] + self.charge_prob_cutoff = mgr_settings["charge_prob_cutoff"] + self.use_predicted_charge_in_speclib = mgr_settings[ + "use_predicted_charge_in_speclib" + ] if reload_models: - self.load_installed_models(mgr_settings['model_type']) + self.load_installed_models(mgr_settings["model_type"]) self.load_external_models( - ms2_model_file = mgr_settings['external_ms2_model'], - rt_model_file = mgr_settings['external_rt_model'], - ccs_model_file = mgr_settings['external_ccs_model'], + ms2_model_file=mgr_settings["external_ms2_model"], + rt_model_file=mgr_settings["external_rt_model"], + ccs_model_file=mgr_settings["external_ccs_model"], ) - self.ms2_model.model._mask_modloss = global_settings['model_mgr']['mask_modloss'] + self.ms2_model.model._mask_modloss = global_settings["model_mgr"][ + "mask_modloss" + ] - device = global_settings['torch_device']['device_type'] + device = global_settings["torch_device"]["device_type"] self.ms2_model.set_device(device) self.rt_model.set_device(device) self.ccs_model.set_device(device) - self.use_grid_nce_search = mgr_settings[ - 'transfer' - ]['grid_nce_search'] - - self.psm_num_to_train_ms2 = mgr_settings[ - "transfer" - ]["psm_num_to_train_ms2"] - self.psm_num_to_test_ms2 = mgr_settings[ - 'transfer' - ]["psm_num_to_test_ms2"] - self.epoch_to_train_ms2 = mgr_settings[ - 'transfer' - ]['epoch_ms2'] - self.warmup_epoch_to_train_ms2 = mgr_settings[ - 'transfer' - ]['warmup_epoch_ms2'] - self.batch_size_to_train_ms2 = mgr_settings[ - 'transfer' - ]['batch_size_ms2'] - self.lr_to_train_ms2 = float( - mgr_settings[ - 'transfer' - ]['lr_ms2'] - ) - - self.psm_num_to_train_rt_ccs = mgr_settings[ - "transfer" - ]["psm_num_to_train_rt_ccs"] - self.psm_num_to_test_rt_ccs = mgr_settings[ - 'transfer' - ]["psm_num_to_test_rt_ccs"] - self.epoch_to_train_rt_ccs = mgr_settings[ - 'transfer' - ]['epoch_rt_ccs'] - self.warmup_epoch_to_train_rt_ccs = mgr_settings[ - 'transfer' - ]['warmup_epoch_rt_ccs'] - self.batch_size_to_train_rt_ccs = mgr_settings[ - 'transfer' - ]['batch_size_rt_ccs'] - self.lr_to_train_rt_ccs = float( - mgr_settings[ - 'transfer' - ]['lr_rt_ccs'] - ) - - self.psm_num_per_mod_to_train_ms2 = mgr_settings[ - 'transfer' - ]["psm_num_per_mod_to_train_ms2"] - - self.psm_num_per_mod_to_train_rt_ccs = mgr_settings[ - 'transfer' - ]["psm_num_per_mod_to_train_rt_ccs"] - self.top_n_mods_to_train = mgr_settings[ - 'transfer' - ]["top_n_mods_to_train"] - - self.nce = mgr_settings['default_nce'] + self.use_grid_nce_search = mgr_settings["transfer"]["grid_nce_search"] + + self.psm_num_to_train_ms2 = mgr_settings["transfer"]["psm_num_to_train_ms2"] + self.psm_num_to_test_ms2 = mgr_settings["transfer"]["psm_num_to_test_ms2"] + self.epoch_to_train_ms2 = mgr_settings["transfer"]["epoch_ms2"] + self.warmup_epoch_to_train_ms2 = mgr_settings["transfer"]["warmup_epoch_ms2"] + self.batch_size_to_train_ms2 = mgr_settings["transfer"]["batch_size_ms2"] + self.lr_to_train_ms2 = float(mgr_settings["transfer"]["lr_ms2"]) + + self.psm_num_to_train_rt_ccs = mgr_settings["transfer"][ + "psm_num_to_train_rt_ccs" + ] + self.psm_num_to_test_rt_ccs = mgr_settings["transfer"]["psm_num_to_test_rt_ccs"] + self.epoch_to_train_rt_ccs = mgr_settings["transfer"]["epoch_rt_ccs"] + self.warmup_epoch_to_train_rt_ccs = mgr_settings["transfer"][ + "warmup_epoch_rt_ccs" + ] + self.batch_size_to_train_rt_ccs = mgr_settings["transfer"]["batch_size_rt_ccs"] + self.lr_to_train_rt_ccs = float(mgr_settings["transfer"]["lr_rt_ccs"]) + + self.psm_num_per_mod_to_train_ms2 = mgr_settings["transfer"][ + "psm_num_per_mod_to_train_ms2" + ] + + self.psm_num_per_mod_to_train_rt_ccs = mgr_settings["transfer"][ + "psm_num_per_mod_to_train_rt_ccs" + ] + self.top_n_mods_to_train = mgr_settings["transfer"]["top_n_mods_to_train"] + + self.nce = mgr_settings["default_nce"] if self.nce == "from_ms_file": self.use_grid_nce_search = False - self.instrument = mgr_settings['default_instrument'] - self.verbose = mgr_settings['predict']['verbose'] - self.train_verbose = mgr_settings['transfer']['verbose'] - + self.instrument = mgr_settings["default_instrument"] + self.verbose = mgr_settings["predict"]["verbose"] + self.train_verbose = mgr_settings["transfer"]["verbose"] @property def instrument(self): return self._instrument + @instrument.setter - def instrument(self, instrument_name:str): + def instrument(self, instrument_name: str): instrument_name = instrument_name.upper() - if instrument_name in model_mgr_settings[ - 'instrument_group' - ]: - self._instrument = model_mgr_settings[ - 'instrument_group' - ][instrument_name] + if instrument_name in model_mgr_settings["instrument_group"]: + self._instrument = model_mgr_settings["instrument_group"][instrument_name] else: - self._instrument = 'Lumos' + self._instrument = "Lumos" def set_default_nce_instrument(self, df): """ Append 'nce' and 'instrument' columns into df with self.nce and self.instrument """ - if 'nce' not in df.columns and 'instrument' not in df.columns: - df['nce'] = float(self.nce) - df['instrument'] = self.instrument - elif 'nce' not in df.columns: - df['nce'] = float(self.nce) - elif 'instrument' not in df.columns: - df['instrument'] = self.instrument + if "nce" not in df.columns and "instrument" not in df.columns: + df["nce"] = float(self.nce) + df["instrument"] = self.instrument + elif "nce" not in df.columns: + df["nce"] = float(self.nce) + elif "instrument" not in df.columns: + df["instrument"] = self.instrument def set_default_nce(self, df): """Alias for `set_default_nce_instrument`""" self.set_default_nce_instrument(df) - def save_models(self, folder:str): + def save_models(self, folder: str): """Save MS2/RT/CCS models into a folder Parameters @@ -441,19 +406,17 @@ def save_models(self, folder:str): folder to save """ if os.path.isdir(folder): - self.ms2_model.save(os.path.join(folder, 'ms2.pth')) - self.rt_model.save(os.path.join(folder, 'rt.pth')) - self.ccs_model.save(os.path.join(folder, 'ccs.pth')) + self.ms2_model.save(os.path.join(folder, "ms2.pth")) + self.rt_model.save(os.path.join(folder, "rt.pth")) + self.ccs_model.save(os.path.join(folder, "ccs.pth")) if self.charge_model is not None: - self.charge_model.save(os.path.join(folder, 'charge.pth')) + self.charge_model.save(os.path.join(folder, "charge.pth")) elif not os.path.exists(folder): os.makedirs(folder) self.save_models(folder) - def load_installed_models(self, - model_type:str='generic' - ): - """ Load built-in MS2/CCS/RT models. + def load_installed_models(self, model_type: str = "generic"): + """Load built-in MS2/CCS/RT models. Parameters ---------- @@ -462,50 +425,25 @@ def load_installed_models(self, It could be 'digly', 'phospho', 'HLA', or 'generic'. Defaults to 'generic'. """ - if model_type.lower() in [ - 'phospho','phos','phosphorylation' - ]: - self.ms2_model.load( - model_zip, - model_path_in_zip='generic/ms2.pth' - ) - self.rt_model.load( - model_zip, - model_path_in_zip='phospho/rt_phos.pth' - ) - self.ccs_model.load( - model_zip, - model_path_in_zip='generic/ccs.pth' - ) + if model_type.lower() in ["phospho", "phos", "phosphorylation"]: + self.ms2_model.load(model_zip, model_path_in_zip="generic/ms2.pth") + self.rt_model.load(model_zip, model_path_in_zip="phospho/rt_phos.pth") + self.ccs_model.load(model_zip, model_path_in_zip="generic/ccs.pth") elif model_type.lower() in [ - 'digly','glygly','ubiquitylation', - 'ubiquitination','ubiquitinylation' - ]: - self.ms2_model.load( - model_zip, - model_path_in_zip='generic/ms2.pth' - ) - self.rt_model.load( - model_zip, - model_path_in_zip='digly/rt_digly.pth' - ) - self.ccs_model.load( - model_zip, - model_path_in_zip='generic/ccs.pth' - ) - elif model_type.lower() in ['regular','common','generic']: - self.ms2_model.load( - model_zip, model_path_in_zip='generic/ms2.pth' - ) - self.rt_model.load( - model_zip, model_path_in_zip='generic/rt.pth' - ) - self.ccs_model.load( - model_zip, model_path_in_zip='generic/ccs.pth' - ) - elif model_type.lower() in [ - 'hla','unspecific','non-specific', 'nonspecific' + "digly", + "glygly", + "ubiquitylation", + "ubiquitination", + "ubiquitinylation", ]: + self.ms2_model.load(model_zip, model_path_in_zip="generic/ms2.pth") + self.rt_model.load(model_zip, model_path_in_zip="digly/rt_digly.pth") + self.ccs_model.load(model_zip, model_path_in_zip="generic/ccs.pth") + elif model_type.lower() in ["regular", "common", "generic"]: + self.ms2_model.load(model_zip, model_path_in_zip="generic/ms2.pth") + self.rt_model.load(model_zip, model_path_in_zip="generic/rt.pth") + self.ccs_model.load(model_zip, model_path_in_zip="generic/ccs.pth") + elif model_type.lower() in ["hla", "unspecific", "non-specific", "nonspecific"]: self.load_installed_models(model_type="generic") else: logging.warning( @@ -513,11 +451,12 @@ def load_installed_models(self, ) self.load_installed_models(model_type="generic") - def load_external_models(self, + def load_external_models( + self, *, - ms2_model_file: Union[str, io.BytesIO]='', - rt_model_file: Union[str, io.BytesIO]='', - ccs_model_file: Union[str, io.BytesIO]='', + ms2_model_file: Union[str, io.BytesIO] = "", + rt_model_file: Union[str, io.BytesIO] = "", + ccs_model_file: Union[str, io.BytesIO] = "", ): """Load external MS2/RT/CCS models. @@ -537,7 +476,8 @@ def load_external_models(self, """ def _load_file(model, model_file): - if model_file is None: return + if model_file is None: + return try: if isinstance(model_file, str): if os.path.isfile(model_file): @@ -547,7 +487,9 @@ def _load_file(model, model_file): else: model.load(model_file) except (UnpicklingError, TypeError, ValueError, KeyError) as e: - logging.info(f"Cannot load {model_file} as {model.__class__} model, peptdeep will use the pretrained model instead.") + logging.info( + f"Cannot load {model_file} as {model.__class__} model, peptdeep will use the pretrained model instead." + ) if isinstance(ms2_model_file, str) and ms2_model_file: logging.info(f"Using external ms2 model: '{ms2_model_file}'") @@ -567,8 +509,9 @@ def _load_file(model, model_file): logging.info(" -- This model file does not exist") _load_file(self.ccs_model, ccs_model_file) - def train_rt_model(self, - psm_df:pd.DataFrame, + def train_rt_model( + self, + psm_df: pd.DataFrame, ): """ Train/fine-tune the RT model. The fine-tuning will be skipped @@ -579,14 +522,17 @@ def train_rt_model(self, psm_df : pd.DataFrame Training psm_df which contains 'rt_norm' column. """ - psm_df = psm_df.groupby( - ['sequence','mods','mod_sites'] - )[['rt_norm']].median().reset_index(drop=False) + psm_df = ( + psm_df.groupby(["sequence", "mods", "mod_sites"])[["rt_norm"]] + .median() + .reset_index(drop=False) + ) if self.psm_num_to_train_rt_ccs > 0: if self.psm_num_to_train_rt_ccs < len(psm_df): tr_df = psm_sampling_with_important_mods( - psm_df, self.psm_num_to_train_rt_ccs, + psm_df, + self.psm_num_to_train_rt_ccs, self.top_n_mods_to_train, self.psm_num_per_mod_to_train_rt_ccs, ).copy() @@ -594,23 +540,24 @@ def train_rt_model(self, tr_df = psm_df if self._train_psm_logging: - logging.info(f"{len(tr_df)} PSMs for RT model training/transfer learning") + logging.info( + f"{len(tr_df)} PSMs for RT model training/transfer learning" + ) else: tr_df = [] if self.psm_num_to_test_rt_ccs > 0: if len(tr_df) > 0: - test_psm_df = psm_df[ - ~psm_df.sequence.isin(set(tr_df.sequence)) - ].copy() + test_psm_df = psm_df[~psm_df.sequence.isin(set(tr_df.sequence))].copy() if len(test_psm_df) > self.psm_num_to_test_rt_ccs: test_psm_df = test_psm_df.sample( n=self.psm_num_to_test_rt_ccs ).copy() elif len(test_psm_df) == 0: - logging.info("No enough PSMs for testing RT models, " - "please reduce the `psm_num_to_train_rt_ccs` " - "value according to overall peptide numbers. " + logging.info( + "No enough PSMs for testing RT models, " + "please reduce the `psm_num_to_train_rt_ccs` " + "value according to overall peptide numbers. " ) test_psm_df = [] else: @@ -620,12 +567,12 @@ def train_rt_model(self, if len(test_psm_df) > 0: logging.info( - "Testing pretrained RT model:\n" + - str(self.rt_model.test(test_psm_df)) + "Testing pretrained RT model:\n" + str(self.rt_model.test(test_psm_df)) ) if len(tr_df) > 0: - self.rt_model.train(tr_df, + self.rt_model.train( + tr_df, batch_size=self.batch_size_to_train_rt_ccs, epoch=self.epoch_to_train_rt_ccs, warmup_epoch=self.warmup_epoch_to_train_rt_ccs, @@ -635,12 +582,12 @@ def train_rt_model(self, if len(test_psm_df) > 0: logging.info( - "Testing refined RT model:\n" + - str(self.rt_model.test(test_psm_df)) + "Testing refined RT model:\n" + str(self.rt_model.test(test_psm_df)) ) - def train_ccs_model(self, - psm_df:pd.DataFrame, + def train_ccs_model( + self, + psm_df: pd.DataFrame, ): """ Train/fine-tune the CCS model. The fine-tuning will be skipped @@ -652,48 +599,50 @@ def train_ccs_model(self, Training psm_df which contains 'ccs' or 'mobility' column. """ - if 'mobility' not in psm_df.columns or 'ccs' not in psm_df.columns: + if "mobility" not in psm_df.columns or "ccs" not in psm_df.columns: return - elif 'ccs' not in psm_df.columns: - psm_df['ccs'] = mobility_to_ccs_for_df( - psm_df, 'mobility' - ) - elif 'mobility' not in psm_df.columns: - psm_df['mobility'] = ccs_to_mobility_for_df( - psm_df, 'ccs' - ) - - psm_df = psm_df.groupby( - ['sequence','mods','mod_sites','charge'] - )[['mobility','ccs']].median().reset_index(drop=False) + elif "ccs" not in psm_df.columns: + psm_df["ccs"] = mobility_to_ccs_for_df(psm_df, "mobility") + elif "mobility" not in psm_df.columns: + psm_df["mobility"] = ccs_to_mobility_for_df(psm_df, "ccs") + + psm_df = ( + psm_df.groupby(["sequence", "mods", "mod_sites", "charge"])[ + ["mobility", "ccs"] + ] + .median() + .reset_index(drop=False) + ) if self.psm_num_to_train_rt_ccs > 0: if self.psm_num_to_train_rt_ccs < len(psm_df): tr_df = psm_sampling_with_important_mods( - psm_df, self.psm_num_to_train_rt_ccs, + psm_df, + self.psm_num_to_train_rt_ccs, self.top_n_mods_to_train, self.psm_num_per_mod_to_train_rt_ccs, ).copy() else: tr_df = psm_df if self._train_psm_logging: - logging.info(f"{len(tr_df)} PSMs for CCS model training/transfer learning") + logging.info( + f"{len(tr_df)} PSMs for CCS model training/transfer learning" + ) else: tr_df = [] if self.psm_num_to_test_rt_ccs > 0: if len(tr_df) > 0: - test_psm_df = psm_df[ - ~psm_df.sequence.isin(set(tr_df.sequence)) - ].copy() + test_psm_df = psm_df[~psm_df.sequence.isin(set(tr_df.sequence))].copy() if len(test_psm_df) > self.psm_num_to_test_rt_ccs: test_psm_df = test_psm_df.sample( n=self.psm_num_to_test_rt_ccs ).copy() elif len(test_psm_df) == 0: - logging.info("No enough PSMs for testing CCS models, " - "please reduce the `psm_num_to_train_rt_ccs` " - "value according to overall precursor numbers. " + logging.info( + "No enough PSMs for testing CCS models, " + "please reduce the `psm_num_to_train_rt_ccs` " + "value according to overall precursor numbers. " ) test_psm_df = [] else: @@ -703,12 +652,13 @@ def train_ccs_model(self, if len(test_psm_df) > 0: logging.info( - "Testing pretrained CCS model:\n" + - str(self.ccs_model.test(test_psm_df)) + "Testing pretrained CCS model:\n" + + str(self.ccs_model.test(test_psm_df)) ) if len(tr_df) > 0: - self.ccs_model.train(tr_df, + self.ccs_model.train( + tr_df, batch_size=self.batch_size_to_train_rt_ccs, epoch=self.epoch_to_train_rt_ccs, warmup_epoch=self.warmup_epoch_to_train_rt_ccs, @@ -718,11 +668,11 @@ def train_ccs_model(self, if len(test_psm_df) > 0: logging.info( - "Testing refined CCS model:\n" + - str(self.ccs_model.test(test_psm_df)) + "Testing refined CCS model:\n" + str(self.ccs_model.test(test_psm_df)) ) - def train_ms2_model(self, + def train_ms2_model( + self, psm_df: pd.DataFrame, matched_intensity_df: pd.DataFrame, ): @@ -744,9 +694,10 @@ def train_ms2_model(self, if self.psm_num_to_train_ms2 > 0: if self.psm_num_to_train_ms2 < len(psm_df): tr_df = psm_sampling_with_important_mods( - psm_df, self.psm_num_to_train_ms2, + psm_df, + self.psm_num_to_train_ms2, self.top_n_mods_to_train, - self.psm_num_per_mod_to_train_ms2 + self.psm_num_per_mod_to_train_ms2, ).copy() else: tr_df = psm_df @@ -757,28 +708,21 @@ def train_ms2_model(self, tr_inten_df[frag_type] = matched_intensity_df[frag_type] else: tr_inten_df[frag_type] = 0.0 - normalize_fragment_intensities( - tr_df, tr_inten_df - ) + normalize_fragment_intensities(tr_df, tr_inten_df) if self.use_grid_nce_search: self.nce, self.instrument = self.ms2_model.grid_nce_search( - tr_df, tr_inten_df, - nce_first=model_mgr_settings['transfer'][ - 'grid_nce_first' - ], - nce_last=model_mgr_settings['transfer'][ - 'grid_nce_last' - ], - nce_step=model_mgr_settings['transfer'][ - 'grid_nce_step' - ], - search_instruments=model_mgr_settings['transfer'][ - 'grid_instrument' + tr_df, + tr_inten_df, + nce_first=model_mgr_settings["transfer"]["grid_nce_first"], + nce_last=model_mgr_settings["transfer"]["grid_nce_last"], + nce_step=model_mgr_settings["transfer"]["grid_nce_step"], + search_instruments=model_mgr_settings["transfer"][ + "grid_instrument" ], ) - tr_df['nce'] = self.nce - tr_df['instrument'] = self.instrument + tr_df["nce"] = self.nce + tr_df["instrument"] = self.instrument else: self.set_default_nce_instrument(tr_df) else: @@ -786,15 +730,14 @@ def train_ms2_model(self, if self.psm_num_to_test_ms2 > 0: if len(tr_df) > 0: - test_psm_df = psm_df[ - ~psm_df.sequence.isin(set(tr_df.sequence)) - ].copy() + test_psm_df = psm_df[~psm_df.sequence.isin(set(tr_df.sequence))].copy() if len(test_psm_df) > self.psm_num_to_test_ms2: test_psm_df = test_psm_df.sample(n=self.psm_num_to_test_ms2) elif len(test_psm_df) == 0: - logging.info("No enough PSMs for testing MS2 models, " - "please reduce the `psm_num_to_train_ms2` " - "value according to overall PSM numbers. " + logging.info( + "No enough PSMs for testing MS2 models, " + "please reduce the `psm_num_to_train_ms2` " + "value according to overall PSM numbers. " ) test_psm_df = [] else: @@ -811,13 +754,16 @@ def train_ms2_model(self, if len(test_psm_df) > 0: logging.info( - "Testing pretrained MS2 model on testing df:\n"+ - str(self.ms2_model.test(test_psm_df, tr_inten_df)) + "Testing pretrained MS2 model on testing df:\n" + + str(self.ms2_model.test(test_psm_df, tr_inten_df)) ) if len(tr_df) > 0: if self._train_psm_logging: - logging.info(f"{len(tr_df)} PSMs for MS2 model training/transfer learning") - self.ms2_model.train(tr_df, + logging.info( + f"{len(tr_df)} PSMs for MS2 model training/transfer learning" + ) + self.ms2_model.train( + tr_df, fragment_intensity_df=tr_inten_df, batch_size=self.batch_size_to_train_ms2, epoch=self.epoch_to_train_ms2, @@ -826,21 +772,22 @@ def train_ms2_model(self, verbose=self.train_verbose, ) logging.info( - "Testing refined MS2 model on training df:\n"+ - str(self.ms2_model.test(tr_df, tr_inten_df)) + "Testing refined MS2 model on training df:\n" + + str(self.ms2_model.test(tr_df, tr_inten_df)) ) if len(test_psm_df) > 0: logging.info( - "Testing refined MS2 model on testing df:\n"+ - str(self.ms2_model.test(test_psm_df, tr_inten_df)) + "Testing refined MS2 model on testing df:\n" + + str(self.ms2_model.test(test_psm_df, tr_inten_df)) ) - - def predict_ms2(self, precursor_df:pd.DataFrame, + def predict_ms2( + self, + precursor_df: pd.DataFrame, *, - batch_size:int=512, - reference_frag_df:pd.DataFrame = None, - )->pd.DataFrame: + batch_size: int = 512, + reference_frag_df: pd.DataFrame = None, + ) -> pd.DataFrame: """Predict MS2 for the given precursor_df Parameters @@ -866,18 +813,18 @@ def predict_ms2(self, precursor_df:pd.DataFrame, """ self.set_default_nce_instrument(precursor_df) if self.verbose: - logging.info('Predicting MS2 ...') - return self.ms2_model.predict(precursor_df, + logging.info("Predicting MS2 ...") + return self.ms2_model.predict( + precursor_df, batch_size=batch_size, reference_frag_df=reference_frag_df, - verbose=self.verbose + verbose=self.verbose, ) - def predict_rt(self, precursor_df:pd.DataFrame, - *, - batch_size:int=1024 - )->pd.DataFrame: - """ Predict RT ('rt_pred') inplace into `precursor_df`. + def predict_rt( + self, precursor_df: pd.DataFrame, *, batch_size: int = 1024 + ) -> pd.DataFrame: + """Predict RT ('rt_pred') inplace into `precursor_df`. Parameters ---------- @@ -895,17 +842,16 @@ def predict_rt(self, precursor_df:pd.DataFrame, """ if self.verbose: logging.info("Predicting RT ...") - df = self.rt_model.predict(precursor_df, - batch_size=batch_size, verbose=self.verbose + df = self.rt_model.predict( + precursor_df, batch_size=batch_size, verbose=self.verbose ) - df['rt_norm_pred'] = df.rt_pred + df["rt_norm_pred"] = df.rt_pred return df - def predict_mobility(self, precursor_df:pd.DataFrame, - *, - batch_size:int=1024 - )->pd.DataFrame: - """ Predict mobility (`ccs_pred` and `mobility_pred`) inplace into `precursor_df`. + def predict_mobility( + self, precursor_df: pd.DataFrame, *, batch_size: int = 1024 + ) -> pd.DataFrame: + """Predict mobility (`ccs_pred` and `mobility_pred`) inplace into `precursor_df`. Parameters ---------- @@ -923,34 +869,30 @@ def predict_mobility(self, precursor_df:pd.DataFrame, """ if self.verbose: logging.info("Predicting mobility ...") - precursor_df = self.ccs_model.predict(precursor_df, - batch_size=batch_size, verbose=self.verbose - ) - return self.ccs_model.ccs_to_mobility_pred( - precursor_df + precursor_df = self.ccs_model.predict( + precursor_df, batch_size=batch_size, verbose=self.verbose ) + return self.ccs_model.ccs_to_mobility_pred(precursor_df) - def _predict_func_for_mp(self, arg_dict:dict): + def _predict_func_for_mp(self, arg_dict: dict): """Internal function, for multiprocessing""" update_global_settings(arg_dict.pop("mp_global_settings")) - return self.predict_all( - multiprocessing=False, **arg_dict - ) + return self.predict_all(multiprocessing=False, **arg_dict) - def predict_all_mp(self, precursor_df:pd.DataFrame, + def predict_all_mp( + self, + precursor_df: pd.DataFrame, *, - predict_items:list = [ - 'rt' ,'mobility' ,'ms2' - ], - frag_types:list = None, - process_num:int = 8, - mp_batch_size:int = 100000, + predict_items: list = ["rt", "mobility", "ms2"], + frag_types: list = None, + process_num: int = 8, + mp_batch_size: int = 100000, ): self.ms2_model.model.share_memory() self.rt_model.model.share_memory() self.ccs_model.model.share_memory() - df_groupby = precursor_df.groupby('nAA') + df_groupby = precursor_df.groupby("nAA") mgr = mp.Manager() mp_global_settings = mgr.dict() @@ -967,75 +909,68 @@ def mp_param_generator(df_groupby): for nAA, df in df_groupby: for i in range(0, len(df), mp_batch_size): yield { - 'precursor_df': df.iloc[i:i+mp_batch_size,:], - 'predict_items': predict_items, - 'frag_types': frag_types, - 'mp_global_settings': mp_global_settings + "precursor_df": df.iloc[i : i + mp_batch_size, :], + "predict_items": predict_items, + "frag_types": frag_types, + "mp_global_settings": mp_global_settings, } precursor_df_list = [] - if 'ms2' in predict_items: + if "ms2" in predict_items: fragment_mz_df_list = [] fragment_intensity_df_list = [] else: fragment_mz_df_list = None if self.verbose: - logging.info( - f'Predicting {",".join(predict_items)} ...' - ) + logging.info(f'Predicting {",".join(predict_items)} ...') verbose_bak = self.verbose self.verbose = False - with mp.get_context('spawn').Pool(process_num) as p: + with mp.get_context("spawn").Pool(process_num) as p: for ret_dict in process_bar( p.imap_unordered( - self._predict_func_for_mp, - mp_param_generator(df_groupby) + self._predict_func_for_mp, mp_param_generator(df_groupby) ), - get_batch_num_mp(df_groupby) + get_batch_num_mp(df_groupby), ): - precursor_df_list.append(ret_dict['precursor_df']) + precursor_df_list.append(ret_dict["precursor_df"]) if fragment_mz_df_list is not None: - fragment_mz_df_list.append( - ret_dict['fragment_mz_df'] - ) - fragment_intensity_df_list.append( - ret_dict['fragment_intensity_df'] - ) + fragment_mz_df_list.append(ret_dict["fragment_mz_df"]) + fragment_intensity_df_list.append(ret_dict["fragment_intensity_df"]) self.verbose = verbose_bak if fragment_mz_df_list is not None: - ( - precursor_df, fragment_mz_df, fragment_intensity_df - ) = concat_precursor_fragment_dataframes( - precursor_df_list, - fragment_mz_df_list, - fragment_intensity_df_list, + (precursor_df, fragment_mz_df, fragment_intensity_df) = ( + concat_precursor_fragment_dataframes( + precursor_df_list, + fragment_mz_df_list, + fragment_intensity_df_list, + ) ) return { - 'precursor_df': precursor_df, - 'fragment_mz_df': fragment_mz_df, - 'fragment_intensity_df': fragment_intensity_df, + "precursor_df": precursor_df, + "fragment_mz_df": fragment_mz_df, + "fragment_intensity_df": fragment_intensity_df, } else: precursor_df = pd.concat(precursor_df_list) precursor_df.reset_index(drop=True, inplace=True) - return {'precursor_df': precursor_df} + return {"precursor_df": precursor_df} - def predict_all(self, precursor_df:pd.DataFrame, + def predict_all( + self, + precursor_df: pd.DataFrame, *, - predict_items:list = [ - 'rt' ,'mobility' ,'ms2' - ], - frag_types:list = None, - multiprocessing:bool = True, - min_required_precursor_num_for_mp:int = 3000, - process_num:int = 8, - mp_batch_size:int = 100000, - )->Dict[str, pd.DataFrame]: + predict_items: list = ["rt", "mobility", "ms2"], + frag_types: list = None, + multiprocessing: bool = True, + min_required_precursor_num_for_mp: int = 3000, + process_num: int = 8, + mp_batch_size: int = 100000, + ) -> Dict[str, pd.DataFrame]: """ Predict all items defined by `predict_items`, which may include rt, mobility, fragment_mz @@ -1084,8 +1019,9 @@ def predict_all(self, precursor_df:pd.DataFrame, } ``` """ + def refine_df(df): - if 'ms2' in predict_items: + if "ms2" in predict_items: refine_precursor_df(df) else: refine_precursor_df(df, drop_frag_idx=False) @@ -1093,68 +1029,69 @@ def refine_df(df): if frag_types is None: if self.ms2_model.model._mask_modloss: frag_types = [ - frag for frag in self.ms2_model.charged_frag_types - if 'modloss' not in frag + frag + for frag in self.ms2_model.charged_frag_types + if "modloss" not in frag ] else: frag_types = self.ms2_model.charged_frag_types - if 'precursor_mz' not in precursor_df.columns: + if "precursor_mz" not in precursor_df.columns: update_precursor_mz(precursor_df) if ( - self.ms2_model.device_type!='cpu' - or not multiprocessing or process_num <= 1 + self.ms2_model.device_type != "cpu" + or not multiprocessing + or process_num <= 1 or len(precursor_df) < min_required_precursor_num_for_mp ): refine_df(precursor_df) - if 'rt' in predict_items: - self.predict_rt(precursor_df, - batch_size=model_mgr_settings['predict']['batch_size_rt_ccs'] + if "rt" in predict_items: + self.predict_rt( + precursor_df, + batch_size=model_mgr_settings["predict"]["batch_size_rt_ccs"], ) - if 'mobility' in predict_items: - self.predict_mobility(precursor_df, - batch_size=model_mgr_settings['predict']['batch_size_rt_ccs'] + if "mobility" in predict_items: + self.predict_mobility( + precursor_df, + batch_size=model_mgr_settings["predict"]["batch_size_rt_ccs"], ) - if 'ms2' in predict_items: - if 'frag_start_idx' in precursor_df.columns: + if "ms2" in predict_items: + if "frag_start_idx" in precursor_df.columns: precursor_df.drop( - columns=['frag_start_idx','frag_stop_idx'], - inplace=True + columns=["frag_start_idx", "frag_stop_idx"], inplace=True ) - fragment_mz_df = create_fragment_mz_dataframe( - precursor_df, frag_types - ) + fragment_mz_df = create_fragment_mz_dataframe(precursor_df, frag_types) fragment_intensity_df = self.predict_ms2( precursor_df, - batch_size=model_mgr_settings['predict']['batch_size_ms2'] + batch_size=model_mgr_settings["predict"]["batch_size_ms2"], ) fragment_intensity_df.drop( columns=[ - col for col in fragment_intensity_df.columns + col + for col in fragment_intensity_df.columns if col not in frag_types - ], inplace=True + ], + inplace=True, ) - clear_error_modloss_intensities( - fragment_mz_df, fragment_intensity_df - ) + clear_error_modloss_intensities(fragment_mz_df, fragment_intensity_df) return { - 'precursor_df': precursor_df, - 'fragment_mz_df': fragment_mz_df, - 'fragment_intensity_df': fragment_intensity_df, + "precursor_df": precursor_df, + "fragment_mz_df": fragment_mz_df, + "fragment_intensity_df": fragment_intensity_df, } else: - return {'precursor_df': precursor_df} + return {"precursor_df": precursor_df} else: logging.info(f"Using multiprocessing with {process_num} processes ...") return self.predict_all_mp( precursor_df, predict_items=predict_items, - process_num = process_num, + process_num=process_num, mp_batch_size=mp_batch_size, ) diff --git a/peptdeep/protein/fasta.py b/peptdeep/protein/fasta.py index dcbfe898..160a2c72 100644 --- a/peptdeep/protein/fasta.py +++ b/peptdeep/protein/fasta.py @@ -7,33 +7,35 @@ class PredictSpecLibFasta(SpecLibFasta, PredictSpecLib): """ Predicted spec lib from fasta files or other peptide files. """ - def __init__(self, - model_manager:ModelManager = None, + + def __init__( + self, + model_manager: ModelManager = None, *, - charged_frag_types:list = ['b_z1','b_z2','y_z1','y_z2'], - protease:str = 'trypsin', - max_missed_cleavages:int = 2, - peptide_length_min:int = 7, - peptide_length_max:int = 35, - precursor_charge_min:int = 2, - precursor_charge_max:int = 4, - precursor_mz_min:float = 400.0, - precursor_mz_max:float = 1800.0, - var_mods:list = ['Acetyl@Protein N-term','Oxidation@M'], - min_var_mod_num:int = 0, - max_var_mod_num:int = 2, - fix_mods:list = ['Carbamidomethyl@C'], - labeling_channels:dict = None, - special_mods:list = [], - min_special_mod_num:int = 0, - max_special_mod_num:int = 1, - special_mods_cannot_modify_pep_n_term:bool=False, - special_mods_cannot_modify_pep_c_term:bool=False, - decoy: str = None, # or pseudo_reverse or diann - include_contaminants: bool=False, + charged_frag_types: list = ["b_z1", "b_z2", "y_z1", "y_z2"], + protease: str = "trypsin", + max_missed_cleavages: int = 2, + peptide_length_min: int = 7, + peptide_length_max: int = 35, + precursor_charge_min: int = 2, + precursor_charge_max: int = 4, + precursor_mz_min: float = 400.0, + precursor_mz_max: float = 1800.0, + var_mods: list = ["Acetyl@Protein N-term", "Oxidation@M"], + min_var_mod_num: int = 0, + max_var_mod_num: int = 2, + fix_mods: list = ["Carbamidomethyl@C"], + labeling_channels: dict = None, + special_mods: list = [], + min_special_mod_num: int = 0, + max_special_mod_num: int = 1, + special_mods_cannot_modify_pep_n_term: bool = False, + special_mods_cannot_modify_pep_c_term: bool = False, + decoy: str = None, # or pseudo_reverse or diann + include_contaminants: bool = False, I_to_L=False, - generate_precursor_isotope:bool = False, - rt_to_irt:bool = False, + generate_precursor_isotope: bool = False, + rt_to_irt: bool = False, ): """ Parameters @@ -130,7 +132,8 @@ def __init__(self, rt_to_irt : bool, optional If convert predicted RT to iRT values, by default False """ - SpecLibFasta.__init__(self, + SpecLibFasta.__init__( + self, charged_frag_types=charged_frag_types, protease=protease, max_missed_cleavages=max_missed_cleavages, @@ -155,7 +158,8 @@ def __init__(self, I_to_L=I_to_L, ) - PredictSpecLib.__init__(self, + PredictSpecLib.__init__( + self, model_manager=model_manager, charged_frag_types=self.charged_frag_types, precursor_mz_min=self.min_precursor_mz, @@ -170,7 +174,9 @@ def __init__(self, self.model_manager = ModelManager() self.model_manager.reset_by_global_settings() else: - print("Oops, `PredictSpecLibFasta.model_manager` is None, while it should not happen") + print( + "Oops, `PredictSpecLibFasta.model_manager` is None, while it should not happen" + ) self.model_manager = model_manager def add_charge(self): @@ -179,15 +185,19 @@ def add_charge(self): else: print(f"Predicting charge states for {len(self.precursor_df)} peptides ...") if self.model_manager.use_predicted_charge_in_speclib: - self._precursor_df = self.model_manager.charge_model.predict_and_clip_charges( - self.precursor_df, - min_precursor_charge=self.min_precursor_charge, - max_precursor_charge=self.max_precursor_charge, - charge_prob_cutoff=self.model_manager.charge_prob_cutoff + self._precursor_df = ( + self.model_manager.charge_model.predict_and_clip_charges( + self.precursor_df, + min_precursor_charge=self.min_precursor_charge, + max_precursor_charge=self.max_precursor_charge, + charge_prob_cutoff=self.model_manager.charge_prob_cutoff, + ) ) else: - self._precursor_df = self.model_manager.charge_model.predict_charges_as_prob( - self.precursor_df, - min_precursor_charge=self.min_precursor_charge, - max_precursor_charge=self.max_precursor_charge + self._precursor_df = ( + self.model_manager.charge_model.predict_charges_as_prob( + self.precursor_df, + min_precursor_charge=self.min_precursor_charge, + max_precursor_charge=self.max_precursor_charge, + ) ) diff --git a/peptdeep/psm_frag_reader/__init__.py b/peptdeep/psm_frag_reader/__init__.py index 8c7e9bcf..06b8de3b 100644 --- a/peptdeep/psm_frag_reader/__init__.py +++ b/peptdeep/psm_frag_reader/__init__.py @@ -1,4 +1,5 @@ from peptdeep.psm_frag_reader import ( maxquant_frag_reader, - psm_frag_reader, psmlabel_reader + psm_frag_reader, + psmlabel_reader, ) diff --git a/peptdeep/rescore/__init__.py b/peptdeep/rescore/__init__.py index cc0872a4..6dfdd360 100644 --- a/peptdeep/rescore/__init__.py +++ b/peptdeep/rescore/__init__.py @@ -1,3 +1 @@ -from peptdeep.rescore import ( - feature_extractor -) +from peptdeep.rescore import feature_extractor diff --git a/peptdeep/rescore/fdr.py b/peptdeep/rescore/fdr.py index d0e807a8..474bc880 100644 --- a/peptdeep/rescore/fdr.py +++ b/peptdeep/rescore/fdr.py @@ -2,10 +2,9 @@ import numpy as np import pandas as pd + @numba.njit -def fdr_to_q_values( - fdr_values:np.ndarray -)->np.ndarray: +def fdr_to_q_values(fdr_values: np.ndarray) -> np.ndarray: """convert FDR values to q_values. Parameters @@ -29,11 +28,10 @@ def fdr_to_q_values( q_values[i] = min_q_value return q_values + def calc_fdr( - df:pd.DataFrame, - score_column:str, - decoy_column:str='decoy' -)->pd.DataFrame: + df: pd.DataFrame, score_column: str, decoy_column: str = "decoy" +) -> pd.DataFrame: """Calculate FDR values (q_values in fact) for the given dataframe Parameters @@ -55,25 +53,25 @@ def calc_fdr( """ df = df.reset_index(drop=True).sort_values( - [score_column,decoy_column], ascending=False + [score_column, decoy_column], ascending=False ) - target_values = 1-df[decoy_column].values + target_values = 1 - df[decoy_column].values decoy_cumsum = np.cumsum(df[decoy_column].values) target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum/target_cumsum - df['fdr'] = fdr_to_q_values(fdr_values) + fdr_values = decoy_cumsum / target_cumsum + df["fdr"] = fdr_to_q_values(fdr_values) return df -#wrapper + +# wrapper calc_fdr_for_df = calc_fdr + @numba.njit def fdr_from_ref( - sorted_scores:np.ndarray, - ref_scores:np.ndarray, - ref_fdr_values:np.ndarray -)->np.ndarray: - """ Calculate FDR values from the given reference scores and fdr_values. + sorted_scores: np.ndarray, ref_scores: np.ndarray, ref_fdr_values: np.ndarray +) -> np.ndarray: + """Calculate FDR values from the given reference scores and fdr_values. It is used to extend peptide-level or sequence-level FDR (reference) to each PSM, as PSMs are more useful for quantification. @@ -97,7 +95,7 @@ def fdr_from_ref( """ q_values = np.zeros_like(sorted_scores) - i,j = 0,0 + i, j = 0, 0 while i < len(sorted_scores) and j < len(ref_scores): if sorted_scores[i] >= ref_scores[j]: q_values[i] = ref_fdr_values[j] @@ -109,14 +107,15 @@ def fdr_from_ref( i += 1 return q_values + def calc_fdr_from_ref( df: pd.DataFrame, - ref_scores:np.ndarray, - ref_fdr_values:np.ndarray, - score_column:str, - decoy_column:str='decoy' -)->pd.DataFrame: - """ Calculate FDR values for a PSM dataframe from the given reference + ref_scores: np.ndarray, + ref_fdr_values: np.ndarray, + score_column: str, + decoy_column: str = "decoy", +) -> pd.DataFrame: + """Calculate FDR values for a PSM dataframe from the given reference scores and fdr_values. It is used to extend peptide-level or sequence-level FDR (reference) to each PSM, as PSMs are more useful for quantification. @@ -148,14 +147,13 @@ def calc_fdr_from_ref( """ df = df.reset_index(drop=True).sort_values( - [score_column,decoy_column], ascending=False + [score_column, decoy_column], ascending=False ) sorted_idxes = np.argsort(ref_fdr_values) ref_scores = ref_scores[sorted_idxes] ref_q_values = ref_fdr_values[sorted_idxes] - df['fdr'] = fdr_from_ref( - df.score.values, ref_scores, ref_q_values - ) + df["fdr"] = fdr_from_ref(df.score.values, ref_scores, ref_q_values) return df + calc_fdr_from_ref_for_df = calc_fdr_from_ref diff --git a/peptdeep/rescore/feature_extractor.py b/peptdeep/rescore/feature_extractor.py index c0ae6630..5004b16f 100644 --- a/peptdeep/rescore/feature_extractor.py +++ b/peptdeep/rescore/feature_extractor.py @@ -6,12 +6,8 @@ import torch.multiprocessing as mp from alphabase.peptide.fragment import get_charged_frag_types -from alphabase.peptide.precursor import ( - refine_precursor_df -) -from alphabase.peptide.fragment import ( - concat_precursor_fragment_dataframes -) +from alphabase.peptide.precursor import refine_precursor_df +from alphabase.peptide.fragment import concat_precursor_fragment_dataframes from peptdeep.pretrained_models import ModelManager from peptdeep.model.ms2 import calc_ms2_similarity @@ -22,54 +18,47 @@ from peptdeep.settings import global_settings # perc_settings = global_settings['percolator'] + def match_one_raw( psm_df_one_raw, ms2_file, ms2_file_type, frag_types_to_match, - ms2_ppm, ms2_tol, + ms2_ppm, + ms2_tol, calibrate_frag_mass_error, ): - """ Internal function """ - match = PepSpecMatch( - charged_frag_types=frag_types_to_match - ) + """Internal function""" + match = PepSpecMatch(charged_frag_types=frag_types_to_match) - ( - psm_df, fragment_mz_df, - matched_intensity_df, matched_mz_err_df - ) = match.match_ms2_one_raw( - refine_precursor_df(psm_df_one_raw), - ms2_file=ms2_file, - ms2_file_type=ms2_file_type, - ppm=ms2_ppm, tol=ms2_tol, + (psm_df, fragment_mz_df, matched_intensity_df, matched_mz_err_df) = ( + match.match_ms2_one_raw( + refine_precursor_df(psm_df_one_raw), + ms2_file=ms2_file, + ms2_file_type=ms2_file_type, + ppm=ms2_ppm, + tol=ms2_tol, + ) ) if calibrate_frag_mass_error: - from peptdeep.mass_spec.mass_calibration import ( - MassCalibratorForRT_KNN - ) + from peptdeep.mass_spec.mass_calibration import MassCalibratorForRT_KNN + frag_mass_calibrator = MassCalibratorForRT_KNN() _df_fdr = psm_df.query("fdr<0.01") - frag_mass_calibrator.fit( - _df_fdr, matched_mz_err_df - ) - matched_mz_err_df = frag_mass_calibrator.calibrate( - psm_df, matched_mz_err_df - ) + frag_mass_calibrator.fit(_df_fdr, matched_mz_err_df) + matched_mz_err_df = frag_mass_calibrator.calibrate(psm_df, matched_mz_err_df) + + return (psm_df, fragment_mz_df, matched_intensity_df, matched_mz_err_df) - return ( - psm_df, fragment_mz_df, - matched_intensity_df, matched_mz_err_df - ) def get_psm_scores( - psm_df:pd.DataFrame, - predict_intensity_df:pd.DataFrame, - matched_intensity_df:pd.DataFrame, - matched_mass_err_df:pd.DataFrame, -)->pd.DataFrame: + psm_df: pd.DataFrame, + predict_intensity_df: pd.DataFrame, + matched_intensity_df: pd.DataFrame, + matched_mass_err_df: pd.DataFrame, +) -> pd.DataFrame: """ AlphaPeptDeep has a built-in score for PSMs, it works much better than other scores such as X!Tandem @@ -91,18 +80,20 @@ def get_psm_scores( `psm_df` with "*_score" columns appended inplace """ matched_norm_intensity_df = pd.DataFrame( - np.log(matched_intensity_df.values+1), - columns=matched_intensity_df.columns.values + np.log(matched_intensity_df.values + 1), + columns=matched_intensity_df.columns.values, ) - matched_merr_weight_df = matched_mass_err_df.mask(matched_mass_err_df>1000000, 0).abs() + matched_merr_weight_df = matched_mass_err_df.mask( + matched_mass_err_df > 1000000, 0 + ).abs() max_merr = matched_merr_weight_df.values.max() if max_merr > 0: matched_merr_weight_df /= max_merr - matched_merr_weight_df = 1-matched_merr_weight_df.pow(4) + matched_merr_weight_df = 1 - matched_merr_weight_df.pow(4) - peak_score_df = matched_norm_intensity_df*matched_merr_weight_df + peak_score_df = matched_norm_intensity_df * matched_merr_weight_df - pred_weighted_score_df = peak_score_df*predict_intensity_df + pred_weighted_score_df = peak_score_df * predict_intensity_df def _get_one_score( frag_start_end, @@ -110,29 +101,35 @@ def _get_one_score( pred_weighted_score_values, ): frag_start, frag_end = frag_start_end - frag_ratio = (peak_score_values[frag_start:frag_end]>0).mean()**0.5 + frag_ratio = (peak_score_values[frag_start:frag_end] > 0).mean() ** 0.5 return ( - peak_score_values[frag_start:frag_end].sum()*frag_ratio, - pred_weighted_score_values[frag_start:frag_end].sum()*frag_ratio + peak_score_values[frag_start:frag_end].sum() * frag_ratio, + pred_weighted_score_values[frag_start:frag_end].sum() * frag_ratio, ) + ( - psm_df['merr_weighted_score'], - psm_df['pred_weighted_score'], - ) = zip(*psm_df[['frag_start_idx','frag_stop_idx']].apply( - _get_one_score, axis=1, - peak_score_values = peak_score_df.values, - pred_weighted_score_values = pred_weighted_score_df.values, - )) + psm_df["merr_weighted_score"], + psm_df["pred_weighted_score"], + ) = zip( + *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( + _get_one_score, + axis=1, + peak_score_values=peak_score_df.values, + pred_weighted_score_values=pred_weighted_score_df.values, + ) + ) return psm_df + def get_ms2_features( - psm_df, frag_types, + psm_df, + frag_types, predict_intensity_df, matched_intensity_df, matched_mass_err_df, -)->pd.DataFrame: - """ Extract ms2 features from the given +) -> pd.DataFrame: + """Extract ms2 features from the given predict_intensity_df and matched_intensity_df. It will add columns into psm_df: - cos: cosine similarity between predicted and matched fragments @@ -153,27 +150,23 @@ def get_ms2_features( - and more ... """ used_frag_types = frag_types - predict_intensity_df = predict_intensity_df[ - used_frag_types - ] + predict_intensity_df = predict_intensity_df[used_frag_types] def _get_frag_features( frag_start_end, - matched_inten_values, predicted_inten_values, - has_matched_intens, has_predicted_intens, + matched_inten_values, + predicted_inten_values, + has_matched_intens, + has_predicted_intens, has_both_matched_predicted, ): frag_start, frag_end = frag_start_end - matched_frag_num = has_matched_intens[ - frag_start:frag_end - ].sum(dtype=np.float32) + matched_frag_num = has_matched_intens[frag_start:frag_end].sum(dtype=np.float32) - pred_frag_num = has_predicted_intens[ - frag_start:frag_end - ].sum(dtype=np.float32) + pred_frag_num = has_predicted_intens[frag_start:frag_end].sum(dtype=np.float32) matched_frag_ratio = matched_frag_num / ( - matched_inten_values.shape[1]*(frag_end-frag_start) + matched_inten_values.shape[1] * (frag_end - frag_start) ) both_matched_pred_frag_num = has_both_matched_predicted[ @@ -181,33 +174,27 @@ def _get_frag_features( ].sum(dtype=np.float32) matched_not_pred_frag_num = ( - has_matched_intens[frag_start:frag_end]& - ~has_both_matched_predicted[frag_start:frag_end] + has_matched_intens[frag_start:frag_end] + & ~has_both_matched_predicted[frag_start:frag_end] ).sum(dtype=np.float32) pred_not_matched_frag_num = ( - has_predicted_intens[frag_start:frag_end]& - ~has_both_matched_predicted[frag_start:frag_end] + has_predicted_intens[frag_start:frag_end] + & ~has_both_matched_predicted[frag_start:frag_end] ).sum(dtype=np.float32) if matched_frag_num > 0: both_matched_pred_frag_to_matched = ( both_matched_pred_frag_num / matched_frag_num ) - matched_not_pred_frag_ratio = ( - matched_not_pred_frag_num / matched_frag_num - ) + matched_not_pred_frag_ratio = matched_not_pred_frag_num / matched_frag_num else: both_matched_pred_frag_to_matched = 0 matched_not_pred_frag_ratio = 0 if pred_frag_num > 0: - both_matched_pred_frag_to_pred = ( - both_matched_pred_frag_num / pred_frag_num - ) - pred_not_matched_frag_ratio = ( - pred_not_matched_frag_num / pred_frag_num - ) + both_matched_pred_frag_to_pred = both_matched_pred_frag_num / pred_frag_num + pred_not_matched_frag_ratio = pred_not_matched_frag_num / pred_frag_num else: both_matched_pred_frag_to_pred = 0 pred_not_matched_frag_ratio = 0 @@ -216,9 +203,7 @@ def _get_frag_features( has_predicted_intens[frag_start:frag_end] ].sum() if matched_frag_rel_to_pred > 0: - matched_frag_rel_to_pred /= matched_inten_values[ - frag_start:frag_end - ].sum() + matched_frag_rel_to_pred /= matched_inten_values[frag_start:frag_end].sum() pred_frag_rel_to_matched = predicted_inten_values[frag_start:frag_end][ has_matched_intens[frag_start:frag_end] @@ -229,7 +214,8 @@ def _get_frag_features( ].sum() return ( - matched_frag_num, matched_frag_ratio, + matched_frag_num, + matched_frag_ratio, both_matched_pred_frag_num, both_matched_pred_frag_to_matched, both_matched_pred_frag_to_pred, @@ -242,248 +228,261 @@ def _get_frag_features( ) psm_df, ms2_metrics_df = calc_ms2_similarity( - psm_df, predict_intensity_df, + psm_df, + predict_intensity_df, matched_intensity_df, charged_frag_types=used_frag_types, - metrics=['COS','SA','SPC','PCC'], - spc_top_k=perc_settings['top_k_frags_to_calc_spc'] + metrics=["COS", "SA", "SPC", "PCC"], + spc_top_k=perc_settings["top_k_frags_to_calc_spc"], ) psm_df.rename( columns={ - 'COS':'cos','SA':'sa','SPC':'spc','PCC':'pcc', + "COS": "cos", + "SA": "sa", + "SPC": "spc", + "PCC": "pcc", }, - inplace=True + inplace=True, ) psm_df = get_psm_scores( psm_df, predict_intensity_df=predict_intensity_df[used_frag_types], matched_intensity_df=matched_intensity_df[used_frag_types], - matched_mass_err_df=matched_mass_err_df[used_frag_types] + matched_mass_err_df=matched_mass_err_df[used_frag_types], ) psm_df.rename( columns={ - 'merr_weighted_score':'merr_weighted_frag_score', - 'pred_weighted_score':'pred_weighted_frag_score', + "merr_weighted_score": "merr_weighted_frag_score", + "pred_weighted_score": "pred_weighted_frag_score", }, - inplace=True + inplace=True, ) - has_matched_intens=matched_intensity_df[ - used_frag_types - ].values > 0 - has_predicted_intens=predict_intensity_df[ - used_frag_types - ].values > 0.001 - has_both_matched_predicted = has_matched_intens&has_predicted_intens + has_matched_intens = matched_intensity_df[used_frag_types].values > 0 + has_predicted_intens = predict_intensity_df[used_frag_types].values > 0.001 + has_both_matched_predicted = has_matched_intens & has_predicted_intens ( - psm_df['matched_frag_num'], - psm_df['matched_frag_ratio'], - psm_df['both_matched_pred_frag_num'], - psm_df['both_matched_pred_frag_to_matched'], - psm_df['both_matched_pred_frag_to_pred'], - psm_df['matched_not_pred_frag_num'], - psm_df['matched_not_pred_frag_ratio'], - psm_df['pred_not_matched_frag_num'], - psm_df['pred_not_matched_frag_ratio'], - psm_df['matched_frag_rel_to_pred'], - psm_df['pred_frag_rel_to_matched'], - ) = zip(*psm_df[['frag_start_idx','frag_stop_idx']].apply( - _get_frag_features, axis=1, - matched_inten_values=matched_intensity_df[used_frag_types].values, - predicted_inten_values=predict_intensity_df[used_frag_types].values, - has_matched_intens=has_matched_intens, - has_predicted_intens=has_predicted_intens, - has_both_matched_predicted=has_both_matched_predicted, - )) - - b_frag_types = [ - _t for _t in used_frag_types - if _t.startswith('b') - ] + psm_df["matched_frag_num"], + psm_df["matched_frag_ratio"], + psm_df["both_matched_pred_frag_num"], + psm_df["both_matched_pred_frag_to_matched"], + psm_df["both_matched_pred_frag_to_pred"], + psm_df["matched_not_pred_frag_num"], + psm_df["matched_not_pred_frag_ratio"], + psm_df["pred_not_matched_frag_num"], + psm_df["pred_not_matched_frag_ratio"], + psm_df["matched_frag_rel_to_pred"], + psm_df["pred_frag_rel_to_matched"], + ) = zip( + *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( + _get_frag_features, + axis=1, + matched_inten_values=matched_intensity_df[used_frag_types].values, + predicted_inten_values=predict_intensity_df[used_frag_types].values, + has_matched_intens=has_matched_intens, + has_predicted_intens=has_predicted_intens, + has_both_matched_predicted=has_both_matched_predicted, + ) + ) + + b_frag_types = [_t for _t in used_frag_types if _t.startswith("b")] if len(b_frag_types) > 0: psm_df, ms2_metrics_df = calc_ms2_similarity( - psm_df, predict_intensity_df, + psm_df, + predict_intensity_df, matched_intensity_df, charged_frag_types=b_frag_types, - metrics=['COS','SA','SPC','PCC'], + metrics=["COS", "SA", "SPC", "PCC"], ) psm_df.rename( columns={ - 'COS':'cos_bion','SA':'sa_bion','SPC':'spc_bion', - 'PCC':'pcc_bion' + "COS": "cos_bion", + "SA": "sa_bion", + "SPC": "spc_bion", + "PCC": "pcc_bion", }, - inplace=True + inplace=True, ) psm_df = get_psm_scores( psm_df, predict_intensity_df=predict_intensity_df[b_frag_types], matched_intensity_df=matched_intensity_df[b_frag_types], - matched_mass_err_df=matched_mass_err_df[b_frag_types] + matched_mass_err_df=matched_mass_err_df[b_frag_types], ) psm_df.rename( columns={ - 'merr_weighted_score':'merr_weighted_bion_score', - 'pred_weighted_score':'pred_weighted_bion_score', + "merr_weighted_score": "merr_weighted_bion_score", + "pred_weighted_score": "pred_weighted_bion_score", }, - inplace=True + inplace=True, ) - has_matched_intens=matched_intensity_df[ - b_frag_types - ].values>0 - has_predicted_intens=predict_intensity_df[ - b_frag_types - ].values>0 - has_both_matched_predicted = has_matched_intens&has_predicted_intens + has_matched_intens = matched_intensity_df[b_frag_types].values > 0 + has_predicted_intens = predict_intensity_df[b_frag_types].values > 0 + has_both_matched_predicted = has_matched_intens & has_predicted_intens ( - psm_df['matched_bion_num'], - psm_df['matched_bion_ratio'], - psm_df['both_matched_pred_bion_num'], - psm_df['both_matched_pred_bion_to_matched'], - psm_df['both_matched_pred_bion_to_pred'], - psm_df['matched_not_pred_bion_num'], - psm_df['matched_not_pred_bion_ratio'], - psm_df['pred_not_matched_bion_num'], - psm_df['pred_not_matched_bion_ratio'], - psm_df['matched_bion_rel_to_pred'], - psm_df['pred_bion_rel_to_matched'], - ) = zip(*psm_df[['frag_start_idx','frag_stop_idx']].apply( - _get_frag_features, axis=1, - matched_inten_values=matched_intensity_df[b_frag_types].values, - predicted_inten_values=predict_intensity_df[b_frag_types].values, - has_matched_intens=has_matched_intens, - has_predicted_intens=has_predicted_intens, - has_both_matched_predicted=has_both_matched_predicted, - )) + psm_df["matched_bion_num"], + psm_df["matched_bion_ratio"], + psm_df["both_matched_pred_bion_num"], + psm_df["both_matched_pred_bion_to_matched"], + psm_df["both_matched_pred_bion_to_pred"], + psm_df["matched_not_pred_bion_num"], + psm_df["matched_not_pred_bion_ratio"], + psm_df["pred_not_matched_bion_num"], + psm_df["pred_not_matched_bion_ratio"], + psm_df["matched_bion_rel_to_pred"], + psm_df["pred_bion_rel_to_matched"], + ) = zip( + *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( + _get_frag_features, + axis=1, + matched_inten_values=matched_intensity_df[b_frag_types].values, + predicted_inten_values=predict_intensity_df[b_frag_types].values, + has_matched_intens=has_matched_intens, + has_predicted_intens=has_predicted_intens, + has_both_matched_predicted=has_both_matched_predicted, + ) + ) else: - psm_df[[ - 'matched_bion_num', 'matched_bion_ratio', - 'both_matched_pred_bion_num', - 'both_matched_pred_bion_to_matched', - 'both_matched_pred_bion_to_pred', - 'matched_not_pred_bion_num', - 'matched_not_pred_bion_ratio', - 'pred_not_matched_bion_num', - 'pred_not_matched_bion_ratio', - 'matched_bion_rel_to_pred', - 'pred_bion_rel_to_matched' - ]] = 0 - - y_frag_types = [ - _t for _t in used_frag_types - if _t.startswith('y') - ] + psm_df[ + [ + "matched_bion_num", + "matched_bion_ratio", + "both_matched_pred_bion_num", + "both_matched_pred_bion_to_matched", + "both_matched_pred_bion_to_pred", + "matched_not_pred_bion_num", + "matched_not_pred_bion_ratio", + "pred_not_matched_bion_num", + "pred_not_matched_bion_ratio", + "matched_bion_rel_to_pred", + "pred_bion_rel_to_matched", + ] + ] = 0 + + y_frag_types = [_t for _t in used_frag_types if _t.startswith("y")] if len(y_frag_types) > 0: psm_df, ms2_metrics_df = calc_ms2_similarity( - psm_df, predict_intensity_df, + psm_df, + predict_intensity_df, matched_intensity_df, charged_frag_types=y_frag_types, - metrics=['COS','SA','SPC', 'PCC'], + metrics=["COS", "SA", "SPC", "PCC"], ) psm_df.rename( columns={ - 'COS':'cos_yion','SA':'sa_yion','SPC':'spc_yion', - 'PCC':'pcc_yion', + "COS": "cos_yion", + "SA": "sa_yion", + "SPC": "spc_yion", + "PCC": "pcc_yion", }, - inplace=True + inplace=True, ) psm_df = get_psm_scores( psm_df, predict_intensity_df=predict_intensity_df[b_frag_types], matched_intensity_df=matched_intensity_df[b_frag_types], - matched_mass_err_df=matched_mass_err_df[b_frag_types] + matched_mass_err_df=matched_mass_err_df[b_frag_types], ) psm_df.rename( columns={ - 'merr_weighted_score':'merr_weighted_yion_score', - 'pred_weighted_score':'pred_weighted_yion_score', + "merr_weighted_score": "merr_weighted_yion_score", + "pred_weighted_score": "pred_weighted_yion_score", }, - inplace=True + inplace=True, ) - has_matched_intens=matched_intensity_df[ - y_frag_types - ].values > 0 - has_predicted_intens=predict_intensity_df[ - y_frag_types - ].values > 0 - has_both_matched_predicted = has_matched_intens&has_predicted_intens + has_matched_intens = matched_intensity_df[y_frag_types].values > 0 + has_predicted_intens = predict_intensity_df[y_frag_types].values > 0 + has_both_matched_predicted = has_matched_intens & has_predicted_intens ( - psm_df['matched_yion_num'], - psm_df['matched_yion_ratio'], - psm_df['both_matched_pred_yion_num'], - psm_df['both_matched_pred_yion_to_matched'], - psm_df['both_matched_pred_yion_to_pred'], - psm_df['matched_not_pred_yion_num'], - psm_df['matched_not_pred_yion_ratio'], - psm_df['pred_not_matched_yion_num'], - psm_df['pred_not_matched_yion_ratio'], - psm_df['matched_yion_rel_to_pred'], - psm_df['pred_yion_rel_to_matched'], - ) = zip(*psm_df[['frag_start_idx','frag_stop_idx']].apply( - _get_frag_features, axis=1, - matched_inten_values=matched_intensity_df[y_frag_types].values, - predicted_inten_values=predict_intensity_df[y_frag_types].values, - has_matched_intens=has_matched_intens, - has_predicted_intens=has_predicted_intens, - has_both_matched_predicted=has_both_matched_predicted, - )) + psm_df["matched_yion_num"], + psm_df["matched_yion_ratio"], + psm_df["both_matched_pred_yion_num"], + psm_df["both_matched_pred_yion_to_matched"], + psm_df["both_matched_pred_yion_to_pred"], + psm_df["matched_not_pred_yion_num"], + psm_df["matched_not_pred_yion_ratio"], + psm_df["pred_not_matched_yion_num"], + psm_df["pred_not_matched_yion_ratio"], + psm_df["matched_yion_rel_to_pred"], + psm_df["pred_yion_rel_to_matched"], + ) = zip( + *psm_df[["frag_start_idx", "frag_stop_idx"]].apply( + _get_frag_features, + axis=1, + matched_inten_values=matched_intensity_df[y_frag_types].values, + predicted_inten_values=predict_intensity_df[y_frag_types].values, + has_matched_intens=has_matched_intens, + has_predicted_intens=has_predicted_intens, + has_both_matched_predicted=has_both_matched_predicted, + ) + ) else: - psm_df[[ - 'matched_yion_num', 'matched_yion_ratio', - 'both_matched_pred_yion_num', - 'both_matched_pred_yion_to_matched', - 'both_matched_pred_yion_to_pred', - 'matched_not_pred_yion_num', - 'matched_not_pred_yion_ratio', - 'pred_not_matched_yion_num', - 'pred_not_matched_yion_ratio', - 'matched_yion_rel_to_pred', - 'pred_yion_rel_to_matched' - ]] = 0 + psm_df[ + [ + "matched_yion_num", + "matched_yion_ratio", + "both_matched_pred_yion_num", + "both_matched_pred_yion_to_matched", + "both_matched_pred_yion_to_pred", + "matched_not_pred_yion_num", + "matched_not_pred_yion_ratio", + "pred_not_matched_yion_num", + "pred_not_matched_yion_ratio", + "matched_yion_rel_to_pred", + "pred_yion_rel_to_matched", + ] + ] = 0 def _charge_one_hot(ch): - x = [0]*7 - if ch>6: + x = [0] * 7 + if ch > 6: x[-1] = 1 else: - x[ch-1] = 1 + x[ch - 1] = 1 return tuple(x) ( - psm_df['pep_z1'],psm_df['pep_z2'], - psm_df['pep_z3'],psm_df['pep_z4'], - psm_df['pep_z5'],psm_df['pep_z6'], - psm_df['pep_z_gt_6'] + psm_df["pep_z1"], + psm_df["pep_z2"], + psm_df["pep_z3"], + psm_df["pep_z4"], + psm_df["pep_z5"], + psm_df["pep_z6"], + psm_df["pep_z_gt_6"], ) = zip(*psm_df.charge.astype(np.int8).apply(_charge_one_hot)) def _mod_count(mods): - if not mods: return 0 + if not mods: + return 0 mod_count = 0 - for mod in mods.split(';'): - if mod != 'Carbamidomethyl@C': + for mod in mods.split(";"): + if mod != "Carbamidomethyl@C": mod_count += 1 return mod_count - psm_df['mod_num'] = psm_df.mods.apply(_mod_count) + psm_df["mod_num"] = psm_df.mods.apply(_mod_count) return psm_df + # for imap/imap_unordered with multiprocessing.Pool() def match_one_raw_mp(args): return match_one_raw(*args) + # for imap/imap_unordered with multiprocessing.Pool() def get_ms2_features_mp(args): return get_ms2_features(*args) class ScoreFeatureExtractor: - """ ScoreFeatureExtractor: Feature extractor for percolator + """ScoreFeatureExtractor: Feature extractor for percolator with a single process. Parameters @@ -491,80 +490,87 @@ class ScoreFeatureExtractor: model_mgr : ModelManager The ModelManager in peptdeep.pretrained_models. """ - def __init__(self, - model_mgr:ModelManager - ): + + def __init__(self, model_mgr: ModelManager): self.model_mgr = model_mgr self.model_mgr.verbose = False - self.raw_num_to_tune = perc_settings['raw_num_to_tune'] + self.raw_num_to_tune = perc_settings["raw_num_to_tune"] self.score_feature_list = [ - 'sa','spc','pcc', - 'sa_bion','spc_bion','pcc_bion', - 'sa_yion','spc_yion','pcc_yion', - 'rt_delta_abs', 'mobility_delta_abs', - 'merr_weighted_frag_score', - 'pred_weighted_frag_score', - 'merr_weighted_bion_score', - 'pred_weighted_bion_score', - 'merr_weighted_yion_score', - 'pred_weighted_yion_score', - 'matched_frag_num', 'matched_frag_ratio', - 'both_matched_pred_frag_num', - 'both_matched_pred_frag_to_matched', - 'both_matched_pred_frag_to_pred', - 'matched_not_pred_frag_num', - 'matched_not_pred_frag_ratio', - 'pred_not_matched_frag_num', - 'pred_not_matched_frag_ratio', - 'matched_frag_rel_to_pred', - 'pred_frag_rel_to_matched', - 'matched_bion_num', 'matched_bion_ratio', - 'both_matched_pred_bion_num', - 'both_matched_pred_bion_to_matched', - 'both_matched_pred_bion_to_pred', - 'matched_not_pred_bion_num', - 'matched_not_pred_bion_ratio', - 'pred_not_matched_bion_num', - 'pred_not_matched_bion_ratio', - 'matched_bion_rel_to_pred', - 'pred_bion_rel_to_matched', - 'matched_yion_num', 'matched_yion_ratio', - 'both_matched_pred_yion_num', - 'both_matched_pred_yion_to_matched', - 'both_matched_pred_yion_to_pred', - 'matched_not_pred_yion_num', - 'matched_not_pred_yion_ratio', - 'pred_not_matched_yion_num', - 'pred_not_matched_yion_ratio', - 'matched_yion_rel_to_pred', - 'pred_yion_rel_to_matched', - 'pep_z1','pep_z2','pep_z3','pep_z4', - 'pep_z5','pep_z6','pep_z_gt_6', - 'mod_num', + "sa", + "spc", + "pcc", + "sa_bion", + "spc_bion", + "pcc_bion", + "sa_yion", + "spc_yion", + "pcc_yion", + "rt_delta_abs", + "mobility_delta_abs", + "merr_weighted_frag_score", + "pred_weighted_frag_score", + "merr_weighted_bion_score", + "pred_weighted_bion_score", + "merr_weighted_yion_score", + "pred_weighted_yion_score", + "matched_frag_num", + "matched_frag_ratio", + "both_matched_pred_frag_num", + "both_matched_pred_frag_to_matched", + "both_matched_pred_frag_to_pred", + "matched_not_pred_frag_num", + "matched_not_pred_frag_ratio", + "pred_not_matched_frag_num", + "pred_not_matched_frag_ratio", + "matched_frag_rel_to_pred", + "pred_frag_rel_to_matched", + "matched_bion_num", + "matched_bion_ratio", + "both_matched_pred_bion_num", + "both_matched_pred_bion_to_matched", + "both_matched_pred_bion_to_pred", + "matched_not_pred_bion_num", + "matched_not_pred_bion_ratio", + "pred_not_matched_bion_num", + "pred_not_matched_bion_ratio", + "matched_bion_rel_to_pred", + "pred_bion_rel_to_matched", + "matched_yion_num", + "matched_yion_ratio", + "both_matched_pred_yion_num", + "both_matched_pred_yion_to_matched", + "both_matched_pred_yion_to_pred", + "matched_not_pred_yion_num", + "matched_not_pred_yion_ratio", + "pred_not_matched_yion_num", + "pred_not_matched_yion_ratio", + "matched_yion_rel_to_pred", + "pred_yion_rel_to_matched", + "pep_z1", + "pep_z2", + "pep_z3", + "pep_z4", + "pep_z5", + "pep_z6", + "pep_z_gt_6", + "mod_num", ] self.reset_by_global_settings() def reset_by_global_settings(self): - self.require_model_tuning = perc_settings[ - 'require_model_tuning' - ] - self.require_raw_specific_tuning = perc_settings[ - 'require_raw_specific_tuning' - ] - self.raw_specific_ms2_tuning = perc_settings[ - 'raw_specific_ms2_tuning' - ] - self.calibrate_frag_mass_error = perc_settings[ - 'calibrate_frag_mass_error' - ] + self.require_model_tuning = perc_settings["require_model_tuning"] + self.require_raw_specific_tuning = perc_settings["require_raw_specific_tuning"] + self.raw_specific_ms2_tuning = perc_settings["raw_specific_ms2_tuning"] + self.calibrate_frag_mass_error = perc_settings["calibrate_frag_mass_error"] - def _select_raw_to_tune(self, - psm_df:pd.DataFrame, - )->tuple: - """ Randomly select `self.raw_num_to_tune` raw files + def _select_raw_to_tune( + self, + psm_df: pd.DataFrame, + ) -> tuple: + """Randomly select `self.raw_num_to_tune` raw files to tune the models. If # raw files is less than `self.raw_num_to_tune`, all raw files will be used to tune the model. @@ -582,11 +588,11 @@ def _select_raw_to_tune(self, selected raw_name list """ - if 'fdr' not in psm_df.columns: - psm_df = calc_fdr_for_df(psm_df, 'score') - df_fdr = psm_df[(psm_df.fdr<0.01)&(psm_df.decoy==0)] + if "fdr" not in psm_df.columns: + psm_df = calc_fdr_for_df(psm_df, "score") + df_fdr = psm_df[(psm_df.fdr < 0.01) & (psm_df.decoy == 0)] - df_groupby_raw = df_fdr.groupby('raw_name') + df_groupby_raw = df_fdr.groupby("raw_name") if df_groupby_raw.ngroups < self.raw_num_to_tune: tune_raw_num = df_groupby_raw.ngroups @@ -594,21 +600,25 @@ def _select_raw_to_tune(self, tune_raw_num = self.raw_num_to_tune raw_list = list( - df_groupby_raw['score'].count().rank( - ascending=True - ).nlargest(tune_raw_num).index + df_groupby_raw["score"] + .count() + .rank(ascending=True) + .nlargest(tune_raw_num) + .index ) return df_groupby_raw, raw_list - def fine_tune_models(self, - psm_df:pd.DataFrame, - ms2_file_dict:dict, - ms2_file_type:str, - frag_types_to_match:str, - ms2_ppm:bool, ms2_tol:float, + def fine_tune_models( + self, + psm_df: pd.DataFrame, + ms2_file_dict: dict, + ms2_file_type: str, + frag_types_to_match: str, + ms2_ppm: bool, + ms2_tol: float, ): - """ Sample some (n=`self.raw_num_to_tune`) + """Sample some (n=`self.raw_num_to_tune`) from ms2 files, and extract spectrum/peak information, and then fine-tune the models. @@ -633,203 +643,169 @@ def fine_tune_models(self, tolerance value for ms2 matching """ - logging.info('Preparing for fine-tuning ...') + logging.info("Preparing for fine-tuning ...") - ( - df_groupby_raw, raw_list - ) = self._select_raw_to_tune(psm_df) + (df_groupby_raw, raw_list) = self._select_raw_to_tune(psm_df) psm_df_list = [] matched_intensity_df_list = [] - for raw_name, df in process_bar( - df_groupby_raw, df_groupby_raw.ngroups - ): - if ( - raw_name not in raw_list - or raw_name not in ms2_file_dict - ): + for raw_name, df in process_bar(df_groupby_raw, df_groupby_raw.ngroups): + if raw_name not in raw_list or raw_name not in ms2_file_dict: continue - ( - df, _, inten_df, _ - ) = match_one_raw( - df, ms2_file_dict[raw_name], + (df, _, inten_df, _) = match_one_raw( + df, + ms2_file_dict[raw_name], ms2_file_type, frag_types_to_match, - ms2_ppm, ms2_tol, + ms2_ppm, + ms2_tol, self.calibrate_frag_mass_error, ) psm_df_list.append(df) matched_intensity_df_list.append(inten_df) - logging.info('Fine-tuning ...') - if len(psm_df_list) == 0: return + logging.info("Fine-tuning ...") + if len(psm_df_list) == 0: + return self._tune( *concat_precursor_fragment_dataframes( - psm_df_list, - matched_intensity_df_list + psm_df_list, matched_intensity_df_list ) ) - logging.info('Fine-tuning done') + logging.info("Fine-tuning done") def _save_models(self): # save the model for future uses - model_folder = os.path.join( - perc_settings['output_folder'], - "tuned_models" - ) + model_folder = os.path.join(perc_settings["output_folder"], "tuned_models") self.model_mgr.save_models(model_folder) - with open(os.path.join( - model_folder, 'grid_instrument_nce_search.txt' - ), 'w') as f: - f.write(f"# The ms2 model is tuned for following instrument and nce, after grid instrument and nce search.\n") + with open( + os.path.join(model_folder, "grid_instrument_nce_search.txt"), "w" + ) as f: + f.write( + f"# The ms2 model is tuned for following instrument and nce, after grid instrument and nce search.\n" + ) f.write(f"instrument={self.model_mgr.instrument}\n") f.write(f"nce={self.model_mgr.nce}\n") - def _tune(self, - psm_df, - matched_intensity_df - ): + def _tune(self, psm_df, matched_intensity_df): self.model_mgr.train_ccs_model(psm_df) self.model_mgr.train_rt_model(psm_df) _grid_nce = self.model_mgr.use_grid_nce_search - if self.model_mgr.ms2_model.device_type == 'cpu': + if self.model_mgr.ms2_model.device_type == "cpu": self.model_mgr.use_grid_nce_search = False - self.model_mgr.train_ms2_model( - psm_df, matched_intensity_df - ) + self.model_mgr.train_ms2_model(psm_df, matched_intensity_df) self.model_mgr.use_grid_nce_search = _grid_nce self._save_models() def extract_rt_features(self, psm_df): if ( - self.require_raw_specific_tuning and - self.model_mgr.ms2_model.device_type!='cpu' + self.require_raw_specific_tuning + and self.model_mgr.ms2_model.device_type != "cpu" ): ( psm_num_to_train_rt_ccs, psm_num_per_mod_to_train_rt_ccs, - epoch_to_train_rt_ccs + epoch_to_train_rt_ccs, ) = ( self.model_mgr.psm_num_to_train_rt_ccs, self.model_mgr.psm_num_per_mod_to_train_rt_ccs, - self.model_mgr.epoch_to_train_rt_ccs + self.model_mgr.epoch_to_train_rt_ccs, ) - ( - self.model_mgr.psm_num_to_train_rt_ccs - ) = perc_settings['psm_num_per_raw_to_tune'] + (self.model_mgr.psm_num_to_train_rt_ccs) = perc_settings[ + "psm_num_per_raw_to_tune" + ] self.model_mgr.psm_num_per_mod_to_train_rt_ccs = 0 - ( - self.model_mgr.epoch_to_train_rt_ccs - ) = perc_settings['epoch_per_raw_to_tune'] + (self.model_mgr.epoch_to_train_rt_ccs) = perc_settings[ + "epoch_per_raw_to_tune" + ] self.model_mgr.train_rt_model( - psm_df[(psm_df.fdr<0.01)&(psm_df.decoy==0)] + psm_df[(psm_df.fdr < 0.01) & (psm_df.decoy == 0)] ) ( self.model_mgr.psm_num_to_train_rt_ccs, self.model_mgr.psm_num_per_mod_to_train_rt_ccs, - self.model_mgr.epoch_to_train_rt_ccs + self.model_mgr.epoch_to_train_rt_ccs, ) = ( psm_num_to_train_rt_ccs, psm_num_per_mod_to_train_rt_ccs, - epoch_to_train_rt_ccs + epoch_to_train_rt_ccs, ) - if 'rt_norm' in psm_df.columns: - psm_df = self.model_mgr.predict_rt( - psm_df - ) - psm_df[ - 'rt_delta' - ] = ( - psm_df.rt_pred-psm_df.rt_norm - ) + if "rt_norm" in psm_df.columns: + psm_df = self.model_mgr.predict_rt(psm_df) + psm_df["rt_delta"] = psm_df.rt_pred - psm_df.rt_norm mean_delta = psm_df.loc[ - (psm_df.fdr<0.01)&(psm_df.decoy==0), - 'rt_delta' + (psm_df.fdr < 0.01) & (psm_df.decoy == 0), "rt_delta" ].mean() if np.isnan(mean_delta): mean_delta = 0 - psm_df['rt_delta_abs'] = ( - psm_df.rt_delta-mean_delta - ).abs() + psm_df["rt_delta_abs"] = (psm_df.rt_delta - mean_delta).abs() else: - psm_df['rt_delta'] = 0 - psm_df['rt_delta_abs'] = 0 + psm_df["rt_delta"] = 0 + psm_df["rt_delta_abs"] = 0 def extract_mobility_features(self, psm_df): - if ( - 'mobility' in psm_df.columns - ): - psm_df = self.model_mgr.predict_mobility( - psm_df - ) + if "mobility" in psm_df.columns: + psm_df = self.model_mgr.predict_mobility(psm_df) - psm_df[ - 'mobility_delta' - ] = ( - psm_df.mobility_pred-psm_df.mobility - ) + psm_df["mobility_delta"] = psm_df.mobility_pred - psm_df.mobility mean_delta = psm_df.loc[ - (psm_df.fdr<0.01)&(psm_df.decoy==0), - 'mobility_delta' + (psm_df.fdr < 0.01) & (psm_df.decoy == 0), "mobility_delta" ].mean() if np.isnan(mean_delta): mean_delta = 0 - psm_df['mobility_delta_abs'] = ( - psm_df.mobility_delta-mean_delta - ).abs() + psm_df["mobility_delta_abs"] = (psm_df.mobility_delta - mean_delta).abs() else: - psm_df['mobility_delta'] = 0 - psm_df['mobility_delta_abs'] = 0 - + psm_df["mobility_delta"] = 0 + psm_df["mobility_delta_abs"] = 0 - def match_ms2(self, + def match_ms2( + self, psm_df: pd.DataFrame, - ms2_file_dict, #raw_name: ms2_file_path or ms_reader object - ms2_file_type:str, - frag_types_to_match:list = get_charged_frag_types(['b','y'], 2), - ms2_ppm=True, ms2_tol=20, + ms2_file_dict, # raw_name: ms2_file_path or ms_reader object + ms2_file_type: str, + frag_types_to_match: list = get_charged_frag_types(["b", "y"], 2), + ms2_ppm=True, + ms2_tol=20, ): - self.match = PepSpecMatch( - charged_frag_types=frag_types_to_match - ) + self.match = PepSpecMatch(charged_frag_types=frag_types_to_match) self.match.match_ms2_centroid( refine_precursor_df(psm_df), ms2_file_dict=ms2_file_dict, ms2_file_type=ms2_file_type, - ppm=ms2_ppm, tol=ms2_tol, + ppm=ms2_ppm, + tol=ms2_tol, ) def _get_model_frag_types(self, frag_types): used_frag_types = [] for frag_type in frag_types: - if frag_type in ( - self.model_mgr.ms2_model.charged_frag_types - ): + if frag_type in (self.model_mgr.ms2_model.charged_frag_types): used_frag_types.append(frag_type) return used_frag_types - def extract_features(self, + def extract_features( + self, psm_df: pd.DataFrame, ms2_file_dict, ms2_file_type, - frag_types:list = get_charged_frag_types(['b','y'], 2), - ms2_ppm=global_settings['peak_matching']['ms2_ppm'], - ms2_tol=global_settings['peak_matching']['ms2_tol_value'], - )->pd.DataFrame: - """ Extract features and add columns (`self.score_feature_list`) into psm_df + frag_types: list = get_charged_frag_types(["b", "y"], 2), + ms2_ppm=global_settings["peak_matching"]["ms2_ppm"], + ms2_tol=global_settings["peak_matching"]["ms2_tol_value"], + ) -> pd.DataFrame: + """Extract features and add columns (`self.score_feature_list`) into psm_df Parameters ---------- @@ -865,27 +841,24 @@ def extract_features(self, frag_types = self._get_model_frag_types(frag_types) if self.require_model_tuning: - logging.info('Fine-tuning models ...') + logging.info("Fine-tuning models ...") self.fine_tune_models( - psm_df, - ms2_file_dict, ms2_file_type, - frag_types, ms2_ppm, ms2_tol + psm_df, ms2_file_dict, ms2_file_type, frag_types, ms2_ppm, ms2_tol ) - logging.info(f'Extracting peptdeep features for {len(psm_df)} PSMs ...') + logging.info(f"Extracting peptdeep features for {len(psm_df)} PSMs ...") result_psm_list = [] - groupby = psm_df.groupby('raw_name') + groupby = psm_df.groupby("raw_name") for raw_name, df in process_bar(groupby, groupby.ngroups): if raw_name not in ms2_file_dict: continue - ( - df, frag_mz_df, frag_inten_df, frag_merr_df - ) = match_one_raw( + (df, frag_mz_df, frag_inten_df, frag_merr_df) = match_one_raw( df, ms2_file_dict[raw_name], ms2_file_type, frag_types, - ms2_ppm, ms2_tol, + ms2_ppm, + ms2_tol, self.calibrate_frag_mass_error, ) @@ -896,25 +869,22 @@ def extract_features(self, result_psm_list.append( get_ms2_features( - df, frag_types, + df, + frag_types, predict_inten_df, frag_inten_df, frag_merr_df, ) ) - self.psm_df = pd.concat( - result_psm_list, ignore_index=True - ) - logging.info('Finish extracting features') + self.psm_df = pd.concat(result_psm_list, ignore_index=True) + logging.info("Finish extracting features") return self.psm_df class ScoreFeatureExtractorMP(ScoreFeatureExtractor): - def __init__(self, - model_mgr:ModelManager - ): - """ ScoreFeatureExtractorMP: Feature extractor for percolator + def __init__(self, model_mgr: ModelManager): + """ScoreFeatureExtractorMP: Feature extractor for percolator with multiprocessing. Parameters @@ -930,15 +900,16 @@ def __init__(self, self.model_mgr.rt_model.model.share_memory() self.model_mgr.ccs_model.model.share_memory() - - def fine_tune_models(self, + def fine_tune_models( + self, psm_df, ms2_file_dict, ms2_file_type, frag_types_to_match, - ms2_ppm, ms2_tol, + ms2_ppm, + ms2_tol, ): - """ Sample some (n=`self.raw_num_to_tune`) + """Sample some (n=`self.raw_num_to_tune`) from ms2 files, and extract (MP) spectrum/peak information, and then fine-tune the models. @@ -962,16 +933,11 @@ def fine_tune_models(self, ms2_tol : float tolerance value for ms2 matching """ - ( - df_groupby_raw, raw_list - ) = self._select_raw_to_tune(psm_df) + (df_groupby_raw, raw_list) = self._select_raw_to_tune(psm_df) def one_raw_param_generator(df_groupby_raw): for raw_name, df in df_groupby_raw: - if ( - raw_name not in raw_list - or raw_name not in ms2_file_dict - ): + if raw_name not in raw_list or raw_name not in ms2_file_dict: continue yield ( @@ -979,48 +945,53 @@ def one_raw_param_generator(df_groupby_raw): ms2_file_dict[raw_name], ms2_file_type, frag_types_to_match, - ms2_ppm, ms2_tol, + ms2_ppm, + ms2_tol, self.calibrate_frag_mass_error, ) - logging.info('Preparing for fine-tuning ...') + logging.info("Preparing for fine-tuning ...") psm_df_list = [] matched_intensity_df_list = [] - with mp.get_context('spawn').Pool(global_settings['thread_num']) as p: + with mp.get_context("spawn").Pool(global_settings["thread_num"]) as p: for df, _, inten_df, _ in process_bar( p.imap_unordered( - match_one_raw_mp, - one_raw_param_generator(df_groupby_raw) - ), df_groupby_raw.ngroups + match_one_raw_mp, one_raw_param_generator(df_groupby_raw) + ), + df_groupby_raw.ngroups, ): psm_df_list.append(df) matched_intensity_df_list.append(inten_df) - logging.info('Fine-tuning ...') - if len(psm_df_list) == 0: return + logging.info("Fine-tuning ...") + if len(psm_df_list) == 0: + return self._tune( *concat_precursor_fragment_dataframes( - psm_df_list, - matched_intensity_df_list + psm_df_list, matched_intensity_df_list ) ) - def extract_features_one_raw_mp(self,args): + def extract_features_one_raw_mp(self, args): return self.extract_features_one_raw(*args) - def extract_features_one_raw(self, + def extract_features_one_raw( + self, df_one_raw: pd.DataFrame, ms2_file, ms2_file_type, frag_types, - ms2_ppm, ms2_tol, + ms2_ppm, + ms2_tol, calibrate_frag_mass_error, ): - ( - df, frag_mz_df, frag_inten_df, frag_merr_df - ) = match_one_raw(df_one_raw, - ms2_file, ms2_file_type, frag_types, - ms2_ppm, ms2_tol, + (df, frag_mz_df, frag_inten_df, frag_merr_df) = match_one_raw( + df_one_raw, + ms2_file, + ms2_file_type, + frag_types, + ms2_ppm, + ms2_tol, calibrate_frag_mass_error, ) @@ -1029,22 +1000,24 @@ def extract_features_one_raw(self, predict_inten_df = self.model_mgr.predict_ms2(df) - return get_ms2_features(df, + return get_ms2_features( + df, frag_types, predict_inten_df, frag_inten_df, frag_merr_df, ) - def extract_features(self, + def extract_features( + self, psm_df: pd.DataFrame, ms2_file_dict, ms2_file_type, - frag_types:list = get_charged_frag_types(['b','y'], 2), - ms2_ppm=global_settings['peak_matching']['ms2_ppm'], - ms2_tol=global_settings['peak_matching']['ms2_tol_value'], - )->pd.DataFrame: - """ Extract (multiprocessing) features and + frag_types: list = get_charged_frag_types(["b", "y"], 2), + ms2_ppm=global_settings["peak_matching"]["ms2_ppm"], + ms2_tol=global_settings["peak_matching"]["ms2_tol_value"], + ) -> pd.DataFrame: + """Extract (multiprocessing) features and add columns (self.score_feature_list) into psm_df. Parameters @@ -1080,11 +1053,9 @@ def extract_features(self, used_frag_types = self._get_model_frag_types(frag_types) if self.require_model_tuning: - logging.info('Require fine-tuning models ...') + logging.info("Require fine-tuning models ...") self.fine_tune_models( - psm_df, - ms2_file_dict, ms2_file_type, - used_frag_types, ms2_ppm, ms2_tol + psm_df, ms2_file_dict, ms2_file_type, used_frag_types, ms2_ppm, ms2_tol ) self.model_mgr._train_psm_logging = False @@ -1098,29 +1069,31 @@ def one_raw_param_generator(df_groupby_raw): ms2_file_dict[raw_name], ms2_file_type, used_frag_types, - ms2_ppm, ms2_tol, + ms2_ppm, + ms2_tol, self.calibrate_frag_mass_error, ) logging.info( - f'Extracting peptdeep features for {len(psm_df)} PSMs with multiprocessing ...' + f"Extracting peptdeep features for {len(psm_df)} PSMs with multiprocessing ..." ) - df_groupby_raw = psm_df.groupby('raw_name') + df_groupby_raw = psm_df.groupby("raw_name") result_psm_list = [] if ( - self.require_raw_specific_tuning or - self.model_mgr.ms2_model.device_type!='cpu' - + self.require_raw_specific_tuning + or self.model_mgr.ms2_model.device_type != "cpu" ): # multiprocessing is only used for ms2 matching def prediction_gen(df_groupby_raw): - with mp.get_context('spawn').Pool(global_settings['thread_num']) as _p: + with mp.get_context("spawn").Pool(global_settings["thread_num"]) as _p: for ( - df, frag_mz_df, frag_inten_df, frag_merr_df + df, + frag_mz_df, + frag_inten_df, + frag_merr_df, ) in _p.imap_unordered( - match_one_raw_mp, - one_raw_param_generator(df_groupby_raw) + match_one_raw_mp, one_raw_param_generator(df_groupby_raw) ): # outsite multiprocessing region self.extract_rt_features(df) @@ -1134,17 +1107,17 @@ def prediction_gen(df_groupby_raw): psm_num_to_train_ms2, psm_num_per_mod_to_train_ms2, epoch_to_train_ms2, - use_grid_nce_search + use_grid_nce_search, ) = ( self.model_mgr.psm_num_to_train_ms2, self.model_mgr.psm_num_per_mod_to_train_ms2, self.model_mgr.epoch_to_train_ms2, - self.model_mgr.use_grid_nce_search + self.model_mgr.use_grid_nce_search, ) - ( - self.model_mgr.psm_num_to_train_ms2 - ) = perc_settings['psm_num_per_raw_to_tune'] + (self.model_mgr.psm_num_to_train_ms2) = perc_settings[ + "psm_num_per_raw_to_tune" + ] self.model_mgr.psm_num_per_mod_to_train_ms2 = 0 @@ -1152,54 +1125,58 @@ def prediction_gen(df_groupby_raw): self.model_mgr.use_grid_nce_search = False - if 'nce' not in df.columns: + if "nce" not in df.columns: self.model_mgr.set_default_nce(df) self.model_mgr.train_ms2_model( - df[(df.fdr<0.01)&(df.decoy==0)], - frag_inten_df + df[(df.fdr < 0.01) & (df.decoy == 0)], frag_inten_df ) ( self.model_mgr.psm_num_to_train_ms2, self.model_mgr.psm_num_per_mod_to_train_ms2, self.model_mgr.epoch_to_train_ms2, - self.model_mgr.use_grid_nce_search + self.model_mgr.use_grid_nce_search, ) = ( psm_num_to_train_ms2, psm_num_per_mod_to_train_ms2, epoch_to_train_ms2, - use_grid_nce_search + use_grid_nce_search, ) predict_inten_df = self.model_mgr.predict_ms2(df) yield ( - df, used_frag_types, + df, + used_frag_types, predict_inten_df, - frag_inten_df, frag_merr_df, + frag_inten_df, + frag_merr_df, ) - with mp.get_context('spawn').Pool(global_settings['thread_num']) as p: - for df in process_bar(p.imap_unordered( - get_ms2_features_mp, - prediction_gen(df_groupby_raw) - ), df_groupby_raw.ngroups): + with mp.get_context("spawn").Pool(global_settings["thread_num"]) as p: + for df in process_bar( + p.imap_unordered( + get_ms2_features_mp, prediction_gen(df_groupby_raw) + ), + df_groupby_raw.ngroups, + ): result_psm_list.append(df) else: # use multiprocessing for prediction # only when no GPUs are available - with mp.get_context('spawn').Pool(global_settings['thread_num']) as p: - for _df in process_bar(p.imap_unordered( - self.extract_features_one_raw_mp, - one_raw_param_generator(df_groupby_raw) - ), df_groupby_raw.ngroups): + with mp.get_context("spawn").Pool(global_settings["thread_num"]) as p: + for _df in process_bar( + p.imap_unordered( + self.extract_features_one_raw_mp, + one_raw_param_generator(df_groupby_raw), + ), + df_groupby_raw.ngroups, + ): result_psm_list.append(_df) - self.psm_df = pd.concat( - result_psm_list, ignore_index=True - ) - logging.info('Finished feature extraction with multiprocessing') + self.psm_df = pd.concat(result_psm_list, ignore_index=True) + logging.info("Finished feature extraction with multiprocessing") self.model_mgr._train_psm_logging = True return self.psm_df diff --git a/peptdeep/rescore/percolator.py b/peptdeep/rescore/percolator.py index ace4e734..1c98c664 100644 --- a/peptdeep/rescore/percolator.py +++ b/peptdeep/rescore/percolator.py @@ -10,12 +10,10 @@ from peptdeep.rescore.feature_extractor import ( ScoreFeatureExtractor, - ScoreFeatureExtractorMP + ScoreFeatureExtractorMP, ) -from peptdeep.rescore.fdr import ( - fdr_from_ref, fdr_to_q_values, calc_fdr_for_df -) +from peptdeep.rescore.fdr import fdr_from_ref, fdr_to_q_values, calc_fdr_for_df from peptdeep.pretrained_models import ModelManager @@ -23,23 +21,29 @@ from peptdeep.utils import logging -perc_settings = global_settings['percolator'] +perc_settings = global_settings["percolator"] + class LogisticRegressionTorch(torch.nn.Module): """Torch-based rescore model""" + def __init__(self, input_dim, **kwargs): super().__init__() torch.manual_seed(1337) self.linear = torch.nn.Linear(input_dim, 1) + def forward(self, x): return self.linear(x).squeeze(1) + class RescoreModelProvider: def __init__(self): self.model_dict = {} - self.model_dict['linear'] = LogisticRegressionTorch + self.model_dict["linear"] = LogisticRegressionTorch + def register(self, model_name, model_class): self.model_dict[model_name.lower()] = model_class + def get_model(self, model_name, input_dim, **kwargs): if model_name.lower() not in self.model_dict: print( @@ -47,59 +51,48 @@ def get_model(self, model_name, input_dim, **kwargs): f"PyTorch rescoring model '{model_name}' is not " "implemented, switch to 'linear' model." ) - return self.model_dict['linear']( - input_dim, **kwargs - ) + return self.model_dict["linear"](input_dim, **kwargs) else: - return self.model_dict[model_name.lower()]( - input_dim, **kwargs - ) + return self.model_dict[model_name.lower()](input_dim, **kwargs) + rescore_model_provider = RescoreModelProvider() + class NNRescore: - def __init__(self, num_features, nn_model_type='linear'): - self.nn_model = rescore_model_provider.get_model( - nn_model_type, num_features - ) + def __init__(self, num_features, nn_model_type="linear"): + self.nn_model = rescore_model_provider.get_model(nn_model_type, num_features) self.train_batch_size = 10000 self.predict_batch_size = 100000 self.optimizer = torch.optim.Adam( - self.nn_model.parameters(), - lr=perc_settings['lr_percolator_torch_model'] + self.nn_model.parameters(), lr=perc_settings["lr_percolator_torch_model"] ) self.loss_func = torch.nn.BCEWithLogitsLoss() if torch.cuda.is_available(): - self.device = torch.device('cuda') + self.device = torch.device("cuda") self.nn_model.to(self.device) else: - self.device = torch.device('cpu') + self.device = torch.device("cpu") self.epoch = 20 - - def fit(self, features, labels): - labels = torch.tensor( - labels, dtype=torch.float, device=self.device - ) - sample_idxes = np.random.RandomState( - 1337 - ).permutation(len(features)) + labels = torch.tensor(labels, dtype=torch.float, device=self.device) + sample_idxes = np.random.RandomState(1337).permutation(len(features)) for _ in range(self.epoch): for i in range(0, len(features), self.train_batch_size): self.optimizer.zero_grad() outputs = self.nn_model( - torch.tensor(features[ - sample_idxes[i:i+self.train_batch_size] - ], dtype=torch.float, device=self.device) + torch.tensor( + features[sample_idxes[i : i + self.train_batch_size]], + dtype=torch.float, + device=self.device, + ) ) loss = self.loss_func( - outputs, labels[ - sample_idxes[i:i+self.train_batch_size] - ] + outputs, labels[sample_idxes[i : i + self.train_batch_size]] ) loss.backward() @@ -108,13 +101,18 @@ def fit(self, features, labels): def decision_function(self, features): outputs = np.empty(len(features)) for i in range(0, len(features), self.predict_batch_size): - outputs[ - i:i+self.predict_batch_size - ] = self.nn_model( - torch.tensor(features[ - i:i+self.predict_batch_size - ], dtype=torch.float, device=self.device) - ).detach().cpu().numpy() + outputs[i : i + self.predict_batch_size] = ( + self.nn_model( + torch.tensor( + features[i : i + self.predict_batch_size], + dtype=torch.float, + device=self.device, + ) + ) + .detach() + .cpu() + .numpy() + ) return outputs @@ -125,15 +123,17 @@ class Percolator: perc_settings = peptdeep.settings.global_settings['percolator'] ``` """ - def __init__(self, + + def __init__( + self, *, - percolator_model:str=perc_settings['percolator_model'], - percolator_backend:str=perc_settings['percolator_backend'], - cv_fold:int = perc_settings['cv_fold'], - iter_num:int = perc_settings['percolator_iter_num'], - ms2_ppm:bool = global_settings['peak_matching']['ms2_ppm'], - ms2_tol:float = global_settings['peak_matching']['ms2_tol_value'], - model_mgr:ModelManager = None + percolator_model: str = perc_settings["percolator_model"], + percolator_backend: str = perc_settings["percolator_backend"], + cv_fold: int = perc_settings["cv_fold"], + iter_num: int = perc_settings["percolator_iter_num"], + ms2_ppm: bool = global_settings["peak_matching"]["ms2_ppm"], + ms2_tol: float = global_settings["peak_matching"]["ms2_tol_value"], + model_mgr: ModelManager = None, ): """ Parameters @@ -175,15 +175,15 @@ def __init__(self, self.model_mgr = ModelManager() else: self.model_mgr = model_mgr - self.charged_frag_types = perc_settings['frag_types'] + self.charged_frag_types = perc_settings["frag_types"] self.ms2_ppm = ms2_ppm self.ms2_tol = ms2_tol - self.fdr_level = perc_settings['fdr_level'] - self.fdr = perc_settings['fdr'] + self.fdr_level = perc_settings["fdr_level"] + self.fdr = perc_settings["fdr"] self.cv_fold = cv_fold self.iter_num = iter_num - if perc_settings['multiprocessing']: + if perc_settings["multiprocessing"]: self.feature_extractor = ScoreFeatureExtractorMP( model_mgr=self.model_mgr, ) @@ -191,35 +191,30 @@ def __init__(self, self.feature_extractor = ScoreFeatureExtractor( model_mgr=self.model_mgr, ) - self.feature_list = [ - f for f in self.feature_extractor.score_feature_list - ] - self.feature_list += ['score','nAA','charge'] + self.feature_list = [f for f in self.feature_extractor.score_feature_list] + self.feature_list += ["score", "nAA", "charge"] - self.max_train_sample = perc_settings['max_perc_train_sample'] - self.min_train_sample = perc_settings['min_perc_train_sample'] - self.per_raw_fdr = perc_settings['use_fdr_for_each_raw'] + self.max_train_sample = perc_settings["max_perc_train_sample"] + self.min_train_sample = perc_settings["min_perc_train_sample"] + self.per_raw_fdr = perc_settings["use_fdr_for_each_raw"] self.init_percolator_model(percolator_model, percolator_backend) - def init_percolator_model(self, - percolator_model="linear", - percolator_backend="sklearn" + def init_percolator_model( + self, percolator_model="linear", percolator_backend="sklearn" ): from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression + self.percolator_model = percolator_model.lower() self.percolator_backend = percolator_backend.lower() - if percolator_backend.lower() == 'pytorch': + if percolator_backend.lower() == "pytorch": self.model = NNRescore( - len(self.feature_list), - nn_model_type=percolator_model - ) - elif percolator_model == 'linear': - self.model = LogisticRegression( - solver='liblinear' + len(self.feature_list), nn_model_type=percolator_model ) - elif percolator_model == 'random_forest': + elif percolator_model == "linear": + self.model = LogisticRegression(solver="liblinear") + elif percolator_model == "random_forest": self.model = RandomForestClassifier() else: logging.info( @@ -227,11 +222,9 @@ def init_percolator_model(self, f"Rescoring model '{percolator_model}' is not " "implemented, switch to sklearn 'linear' model." ) - self.model = LogisticRegression( - solver='liblinear' - ) - self.percolator_model = 'linear' - self.percolator_backend = 'sklearn' + self.model = LogisticRegression(solver="liblinear") + self.percolator_model = "linear" + self.percolator_backend = "sklearn" def enable_model_fine_tuning(self, flag=True): self.feature_extractor.require_model_tuning = flag @@ -241,12 +234,13 @@ def disable_model_fine_tuning(self): self.feature_extractor.require_model_tuning = False self.feature_extractor.require_raw_specific_rt_tuning = False - def _estimate_fdr(self, - df:pd.DataFrame, - fdr_level:str=None, - per_raw_fdr:bool=None, - )->pd.DataFrame: - df = df.sort_values(['ml_score','decoy'], ascending=False) + def _estimate_fdr( + self, + df: pd.DataFrame, + fdr_level: str = None, + per_raw_fdr: bool = None, + ) -> pd.DataFrame: + df = df.sort_values(["ml_score", "decoy"], ascending=False) df = df.reset_index(drop=True) if fdr_level is None: fdr_level = self.fdr_level @@ -254,90 +248,76 @@ def _estimate_fdr(self, per_raw_fdr = self.per_raw_fdr if per_raw_fdr: df_list = [] - for raw_name, df_raw in df.groupby('raw_name'): - df_list.append(self._estimate_fdr(df_raw, - fdr_level = fdr_level, - per_raw_fdr = False - )) + for raw_name, df_raw in df.groupby("raw_name"): + df_list.append( + self._estimate_fdr(df_raw, fdr_level=fdr_level, per_raw_fdr=False) + ) return pd.concat(df_list) - if fdr_level == 'psm': - target_values = 1-df['decoy'].values - decoy_cumsum = np.cumsum(df['decoy'].values) + if fdr_level == "psm": + target_values = 1 - df["decoy"].values + decoy_cumsum = np.cumsum(df["decoy"].values) target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum/target_cumsum - df['fdr'] = fdr_to_q_values(fdr_values) + fdr_values = decoy_cumsum / target_cumsum + df["fdr"] = fdr_to_q_values(fdr_values) else: - if fdr_level == 'precursor': - _df = df.groupby([ - 'sequence','mods','mod_sites','charge','decoy' - ])['ml_score'].max() - elif fdr_level == 'peptide': - _df = df.groupby([ - 'sequence','mods','mod_sites','decoy' - ])['ml_score'].max() + if fdr_level == "precursor": + _df = df.groupby(["sequence", "mods", "mod_sites", "charge", "decoy"])[ + "ml_score" + ].max() + elif fdr_level == "peptide": + _df = df.groupby(["sequence", "mods", "mod_sites", "decoy"])[ + "ml_score" + ].max() else: - _df = df.groupby(['sequence','decoy'])['ml_score'].max() + _df = df.groupby(["sequence", "decoy"])["ml_score"].max() _df = _df.reset_index(drop=True) - _df = _df.sort_values(['ml_score','decoy'], ascending=False) - target_values = 1-_df['decoy'].values - decoy_cumsum = np.cumsum(_df['decoy'].values) + _df = _df.sort_values(["ml_score", "decoy"], ascending=False) + target_values = 1 - _df["decoy"].values + decoy_cumsum = np.cumsum(_df["decoy"].values) target_cumsum = np.cumsum(target_values) - fdr_values = decoy_cumsum/target_cumsum - _df['fdr'] = fdr_to_q_values(fdr_values) - df['fdr'] = fdr_from_ref( - df['ml_score'].values, _df['ml_score'].values, - _df['fdr'].values + fdr_values = decoy_cumsum / target_cumsum + _df["fdr"] = fdr_to_q_values(fdr_values) + df["fdr"] = fdr_from_ref( + df["ml_score"].values, _df["ml_score"].values, _df["fdr"].values ) return df def _train(self, train_t_df, train_d_df): if len(train_t_df) > self.max_train_sample: - train_t_df = train_t_df.sample( - n=self.max_train_sample, - random_state=1337 - ) + train_t_df = train_t_df.sample(n=self.max_train_sample, random_state=1337) if len(train_d_df) > self.max_train_sample: - train_d_df = train_d_df.sample( - n=self.max_train_sample, - random_state=1337 - ) + train_d_df = train_d_df.sample(n=self.max_train_sample, random_state=1337) train_df = pd.concat((train_t_df, train_d_df)) - train_label = np.ones(len(train_df),dtype=np.int32) - train_label[len(train_t_df):] = 0 + train_label = np.ones(len(train_df), dtype=np.int32) + train_label[len(train_t_df) :] = 0 - self.model.fit( - train_df[self.feature_list].values, - train_label - ) + self.model.fit(train_df[self.feature_list].values, train_label) def _predict(self, test_df): - if self.percolator_model != 'random_forest': - test_df['ml_score'] = self.model.decision_function( + if self.percolator_model != "random_forest": + test_df["ml_score"] = self.model.decision_function( test_df[self.feature_list].values ) else: - test_df['ml_score'] = self.model.predict_proba( + test_df["ml_score"] = self.model.predict_proba( test_df[self.feature_list].values - )[:,1] + )[:, 1] return test_df - def _cv_score(self, df:pd.DataFrame)->pd.DataFrame: - df = df.sample( - frac=1, random_state=1337 - ).reset_index(drop=True) + def _cv_score(self, df: pd.DataFrame) -> pd.DataFrame: + df = df.sample(frac=1, random_state=1337).reset_index(drop=True) df_target = df[df.decoy == 0] df_decoy = df[df.decoy != 0] if ( - np.sum(df_target.fdr<0.01) < - self.min_train_sample*self.cv_fold - or len(df_decoy) < self.min_train_sample*self.cv_fold + np.sum(df_target.fdr < 0.01) < self.min_train_sample * self.cv_fold + or len(df_decoy) < self.min_train_sample * self.cv_fold ): logging.info( "[PERC] " - f'#target={np.sum(df_target.fdr<0.01)} or #decoy={len(df_decoy)} ' - f'< minimal training sample={self.min_train_sample} ' - f'for cv-fold={self.cv_fold}. Skip rescoring!!!' + f"#target={np.sum(df_target.fdr<0.01)} or #decoy={len(df_decoy)} " + f"< minimal training sample={self.min_train_sample} " + f"for cv-fold={self.cv_fold}. Skip rescoring!!!" ) return df @@ -348,9 +328,7 @@ def _cv_score(self, df:pd.DataFrame)->pd.DataFrame: _slice = slice(i, len(df_target), self.cv_fold) t_mask[_slice] = False cv_df_target = df_target[t_mask] - train_t_df = cv_df_target[ - cv_df_target.fdr <= self.fdr - ] + train_t_df = cv_df_target[cv_df_target.fdr <= self.fdr] test_t_df = df_target[_slice] d_mask = np.ones(len(df_decoy), dtype=bool) @@ -373,9 +351,7 @@ def _cv_score(self, df:pd.DataFrame)->pd.DataFrame: return self._predict(test_df) - def load_psms(self, - psm_file_list:list, psm_type:str - )->pd.DataFrame: + def load_psms(self, psm_file_list: list, psm_type: str) -> pd.DataFrame: """Load PSM dataframe from file path list. Parameters @@ -391,9 +367,7 @@ def load_psms(self, pd.DataFrame PSM dataframe with 100% FDR including decoys. """ - reader = psm_reader_provider.get_reader( - psm_type, fdr=1, keep_decoy=True - ) + reader = psm_reader_provider.get_reader(psm_type, fdr=1, keep_decoy=True) psm_df_list = [] for psm_file in psm_file_list: _df = reader.import_file(psm_file) @@ -401,9 +375,9 @@ def load_psms(self, psm_df_list.append(_df) return pd.concat(psm_df_list) - def extract_features(self, - psm_df:pd.DataFrame, ms2_file_dict:dict, ms2_file_type:str - )->pd.DataFrame: + def extract_features( + self, psm_df: pd.DataFrame, ms2_file_dict: dict, ms2_file_type: str + ) -> pd.DataFrame: """Extract features for rescoring Parameters @@ -422,18 +396,20 @@ def extract_features(self, pd.DataFrame psm_df with feature columns appended inplace. """ - psm_df['ml_score'] = psm_df.score - psm_df = self._estimate_fdr(psm_df, 'psm') + psm_df["ml_score"] = psm_df.score + psm_df = self._estimate_fdr(psm_df, "psm") psm_df = self.feature_extractor.extract_features( - psm_df, ms2_file_dict, + psm_df, + ms2_file_dict, ms2_file_type, frag_types=self.charged_frag_types, - ms2_ppm=self.ms2_ppm, ms2_tol=self.ms2_tol + ms2_ppm=self.ms2_ppm, + ms2_tol=self.ms2_tol, ) return psm_df - def re_score(self, df:pd.DataFrame)->pd.DataFrame: + def re_score(self, df: pd.DataFrame) -> pd.DataFrame: """Rescore Parameters @@ -448,28 +424,28 @@ def re_score(self, df:pd.DataFrame)->pd.DataFrame: """ logging.info( "[PERC] " - f'{np.sum((df.fdr<=self.fdr) & (df.decoy==0))} ' - f'target PSMs at {self.fdr} psm-level FDR' + f"{np.sum((df.fdr<=self.fdr) & (df.decoy==0))} " + f"target PSMs at {self.fdr} psm-level FDR" ) for i in range(self.iter_num): - logging.info(f'[PERC] Iteration {i+1} of Percolator ...') + logging.info(f"[PERC] Iteration {i+1} of Percolator ...") df = self._cv_score(df) - df = self._estimate_fdr(df, 'psm', False) + df = self._estimate_fdr(df, "psm", False) logging.info( - f'[PERC] {len(df[(df.fdr<=self.fdr) & (df.decoy==0)])} ' - f'target PSMs at {self.fdr} psm-level FDR' + f"[PERC] {len(df[(df.fdr<=self.fdr) & (df.decoy==0)])} " + f"target PSMs at {self.fdr} psm-level FDR" ) df = self._estimate_fdr(df) logging.info( "[PERC] " - f'{len(df[(df.fdr<=self.fdr) & (df.decoy==0)])} ' - f'target PSMs at {self.fdr} {self.fdr_level}-level FDR' + f"{len(df[(df.fdr<=self.fdr) & (df.decoy==0)])} " + f"target PSMs at {self.fdr} {self.fdr_level}-level FDR" ) return df - def run(self, - psm_df:pd.DataFrame, ms2_file_dict:dict, ms2_file_type:str - )->pd.DataFrame: + def run( + self, psm_df: pd.DataFrame, ms2_file_dict: dict, ms2_file_type: str + ) -> pd.DataFrame: """ Run percolator workflow: @@ -492,7 +468,5 @@ def run(self, pd.DataFrame psm_df with feature columns appended inplace. """ - df = self.extract_features( - psm_df, ms2_file_dict, ms2_file_type - ) + df = self.extract_features(psm_df, ms2_file_dict, ms2_file_type) return self.re_score(df) diff --git a/peptdeep/settings.py b/peptdeep/settings.py index 2d089310..8720fbfd 100644 --- a/peptdeep/settings.py +++ b/peptdeep/settings.py @@ -4,55 +4,57 @@ from alphabase.yaml_utils import load_yaml from alphabase.constants.modification import ( - load_mod_df, keep_modloss_by_importance, - add_new_modifications, MOD_DF + load_mod_df, + keep_modloss_by_importance, + add_new_modifications, + MOD_DF, ) from peptdeep.constants._const import CONST_FOLDER -global_settings = load_yaml( - os.path.join(CONST_FOLDER, 'default_settings.yaml') -) +global_settings = load_yaml(os.path.join(CONST_FOLDER, "default_settings.yaml")) """ Global settings in peptdeep, it controls all functionalities of PeptDeep. """ -model_const = load_yaml( - os.path.join(CONST_FOLDER, 'model_const.yaml') -) +model_const = load_yaml(os.path.join(CONST_FOLDER, "model_const.yaml")) ### MOD_TO_FEATURE -mod_elements = model_const['mod_elements'] +mod_elements = model_const["mod_elements"] mod_feature_size = len(mod_elements) mod_elem_to_idx = dict(zip(mod_elements, range(mod_feature_size))) + def _parse_mod_formula(formula): - ''' + """ Parse a modification formula to a feature vector - ''' + """ feature = np.zeros(mod_feature_size) - elems = formula.strip(')').split(')') + elems = formula.strip(")").split(")") for elem in elems: - chem, num = elem.split('(') - num = int(num) + chem, num = elem.split("(") + num = int(num) if chem in mod_elem_to_idx: feature[mod_elem_to_idx[chem]] = num else: feature[-1] += num return feature + MOD_TO_FEATURE = {} + + def update_all_mod_features(): - for modname, formula in MOD_DF[['mod_name','composition']].values: + for modname, formula in MOD_DF[["mod_name", "composition"]].values: MOD_TO_FEATURE[modname] = _parse_mod_formula(formula) + + update_all_mod_features() -def add_user_defined_modifications( - user_mods:dict=None -): +def add_user_defined_modifications(user_mods: dict = None): """ Add user-defined modifications into the system, this is userful for isotope labeling. @@ -78,40 +80,36 @@ def add_user_defined_modifications( update_all_mod_features() + def _refine_global_settings(): global_settings["thread_num"] = min( - global_settings["thread_num"], - global_settings["MAX_THREADS"] - ) - global_settings['PEPTDEEP_HOME'] = os.path.expanduser( - global_settings['PEPTDEEP_HOME'] + global_settings["thread_num"], global_settings["MAX_THREADS"] ) - global_settings['library']['output_folder']=( - global_settings['library']['output_folder'].format( - PEPTDEEP_HOME=global_settings['PEPTDEEP_HOME'] - ) + global_settings["PEPTDEEP_HOME"] = os.path.expanduser( + global_settings["PEPTDEEP_HOME"] ) - global_settings['model_mgr']['transfer']['model_output_folder']=( - global_settings['model_mgr']['transfer']['model_output_folder'].format( - PEPTDEEP_HOME=global_settings['PEPTDEEP_HOME'] - ) + global_settings["library"]["output_folder"] = global_settings["library"][ + "output_folder" + ].format(PEPTDEEP_HOME=global_settings["PEPTDEEP_HOME"]) + global_settings["model_mgr"]["transfer"]["model_output_folder"] = global_settings[ + "model_mgr" + ]["transfer"]["model_output_folder"].format( + PEPTDEEP_HOME=global_settings["PEPTDEEP_HOME"] ) # global_settings['percolator']['output_folder']=( # global_settings['percolator']['output_folder'].format( # PEPTDEEP_HOME=global_settings['PEPTDEEP_HOME'] # ) # ) - for key, val in list(global_settings['model_mgr'][ - 'instrument_group' - ].items()): - global_settings['model_mgr'][ - 'instrument_group' - ][key.upper()] = val + for key, val in list(global_settings["model_mgr"]["instrument_group"].items()): + global_settings["model_mgr"]["instrument_group"][key.upper()] = val add_user_defined_modifications() + _refine_global_settings() + def update_settings(dict_, new_dict): for k, v in new_dict.items(): if isinstance(v, collections.abc.Mapping): @@ -120,17 +118,18 @@ def update_settings(dict_, new_dict): dict_[k] = v return dict_ + def update_global_settings(new_settings): update_settings(global_settings, new_settings) _refine_global_settings() -def load_global_settings(yaml:str): + +def load_global_settings(yaml: str): d = load_yaml(yaml) update_global_settings(d) -def update_modifications(tsv:str="", - modloss_importance_level:float=1.0 -): + +def update_modifications(tsv: str = "", modloss_importance_level: float = 1.0): """ Load modification tsv either from alphabase default `modification.tsv `_ @@ -151,4 +150,5 @@ def update_modifications(tsv:str="", add_user_defined_modifications() + update_modifications() diff --git a/peptdeep/spec_lib/library_factory.py b/peptdeep/spec_lib/library_factory.py index 4d1fa506..43404d81 100644 --- a/peptdeep/spec_lib/library_factory.py +++ b/peptdeep/spec_lib/library_factory.py @@ -11,53 +11,61 @@ from peptdeep.settings import global_settings from peptdeep.protein.fasta import PredictSpecLibFasta from peptdeep.spec_lib.translate import ( - speclib_to_single_df, mod_to_unimod_dict, - translate_to_tsv + speclib_to_single_df, + mod_to_unimod_dict, + translate_to_tsv, ) from peptdeep.pretrained_models import ModelManager -from peptdeep.utils import logging,read_peptide_table +from peptdeep.utils import logging, read_peptide_table + class PredictLibraryMakerBase(object): """ Base class to predict libraries """ - def __init__(self, - model_manager:ModelManager = None, + + def __init__( + self, + model_manager: ModelManager = None, ): - lib_settings = global_settings['library'] + lib_settings = global_settings["library"] self.spec_lib = PredictSpecLibFasta( model_manager=model_manager, - charged_frag_types = get_charged_frag_types( - lib_settings['frag_types'], - lib_settings['max_frag_charge'], + charged_frag_types=get_charged_frag_types( + lib_settings["frag_types"], + lib_settings["max_frag_charge"], ), - protease = lib_settings['fasta']['protease'], - max_missed_cleavages = lib_settings['fasta']['max_miss_cleave'], - peptide_length_min = lib_settings['min_peptide_len'], - peptide_length_max = lib_settings['max_peptide_len'], - precursor_charge_min = lib_settings['min_precursor_charge'], - precursor_charge_max = lib_settings['max_precursor_charge'], - precursor_mz_min = lib_settings['min_precursor_mz'], - precursor_mz_max = lib_settings['max_precursor_mz'], - var_mods = lib_settings['var_mods'], - min_var_mod_num = lib_settings['min_var_mod_num'], - max_var_mod_num = lib_settings['max_var_mod_num'], - fix_mods = lib_settings['fix_mods'], - labeling_channels = lib_settings['labeling_channels'], - special_mods = lib_settings['special_mods'], - min_special_mod_num = lib_settings['min_special_mod_num'], - max_special_mod_num = lib_settings['max_special_mod_num'], - special_mods_cannot_modify_pep_n_term = lib_settings['special_mods_cannot_modify_pep_n_term'], - special_mods_cannot_modify_pep_c_term = lib_settings['special_mods_cannot_modify_pep_c_term'], - decoy = lib_settings['decoy'], - include_contaminants=lib_settings['fasta']['add_contaminants'], + protease=lib_settings["fasta"]["protease"], + max_missed_cleavages=lib_settings["fasta"]["max_miss_cleave"], + peptide_length_min=lib_settings["min_peptide_len"], + peptide_length_max=lib_settings["max_peptide_len"], + precursor_charge_min=lib_settings["min_precursor_charge"], + precursor_charge_max=lib_settings["max_precursor_charge"], + precursor_mz_min=lib_settings["min_precursor_mz"], + precursor_mz_max=lib_settings["max_precursor_mz"], + var_mods=lib_settings["var_mods"], + min_var_mod_num=lib_settings["min_var_mod_num"], + max_var_mod_num=lib_settings["max_var_mod_num"], + fix_mods=lib_settings["fix_mods"], + labeling_channels=lib_settings["labeling_channels"], + special_mods=lib_settings["special_mods"], + min_special_mod_num=lib_settings["min_special_mod_num"], + max_special_mod_num=lib_settings["max_special_mod_num"], + special_mods_cannot_modify_pep_n_term=lib_settings[ + "special_mods_cannot_modify_pep_n_term" + ], + special_mods_cannot_modify_pep_c_term=lib_settings[ + "special_mods_cannot_modify_pep_c_term" + ], + decoy=lib_settings["decoy"], + include_contaminants=lib_settings["fasta"]["add_contaminants"], I_to_L=False, - generate_precursor_isotope=lib_settings['generate_precursor_isotope'], - rt_to_irt=lib_settings['rt_to_irt'], - ) + generate_precursor_isotope=lib_settings["generate_precursor_isotope"], + rt_to_irt=lib_settings["rt_to_irt"], + ) - def _check_df(self)->str: + def _check_df(self) -> str: pass def _input(self, infiles): @@ -68,18 +76,18 @@ def _predict(self): self.spec_lib.predict_all() @property - def precursor_df(self)->pd.DataFrame: + def precursor_df(self) -> pd.DataFrame: return self.spec_lib.precursor_df @property - def fragment_intensity_df(self)->pd.DataFrame: + def fragment_intensity_df(self) -> pd.DataFrame: return self.spec_lib.fragment_intensity_df @property - def fragment_mz_df(self)->pd.DataFrame: + def fragment_mz_df(self) -> pd.DataFrame: return self.spec_lib.fragment_mz_df - def make_library(self, infiles:Union[str,list,pd.DataFrame]): + def make_library(self, infiles: Union[str, list, pd.DataFrame]): """Predict a library for the `infiles`, this function runs the following methods. @@ -105,162 +113,162 @@ def make_library(self, infiles:Union[str,list,pd.DataFrame]): self._predict() logging.info( - 'Predicting the spectral library with ' - f'{len(self.precursor_df)} precursors ' - f'and {np.prod(self.fragment_mz_df.values.shape, dtype=float)*(1e-6):.2f}M fragments ' - f'used {psutil.Process(os.getpid()).memory_info().rss/1024**3:.4f} GB memory' + "Predicting the spectral library with " + f"{len(self.precursor_df)} precursors " + f"and {np.prod(self.fragment_mz_df.values.shape, dtype=float)*(1e-6):.2f}M fragments " + f"used {psutil.Process(os.getpid()).memory_info().rss/1024**3:.4f} GB memory" ) except ValueError as e: raise e - def translate_to_tsv(self, - tsv_path:str, - translate_mod_dict:dict=None - ): - """Translate the predicted DataFrames into a TSV file - """ + def translate_to_tsv(self, tsv_path: str, translate_mod_dict: dict = None): + """Translate the predicted DataFrames into a TSV file""" logging.info(f"Translating to {tsv_path} for DiaNN/Spectronaut...") - lib_settings = global_settings['library'] + lib_settings = global_settings["library"] - if 'proteins' not in self.spec_lib._precursor_df.columns: + if "proteins" not in self.spec_lib._precursor_df.columns: self.spec_lib.append_protein_name() translate_to_tsv( self.spec_lib, tsv_path, - keep_k_highest_fragments=lib_settings['output_tsv'][ - 'keep_higest_k_peaks' - ], - min_frag_intensity=lib_settings['output_tsv'][ - 'min_relative_intensity' - ], - min_frag_mz=lib_settings['output_tsv'][ - 'min_fragment_mz' - ], - max_frag_mz=lib_settings['output_tsv'][ - 'max_fragment_mz' - ], - batch_size=lib_settings['output_tsv'][ - 'translate_batch_size' - ], + keep_k_highest_fragments=lib_settings["output_tsv"]["keep_higest_k_peaks"], + min_frag_intensity=lib_settings["output_tsv"]["min_relative_intensity"], + min_frag_mz=lib_settings["output_tsv"]["min_fragment_mz"], + max_frag_mz=lib_settings["output_tsv"]["max_fragment_mz"], + batch_size=lib_settings["output_tsv"]["translate_batch_size"], translate_mod_dict=translate_mod_dict, ) - def translate_library(self, - translate_mod_dict:dict=None - )->pd.DataFrame: + def translate_library(self, translate_mod_dict: dict = None) -> pd.DataFrame: """Translate predicted DataFrames into a single DataFrame in SWATH library format """ logging.info("Translating library for DiaNN/Spectronaut...") - lib_settings = global_settings['library'] + lib_settings = global_settings["library"] - if 'proteins' not in self.spec_lib._precursor_df.columns: + if "proteins" not in self.spec_lib._precursor_df.columns: self.spec_lib.append_protein_name() return speclib_to_single_df( self.spec_lib, translate_mod_dict=translate_mod_dict, - keep_k_highest_fragments=lib_settings['output_tsv'][ - 'keep_higest_k_peaks' - ], - min_frag_intensity=lib_settings['output_tsv'][ - 'min_relative_intensity' - ], - min_frag_mz=lib_settings['output_tsv'][ - 'min_fragment_mz' - ], - max_frag_mz=lib_settings['output_tsv'][ - 'max_fragment_mz' - ], + keep_k_highest_fragments=lib_settings["output_tsv"]["keep_higest_k_peaks"], + min_frag_intensity=lib_settings["output_tsv"]["min_relative_intensity"], + min_frag_mz=lib_settings["output_tsv"]["min_fragment_mz"], + max_frag_mz=lib_settings["output_tsv"]["max_fragment_mz"], ) + def load_dfs(infiles): - if isinstance(infiles,str): infiles = [infiles] + if isinstance(infiles, str): + infiles = [infiles] df_list = [] for file_path in infiles: df_list.append(read_peptide_table(file_path)) return pd.concat(df_list, ignore_index=True) + class PSMReaderLibraryMaker(PredictLibraryMakerBase): - def _input(self, psm_type_infiles:Tuple[str,Union[str,list]]): + def _input(self, psm_type_infiles: Tuple[str, Union[str, list]]): psm_type, infiles = psm_type_infiles - if isinstance(infiles, str): infiles = [infiles] + if isinstance(infiles, str): + infiles = [infiles] psm_reader = psm_reader_provider.get_reader(psm_type) df = psm_reader.import_files(infiles) - df.drop_duplicates(["sequence","mods","mod_sites","charge"],inplace=True) - df.drop(columns=[x for x in df.columns.values if x not in - ["sequence","mods","mod_sites","charge","proteins","genes","nAA"] - ], inplace=True) - df["sequence"] = df.sequence.astype('U') - df["mods"] = df.mods.astype('U') - df["mod_sites"] = df.mod_sites.astype('U') + df.drop_duplicates(["sequence", "mods", "mod_sites", "charge"], inplace=True) + df.drop( + columns=[ + x + for x in df.columns.values + if x + not in [ + "sequence", + "mods", + "mod_sites", + "charge", + "proteins", + "genes", + "nAA", + ] + ], + inplace=True, + ) + df["sequence"] = df.sequence.astype("U") + df["mods"] = df.mods.astype("U") + df["mod_sites"] = df.mod_sites.astype("U") if "proteins" in df.columns: - df["proteins"] = df.proteins.astype('U') + df["proteins"] = df.proteins.astype("U") if "genes" in df.columns: - df["genes"] = df.genes.astype('U') + df["genes"] = df.genes.astype("U") self.spec_lib._precursor_df = df self.spec_lib.append_decoy_sequence() self.spec_lib.add_peptide_labeling() + class PrecursorLibraryMaker(PredictLibraryMakerBase): """For input dataframe of charged modified sequences""" - def _input(self, infiles:Union[str,list,pd.DataFrame]): + + def _input(self, infiles: Union[str, list, pd.DataFrame]): if isinstance(infiles, pd.DataFrame): df = infiles else: df = load_dfs(infiles) - if 'charge' not in df.columns: + if "charge" not in df.columns: raise KeyError('`precursor_table` must contain the "charge" column.') - df.drop_duplicates(["sequence","mods","mod_sites","charge"],inplace=True) + df.drop_duplicates(["sequence", "mods", "mod_sites", "charge"], inplace=True) self.spec_lib._precursor_df = df self.spec_lib.add_peptide_labeling() self.spec_lib.append_decoy_sequence() def _check_df(self): - ( - self.spec_lib.precursor_df['charge'] - ) = self.spec_lib.precursor_df['charge'].astype(np.int8) + (self.spec_lib.precursor_df["charge"]) = self.spec_lib.precursor_df[ + "charge" + ].astype(np.int8) if ( - 'mods' not in self.spec_lib.precursor_df.columns or - 'mod_sites' not in self.spec_lib.precursor_df.columns + "mods" not in self.spec_lib.precursor_df.columns + or "mod_sites" not in self.spec_lib.precursor_df.columns ): - self.spec_lib.precursor_df['mods'] = '' - self.spec_lib.precursor_df['mod_sites'] = '' + self.spec_lib.precursor_df["mods"] = "" + self.spec_lib.precursor_df["mod_sites"] = "" else: - ( - self.spec_lib.precursor_df['mods'] - ) = self.spec_lib.precursor_df['mods'].astype('U') - ( - self.spec_lib.precursor_df['mod_sites'] - ) = self.spec_lib.precursor_df['mod_sites'].astype('U') + (self.spec_lib.precursor_df["mods"]) = self.spec_lib.precursor_df[ + "mods" + ].astype("U") + (self.spec_lib.precursor_df["mod_sites"]) = self.spec_lib.precursor_df[ + "mod_sites" + ].astype("U") self.spec_lib.protein_df = pd.DataFrame() + class PeptideLibraryMaker(PrecursorLibraryMaker): """For input dataframe of modified sequences""" - def _input(self, infiles:Union[str,list,pd.DataFrame]): + + def _input(self, infiles: Union[str, list, pd.DataFrame]): if isinstance(infiles, pd.DataFrame): df = infiles else: df = load_dfs(infiles) - df.drop_duplicates(["sequence","mods","mod_sites"],inplace=True) + df.drop_duplicates(["sequence", "mods", "mod_sites"], inplace=True) self.spec_lib._precursor_df = df self.spec_lib.append_decoy_sequence() self.spec_lib.add_peptide_labeling() self.spec_lib.add_charge() + class SequenceLibraryMaker(PeptideLibraryMaker): """For input dataframe of AA sequences""" - def _input(self, infiles:Union[str,list,pd.DataFrame]): + + def _input(self, infiles: Union[str, list, pd.DataFrame]): if isinstance(infiles, pd.DataFrame): df = infiles else: df = load_dfs(infiles) if "sequence" not in df.columns: raise KeyError("`SequenceLibraryMaker` must contain `sequence` column") - df.drop_duplicates(["sequence"],inplace=True) + df.drop_duplicates(["sequence"], inplace=True) self.spec_lib._precursor_df = df self.spec_lib.append_decoy_sequence() self.spec_lib.add_modifications() @@ -268,9 +276,11 @@ def _input(self, infiles:Union[str,list,pd.DataFrame]): self.spec_lib.add_peptide_labeling() self.spec_lib.add_charge() + class FastaLibraryMaker(PredictLibraryMakerBase): """For fasta or a list of fasta files""" - def _input(self, fasta:Union[str,list]): + + def _input(self, fasta: Union[str, list]): self.spec_lib.get_peptides_from_fasta(fasta) self.spec_lib.append_decoy_sequence() self.spec_lib.add_modifications() @@ -278,33 +288,39 @@ def _input(self, fasta:Union[str,list]): self.spec_lib.add_peptide_labeling() self.spec_lib.add_charge() + class LibraryMakerProvider: """ Factory class for library makers """ + def __init__(self): self.library_maker_dict = {} - def register_maker(self, maker_name:str, maker_class): + def register_maker(self, maker_name: str, maker_class): self.library_maker_dict[maker_name.lower()] = maker_class - def get_maker(self, maker_name:str, *, - model_manager = None, - )->PredictLibraryMakerBase: + def get_maker( + self, + maker_name: str, + *, + model_manager=None, + ) -> PredictLibraryMakerBase: maker_name = maker_name.lower() if maker_name in self.library_maker_dict: return self.library_maker_dict[maker_name](model_manager) elif maker_name in psm_reader_provider.reader_dict: return PSMReaderLibraryMaker(model_manager) else: - raise KeyError(f'Library maker `{maker_name}` is not registered.') + raise KeyError(f"Library maker `{maker_name}` is not registered.") + library_maker_provider = LibraryMakerProvider() -library_maker_provider.register_maker('precursor_table', PrecursorLibraryMaker) -library_maker_provider.register_maker('precursor_library', PrecursorLibraryMaker) -library_maker_provider.register_maker('peptide_table', PeptideLibraryMaker) -library_maker_provider.register_maker('peptide_library', PeptideLibraryMaker) -library_maker_provider.register_maker('sequence_table', SequenceLibraryMaker) -library_maker_provider.register_maker('sequence_library', SequenceLibraryMaker) -library_maker_provider.register_maker('fasta', FastaLibraryMaker) -library_maker_provider.register_maker('fasta_library', FastaLibraryMaker) +library_maker_provider.register_maker("precursor_table", PrecursorLibraryMaker) +library_maker_provider.register_maker("precursor_library", PrecursorLibraryMaker) +library_maker_provider.register_maker("peptide_table", PeptideLibraryMaker) +library_maker_provider.register_maker("peptide_library", PeptideLibraryMaker) +library_maker_provider.register_maker("sequence_table", SequenceLibraryMaker) +library_maker_provider.register_maker("sequence_library", SequenceLibraryMaker) +library_maker_provider.register_maker("fasta", FastaLibraryMaker) +library_maker_provider.register_maker("fasta_library", FastaLibraryMaker) diff --git a/peptdeep/spec_lib/predict_lib.py b/peptdeep/spec_lib/predict_lib.py index 3ad7e799..f96a9ea3 100644 --- a/peptdeep/spec_lib/predict_lib.py +++ b/peptdeep/spec_lib/predict_lib.py @@ -4,12 +4,14 @@ import tqdm from alphabase.peptide.precursor import ( - calc_precursor_isotope_mp, calc_precursor_isotope + calc_precursor_isotope_mp, + calc_precursor_isotope, ) from alphabase.spectral_library.base import SpecLibBase from alphabase.spectral_library.flat import SpecLibFlat from alphabase.peptide.fragment import ( - flatten_fragments, concat_precursor_fragment_dataframes + flatten_fragments, + concat_precursor_fragment_dataframes, ) from peptdeep.pretrained_models import ModelManager @@ -17,17 +19,19 @@ from peptdeep.utils import logging from peptdeep.utils import process_bar -model_mgr_settings = global_settings['model_mgr'] +model_mgr_settings = global_settings["model_mgr"] + class PredictSpecLib(SpecLibBase): - def __init__(self, + def __init__( + self, model_manager: ModelManager = None, - charged_frag_types = ['b_z1','b_z2','y_z1','y_z2'], - precursor_mz_min:float = 400.0, - precursor_mz_max:float = 2000.0, - decoy:str = 'pseudo_reverse', - rt_to_irt:bool = False, - generate_precursor_isotope:bool = False, + charged_frag_types=["b_z1", "b_z2", "y_z1", "y_z2"], + precursor_mz_min: float = 400.0, + precursor_mz_max: float = 2000.0, + decoy: str = "pseudo_reverse", + rt_to_irt: bool = False, + generate_precursor_isotope: bool = False, ): """ Parameters @@ -54,11 +58,12 @@ def __init__(self, generate_precursor_isotope : bool, optional Generate precursor isotopes, defaults to False """ - SpecLibBase.__init__(self, + SpecLibBase.__init__( + self, charged_frag_types, precursor_mz_min=precursor_mz_min, precursor_mz_max=precursor_mz_max, - decoy = decoy + decoy=decoy, ) self.model_manager = model_manager @@ -66,11 +71,12 @@ def __init__(self, self._fragment_intensity_df = pd.DataFrame() self._fragment_mz_df = pd.DataFrame() - self.mp_predict_batch_size:int = 100000 + self.mp_predict_batch_size: int = 100000 self.rt_to_irt = rt_to_irt self.generate_precursor_isotope = generate_precursor_isotope - def set_precursor_and_fragment(self, + def set_precursor_and_fragment( + self, *, precursor_df: pd.DataFrame, fragment_mz_df: pd.DataFrame, @@ -80,60 +86,69 @@ def set_precursor_and_fragment(self, self._fragment_intensity_df = fragment_intensity_df self._fragment_mz_df = fragment_mz_df - self._fragment_mz_df.drop(columns=[ - col for col in self._fragment_mz_df.columns - if col not in self.charged_frag_types - ], inplace=True) + self._fragment_mz_df.drop( + columns=[ + col + for col in self._fragment_mz_df.columns + if col not in self.charged_frag_types + ], + inplace=True, + ) - self._fragment_intensity_df.drop(columns=[ - col for col in self._fragment_intensity_df.columns - if col not in self.charged_frag_types - ], inplace=True) + self._fragment_intensity_df.drop( + columns=[ + col + for col in self._fragment_intensity_df.columns + if col not in self.charged_frag_types + ], + inplace=True, + ) - def translate_rt_to_irt_pred(self, irt_pep_df:pd.DataFrame = None): - """ Add 'irt_pred' into columns based on 'rt_pred' """ + def translate_rt_to_irt_pred(self, irt_pep_df: pd.DataFrame = None): + """Add 'irt_pred' into columns based on 'rt_pred'""" return self.model_manager.rt_model.add_irt_column_to_precursor_df( self._precursor_df, irt_pep_df=irt_pep_df ) - def predict_all(self, - min_required_precursor_num_for_mp:int=2000, - predict_items:list = ['rt','mobility','ms2'], + def predict_all( + self, + min_required_precursor_num_for_mp: int = 2000, + predict_items: list = ["rt", "mobility", "ms2"], ): """ 1. Predict RT/IM/MS2 for self._precursor_df 2. Calculate isotope information in self._precursor_df """ - if 'precursor_mz' not in self.precursor_df.columns: + if "precursor_mz" not in self.precursor_df.columns: self.calc_precursor_mz() self.clip_by_precursor_mz_() if self.generate_precursor_isotope: if self.model_manager.verbose: - logging.info('Calculating precursor isotope distributions ...') + logging.info("Calculating precursor isotope distributions ...") if len(self.precursor_df) < min_required_precursor_num_for_mp: - self._precursor_df = calc_precursor_isotope( - self._precursor_df - ) + self._precursor_df = calc_precursor_isotope(self._precursor_df) else: self._precursor_df = calc_precursor_isotope_mp( self._precursor_df, progress_bar=process_bar ) if self.model_manager.verbose: - logging.info(f'Predicting RT/IM/MS2 for {len(self._precursor_df)} precursors ...') + logging.info( + f"Predicting RT/IM/MS2 for {len(self._precursor_df)} precursors ..." + ) res = self.model_manager.predict_all( self._precursor_df, predict_items=predict_items, frag_types=self.charged_frag_types, min_required_precursor_num_for_mp=min_required_precursor_num_for_mp, - multiprocessing=model_mgr_settings['predict']['multiprocessing'], + multiprocessing=model_mgr_settings["predict"]["multiprocessing"], mp_batch_size=self.mp_predict_batch_size, - process_num=global_settings['thread_num'], + process_num=global_settings["thread_num"], ) self.set_precursor_and_fragment(**res) - if self.rt_to_irt and 'rt_pred' in self._precursor_df.columns: + if self.rt_to_irt and "rt_pred" in self._precursor_df.columns: self.translate_rt_to_irt_pred() if self.model_manager.verbose: - logging.info('End predicting RT/IM/MS2') + logging.info("End predicting RT/IM/MS2") class PredictSpecLibFlat(SpecLibFlat): @@ -148,23 +163,28 @@ class PredictSpecLibFlat(SpecLibFlat): keep_top_k_fragments : int, optional top k highest peaks to keep, by default 1000 """ - def __init__(self, - min_fragment_intensity:float = 0.001, - keep_top_k_fragments:int = 1000, - custom_fragment_df_columns:list = [ - 'type','number','position','charge','loss_type' + + def __init__( + self, + min_fragment_intensity: float = 0.001, + keep_top_k_fragments: int = 1000, + custom_fragment_df_columns: list = [ + "type", + "number", + "position", + "charge", + "loss_type", ], **kwargs, ): super().__init__( min_fragment_intensity=min_fragment_intensity, keep_top_k_fragments=keep_top_k_fragments, - custom_fragment_df_columns=custom_fragment_df_columns + custom_fragment_df_columns=custom_fragment_df_columns, ) - def predict_and_parse_lib_in_batch(self, - predict_lib:PredictSpecLib, - batch_size:int = 200000 + def predict_and_parse_lib_in_batch( + self, predict_lib: PredictSpecLib, batch_size: int = 200000 ): """Predict and flatten fragments in batch @@ -175,7 +195,9 @@ def predict_and_parse_lib_in_batch(self, batch_size : int, optional the batch size, by default 200000 """ - logging.info(f"Flattening {len(predict_lib.precursor_df)} precursors in batch size {batch_size} ...") + logging.info( + f"Flattening {len(predict_lib.precursor_df)} precursors in batch size {batch_size} ..." + ) if len(predict_lib.precursor_df) <= batch_size: predict_lib.predict_all() self.parse_base_library(predict_lib) @@ -186,19 +208,21 @@ def predict_and_parse_lib_in_batch(self, precursor_df_list = [] fragment_df_list = [] for i in tqdm.tqdm(range(0, len(df), batch_size)): - predict_lib._precursor_df = df.iloc[i:i+batch_size].copy() + predict_lib._precursor_df = df.iloc[i : i + batch_size].copy() predict_lib.predict_all() flat_df, frag_df = flatten_fragments( predict_lib.precursor_df, predict_lib.fragment_mz_df, predict_lib.fragment_intensity_df, - min_fragment_intensity = self.min_fragment_intensity, - keep_top_k_fragments = self.keep_top_k_fragments, - custom_columns=self.custom_fragment_df_columns + min_fragment_intensity=self.min_fragment_intensity, + keep_top_k_fragments=self.keep_top_k_fragments, + custom_columns=self.custom_fragment_df_columns, ) precursor_df_list.append(flat_df) fragment_df_list.append(frag_df) predict_lib._precursor_df = df - self._precursor_df, self._fragment_df = concat_precursor_fragment_dataframes( - precursor_df_list, fragment_df_list + self._precursor_df, self._fragment_df = ( + concat_precursor_fragment_dataframes( + precursor_df_list, fragment_df_list + ) ) diff --git a/peptdeep/utils/__init__.py b/peptdeep/utils/__init__.py index e6e857e7..3e98499e 100644 --- a/peptdeep/utils/__init__.py +++ b/peptdeep/utils/__init__.py @@ -11,39 +11,40 @@ import pandas as pd import numpy as np + # from alphatims def process_bar(iterator, len_iter): with tqdm.tqdm(total=len_iter) as bar: i = 0 - for i,iter in enumerate(iterator): + for i, iter in enumerate(iterator): yield iter bar.update() - bar.update(len_iter-i-1) + bar.update(len_iter - i - 1) + -def _get_delimiter(tsv_file:str): +def _get_delimiter(tsv_file: str): with open(tsv_file, "r") as f: line = f.readline().strip() - if '\t' in line: return '\t' - elif ',' in line: return ',' - else: return '\t' + if "\t" in line: + return "\t" + elif "," in line: + return "," + else: + return "\t" + -def read_peptide_table(tsv_file:str)->pd.DataFrame: +def read_peptide_table(tsv_file: str) -> pd.DataFrame: sep = _get_delimiter(tsv_file) df = pd.read_csv(tsv_file, sep=sep, keep_default_na=False) - if 'mod_sites' in df.columns: - df['mod_sites'] = df.mod_sites.astype('U') + if "mod_sites" in df.columns: + df["mod_sites"] = df.mod_sites.astype("U") return df -_special_raw_suffices = [ - '.ms_data.hdf', - '_hcdft.mgf', - '.mzml' - '.mgf' -] -def parse_ms_file_names_to_dict( - ms_file_list:list -)->dict: +_special_raw_suffices = [".ms_data.hdf", "_hcdft.mgf", ".mzml" ".mgf"] + + +def parse_ms_file_names_to_dict(ms_file_list: list) -> dict: """ Load spectrum file paths into a dict: "/Users/xxx/raw_name.raw" -> {"raw_name":"/Users/xxx/raw_name.raw"} @@ -65,27 +66,27 @@ def parse_ms_file_names_to_dict( raw_name = raw_filename.lower() for raw_suff in _special_raw_suffices: if raw_name.endswith(raw_suff): - raw_name = raw_filename[:-len(raw_suff)] + raw_name = raw_filename[: -len(raw_suff)] break - if len(raw_filename)==len(raw_name): + if len(raw_filename) == len(raw_name): raw_name = os.path.splitext(raw_name)[0] spec_dict[raw_name] = ms_file return spec_dict + def _flatten(list_of_lists): - ''' + """ Flatten a list of lists - ''' - return list( - itertools.chain.from_iterable(list_of_lists) - ) + """ + return list(itertools.chain.from_iterable(list_of_lists)) + -def explode_multiple_columns(df:pd.DataFrame, columns:list): +def explode_multiple_columns(df: pd.DataFrame, columns: list): try: return df.explode(columns) except ValueError: # pandas <= 1.2.x? - logging.warn(f'pandas=={pd.__version__} cannot explode multiple columns') + logging.warn(f"pandas=={pd.__version__} cannot explode multiple columns") ret_df = df.explode(columns[0]) for col in columns[1:]: ret_df[col] = _flatten(df[col].values) diff --git a/peptdeep/utils/_pyinstaller_hooks.py b/peptdeep/utils/_pyinstaller_hooks.py index 63e5f669..adddc059 100644 --- a/peptdeep/utils/_pyinstaller_hooks.py +++ b/peptdeep/utils/_pyinstaller_hooks.py @@ -1,5 +1,6 @@ from transformers.dependency_versions_check import pkgs_to_check_at_runtime + def get_peptdeep_datas(): """ Huggingface has some dependencies those are not included in pyinstaller, @@ -9,7 +10,8 @@ def get_peptdeep_datas(): `.. = Analysis(..., datas=datas,...)`. """ from PyInstaller.utils.hooks import copy_metadata - for _pkg in ["python","accelerate"]: + + for _pkg in ["python", "accelerate"]: if _pkg in pkgs_to_check_at_runtime: pkgs_to_check_at_runtime.remove(_pkg) datas = [] diff --git a/peptdeep/utils/device_utils.py b/peptdeep/utils/device_utils.py index 883292f9..3958265e 100644 --- a/peptdeep/utils/device_utils.py +++ b/peptdeep/utils/device_utils.py @@ -1,32 +1,34 @@ import torch -def _is_mps_available()->bool: +def _is_mps_available() -> bool: try: return torch.backends.mps.is_available() except AttributeError: return False -torch_devices:dict = { - 'gpu': { - 'is_available': torch.cuda.is_available, - 'device': 'cuda', + +torch_devices: dict = { + "gpu": { + "is_available": torch.cuda.is_available, + "device": "cuda", + }, + "cuda": { + "is_available": torch.cuda.is_available, + "device": "cuda", }, - 'cuda': { - 'is_available': torch.cuda.is_available, - 'device': 'cuda', + "mps": { + "is_available": _is_mps_available, + "device": "mps", }, - 'mps': { - 'is_available': _is_mps_available, - 'device': 'mps', + "m1": { + "is_available": _is_mps_available, + "device": "mps", }, - 'm1': { - 'is_available': _is_mps_available, - 'device': 'mps', - } } -def get_device(device:str, device_ids:list=[])->tuple: + +def get_device(device: str, device_ids: list = []) -> tuple: """Device name to torch.device Parameters @@ -44,23 +46,18 @@ def get_device(device:str, device_ids:list=[])->tuple: """ device = device.lower() if device in torch_devices: - if torch_devices[device]['is_available'](): - if ( - torch_devices[device]['device'] == 'cuda' - and len(device_ids) > 0 - ): + if torch_devices[device]["is_available"](): + if torch_devices[device]["device"] == "cuda" and len(device_ids) > 0: return torch.device( f'cuda:{",".join(str(_id) for _id in device_ids)}' - ), 'cuda' + ), "cuda" else: - return ( - torch.device(torch_devices[device]['device']), - device - ) - return torch.device('cpu'), 'cpu' + return (torch.device(torch_devices[device]["device"]), device) + return torch.device("cpu"), "cpu" + -def get_available_device()->tuple: +def get_available_device() -> tuple: for name, item in torch_devices.items(): - if item['is_available'](): - return torch.device(item['device']), name - return torch.device('cpu'), 'cpu' + if item["is_available"](): + return torch.device(item["device"]), name + return torch.device("cpu"), "cpu" diff --git a/peptdeep/utils/logger.py b/peptdeep/utils/logger.py index 591c1fb7..3e90ee19 100644 --- a/peptdeep/utils/logger.py +++ b/peptdeep/utils/logger.py @@ -10,22 +10,23 @@ import peptdeep -BASE_PATH = os.path.expanduser('~/peptdeep') +BASE_PATH = os.path.expanduser("~/peptdeep") LOG_PATH = os.path.join(BASE_PATH, "logs") log_level_dict = { - 'debug': logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, - 'critical': logging.CRITICAL, + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, } + def set_logger( *, log_file_name="", stream: bool = True, - log_level: str = 'info', + log_level: str = "info", overwrite: bool = False, ) -> str: """Set the log stream and file. @@ -70,9 +71,7 @@ def set_logger( log_level = logging.INFO root = logging.getLogger() - formatter = logging.Formatter( - '%(asctime)s> %(message)s', "%Y-%m-%d %H:%M:%S" - ) + formatter = logging.Formatter("%(asctime)s> %(message)s", "%Y-%m-%d %H:%M:%S") root.setLevel(log_level) while root.hasHandlers(): root.removeHandler(root.handlers[0]) @@ -91,18 +90,15 @@ def set_logger( current_time = time.localtime() current_time = "".join( [ - f'{current_time.tm_year:04}', - f'{current_time.tm_mon:02}', - f'{current_time.tm_mday:02}', - f'{current_time.tm_hour:02}', - f'{current_time.tm_min:02}', - f'{current_time.tm_sec:02}', + f"{current_time.tm_year:04}", + f"{current_time.tm_mon:02}", + f"{current_time.tm_mday:02}", + f"{current_time.tm_hour:02}", + f"{current_time.tm_min:02}", + f"{current_time.tm_sec:02}", ] ) - log_file_name = os.path.join( - log_file_name, - f"log_{current_time}.txt" - ) + log_file_name = os.path.join(log_file_name, f"log_{current_time}.txt") directory = os.path.dirname(log_file_name) if not os.path.exists(directory): os.makedirs(directory) @@ -114,8 +110,11 @@ def set_logger( file_handler.setFormatter(formatter) root.addHandler(file_handler) return log_file_name + + set_logger(log_file_name=None) + def show_platform_info() -> None: """Log all platform information. This is done in the following format: @@ -167,7 +166,7 @@ def show_python_info() -> None: """ module_versions = { "python": platform.python_version(), - "peptdeep": peptdeep.__version__ + "peptdeep": peptdeep.__version__, } requirements = importlib.metadata.requires("peptdeep") for requirement in requirements: diff --git a/peptdeep/utils/regression.py b/peptdeep/utils/regression.py index f857cd8c..dbd4f781 100644 --- a/peptdeep/utils/regression.py +++ b/peptdeep/utils/regression.py @@ -1,12 +1,15 @@ import pandas as pd import numpy as np -def regional_sampling(psm_df:pd.DataFrame, - target:str='rt_norm', n_train:int=1000, - return_test_df:bool=False, - random_state:int=1337, -)->pd.DataFrame: - """ Divide `psm_df` into 10 bins and sample training values + +def regional_sampling( + psm_df: pd.DataFrame, + target: str = "rt_norm", + n_train: int = 1000, + return_test_df: bool = False, + random_state: int = 1337, +) -> pd.DataFrame: + """Divide `psm_df` into 10 bins and sample training values from each bins for model fine-tuning. The values are defined in the `target` column (`rt_norm` or `ccs`). @@ -37,26 +40,19 @@ def regional_sampling(psm_df:pd.DataFrame, The sampled training PSMs (DataFrame) Additional [pd.DataFrame] is returned if `return_test_df==True` for PSMs not sampled. """ - x = np.arange(0, 11)/10*psm_df[target].max() - sub_n = n_train//(len(x)-1) + x = np.arange(0, 11) / 10 * psm_df[target].max() + sub_n = n_train // (len(x) - 1) df_list = [] - for i in range(len(x)-1): - _df = psm_df[ - (psm_df[target]>=x[i])&(psm_df[target]= x[i]) & (psm_df[target] < x[i + 1])] + if len(_df) == 0: + pass + elif len(_df) // 2 < sub_n: + df_list.append( + _df.sample(len(_df) // 2, replace=False, random_state=random_state) + ) else: - df_list.append(_df.sample( - sub_n, - replace=False, - random_state=random_state - )) + df_list.append(_df.sample(sub_n, replace=False, random_state=random_state)) if return_test_df: if len(df_list) == 0: return pd.DataFrame(), pd.DataFrame() @@ -68,17 +64,19 @@ def regional_sampling(psm_df:pd.DataFrame, return pd.DataFrame() return pd.concat(df_list) -#legacy + +# legacy uniform_sampling = regional_sampling + def linear_regression(x, y): coeffs = np.polyfit(x, y, 1) w, b = coeffs.tolist() yhat = np.poly1d(coeffs)(x) - ybar = np.sum(y)/len(y) - ssreg = np.sum((yhat-ybar)**2) - sstot = np.sum((y-ybar)**2) - R_square = ssreg/sstot + ybar = np.sum(y) / len(y) + ssreg = np.sum((yhat - ybar) ** 2) + sstot = np.sum((y - ybar) ** 2) + R_square = ssreg / sstot return dict( R_square=[R_square], R=[np.sqrt(R_square)], @@ -86,9 +84,9 @@ def linear_regression(x, y): intercept=[b], ) + def evaluate_linear_regression( - df:pd.DataFrame, x='rt_pred', y='rt_norm', - ci=95, n_sample=10000000 + df: pd.DataFrame, x="rt_pred", y="rt_norm", ci=95, n_sample=10000000 ): if len(df) > n_sample: df = df.sample(n_sample, replace=False) @@ -98,19 +96,24 @@ def evaluate_linear_regression( return pd.DataFrame(regs) + def evaluate_linear_regression_plot( - df:pd.DataFrame, x='rt_pred', y='rt_norm', - ci=95, n_sample=100000 + df: pd.DataFrame, x="rt_pred", y="rt_norm", ci=95, n_sample=100000 ): import seaborn as sns + if len(df) > n_sample: df = df.sample(n_sample) alpha = 0.05 if len(df) < 5000: alpha = 1 elif len(df) < 50000: - alpha = 5000.0/len(df) + alpha = 5000.0 / len(df) return sns.regplot( - data=df, x=x, y=y, color='r', ci=ci, - scatter_kws={'s':0.05, 'alpha':alpha, 'color':'b'} + data=df, + x=x, + y=y, + color="r", + ci=ci, + scatter_kws={"s": 0.05, "alpha": alpha, "color": "b"}, ) diff --git a/peptdeep/webui/library_ui.py b/peptdeep/webui/library_ui.py index a427c1ff..39c15d36 100644 --- a/peptdeep/webui/library_ui.py +++ b/peptdeep/webui/library_ui.py @@ -8,9 +8,7 @@ from alphabase.yaml_utils import save_yaml from alphabase.protein.fasta import protease_dict -from peptdeep.webui.ui_utils import ( - get_posix, select_files, file_type_selectbox -) +from peptdeep.webui.ui_utils import get_posix, select_files, file_type_selectbox from peptdeep.webui.server import queue_folder @@ -20,310 +18,411 @@ global_ui_settings = global_settings + def mod_options(): with st.form(key="Select modifications"): - st.write('#### Fixed and variable modifications') + st.write("#### Fixed and variable modifications") fixmod = st.multiselect( - label='Please select fixed modifications', + label="Please select fixed modifications", options=MOD_DF.index.values, - default = global_ui_settings['library']['fix_mods'] + default=global_ui_settings["library"]["fix_mods"], ) varmod = st.multiselect( - label='Please select variable modifications', + label="Please select variable modifications", options=MOD_DF.index.values, - default = global_ui_settings['library']['var_mods'] + default=global_ui_settings["library"]["var_mods"], ) - global_ui_settings['library']['fix_mods'] = fixmod - global_ui_settings['library']['var_mods'] = varmod + global_ui_settings["library"]["fix_mods"] = fixmod + global_ui_settings["library"]["var_mods"] = varmod st.form_submit_button(label="Click to add these selected modifications") st.write("Selected modifications:") - st.dataframe(MOD_DF.loc[fixmod+varmod,[ - 'mod_name','classification','composition','mass', - 'modloss_composition','modloss','modloss_importance' - ]], hide_index=True) + st.dataframe( + MOD_DF.loc[ + fixmod + varmod, + [ + "mod_name", + "classification", + "composition", + "mass", + "modloss_composition", + "modloss", + "modloss_importance", + ], + ], + hide_index=True, + ) varmod_range() + def varmod_range(): - min_varmod = st.number_input(label='Min number of variable modifications', - value = global_ui_settings['library']['min_var_mod_num'], - min_value = 0, step = 1, + min_varmod = st.number_input( + label="Min number of variable modifications", + value=global_ui_settings["library"]["min_var_mod_num"], + min_value=0, + step=1, ) - max_varmod = st.number_input(label='Max number of variable modifications', - value = global_ui_settings['library']['max_var_mod_num'], - min_value = 0, step = 1, + max_varmod = st.number_input( + label="Max number of variable modifications", + value=global_ui_settings["library"]["max_var_mod_num"], + min_value=0, + step=1, ) - global_ui_settings['library']['min_var_mod_num'] = min_varmod - global_ui_settings['library']['max_var_mod_num'] = max_varmod + global_ui_settings["library"]["min_var_mod_num"] = min_varmod + global_ui_settings["library"]["max_var_mod_num"] = max_varmod + def specialmod_options(): - st.write('#### Special modificatins') - st.write('*Useful for Phospho@S/T or GlyGly@K*') - st.write('- For Phospho@S/T or HexNAc@S, as a sequence may generate many peptidoforms, this can control the overall number.') - st.write('- For GlyGly@K or GG@K, it will not occur at C-term Lys/K, using `special modifications` to enable this feature.') + st.write("#### Special modificatins") + st.write("*Useful for Phospho@S/T or GlyGly@K*") + st.write( + "- For Phospho@S/T or HexNAc@S, as a sequence may generate many peptidoforms, this can control the overall number." + ) + st.write( + "- For GlyGly@K or GG@K, it will not occur at C-term Lys/K, using `special modifications` to enable this feature." + ) specialmod_expander = st.expander( - label='Special modificatins', - expanded=len(global_ui_settings['library']['special_mods'])>0, + label="Special modificatins", + expanded=len(global_ui_settings["library"]["special_mods"]) > 0, ) with specialmod_expander: with st.form(key="Select special modifications"): - global_ui_settings['library']['special_mods'] = st.multiselect( - label='Please select special modifications', + global_ui_settings["library"]["special_mods"] = st.multiselect( + label="Please select special modifications", options=MOD_DF.index.values, - default=global_ui_settings['library']['special_mods'] + default=global_ui_settings["library"]["special_mods"], ) st.form_submit_button(label="Click to add selected modifications") st.write("Selected special modifications:") - st.dataframe(MOD_DF.loc[ - global_ui_settings['library']['special_mods'], - [ - 'mod_name','classification','composition','mass', - 'modloss_composition','modloss','modloss_importance' - ] - ], hide_index=True) + st.dataframe( + MOD_DF.loc[ + global_ui_settings["library"]["special_mods"], + [ + "mod_name", + "classification", + "composition", + "mass", + "modloss_composition", + "modloss", + "modloss_importance", + ], + ], + hide_index=True, + ) specialmod_range() + def specialmod_range(): - min_specialmod = st.number_input(label='Min number of special modifications', - value = global_ui_settings['library']['min_special_mod_num'], - min_value = 0, step = 1 + min_specialmod = st.number_input( + label="Min number of special modifications", + value=global_ui_settings["library"]["min_special_mod_num"], + min_value=0, + step=1, ) - max_specialmod = st.number_input(label='Max number of special modifications', - value = global_ui_settings['library']['max_special_mod_num'], - min_value = 0, step = 1 + max_specialmod = st.number_input( + label="Max number of special modifications", + value=global_ui_settings["library"]["max_special_mod_num"], + min_value=0, + step=1, ) - global_ui_settings['library']['min_special_mod_num'] = min_specialmod - global_ui_settings['library']['max_special_mod_num'] = max_specialmod + global_ui_settings["library"]["min_special_mod_num"] = min_specialmod + global_ui_settings["library"]["max_special_mod_num"] = max_specialmod st.write("Special modifications cannot modify AAs at:") st.write("*e.g. GlyGly@K will not occur at C-term Lys/K*") - global_ui_settings['library'][ - 'special_mods_cannot_modify_pep_n_term' - ] = bool( - st.checkbox(label='N-term', - value=global_ui_settings['library'][ - 'special_mods_cannot_modify_pep_n_term' - ]) - ) - global_ui_settings['library'][ - 'special_mods_cannot_modify_pep_c_term' - ] = bool( - st.checkbox(label='C-term', - value=global_ui_settings['library'][ - 'special_mods_cannot_modify_pep_c_term' - ]) + global_ui_settings["library"]["special_mods_cannot_modify_pep_n_term"] = bool( + st.checkbox( + label="N-term", + value=global_ui_settings["library"][ + "special_mods_cannot_modify_pep_n_term" + ], + ) ) + global_ui_settings["library"]["special_mods_cannot_modify_pep_c_term"] = bool( + st.checkbox( + label="C-term", + value=global_ui_settings["library"][ + "special_mods_cannot_modify_pep_c_term" + ], + ) + ) + def labeling_options(): def _concat_df_dict(d): df_list = [] for channel, mods in d.items(): - _df = MOD_DF.loc[mods,['mod_name','composition','mass']] - _df['labeling_channel'] = channel + _df = MOD_DF.loc[mods, ["mod_name", "composition", "mass"]] + _df["labeling_channel"] = channel df_list.append(_df) if len(df_list) == 0: return pd.DataFrame() else: return pd.concat(df_list, ignore_index=True) + def _clear_all(): - global_ui_settings['library']['labeling_channels'] = {} + global_ui_settings["library"]["labeling_channels"] = {} st.session_state.select_labeling = [] - st.session_state.labeling_channel_id = '' + st.session_state.labeling_channel_id = "" return - st.write('#### Peptide labeling') - st.write('*For multiplex-DIA (mDIA) workflow*') + + st.write("#### Peptide labeling") + st.write("*For multiplex-DIA (mDIA) workflow*") labeling_expander = st.expander( - label='Labeling channels', - expanded=len(global_ui_settings['library']['labeling_channels'])>0, + label="Labeling channels", + expanded=len(global_ui_settings["library"]["labeling_channels"]) > 0, ) with labeling_expander: with st.form(key="Peptide labeling"): - channel = st.text_input(label="Channel",key='labeling_channel_id') + channel = st.text_input(label="Channel", key="labeling_channel_id") mods = st.multiselect( - label='Please select labeling modifications', + label="Please select labeling modifications", options=MOD_DF.index.values, - key='select_labeling' + key="select_labeling", ) if channel and len(mods) > 0: try: channel = int(channel) except ValueError: pass - global_ui_settings['library']['labeling_channels'][channel] = mods + global_ui_settings["library"]["labeling_channels"][channel] = mods st.form_submit_button(label="Add selected labeling") st.write("Selected labeling modifications:") st.dataframe( - _concat_df_dict(global_ui_settings['library']['labeling_channels']), - hide_index=True + _concat_df_dict(global_ui_settings["library"]["labeling_channels"]), + hide_index=True, ) - st.button(label='Clear all labeling', on_click=_clear_all) + st.button(label="Clear all labeling", on_click=_clear_all) + def choose_precursor_charge(): - from_charge = st.number_input(label='Min precursor charge', min_value = 1, max_value = 4, value = global_ui_settings['library']['min_precursor_charge'], step = 1) + from_charge = st.number_input( + label="Min precursor charge", + min_value=1, + max_value=4, + value=global_ui_settings["library"]["min_precursor_charge"], + step=1, + ) to_charge = st.number_input( - label='Max precursor charge', - min_value = from_charge, max_value = 7, value = global_ui_settings['library']['max_precursor_charge'], step = 1 + label="Max precursor charge", + min_value=from_charge, + max_value=7, + value=global_ui_settings["library"]["max_precursor_charge"], + step=1, ) - global_ui_settings['library']['min_precursor_charge'] = from_charge - global_ui_settings['library']['max_precursor_charge'] = to_charge + global_ui_settings["library"]["min_precursor_charge"] = from_charge + global_ui_settings["library"]["max_precursor_charge"] = to_charge + def choose_precursor_mz(): - min_precursor_mz = st.number_input(label='Min precursor mz', value = global_ui_settings['library']['min_precursor_mz']) - global_ui_settings['library']['min_precursor_mz'] = min_precursor_mz - max_precursor_mz = st.number_input(label='Max precursor mz', min_value = min_precursor_mz, value = global_ui_settings['library']['max_precursor_mz']) - global_ui_settings['library']['max_precursor_mz'] = max_precursor_mz + min_precursor_mz = st.number_input( + label="Min precursor mz", + value=global_ui_settings["library"]["min_precursor_mz"], + ) + global_ui_settings["library"]["min_precursor_mz"] = min_precursor_mz + max_precursor_mz = st.number_input( + label="Max precursor mz", + min_value=min_precursor_mz, + value=global_ui_settings["library"]["max_precursor_mz"], + ) + global_ui_settings["library"]["max_precursor_mz"] = max_precursor_mz + def add_decoy(): - global_ui_settings['library']['decoy'] = st.selectbox( - label='Decoy method (Protein-level decoy only works for fasta)', - options=global_ui_settings['library']['decoy_choices'], - index = global_ui_settings['library']['decoy_choices'].index( - global_ui_settings['library']['decoy'] - ) + global_ui_settings["library"]["decoy"] = st.selectbox( + label="Decoy method (Protein-level decoy only works for fasta)", + options=global_ui_settings["library"]["decoy_choices"], + index=global_ui_settings["library"]["decoy_choices"].index( + global_ui_settings["library"]["decoy"] + ), ) + def choose_protease(): def on_custom_protease(): if ( - len(st.session_state['custom_protease_text']) <= 1 - or st.session_state['custom_protease_text'] in protease_dict + len(st.session_state["custom_protease_text"]) <= 1 + or st.session_state["custom_protease_text"] in protease_dict or ( - '(' in st.session_state['custom_protease_text'] and - ')' in st.session_state['custom_protease_text'] and - len(st.session_state['custom_protease_text']) >= 3 + "(" in st.session_state["custom_protease_text"] + and ")" in st.session_state["custom_protease_text"] + and len(st.session_state["custom_protease_text"]) >= 3 ) ): return else: - st.session_state['custom_protease_text'] = '' + st.session_state["custom_protease_text"] = "" print( - st.session_state['custom_protease_text'], - global_ui_settings['library']['fasta']['protease'] + st.session_state["custom_protease_text"], + global_ui_settings["library"]["fasta"]["protease"], ) custom_protease = st.text_input( label="Custom protease (name or regular expression, use `Common protease` below if empty)", - value=global_ui_settings['library']['fasta']['protease'], + value=global_ui_settings["library"]["fasta"]["protease"], key="custom_protease_text", - on_change=on_custom_protease + on_change=on_custom_protease, ) protease = st.selectbox( - label='Common protease (set `Custom protease` above as empty to enable this)', - options=global_ui_settings['library']['fasta']['protease_choices'], - index=0, disabled=(custom_protease!='') + label="Common protease (set `Custom protease` above as empty to enable this)", + options=global_ui_settings["library"]["fasta"]["protease_choices"], + index=0, + disabled=(custom_protease != ""), ) if custom_protease: - global_ui_settings['library']['fasta']['protease'] = custom_protease + global_ui_settings["library"]["fasta"]["protease"] = custom_protease else: - global_ui_settings['library']['fasta']['protease'] = protease + global_ui_settings["library"]["fasta"]["protease"] = protease st.text(f"Selected protease: {global_ui_settings['library']['fasta']['protease']}") - max_miss_cleave = st.number_input(label='Max number of miss cleavages',value = global_ui_settings['library']['fasta']['max_miss_cleave']) - global_ui_settings['library']['fasta']['max_miss_cleave'] = max_miss_cleave + max_miss_cleave = st.number_input( + label="Max number of miss cleavages", + value=global_ui_settings["library"]["fasta"]["max_miss_cleave"], + ) + global_ui_settings["library"]["fasta"]["max_miss_cleave"] = max_miss_cleave + def choose_peptide_len(): - min_peptide_len = st.number_input(label='Min peptide length', value = global_ui_settings['library']['min_peptide_len']) - max_peptide_len = st.number_input(label='Max peptide length', min_value = min_peptide_len, value = global_ui_settings['library']['max_peptide_len']) - global_ui_settings['library']['min_peptide_len'] = min_peptide_len - global_ui_settings['library']['max_peptide_len'] = max_peptide_len + min_peptide_len = st.number_input( + label="Min peptide length", + value=global_ui_settings["library"]["min_peptide_len"], + ) + max_peptide_len = st.number_input( + label="Max peptide length", + min_value=min_peptide_len, + value=global_ui_settings["library"]["max_peptide_len"], + ) + global_ui_settings["library"]["min_peptide_len"] = min_peptide_len + global_ui_settings["library"]["max_peptide_len"] = max_peptide_len + def choose_frag_types(): frag_types = st.multiselect( - label='Fragment types',options=(global_ui_settings['model']['frag_types']), - default = global_ui_settings['library']['frag_types'] + label="Fragment types", + options=(global_ui_settings["model"]["frag_types"]), + default=global_ui_settings["library"]["frag_types"], ) - global_ui_settings['library']['frag_types'] = frag_types - max_frag_charge = st.number_input(label='Max fragment charge',min_value = 1, max_value = 2, value = global_ui_settings['library']['max_frag_charge'], step = 1) - global_ui_settings['library']['max_frag_charge'] = max_frag_charge + global_ui_settings["library"]["frag_types"] = frag_types + max_frag_charge = st.number_input( + label="Max fragment charge", + min_value=1, + max_value=2, + value=global_ui_settings["library"]["max_frag_charge"], + step=1, + ) + global_ui_settings["library"]["max_frag_charge"] = max_frag_charge + def output_tsv(): - min_fragment_mz = st.number_input(label='Min fragment mz:', value = global_ui_settings['library']['output_tsv']['min_fragment_mz']) - global_ui_settings['library']['output_tsv']['min_fragment_mz'] = min_fragment_mz - max_fragment_mz = st.number_input(label='Max fragment mz:', min_value = min_fragment_mz, value = global_ui_settings['library']['output_tsv']['max_fragment_mz']) - global_ui_settings['library']['output_tsv']['max_fragment_mz'] = max_fragment_mz + min_fragment_mz = st.number_input( + label="Min fragment mz:", + value=global_ui_settings["library"]["output_tsv"]["min_fragment_mz"], + ) + global_ui_settings["library"]["output_tsv"]["min_fragment_mz"] = min_fragment_mz + max_fragment_mz = st.number_input( + label="Max fragment mz:", + min_value=min_fragment_mz, + value=global_ui_settings["library"]["output_tsv"]["max_fragment_mz"], + ) + global_ui_settings["library"]["output_tsv"]["max_fragment_mz"] = max_fragment_mz min_relative_intensity = st.number_input( - label='Min relative intensity:', - value = global_ui_settings['library']['output_tsv']['min_relative_intensity'], + label="Min relative intensity:", + value=global_ui_settings["library"]["output_tsv"]["min_relative_intensity"], step=0.0001, - format='%0.4f' + format="%0.4f", + ) + global_ui_settings["library"]["output_tsv"]["min_relative_intensity"] = ( + min_relative_intensity ) - global_ui_settings['library']['output_tsv']['min_relative_intensity'] = min_relative_intensity keep_higest_k_peaks = st.number_input( - label='Number of highest peaks to keep:', - value = global_ui_settings['library']['output_tsv']['keep_higest_k_peaks'] + label="Number of highest peaks to keep:", + value=global_ui_settings["library"]["output_tsv"]["keep_higest_k_peaks"], + ) + global_ui_settings["library"]["output_tsv"]["keep_higest_k_peaks"] = ( + keep_higest_k_peaks + ) + global_ui_settings["library"]["output_tsv"]["translate_mod_to_unimod_id"] = bool( + st.checkbox( + label="Translate modifications to Unimod ids", + value=global_ui_settings["library"]["output_tsv"][ + "translate_mod_to_unimod_id" + ], + ) ) - global_ui_settings['library']['output_tsv']['keep_higest_k_peaks'] = keep_higest_k_peaks - global_ui_settings['library']['output_tsv']['translate_mod_to_unimod_id']=bool( - st.checkbox(label='Translate modifications to Unimod ids', - value=global_ui_settings['library']['output_tsv']['translate_mod_to_unimod_id'] - )) + def show(): st.write("# Library Prediction") - st.write('### Input') + st.write("### Input") infile_type = file_type_selectbox( - ui_label='Input file type', - st_key='lib_input_type', - default_type=global_ui_settings['library']['infile_type'], - monitor_files=global_ui_settings['library']['infiles'], - choices=global_ui_settings['library']['infile_type_choices'], - index=global_ui_settings['library']['infile_type_choices'].index( - global_ui_settings['library']['infile_type'] - ) + ui_label="Input file type", + st_key="lib_input_type", + default_type=global_ui_settings["library"]["infile_type"], + monitor_files=global_ui_settings["library"]["infiles"], + choices=global_ui_settings["library"]["infile_type_choices"], + index=global_ui_settings["library"]["infile_type_choices"].index( + global_ui_settings["library"]["infile_type"] + ), ) - global_ui_settings['library']['infile_type'] = infile_type - - if infile_type != 'fasta': - df = pd.DataFrame({ - 'sequence': ['ACDEFGHIK','LMNPQRSTVK','WYVSTR'], - 'mods': ['Carbamidomethyl@C','Acetyl@Protein N-term;Phospho@S',''], - 'mod_sites': ['2','0;7',''], - 'charge': [2,3,1], - }) + global_ui_settings["library"]["infile_type"] = infile_type + + if infile_type != "fasta": + df = pd.DataFrame( + { + "sequence": ["ACDEFGHIK", "LMNPQRSTVK", "WYVSTR"], + "mods": ["Carbamidomethyl@C", "Acetyl@Protein N-term;Phospho@S", ""], + "mod_sites": ["2", "0;7", ""], + "charge": [2, 3, 1], + } + ) infile_expander = st.expander("Input file examples") with infile_expander: - st.write('`sequence_table`:') - st.dataframe(df[['sequence']], hide_index=True) - st.write('`peptide_table` with alphabase PTMs:') - st.dataframe(df[['sequence','mods','mod_sites']], hide_index=True) - st.write('`precursor_table` with alphabase PTMs:') - st.dataframe(df[['sequence','mods','mod_sites','charge']], hide_index=True) + st.write("`sequence_table`:") + st.dataframe(df[["sequence"]], hide_index=True) + st.write("`peptide_table` with alphabase PTMs:") + st.dataframe(df[["sequence", "mods", "mod_sites"]], hide_index=True) + st.write("`precursor_table` with alphabase PTMs:") + st.dataframe( + df[["sequence", "mods", "mod_sites", "charge"]], hide_index=True + ) infile_ext_dict = { - 'fasta': ['.fasta','.fa'], - 'sequence_table': ['tsv','txt','csv'], - 'peptide_table': ['tsv','txt','csv'], - 'precursor_table': ['tsv','txt','csv'], + "fasta": [".fasta", ".fa"], + "sequence_table": ["tsv", "txt", "csv"], + "peptide_table": ["tsv", "txt", "csv"], + "precursor_table": ["tsv", "txt", "csv"], } - if infile_type == 'fasta': - global_ui_settings['library']['fasta']['add_contaminants'] = bool(st.checkbox( - label='Add fasta of contaminants', - value=global_ui_settings['library']['fasta']['add_contaminants'] - )) + if infile_type == "fasta": + global_ui_settings["library"]["fasta"]["add_contaminants"] = bool( + st.checkbox( + label="Add fasta of contaminants", + value=global_ui_settings["library"]["fasta"]["add_contaminants"], + ) + ) select_files( - global_ui_settings['library']['infiles'], + global_ui_settings["library"]["infiles"], infile_ext_dict[infile_type], - 'Input sequence files', + "Input sequence files", ) - st.write('### Library settings') + st.write("### Library settings") add_decoy() - if infile_type == 'fasta': + if infile_type == "fasta": choose_protease() mod_options() specialmod_options() - elif infile_type == 'sequence_table': + elif infile_type == "sequence_table": mod_options() specialmod_options() @@ -331,10 +430,10 @@ def show(): st.write("#### Common peptide settings") - if infile_type == 'fasta': + if infile_type == "fasta": choose_peptide_len() - if infile_type in ['fasta','sequence_table','peptide_table']: + if infile_type in ["fasta", "sequence_table", "peptide_table"]: choose_precursor_charge() choose_precursor_mz() @@ -344,52 +443,52 @@ def show(): output_folder = st.text_input( label="Output folder", - value=global_ui_settings['library']['output_folder'].format( - PEPTDEEP_HOME=global_ui_settings['PEPTDEEP_HOME'] - ) + value=global_ui_settings["library"]["output_folder"].format( + PEPTDEEP_HOME=global_ui_settings["PEPTDEEP_HOME"] + ), ) output_folder = os.path.expanduser(output_folder) output_folder = get_posix(output_folder) - global_ui_settings['library']['output_folder'] = output_folder - - global_ui_settings['library']['rt_to_irt'] = bool(st.checkbox( - label='Convert predicted RT to iRT', - value=global_ui_settings['library']['rt_to_irt'] - )) - - global_ui_settings['library']['generate_precursor_isotope'] = bool(st.checkbox( - label="Generate precursor isotopes (don't check this for DiaNN/Spectronaut search)", - value=global_ui_settings['library']['generate_precursor_isotope'] - )) - - tsv_enabled = bool(st.checkbox( - label='Output TSV (for DiaNN/Spectronaut)', - value=global_ui_settings['library']['output_tsv']['enabled'] - )) - global_ui_settings['library']['output_tsv']['enabled'] = tsv_enabled + global_ui_settings["library"]["output_folder"] = output_folder + + global_ui_settings["library"]["rt_to_irt"] = bool( + st.checkbox( + label="Convert predicted RT to iRT", + value=global_ui_settings["library"]["rt_to_irt"], + ) + ) + + global_ui_settings["library"]["generate_precursor_isotope"] = bool( + st.checkbox( + label="Generate precursor isotopes (don't check this for DiaNN/Spectronaut search)", + value=global_ui_settings["library"]["generate_precursor_isotope"], + ) + ) + + tsv_enabled = bool( + st.checkbox( + label="Output TSV (for DiaNN/Spectronaut)", + value=global_ui_settings["library"]["output_tsv"]["enabled"], + ) + ) + global_ui_settings["library"]["output_tsv"]["enabled"] = tsv_enabled st.warning("Writing the TSV file for a big library is very slow") if tsv_enabled: output_tsv() now = datetime.now() current_time = now.strftime("%Y-%m-%d--%H-%M-%S.%f") - task_name = st.text_input(label="Task name", value=f"peptdeep_library_{current_time}") + task_name = st.text_input( + label="Task name", value=f"peptdeep_library_{current_time}" + ) - if st.button(label='Submit for library prediction'): - global_ui_settings['task_workflow'] = ['library'] + if st.button(label="Submit for library prediction"): + global_ui_settings["task_workflow"] = ["library"] if not os.path.exists(output_folder): os.makedirs(output_folder) - yaml_path = f'{queue_folder}/{task_name}.yaml' - save_yaml( - yaml_path, global_ui_settings - ) - save_yaml( - os.path.join( - output_folder, - f'{task_name}.yaml' - ), - global_ui_settings - ) - st.write(f'`library` task saved as `{os.path.expanduser(yaml_path)}`') + yaml_path = f"{queue_folder}/{task_name}.yaml" + save_yaml(yaml_path, global_ui_settings) + save_yaml(os.path.join(output_folder, f"{task_name}.yaml"), global_ui_settings) + st.write(f"`library` task saved as `{os.path.expanduser(yaml_path)}`") diff --git a/peptdeep/webui/main_ui.py b/peptdeep/webui/main_ui.py index 64be704f..f2baccbd 100644 --- a/peptdeep/webui/main_ui.py +++ b/peptdeep/webui/main_ui.py @@ -8,15 +8,19 @@ import peptdeep from peptdeep.webui import ( - model_ui, startpage, rescore_ui, - library_ui, transfer_ui, - settings_ui, server_ui, + model_ui, + startpage, + rescore_ui, + library_ui, + transfer_ui, + settings_ui, + server_ui, ) _this_file = __file__ _this_directory = os.path.dirname(_this_file) -LOGO_PATH = os.path.join(_this_directory, 'logos', 'peptdeep.png') -ICON_PATH = os.path.join(_this_directory, 'logos', 'peptdeep.ico') +LOGO_PATH = os.path.join(_this_directory, "logos", "peptdeep.png") +ICON_PATH = os.path.join(_this_directory, "logos", "peptdeep.ico") image = Image.open(LOGO_PATH) icon = Image.open(ICON_PATH) computer_name = socket.gethostname() @@ -36,17 +40,17 @@ """ st.markdown(hide_streamlit_style, unsafe_allow_html=True) -st.sidebar.image(image, width = 300) +st.sidebar.image(image, width=300) st.sidebar.code(f"AlphaPeptDeep (PeptDeep) {peptdeep.__version__} \n{computer_name}") sidebar = { - 'Start page': startpage.show, - 'Model': model_ui.show, - 'Transfer': transfer_ui.show, - 'Library': library_ui.show, + "Start page": startpage.show, + "Model": model_ui.show, + "Transfer": transfer_ui.show, + "Library": library_ui.show, # 'Rescore': rescore_ui.show, - 'Server': server_ui.show, - 'Settings': settings_ui.show, + "Server": server_ui.show, + "Settings": settings_ui.show, } menu = st.sidebar.radio("", list(sidebar.keys())) @@ -54,5 +58,5 @@ if menu: sidebar[menu]() -link = f'[PeptDeep on GitHub]({peptdeep.__github__})' +link = f"[PeptDeep on GitHub]({peptdeep.__github__})" st.sidebar.markdown(link, unsafe_allow_html=True) diff --git a/peptdeep/webui/model_ui.py b/peptdeep/webui/model_ui.py index 306f0a57..b00eff7e 100644 --- a/peptdeep/webui/model_ui.py +++ b/peptdeep/webui/model_ui.py @@ -4,52 +4,80 @@ global_ui_settings = global_settings + def predict(): - batch_size_ms2 = st.number_input(label='Batch size to predict MS2', value = global_ui_settings['model_mgr']['predict']['batch_size_ms2']) - global_ui_settings['model_mgr']['predict']['batch_size_ms2'] = batch_size_ms2 - batch_size_rt_ccs = st.number_input(label='Batch size to predict RT and CCS', value = global_ui_settings['model_mgr']['predict']['batch_size_rt_ccs']) - global_ui_settings['model_mgr']['predict']['batch_size_rt_ccs'] = batch_size_rt_ccs - - instruments = list(global_ui_settings['model_mgr']['instrument_group'].keys()) - global_ui_settings['model_mgr']['default_instrument'] = st.selectbox( - label='Instrument',options=instruments,index = instruments.index( - global_ui_settings['model_mgr']['default_instrument'] - ) + batch_size_ms2 = st.number_input( + label="Batch size to predict MS2", + value=global_ui_settings["model_mgr"]["predict"]["batch_size_ms2"], + ) + global_ui_settings["model_mgr"]["predict"]["batch_size_ms2"] = batch_size_ms2 + batch_size_rt_ccs = st.number_input( + label="Batch size to predict RT and CCS", + value=global_ui_settings["model_mgr"]["predict"]["batch_size_rt_ccs"], + ) + global_ui_settings["model_mgr"]["predict"]["batch_size_rt_ccs"] = batch_size_rt_ccs + + instruments = list(global_ui_settings["model_mgr"]["instrument_group"].keys()) + global_ui_settings["model_mgr"]["default_instrument"] = st.selectbox( + label="Instrument", + options=instruments, + index=instruments.index(global_ui_settings["model_mgr"]["default_instrument"]), + ) + default_nce = st.number_input( + label="NCE", value=global_ui_settings["model_mgr"]["default_nce"] + ) + global_ui_settings["model_mgr"]["default_nce"] = default_nce + + verbose = st.checkbox( + label="Verbose", value=global_ui_settings["model_mgr"]["predict"]["verbose"] ) - default_nce = st.number_input(label='NCE', value = global_ui_settings['model_mgr']['default_nce']) - global_ui_settings['model_mgr']['default_nce'] = default_nce + global_ui_settings["model_mgr"]["predict"]["verbose"] = verbose + multiprocessing = st.checkbox( + label="Multiprocessing (if no GPUs)", + value=global_ui_settings["model_mgr"]["predict"]["multiprocessing"], + ) + global_ui_settings["model_mgr"]["predict"]["multiprocessing"] = multiprocessing - verbose = st.checkbox(label='Verbose', value=global_ui_settings['model_mgr']['predict']['verbose']) - global_ui_settings['model_mgr']['predict']['verbose'] = verbose - multiprocessing = st.checkbox(label='Multiprocessing (if no GPUs)', value=global_ui_settings['model_mgr']['predict']['multiprocessing']) - global_ui_settings['model_mgr']['predict']['multiprocessing'] = multiprocessing def model(): - model_url = st.text_input(label='URL (or local path) to download the pre-trained models',value = global_ui_settings['model_url']) - global_ui_settings['model_url'] = model_url + model_url = st.text_input( + label="URL (or local path) to download the pre-trained models", + value=global_ui_settings["model_url"], + ) + global_ui_settings["model_url"] = model_url - global_ui_settings['model_mgr']['external_ms2_model'] = st.text_input(label='External MS2 model', value=global_ui_settings['model_mgr']['external_ms2_model']) - global_ui_settings['model_mgr']['external_rt_model'] = st.text_input(label='External RT model', value=global_ui_settings['model_mgr']['external_rt_model']) - global_ui_settings['model_mgr']['external_ccs_model'] = st.text_input(label='External CCS model', value=global_ui_settings['model_mgr']['external_ccs_model']) + global_ui_settings["model_mgr"]["external_ms2_model"] = st.text_input( + label="External MS2 model", + value=global_ui_settings["model_mgr"]["external_ms2_model"], + ) + global_ui_settings["model_mgr"]["external_rt_model"] = st.text_input( + label="External RT model", + value=global_ui_settings["model_mgr"]["external_rt_model"], + ) + global_ui_settings["model_mgr"]["external_ccs_model"] = st.text_input( + label="External CCS model", + value=global_ui_settings["model_mgr"]["external_ccs_model"], + ) def show(): - st.write("# Model Configuration") - st.write('### Pre-trained models') + st.write("### Pre-trained models") model() - global_ui_settings['model_mgr']['model_type'] = st.selectbox( - label='Model type', - options=global_ui_settings['model_mgr']['model_choices'], - index = global_ui_settings['model_mgr']['model_choices'].index( - global_ui_settings['model_mgr']['model_type'] - ) + global_ui_settings["model_mgr"]["model_type"] = st.selectbox( + label="Model type", + options=global_ui_settings["model_mgr"]["model_choices"], + index=global_ui_settings["model_mgr"]["model_choices"].index( + global_ui_settings["model_mgr"]["model_type"] + ), ) - global_ui_settings['model_mgr']['mask_modloss'] = bool( - st.checkbox(label='mask modloss (this will set intensity values to zero for neutral loss of PTMs (e.g. -98 Da for Phospho@S/T))', - value = global_ui_settings['model_mgr']['mask_modloss']) + global_ui_settings["model_mgr"]["mask_modloss"] = bool( + st.checkbox( + label="mask modloss (this will set intensity values to zero for neutral loss of PTMs (e.g. -98 Da for Phospho@S/T))", + value=global_ui_settings["model_mgr"]["mask_modloss"], + ) ) - st.write('### Prediction') + st.write("### Prediction") predict() diff --git a/peptdeep/webui/rescore_ui.py b/peptdeep/webui/rescore_ui.py index fdc76f3f..f586d885 100644 --- a/peptdeep/webui/rescore_ui.py +++ b/peptdeep/webui/rescore_ui.py @@ -8,42 +8,39 @@ global_ui_settings = global_settings + def show(): """Streamlit page that displays information on how to rescore.""" st.write("# DDA Rescoring") - MS_type = st.selectbox( - label='MS file type', - options=('Raw', 'MGF', 'ms_data.hdf') - ) - raw_folder = st.text_input(label='Raw folder') + MS_type = st.selectbox(label="MS file type", options=("Raw", "MGF", "ms_data.hdf")) + raw_folder = st.text_input(label="Raw folder") if os.path.isdir(raw_folder): st.write(f"### MS files in {raw_folder}") - raw_files = files_in_folder_pandas(raw_folder,MS_type) + raw_files = files_in_folder_pandas(raw_folder, MS_type) st.dataframe(raw_files) else: st.write(f"Invalid folder: {raw_folder}") - result_folder = st.text_input(label='Result folder') - #st.write('The current result folder is', result_folder) - PSM_type = st.selectbox(label='PSM file type', - options=('AlphaPept', 'pFind') - ) - if PSM_type == 'AlphaPept': - psm_type = 'ms_data.hdf' - elif PSM_type == 'pFind': - psm_type = 'spectra' + result_folder = st.text_input(label="Result folder") + # st.write('The current result folder is', result_folder) + PSM_type = st.selectbox(label="PSM file type", options=("AlphaPept", "pFind")) + if PSM_type == "AlphaPept": + psm_type = "ms_data.hdf" + elif PSM_type == "pFind": + psm_type = "spectra" if os.path.isdir(result_folder): - st.write(f"### PSM files in {result_folder}") - result_files = files_in_folder_pandas(result_folder,psm_type) + result_files = files_in_folder_pandas(result_folder, psm_type) st.dataframe(result_files) st.warning("We are still working on Rescore GUI panel.") st.warning("For command line users, please use `peptdeep rescore` for rescoring.") - st.warning("For Python users, please use `peptdeep.pipeline_api.rescore()` for rescoring.") + st.warning( + "For Python users, please use `peptdeep.pipeline_api.rescore()` for rescoring." + ) diff --git a/peptdeep/webui/server.py b/peptdeep/webui/server.py index 55d0de1e..fae9d42b 100644 --- a/peptdeep/webui/server.py +++ b/peptdeep/webui/server.py @@ -12,30 +12,32 @@ from peptdeep.settings import global_settings, load_global_settings from peptdeep.utils import logging + def get_yamls(folder): ymls = [] for file in os.listdir(folder): - if file.endswith('.yaml'): + if file.endswith(".yaml"): ymls.append(os.path.join(folder, file)) ymls.sort(key=lambda x: os.path.getmtime(x)) return ymls + def _create_dir(dir): if not os.path.isdir(dir): os.makedirs(dir) -home_folder = os.path.expanduser( - global_settings['PEPTDEEP_HOME'] -) -queue_folder = f'{home_folder}/tasks/queue' -done_folder = f'{home_folder}/tasks/done' -failed_folder = f'{home_folder}/tasks/failed' +home_folder = os.path.expanduser(global_settings["PEPTDEEP_HOME"]) + +queue_folder = f"{home_folder}/tasks/queue" +done_folder = f"{home_folder}/tasks/done" +failed_folder = f"{home_folder}/tasks/failed" _create_dir(queue_folder) _create_dir(done_folder) _create_dir(failed_folder) + def serve(): files = [] echo_waiting = True @@ -45,31 +47,33 @@ def serve(): yaml_file = files.pop(0) print(f"[PeptDeep] Starting a new job '{yaml_file}'...") - running_txt = f'{home_folder}/tasks/running.txt' - with open(running_txt,'w') as f: + running_txt = f"{home_folder}/tasks/running.txt" + with open(running_txt, "w") as f: f.write(yaml_file) try: load_global_settings(yaml_file) - if global_settings['task_workflow'][0] == 'train': + if global_settings["task_workflow"][0] == "train": print("[PeptDeep] Transfer learning ... ") transfer_learn() - elif global_settings['task_workflow'][0] == 'library': + elif global_settings["task_workflow"][0] == "library": print("[PeptDeep] Predicting library ... ") generate_library() # elif global_settings['task_workflow'] == 'rescore': # print("[PeptDeep] Rescoring PSMs ... ") # rescore() else: - logging.warning(f"[PeptDeep] Unknown task type `{global_settings['task_workflow']}`, skip ... ") + logging.warning( + f"[PeptDeep] Unknown task type `{global_settings['task_workflow']}`, skip ... " + ) continue if os.path.isfile(yaml_file): shutil.move( yaml_file, - os.path.join(done_folder, os.path.basename(yaml_file)) + os.path.join(done_folder, os.path.basename(yaml_file)), ) except KeyboardInterrupt as e: - with open(running_txt,'w') as f: + with open(running_txt, "w") as f: f.write("") raise e except Exception as e: @@ -79,24 +83,25 @@ def serve(): os.path.join(failed_folder, os.path.basename(yaml_file)), ) print(e) - with open(running_txt,'w') as f: + with open(running_txt, "w") as f: f.write("") - echo_waiting=True + echo_waiting = True else: if echo_waiting: print("*********************************") print("[PeptDeep] Waiting for tasks ... ") print("*********************************") - echo_waiting=False + echo_waiting = False time.sleep(3) + class PeptDeepServer: def __init__(self): - self.process:mp.Process = None + self.process: mp.Process = None self._process_file = os.path.join( - global_settings['PEPTDEEP_HOME'], - 'tasks/serve_pid.txt', + global_settings["PEPTDEEP_HOME"], + "tasks/serve_pid.txt", ) def start(self): @@ -104,7 +109,7 @@ def start(self): self.process = mp.Process(target=serve) self.process.start() - with open(self._process_file, 'w') as f: + with open(self._process_file, "w") as f: f.write(str(self.process.pid)) def terminate(self): @@ -113,9 +118,10 @@ def terminate(self): self.process.kill() self.process = None - os.replace(self._process_file, self._process_file[:-3]+'prev.txt') + os.replace(self._process_file, self._process_file[:-3] + "prev.txt") def __del__(self): self.terminate() + _server = PeptDeepServer() diff --git a/peptdeep/webui/server_ui.py b/peptdeep/webui/server_ui.py index 571ab9bf..d27d79e3 100644 --- a/peptdeep/webui/server_ui.py +++ b/peptdeep/webui/server_ui.py @@ -5,11 +5,10 @@ from alphabase.yaml_utils import load_yaml -from peptdeep.webui.server import ( - get_yamls, queue_folder -) +from peptdeep.webui.server import get_yamls, queue_folder from peptdeep.webui.ui_utils import files_in_pandas + def display_tasks(): st.write("## Tasks in the queue") @@ -20,13 +19,13 @@ def display_tasks(): tasks = [] for _yml in yamls: _dict = load_yaml(_yml) - if 'task_workflow' in _dict: - task_workflow = _dict['task_workflow'] + if "task_workflow" in _dict: + task_workflow = _dict["task_workflow"] else: - task_workflow = ['library'] + task_workflow = ["library"] tasks.append(task_workflow) - df['Task Type'] = tasks + df["Task Type"] = tasks st.dataframe(df) @@ -35,15 +34,13 @@ def display_tasks(): with st.expander(label="Remove tasks in the task queue"): yaml_fname = st.text_input(label="Task yaml file to delete") if st.button(label="Remove this yaml file") and len(df) > 0: - if (df["File Path"].values[0] == yaml_fname): + if df["File Path"].values[0] == yaml_fname: st.write("Cannot remove the task which is currently running") - elif ( - os.path.isfile(yaml_fname) and - yaml_fname.startswith(queue_folder) - ): + elif os.path.isfile(yaml_fname) and yaml_fname.startswith(queue_folder): os.remove(yaml_fname) st.write(f"Task {yaml_fname} has been removed") + def show(): st.write("# AlphaPeptDeep Server") @@ -66,7 +63,7 @@ def show(): st.write("### Hardware utilization") - c1,c2 = st.columns(2) + c1, c2 = st.columns(2) c1.text("RAM") ram = c1.progress( 1 - psutil.virtual_memory().available / psutil.virtual_memory().total diff --git a/peptdeep/webui/settings_ui.py b/peptdeep/webui/settings_ui.py index 43def53c..a2069ee3 100644 --- a/peptdeep/webui/settings_ui.py +++ b/peptdeep/webui/settings_ui.py @@ -4,9 +4,7 @@ from io import StringIO import multiprocessing -from alphabase.constants.modification import ( - MOD_DF, keep_modloss_by_importance -) +from alphabase.constants.modification import MOD_DF, keep_modloss_by_importance from peptdeep.settings import ( update_settings, @@ -17,53 +15,54 @@ global_ui_settings = global_settings + def add_user_mods(): st.write("#### User-defined modifications") - st.write('PeptDeep supports modifications those are not in UniMod') + st.write("PeptDeep supports modifications those are not in UniMod") user_mod_expander = st.expander(label="Add user-defined modifications") with user_mod_expander: mod_name = st.text_input( - label='User-defined modification name, e.g. Hello@K', - key='user_mod_name' + label="User-defined modification name, e.g. Hello@K", key="user_mod_name" ).strip() composition = st.text_input( - label='The modification composition, e.g. H(1)P(1)O(3)', - key='user_mod_comp' + label="The modification composition, e.g. H(1)P(1)O(3)", key="user_mod_comp" ).strip() modloss_composition = st.text_input( label="The modification loss composition, e.g. H(3)P(1)O(4)", - key='user_mod_loss' + key="user_mod_loss", ).strip() if mod_name: - global_ui_settings['common']['user_defined_modifications'][mod_name] = { - 'composition': composition, - 'modloss_composition': modloss_composition, + global_ui_settings["common"]["user_defined_modifications"][mod_name] = { + "composition": composition, + "modloss_composition": modloss_composition, } - st.dataframe(pd.DataFrame().from_dict( - global_ui_settings['common']['user_defined_modifications'], - orient = 'index', - )) + st.dataframe( + pd.DataFrame().from_dict( + global_ui_settings["common"]["user_defined_modifications"], + orient="index", + ) + ) def _clear_user_mods(): - global_ui_settings['common']['user_defined_modifications'] = {} - st.session_state.user_mod_name = '' - st.session_state.user_mod_comp = '' - st.session_state.user_mod_loss = '' + global_ui_settings["common"]["user_defined_modifications"] = {} + st.session_state.user_mod_name = "" + st.session_state.user_mod_comp = "" + st.session_state.user_mod_loss = "" - st.button(label='Clear all user modifications', - on_click=_clear_user_mods - ) + st.button(label="Clear all user modifications", on_click=_clear_user_mods) - if st.button(label='Add user modifications into AlphaBase'): + if st.button(label="Add user modifications into AlphaBase"): add_user_defined_modifications() st.write("Check last n+2 modifications:") - st.dataframe(MOD_DF.tail( - len(global_ui_settings['common'][ - 'user_defined_modifications' - ])+2 - ), hide_index=True) + st.dataframe( + MOD_DF.tail( + len(global_ui_settings["common"]["user_defined_modifications"]) + 2 + ), + hide_index=True, + ) + def show(): load_settings_gui() @@ -73,68 +72,94 @@ def show(): add_user_mods() - ms2_ppm = st.checkbox(label='MS2 ppm (otherwise Da)', value=global_ui_settings['peak_matching']['ms2_ppm']) - global_ui_settings['peak_matching']['ms2_ppm'] = ms2_ppm - ms2_tol_value = st.number_input(label='MS2 tolerance', value = global_ui_settings['peak_matching']['ms2_tol_value'], step = 1.0) - global_ui_settings['peak_matching']['ms2_tol_value'] = ms2_tol_value + ms2_ppm = st.checkbox( + label="MS2 ppm (otherwise Da)", + value=global_ui_settings["peak_matching"]["ms2_ppm"], + ) + global_ui_settings["peak_matching"]["ms2_ppm"] = ms2_ppm + ms2_tol_value = st.number_input( + label="MS2 tolerance", + value=global_ui_settings["peak_matching"]["ms2_tol_value"], + step=1.0, + ) + global_ui_settings["peak_matching"]["ms2_tol_value"] = ms2_tol_value - ms1_ppm = st.checkbox(label='MS1 ppm (otherwise Da)', value=global_ui_settings['peak_matching']['ms1_ppm']) - global_ui_settings['peak_matching']['ms1_ppm'] = ms1_ppm - ms1_tol_value = st.number_input(label='MS1 tolerance', value = global_ui_settings['peak_matching']['ms1_tol_value'], step = 1.0) - global_ui_settings['peak_matching']['ms1_tol_value'] = ms1_tol_value + ms1_ppm = st.checkbox( + label="MS1 ppm (otherwise Da)", + value=global_ui_settings["peak_matching"]["ms1_ppm"], + ) + global_ui_settings["peak_matching"]["ms1_ppm"] = ms1_ppm + ms1_tol_value = st.number_input( + label="MS1 tolerance", + value=global_ui_settings["peak_matching"]["ms1_tol_value"], + step=1.0, + ) + global_ui_settings["peak_matching"]["ms1_tol_value"] = ms1_tol_value cpu_count = multiprocessing.cpu_count() - thread_num = st.number_input(label='Thread number', + thread_num = st.number_input( + label="Thread number", value=min( - global_ui_settings['thread_num'], - cpu_count, - global_settings['MAX_THREADS'] + global_ui_settings["thread_num"], cpu_count, global_settings["MAX_THREADS"] ), - max_value=min( - cpu_count, global_settings['MAX_THREADS'] - ), - step=1 + max_value=min(cpu_count, global_settings["MAX_THREADS"]), + step=1, ) - global_ui_settings['thread_num'] = thread_num + global_ui_settings["thread_num"] = thread_num - global_ui_settings['torch_device']['device_type'] = st.selectbox( - label='Computing devices', - options=global_ui_settings['torch_device']['device_type_choices'], - index = global_ui_settings['torch_device']['device_type_choices'].index( - global_ui_settings['torch_device']['device_type'] - ) + global_ui_settings["torch_device"]["device_type"] = st.selectbox( + label="Computing devices", + options=global_ui_settings["torch_device"]["device_type_choices"], + index=global_ui_settings["torch_device"]["device_type_choices"].index( + global_ui_settings["torch_device"]["device_type"] + ), ) - global_ui_settings['log_level'] = st.selectbox( - label='Log level', - options=global_ui_settings['log_level_choices'], - index = global_ui_settings['log_level_choices'].index( - global_ui_settings['log_level'] - ) + global_ui_settings["log_level"] = st.selectbox( + label="Log level", + options=global_ui_settings["log_level_choices"], + index=global_ui_settings["log_level_choices"].index( + global_ui_settings["log_level"] + ), ) - global_ui_settings['common']['modloss_importance_level'] = st.number_input( - 'Modification loss importance level (for a PTM, fragment modloss mz=0 if modloss_importance pd.DataFrame: +def files_in_pandas(files: list) -> pd.DataFrame: """Reads a folder and returns a pandas dataframe containing the files and additional information. Args: folder (str): Path to folder. @@ -23,61 +25,62 @@ def files_in_pandas(files:list) -> pd.DataFrame: pd.DataFrame: PandasDataFrame. """ ctimes = [os.path.getctime(_) for _ in files] - created = [datetime.datetime.fromtimestamp(_).strftime("%Y-%m-%d %H:%M:%S") for _ in ctimes] - sizes = [os.path.getsize(_) / 1024 ** 2 for _ in files] + created = [ + datetime.datetime.fromtimestamp(_).strftime("%Y-%m-%d %H:%M:%S") for _ in ctimes + ] + sizes = [os.path.getsize(_) / 1024**2 for _ in files] df = pd.DataFrame(files, columns=["File Path"]) df["Created Time"] = created df["File Size (Mb)"] = sizes return df + def file_type_selectbox( - ui_label:str, - st_key:str, - default_type:str, - monitor_files:list, - choices:list, + ui_label: str, + st_key: str, + default_type: str, + monitor_files: list, + choices: list, index=0, -)->str: +) -> str: def on_type_change(): - if len(monitor_files)>0: + if len(monitor_files) > 0: st.warning("Please clear all files before changing the file type") st.session_state[st_key] = default_type return st.selectbox( label=ui_label, - options=choices, index=index, + options=choices, + index=index, key=st_key, - on_change=on_type_change + on_change=on_type_change, ) -def update_input_paths(file_list:list): - _list = [ - _ for _ in file_list - if os.path.isfile(_) - ] + +def update_input_paths(file_list: list): + _list = [_ for _ in file_list if os.path.isfile(_)] file_list.clear() file_list.extend(_list) -def select_files( - file_list:list, - file_exts:list, - ui_label="File" -): + +def select_files(file_list: list, file_exts: list, ui_label="File"): if isinstance(file_exts, str): file_exts = [file_exts.lower()] else: file_exts = [ext.lower() for ext in file_exts] - st.write('##### ' + ui_label) - path = st.text_input(label='Input a file or a folder path', key=ui_label+'text_input') + st.write("##### " + ui_label) + path = st.text_input( + label="Input a file or a folder path", key=ui_label + "text_input" + ) path = get_posix(path) - col1, col2, col3 = st.columns([0.5,0.5,2]) + col1, col2, col3 = st.columns([0.5, 0.5, 2]) with col1: - add = st.button(label='Add', key=ui_label+"Add") + add = st.button(label="Add", key=ui_label + "Add") with col2: - remove = st.button(label='Remove', key=ui_label+"Remove") + remove = st.button(label="Remove", key=ui_label + "Remove") with col3: - clear = st.button(label='Clear all files', key=ui_label+"Clear") + clear = st.button(label="Clear all files", key=ui_label + "Clear") if add is True: if os.path.isdir(path): for _file in os.listdir(path): @@ -91,9 +94,10 @@ def select_files( if clear is True: file_list.clear() update_input_paths(file_list) - st.write('##### Selected files') + st.write("##### Selected files") st.dataframe(files_in_pandas(file_list)) + def escape_markdown(text: str) -> str: """Helper function to escape markdown in text. @@ -147,7 +151,8 @@ def files_in_folder(folder: str, ending: str, sort: str = "name") -> list: return files -def files_in_folder_pandas(folder: str, file_type:str=None) -> pd.DataFrame: + +def files_in_folder_pandas(folder: str, file_type: str = None) -> pd.DataFrame: """Reads a folder and returns a pandas dataframe containing the files and additional information. Args: folder (str): Path to folder. @@ -160,11 +165,12 @@ def files_in_folder_pandas(folder: str, file_type:str=None) -> pd.DataFrame: else: file_type = file_type.lower() files = [ - file for file in os.listdir(folder) + file + for file in os.listdir(folder) if file.lower().endswith(f".{file_type}") or file.lower() == file_type ] created = [time.ctime(os.path.getctime(os.path.join(folder, _))) for _ in files] - sizes = [os.path.getsize(os.path.join(folder, _)) / 1024 ** 2 for _ in files] + sizes = [os.path.getsize(os.path.join(folder, _)) / 1024**2 for _ in files] df = pd.DataFrame(files, columns=["File"]) df["Created"] = created df["Filesize (Mb)"] = sizes @@ -220,9 +226,7 @@ def start_process( def check_process( process_path: str, -) -> Tuple[ - bool, Union[str, None], Union[str, None], Union[str, None], bool -]: +) -> Tuple[bool, Union[str, None], Union[str, None], Union[str, None], bool]: """Function to check the status of a process. Reads the process file from the yaml and checks the process id. diff --git a/release/pyinstaller/peptdeep_cli_pyinstaller.py b/release/pyinstaller/peptdeep_cli_pyinstaller.py index 3dac09f6..88e12b46 100644 --- a/release/pyinstaller/peptdeep_cli_pyinstaller.py +++ b/release/pyinstaller/peptdeep_cli_pyinstaller.py @@ -2,6 +2,7 @@ try: import peptdeep.cli import multiprocessing + multiprocessing.freeze_support() peptdeep.cli.run() except KeyboardInterrupt: @@ -9,6 +10,7 @@ except Exception: import traceback import sys + exc_info = sys.exc_info() # Display the *original* exception traceback.print_exception(*exc_info) diff --git a/release/pyinstaller/peptdeep_pyinstaller.py b/release/pyinstaller/peptdeep_pyinstaller.py index 5f8bd174..7aa544f2 100644 --- a/release/pyinstaller/peptdeep_pyinstaller.py +++ b/release/pyinstaller/peptdeep_pyinstaller.py @@ -2,6 +2,7 @@ try: import peptdeep.cli import multiprocessing + multiprocessing.freeze_support() peptdeep.cli._gui() except KeyboardInterrupt: @@ -9,6 +10,7 @@ except Exception: import traceback import sys + exc_info = sys.exc_info() # Display the *original* exception traceback.print_exception(*exc_info) diff --git a/setup.py b/setup.py index 06c75d05..274d242d 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ import setuptools import re import os + # local import peptdeep as package2install @@ -13,6 +14,7 @@ def get_long_description(): long_description = readme_file.read() return long_description + def get_requirements(): extra_requirements = {} requirement_file_names = package2install.__extra_requirements__ @@ -32,7 +34,7 @@ def get_requirements(): for line in requirements_file: extra_requirements[extra_stable].append(line) # conditional requirements like: pywin32; sys_platform=='win32' - line, *conditions = line.split(';') + line, *conditions = line.split(";") requirement, *comparison = re.split("[><=~!]", line) requirement = requirement.strip() requirement = ";".join([requirement] + conditions) From 195f6c48610e51dd893a33444c04a58a33e856ea Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Tue, 2 Jul 2024 13:16:28 +0200 Subject: [PATCH 2/2] #170: add pre-commit instructions to README.md --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 09ec9372..aad92608 100644 --- a/README.md +++ b/README.md @@ -942,8 +942,21 @@ clone the repository and create a [pull request](https://github.com/MannLabs/alphapeptdeep/pulls) with a new branch. For an even more interactive participation, check out the [discussions](https://github.com/MannLabs/alphapeptdeep/discussions) and -the [the Contributors License Agreement](misc/CLA.md). +the [Contributors License Agreement](misc/CLA.md). +### Notes for developers +#### pre-commit hooks +It is highly recommended to use the provided pre-commit hooks, as the CI pipeline enforces all checks therein to +pass in order to merge a branch. + +The hooks need to be installed once by +```bash +pre-commit install +``` +You can run the checks yourself using: +```bash +pre-commit run --all-files +``` ------------------------------------------------------------------------ ## Changelog