From 817805b8f6ab09ee996e18cc6c00c24120977e1f Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 11 Mar 2023 17:26:10 +0200 Subject: [PATCH 01/28] add antibody developability from TDC add Antibody Developability from: https://tdcommons.ai/single_pred_tasks/develop/ for both: TAP SAbDab, Chen et al. Need some one to ensure two list that I convert to two columns. Thanks --- data/SAbDab_Chen/meta.yaml | 57 +++++++++ data/SAbDab_Chen/transform.py | 155 +++++++++++++++++++++++ data/TAP/meta.yaml | 100 +++++++++++++++ data/TAP/transform.py | 225 ++++++++++++++++++++++++++++++++++ 4 files changed, 537 insertions(+) create mode 100644 data/SAbDab_Chen/meta.yaml create mode 100644 data/SAbDab_Chen/transform.py create mode 100644 data/TAP/meta.yaml create mode 100644 data/TAP/transform.py diff --git a/data/SAbDab_Chen/meta.yaml b/data/SAbDab_Chen/meta.yaml new file mode 100644 index 000000000..f0b067d6f --- /dev/null +++ b/data/SAbDab_Chen/meta.yaml @@ -0,0 +1,57 @@ +name: SAbDab_Chen +description: "Antibody data from Chen et al, where they process from the SAbDab. \n\ + \ From an initial dataset of 3816 antibodies, they retained 2426 antibodies\n\ + \ that satisfy the following criteria: 1. \n have both sequence (FASTA)\ + \ and Protein Data Bank (PDB) structure files,\n 2. contain both a heavy\ + \ chain and a light chain, and 3. \n have crystal structures with resolution\ + \ < 3 \xC5. \n The DI label is derived from BIOVIA's pipelines." +targets: +- id: developability + description: functional antibody candidate to be developed into a manufacturable(1), + or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i +identifiers: +- id: antibody_pdb_ID + type: Other + description: anitbody pdb id +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA +license: CC BY 4.0 +links: +- url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication +- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used +- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source +num_points: 2409 +bibtex: +- "@article{Chen2020,\n doi = {10.1101/2020.06.18.159798},\n url\ + \ = {https://doi.org/10.1101/2020.06.18.159798},\n year = {2020},\n \ + \ month = jun,\n publisher = {Cold Spring Harbor Laboratory},\n \ + \ author = {Xingyao Chen and Thomas Dougherty and \n Chan Hong\ + \ and Rachel Schibler and Yi Cong Zhao and \n Reza Sadeghi and Naim Matasci\ + \ and Yi-Chieh Wu and Ian Kerman},\n title = {Predicting Antibody Developability\ + \ from Sequence \n using Machine Learning}}" +- "@article{Dunbar2013,\n doi = {10.1093/nar/gkt1043},\n url = {https://doi.org/10.1093/nar/gkt1043},\n\ + \ year = {2013},\n month = nov,\n publisher = {Oxford\ + \ University Press ({OUP})},\n volume = {42},\n number = {D1},\n\ + \ pages = {D1140--D1146},\n author = {James Dunbar and Konrad\ + \ Krawczyk and Jinwoo Leem \n and Terry Baker and Angelika Fuchs and Guy\ + \ Georges and Jiye Shi and\n Charlotte M. Deane},\n title = {{SAbDab}:\ + \ the structural antibody database},\n journal = {Nucleic Acids Research}}" diff --git a/data/SAbDab_Chen/transform.py b/data/SAbDab_Chen/transform.py new file mode 100644 index 000000000..7b319dd33 --- /dev/null +++ b/data/SAbDab_Chen/transform.py @@ -0,0 +1,155 @@ +import pandas as pd +import yaml +from tdc.single_pred import Develop + + +def get_and_transform_data(): + # get raw data + target_folder = 'SAbDab_Chen' + target_subfolder = 'SAbDab_Chen' + data = Develop(name = target_subfolder) + + # proceed raw data + df = data.get_data() + fields_orig = df.columns.tolist() + assert fields_orig == ['Antibody_ID', 'Antibody', 'Y'] + + fn_data_original = "data_original.csv" + + antibody_list = df.Antibody.tolist() + s2l = lambda list_string: list(map(str.strip, list_string.strip('][').replace("'", "").split(','))) + df['heavy_chain'] = [s2l(x)[0] for x in antibody_list] + df['light_chain'] = [s2l(x)[1] for x in antibody_list] + df = df[['Antibody_ID', 'heavy_chain', 'light_chain', 'Y']] + df.to_csv(fn_data_original, index=False) + + # load raw data and assert columns + df = pd.read_csv(fn_data_original, sep=',') + fields_orig = df.columns.tolist() + assert fields_orig == ['Antibody_ID', 'heavy_chain', 'light_chain', 'Y'] + fields_clean = ['antibody_pdb_ID', 'heavy_chain', 'light_chain', 'developability'] + df.columns = fields_clean + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": f"{target_folder}", # unique identifier, we will also use this for directory names + "description": """Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies + that satisfy the following criteria: 1. + have both sequence (FASTA) and Protein Data Bank (PDB) structure files, + 2. contain both a heavy chain and a light chain, and 3. + have crystal structures with resolution < 3 Å. + The DI label is derived from BIOVIA's pipelines.""", + + + "targets": [ + { + "id": "developability", # name of the column in a tabular dataset + "description": "functional antibody candidate to be developed into a manufacturable(1), or not(0)", + "units": "", # units of the values in this column (leave empty if unitless) + "type": "categorical", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "antibody developability", + "monoclonal anitbody", + "functional antibody candidate", + "manufacturable, stable, safe, and effective antibody drug" + ], + "uris":[ + "https://rb.gy/idkdqp", + "https://rb.gy/b8cx8i", + ], + }, + ], + + "identifiers": [ + { + "id": "antibody_pdb_ID", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody pdb id", # description (optional, except for "Other") + }, + { + "id": "heavy_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody heavy chain amino acid sequence in FASTA", # description (optional, except for "Other") + }, + { + "id": "light_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody light chain amino acid sequence in FASTA", # description (optional, except for "Other") + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1101/2020.06.18.159798", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gkt1043", + "description": "corresponding publication", + }, + { + "url": "https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/", + "description": "corresponding tools used", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al", + "description": "data source", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning}}""", + + """@article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {{SAbDab}: the structural antibody database}, + journal = {Nucleic Acids Research}}""", + + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/TAP/meta.yaml b/data/TAP/meta.yaml new file mode 100644 index 000000000..ec3f14232 --- /dev/null +++ b/data/TAP/meta.yaml @@ -0,0 +1,100 @@ +name: TAP +description: "Immunogenicity, instability, self-association, \n high viscosity,\ + \ polyspecificity, or poor expression can all preclude\n an antibody from\ + \ becoming a therapeutic. Early identification of these\n negative characteristics\ + \ is essential. Akin to the Lipinski guidelines,\n which measure druglikeness\ + \ in small molecules, \n Therapeutic Antibody Profiler (TAP) highlights antibodies\ + \ \n that possess characteristics that are rare/unseen in \n clinical-stage\ + \ mAb therapeutics." +targets: +- id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i +- id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i +- id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i +identifiers: +- id: antibody_name + type: Other + description: anitbody name +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence +license: CC BY 4.0 +links: +- url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source +num_points: 241 +bibtex: +- |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences}} diff --git a/data/TAP/transform.py b/data/TAP/transform.py new file mode 100644 index 000000000..1dcd05aa5 --- /dev/null +++ b/data/TAP/transform.py @@ -0,0 +1,225 @@ +import pandas as pd +import yaml +from tdc.utils import retrieve_label_name_list +from tdc.single_pred import Develop + + +def get_and_transform_data(): + # get raw data + target_folder = 'TAP' + target_subfolder = 'TAP' + label_list = retrieve_label_name_list(target_subfolder) + data = Develop(name = target_subfolder, label_name = label_list[0]) + # proceed raw data + df = pd.read_csv('data/tap.tab',sep='\t') + fields_orig = df.columns.tolist() + assert fields_orig == ['X', 'ID', 'CDR_Length', 'PSH', 'PPC', 'PNC', 'SFvCSP'] + fields_clean = ['antibody_two_sequences', 'antibody_name', 'CDR_Length', 'PSH', 'PPC', 'PNC', 'SFvCSP'] + df.columns = fields_clean + # convert list columns to two columns + antibody_list = df.antibody_two_sequences.tolist() + s2l = lambda list_string: list(map(str.strip, list_string.strip('][').replace("'", "").split(','))) + antibody2list = lambda list_string: [x.strip() for x in s2l(list_string)[0].split('\\n')] + df['heavy_chain'] = [antibody2list(x)[0] for x in antibody_list] + df['light_chain'] = [antibody2list(x)[1] for x in antibody_list] + fn_data_original = 'data_original.csv' + df.to_csv(fn_data_original,index=None) + + # load raw data and assert columns + df = pd.read_csv(fn_data_original, sep=',') + fields_orig = df.columns.tolist() + assert fields_orig == ['antibody_two_sequences', + 'antibody_name', + 'CDR_Length', + 'PSH', + 'PPC', + 'PNC', + 'SFvCSP', + 'heavy_chain', + 'light_chain'] + + df = df[['antibody_name', + 'heavy_chain', + 'light_chain', + 'CDR_Length', + 'PSH', + 'PPC', + 'PNC', + 'SFvCSP']] + fields_clean= ['antibody_name', + 'heavy_chain', + 'light_chain', + 'CDR_Length', + 'PSH', + 'PPC', + 'PNC', + 'SFvCSP'] + + df.columns = fields_clean + assert fields_orig != fields_clean + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": f"{target_folder}", # unique identifier, we will also use this for directory names + "description": """Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics.""", + + "targets": [ + { + "id": "CDR_Length", # name of the column in a tabular dataset + "description": "CDR Complementarity-determining regions length", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "Antibody Complementarity-determining regions length", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody" + ], + "uris":[ + "https://rb.gy/s9gv88", + "https://rb.gy/km77hq", + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "PSH", # name of the column in a tabular dataset + "description": "patches of surface hydrophobicity", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "antibody patches of surface hydrophobicity", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody" + ], + "uris":[ + "https://rb.gy/bchhaa", + "https://rb.gy/2irr4l", + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "PPC", # name of the column in a tabular dataset + "description": "patches of positive charge", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "patches of positive charge", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody" + ], + "uris":[ + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "PNC", # name of the column in a tabular dataset + "description": "patches of negative charge", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "anitbody patches of negative charge", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody" + ], + "uris":[ + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "SFvCSP", # name of the column in a tabular dataset + "description": "structural Fv charge symmetry parameter", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "antibody structural Fv charge symmetry parameter", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody" + ], + "uris":[ + "https://rb.gy/uxyhc3", + "https://rb.gy/b8cx8i", + ], + } + ], + + "identifiers": [ + { + "id": "antibody_name", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody name", # description (optional, except for "Other") + }, + { + "id": "heavy_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody heavy chain amino acid sequence", # description (optional, except for "Other") + }, + { + "id": "light_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody light chain amino acid sequence", # description (optional, except for "Other") + } + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1073/pnas.1810576116", + "description": "corresponding publication", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/develop/#tap", + "description": "data source", + } + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences}}""", + + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + +if __name__ == "__main__": + get_and_transform_data() From 66b78cd43c5b282c3f5379a473207556e8b76c40 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 11 Mar 2023 15:26:56 +0000 Subject: [PATCH 02/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/SAbDab_Chen/meta.yaml | 93 ++++++++--------- data/SAbDab_Chen/transform.py | 74 +++++++------- data/TAP/meta.yaml | 184 +++++++++++++++++----------------- data/TAP/transform.py | 158 ++++++++++++++++------------- 4 files changed, 258 insertions(+), 251 deletions(-) diff --git a/data/SAbDab_Chen/meta.yaml b/data/SAbDab_Chen/meta.yaml index f0b067d6f..9b6474dd0 100644 --- a/data/SAbDab_Chen/meta.yaml +++ b/data/SAbDab_Chen/meta.yaml @@ -1,57 +1,50 @@ +--- name: SAbDab_Chen -description: "Antibody data from Chen et al, where they process from the SAbDab. \n\ - \ From an initial dataset of 3816 antibodies, they retained 2426 antibodies\n\ - \ that satisfy the following criteria: 1. \n have both sequence (FASTA)\ - \ and Protein Data Bank (PDB) structure files,\n 2. contain both a heavy\ - \ chain and a light chain, and 3. \n have crystal structures with resolution\ - \ < 3 \xC5. \n The DI label is derived from BIOVIA's pipelines." +description: "Antibody data from Chen et al, where they process from the SAbDab. \n From an initial dataset of 3816 antibodies, they retained 2426\ + \ antibodies\n that satisfy the following criteria: 1. \n have both sequence (FASTA) and Protein Data Bank (PDB) structure files,\n \ + \ 2. contain both a heavy chain and a light chain, and 3. \n have crystal structures with resolution < 3 Å. \n The DI label is derived\ + \ from BIOVIA's pipelines." targets: -- id: developability - description: functional antibody candidate to be developed into a manufacturable(1), - or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i + - id: developability + description: functional antibody candidate to be developed into a manufacturable(1), or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i identifiers: -- id: antibody_pdb_ID - type: Other - description: anitbody pdb id -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA + - id: antibody_pdb_ID + type: Other + description: anitbody pdb id + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: -- url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication -- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used -- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source + - url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication + - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used + - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: -- "@article{Chen2020,\n doi = {10.1101/2020.06.18.159798},\n url\ - \ = {https://doi.org/10.1101/2020.06.18.159798},\n year = {2020},\n \ - \ month = jun,\n publisher = {Cold Spring Harbor Laboratory},\n \ - \ author = {Xingyao Chen and Thomas Dougherty and \n Chan Hong\ - \ and Rachel Schibler and Yi Cong Zhao and \n Reza Sadeghi and Naim Matasci\ - \ and Yi-Chieh Wu and Ian Kerman},\n title = {Predicting Antibody Developability\ - \ from Sequence \n using Machine Learning}}" -- "@article{Dunbar2013,\n doi = {10.1093/nar/gkt1043},\n url = {https://doi.org/10.1093/nar/gkt1043},\n\ - \ year = {2013},\n month = nov,\n publisher = {Oxford\ - \ University Press ({OUP})},\n volume = {42},\n number = {D1},\n\ - \ pages = {D1140--D1146},\n author = {James Dunbar and Konrad\ - \ Krawczyk and Jinwoo Leem \n and Terry Baker and Angelika Fuchs and Guy\ - \ Georges and Jiye Shi and\n Charlotte M. Deane},\n title = {{SAbDab}:\ - \ the structural antibody database},\n journal = {Nucleic Acids Research}}" + - "@article{Chen2020,\n doi = {10.1101/2020.06.18.159798},\n url = {https://doi.org/10.1101/2020.06.18.159798},\n year =\ + \ {2020},\n month = jun,\n publisher = {Cold Spring Harbor Laboratory},\n author = {Xingyao Chen and Thomas Dougherty and\ + \ \n Chan Hong and Rachel Schibler and Yi Cong Zhao and \n Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman},\n \ + \ title = {Predicting Antibody Developability from Sequence \n using Machine Learning}}" + - "@article{Dunbar2013,\n doi = {10.1093/nar/gkt1043},\n url = {https://doi.org/10.1093/nar/gkt1043},\n year = {2013},\n\ + \ month = nov,\n publisher = {Oxford University Press ({OUP})},\n volume = {42},\n number = {D1},\n pages\ + \ = {D1140--D1146},\n author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem \n and Terry Baker and Angelika Fuchs and Guy Georges\ + \ and Jiye Shi and\n Charlotte M. Deane},\n title = {{SAbDab}: the structural antibody database},\n journal = {Nucleic\ + \ Acids Research}}" diff --git a/data/SAbDab_Chen/transform.py b/data/SAbDab_Chen/transform.py index 7b319dd33..838cd9d79 100644 --- a/data/SAbDab_Chen/transform.py +++ b/data/SAbDab_Chen/transform.py @@ -5,47 +5,47 @@ def get_and_transform_data(): # get raw data - target_folder = 'SAbDab_Chen' - target_subfolder = 'SAbDab_Chen' - data = Develop(name = target_subfolder) - + target_folder = "SAbDab_Chen" + target_subfolder = "SAbDab_Chen" + data = Develop(name=target_subfolder) + # proceed raw data df = data.get_data() fields_orig = df.columns.tolist() - assert fields_orig == ['Antibody_ID', 'Antibody', 'Y'] - + assert fields_orig == ["Antibody_ID", "Antibody", "Y"] + fn_data_original = "data_original.csv" - + antibody_list = df.Antibody.tolist() - s2l = lambda list_string: list(map(str.strip, list_string.strip('][').replace("'", "").split(','))) - df['heavy_chain'] = [s2l(x)[0] for x in antibody_list] - df['light_chain'] = [s2l(x)[1] for x in antibody_list] - df = df[['Antibody_ID', 'heavy_chain', 'light_chain', 'Y']] + s2l = lambda list_string: list( + map(str.strip, list_string.strip("][").replace("'", "").split(",")) + ) + df["heavy_chain"] = [s2l(x)[0] for x in antibody_list] + df["light_chain"] = [s2l(x)[1] for x in antibody_list] + df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y"]] df.to_csv(fn_data_original, index=False) - + # load raw data and assert columns - df = pd.read_csv(fn_data_original, sep=',') + df = pd.read_csv(fn_data_original, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ['Antibody_ID', 'heavy_chain', 'light_chain', 'Y'] - fields_clean = ['antibody_pdb_ID', 'heavy_chain', 'light_chain', 'developability'] + assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y"] + fields_clean = ["antibody_pdb_ID", "heavy_chain", "light_chain", "developability"] df.columns = fields_clean assert not df.duplicated().sum() # save to csv fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - + meta = { "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Antibody data from Chen et al, where they process from the SAbDab. + "description": """Antibody data from Chen et al, where they process from the SAbDab. From an initial dataset of 3816 antibodies, they retained 2426 antibodies - that satisfy the following criteria: 1. + that satisfy the following criteria: 1. have both sequence (FASTA) and Protein Data Bank (PDB) structure files, - 2. contain both a heavy chain and a light chain, and 3. - have crystal structures with resolution < 3 Å. + 2. contain both a heavy chain and a light chain, and 3. + have crystal structures with resolution < 3 Å. The DI label is derived from BIOVIA's pipelines.""", - - "targets": [ { "id": "developability", # name of the column in a tabular dataset @@ -56,31 +56,30 @@ def get_and_transform_data(): "antibody developability", "monoclonal anitbody", "functional antibody candidate", - "manufacturable, stable, safe, and effective antibody drug" + "manufacturable, stable, safe, and effective antibody drug", ], - "uris":[ + "uris": [ "https://rb.gy/idkdqp", "https://rb.gy/b8cx8i", ], }, ], - "identifiers": [ { "id": "antibody_pdb_ID", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "description": "anitbody pdb id", # description (optional, except for "Other") }, - { + { "id": "heavy_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "description": "anitbody heavy chain amino acid sequence in FASTA", # description (optional, except for "Other") }, - { + { "id": "light_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "description": "anitbody light chain amino acid sequence in FASTA", # description (optional, except for "Other") - } + }, ], "license": "CC BY 4.0", # license under which the original dataset was published "links": [ # list of relevant links (original dataset, other uses, etc.) @@ -92,29 +91,28 @@ def get_and_transform_data(): "url": "https://doi.org/10.1093/nar/gkt1043", "description": "corresponding publication", }, - { + { "url": "https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/", "description": "corresponding tools used", }, { "url": "https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al", "description": "data source", - } + }, ], "num_points": len(df), # number of datapoints in this dataset "bibtex": [ - """@article{Chen2020, + """@article{Chen2020, doi = {10.1101/2020.06.18.159798}, url = {https://doi.org/10.1101/2020.06.18.159798}, year = {2020}, month = jun, publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence + title = {Predicting Antibody Developability from Sequence using Machine Learning}}""", - """@article{Dunbar2013, doi = {10.1093/nar/gkt1043}, url = {https://doi.org/10.1093/nar/gkt1043}, @@ -124,15 +122,14 @@ def get_and_transform_data(): volume = {42}, number = {D1}, pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and Charlotte M. Deane}, title = {{SAbDab}: the structural antibody database}, journal = {Nucleic Acids Research}}""", - ], } - + def str_presenter(dumper, data): """configures yaml for dumping multiline strings Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data @@ -151,5 +148,6 @@ def str_presenter(dumper, data): print(f"Finished processing {meta['name']} dataset!") + if __name__ == "__main__": get_and_transform_data() diff --git a/data/TAP/meta.yaml b/data/TAP/meta.yaml index ec3f14232..1eb58dd72 100644 --- a/data/TAP/meta.yaml +++ b/data/TAP/meta.yaml @@ -1,100 +1,98 @@ +--- name: TAP -description: "Immunogenicity, instability, self-association, \n high viscosity,\ - \ polyspecificity, or poor expression can all preclude\n an antibody from\ - \ becoming a therapeutic. Early identification of these\n negative characteristics\ - \ is essential. Akin to the Lipinski guidelines,\n which measure druglikeness\ - \ in small molecules, \n Therapeutic Antibody Profiler (TAP) highlights antibodies\ - \ \n that possess characteristics that are rare/unseen in \n clinical-stage\ - \ mAb therapeutics." +description: "Immunogenicity, instability, self-association, \n high viscosity, polyspecificity, or poor expression can all preclude\n an\ + \ antibody from becoming a therapeutic. Early identification of these\n negative characteristics is essential. Akin to the Lipinski guidelines,\n\ + \ which measure druglikeness in small molecules, \n Therapeutic Antibody Profiler (TAP) highlights antibodies \n that possess characteristics\ + \ that are rare/unseen in \n clinical-stage mAb therapeutics." targets: -- id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i -- id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i -- id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i + - id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i + - id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i + - id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i identifiers: -- id: antibody_name - type: Other - description: anitbody name -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence + - id: antibody_name + type: Other + description: anitbody name + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: -- |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences}} + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences}} diff --git a/data/TAP/transform.py b/data/TAP/transform.py index 1dcd05aa5..f434d1035 100644 --- a/data/TAP/transform.py +++ b/data/TAP/transform.py @@ -1,59 +1,79 @@ import pandas as pd import yaml -from tdc.utils import retrieve_label_name_list from tdc.single_pred import Develop +from tdc.utils import retrieve_label_name_list def get_and_transform_data(): # get raw data - target_folder = 'TAP' - target_subfolder = 'TAP' + target_folder = "TAP" + target_subfolder = "TAP" label_list = retrieve_label_name_list(target_subfolder) - data = Develop(name = target_subfolder, label_name = label_list[0]) + data = Develop(name=target_subfolder, label_name=label_list[0]) # proceed raw data - df = pd.read_csv('data/tap.tab',sep='\t') + df = pd.read_csv("data/tap.tab", sep="\t") fields_orig = df.columns.tolist() - assert fields_orig == ['X', 'ID', 'CDR_Length', 'PSH', 'PPC', 'PNC', 'SFvCSP'] - fields_clean = ['antibody_two_sequences', 'antibody_name', 'CDR_Length', 'PSH', 'PPC', 'PNC', 'SFvCSP'] + assert fields_orig == ["X", "ID", "CDR_Length", "PSH", "PPC", "PNC", "SFvCSP"] + fields_clean = [ + "antibody_two_sequences", + "antibody_name", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + ] df.columns = fields_clean # convert list columns to two columns antibody_list = df.antibody_two_sequences.tolist() - s2l = lambda list_string: list(map(str.strip, list_string.strip('][').replace("'", "").split(','))) - antibody2list = lambda list_string: [x.strip() for x in s2l(list_string)[0].split('\\n')] - df['heavy_chain'] = [antibody2list(x)[0] for x in antibody_list] - df['light_chain'] = [antibody2list(x)[1] for x in antibody_list] - fn_data_original = 'data_original.csv' - df.to_csv(fn_data_original,index=None) - + s2l = lambda list_string: list( + map(str.strip, list_string.strip("][").replace("'", "").split(",")) + ) + antibody2list = lambda list_string: [ + x.strip() for x in s2l(list_string)[0].split("\\n") + ] + df["heavy_chain"] = [antibody2list(x)[0] for x in antibody_list] + df["light_chain"] = [antibody2list(x)[1] for x in antibody_list] + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=None) + # load raw data and assert columns - df = pd.read_csv(fn_data_original, sep=',') + df = pd.read_csv(fn_data_original, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ['antibody_two_sequences', - 'antibody_name', - 'CDR_Length', - 'PSH', - 'PPC', - 'PNC', - 'SFvCSP', - 'heavy_chain', - 'light_chain'] + assert fields_orig == [ + "antibody_two_sequences", + "antibody_name", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + "heavy_chain", + "light_chain", + ] - df = df[['antibody_name', - 'heavy_chain', - 'light_chain', - 'CDR_Length', - 'PSH', - 'PPC', - 'PNC', - 'SFvCSP']] - fields_clean= ['antibody_name', - 'heavy_chain', - 'light_chain', - 'CDR_Length', - 'PSH', - 'PPC', - 'PNC', - 'SFvCSP'] + df = df[ + [ + "antibody_name", + "heavy_chain", + "light_chain", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + ] + ] + fields_clean = [ + "antibody_name", + "heavy_chain", + "light_chain", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + ] df.columns = fields_clean assert fields_orig != fields_clean @@ -62,18 +82,17 @@ def get_and_transform_data(): # save to csv fn_data_csv = "data_clean.csv" df.to_csv(fn_data_csv, index=False) - + meta = { "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Immunogenicity, instability, self-association, + "description": """Immunogenicity, instability, self-association, high viscosity, polyspecificity, or poor expression can all preclude an antibody from becoming a therapeutic. Early identification of these negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in clinical-stage mAb therapeutics.""", - "targets": [ { "id": "CDR_Length", # name of the column in a tabular dataset @@ -84,15 +103,15 @@ def get_and_transform_data(): "Antibody Complementarity-determining regions length", "Therapeutic Antibody Profiler", "antibody developability", - "monoclonal anitbody" + "monoclonal anitbody", ], - "uris":[ + "uris": [ "https://rb.gy/s9gv88", "https://rb.gy/km77hq", "https://rb.gy/b8cx8i", ], }, - { + { "id": "PSH", # name of the column in a tabular dataset "description": "patches of surface hydrophobicity", # description of what this column means "units": "", # units of the values in this column (leave empty if unitless) @@ -101,15 +120,15 @@ def get_and_transform_data(): "antibody patches of surface hydrophobicity", "Therapeutic Antibody Profiler", "antibody developability", - "monoclonal anitbody" + "monoclonal anitbody", ], - "uris":[ + "uris": [ "https://rb.gy/bchhaa", "https://rb.gy/2irr4l", "https://rb.gy/b8cx8i", ], }, - { + { "id": "PPC", # name of the column in a tabular dataset "description": "patches of positive charge", # description of what this column means "units": "", # units of the values in this column (leave empty if unitless) @@ -118,13 +137,13 @@ def get_and_transform_data(): "patches of positive charge", "Therapeutic Antibody Profiler", "antibody developability", - "monoclonal anitbody" + "monoclonal anitbody", ], - "uris":[ + "uris": [ "https://rb.gy/b8cx8i", ], }, - { + { "id": "PNC", # name of the column in a tabular dataset "description": "patches of negative charge", # description of what this column means "units": "", # units of the values in this column (leave empty if unitless) @@ -133,13 +152,13 @@ def get_and_transform_data(): "anitbody patches of negative charge", "Therapeutic Antibody Profiler", "antibody developability", - "monoclonal anitbody" + "monoclonal anitbody", ], - "uris":[ + "uris": [ "https://rb.gy/b8cx8i", ], }, - { + { "id": "SFvCSP", # name of the column in a tabular dataset "description": "structural Fv charge symmetry parameter", # description of what this column means "units": "", # units of the values in this column (leave empty if unitless) @@ -148,31 +167,30 @@ def get_and_transform_data(): "antibody structural Fv charge symmetry parameter", "Therapeutic Antibody Profiler", "antibody developability", - "monoclonal anitbody" + "monoclonal anitbody", ], - "uris":[ + "uris": [ "https://rb.gy/uxyhc3", "https://rb.gy/b8cx8i", ], - } + }, ], - "identifiers": [ { "id": "antibody_name", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "description": "anitbody name", # description (optional, except for "Other") }, - { + { "id": "heavy_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "description": "anitbody heavy chain amino acid sequence", # description (optional, except for "Other") }, - { + { "id": "light_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" "description": "anitbody light chain amino acid sequence", # description (optional, except for "Other") - } + }, ], "license": "CC BY 4.0", # license under which the original dataset was published "links": [ # list of relevant links (original dataset, other uses, etc.) @@ -183,11 +201,11 @@ def get_and_transform_data(): { "url": "https://tdcommons.ai/single_pred_tasks/develop/#tap", "description": "data source", - } + }, ], "num_points": len(df), # number of datapoints in this dataset "bibtex": [ - """@article{Raybould2019, + """@article{Raybould2019, doi = {10.1073/pnas.1810576116}, url = {https://doi.org/10.1073/pnas.1810576116}, year = {2019}, @@ -198,11 +216,10 @@ def get_and_transform_data(): pages = {4025--4030}, author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek and Jiye Shi and Charlotte M. Deane}, title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences}}""", - + journal = {Proceedings of the National Academy of Sciences}}""", ], } - + def str_presenter(dumper, data): """configures yaml for dumping multiline strings Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data @@ -221,5 +238,6 @@ def str_presenter(dumper, data): print(f"Finished processing {meta['name']} dataset!") + if __name__ == "__main__": get_and_transform_data() From ad2dd8f5d723728b0785a055d2f0eca85d494943 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 00:51:37 +0200 Subject: [PATCH 03/28] Add files via upload --- data/sabdab_chen/meta.yaml | 75 ++++++++++ data/sabdab_chen/transform.py | 159 +++++++++++++++++++++ data/tap/meta.yaml | 108 +++++++++++++++ data/tap/transform.py | 252 ++++++++++++++++++++++++++++++++++ 4 files changed, 594 insertions(+) create mode 100644 data/sabdab_chen/meta.yaml create mode 100644 data/sabdab_chen/transform.py create mode 100644 data/tap/meta.yaml create mode 100644 data/tap/transform.py diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml new file mode 100644 index 000000000..72d072244 --- /dev/null +++ b/data/sabdab_chen/meta.yaml @@ -0,0 +1,75 @@ +name: sabdab_chen +description: |- + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3.have crystal structures with resolution < 3 A. The DI label is derived + from BIOVIA's pipelines. +targets: +- id: developability + description: functional antibody candidate to be developed into a manufacturable(1), + or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: +- id: antibody_pdb_ID + type: Other + description: anitbody pdb id +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA +license: CC BY 4.0 +links: +- url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication +- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used +- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source +num_points: 2409 +bibtex: +- |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} +- |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py new file mode 100644 index 000000000..2f5c52af6 --- /dev/null +++ b/data/sabdab_chen/transform.py @@ -0,0 +1,159 @@ +import pandas as pd +import yaml +from tdc.single_pred import Develop + + +def get_and_transform_data(): + # get raw data + target_folder = "SAbDab_Chen" + target_subfolder = "SAbDab_Chen" + data = Develop(name=target_subfolder) + + # proceed raw data + df = data.get_data() + fields_orig = df.columns.tolist() + assert fields_orig == ["Antibody_ID", "Antibody", "Y"] + + fn_data_original = "data_original.csv" + + antibody_list = df.Antibody.tolist() + s2l = lambda list_string: list( + map(str.strip, list_string.strip("][").replace("'", "").split(",")) + ) + df["heavy_chain"] = [s2l(x)[0] for x in antibody_list] + df["light_chain"] = [s2l(x)[1] for x in antibody_list] + df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y"]] + df.to_csv(fn_data_original, index=False) + + # load raw data and assert columns + df = pd.read_csv(fn_data_original, sep=",") + fields_orig = df.columns.tolist() + assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y"] + fields_clean = ["antibody_pdb_ID", "heavy_chain", "light_chain", "developability"] + df.columns = fields_clean + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": "sabdab_chen", # unique identifier, we will also use this for directory names + "description": """Antibody data from Chen et al, where they process from the SAbDab. +From an initial dataset of 3816 antibodies, they retained 2426 antibodies that +satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data +Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, +and 3.have crystal structures with resolution < 3 A. The DI label is derived +from BIOVIA's pipelines.""", + "targets": [ + { + "id": "developability", # name of the column in a tabular dataset + "description": "functional antibody candidate to be developed into a manufacturable(1), or not(0)", + "units": "", # units of the values in this column (leave empty if unitless) + "type": "categorical", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "antibody developability", + "monoclonal anitbody", + "functional antibody candidate", + "manufacturable, stable, safe, and effective antibody drug", + ], + "uris": [ + "https://rb.gy/idkdqp", + "https://rb.gy/b8cx8i", + ], + }, + ], + "benchmarks": [ + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, + ], + "identifiers": [ + { + "id": "antibody_pdb_ID", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody pdb id", # description (optional, except for "Other") + }, + { + "id": "heavy_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody heavy chain amino acid sequence in FASTA", # description (optional, except for "Other") + }, + { + "id": "light_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody light chain amino acid sequence in FASTA", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1101/2020.06.18.159798", + "description": "corresponding publication", + }, + { + "url": "https://doi.org/10.1093/nar/gkt1043", + "description": "corresponding publication", + }, + { + "url": "https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/", + "description": "corresponding tools used", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al", + "description": "data source", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Chen2020, +doi = {10.1101/2020.06.18.159798}, +url = {https://doi.org/10.1101/2020.06.18.159798}, +year = {2020}, +month = jun, +publisher = {Cold Spring Harbor Laboratory}, +author = {Xingyao Chen and Thomas Dougherty and +Chan Hong and Rachel Schibler and Yi Cong Zhao and +Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, +title = {Predicting Antibody Developability from Sequence +using Machine Learning}""", + """@article{Dunbar2013, +doi = {10.1093/nar/gkt1043}, +url = {https://doi.org/10.1093/nar/gkt1043}, +year = {2013}, +month = nov, +publisher = {Oxford University Press ({OUP})}, +volume = {42}, +number = {D1}, +pages = {D1140--D1146}, +author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem +and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and +Charlotte M. Deane}, +title = {SAbDab: the structural antibody database}, +journal = {Nucleic Acids Research}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml new file mode 100644 index 000000000..ae57d425b --- /dev/null +++ b/data/tap/meta.yaml @@ -0,0 +1,108 @@ +name: tap +description: |- + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. +targets: +- id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i +- id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i +- id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i +benchmarks: +- name: TDC + link: https://tdcommons.ai/ + split_column: split +identifiers: +- id: antibody_name + type: Other + description: anitbody name +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence +license: CC BY 4.0 +links: +- url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source +num_points: 241 +bibtex: +- |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py new file mode 100644 index 000000000..352944dc2 --- /dev/null +++ b/data/tap/transform.py @@ -0,0 +1,252 @@ +import pandas as pd +import yaml +from tdc.single_pred import Develop +from tdc.utils import retrieve_label_name_list + + +def get_and_transform_data(): + # get raw data + target_folder = "TAP" + target_subfolder = "TAP" + label_list = retrieve_label_name_list(target_subfolder) + data = Develop(name=target_subfolder, label_name=label_list[0]) + # proceed raw data + df = pd.read_csv("data/tap.tab", sep="\t") + fields_orig = df.columns.tolist() + assert fields_orig == ["X", "ID", "CDR_Length", "PSH", "PPC", "PNC", "SFvCSP"] + fields_clean = [ + "antibody_two_sequences", + "antibody_name", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + ] + df.columns = fields_clean + # convert list columns to two columns + antibody_list = df.antibody_two_sequences.tolist() + s2l = lambda list_string: list( + map(str.strip, list_string.strip("][").replace("'", "").split(",")) + ) + antibody2list = lambda list_string: [ + x.strip() for x in s2l(list_string)[0].split("\\n") + ] + df["heavy_chain"] = [antibody2list(x)[0] for x in antibody_list] + df["light_chain"] = [antibody2list(x)[1] for x in antibody_list] + fn_data_original = "data_original.csv" + df.to_csv(fn_data_original, index=None) + + # load raw data and assert columns + df = pd.read_csv(fn_data_original, sep=",") + fields_orig = df.columns.tolist() + assert fields_orig == [ + "antibody_two_sequences", + "antibody_name", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + "heavy_chain", + "light_chain", + ] + + df = df[ + [ + "antibody_name", + "heavy_chain", + "light_chain", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + ] + ] + fields_clean = [ + "antibody_name", + "heavy_chain", + "light_chain", + "CDR_Length", + "PSH", + "PPC", + "PNC", + "SFvCSP", + ] + + df.columns = fields_clean + assert fields_orig != fields_clean + assert not df.duplicated().sum() + + # save to csv + fn_data_csv = "data_clean.csv" + df.to_csv(fn_data_csv, index=False) + + meta = { + "name": "tap", # unique identifier, we will also use this for directory names + "description": """Immunogenicity, instability, self-association, +high viscosity, polyspecificity, or poor expression can all preclude +an antibody from becoming a therapeutic. Early identification of these +negative characteristics is essential. Akin to the Lipinski guidelines, +which measure druglikeness in small molecules, +Therapeutic Antibody Profiler (TAP) highlights antibodies +that possess characteristics that are rare/unseen in +clinical-stage mAb therapeutics.""", + "targets": [ + { + "id": "CDR_Length", # name of the column in a tabular dataset + "description": "CDR Complementarity-determining regions length", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "Antibody Complementarity-determining regions length", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody", + ], + "uris": [ + "https://rb.gy/s9gv88", + "https://rb.gy/km77hq", + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "PSH", # name of the column in a tabular dataset + "description": "patches of surface hydrophobicity", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "antibody patches of surface hydrophobicity", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody", + ], + "uris": [ + "https://rb.gy/bchhaa", + "https://rb.gy/2irr4l", + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "PPC", # name of the column in a tabular dataset + "description": "patches of positive charge", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "patches of positive charge", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody", + ], + "uris": [ + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "PNC", # name of the column in a tabular dataset + "description": "patches of negative charge", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "anitbody patches of negative charge", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody", + ], + "uris": [ + "https://rb.gy/b8cx8i", + ], + }, + { + "id": "SFvCSP", # name of the column in a tabular dataset + "description": "structural Fv charge symmetry parameter", # description of what this column means + "units": "", # units of the values in this column (leave empty if unitless) + "type": "continuous", # can be "categorical", "ordinal", "continuous" + "names": [ # names for the property (to sample from for building the prompts) + "antibody structural Fv charge symmetry parameter", + "Therapeutic Antibody Profiler", + "antibody developability", + "monoclonal anitbody", + ], + "uris": [ + "https://rb.gy/uxyhc3", + "https://rb.gy/b8cx8i", + ], + }, + ], + "benchmarks": [ + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, + ], + "identifiers": [ + { + "id": "antibody_name", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody name", # description (optional, except for "Other") + }, + { + "id": "heavy_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody heavy chain amino acid sequence", # description (optional, except for "Other") + }, + { + "id": "light_chain", # column name + "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "description": "anitbody light chain amino acid sequence", # description (optional, except for "Other") + }, + ], + "license": "CC BY 4.0", # license under which the original dataset was published + "links": [ # list of relevant links (original dataset, other uses, etc.) + { + "url": "https://doi.org/10.1073/pnas.1810576116", + "description": "corresponding publication", + }, + { + "url": "https://tdcommons.ai/single_pred_tasks/develop/#tap", + "description": "data source", + }, + ], + "num_points": len(df), # number of datapoints in this dataset + "bibtex": [ + """@article{Raybould2019, +doi = {10.1073/pnas.1810576116}, +url = {https://doi.org/10.1073/pnas.1810576116}, +year = {2019}, +month = feb, +publisher = {Proceedings of the National Academy of Sciences}, +volume = {116}, +number = {10}, +pages = {4025--4030}, +author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk +and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek +and Jiye Shi and Charlotte M. Deane}, +title = {Five computational developability guidelines for therapeutic antibody profiling}, +journal = {Proceedings of the National Academy of Sciences}""", + ], + } + + def str_presenter(dumper, data): + """configures yaml for dumping multiline strings + Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data + """ + if data.count("\n") > 0: # check for multiline string + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) + + yaml.add_representer(str, str_presenter) + yaml.representer.SafeRepresenter.add_representer( + str, str_presenter + ) # to use with safe_dum + fn_meta = "meta.yaml" + with open(fn_meta, "w") as f: + yaml.dump(meta, f, sort_keys=False) + + print(f"Finished processing {meta['name']} dataset!") + + +if __name__ == "__main__": + get_and_transform_data() From 94c8df1f756fdc00eba8d47825c7d2eefddaca0e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 22:51:45 +0000 Subject: [PATCH 04/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/sabdab_chen/meta.yaml | 132 +++++++++++----------- data/sabdab_chen/transform.py | 10 +- data/tap/meta.yaml | 199 +++++++++++++++++----------------- data/tap/transform.py | 10 +- 4 files changed, 176 insertions(+), 175 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index 72d072244..cf19dcddb 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,75 +1,75 @@ +--- name: sabdab_chen description: |- - Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies that - satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data - Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, - and 3.have crystal structures with resolution < 3 A. The DI label is derived - from BIOVIA's pipelines. + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3.have crystal structures with resolution < 3 A. The DI label is derived + from BIOVIA's pipelines. targets: -- id: developability - description: functional antibody candidate to be developed into a manufacturable(1), - or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i + - id: developability + description: functional antibody candidate to be developed into a manufacturable(1), or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_pdb_ID - type: Other - description: anitbody pdb id -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA + - id: antibody_pdb_ID + type: Other + description: anitbody pdb id + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: -- url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication -- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used -- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source + - url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication + - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used + - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: -- |- - @article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning} -- |- - @article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {SAbDab: the structural antibody database}, - journal = {Nucleic Acids Research} + - |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} + - |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 2f5c52af6..3860a1aba 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -64,11 +64,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index ae57d425b..7c9844c4d 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,108 +1,109 @@ +--- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: -- id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i -- id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i -- id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i + - id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i + - id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i + - id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_name - type: Other - description: anitbody name -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence + - id: antibody_name + type: Other + description: anitbody name + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: -- |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 352944dc2..734708dbb 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 0b955ef3d40ab63f7799e7f14c3bc2e492fbf173 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 00:51:51 +0200 Subject: [PATCH 05/28] Delete data/SAbDab_Chen directory --- data/SAbDab_Chen/meta.yaml | 50 ----------- data/SAbDab_Chen/transform.py | 153 ---------------------------------- 2 files changed, 203 deletions(-) delete mode 100644 data/SAbDab_Chen/meta.yaml delete mode 100644 data/SAbDab_Chen/transform.py diff --git a/data/SAbDab_Chen/meta.yaml b/data/SAbDab_Chen/meta.yaml deleted file mode 100644 index 9b6474dd0..000000000 --- a/data/SAbDab_Chen/meta.yaml +++ /dev/null @@ -1,50 +0,0 @@ ---- -name: SAbDab_Chen -description: "Antibody data from Chen et al, where they process from the SAbDab. \n From an initial dataset of 3816 antibodies, they retained 2426\ - \ antibodies\n that satisfy the following criteria: 1. \n have both sequence (FASTA) and Protein Data Bank (PDB) structure files,\n \ - \ 2. contain both a heavy chain and a light chain, and 3. \n have crystal structures with resolution < 3 Å. \n The DI label is derived\ - \ from BIOVIA's pipelines." -targets: - - id: developability - description: functional antibody candidate to be developed into a manufacturable(1), or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i -identifiers: - - id: antibody_pdb_ID - type: Other - description: anitbody pdb id - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA -license: CC BY 4.0 -links: - - url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication - - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used - - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source -num_points: 2409 -bibtex: - - "@article{Chen2020,\n doi = {10.1101/2020.06.18.159798},\n url = {https://doi.org/10.1101/2020.06.18.159798},\n year =\ - \ {2020},\n month = jun,\n publisher = {Cold Spring Harbor Laboratory},\n author = {Xingyao Chen and Thomas Dougherty and\ - \ \n Chan Hong and Rachel Schibler and Yi Cong Zhao and \n Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman},\n \ - \ title = {Predicting Antibody Developability from Sequence \n using Machine Learning}}" - - "@article{Dunbar2013,\n doi = {10.1093/nar/gkt1043},\n url = {https://doi.org/10.1093/nar/gkt1043},\n year = {2013},\n\ - \ month = nov,\n publisher = {Oxford University Press ({OUP})},\n volume = {42},\n number = {D1},\n pages\ - \ = {D1140--D1146},\n author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem \n and Terry Baker and Angelika Fuchs and Guy Georges\ - \ and Jiye Shi and\n Charlotte M. Deane},\n title = {{SAbDab}: the structural antibody database},\n journal = {Nucleic\ - \ Acids Research}}" diff --git a/data/SAbDab_Chen/transform.py b/data/SAbDab_Chen/transform.py deleted file mode 100644 index 838cd9d79..000000000 --- a/data/SAbDab_Chen/transform.py +++ /dev/null @@ -1,153 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import Develop - - -def get_and_transform_data(): - # get raw data - target_folder = "SAbDab_Chen" - target_subfolder = "SAbDab_Chen" - data = Develop(name=target_subfolder) - - # proceed raw data - df = data.get_data() - fields_orig = df.columns.tolist() - assert fields_orig == ["Antibody_ID", "Antibody", "Y"] - - fn_data_original = "data_original.csv" - - antibody_list = df.Antibody.tolist() - s2l = lambda list_string: list( - map(str.strip, list_string.strip("][").replace("'", "").split(",")) - ) - df["heavy_chain"] = [s2l(x)[0] for x in antibody_list] - df["light_chain"] = [s2l(x)[1] for x in antibody_list] - df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y"]] - df.to_csv(fn_data_original, index=False) - - # load raw data and assert columns - df = pd.read_csv(fn_data_original, sep=",") - fields_orig = df.columns.tolist() - assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y"] - fields_clean = ["antibody_pdb_ID", "heavy_chain", "light_chain", "developability"] - df.columns = fields_clean - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - meta = { - "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies - that satisfy the following criteria: 1. - have both sequence (FASTA) and Protein Data Bank (PDB) structure files, - 2. contain both a heavy chain and a light chain, and 3. - have crystal structures with resolution < 3 Å. - The DI label is derived from BIOVIA's pipelines.""", - "targets": [ - { - "id": "developability", # name of the column in a tabular dataset - "description": "functional antibody candidate to be developed into a manufacturable(1), or not(0)", - "units": "", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "antibody developability", - "monoclonal anitbody", - "functional antibody candidate", - "manufacturable, stable, safe, and effective antibody drug", - ], - "uris": [ - "https://rb.gy/idkdqp", - "https://rb.gy/b8cx8i", - ], - }, - ], - "identifiers": [ - { - "id": "antibody_pdb_ID", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "anitbody pdb id", # description (optional, except for "Other") - }, - { - "id": "heavy_chain", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "anitbody heavy chain amino acid sequence in FASTA", # description (optional, except for "Other") - }, - { - "id": "light_chain", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "anitbody light chain amino acid sequence in FASTA", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) - { - "url": "https://doi.org/10.1101/2020.06.18.159798", - "description": "corresponding publication", - }, - { - "url": "https://doi.org/10.1093/nar/gkt1043", - "description": "corresponding publication", - }, - { - "url": "https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/", - "description": "corresponding tools used", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al", - "description": "data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning}}""", - """@article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {{SAbDab}: the structural antibody database}, - journal = {Nucleic Acids Research}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() From a1c016308a4ba4d0c9da83587d0e9747a79ab6b1 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 00:52:00 +0200 Subject: [PATCH 06/28] Delete data/TAP directory --- data/TAP/meta.yaml | 98 ----------------- data/TAP/transform.py | 243 ------------------------------------------ 2 files changed, 341 deletions(-) delete mode 100644 data/TAP/meta.yaml delete mode 100644 data/TAP/transform.py diff --git a/data/TAP/meta.yaml b/data/TAP/meta.yaml deleted file mode 100644 index 1eb58dd72..000000000 --- a/data/TAP/meta.yaml +++ /dev/null @@ -1,98 +0,0 @@ ---- -name: TAP -description: "Immunogenicity, instability, self-association, \n high viscosity, polyspecificity, or poor expression can all preclude\n an\ - \ antibody from becoming a therapeutic. Early identification of these\n negative characteristics is essential. Akin to the Lipinski guidelines,\n\ - \ which measure druglikeness in small molecules, \n Therapeutic Antibody Profiler (TAP) highlights antibodies \n that possess characteristics\ - \ that are rare/unseen in \n clinical-stage mAb therapeutics." -targets: - - id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i - - id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i - - id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i -identifiers: - - id: antibody_name - type: Other - description: anitbody name - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence -license: CC BY 4.0 -links: - - url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source -num_points: 241 -bibtex: - - |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences}} diff --git a/data/TAP/transform.py b/data/TAP/transform.py deleted file mode 100644 index f434d1035..000000000 --- a/data/TAP/transform.py +++ /dev/null @@ -1,243 +0,0 @@ -import pandas as pd -import yaml -from tdc.single_pred import Develop -from tdc.utils import retrieve_label_name_list - - -def get_and_transform_data(): - # get raw data - target_folder = "TAP" - target_subfolder = "TAP" - label_list = retrieve_label_name_list(target_subfolder) - data = Develop(name=target_subfolder, label_name=label_list[0]) - # proceed raw data - df = pd.read_csv("data/tap.tab", sep="\t") - fields_orig = df.columns.tolist() - assert fields_orig == ["X", "ID", "CDR_Length", "PSH", "PPC", "PNC", "SFvCSP"] - fields_clean = [ - "antibody_two_sequences", - "antibody_name", - "CDR_Length", - "PSH", - "PPC", - "PNC", - "SFvCSP", - ] - df.columns = fields_clean - # convert list columns to two columns - antibody_list = df.antibody_two_sequences.tolist() - s2l = lambda list_string: list( - map(str.strip, list_string.strip("][").replace("'", "").split(",")) - ) - antibody2list = lambda list_string: [ - x.strip() for x in s2l(list_string)[0].split("\\n") - ] - df["heavy_chain"] = [antibody2list(x)[0] for x in antibody_list] - df["light_chain"] = [antibody2list(x)[1] for x in antibody_list] - fn_data_original = "data_original.csv" - df.to_csv(fn_data_original, index=None) - - # load raw data and assert columns - df = pd.read_csv(fn_data_original, sep=",") - fields_orig = df.columns.tolist() - assert fields_orig == [ - "antibody_two_sequences", - "antibody_name", - "CDR_Length", - "PSH", - "PPC", - "PNC", - "SFvCSP", - "heavy_chain", - "light_chain", - ] - - df = df[ - [ - "antibody_name", - "heavy_chain", - "light_chain", - "CDR_Length", - "PSH", - "PPC", - "PNC", - "SFvCSP", - ] - ] - fields_clean = [ - "antibody_name", - "heavy_chain", - "light_chain", - "CDR_Length", - "PSH", - "PPC", - "PNC", - "SFvCSP", - ] - - df.columns = fields_clean - assert fields_orig != fields_clean - assert not df.duplicated().sum() - - # save to csv - fn_data_csv = "data_clean.csv" - df.to_csv(fn_data_csv, index=False) - - meta = { - "name": f"{target_folder}", # unique identifier, we will also use this for directory names - "description": """Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics.""", - "targets": [ - { - "id": "CDR_Length", # name of the column in a tabular dataset - "description": "CDR Complementarity-determining regions length", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) - "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "Antibody Complementarity-determining regions length", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/s9gv88", - "https://rb.gy/km77hq", - "https://rb.gy/b8cx8i", - ], - }, - { - "id": "PSH", # name of the column in a tabular dataset - "description": "patches of surface hydrophobicity", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) - "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "antibody patches of surface hydrophobicity", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/bchhaa", - "https://rb.gy/2irr4l", - "https://rb.gy/b8cx8i", - ], - }, - { - "id": "PPC", # name of the column in a tabular dataset - "description": "patches of positive charge", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) - "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "patches of positive charge", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/b8cx8i", - ], - }, - { - "id": "PNC", # name of the column in a tabular dataset - "description": "patches of negative charge", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) - "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "anitbody patches of negative charge", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/b8cx8i", - ], - }, - { - "id": "SFvCSP", # name of the column in a tabular dataset - "description": "structural Fv charge symmetry parameter", # description of what this column means - "units": "", # units of the values in this column (leave empty if unitless) - "type": "continuous", # can be "categorical", "ordinal", "continuous" - "names": [ # names for the property (to sample from for building the prompts) - "antibody structural Fv charge symmetry parameter", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/uxyhc3", - "https://rb.gy/b8cx8i", - ], - }, - ], - "identifiers": [ - { - "id": "antibody_name", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "anitbody name", # description (optional, except for "Other") - }, - { - "id": "heavy_chain", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "anitbody heavy chain amino acid sequence", # description (optional, except for "Other") - }, - { - "id": "light_chain", # column name - "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "description": "anitbody light chain amino acid sequence", # description (optional, except for "Other") - }, - ], - "license": "CC BY 4.0", # license under which the original dataset was published - "links": [ # list of relevant links (original dataset, other uses, etc.) - { - "url": "https://doi.org/10.1073/pnas.1810576116", - "description": "corresponding publication", - }, - { - "url": "https://tdcommons.ai/single_pred_tasks/develop/#tap", - "description": "data source", - }, - ], - "num_points": len(df), # number of datapoints in this dataset - "bibtex": [ - """@article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences}}""", - ], - } - - def str_presenter(dumper, data): - """configures yaml for dumping multiline strings - Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data - """ - if data.count("\n") > 0: # check for multiline string - return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") - return dumper.represent_scalar("tag:yaml.org,2002:str", data) - - yaml.add_representer(str, str_presenter) - yaml.representer.SafeRepresenter.add_representer( - str, str_presenter - ) # to use with safe_dum - fn_meta = "meta.yaml" - with open(fn_meta, "w") as f: - yaml.dump(meta, f, sort_keys=False) - - print(f"Finished processing {meta['name']} dataset!") - - -if __name__ == "__main__": - get_and_transform_data() From 6eb651cfd4ed14b711793d6cc20c2fee9c0e0d69 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 01:00:13 +0200 Subject: [PATCH 07/28] Add files via upload --- data/tap/meta.yaml | 199 +++++++++++++++++++++--------------------- data/tap/transform.py | 10 +-- 2 files changed, 104 insertions(+), 105 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 7c9844c4d..ae57d425b 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,109 +1,108 @@ ---- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: - - id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i - - id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i - - id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i +- id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i +- id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i +- id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_name - type: Other - description: anitbody name - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence +- id: antibody_name + type: Other + description: anitbody name +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: - - url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source +- url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: - - |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} +- |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 734708dbb..352944dc2 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From caf6babf1b4fa4c5acf2bfc4dba4bafef773cf22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 23:00:20 +0000 Subject: [PATCH 08/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/tap/meta.yaml | 199 +++++++++++++++++++++--------------------- data/tap/transform.py | 10 +-- 2 files changed, 105 insertions(+), 104 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index ae57d425b..7c9844c4d 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,108 +1,109 @@ +--- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: -- id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i -- id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i -- id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i + - id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i + - id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i + - id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_name - type: Other - description: anitbody name -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence + - id: antibody_name + type: Other + description: anitbody name + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: -- |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 352944dc2..734708dbb 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 309bfe65bc218fd7ec46639ae675661e20c63a1e Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 01:00:37 +0200 Subject: [PATCH 09/28] Add files via upload --- data/sabdab_chen/meta.yaml | 132 +++++++++++++++++----------------- data/sabdab_chen/transform.py | 10 +-- 2 files changed, 71 insertions(+), 71 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index cf19dcddb..72d072244 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,75 +1,75 @@ ---- name: sabdab_chen description: |- - Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies that - satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data - Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, - and 3.have crystal structures with resolution < 3 A. The DI label is derived - from BIOVIA's pipelines. + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3.have crystal structures with resolution < 3 A. The DI label is derived + from BIOVIA's pipelines. targets: - - id: developability - description: functional antibody candidate to be developed into a manufacturable(1), or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i +- id: developability + description: functional antibody candidate to be developed into a manufacturable(1), + or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_pdb_ID - type: Other - description: anitbody pdb id - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA +- id: antibody_pdb_ID + type: Other + description: anitbody pdb id +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: - - url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication - - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used - - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source +- url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication +- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used +- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: - - |- - @article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning} - - |- - @article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {SAbDab: the structural antibody database}, - journal = {Nucleic Acids Research} +- |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} +- |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 3860a1aba..2f5c52af6 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -64,11 +64,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 22bff0172d7647186ff25535573c7b244b76fe9d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 23:00:44 +0000 Subject: [PATCH 10/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/sabdab_chen/meta.yaml | 132 +++++++++++++++++----------------- data/sabdab_chen/transform.py | 10 +-- 2 files changed, 71 insertions(+), 71 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index 72d072244..cf19dcddb 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,75 +1,75 @@ +--- name: sabdab_chen description: |- - Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies that - satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data - Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, - and 3.have crystal structures with resolution < 3 A. The DI label is derived - from BIOVIA's pipelines. + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3.have crystal structures with resolution < 3 A. The DI label is derived + from BIOVIA's pipelines. targets: -- id: developability - description: functional antibody candidate to be developed into a manufacturable(1), - or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i + - id: developability + description: functional antibody candidate to be developed into a manufacturable(1), or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_pdb_ID - type: Other - description: anitbody pdb id -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA + - id: antibody_pdb_ID + type: Other + description: anitbody pdb id + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: -- url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication -- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used -- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source + - url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication + - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used + - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: -- |- - @article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning} -- |- - @article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {SAbDab: the structural antibody database}, - journal = {Nucleic Acids Research} + - |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} + - |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 2f5c52af6..3860a1aba 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -64,11 +64,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 3b7c7f8d62f8d8398b71e63b6010f289c694719b Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 01:07:54 +0200 Subject: [PATCH 11/28] Add files via upload --- data/tap/meta.yaml | 199 +++++++++++++++++++++--------------------- data/tap/transform.py | 10 +-- 2 files changed, 104 insertions(+), 105 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 7c9844c4d..ae57d425b 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,109 +1,108 @@ ---- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: - - id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i - - id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i - - id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i +- id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i +- id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i +- id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_name - type: Other - description: anitbody name - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence +- id: antibody_name + type: Other + description: anitbody name +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: - - url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source +- url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: - - |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} +- |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 734708dbb..352944dc2 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 6aa3608f6d0c0220056f102606a4b6233eb01301 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 23:08:00 +0000 Subject: [PATCH 12/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/tap/meta.yaml | 199 +++++++++++++++++++++--------------------- data/tap/transform.py | 10 +-- 2 files changed, 105 insertions(+), 104 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index ae57d425b..7c9844c4d 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,108 +1,109 @@ +--- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: -- id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i -- id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i -- id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i + - id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i + - id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i + - id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_name - type: Other - description: anitbody name -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence + - id: antibody_name + type: Other + description: anitbody name + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: -- |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 352944dc2..734708dbb 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From a52db119adc474f629213fda9aeb55fa7d5fb6c7 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 01:08:56 +0200 Subject: [PATCH 13/28] Update meta.yaml --- data/tap/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 7c9844c4d..8d83d7dbf 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,4 +1,3 @@ ---- name: tap description: |- Immunogenicity, instability, self-association, From 87dc335bb09298cab4061d86834886040bf5b7e3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 23:09:04 +0000 Subject: [PATCH 14/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/tap/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 8d83d7dbf..7c9844c4d 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,3 +1,4 @@ +--- name: tap description: |- Immunogenicity, instability, self-association, From 40ad39665fb1223a335d92dcb57710315888f356 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 01:09:15 +0200 Subject: [PATCH 15/28] Update meta.yaml --- data/sabdab_chen/meta.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index cf19dcddb..faca55a65 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,4 +1,3 @@ ---- name: sabdab_chen description: |- Antibody data from Chen et al, where they process from the SAbDab. From d626e3fbb813ce82b099628401571bdca23612d6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 23:09:23 +0000 Subject: [PATCH 16/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/sabdab_chen/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index faca55a65..cf19dcddb 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,3 +1,4 @@ +--- name: sabdab_chen description: |- Antibody data from Chen et al, where they process from the SAbDab. From 5067c904f800ad1a08dcc525f0f22fb18d9a01b3 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sat, 25 Mar 2023 01:11:27 +0200 Subject: [PATCH 17/28] Add files via upload --- data/tap/meta.yaml | 199 +++++++++++++++++++++--------------------- data/tap/transform.py | 10 +-- 2 files changed, 104 insertions(+), 105 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 7c9844c4d..ae57d425b 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,109 +1,108 @@ ---- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: - - id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i - - id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i - - id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i +- id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i +- id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i +- id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_name - type: Other - description: anitbody name - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence +- id: antibody_name + type: Other + description: anitbody name +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: - - url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source +- url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: - - |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} +- |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 734708dbb..352944dc2 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 21e3301b4a1290c3961c70cd551053d45042bf7c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 23:11:34 +0000 Subject: [PATCH 18/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/tap/meta.yaml | 199 +++++++++++++++++++++--------------------- data/tap/transform.py | 10 +-- 2 files changed, 105 insertions(+), 104 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index ae57d425b..7c9844c4d 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,108 +1,109 @@ +--- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: -- id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i -- id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i -- id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i + - id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i + - id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i + - id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_name - type: Other - description: anitbody name -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence + - id: antibody_name + type: Other + description: anitbody name + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: -- |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 352944dc2..734708dbb 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From c6a05c26a884adfe85a70f86456369de8b84acdf Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sun, 26 Mar 2023 01:32:36 +0200 Subject: [PATCH 19/28] Add files via upload --- data/sabdab_chen/meta.yaml | 132 +++++++++++----------- data/sabdab_chen/transform.py | 10 +- data/tap/meta.yaml | 199 +++++++++++++++++----------------- data/tap/transform.py | 10 +- 4 files changed, 175 insertions(+), 176 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index cf19dcddb..72d072244 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,75 +1,75 @@ ---- name: sabdab_chen description: |- - Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies that - satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data - Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, - and 3.have crystal structures with resolution < 3 A. The DI label is derived - from BIOVIA's pipelines. + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3.have crystal structures with resolution < 3 A. The DI label is derived + from BIOVIA's pipelines. targets: - - id: developability - description: functional antibody candidate to be developed into a manufacturable(1), or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i +- id: developability + description: functional antibody candidate to be developed into a manufacturable(1), + or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_pdb_ID - type: Other - description: anitbody pdb id - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA +- id: antibody_pdb_ID + type: Other + description: anitbody pdb id +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: - - url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication - - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used - - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source +- url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication +- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used +- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: - - |- - @article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning} - - |- - @article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {SAbDab: the structural antibody database}, - journal = {Nucleic Acids Research} +- |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} +- |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 3860a1aba..2f5c52af6 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -64,11 +64,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 7c9844c4d..ae57d425b 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,109 +1,108 @@ ---- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: - - id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i - - id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i - - id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i +- id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i +- id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i +- id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_name - type: Other - description: anitbody name - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence +- id: antibody_name + type: Other + description: anitbody name +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: - - url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source +- url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: - - |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} +- |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 734708dbb..352944dc2 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 94e42edc67995988a796b632a3d5eeb3bf2c4cd1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 25 Mar 2023 23:32:43 +0000 Subject: [PATCH 20/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- data/sabdab_chen/meta.yaml | 132 +++++++++++----------- data/sabdab_chen/transform.py | 10 +- data/tap/meta.yaml | 199 +++++++++++++++++----------------- data/tap/transform.py | 10 +- 4 files changed, 176 insertions(+), 175 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index 72d072244..cf19dcddb 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,75 +1,75 @@ +--- name: sabdab_chen description: |- - Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies that - satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data - Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, - and 3.have crystal structures with resolution < 3 A. The DI label is derived - from BIOVIA's pipelines. + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3.have crystal structures with resolution < 3 A. The DI label is derived + from BIOVIA's pipelines. targets: -- id: developability - description: functional antibody candidate to be developed into a manufacturable(1), - or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i + - id: developability + description: functional antibody candidate to be developed into a manufacturable(1), or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_pdb_ID - type: Other - description: anitbody pdb id -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA + - id: antibody_pdb_ID + type: Other + description: anitbody pdb id + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: -- url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication -- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used -- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source + - url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication + - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used + - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: -- |- - @article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning} -- |- - @article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {SAbDab: the structural antibody database}, - journal = {Nucleic Acids Research} + - |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} + - |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 2f5c52af6..3860a1aba 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -64,11 +64,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index ae57d425b..7c9844c4d 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,108 +1,109 @@ +--- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: -- id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i -- id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i -- id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i + - id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i + - id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i + - id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i + - id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_name - type: Other - description: anitbody name -- id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence -- id: light_chain - type: Other - description: anitbody light chain amino acid sequence + - id: antibody_name + type: Other + description: anitbody name + - id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence + - id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: -- |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 352944dc2..734708dbb 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From 39744aa7a1f48efe3d405ee7815c2cadafa537c7 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Sun, 26 Mar 2023 22:16:11 +0200 Subject: [PATCH 21/28] Add files via upload --- data/sabdab_chen/meta.yaml | 132 +++++++++++----------- data/sabdab_chen/transform.py | 10 +- data/tap/meta.yaml | 199 +++++++++++++++++----------------- data/tap/transform.py | 10 +- 4 files changed, 175 insertions(+), 176 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index cf19dcddb..72d072244 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,75 +1,75 @@ ---- name: sabdab_chen description: |- - Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies that - satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data - Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, - and 3.have crystal structures with resolution < 3 A. The DI label is derived - from BIOVIA's pipelines. + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3.have crystal structures with resolution < 3 A. The DI label is derived + from BIOVIA's pipelines. targets: - - id: developability - description: functional antibody candidate to be developed into a manufacturable(1), or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i +- id: developability + description: functional antibody candidate to be developed into a manufacturable(1), + or not(0) + units: '' + type: categorical + names: + - antibody developability + - monoclonal anitbody + - functional antibody candidate + - manufacturable, stable, safe, and effective antibody drug + uris: + - https://rb.gy/idkdqp + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_pdb_ID - type: Other - description: anitbody pdb id - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence in FASTA - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence in FASTA +- id: antibody_pdb_ID + type: Other + description: anitbody pdb id +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence in FASTA +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: - - url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication - - url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication - - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used - - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source +- url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication +- url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication +- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used +- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: - - |- - @article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning} - - |- - @article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {SAbDab: the structural antibody database}, - journal = {Nucleic Acids Research} +- |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} +- |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 3860a1aba..2f5c52af6 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -64,11 +64,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 7c9844c4d..ae57d425b 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,109 +1,108 @@ ---- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: - - id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i - - id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i - - id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i - - id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i +- id: CDR_Length + description: CDR Complementarity-determining regions length + units: '' + type: continuous + names: + - Antibody Complementarity-determining regions length + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/s9gv88 + - https://rb.gy/km77hq + - https://rb.gy/b8cx8i +- id: PSH + description: patches of surface hydrophobicity + units: '' + type: continuous + names: + - antibody patches of surface hydrophobicity + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/bchhaa + - https://rb.gy/2irr4l + - https://rb.gy/b8cx8i +- id: PPC + description: patches of positive charge + units: '' + type: continuous + names: + - patches of positive charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: PNC + description: patches of negative charge + units: '' + type: continuous + names: + - anitbody patches of negative charge + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/b8cx8i +- id: SFvCSP + description: structural Fv charge symmetry parameter + units: '' + type: continuous + names: + - antibody structural Fv charge symmetry parameter + - Therapeutic Antibody Profiler + - antibody developability + - monoclonal anitbody + uris: + - https://rb.gy/uxyhc3 + - https://rb.gy/b8cx8i benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split +- name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: - - id: antibody_name - type: Other - description: anitbody name - - id: heavy_chain - type: Other - description: anitbody heavy chain amino acid sequence - - id: light_chain - type: Other - description: anitbody light chain amino acid sequence +- id: antibody_name + type: Other + description: anitbody name +- id: heavy_chain + type: Other + description: anitbody heavy chain amino acid sequence +- id: light_chain + type: Other + description: anitbody light chain amino acid sequence license: CC BY 4.0 links: - - url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source +- url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication +- url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: - - |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} +- |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 734708dbb..352944dc2 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -176,11 +176,11 @@ def get_and_transform_data(): }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { From d5d51877642f038b2937a1aecf61f393c68efe5f Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Mon, 27 Mar 2023 17:06:08 +0200 Subject: [PATCH 22/28] Add files via upload --- data/sabdab_chen/meta.yaml | 9 +++++++++ data/sabdab_chen/transform.py | 12 ++++++++++++ data/tap/meta.yaml | 10 ++++++++++ data/tap/transform.py | 13 +++++++++++++ 4 files changed, 44 insertions(+) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index 72d072244..2d34f75b2 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -27,12 +27,21 @@ benchmarks: identifiers: - id: antibody_pdb_ID type: Other + names: + - pdb id + - Protein Data Bank id description: anitbody pdb id - id: heavy_chain type: Other + names: + - Fastq + - gene sequence description: anitbody heavy chain amino acid sequence in FASTA - id: light_chain type: Other + names: + - Fastq + - gene sequence description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 2f5c52af6..faddfbaaa 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -74,16 +74,28 @@ def get_and_transform_data(): { "id": "antibody_pdb_ID", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names":[ + "pdb id", + "Protein Data Bank id", + ], "description": "anitbody pdb id", # description (optional, except for "Other") }, { "id": "heavy_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names":[ + "Fastq", + "gene sequence", + ], "description": "anitbody heavy chain amino acid sequence in FASTA", # description (optional, except for "Other") }, { "id": "light_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names":[ + "Fastq", + "gene sequence", + ], "description": "anitbody light chain amino acid sequence in FASTA", # description (optional, except for "Other") }, ], diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index ae57d425b..bb2f1f58d 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -76,12 +76,22 @@ benchmarks: identifiers: - id: antibody_name type: Other + names: + - Name of the antibody + - Name of the antibody drug + - Name of drug description: anitbody name - id: heavy_chain type: Other + names: + - Fastq + - gene sequence description: anitbody heavy chain amino acid sequence - id: light_chain type: Other + names: + - Fastq + - gene sequence description: anitbody light chain amino acid sequence license: CC BY 4.0 links: diff --git a/data/tap/transform.py b/data/tap/transform.py index 352944dc2..4dacd2c92 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -186,16 +186,29 @@ def get_and_transform_data(): { "id": "antibody_name", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names":[ + "Name of the antibody", + "Name of the antibody drug", + "Name of drug" + ], "description": "anitbody name", # description (optional, except for "Other") }, { "id": "heavy_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names":[ + "Fastq", + "gene sequence", + ], "description": "anitbody heavy chain amino acid sequence", # description (optional, except for "Other") }, { "id": "light_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" + "names":[ + "Fastq", + "gene sequence", + ], "description": "anitbody light chain amino acid sequence", # description (optional, except for "Other") }, ], From c583f3b042dd2b897e3727893a8f6077ca044208 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Mon, 27 Mar 2023 17:13:20 +0200 Subject: [PATCH 23/28] Add files via upload --- data/sabdab_chen/transform.py | 1 - data/tap/transform.py | 11 +++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index faddfbaaa..3af7f9737 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -5,7 +5,6 @@ def get_and_transform_data(): # get raw data - target_folder = "SAbDab_Chen" target_subfolder = "SAbDab_Chen" data = Develop(name=target_subfolder) diff --git a/data/tap/transform.py b/data/tap/transform.py index 4dacd2c92..883393d03 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -6,7 +6,6 @@ def get_and_transform_data(): # get raw data - target_folder = "TAP" target_subfolder = "TAP" label_list = retrieve_label_name_list(target_subfolder) data = Develop(name=target_subfolder, label_name=label_list[0]) @@ -96,7 +95,7 @@ def get_and_transform_data(): "targets": [ { "id": "CDR_Length", # name of the column in a tabular dataset - "description": "CDR Complementarity-determining regions length", # description of what this column means + "description": "CDR Complementarity-determining regions length", "units": "", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) @@ -113,7 +112,7 @@ def get_and_transform_data(): }, { "id": "PSH", # name of the column in a tabular dataset - "description": "patches of surface hydrophobicity", # description of what this column means + "description": "patches of surface hydrophobicity", "units": "", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) @@ -130,7 +129,7 @@ def get_and_transform_data(): }, { "id": "PPC", # name of the column in a tabular dataset - "description": "patches of positive charge", # description of what this column means + "description": "patches of positive charge", "units": "", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) @@ -145,7 +144,7 @@ def get_and_transform_data(): }, { "id": "PNC", # name of the column in a tabular dataset - "description": "patches of negative charge", # description of what this column means + "description": "patches of negative charge", "units": "", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) @@ -160,7 +159,7 @@ def get_and_transform_data(): }, { "id": "SFvCSP", # name of the column in a tabular dataset - "description": "structural Fv charge symmetry parameter", # description of what this column means + "description": "structural Fv charge symmetry parameter", "units": "", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) From d7a6192184ed0c850af396f3abd0be46b2296fd6 Mon Sep 17 00:00:00 2001 From: Mohamed Abd Elaleem <109590482+phalem@users.noreply.github.com> Date: Wed, 29 Mar 2023 03:25:35 +0200 Subject: [PATCH 24/28] Add benchmark to transform.py and remove for other --- data/sabdab_chen/transform.py | 25 +++++++++++++++++++------ data/tap/meta.yaml | 4 ---- data/tap/transform.py | 7 ------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 3af7f9737..1d2f5aeca 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -6,12 +6,25 @@ def get_and_transform_data(): # get raw data target_subfolder = "SAbDab_Chen" - data = Develop(name=target_subfolder) + splits = Develop(name=target_subfolder).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + df = pd.concat([df_train, df_valid, df_test], axis=0) + + fn_data_raw = "data_raw.csv" + df.to_csv(fn_data_raw, index=False) + del df # proceed raw data - df = data.get_data() + df = pd.read_csv(fn_data_raw, sep=",") + + fields_orig = df.columns.tolist() - assert fields_orig == ["Antibody_ID", "Antibody", "Y"] + assert fields_orig == ["Antibody_ID", "Antibody", "Y","split"] fn_data_original = "data_original.csv" @@ -21,14 +34,14 @@ def get_and_transform_data(): ) df["heavy_chain"] = [s2l(x)[0] for x in antibody_list] df["light_chain"] = [s2l(x)[1] for x in antibody_list] - df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y"]] + df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y","split"]] df.to_csv(fn_data_original, index=False) # load raw data and assert columns df = pd.read_csv(fn_data_original, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y"] - fields_clean = ["antibody_pdb_ID", "heavy_chain", "light_chain", "developability"] + assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y","split"] + fields_clean = ["antibody_pdb_ID", "heavy_chain", "light_chain", "developability","split"] df.columns = fields_clean assert not df.duplicated().sum() diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index bb2f1f58d..d64b75dad 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -69,10 +69,6 @@ targets: uris: - https://rb.gy/uxyhc3 - https://rb.gy/b8cx8i -benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split identifiers: - id: antibody_name type: Other diff --git a/data/tap/transform.py b/data/tap/transform.py index 883393d03..33ef0b3a6 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -174,13 +174,6 @@ def get_and_transform_data(): ], }, ], - "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, - ], "identifiers": [ { "id": "antibody_name", # column name From 09f589960b1f4b9e50eb9b8a1d5dfd036362362b Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Tue, 18 Apr 2023 19:52:42 +0200 Subject: [PATCH 25/28] feat: sabdab_chen clean up --- data/sabdab_chen/meta.yaml | 149 +++++++++++++++++----------------- data/sabdab_chen/transform.py | 78 +++++++++--------- 2 files changed, 115 insertions(+), 112 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index 2d34f75b2..1ba866bbd 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -1,84 +1,83 @@ +--- name: sabdab_chen description: |- - Antibody data from Chen et al, where they process from the SAbDab. - From an initial dataset of 3816 antibodies, they retained 2426 antibodies that - satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data - Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, - and 3.have crystal structures with resolution < 3 A. The DI label is derived - from BIOVIA's pipelines. + Antibody data from Chen et al, where they process from the SAbDab. + From an initial dataset of 3816 antibodies, they retained 2426 antibodies that + satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data + Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, + and 3. have crystal structures with resolution < 0.3 nm. The DI label is derived + from BIOVIA's pipelines. targets: -- id: developability - description: functional antibody candidate to be developed into a manufacturable(1), - or not(0) - units: '' - type: categorical - names: - - antibody developability - - monoclonal anitbody - - functional antibody candidate - - manufacturable, stable, safe, and effective antibody drug - uris: - - https://rb.gy/idkdqp - - https://rb.gy/b8cx8i + - id: developability + description: functional antibody candidate to be developed into a manufacturable one (1) or not (0) + units: + type: boolean + names: + - developability + - developability of an antibody + - developable antibody + uris: benchmarks: -- name: TDC - link: https://tdcommons.ai/ - split_column: split + - name: TDC + link: https://tdcommons.ai/ + split_column: split identifiers: -- id: antibody_pdb_ID - type: Other - names: - - pdb id - - Protein Data Bank id - description: anitbody pdb id -- id: heavy_chain - type: Other - names: - - Fastq - - gene sequence - description: anitbody heavy chain amino acid sequence in FASTA -- id: light_chain - type: Other - names: - - Fastq - - gene sequence - description: anitbody light chain amino acid sequence in FASTA + - id: antibody_pdb_ID + type: Other + names: + - pdb id + - Protein Data Bank id + description: anitbody pdb id + - id: heavy_chain + type: Other + names: + - amino acid sequence + - heavy chain amino acid sequence + - heavy chain AA sequence + description: anitbody heavy chain amino acid sequence in FASTA + - id: light_chain + type: Other + names: + - amino acid sequence + - light chain amino acid sequence + - light chain AA sequence + description: anitbody light chain amino acid sequence in FASTA license: CC BY 4.0 links: -- url: https://doi.org/10.1101/2020.06.18.159798 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkt1043 - description: corresponding publication -- url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ - description: corresponding tools used -- url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al - description: data source + - url: https://doi.org/10.1101/2020.06.18.159798 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkt1043 + description: corresponding publication + - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/ + description: corresponding tools used + - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al + description: data source num_points: 2409 bibtex: -- |- - @article{Chen2020, - doi = {10.1101/2020.06.18.159798}, - url = {https://doi.org/10.1101/2020.06.18.159798}, - year = {2020}, - month = jun, - publisher = {Cold Spring Harbor Laboratory}, - author = {Xingyao Chen and Thomas Dougherty and - Chan Hong and Rachel Schibler and Yi Cong Zhao and - Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, - title = {Predicting Antibody Developability from Sequence - using Machine Learning} -- |- - @article{Dunbar2013, - doi = {10.1093/nar/gkt1043}, - url = {https://doi.org/10.1093/nar/gkt1043}, - year = {2013}, - month = nov, - publisher = {Oxford University Press ({OUP})}, - volume = {42}, - number = {D1}, - pages = {D1140--D1146}, - author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem - and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and - Charlotte M. Deane}, - title = {SAbDab: the structural antibody database}, - journal = {Nucleic Acids Research} + - |- + @article{Chen2020, + doi = {10.1101/2020.06.18.159798}, + url = {https://doi.org/10.1101/2020.06.18.159798}, + year = {2020}, + month = jun, + publisher = {Cold Spring Harbor Laboratory}, + author = {Xingyao Chen and Thomas Dougherty and + Chan Hong and Rachel Schibler and Yi Cong Zhao and + Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman}, + title = {Predicting Antibody Developability from Sequence + using Machine Learning} + - |- + @article{Dunbar2013, + doi = {10.1093/nar/gkt1043}, + url = {https://doi.org/10.1093/nar/gkt1043}, + year = {2013}, + month = nov, + publisher = {Oxford University Press ({OUP})}, + volume = {42}, + number = {D1}, + pages = {D1140--D1146}, + author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem + and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and + Charlotte M. Deane}, + title = {SAbDab: the structural antibody database}, + journal = {Nucleic Acids Research} diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 1d2f5aeca..20704f489 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -22,26 +22,32 @@ def get_and_transform_data(): # proceed raw data df = pd.read_csv(fn_data_raw, sep=",") - fields_orig = df.columns.tolist() - assert fields_orig == ["Antibody_ID", "Antibody", "Y","split"] + assert fields_orig == ["Antibody_ID", "Antibody", "Y", "split"] fn_data_original = "data_original.csv" antibody_list = df.Antibody.tolist() - s2l = lambda list_string: list( - map(str.strip, list_string.strip("][").replace("'", "").split(",")) - ) + + def s2l(list_string): + return list(map(str.strip, list_string.strip("][").replace("'", "").split(","))) + df["heavy_chain"] = [s2l(x)[0] for x in antibody_list] df["light_chain"] = [s2l(x)[1] for x in antibody_list] - df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y","split"]] + df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y", "split"]] df.to_csv(fn_data_original, index=False) # load raw data and assert columns df = pd.read_csv(fn_data_original, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y","split"] - fields_clean = ["antibody_pdb_ID", "heavy_chain", "light_chain", "developability","split"] + assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y", "split"] + fields_clean = [ + "antibody_pdb_ID", + "heavy_chain", + "light_chain", + "developability", + "split", + ] df.columns = fields_clean assert not df.duplicated().sum() @@ -53,62 +59,60 @@ def get_and_transform_data(): "name": "sabdab_chen", # unique identifier, we will also use this for directory names "description": """Antibody data from Chen et al, where they process from the SAbDab. From an initial dataset of 3816 antibodies, they retained 2426 antibodies that -satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data +satisfy the following criteria: 1.have both sequence (FASTA) and Protein Data Bank (PDB) structure files, 2. contain both a heavy chain and a light chain, -and 3.have crystal structures with resolution < 3 A. The DI label is derived +and 3. have crystal structures with resolution < 0.3 nm. The DI label is derived from BIOVIA's pipelines.""", "targets": [ { "id": "developability", # name of the column in a tabular dataset - "description": "functional antibody candidate to be developed into a manufacturable(1), or not(0)", - "units": "", # units of the values in this column (leave empty if unitless) - "type": "categorical", # can be "categorical", "ordinal", "continuous" + "description": "functional antibody candidate to be developed into a manufacturable one (1) or not (0)", + "units": None, # units of the values in this column (leave empty if unitless) + "type": "boolean", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "antibody developability", - "monoclonal anitbody", - "functional antibody candidate", - "manufacturable, stable, safe, and effective antibody drug", - ], - "uris": [ - "https://rb.gy/idkdqp", - "https://rb.gy/b8cx8i", + "developability", + "developability of an antibody", + "developable antibody", ], + "uris": None, }, ], "benchmarks": [ - { - "name": "TDC", # unique benchmark name - "link": "https://tdcommons.ai/", # benchmark URL - "split_column": "split", # name of the column that contains the split information - }, + { + "name": "TDC", # unique benchmark name + "link": "https://tdcommons.ai/", # benchmark URL + "split_column": "split", # name of the column that contains the split information + }, ], "identifiers": [ { "id": "antibody_pdb_ID", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names":[ - "pdb id", - "Protein Data Bank id", + "names": [ + "pdb id", + "Protein Data Bank id", ], "description": "anitbody pdb id", # description (optional, except for "Other") }, { "id": "heavy_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names":[ - "Fastq", - "gene sequence", + "names": [ + "amino acid sequence", + "heavy chain amino acid sequence", + "heavy chain AA sequence", ], - "description": "anitbody heavy chain amino acid sequence in FASTA", # description (optional, except for "Other") + "description": "anitbody heavy chain amino acid sequence in FASTA", }, { "id": "light_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names":[ - "Fastq", - "gene sequence", + "names": [ + "amino acid sequence", + "light chain amino acid sequence", + "light chain AA sequence", ], - "description": "anitbody light chain amino acid sequence in FASTA", # description (optional, except for "Other") + "description": "anitbody light chain amino acid sequence in FASTA", }, ], "license": "CC BY 4.0", # license under which the original dataset was published From f141055df56d174195f37a137166906bdc97fa45 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Wed, 19 Apr 2023 13:44:25 +0200 Subject: [PATCH 26/28] feat: tap clean up train split setup --- data/tap/meta.yaml | 215 +++++++++++++++++++++--------------------- data/tap/transform.py | 183 ++++++++++++++++++++--------------- 2 files changed, 217 insertions(+), 181 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index d64b75dad..71df08444 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -1,114 +1,117 @@ +--- name: tap description: |- - Immunogenicity, instability, self-association, - high viscosity, polyspecificity, or poor expression can all preclude - an antibody from becoming a therapeutic. Early identification of these - negative characteristics is essential. Akin to the Lipinski guidelines, - which measure druglikeness in small molecules, - Therapeutic Antibody Profiler (TAP) highlights antibodies - that possess characteristics that are rare/unseen in - clinical-stage mAb therapeutics. + Immunogenicity, instability, self-association, + high viscosity, polyspecificity, or poor expression can all preclude + an antibody from becoming a therapeutic. Early identification of these + negative characteristics is essential. Akin to the Lipinski guidelines, + which measure druglikeness in small molecules, + Therapeutic Antibody Profiler (TAP) highlights antibodies + that possess characteristics that are rare/unseen in + clinical-stage mAb therapeutics. targets: -- id: CDR_Length - description: CDR Complementarity-determining regions length - units: '' - type: continuous - names: - - Antibody Complementarity-determining regions length - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/s9gv88 - - https://rb.gy/km77hq - - https://rb.gy/b8cx8i -- id: PSH - description: patches of surface hydrophobicity - units: '' - type: continuous - names: - - antibody patches of surface hydrophobicity - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/bchhaa - - https://rb.gy/2irr4l - - https://rb.gy/b8cx8i -- id: PPC - description: patches of positive charge - units: '' - type: continuous - names: - - patches of positive charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: PNC - description: patches of negative charge - units: '' - type: continuous - names: - - anitbody patches of negative charge - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/b8cx8i -- id: SFvCSP - description: structural Fv charge symmetry parameter - units: '' - type: continuous - names: - - antibody structural Fv charge symmetry parameter - - Therapeutic Antibody Profiler - - antibody developability - - monoclonal anitbody - uris: - - https://rb.gy/uxyhc3 - - https://rb.gy/b8cx8i + - id: CDR_Length + description: complementarity-determining regions (CDR) length + units: amino acids + type: continuous + names: + - antibody complementarity-determining regions length + - antibody complementarity-determining regions (CDR) length + - antibody CDR length + - complementarity-determining regions (CDR) length + - complementarity-determining regions length + - CDR length + uris: + - id: PSH + description: patches of surface hydrophobicity (PSH) score + units: + type: continuous + names: + - antibody patches of surface hydrophobicity (PSH) score + - antibody patches of surface hydrophobicity score + - antibody PSH score + - patches of surface hydrophobicity (PSH) score + - patches of surface hydrophobicity score + - PSH score + uris: + - id: PPC + description: patches of positive charge (PPC) score + units: + type: continuous + names: + - antibody patches of positive charge (PPC) score + - antibody patches of positive charge score + - antibody PPC score + - patches of positive charge (PPC) score + - patches of positive charge score + - PPC score + uris: + - id: PNC + description: patches of negative charge (PNC) score + units: + type: continuous + names: + - antibody patches of negative charge (PNC) score + - antibody patches of negative charge score + - antibody PNC score + - patches of negative charge (PNC) score + - patches of negative charge score + - PNC score + uris: + - id: SFvCSP + description: structural Fv charge symmetry parameter (SFvCSP) score + units: + type: continuous + names: + - antibody structural Fv charge symmetry parameter (SFvCSP) score + - antibody structural Fv charge symmetry parameter score + - antibody SFvCSP score + - structural Fv charge symmetry parameter (SFvCSP) score + - structural Fv charge symmetry parameter score + - SFvCSP score + uris: identifiers: -- id: antibody_name - type: Other - names: - - Name of the antibody - - Name of the antibody drug - - Name of drug - description: anitbody name -- id: heavy_chain - type: Other - names: - - Fastq - - gene sequence - description: anitbody heavy chain amino acid sequence -- id: light_chain - type: Other - names: - - Fastq - - gene sequence - description: anitbody light chain amino acid sequence + - id: antibody_name + type: Other + names: + - antibody name + - name of the antibody + - name of the antibody drug + description: antibody name + - id: heavy_chain + type: Other + names: + - amino acid sequence + - heavy chain amino acid sequence + - heavy chain AA sequence + description: antibody heavy chain amino acid sequence + - id: light_chain + type: Other + names: + - amino acid sequence + - light chain amino acid sequence + - light chain AA sequence + description: antibody light chain amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1073/pnas.1810576116 - description: corresponding publication -- url: https://tdcommons.ai/single_pred_tasks/develop/#tap - description: data source + - url: https://doi.org/10.1073/pnas.1810576116 + description: corresponding publication + - url: https://tdcommons.ai/single_pred_tasks/develop/#tap + description: data source num_points: 241 bibtex: -- |- - @article{Raybould2019, - doi = {10.1073/pnas.1810576116}, - url = {https://doi.org/10.1073/pnas.1810576116}, - year = {2019}, - month = feb, - publisher = {Proceedings of the National Academy of Sciences}, - volume = {116}, - number = {10}, - pages = {4025--4030}, - author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk - and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek - and Jiye Shi and Charlotte M. Deane}, - title = {Five computational developability guidelines for therapeutic antibody profiling}, - journal = {Proceedings of the National Academy of Sciences} + - |- + @article{Raybould2019, + doi = {10.1073/pnas.1810576116}, + url = {https://doi.org/10.1073/pnas.1810576116}, + year = {2019}, + month = feb, + publisher = {Proceedings of the National Academy of Sciences}, + volume = {116}, + number = {10}, + pages = {4025--4030}, + author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk + and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek + and Jiye Shi and Charlotte M. Deane}, + title = {Five computational developability guidelines for therapeutic antibody profiling}, + journal = {Proceedings of the National Academy of Sciences} diff --git a/data/tap/transform.py b/data/tap/transform.py index 33ef0b3a6..80b20a3ee 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -8,15 +8,47 @@ def get_and_transform_data(): # get raw data target_subfolder = "TAP" label_list = retrieve_label_name_list(target_subfolder) - data = Develop(name=target_subfolder, label_name=label_list[0]) + df = pd.DataFrame() + for i, label in enumerate(label_list): + print(f"Get data subset {label}:") + splits = Develop(name=target_subfolder, label_name=label).get_split() + df_train = splits["train"] + df_valid = splits["valid"] + df_test = splits["test"] + df_train["split"] = "train" + df_valid["split"] = "valid" + df_test["split"] = "test" + df_cat = pd.concat([df_train, df_valid, df_test], axis=0) + assert df_cat.columns.tolist() == ["Antibody_ID", "Antibody", "Y", "split"] + df_cat.columns = ["Antibody_ID", "Antibody", label, "split"] + if i > 0: + df = pd.merge(df, df_cat, on=["Antibody_ID", "Antibody", "split"]) + else: + df = df_cat + + fn_data_raw = "data_raw.csv" + df.to_csv(fn_data_raw, index=False) + del df + # proceed raw data - df = pd.read_csv("data/tap.tab", sep="\t") + df = pd.read_csv(fn_data_raw, sep=",") fields_orig = df.columns.tolist() - assert fields_orig == ["X", "ID", "CDR_Length", "PSH", "PPC", "PNC", "SFvCSP"] + + assert fields_orig == [ + "Antibody_ID", + "Antibody", + "CDR_Length", + "split", + "PSH", + "PPC", + "PNC", + "SFvCSP", + ] fields_clean = [ - "antibody_two_sequences", "antibody_name", + "antibody_sequences", "CDR_Length", + "split", "PSH", "PPC", "PNC", @@ -24,13 +56,14 @@ def get_and_transform_data(): ] df.columns = fields_clean # convert list columns to two columns - antibody_list = df.antibody_two_sequences.tolist() - s2l = lambda list_string: list( - map(str.strip, list_string.strip("][").replace("'", "").split(",")) - ) - antibody2list = lambda list_string: [ - x.strip() for x in s2l(list_string)[0].split("\\n") - ] + antibody_list = df.antibody_sequences.tolist() + + def s2l(list_string): + return list(map(str.strip, list_string.strip("][").replace("'", "").split(","))) + + def antibody2list(list_string): + return [x.strip() for x in s2l(list_string)[0].split("\\n")] + df["heavy_chain"] = [antibody2list(x)[0] for x in antibody_list] df["light_chain"] = [antibody2list(x)[1] for x in antibody_list] fn_data_original = "data_original.csv" @@ -40,9 +73,10 @@ def get_and_transform_data(): df = pd.read_csv(fn_data_original, sep=",") fields_orig = df.columns.tolist() assert fields_orig == [ - "antibody_two_sequences", "antibody_name", + "antibody_sequences", "CDR_Length", + "split", "PSH", "PPC", "PNC", @@ -57,6 +91,7 @@ def get_and_transform_data(): "heavy_chain", "light_chain", "CDR_Length", + "split", "PSH", "PPC", "PNC", @@ -68,6 +103,7 @@ def get_and_transform_data(): "heavy_chain", "light_chain", "CDR_Length", + "split", "PSH", "PPC", "PNC", @@ -95,113 +131,110 @@ def get_and_transform_data(): "targets": [ { "id": "CDR_Length", # name of the column in a tabular dataset - "description": "CDR Complementarity-determining regions length", - "units": "", # units of the values in this column (leave empty if unitless) + "description": "complementarity-determining regions (CDR) length", + "units": "amino acids", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "Antibody Complementarity-determining regions length", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/s9gv88", - "https://rb.gy/km77hq", - "https://rb.gy/b8cx8i", + "antibody complementarity-determining regions length", + "antibody complementarity-determining regions (CDR) length", + "antibody CDR length", + "complementarity-determining regions (CDR) length", + "complementarity-determining regions length", + "CDR length", ], + "uris": None, }, { "id": "PSH", # name of the column in a tabular dataset - "description": "patches of surface hydrophobicity", - "units": "", # units of the values in this column (leave empty if unitless) + "description": "patches of surface hydrophobicity (PSH) score", + "units": None, # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "antibody patches of surface hydrophobicity", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/bchhaa", - "https://rb.gy/2irr4l", - "https://rb.gy/b8cx8i", + "antibody patches of surface hydrophobicity (PSH) score", + "antibody patches of surface hydrophobicity score", + "antibody PSH score", + "patches of surface hydrophobicity (PSH) score", + "patches of surface hydrophobicity score", + "PSH score", ], + "uris": None, }, { "id": "PPC", # name of the column in a tabular dataset - "description": "patches of positive charge", - "units": "", # units of the values in this column (leave empty if unitless) + "description": "patches of positive charge (PPC) score", + "units": None, # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "patches of positive charge", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/b8cx8i", + "antibody patches of positive charge (PPC) score", + "antibody patches of positive charge score", + "antibody PPC score", + "patches of positive charge (PPC) score", + "patches of positive charge score", + "PPC score", ], + "uris": None, }, { "id": "PNC", # name of the column in a tabular dataset - "description": "patches of negative charge", - "units": "", # units of the values in this column (leave empty if unitless) + "description": "patches of negative charge (PNC) score", + "units": None, # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "anitbody patches of negative charge", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/b8cx8i", + "antibody patches of negative charge (PNC) score", + "antibody patches of negative charge score", + "antibody PNC score", + "patches of negative charge (PNC) score", + "patches of negative charge score", + "PNC score", ], + "uris": None, }, { "id": "SFvCSP", # name of the column in a tabular dataset - "description": "structural Fv charge symmetry parameter", - "units": "", # units of the values in this column (leave empty if unitless) + "description": "structural Fv charge symmetry parameter (SFvCSP) score", + "units": None, # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "antibody structural Fv charge symmetry parameter", - "Therapeutic Antibody Profiler", - "antibody developability", - "monoclonal anitbody", - ], - "uris": [ - "https://rb.gy/uxyhc3", - "https://rb.gy/b8cx8i", + "antibody structural Fv charge symmetry parameter (SFvCSP) score", + "antibody structural Fv charge symmetry parameter score", + "antibody SFvCSP score", + "structural Fv charge symmetry parameter (SFvCSP) score", + "structural Fv charge symmetry parameter score", + "SFvCSP score", ], + "uris": None, }, ], "identifiers": [ { "id": "antibody_name", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names":[ - "Name of the antibody", - "Name of the antibody drug", - "Name of drug" + "names": [ + "antibody name", + "name of the antibody", + "name of the antibody drug", ], - "description": "anitbody name", # description (optional, except for "Other") + "description": "antibody name", # description (optional, except for "Other") }, { "id": "heavy_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names":[ - "Fastq", - "gene sequence", + "names": [ + "amino acid sequence", + "heavy chain amino acid sequence", + "heavy chain AA sequence", ], - "description": "anitbody heavy chain amino acid sequence", # description (optional, except for "Other") + "description": "antibody heavy chain amino acid sequence", # description (optional, except for "Other") }, { "id": "light_chain", # column name "type": "Other", # can be "SMILES", "SELFIES", "IUPAC", "Other" - "names":[ - "Fastq", - "gene sequence", + "names": [ + "amino acid sequence", + "light chain amino acid sequence", + "light chain AA sequence", ], - "description": "anitbody light chain amino acid sequence", # description (optional, except for "Other") + "description": "antibody light chain amino acid sequence", # description (optional, except for "Other") }, ], "license": "CC BY 4.0", # license under which the original dataset was published From 9ba3e1c66051fa3a87820eb178bd66ad0852c8e9 Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Thu, 27 Apr 2023 14:21:16 +0200 Subject: [PATCH 27/28] feat: update new names setup for sabdab_chen --- data/sabdab_chen/meta.yaml | 5 ++--- data/sabdab_chen/transform.py | 7 ++++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml index 1ba866bbd..5166a66e7 100644 --- a/data/sabdab_chen/meta.yaml +++ b/data/sabdab_chen/meta.yaml @@ -13,9 +13,8 @@ targets: units: type: boolean names: - - developability - - developability of an antibody - - developable antibody + - noun: functional antibody candidate to be developed into a manufacturable one + - noun: manufacturable and functional antibody candidate uris: benchmarks: - name: TDC diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py index 20704f489..044487609 100644 --- a/data/sabdab_chen/transform.py +++ b/data/sabdab_chen/transform.py @@ -70,9 +70,10 @@ def s2l(list_string): "units": None, # units of the values in this column (leave empty if unitless) "type": "boolean", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "developability", - "developability of an antibody", - "developable antibody", + { + "noun": "functional antibody candidate to be developed into a manufacturable one" + }, + {"noun": "manufacturable and functional antibody candidate"}, ], "uris": None, }, From aebcfd7cb7f3749c9c05f22466f74d7093e08f9d Mon Sep 17 00:00:00 2001 From: Michael Pieler Date: Thu, 27 Apr 2023 14:23:14 +0200 Subject: [PATCH 28/28] feat: update new names setup for tap --- data/tap/meta.yaml | 12 ++++++------ data/tap/transform.py | 14 ++++++++------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml index 71df08444..cb3cc5d2c 100644 --- a/data/tap/meta.yaml +++ b/data/tap/meta.yaml @@ -15,12 +15,12 @@ targets: units: amino acids type: continuous names: - - antibody complementarity-determining regions length - - antibody complementarity-determining regions (CDR) length - - antibody CDR length - - complementarity-determining regions (CDR) length - - complementarity-determining regions length - - CDR length + - noun: antibody complementarity-determining regions length + - noun: antibody complementarity-determining regions (CDR) length + - noun: antibody CDR length + - noun: complementarity-determining regions (CDR) length + - noun: complementarity-determining regions length + - noun: CDR length uris: - id: PSH description: patches of surface hydrophobicity (PSH) score diff --git a/data/tap/transform.py b/data/tap/transform.py index 80b20a3ee..bcc9feabe 100644 --- a/data/tap/transform.py +++ b/data/tap/transform.py @@ -135,12 +135,14 @@ def antibody2list(list_string): "units": "amino acids", # units of the values in this column (leave empty if unitless) "type": "continuous", # can be "categorical", "ordinal", "continuous" "names": [ # names for the property (to sample from for building the prompts) - "antibody complementarity-determining regions length", - "antibody complementarity-determining regions (CDR) length", - "antibody CDR length", - "complementarity-determining regions (CDR) length", - "complementarity-determining regions length", - "CDR length", + {"noun": "antibody complementarity-determining regions length"}, + { + "noun": "antibody complementarity-determining regions (CDR) length" + }, + {"noun": "antibody CDR length"}, + {"noun": "complementarity-determining regions (CDR) length"}, + {"noun": "complementarity-determining regions length"}, + {"noun": "CDR length"}, ], "uris": None, },