OpenBioML · phalem · Mar 11, 2023 · Mar 11, 2023 · Mar 24, 2023 · Mar 24, 2023
diff --git a/data/sabdab_chen/meta.yaml b/data/sabdab_chen/meta.yaml
@@ -0,0 +1,82 @@
+---
+name: sabdab_chen
+description: |-
+    Antibody data from Chen et al, where they process from the SAbDab.
+    From an initial dataset of 3816 antibodies, they retained 2426 antibodies that
+    satisfy the following criteria: 1. have both sequence (FASTA) and Protein Data
+    Bank (PDB) structure files, 2. contain both a heavy chain and a light chain,
+    and 3. have crystal structures with resolution < 0.3 nm. The DI label is derived
+    from BIOVIA's pipelines.
+targets:
+    - id: developability
+      description: functional antibody candidate to be developed into a manufacturable one (1) or not (0)
+      units:
+      type: boolean
+      names:
+          - noun: functional antibody candidate to be developed into a manufacturable one
+          - noun: manufacturable and functional antibody candidate
+      uris:
+benchmarks:
+    - name: TDC
+      link: https://tdcommons.ai/
+      split_column: split
+identifiers:
+    - id: antibody_pdb_ID
+      type: Other
+      names:
+          - pdb id
+          - Protein Data Bank id
+      description: antibody pdb id
+    - id: heavy_chain
+      type: Other
+      names:
+          - amino acid sequence
+          - heavy chain amino acid sequence
+          - heavy chain AA sequence
+      description: antibody heavy chain amino acid sequence in FASTA
+    - id: light_chain
+      type: Other
+      names:
+          - amino acid sequence
+          - light chain amino acid sequence
+          - light chain AA sequence
+      description: antibody light chain amino acid sequence in FASTA
+license: CC BY 4.0
+links:
+    - url: https://doi.org/10.1101/2020.06.18.159798
+      description: corresponding publication
+    - url: https://doi.org/10.1093/nar/gkt1043
+      description: corresponding publication
+    - url: https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/
+      description: corresponding tools used
+    - url: https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al
+      description: data source
+num_points: 2409
+bibtex:
+    - |-
+      @article{Chen2020,
+      doi = {10.1101/2020.06.18.159798},
+      url = {https://doi.org/10.1101/2020.06.18.159798},
+      year = {2020},
+      month = jun,
+      publisher = {Cold Spring Harbor Laboratory},
+      author = {Xingyao Chen and Thomas Dougherty and
+      Chan Hong and Rachel Schibler and Yi Cong Zhao and
+      Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman},
+      title = {Predicting Antibody Developability from Sequence
+      using Machine Learning}}
+    - |-
+      @article{Dunbar2013,
+      doi = {10.1093/nar/gkt1043},
+      url = {https://doi.org/10.1093/nar/gkt1043},
+      year = {2013},
+      month = nov,
+      publisher = {Oxford University Press ({OUP})},
+      volume = {42},
+      number = {D1},
+      pages = {D1140--D1146},
+      author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem
+      and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and
+      Charlotte M. Deane},
+      title = {SAbDab: the structural antibody database},
+      journal = {Nucleic Acids Research}}
diff --git a/data/sabdab_chen/transform.py b/data/sabdab_chen/transform.py
@@ -0,0 +1,199 @@
+import pandas as pd
+import yaml
+from tdc.single_pred import Develop
+
+
+def get_and_transform_data():
+    """Load the TDC SAbDab_Chen dataset and write its CSV files and meta.yaml.
+
+    Files written to the current working directory:
+    - data_raw.csv: train/valid/test splits concatenated, with a "split" column
+    - data_original.csv: the "Antibody" column split into heavy and light chains
+    - data_clean.csv: the same table with the final column names
+    - meta.yaml: description, targets, identifiers, links and references
+    """
+    # get raw data
+    target_subfolder = "SAbDab_Chen"
+    splits = Develop(name=target_subfolder).get_split()
+    df_train = splits["train"]
+    df_valid = splits["valid"]
+    df_test = splits["test"]
+    df_train["split"] = "train"
+    df_valid["split"] = "valid"
+    df_test["split"] = "test"
+    df = pd.concat([df_train, df_valid, df_test], axis=0)
+
+    fn_data_raw = "data_raw.csv"
+    df.to_csv(fn_data_raw, index=False)
+    del df
+
+    # process raw data, re-read from the CSV that was just written
+    df = pd.read_csv(fn_data_raw, sep=",")
+
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ["Antibody_ID", "Antibody", "Y", "split"]
+
+    fn_data_original = "data_original.csv"
+
+    antibody_list = df.Antibody.tolist()
+
+    def s2l(list_string):
+        # turn text such as "['AAA', 'BBB']" into ["AAA", "BBB"]
+        return list(map(str.strip, list_string.strip("][").replace("'", "").split(",")))
+
+    # parse each cell once and reuse the result for both chain columns
+    chains = [s2l(x) for x in antibody_list]
+    df["heavy_chain"] = [chain[0] for chain in chains]
+    df["light_chain"] = [chain[1] for chain in chains]
+    df = df[["Antibody_ID", "heavy_chain", "light_chain", "Y", "split"]]
+    df.to_csv(fn_data_original, index=False)
+
+    # load the split-chain data and assert columns
+    df = pd.read_csv(fn_data_original, sep=",")
+    fields_orig = df.columns.tolist()
+    assert fields_orig == ["Antibody_ID", "heavy_chain", "light_chain", "Y", "split"]
+    fields_clean = [
+        "antibody_pdb_ID",
+        "heavy_chain",
+        "light_chain",
+        "developability",
+        "split",
+    ]
+    df.columns = fields_clean
+    assert not df.duplicated().sum()
+
+    # save to csv
+    fn_data_csv = "data_clean.csv"
+    df.to_csv(fn_data_csv, index=False)
+
+    meta = {
+        "name": "sabdab_chen",  # unique identifier, we will also use this for directory names
+        "description": """Antibody data from Chen et al, where they process from the SAbDab.
+From an initial dataset of 3816 antibodies, they retained 2426 antibodies that
+satisfy the following criteria: 1. have both sequence (FASTA) and Protein Data
+Bank (PDB) structure files, 2. contain both a heavy chain and a light chain,
+and 3. have crystal structures with resolution < 0.3 nm. The DI label is derived
+from BIOVIA's pipelines.""",
+        "targets": [
+            {
+                "id": "developability",  # name of the column in a tabular dataset
+                "description": "functional antibody candidate to be developed into a manufacturable one (1) or not (0)",
+                "units": None,  # units of the values in this column (leave empty if unitless)
+                "type": "boolean",  # can be "categorical", "ordinal", "continuous"
+                "names": [  # names for the property (to sample from for building the prompts)
+                    {
+                        "noun": "functional antibody candidate to be developed into a manufacturable one"
+                    },
+                    {"noun": "manufacturable and functional antibody candidate"},
+                ],
+                "uris": None,
+            },
+        ],
+        "benchmarks": [
+            {
+                "name": "TDC",  # unique benchmark name
+                "link": "https://tdcommons.ai/",  # benchmark URL
+                "split_column": "split",  # name of the column that contains the split information
+            },
+        ],
+        "identifiers": [
+            {
+                "id": "antibody_pdb_ID",  # column name
+                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "names": [
+                    "pdb id",
+                    "Protein Data Bank id",
+                ],
+                "description": "antibody pdb id",  # description (optional, except for "Other")
+            },
+            {
+                "id": "heavy_chain",  # column name
+                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "names": [
+                    "amino acid sequence",
+                    "heavy chain amino acid sequence",
+                    "heavy chain AA sequence",
+                ],
+                "description": "antibody heavy chain amino acid sequence in FASTA",
+            },
+            {
+                "id": "light_chain",  # column name
+                "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
+                "names": [
+                    "amino acid sequence",
+                    "light chain amino acid sequence",
+                    "light chain AA sequence",
+                ],
+                "description": "antibody light chain amino acid sequence in FASTA",
+            },
+        ],
+        "license": "CC BY 4.0",  # license under which the original dataset was published
+        "links": [  # list of relevant links (original dataset, other uses, etc.)
+            {
+                "url": "https://doi.org/10.1101/2020.06.18.159798",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://doi.org/10.1093/nar/gkt1043",
+                "description": "corresponding publication",
+            },
+            {
+                "url": "https://www.3ds.com/products-services/biovia/products/data-science/pipeline-pilot/",
+                "description": "corresponding tools used",
+            },
+            {
+                "url": "https://tdcommons.ai/single_pred_tasks/develop/#sabdab-chen-et-al",
+                "description": "data source",
+            },
+        ],
+        "num_points": len(df),  # number of datapoints in this dataset
+        "bibtex": [  # every entry must end with the brace that closes its "@article{"
+            """@article{Chen2020,
+doi = {10.1101/2020.06.18.159798},
+url = {https://doi.org/10.1101/2020.06.18.159798},
+year = {2020},
+month = jun,
+publisher = {Cold Spring Harbor Laboratory},
+author = {Xingyao Chen and Thomas Dougherty and
+Chan Hong and Rachel Schibler and Yi Cong Zhao and
+Reza Sadeghi and Naim Matasci and Yi-Chieh Wu and Ian Kerman},
+title = {Predicting Antibody Developability from Sequence
+using Machine Learning}}""",
+            """@article{Dunbar2013,
+doi = {10.1093/nar/gkt1043},
+url = {https://doi.org/10.1093/nar/gkt1043},
+year = {2013},
+month = nov,
+publisher = {Oxford University Press ({OUP})},
+volume = {42},
+number = {D1},
+pages = {D1140--D1146},
+author = {James Dunbar and Konrad Krawczyk and Jinwoo Leem
+and Terry Baker and Angelika Fuchs and Guy Georges and Jiye Shi and
+Charlotte M. Deane},
+title = {SAbDab: the structural antibody database},
+journal = {Nucleic Acids Research}}""",
+        ],
+    }
+
+    def str_presenter(dumper, data):
+        """configures yaml for dumping multiline strings
+        Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
+        """
+        if data.count("\n") > 0:  # check for multiline string
+            return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+        return dumper.represent_scalar("tag:yaml.org,2002:str", data)
+
+    yaml.add_representer(str, str_presenter)
+    yaml.representer.SafeRepresenter.add_representer(
+        str, str_presenter
+    )  # to use with safe_dump
+    fn_meta = "meta.yaml"
+    with open(fn_meta, "w", encoding="utf-8") as f:
+        yaml.dump(meta, f, sort_keys=False)
+
+    print(f"Finished processing {meta['name']} dataset!")
+
+
+if __name__ == "__main__":
+    get_and_transform_data()
diff --git a/data/tap/meta.yaml b/data/tap/meta.yaml
@@ -0,0 +1,117 @@
+---
+name: tap
+description: |-
+    Immunogenicity, instability, self-association,
+    high viscosity, polyspecificity, or poor expression can all preclude
+    an antibody from becoming a therapeutic. Early identification of these
+    negative characteristics is essential. Akin to the Lipinski guidelines,
+    which measure druglikeness in small molecules,
+    Therapeutic Antibody Profiler (TAP) highlights antibodies
+    that possess characteristics that are rare/unseen in
+    clinical-stage mAb therapeutics.
+targets:
+    - id: CDR_Length
+      description: complementarity-determining regions (CDR) length
+      units: amino acids
+      type: continuous
+      names:
+          - noun: antibody complementarity-determining regions length
+          - noun: antibody complementarity-determining regions (CDR) length
+          - noun: antibody CDR length
+          - noun: complementarity-determining regions (CDR) length
+          - noun: complementarity-determining regions length
+          - noun: CDR length
+      uris:
+    - id: PSH
+      description: patches of surface hydrophobicity (PSH) score
+      units:
+      type: continuous
+      names:
+          - noun: antibody patches of surface hydrophobicity (PSH) score
+          - noun: antibody patches of surface hydrophobicity score
+          - noun: antibody PSH score
+          - noun: patches of surface hydrophobicity (PSH) score
+          - noun: patches of surface hydrophobicity score
+          - noun: PSH score
+      uris:
+    - id: PPC
+      description: patches of positive charge (PPC) score
+      units:
+      type: continuous
+      names:
+          - noun: antibody patches of positive charge (PPC) score
+          - noun: antibody patches of positive charge score
+          - noun: antibody PPC score
+          - noun: patches of positive charge (PPC) score
+          - noun: patches of positive charge score
+          - noun: PPC score
+      uris:
+    - id: PNC
+      description: patches of negative charge (PNC) score
+      units:
+      type: continuous
+      names:
+          - noun: antibody patches of negative charge (PNC) score
+          - noun: antibody patches of negative charge score
+          - noun: antibody PNC score
+          - noun: patches of negative charge (PNC) score
+          - noun: patches of negative charge score
+          - noun: PNC score
+      uris:
+    - id: SFvCSP
+      description: structural Fv charge symmetry parameter (SFvCSP) score
+      units:
+      type: continuous
+      names:
+          - noun: antibody structural Fv charge symmetry parameter (SFvCSP) score
+          - noun: antibody structural Fv charge symmetry parameter score
+          - noun: antibody SFvCSP score
+          - noun: structural Fv charge symmetry parameter (SFvCSP) score
+          - noun: structural Fv charge symmetry parameter score
+          - noun: SFvCSP score
+      uris:
+identifiers:
+    - id: antibody_name
+      type: Other
+      names:
+          - antibody name
+          - name of the antibody
+          - name of the antibody drug
+      description: antibody name
+    - id: heavy_chain
+      type: Other
+      names:
+          - amino acid sequence
+          - heavy chain amino acid sequence
+          - heavy chain AA sequence
+      description: antibody heavy chain amino acid sequence
+    - id: light_chain
+      type: Other
+      names:
+          - amino acid sequence
+          - light chain amino acid sequence
+          - light chain AA sequence
+      description: antibody light chain amino acid sequence
+license: CC BY 4.0
+links:
+    - url: https://doi.org/10.1073/pnas.1810576116
+      description: corresponding publication
+    - url: https://tdcommons.ai/single_pred_tasks/develop/#tap
+      description: data source
+num_points: 241
+bibtex:
+    - |-
+      @article{Raybould2019,
+      doi = {10.1073/pnas.1810576116},
+      url = {https://doi.org/10.1073/pnas.1810576116},
+      year = {2019},
+      month = feb,
+      publisher = {Proceedings of the National Academy of Sciences},
+      volume = {116},
+      number = {10},
+      pages = {4025--4030},
+      author = {Matthew I. J. Raybould and Claire Marks and Konrad Krawczyk
+      and Bruck Taddese and Jaroslaw Nowak and Alan P. Lewis and Alexander Bujotzek
+      and Jiye Shi and Charlotte M. Deane},
+      title = {Five computational developability guidelines for therapeutic antibody profiling},
+      journal = {Proceedings of the National Academy of Sciences}}