Remove usage of MMTF where possible

biotite-dev · Feb 16, 2024 · 7ede7b0 · 7ede7b0
1 parent 48f2280
commit 7ede7b0
Show file tree

Hide file tree

Showing 25 changed files with 183 additions and 184 deletions.
diff --git a/src/biotite/structure/bonds.pyx b/src/biotite/structure/bonds.pyx
@@ -1578,10 +1578,8 @@ def connect_via_residue_names(atoms, atom_mask=None, bint inter_residue=True):
 
     Notes
     -----
-    If obtaining the bonds from an *MMTF* file is not possible, this is
-    the recommended way to obtain :class:`BondList` for a structure.
-    However, this method can only find bonds for residues in the RCSB
-    ``components.cif`` dataset.
+    This method can only find bonds for residues in the RCSB
+    *Chemical Component Dictionary*.
     Although this includes most molecules one encounters, this will fail
     for exotic molecules, e.g. specialized inhibitors.
     """

diff --git a/src/biotite/structure/filter.py b/src/biotite/structure/filter.py
@@ -10,9 +10,9 @@
 __name__ = "biotite.structure"
 __author__ = "Patrick Kunzmann, Tom David Müller"
 __all__ = ["filter_solvent", "filter_monoatomic_ions", "filter_nucleotides",
-           "filter_canonical_nucleotides", "filter_amino_acids", 
-           "filter_canonical_amino_acids", "filter_carbohydrates", 
-           "filter_backbone", "filter_intersection", "filter_first_altloc", 
+           "filter_canonical_nucleotides", "filter_amino_acids",
+           "filter_canonical_amino_acids", "filter_carbohydrates",
+           "filter_backbone", "filter_intersection", "filter_first_altloc",
            "filter_highest_occupancy_altloc", "filter_peptide_backbone",
            "filter_phosphate_backbone", "filter_linear_bond_continuity",
            "filter_polymer"]
@@ -118,13 +118,13 @@ def filter_nucleotides(array):
 
     Notes
     -----
-    Nucleotides are identified according to the PDB chemical component 
+    Nucleotides are identified according to the PDB chemical component
     dictionary. A residue is considered a nucleotide if its
     ``_chem_comp.type`` property has one of the following values (case
     insensitive):
 
-    ``DNA LINKING``, ``DNA OH 3 PRIME TERMINUS``, 
-    ``DNA OH 5 PRIME TERMINUS``, ``L-DNA LINKING``, ``L-RNA LINKING``, 
+    ``DNA LINKING``, ``DNA OH 3 PRIME TERMINUS``,
+    ``DNA OH 5 PRIME TERMINUS``, ``L-DNA LINKING``, ``L-RNA LINKING``,
     ``RNA LINKING``, ``RNA OH 3 PRIME TERMINUS``,
     ``RNA OH 5 PRIME TERMINUS``
     """
@@ -133,7 +133,7 @@ def filter_nucleotides(array):
 
 def filter_canonical_amino_acids(array):
     """
-    Filter all atoms of one array that belong to canonical amino acid 
+    Filter all atoms of one array that belong to canonical amino acid
     residues.
 
     Parameters
@@ -164,20 +164,20 @@ def filter_amino_acids(array):
     filter : ndarray, dtype=bool
         This array is `True` for all indices in `array`, where the atom
         belongs to an amino acid residue.
-    
+
     Notes
     -----
-    Amino acids are identified according to the PDB chemical component 
+    Amino acids are identified according to the PDB chemical component
     dictionary. A residue is considered an amino acid if its
     ``_chem_comp.type`` property has one of the following values (case
     insensitive):
 
-    ``D-BETA-PEPTIDE``, ``C-GAMMA LINKING``, ``D-GAMMA-PEPTIDE``, 
-    ``C-DELTA LINKING``, ``D-PEPTIDE LINKING``, 
-    ``D-PEPTIDE NH3 AMINO TERMINUS``, 
-    ``L-BETA-PEPTIDE, C-GAMMA LINKING``, 
-    ``L-GAMMA-PEPTIDE, C-DELTA LINKING``, 
-    ``L-PEPTIDE COOH CARBOXY TERMINUS``, ``L-PEPTIDE LINKING``, 
+    ``D-BETA-PEPTIDE, C-GAMMA LINKING``,
+    ``D-GAMMA-PEPTIDE, C-DELTA LINKING``, ``D-PEPTIDE LINKING``,
+    ``D-PEPTIDE NH3 AMINO TERMINUS``,
+    ``L-BETA-PEPTIDE, C-GAMMA LINKING``,
+    ``L-GAMMA-PEPTIDE, C-DELTA LINKING``,
+    ``L-PEPTIDE COOH CARBOXY TERMINUS``, ``L-PEPTIDE LINKING``,
     ``L-PEPTIDE NH3 AMINO TERMINUS``, ``PEPTIDE LINKING``
     """
     return np.isin(array.res_name, _amino_acid_list)
@@ -197,17 +197,17 @@ def filter_carbohydrates(array):
     filter : ndarray, dtype=bool
         This array is `True` for all indices in `array`, where the atom
         belongs to a carbohydrate.
-    
+
     Notes
     -----
-    Carbohydrates are identified according to the PDB chemical component 
+    Carbohydrates are identified according to the PDB chemical component
     dictionary. A residue is considered a carbohydrate if its
     ``_chem_comp.type`` property has one of the following values (case
     insensitive):
 
-    ``D-SACCHARIDE``, ``D-SACCHARIDE,ALPHA LINKING``, 
-    ``D-SACCHARIDE, BETA LINKING``, ``L-SACCHARIDE``, 
-    ``L-SACCHARIDE, ALPHA LINKING``, ``L-SACCHARIDE, BETA LINKING``, 
+    ``D-SACCHARIDE``, ``D-SACCHARIDE, ALPHA LINKING``,
+    ``D-SACCHARIDE, BETA LINKING``, ``L-SACCHARIDE``,
+    ``L-SACCHARIDE, ALPHA LINKING``, ``L-SACCHARIDE, BETA LINKING``,
     ``SACCHARIDE``
     """
     return np.isin(array.res_name, _carbohydrate_list)
@@ -299,7 +299,7 @@ def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8):
 
     The result will depend on the atoms' order.
     For instance, consider a molecule::
-    
+
            C3
            |
         C1-C2-C4
@@ -323,7 +323,7 @@ def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8):
         This array is `True` for all indices in `array`, where an atom
         has a bond length with the next atom within [`min_len`, `max_len`]
         boundaries.
-        
+
     Notes
     -----
     Note that this function purely uses distances between consecutive atoms.
@@ -438,7 +438,7 @@ def filter_first_altloc(atoms, altloc_ids):
     Filter all atoms that have the first *altloc* ID appearing in a
     residue.
 
-    Structure files (PDB, PDBx, MMTF) allow for duplicate atom records,
+    Structure files (PDB, PDBx) allow for duplicate atom records,
     in case a residue is found in multiple alternate locations
     (*altloc*).
     This function is used to remove such duplicate atoms by choosing a
@@ -507,7 +507,7 @@ def filter_highest_occupancy_altloc(atoms, altloc_ids, occupancies):
     For each residue, filter all atoms that have the *altloc* ID
     with the highest occupancy for this residue.
 
-    Structure files (PDB, PDBx, MMTF) allow for duplicate atom records,
+    Structure files (PDB, PDBx) allow for duplicate atom records,
     in case a residue is found in multiple alternate locations
     (*altloc*).
     This function is used to remove such duplicate atoms by choosing a

diff --git a/src/biotite/structure/io/__init__.py b/src/biotite/structure/io/__init__.py
@@ -15,10 +15,8 @@
 atoms may be lower in the atom array (stack) than in respective
 structure file.
 
-The recommended format for reading structure files is MMTF.
+The recommended format for reading structure files is *BinaryCIF*.
 It has by far the shortest parsing time and the smallest file size.
-Furthermore, chemical bond information can be read from MMTF files
-as :class:`BondList` instances.
 
 Besides the mentioned structure formats, Gromacs trajectory files can be
 loaded, if `mdtraj` is installed.

diff --git a/tests/application/data/2rtg.bcif b/tests/application/data/2rtg.bcif
diff --git a/tests/application/data/2rtg.mmtf b/tests/application/data/2rtg.mmtf
diff --git a/tests/application/test_autodock.py b/tests/application/test_autodock.py
@@ -7,7 +7,7 @@
 import pytest
 import biotite.structure as struc
 import biotite.structure.info as info
-import biotite.structure.io.mmtf as mmtf
+import biotite.structure.io.pdbx as pdbx
 from biotite.application.autodock import VinaApp
 from ..util import data_dir, is_not_installed
 
@@ -24,9 +24,11 @@ def test_docking(flexible):
     PDB structure.
     """
     # A structure of a streptavidin-biotin complex
-    mmtf_file = mmtf.MMTFFile.read(join(data_dir("application"), "2rtg.mmtf"))
-    structure = mmtf.get_structure(
-        mmtf_file, model=1, extra_fields=["charge"], include_bonds=True
+    pdbx_file = pdbx.BinaryCIFFile.read(
+        join(data_dir("application"), "2rtg.bcif")
+    )
+    structure = pdbx.get_structure(
+        pdbx_file, model=1, extra_fields=["charge"], include_bonds=True
     )
     structure = structure[structure.chain_id == "B"]
     receptor = structure[struc.filter_amino_acids(structure)]

diff --git a/tests/application/test_dssp.py b/tests/application/test_dssp.py
@@ -9,14 +9,14 @@
 import pytest
 import biotite.structure as struc
 import biotite.structure.io as strucio
-import biotite.structure.io.mmtf as mmtf
+import biotite.structure.io.pdbx as pdbx
 from biotite.application.dssp import DsspApp
 from ..util import data_dir, is_not_installed
 
 
 @pytest.mark.skipif(is_not_installed("mkdssp"), reason="DSSP is not installed")
 @pytest.mark.parametrize(
-    "path", glob.glob(join(data_dir("structure"), "*.mmtf"))
+    "path", glob.glob(join(data_dir("structure"), "*.bcif"))
 )
 def test_dssp(path):
     sec_struct_codes = {0 : "I",
@@ -28,16 +28,16 @@ def test_dssp(path):
                         6 : "T",
                         7 : "C"}
 
-    mmtf_file = mmtf.MMTFFile.read(path)
-    array = mmtf.get_structure(mmtf_file, model=1)
+    pdbx_file = pdbx.BinaryCIFFile.read(path)
+    array = pdbx.get_structure(pdbx_file, model=1)
     array = array[array.hetero == False]
     first_chain_id = array.chain_id[0]
     chain = array[array.chain_id == first_chain_id]
 
     n_residues = struc.get_residue_count(chain)
     # Secondary structure annotations in the PDB are also based on DSSP
     # -> compare PDB and local DSSP
-    sse = mmtf_file["secStructList"]
+    sse = pdbx_file["secStructList"]
     sse = sse[:n_residues]
     if (sse == -1).any():
         # First chain is not a pure polypeptide chain
@@ -46,7 +46,7 @@ def test_dssp(path):
         return
     sse = np.array([sec_struct_codes[code] for code in sse],
                     dtype="U1")
-    
+
     chain = array[array.chain_id == first_chain_id]
     sse_from_app = DsspApp.annotate_sse(chain)
     # PDB uses different DSSP version -> slight differences possible
@@ -62,7 +62,7 @@ def test_multiple_chains():
     )
     atoms = atoms[struc.filter_canonical_amino_acids(atoms)]
     sse = DsspApp.annotate_sse(atoms)
-    assert np.all(np.isin(sse, ["C", "H", "B", "E", "G", "I", "T", "S"])) 
+    assert np.all(np.isin(sse, ["C", "H", "B", "E", "G", "I", "T", "S"]))
     assert len(sse) == struc.get_residue_count(atoms)
 
 

diff --git a/tests/structure/data/base_pairs/1bna.mmtf b/tests/structure/data/base_pairs/1bna.mmtf
diff --git a/tests/structure/test_basepairs.py b/tests/structure/test_basepairs.py
@@ -137,13 +137,13 @@ def test_base_pairs_incomplete_structure(nuc_sample_array):
     Remove atoms belonging to the pyrimidine / purine ring of each base
     and the ``O2`` atom contained in pyrimidine bases.
 
-    Test that no base pairs are detected as all bases have less than 3 
-    common atoms with their implemented reference base. 
+    Test that no base pairs are detected as all bases have fewer than 3
+    common atoms with their implemented reference base.
     """
-    
+
     nuc_sample_array = nuc_sample_array[
         ~ np.isin(
-            nuc_sample_array.atom_name, 
+            nuc_sample_array.atom_name,
             ['N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N7', 'C8', 'N9', 'O2']
         )
     ]
@@ -324,7 +324,7 @@ def test_base_stacking():
     """
     # Load the test structure (1BNA) - a DNA-double-helix
     helix = strucio.load_structure(
-        join(data_dir("structure"), "base_pairs", "1bna.mmtf")
+        join(data_dir("structure"), "base_pairs", "1bna.cif")
     )
 
     residue_starts = struc.get_residue_starts(helix)