Skip to content

Commit

Permalink
Remove usage of MMTF where possible
Browse files Browse the repository at this point in the history
  • Loading branch information
padix-key committed Feb 16, 2024
1 parent 48f2280 commit 7ede7b0
Show file tree
Hide file tree
Showing 25 changed files with 183 additions and 184 deletions.
6 changes: 2 additions & 4 deletions src/biotite/structure/bonds.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1578,10 +1578,8 @@ def connect_via_residue_names(atoms, atom_mask=None, bint inter_residue=True):
Notes
-----
If obtaining the bonds from an *MMTF* file is not possible, this is
the recommended way to obtain :class:`BondList` for a structure.
However, this method can only find bonds for residues in the RCSB
``components.cif`` dataset.
This method can only find bonds for residues in the RCSB
*Chemical Component Dictionary*.
Although this includes most molecules one encounters, this will fail
for exotic molecules, e.g. specialized inhibitors.
"""
Expand Down
48 changes: 24 additions & 24 deletions src/biotite/structure/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
__name__ = "biotite.structure"
__author__ = "Patrick Kunzmann, Tom David Müller"
__all__ = ["filter_solvent", "filter_monoatomic_ions", "filter_nucleotides",
"filter_canonical_nucleotides", "filter_amino_acids",
"filter_canonical_amino_acids", "filter_carbohydrates",
"filter_backbone", "filter_intersection", "filter_first_altloc",
"filter_canonical_nucleotides", "filter_amino_acids",
"filter_canonical_amino_acids", "filter_carbohydrates",
"filter_backbone", "filter_intersection", "filter_first_altloc",
"filter_highest_occupancy_altloc", "filter_peptide_backbone",
"filter_phosphate_backbone", "filter_linear_bond_continuity",
"filter_polymer"]
Expand Down Expand Up @@ -118,13 +118,13 @@ def filter_nucleotides(array):
Notes
-----
Nucleotides are identified according to the PDB chemical component
Nucleotides are identified according to the PDB chemical component
dictionary. A residue is considered a nucleotide if its
``_chem_comp.type`` property has one of the following values (case
insensitive):
``DNA LINKING``, ``DNA OH 3 PRIME TERMINUS``,
``DNA OH 5 PRIME TERMINUS``, ``L-DNA LINKING``, ``L-RNA LINKING``,
``DNA LINKING``, ``DNA OH 3 PRIME TERMINUS``,
``DNA OH 5 PRIME TERMINUS``, ``L-DNA LINKING``, ``L-RNA LINKING``,
``RNA LINKING``, ``RNA OH 3 PRIME TERMINUS``,
``RNA OH 5 PRIME TERMINUS``
"""
Expand All @@ -133,7 +133,7 @@ def filter_nucleotides(array):

def filter_canonical_amino_acids(array):
"""
Filter all atoms of one array that belong to canonical amino acid
Filter all atoms of one array that belong to canonical amino acid
residues.
Parameters
Expand Down Expand Up @@ -164,20 +164,20 @@ def filter_amino_acids(array):
filter : ndarray, dtype=bool
This array is `True` for all indices in `array`, where the atom
belongs to an amino acid residue.
Notes
-----
Amino acids are identified according to the PDB chemical component
Amino acids are identified according to the PDB chemical component
dictionary. A residue is considered an amino acid if its
``_chem_comp.type`` property has one of the following values (case
insensitive):
``D-BETA-PEPTIDE``, ``C-GAMMA LINKING``, ``D-GAMMA-PEPTIDE``,
``C-DELTA LINKING``, ``D-PEPTIDE LINKING``,
``D-PEPTIDE NH3 AMINO TERMINUS``,
``L-BETA-PEPTIDE, C-GAMMA LINKING``,
``L-GAMMA-PEPTIDE, C-DELTA LINKING``,
``L-PEPTIDE COOH CARBOXY TERMINUS``, ``L-PEPTIDE LINKING``,
``D-BETA-PEPTIDE``, ``C-GAMMA LINKING``, ``D-GAMMA-PEPTIDE``,
``C-DELTA LINKING``, ``D-PEPTIDE LINKING``,
``D-PEPTIDE NH3 AMINO TERMINUS``,
``L-BETA-PEPTIDE, C-GAMMA LINKING``,
``L-GAMMA-PEPTIDE, C-DELTA LINKING``,
``L-PEPTIDE COOH CARBOXY TERMINUS``, ``L-PEPTIDE LINKING``,
``L-PEPTIDE NH3 AMINO TERMINUS``, ``PEPTIDE LINKING``
"""
return np.isin(array.res_name, _amino_acid_list)
Expand All @@ -197,17 +197,17 @@ def filter_carbohydrates(array):
filter : ndarray, dtype=bool
This array is `True` for all indices in `array`, where the atom
belongs to a carbohydrate.
Notes
-----
Carbohydrates are identified according to the PDB chemical component
Carbohydrates are identified according to the PDB chemical component
dictionary. A residue is considered a carbohydrate if its
``_chem_comp.type`` property has one of the following values (case
insensitive):
``D-SACCHARIDE``, ``D-SACCHARIDE,ALPHA LINKING``,
``D-SACCHARIDE, BETA LINKING``, ``L-SACCHARIDE``,
``L-SACCHARIDE, ALPHA LINKING``, ``L-SACCHARIDE, BETA LINKING``,
``D-SACCHARIDE``, ``D-SACCHARIDE,ALPHA LINKING``,
``D-SACCHARIDE, BETA LINKING``, ``L-SACCHARIDE``,
``L-SACCHARIDE, ALPHA LINKING``, ``L-SACCHARIDE, BETA LINKING``,
``SACCHARIDE``
"""
return np.isin(array.res_name, _carbohydrate_list)
Expand Down Expand Up @@ -299,7 +299,7 @@ def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8):
The result will depend on the atoms' order.
For instance, consider a molecule::
C3
|
C1-C2-C4
Expand All @@ -323,7 +323,7 @@ def filter_linear_bond_continuity(array, min_len=1.2, max_len=1.8):
This array is `True` for all indices in `array`, where an atom
has a bond length with the next atom within [`min_len`, `max_len`]
boundaries.
Notes
-----
Note that this function purely uses distances between consecutive atoms.
Expand Down Expand Up @@ -438,7 +438,7 @@ def filter_first_altloc(atoms, altloc_ids):
Filter all atoms, that have the first *altloc* ID appearing in a
residue.
Structure files (PDB, PDBx, MMTF) allow for duplicate atom records,
Structure files (PDB, PDBx) allow for duplicate atom records,
in case a residue is found in multiple alternate locations
(*altloc*).
This function is used to remove such duplicate atoms by choosing a
Expand Down Expand Up @@ -507,7 +507,7 @@ def filter_highest_occupancy_altloc(atoms, altloc_ids, occupancies):
For each residue, filter all atoms, that have the *altloc* ID
with the highest occupancy for this residue.
Structure files (PDB, PDBx, MMTF) allow for duplicate atom records,
Structure files (PDB, PDBx) allow for duplicate atom records,
in case a residue is found in multiple alternate locations
(*altloc*).
This function is used to remove such duplicate atoms by choosing a
Expand Down
4 changes: 1 addition & 3 deletions src/biotite/structure/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
atoms may be lower in the atom array (stack) than in respective
structure file.
The recommended format for reading structure files is MMTF.
The recommended format for reading structure files is *BinaryCIF*.
It has by far the shortest parsing time and file size.
Furthermore, chemical bond information can be read from MMTF files
as :class:`BondList` instances.
Besides the mentioned structure formats, Gromacs trajectory files can be
loaded, if `mdtraj` is installed.
Expand Down
Binary file added tests/application/data/2rtg.bcif
Binary file not shown.
Binary file removed tests/application/data/2rtg.mmtf
Binary file not shown.
10 changes: 6 additions & 4 deletions tests/application/test_autodock.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest
import biotite.structure as struc
import biotite.structure.info as info
import biotite.structure.io.mmtf as mmtf
import biotite.structure.io.pdbx as pdbx
from biotite.application.autodock import VinaApp
from ..util import data_dir, is_not_installed

Expand All @@ -24,9 +24,11 @@ def test_docking(flexible):
PDB structure.
"""
# A structure of a streptavidin-biotin complex
mmtf_file = mmtf.MMTFFile.read(join(data_dir("application"), "2rtg.mmtf"))
structure = mmtf.get_structure(
mmtf_file, model=1, extra_fields=["charge"], include_bonds=True
pdbx_file = pdbx.BinaryCIFFile.read(
join(data_dir("application"), "2rtg.bcif")
)
structure = pdbx.get_structure(
pdbx_file, model=1, extra_fields=["charge"], include_bonds=True
)
structure = structure[structure.chain_id == "B"]
receptor = structure[struc.filter_amino_acids(structure)]
Expand Down
14 changes: 7 additions & 7 deletions tests/application/test_dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
import pytest
import biotite.structure as struc
import biotite.structure.io as strucio
import biotite.structure.io.mmtf as mmtf
import biotite.structure.io.pdbx as pdbx
from biotite.application.dssp import DsspApp
from ..util import data_dir, is_not_installed


@pytest.mark.skipif(is_not_installed("mkdssp"), reason="DSSP is not installed")
@pytest.mark.parametrize(
"path", glob.glob(join(data_dir("structure"), "*.mmtf"))
"path", glob.glob(join(data_dir("structure"), "*.bcif"))
)
def test_dssp(path):
sec_struct_codes = {0 : "I",
Expand All @@ -28,16 +28,16 @@ def test_dssp(path):
6 : "T",
7 : "C"}

mmtf_file = mmtf.MMTFFile.read(path)
array = mmtf.get_structure(mmtf_file, model=1)
pdbx_file = pdbx.BinaryCIFFile.read(path)
array = pdbx.get_structure(pdbx_file, model=1)
array = array[array.hetero == False]
first_chain_id = array.chain_id[0]
chain = array[array.chain_id == first_chain_id]

n_residues = struc.get_residue_count(chain)
# Secondary structure annotations in the PDB are also computed with DSSP
# -> compare PDB and local DSSP
sse = mmtf_file["secStructList"]
sse = pdbx_file["secStructList"]
sse = sse[:n_residues]
if (sse == -1).any():
# First chain is not a pure polypeptide chain
Expand All @@ -46,7 +46,7 @@ def test_dssp(path):
return
sse = np.array([sec_struct_codes[code] for code in sse],
dtype="U1")

chain = array[array.chain_id == first_chain_id]
sse_from_app = DsspApp.annotate_sse(chain)
# PDB uses different DSSP version -> slight differences possible
Expand All @@ -62,7 +62,7 @@ def test_multiple_chains():
)
atoms = atoms[struc.filter_canonical_amino_acids(atoms)]
sse = DsspApp.annotate_sse(atoms)
assert np.all(np.isin(sse, ["C", "H", "B", "E", "G", "I", "T", "S"]))
assert np.all(np.isin(sse, ["C", "H", "B", "E", "G", "I", "T", "S"]))
assert len(sse) == struc.get_residue_count(atoms)


Expand Down
Binary file removed tests/structure/data/base_pairs/1bna.mmtf
Binary file not shown.
10 changes: 5 additions & 5 deletions tests/structure/test_basepairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,13 +137,13 @@ def test_base_pairs_incomplete_structure(nuc_sample_array):
Remove atoms belonging to the pyrimidine / purine ring of each base
and the ``O2`` atom contained in pyrimidine bases.
Test that no base pairs are detected as all bases have less than 3
common atoms with their implemented reference base.
Test that no base pairs are detected as all bases have less than 3
common atoms with their implemented reference base.
"""

nuc_sample_array = nuc_sample_array[
~ np.isin(
nuc_sample_array.atom_name,
nuc_sample_array.atom_name,
['N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N7', 'C8', 'N9', 'O2']
)
]
Expand Down Expand Up @@ -324,7 +324,7 @@ def test_base_stacking():
"""
# Load the test structure (1BNA) - a DNA-double-helix
helix = strucio.load_structure(
join(data_dir("structure"), "base_pairs", "1bna.mmtf")
join(data_dir("structure"), "base_pairs", "1bna.cif")
)

residue_starts = struc.get_residue_starts(helix)
Expand Down
Loading

0 comments on commit 7ede7b0

Please sign in to comment.