Commit: add data utils

hypnopump committed May 17, 2021
1 parent 85ad19e commit c7a17fa
Showing 5 changed files with 55 additions and 204 deletions.
46 changes: 46 additions & 0 deletions mp_nerf/utils.py
@@ -4,6 +4,52 @@
import numpy as np
from einops import repeat, rearrange

# data utils
def get_prot(dataloader_=None, vocab_=None, min_len=80, max_len=150, verbose=True):
    """ Gets a protein from sidechainnet and returns
        the right attrs for training.
        Inputs:
        * dataloader_: sidechainnet iterator over dataset
        * vocab_: sidechainnet VOCAB class
        * min_len: int. minimum sequence length
        * max_len: int. maximum sequence length
        * verbose: bool. verbosity level
        Outputs: (cleaned, without padding)
        (seq_str, int_seq, coords, angles, padding_seq, mask, pid)
    """
    for b, batch in enumerate(dataloader_['train']):
        # try/except lets us break out of both loops at once
        try:
            for i in range(batch.int_seqs.shape[0]):
                # count padding tokens in sequence and angles
                padding_seq = (batch.int_seqs[i] == 20).sum().item()
                padding_angles = (torch.abs(batch.angs[i]).sum(dim=-1) == 0).long().sum().item()

                if padding_seq == padding_angles:
                    # check for appropriate length
                    real_len = batch.int_seqs[i].shape[0] - padding_seq
                    if max_len >= real_len >= min_len:
                        # strip padding tokens
                        seq = ''.join([vocab_.int2char(aa) for aa in batch.int_seqs[i].numpy()])
                        seq = seq[:-padding_seq or None]
                        int_seq = batch.int_seqs[i][:-padding_seq or None]
                        angles = batch.angs[i][:-padding_seq or None]
                        mask = batch.msks[i][:-padding_seq or None]
                        coords = batch.crds[i][:-padding_seq*14 or None]

                        if verbose:
                            print("stopping at sequence of length", real_len)
                        raise StopIteration
                    else:
                        # print("found a seq of length:", len(seq),
                        #       "but outside the threshold:", min_len, max_len)
                        pass

        except StopIteration:
            break

    return seq, int_seq, coords, angles, padding_seq, mask, batch.pids[i]


######################
## structural utils ##
######################
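For reference, here is a minimal usage sketch of the new helper, mirroring how the notebooks and notebooks/integrated_test.py below call it after this commit. The casp_version and batch_size values are taken from that script; the import path for VOCAB is an assumption and may differ between sidechainnet versions.

import sidechainnet
from mp_nerf.utils import get_prot

# build sidechainnet dataloaders as done in notebooks/integrated_test.py
dataloaders = sidechainnet.load(casp_version=7, with_pytorch="dataloaders", batch_size=2)

# VOCAB is sidechainnet's residue vocabulary; this import path is an assumption
# and may differ between sidechainnet versions
from sidechainnet.utils.sequence import VOCAB

# fetch one cleaned (unpadded) protein whose length falls within [min_len, max_len]
seq, int_seq, coords, angles, padding_seq, mask, pid = get_prot(
    dataloader_=dataloaders, vocab_=VOCAB, min_len=200, max_len=1000)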
58 changes: 3 additions & 55 deletions notebooks/extend_measures.ipynb
@@ -87,47 +87,6 @@
"# created='Sep 20, 2020')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "finnish-insulin",
"metadata": {},
"outputs": [],
"source": [
"def get_prot(min_len=80, max_len=150, verbose=0):\n",
" \"\"\" Gets a protein from sidechainnet and returns the right attrs for training. \"\"\"\n",
" for batch in dataloaders['train']:\n",
" real_seqs = [''.join([VOCAB.int2char(aa) for aa in seq]) for seq in batch.int_seqs.numpy()]\n",
" # print(len(real_seqs[0]))\n",
" try:\n",
" for i in range(len(batch.int_seqs.numpy())):\n",
" # get variables\n",
" seq = real_seqs[i]\n",
" int_seq = batch.int_seqs[i]\n",
" angles = batch.angs[i]\n",
" # get padding\n",
" padding_angles = (torch.abs(angles).sum(dim=-1) == 0).long().sum()\n",
" padding_seq = (np.array([x for x in seq]) == \"_\").sum()\n",
" # only accept sequences with right dimensions and no missing coords\n",
" # if padding_seq == padding_angles:\n",
" # print(\"paddings_match\")\n",
" # print(\"len coords\", list(batch.crds[i].shape)[0]//3, \"vs int_seq\", len(int_seq))\n",
" if list(batch.crds[i].shape)[0]//14 == len(int_seq):\n",
" if max_len > len(seq) and len(seq) > min_len and padding_seq == padding_angles:\n",
" if verbose:\n",
" print(\"stopping at sequence of length\", len(seq))\n",
" # print(len(seq), angles.shape, padding_seq == padding_angles == list(batch.crds[i].shape)[0]//3)\n",
" # print(\"paddings: \", padding_seq, padding_angles)\n",
" raise StopIteration\n",
" else:\n",
" # print(\"found a seq of length:\", len(seq), \"but below the threshold:\", min_len)\n",
" pass\n",
" except StopIteration:\n",
" break\n",
" \n",
" return seq, batch.crds[i], angles, padding_seq"
]
},
{
"cell_type": "code",
"execution_count": 18,
@@ -298,13 +257,7 @@
" 29%|██▉ | 289/1000 [00:26<01:01, 11.55it/s]\u001b[A\n",
" 29%|██▉ | 291/1000 [00:26<01:02, 11.41it/s]\u001b[A\n",
" 29%|██▉ | 293/1000 [00:26<00:59, 11.78it/s]\u001b[A\n",
" 30%|██▉ | 295/1000 [00:26<00:58, 12.10it/s]\u001b[A\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 30%|██▉ | 295/1000 [00:26<00:58, 12.10it/s]\u001b[A\n",
" 30%|██▉ | 297/1000 [00:26<00:59, 11.85it/s]\u001b[A\n",
" 30%|██▉ | 299/1000 [00:27<00:59, 11.81it/s]\u001b[A\n",
" 30%|███ | 301/1000 [00:27<00:59, 11.84it/s]\u001b[A\n",
@@ -612,13 +565,7 @@
" 89%|████████▉ | 893/1000 [01:19<00:09, 11.32it/s]\u001b[A\n",
" 90%|████████▉ | 895/1000 [01:19<00:08, 11.76it/s]\u001b[A\n",
" 90%|████████▉ | 897/1000 [01:19<00:08, 11.84it/s]\u001b[A\n",
" 90%|████████▉ | 899/1000 [01:19<00:08, 11.95it/s]\u001b[A\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 90%|████████▉ | 899/1000 [01:19<00:08, 11.95it/s]\u001b[A\n",
" 90%|█████████ | 901/1000 [01:19<00:08, 11.96it/s]\u001b[A\n",
" 90%|█████████ | 903/1000 [01:19<00:08, 11.58it/s]\u001b[A\n",
" 90%|█████████ | 905/1000 [01:20<00:08, 11.54it/s]\u001b[A\n",
@@ -673,6 +620,7 @@
}
],
"source": [
"get_prot = mp_nerf.utils.get_prot\n",
"prots_list = [get_prot(min_len=100, max_len=950) for i in tqdm(range(1000))]"
]
},
52 changes: 4 additions & 48 deletions notebooks/integrated_test.py
@@ -34,50 +34,6 @@
sep = "\n\n=======\n\n"


def get_prot(dataloader_=None, vocab_=None, min_len=80, max_len=150, verbose=True):
    """ Gets a protein from sidechainnet and returns
        the right attrs for training.
        Inputs:
        * dataloader_: sidechainnet iterator over dataset
        * vocab_: sidechainnet VOCAB class
        * min_len: int. minimum sequence length
        * max_len: int. maximum sequence length
        * verbose: bool. verbosity level
        Outputs: (cleaned, without padding)
        (seq_str, int_seq, coords, angles, padding_seq, mask, pid)
    """
    for b,batch in enumerate(dataloader_['train']):
        # try for breaking from 2 loops at once
        try:
            for i in range(batch.int_seqs.shape[0]):
                # strip padding padding
                padding_seq = (batch.int_seqs[i] == 20).sum().item()
                padding_angles = (torch.abs(batch.angs[i]).sum(dim=-1) == 0).long().sum().item()

                if padding_seq == padding_angles:
                    # check for appropiate length
                    real_len = batch.int_seqs[i].shape[0] - padding_seq
                    if max_len >= real_len >= min_len:
                        # strip padding tokens
                        seq = ''.join([vocab_.int2char(aa) for aa in batch.int_seqs[i].numpy()])
                        seq = seq[:-padding_seq or None]
                        int_seq = batch.int_seqs[i][:-padding_seq or None]
                        angles = batch.angs[i][:-padding_seq or None]
                        mask = batch.msks[i][:-padding_seq or None]
                        coords = batch.crds[i][:-padding_seq*14 or None]

                        print("stopping at sequence of length", real_len)
                        raise StopIteration
                    else:
                        # print("found a seq of length:", len(seq),
                        #       "but oustide the threshold:", min_len, max_len)
                        pass

        except StopIteration:
            break

    return seq, int_seq, coords, angles, padding_seq, mask, batch.pids[i]

# begin tests
if __name__ == "__main__":

@@ -88,10 +44,10 @@ def get_prot(dataloader_=None, vocab_=None, min_len=80, max_len=150, verbose=True):
# skip
dataloaders_ = sidechainnet.load(casp_version=7, with_pytorch="dataloaders", batch_size=2)
logger.info("Data has been loaded"+"\n"+sep)
stored = [ get_prot(dataloader_=dataloaders_,
vocab_=VOCAB,
min_len=desired_len+5,
max_len=desired_len+60) for desired_len in lengths ]
stored = [ mp_nerf.utils.get_prot(dataloader_=dataloaders_,
vocab_=VOCAB,
min_len=desired_len+5,
max_len=desired_len+60) for desired_len in lengths ]
joblib.dump(stored, BASE_FOLDER[:-1]+"_manual/analyzed_prots.joblib")
except:
stored = joblib.load(BASE_FOLDER[:-1]+"_manual/analyzed_prots.joblib")
52 changes: 1 addition & 51 deletions notebooks/test_implementation_loop.ipynb
@@ -76,57 +76,6 @@
"# created='Sep 20, 2020')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def get_prot(dataloader_=None, vocab_=None, min_len=80, max_len=150, verbose=True):\n",
" \"\"\" Gets a protein from sidechainnet and returns\n",
" the right attrs for training. \n",
" Inputs: \n",
" * dataloader_: sidechainnet iterator over dataset\n",
" * vocab_: sidechainnet VOCAB class\n",
" * min_len: int. minimum sequence length\n",
" * max_len: int. maximum sequence length\n",
" * verbose: bool. verbosity level\n",
" Outputs: (cleaned, without padding)\n",
" (seq_str, int_seq, coords, angles, padding_seq, mask, pid)\n",
" \"\"\"\n",
" for batch in dataloader_['train']:\n",
" # try for breaking from 2 loops at once\n",
" try:\n",
" for i in range(batch.int_seqs.shape[0]):\n",
" # strip padding padding\n",
" padding_seq = (batch.int_seqs[i] == 20).sum().item()\n",
" padding_angles = (batch.angs[i].sum(dim=-1) == 0).sum().item()\n",
" if padding_seq == padding_angles:\n",
" # check for appropiate length\n",
" real_len = batch.int_seqs[i].shape[0] - padding_seq\n",
" if max_len >= real_len >= min_len:\n",
" # strip padding tokens\n",
" seq = ''.join([vocab_.int2char(aa) for aa in batch.int_seqs[i].numpy()])\n",
" seq = seq[:-padding_seq or None]\n",
" int_seq = batch.int_seqs[i][:-padding_seq or None]\n",
" angles = batch.angs[i][:-padding_seq or None]\n",
" mask = batch.msks[i][:-padding_seq or None]\n",
" coords = batch.crds[i][:-padding_seq*14 or None]\n",
"\n",
" print(\"stopping at sequence of length\", real_len)\n",
" raise StopIteration\n",
" else:\n",
" # print(\"found a seq of length:\", len(seq),\n",
" # \"but oustide the threshold:\", min_len, max_len)\n",
" pass\n",
" except StopIteration:\n",
" break\n",
" \n",
" return seq, int_seq, coords, angles, padding_seq, mask, batch.pids[i]"
]
},
{
"cell_type": "code",
"execution_count": 6,
@@ -141,6 +90,7 @@
}
],
"source": [
"get_prot = mp_nerf.utils.get_prot\n",
"seq, int_seq, true_coords, angles, padding_seq, mask, pid = get_prot(dataloader_=dataloaders, vocab_=VOCAB, \n",
" min_len=200, max_len=1000)"
]
51 changes: 1 addition & 50 deletions notebooks/test_implementation_speed.ipynb
@@ -74,56 +74,6 @@
"# created='Sep 20, 2020')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def get_prot(dataloader_=None, vocab_=None, min_len=80, max_len=150, verbose=True):\n",
" \"\"\" Gets a protein from sidechainnet and returns\n",
" the right attrs for training. \n",
" Inputs: \n",
" * dataloader_: sidechainnet iterator over dataset\n",
" * vocab_: sidechainnet VOCAB class\n",
" * min_len: int. minimum sequence length\n",
" * max_len: int. maximum sequence length\n",
" * verbose: bool. verbosity level\n",
" Outputs: (cleaned, without padding)\n",
" (seq_str, int_seq, coords, angles, padding_seq, mask, pid)\n",
" \"\"\"\n",
" for batch in dataloader_['train']:\n",
" # try for breaking from 2 loops at once\n",
" try:\n",
" for i in range(batch.int_seqs.shape[0]):\n",
" # strip padding padding\n",
" padding_seq = (batch.int_seqs[i] == 20).sum().item()\n",
" padding_angles = (torch.abs(batch.angs[i]).sum(dim=-1) == 0).long().sum().item()\n",
"\n",
" if padding_seq == padding_angles:\n",
" # check for appropiate length\n",
" real_len = batch.int_seqs[i].shape[0] - padding_seq\n",
" if max_len >= real_len >= min_len:\n",
" # strip padding tokens\n",
" seq = ''.join([vocab_.int2char(aa) for aa in batch.int_seqs[i].numpy()])\n",
" seq = seq[:-padding_seq or None]\n",
" int_seq = batch.int_seqs[i][:-padding_seq or None]\n",
" angles = batch.angs[i][:-padding_seq or None]\n",
" mask = batch.msks[i][:-padding_seq or None]\n",
" coords = batch.crds[i][:-padding_seq*14 or None]\n",
"\n",
" print(\"stopping at sequence of length\", real_len)\n",
" raise StopIteration\n",
" else:\n",
" # print(\"found a seq of length:\", len(seq),\n",
" # \"but oustide the threshold:\", min_len, max_len)\n",
" pass\n",
" except StopIteration:\n",
" break\n",
" \n",
" return seq, int_seq, coords, angles, padding_seq, mask, batch.pids[i]"
]
},
{
"cell_type": "code",
"execution_count": 6,
@@ -138,6 +88,7 @@
}
],
"source": [
"get_prot = mp_nerf.utils.get_prot\n",
"seq, int_seq, true_coords, angles, padding_seq, mask, pid = get_prot(dataloader_=dataloaders, vocab_=VOCAB, \n",
" min_len=700, max_len=1000)"
]
