-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhydrophobic_cluster.py
200 lines (185 loc) · 5.81 KB
/
hydrophobic_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import sys
import os
import argparse
import numpy as np
from salt_bridges import data_coord_extraction, dist_calc, c_cluster, join_res_data
np.set_printoptions(threshold=sys.maxsize)
# Side-chain solvent-accessible surface area for the 20 standard amino acids,
# keyed by three-letter residue name. create_output_hy sums these values over
# the residues of a cluster to report a surface area per cluster.
# NOTE(review): units are not stated in this file (presumably square
# angstrom) - confirm against the source of the table.
sasa = {
    "ALA": 75,
    "CYS": 115,
    "ASP": 130,
    "GLU": 161,
    "PHE": 209,
    "GLY": 0,
    "HIS": 180,
    "ILE": 172,
    "LYS": 205,
    "LEU": 172,
    "MET": 184,
    "ASN": 142,
    "PRO": 134,
    "GLN": 173,
    "ARG": 236,
    "SER": 95,
    "THR": 130,
    "VAL": 143,
    "TRP": 254,
    "TYR": 222,
}
def create_output_hy(
    pair_num: list[list[str]],
    pair_num_ori: list[list[str]],
    create_file: str = None,
    silent: bool = False,
) -> None:
    """prints output in the terminal and optionally creates a output file
    :parameter
        - pair_num:
          clustered residues, one list of residue identifiers per cluster
        - pair_num_ori:
          the original pairs of interacting residues (list of lists or
          2D array of residue identifiers)
        - create_file:
          name of the output file without extension ('.csv' is appended);
          no file is written if None
        - silent:
          whether to print output in terminal or not
    :return
        - None
    """
    # np.where and the index lookup below need an array; converting here
    # lets callers pass the plain list of lists the signature advertises
    pairs = np.asarray(pair_num_ori)

    # one (label, contacts, surface area) entry per cluster
    rows = []
    for cluster in pair_num:
        # every original pair in which a member of this cluster takes part
        inter_pairs = []
        for res in cluster:
            inter_pairs += pairs[np.where(pairs == res)[0]].tolist()
        # contacts per cluster: number of distinct pairs
        contacts = np.unique(inter_pairs, axis=0).shape[0]
        # surface area per cluster: every distinct residue is counted once;
        # the residue name is the part of the identifier before the first '-'
        surface_area = sum(
            int(sasa[res_id.split("-")[0]]) for res_id in np.unique(inter_pairs)
        )
        rows.append((" - ".join(cluster), contacts, surface_area))

    if not silent:
        for label, contacts, surface_area in rows:
            print(label, contacts, surface_area)

    if create_file is not None:
        # the context manager closes the file even if a write fails
        with open("{}.csv".format(create_file), "w+") as data_file:
            data_file.write(
                "InteractingResidues,ContactsPerCluster,SurfaceAreaPerCluster\n"
            )
            for label, contacts, surface_area in rows:
                data_file.write(
                    ",".join([label, str(contacts), str(surface_area)]) + "\n"
                )
def hydr_cluster(
    file_path: str,
    sele_chain: str = None,
    create_file: str = None,
    silent: bool = False,
    dist_cutoff: float = 6.56,
) -> np.ndarray:
    """calculates hydrophobic cluster with the option to select a chain
    :parameter
        - file_path:
          path to the pdb file
        - sele_chain:
          to select a specific chain use e.g. 'A' in which the hydrophobic
          clusters should be calculated
        - create_file:
          name of the output csv file, passed on to create_output_hy
        - silent:
          whether to print output in terminal or not
        - dist_cutoff:
          two atoms interact if their distance is smaller than this value
          (same unit as the coordinates returned by data_coord_extraction)
    :return
        - pair_num_ori
          the unclustered pairs of interacting residues as returned by
          join_res_data (presumably a 2D array of residue identifier
          strings - confirm in salt_bridges)
    """
    data, coords = data_coord_extraction(file_path)
    # residues whose side chains are considered
    aa = ["ILE", "LEU", "VAL"]
    # backbone atoms (and their hydrogens) that are ignored
    atom = ["N", "H", "CA", "HA", "C", "O"]

    # column 0 holds the atom name, column 1 the residue name
    atom_names = data[:, 0]
    right_residue = np.isin(data[:, 1], aa)
    not_backbone = ~np.isin(atom_names, atom)
    heavy_atom = np.array(
        [not name.startswith("H") for name in atom_names], dtype=bool
    )
    # which data entry contains the right amino acid and the right atom type
    test_conf = right_residue & not_backbone & heavy_atom

    # atoms (and their coordinates) that are able to form interactions
    sele_data = data[test_conf]
    sele_coords = coords[test_conf]
    if sele_chain is not None:
        # to get data entries from the selected chain only (column 2)
        chain_test = sele_data[:, 2] == sele_chain
        sele_data = sele_data[chain_test]
        sele_coords = sele_coords[chain_test]

    # distance matrix between all the potential atoms
    dists = dist_calc(sele_coords, sele_coords)
    # atoms in hydrophobic interaction distance; the strict upper triangle
    # keeps every atom pair only once and drops an atom paired with itself
    in_reach = np.triu(dists < dist_cutoff, 1)
    pair_ind0, pair_ind1 = np.where(in_reach)

    # Amino Acid, Chain and Number of both partners of each atom pair
    res_ids = sele_data[:, [1, 2, 3]]
    first_res = res_ids[pair_ind0]
    second_res = res_ids[pair_ind1]
    # atoms of the same residue do not count as an interaction
    excl_same = np.any(first_res != second_res, axis=1)
    valid_pairs = np.column_stack((first_res[excl_same], second_res[excl_same]))
    # to only have one entry per residue interaction and not one for each
    # of their atom pairs
    valid_pairs = np.unique(valid_pairs, axis=0)

    # pairs of cluster forming residues joined to identifier strings
    pair_num_ori = join_res_data(valid_pairs[:, 3:], valid_pairs[:, :3])
    pair_num = c_cluster(pair_num_ori)
    create_output_hy(pair_num, pair_num_ori, create_file=create_file, silent=silent)
    return pair_num_ori
def arg_dict() -> dict:
    """reads the command line options for the hydrophobic cluster search
    :parameter
        - None:
    :return
        - dictionary with the keys 'file_path', 'create_file' and
          'sele_chain' that can be passed as keyword arguments to
          hydr_cluster
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # the only mandatory option
    cli.add_argument(
        "-f", "--file_path", type=str, required=True, help="path to pdb file"
    )
    # both remaining options default to None when not given
    cli.add_argument(
        "-c", "--create_file", type=str, required=False, default=None, help="file name"
    )
    chain_help = (
        "ChainID if the hydrophobic cluster should only calculated for one "
        "specific chain"
    )
    cli.add_argument(
        "-s", "--sele_chain", type=str, required=False, default=None, help=chain_help
    )
    parsed = cli.parse_args()
    wanted = ("file_path", "create_file", "sele_chain")
    return {key: getattr(parsed, key) for key in wanted}
# script entry point: run the cluster search with the command line options
if __name__ == "__main__":
    cli_kwargs = arg_dict()
    hydr_cluster(**cli_kwargs)