-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_conformers.py
304 lines (246 loc) · 8.25 KB
/
find_conformers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#!/usr/bin/python3
"""
This script is a wrapper to run UPGMA agglomerative clustering on a set of monomeric
chains, passed in by the user. These chains should all have the same UniProt accession
and have at least partial sequence overlap. Although the latter condition is not
essential, insufficient overlap will not give informative clustering results.
Functions defined here handle argument parsing, such as setting up the input to the
cluster_monomers.ClusterConformations() class instance, and decide which of its
methods to execute.
"""
# Standard library imports
import argparse
from pathlib import Path, PosixPath
import sys
# Custom imports
import cluster_conformers.cluster_monomers as cluster_monomers
from cluster_conformers.utils import logging_utils
from cluster_conformers.distance_differences import make_dd_maps
def extract_image_format(image_args: list) -> tuple:
    """
    Split an image-saving command-line argument into its save path and the requested
    image formats.

    The argument is the list collected by an argparse option declared with
    nargs="+", e.g. ["/path/to/save", "png", "svg"]. The first element is always the
    save location; only the elements after it are interpreted as image formats, so a
    save path that happens to be named "png" or "svg" is not mistaken for a format
    request.

    Arguments:
        image_args: Save path followed by zero or more of "png" and "svg". Any other
            trailing strings are ignored.

    Returns:
        Tuple of (save path as a pathlib.Path, PNG requested, SVG requested).

    Raises:
        IndexError: If image_args is empty, i.e. no save path was given.
    """
    # First element is the save location; everything after it is a format request
    path_image = Path(image_args[0])
    requested_formats = image_args[1:]

    png_bool = "png" in requested_formats
    svg_bool = "svg" in requested_formats

    return path_image, png_bool, svg_bool
def extract_structure_format(args_mmcif):
    """
    Convert the nested list collected from the -m/--mmcif option into the dictionary
    passed to cluster_monomers.ClusterConformations() as 'mmcifs_and_chains'.

    Each inner list holds a path to an mmCIF file followed by one or more chain IDs:
        [["/path/1atp_updated.cif", "A", "B"], ["/path/2adp_updated.cif", "C"]]

    Output example for 'structures' object:
        {
            "/path/to/updated/mmcif/1atp_updated.cif" : ['A', 'B'],
            "/path/to/updated/mmcif/2adp_updated.cif" : ['C', 'D', 'E'],
            ...
            "/path/to/updated/mmcif/9amp_updated.cif" : ['A', 'B', ... 'Z']
        }

    If the same path appears more than once, the chains of the last occurrence win.

    Raises:
        NameError: If no mmCIF entries were given at all.
        IndexError: If an entry has no chain ID after its path (or cannot be
            indexed).
    """
    if not args_mmcif:
        raise NameError("Must parse in path to one or more mmCIF file(s)")

    structures = {}  # str : list
    for entry in args_mmcif:
        try:
            path_mmcif = entry[0]
            chain_ids = entry[1:]
        except (IndexError, KeyError, TypeError) as err:
            raise IndexError("Must parse in chain ID(s)") from err

        # Slicing past the end of a list never raises, so a path given without any
        # chain IDs has to be rejected explicitly
        if not chain_ids:
            raise IndexError("Must parse in chain ID(s)")

        structures[path_mmcif] = chain_ids

    return structures
def create_parser(input_args=None):
    """
    Collect command-line arguments from the user and return them in two forms: the
    argparse namespace, which main() uses to decide which steps to run, and a
    dictionary of mmCIF paths and chain IDs, ready for feeding into
    cluster_monomers.ClusterConformations().

    Arguments:
        input_args: List of argument strings to parse. When None, argparse falls
            back to reading sys.argv[1:].

    Returns:
        Tuple of (args, structures), where 'args' is the argparse.Namespace and
        'structures' is the dictionary built by extract_structure_format() from the
        -m/--mmcif option.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or an
            argument cannot be converted.
        IndexError, NameError: Propagated from extract_structure_format() for
            malformed -m/--mmcif input.
    """
    parser = argparse.ArgumentParser()

    # General options
    parser.add_argument(
        "-v", "--verbose", help="Increase verbosity", default=False, action="store_true"
    )
    parser.add_argument(
        "-u", "--uniprot", help="UniProt accession", type=str, required=True
    )

    # Each -m occurrence contributes one list: [path, chain ID, chain ID, ...]
    parser.add_argument(
        "-m",
        "--mmcif",
        nargs="+",
        action="append",
        help="Enter list of paths to mmCIFs that overlap a given UniProt segment",
        required=True,
    )

    # Paths use Path rather than PosixPath: on POSIX systems Path() still returns a
    # PosixPath, but PosixPath itself cannot be instantiated on other platforms
    parser.add_argument(
        "-c",
        "--path_ca",
        help="Path to save CA distance matrices",
        type=Path,
        required=True,
    )
    parser.add_argument(
        "-s",
        "--path_clusters",
        help="Path to save clustering results",
        type=Path,
    )
    parser.add_argument(
        "-d",
        "--path_dd",
        help="Path to save distance difference matrices",
        type=Path,
        default=None,
    )

    # Plot options take a save path followed by optional image formats, which are
    # separated later by extract_image_format()
    parser.add_argument(
        "-g",
        "--path_dendrogram",
        nargs="+",
        help="Path to save dendrogram of clustering results",
        type=str,
    )
    parser.add_argument(
        "-w",
        "--path_swarm",
        help="Path to save swarm plot of scores",
        nargs="+",
        type=str,
        default=None,
    )
    parser.add_argument(
        "-o",
        "--path_histogram",
        help="Path to save histograms of distance difference maps",
        type=Path,
    )
    parser.add_argument(
        "-a",
        "--path_alpha_fold",
        help="Path to save AlphaFold Database structure",
        type=Path,
        default=None,
    )

    # Run-time behaviour
    parser.add_argument(
        "-n", "--nproc", help="Max number of threads to utilise", type=int, default=1
    )
    parser.add_argument(
        "-f",
        "--force",
        help="Force overwrite of existing matrix files",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "-i",
        "--updated_entries",
        help="List of updated entries, matrices will be regenerated",
        nargs="+",
        type=str,
        default=None,
    )

    # Optional residue range, used when rendering plots
    parser.add_argument(
        "-0",
        "--first_residue_position",
        help="First residue position in (UniProt) sequence",
        type=int,
        default=None,
    )
    parser.add_argument(
        "-1",
        "--last_residue_position",
        help="Last residue position in (UniProt) sequence",
        type=int,
        default=None,
    )

    args = parser.parse_args(input_args)

    # Add parsed list of mmCIFs to dictionary
    structures = extract_structure_format(args.mmcif)

    return args, structures
def main():
    """
    Wrapper to run cluster_monomers.ClusterConformations() on a set of parsed mmCIFs.

    Steps, in order:
        1. Parse the command line and initialise logging.
        2. Check that the residue range (-0/-1) is either fully given or absent.
        3. Build the ClusterConformations object and, if -i was given, remove the
           existing matrices for those entries.
        4. Generate CA distance matrices (always).
        5. Cluster (only with -s), render distance difference maps (only with -o),
           a dendrogram (only with -g) and a swarm plot (only with -w).

    Raises:
        NameError: If only one of the first/last residue positions was given.
    """
    args, structures = create_parser(sys.argv[1:])

    # Initialise logger
    logging_utils.init_logger(verbose=args.verbose)

    # End early if only one end of the residue range was given. Compare against
    # None rather than truthiness so that a position of 0 counts as "given"
    first_given = args.first_residue_position is not None
    last_given = args.last_residue_position is not None
    if first_given != last_given:
        raise NameError(
            "Must parse in BOTH first (-0) and last (-1) residue positions in sequence"
        )

    # Create object for clustering
    unp_cluster = cluster_monomers.ClusterConformations(
        unp=args.uniprot,
        mmcifs_and_chains=structures,
        path_save_alphafold=args.path_alpha_fold,
        nproc=args.nproc,
        force=args.force,
    )

    # Remove any existing matrices for updated entries
    if args.updated_entries:
        unp_cluster.remove_entry_matxs(
            pdb_ids=args.updated_entries,
            path_ca=args.path_ca,
            path_dd=args.path_dd,
        )

    # Generate CA distance matrices and save
    unp_cluster.ca_distance(args.path_ca)

    # Perform agglomerative clustering and save results. Clustering is skipped
    # entirely without -s; -d stays optional and is forwarded as None when omitted
    if args.path_clusters:
        unp_cluster.cluster(
            path_save_dd_matx=args.path_dd,
            path_save_cluster_results=args.path_clusters,
        )

    # Render and save distance difference maps
    # NOTE(review): args.path_dd is None when -d is omitted -- confirm that
    # make_dd_maps() copes with that, or require -d together with -o
    if args.path_histogram:
        make_dd_maps(
            path_matxs=args.path_dd,
            path_save_maps=args.path_histogram,
            force=True,
        )

    # Define residue range if parsed in
    if first_given and last_given:
        this_unp_range = (args.first_residue_position, args.last_residue_position)
    else:
        this_unp_range = None

    # Parsing in options for saving dendrogram
    if args.path_dendrogram:
        path_save, png_bool, svg_bool = extract_image_format(args.path_dendrogram)

        # NOTE(review): the results are read from the dendrogram save path here,
        # whereas the swarm plot below reads them from args.path_clusters --
        # confirm which location is intended
        cluster_monomers.render_dendrogram(
            unp=args.uniprot,
            path_results=path_save,
            path_save=path_save,
            png=png_bool,
            svg=svg_bool,
            unp_range=this_unp_range,
        )

    if args.path_swarm:
        path_save, png_bool, svg_bool = extract_image_format(args.path_swarm)

        # Render and save swarm plot of scores
        cluster_monomers.render_swarmplot(
            unp=args.uniprot,
            path_save=path_save,
            path_results=args.path_clusters,
            png=png_bool,
            svg=svg_bool,
            unp_range=this_unp_range,
        )
# Run the command-line workflow only when this file is executed as a script
if __name__ == "__main__":
    main()