Skip to content

Commit 6dfbee8

Browse files
authored
Add files via upload
1 parent b6e6631 commit 6dfbee8

File tree

100 files changed

+1042
-2
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

100 files changed

+1042
-2
lines changed

README.md

+31-2
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,31 @@
1-
# Toxidrome
2-
Toxidrome models and data used to create them
1+
# Toxidrome - Graph Convolutional Neural Network Models
2+
3+
##### Supporting information for paper:
4+
Title: "Rapid Screening of Chemicals for their Potential to Cause Specific Toxidromes"
5+
Authors: Ruifeng Liu, Mohamed Diwan M AbdulHameed, Zhen Xu, Benjamin Clancy, Valmik Desai, Anders Wallqvist
6+
Journal: Frontiers in Drug Discovery, section In silico Methods and Artificial Intelligence for Drug Discovery
7+
### Intro
8+
This repository contains the data and models used to make Toxidrome's predictions and has sorted this data into 4 sections:
9+
- An excel workbook containing the compounds used in training our graph convolutional neural network (GCNN) models as well as running our similarity ensemble approach (SEA)
10+
- A folder containing the CMPNN models and script
11+
- A folder containing the DMPNN models and script
12+
- A folder containing the SEA script
13+
14+
##### Compound Data Excel Workbook
15+
This workbook contains 8 pages, one for each toxidrome we make predictions for (with one exception: cholinergic and anticholinergic predictions are made using the same compounds).
16+
The pages used in training our GCNN models (cholinergic, convulsant, opioid, and sympathomimetic) contain each compound's SMILES, model, value, and unit, while the pages used in running the SEA (anticoagulant, irritant-corrosive, knockdown, and solvents-anesthetics-sedatives) contain only the SMILES for each compound.
17+
18+
##### CMPNN Models and Script
19+
[what is different about the CMPNN]
20+
21+
This folder contains 2 main parts: a folder of models called "toxidromes" and a script called run_cmpnn.py.
22+
23+
This script runs the CMPNN using these models.
24+
##### DMPNN Models and Script
25+
[what is different about the DMPNN]
26+
27+
This folder contains 2 main parts: a folder of models called "toxidromes" and a script called run_dmpnn.py.
28+
29+
This script runs the DMPNN using these models.
30+
##### SEA Script
31+
This folder contains a script to run the similarity ensemble approach and an Excel sheet listing the compounds used in running the SEA script (these are the same compounds listed in the SEA pages of the workbook).

Toxidrome Data_11-20-23.xlsx

1.84 MB
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .data import MoleculeDatapoint, MoleculeDataset
2+
from .scaffold import scaffold_to_smiles
3+
from .scaler import StandardScaler
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
+241
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
from argparse import Namespace
2+
import random
3+
from typing import Callable, List, Union
4+
5+
import numpy as np
6+
from torch.utils.data.dataset import Dataset
7+
from rdkit import Chem
8+
9+
from .scaler import StandardScaler
10+
from cmpnn_toxidrome.chemprop_cmpnn.features import get_features_generator
11+
12+
13+
class MoleculeDatapoint:
    """One molecule together with its optional feature vector and its target values."""

    def __init__(self,
                 line: List[str],
                 args: Namespace = None,
                 features: np.ndarray = None,
                 use_compound_names: bool = False):
        """
        Builds a datapoint from one already-split row of a data CSV file.

        The row is read as ``[compound_name,] smiles, target, target, ...`` where the
        leading compound name is only present when ``use_compound_names`` is set.

        :param line: The row of the data CSV file, already split on commas.
        :param args: Arguments; its ``features_generator`` attribute lists the generators to run.
        :param features: Pre-computed additional features (ex. Morgan fingerprint).
        :param use_compound_names: Whether the first entry of ``line`` is the compound name.
        :raises ValueError: If ``features`` is supplied while ``args`` also names features generators.
        """
        self.features_generator = None if args is None else args.features_generator
        self.args = args

        if self.features_generator is not None and features is not None:
            raise ValueError('Currently cannot provide both loaded features and a features generator.')

        self.features = features

        # Peel off the optional name column so the SMILES is always the first entry.
        self.compound_name = None
        if use_compound_names:
            self.compound_name = line[0]  # str
            line = line[1:]

        self.smiles = line[0]  # str
        # The result is checked against None below before it is handed to a generator.
        self.mol = Chem.MolFromSmiles(self.smiles)

        # Run every requested generator and concatenate their outputs
        if self.features_generator is not None:
            generated = []

            for generator_name in self.features_generator:
                # Each generator is looked up even if the molecule turns out to be unusable.
                generate = get_features_generator(generator_name)
                if self.mol is None or self.mol.GetNumHeavyAtoms() <= 0:
                    continue
                generated.extend(generate(self.mol))

            self.features = np.array(generated)

        # Replace nans in the features (loaded or generated) with 0
        if self.features is not None:
            nan_mask = np.isnan(self.features)
            self.features = np.where(nan_mask, 0, self.features)

        # Everything after the SMILES is a target; an empty cell means "no value"
        self.targets = [None if cell == '' else float(cell) for cell in line[1:]]

    def set_features(self, features: np.ndarray):
        """
        Replaces the stored features of the molecule.

        :param features: A 1-D numpy array of features for the molecule.
        """
        self.features = features

    def num_tasks(self) -> int:
        """
        Counts the prediction tasks, i.e. the entries of the target list.

        :return: The number of tasks.
        """
        return len(self.targets)

    def set_targets(self, targets: List[float]):
        """
        Replaces the stored targets of the molecule.

        :param targets: A list of floats containing the targets.
        """
        self.targets = targets
91+
92+
93+
class MoleculeDataset(Dataset):
    """A list of MoleculeDatapoints with per-column accessors and feature normalization."""

    def __init__(self, data: List[MoleculeDatapoint]):
        """
        Wraps a list of MoleculeDatapoints (i.e. a list of molecules).

        The ``args`` of the first datapoint are taken as the ``args`` of the whole dataset.

        :param data: A list of MoleculeDatapoints.
        """
        self.data = data
        if len(self.data) > 0:
            self.args = self.data[0].args
        else:
            self.args = None
        self.scaler = None

    def compound_names(self) -> List[str]:
        """
        Collects the compound name of every molecule (if they exist).

        Only the first datapoint is inspected to decide whether names exist.

        :return: A list of compound names or None if the dataset does not contain compound names.
        """
        has_names = len(self.data) > 0 and self.data[0].compound_name is not None
        if not has_names:
            return None

        return [point.compound_name for point in self.data]

    def smiles(self) -> List[str]:
        """
        Collects the smiles string of every molecule.

        :return: A list of smiles strings.
        """
        return [point.smiles for point in self.data]

    def mols(self) -> List[Chem.Mol]:
        """
        Collects the RDKit molecule of every datapoint.

        :return: A list of RDKit Mols.
        """
        return [point.mol for point in self.data]

    def features(self) -> List[np.ndarray]:
        """
        Collects the features of every molecule (if they exist).

        Only the first datapoint is inspected to decide whether features exist.

        :return: A list of 1D numpy arrays containing the features for each molecule or None if there are no features.
        """
        has_features = len(self.data) > 0 and self.data[0].features is not None
        if not has_features:
            return None

        return [point.features for point in self.data]

    def targets(self) -> List[List[float]]:
        """
        Collects the targets of every molecule.

        :return: A list of lists of floats containing the targets.
        """
        return [point.targets for point in self.data]

    def num_tasks(self) -> int:
        """
        Reports the number of prediction tasks, as given by the first datapoint.

        :return: The number of tasks, or None for an empty dataset.
        """
        if len(self.data) > 0:
            return self.data[0].num_tasks()
        return None

    def features_size(self) -> int:
        """
        Reports the length of the features array, as given by the first datapoint.

        :return: The size of the features, or None if the dataset is empty or has no features.
        """
        if len(self.data) > 0 and self.data[0].features is not None:
            return len(self.data[0].features)
        return None

    def shuffle(self, seed: int = None):
        """
        Shuffles the dataset in place.

        :param seed: Optional random seed; when given it is applied to the module-level ``random`` generator.
        """
        if seed is not None:
            random.seed(seed)
        random.shuffle(self.data)

    def normalize_features(self, scaler: StandardScaler = None, replace_nan_token: int = 0) -> StandardScaler:
        """
        Normalizes the features of every datapoint with a StandardScaler.

        The scaler is chosen in this order: the ``scaler`` argument if given, otherwise the scaler
        already stored on this dataset, otherwise a new StandardScaler fit on this dataset's features.
        The chosen scaler is stored on the dataset.

        :param scaler: A fitted StandardScaler. Used if provided.
        :param replace_nan_token: What to replace nans with; only used when a new scaler is created.
        :return: The scaler that was applied, or None if the dataset is empty or has no features.
        """
        if len(self.data) == 0 or self.data[0].features is None:
            return None

        if scaler is not None:
            self.scaler = scaler
        elif self.scaler is None:
            # Nothing to reuse: fit a fresh scaler on all features stacked row-wise.
            stacked = np.vstack([point.features for point in self.data])
            self.scaler = StandardScaler(replace_nan_token=replace_nan_token)
            self.scaler.fit(stacked)

        for point in self.data:
            # The scaler is handed a single-row 2-D array; row 0 of the result is kept.
            as_row = point.features.reshape(1, -1)
            point.set_features(self.scaler.transform(as_row)[0])

        return self.scaler

    def set_targets(self, targets: List[List[float]]):
        """
        Overwrites the targets of each molecule, pairing ``targets[i]`` with the i-th datapoint.

        :param targets: A list of lists of floats containing targets for each molecule. This must be the
                        same length as the underlying dataset.
        """
        assert len(self.data) == len(targets)
        for index, point in enumerate(self.data):
            point.set_targets(targets[index])

    def sort(self, key: Callable):
        """
        Sorts the dataset in place using the provided key.

        :param key: A function on a MoleculeDatapoint to determine the sorting order.
        """
        self.data.sort(key=key)

    def __len__(self) -> int:
        """
        Reports the number of molecules in the dataset.

        :return: The length of the dataset.
        """
        return len(self.data)

    def __getitem__(self, item) -> Union[MoleculeDatapoint, List[MoleculeDatapoint]]:
        """
        Looks up one or more MoleculeDatapoints via an index or slice.

        :param item: An index (int) or a slice object.
        :return: A MoleculeDatapoint if an int is provided or a list of MoleculeDatapoints if a slice is provided.
        """
        return self.data[item]

0 commit comments

Comments
 (0)