Skip to content

Commit 4b94b3c

Browse files
committed
feat: add merged_lipid_classes
1 parent 9f17c78 commit 4b94b3c

File tree

4 files changed

+163
-97
lines changed

4 files changed

+163
-97
lines changed

lta/helpers/pipeline.py

+67-91
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,7 @@ def __post_init__(self) -> None:
100100
logger.exception(f"{self.file} is a directory. A full traceback follows...")
101101
raise
102102
except pd.errors.EmptyDataError:
103-
logger.exception(
104-
f"{self.file} contains no data. A full traceback follows..."
105-
)
103+
logger.exception(f"{self.file} contains no data. A full traceback follows...")
106104
raise
107105
logger.debug("Binarizing data...")
108106
self.binary = {
@@ -117,19 +115,13 @@ def __post_init__(self) -> None:
117115
}
118116
logger.debug("Filtering data...")
119117
self.filtered = {
120-
group: df.loc[self.binary[group].index, :]
121-
for group, df in data.groupby(axis="columns", level=self.mode)
118+
group: df.loc[self.binary[group].index, :] for group, df in data.groupby(axis="columns", level=self.mode)
122119
}
123120
Path(self.output, "enfc").mkdir(exist_ok=True, parents=True)
124121
Path(self.output, "jaccard").mkdir(exist_ok=True, parents=True)
125122

126-
conditions = [
127-
df.columns.get_level_values(self.level).unique()
128-
for df in self.filtered.values()
129-
]
130-
self.conditions = [
131-
val for mode in conditions for val in mode if val != self.control
132-
]
123+
conditions = [df.columns.get_level_values(self.level).unique() for df in self.filtered.values()]
124+
self.conditions = [val for mode in conditions for val in mode if val != self.control]
133125

134126
def _calculate_enfc(self) -> Dict[str, Dict[str, pd.DataFrame]]:
135127
"""Calculate error-normalised fold change.
@@ -178,9 +170,7 @@ def _get_a_lipids(self) -> Dict[str, pd.DataFrame]:
178170
logger.info("Calculating A-lipids...")
179171
results = {
180172
f"a_{mode}": (
181-
df.groupby(axis="columns", level=self.level)
182-
.all()
183-
.pipe(lambda x: x.loc[x.any(axis="columns"), :])
173+
df.groupby(axis="columns", level=self.level).all().pipe(lambda x: x.loc[x.any(axis="columns"), :])
184174
)
185175
for mode, df in self.binary.items()
186176
}
@@ -231,16 +221,10 @@ def _get_b_lipids(self, picky: bool = True) -> Dict[str, pd.DataFrame]:
231221
# This assumes that self.binary and a_lip will have the same keys
232222
# Which is definitely True
233223
if picky:
234-
data = {
235-
mode: df.drop(index=a_lip[f"a_{mode}"])
236-
for mode, df in self.binary.items()
237-
}
224+
data = {mode: df.drop(index=a_lip[f"a_{mode}"]) for mode, df in self.binary.items()}
238225
subtype = "p"
239226
else:
240-
data = {
241-
mode: df.loc[a_lip[f"a_{mode}"], :]
242-
for mode, df in self.binary.items()
243-
}
227+
data = {mode: df.loc[a_lip[f"a_{mode}"], :] for mode, df in self.binary.items()}
244228
subtype = "c"
245229

246230
logger.info(f"Calculating B{subtype}-lipids...")
@@ -296,47 +280,32 @@ def _get_n_lipids(self, n: int) -> Dict[str, pd.DataFrame]:
296280
compartments = df.columns.get_level_values(self.compartment)
297281
# Mask required to prevent dropping levels
298282
# The initial check must be done with all compartments (only n)...
299-
mask = (
300-
df.groupby(axis="columns", level=self.compartment)
301-
.any()
302-
.sum(axis="columns")
303-
== n
304-
)
283+
mask = df.groupby(axis="columns", level=self.compartment).any().sum(axis="columns") == n
305284

306285
data = [
307286
(group, df.loc[mask, compartments.isin(group)])
308287
for group in itertools.combinations(compartments.unique(), n)
309288
]
310-
logger.debug(
311-
f"N{n} compartment groups before filtering: {[group for group, _ in data]}"
312-
)
289+
logger.debug(f"N{n} compartment groups before filtering: {[group for group, _ in data]}")
313290
# ...which necessitates a second check to drop those that are not
314291
# Again, mask necessary for keeping info
315292
# Also, we only care for groups with lipids
316293
masks = [
317-
df.groupby(axis="columns", level=self.compartment)
318-
.any()
319-
.sum(axis="columns")
320-
== n
321-
for _, df in data
294+
df.groupby(axis="columns", level=self.compartment).any().sum(axis="columns") == n for _, df in data
322295
]
323296
data = [
324297
(group, df.loc[mask, :].groupby(axis="columns", level=self.level).all())
325298
for (group, df), mask in zip(data, masks)
326299
if mask.sum() != 0
327300
]
328-
logger.debug(
329-
f"N{n} compartment groups after filtering: {[group for group, _ in data]}"
330-
)
301+
logger.debug(f"N{n} compartment groups after filtering: {[group for group, _ in data]}")
331302
for compartments, df in data:
332303
n_type = "u" if n == 1 else f"n{n}"
333304
group = "_".join([x.upper() for x in compartments])
334305
results[f"{n_type}_{group}_{mode}"] = df
335306
return results
336307

337-
def _jaccard(
338-
self, data: Dict[str, pd.DataFrame], group: str
339-
) -> Dict[str, Dict[str, pd.DataFrame]]:
308+
def _jaccard(self, data: Dict[str, pd.DataFrame], group: str) -> Dict[str, Dict[str, pd.DataFrame]]:
340309
"""Calculate jaccard similarity and p-values.
341310
342311
This takes a dictionary of data.
@@ -367,84 +336,69 @@ def _jaccard(
367336
mode: lipids.loc[:, [group, self.control]]
368337
.pipe(lambda df: df.loc[df.sum(axis=1) != 0, :])
369338
.groupby(axis="index", level="Category")
370-
.apply(
371-
lambda x: jac.bootstrap(
372-
x.loc[:, group], x.loc[:, self.control], n=self.n
373-
)
374-
)
339+
.apply(lambda x: jac.bootstrap(x.loc[:, group], x.loc[:, self.control], n=self.n))
375340
for mode, lipids in data.items()
376341
}
377342
for group in self.conditions
378343
}
379344
return jaccard
380345

381-
def _generate_enfc_summary(self) -> pd.DataFrame:
    """Write the individual-lipid ENFC tables and return the merged summary.

    For every phenotype returned by ``self._calculate_enfc()`` the per-mode
    frames are concatenated column-wise and written to
    ``<output>/enfc/<phenotype>_by_<control>_individual_lipids.csv``.
    Each frame then gets an extra ``"Phenotype"`` column level so the
    phenotypes stay distinguishable once merged.

    Returns
    -------
    pd.DataFrame
        The merged frame, which is also written to
        ``<output>/enfc/individual_lipids.csv``.
    """
    logger.debug("Generating ENFC summary files...")
    enfcs = self._calculate_enfc()
    frames = []
    levels = set()
    for phenotype, data in enfcs.items():
        df = pd.concat(data, axis="columns")
        df.to_csv(self.output / "enfc" / f"{phenotype}_by_{self.control}_individual_lipids.csv")
        # Tag the columns with the phenotype *after* writing the per-phenotype file,
        # so that file keeps its original header layout.
        df.columns = utils.add_level_to_index(index=df.columns, new_level=phenotype, new_level_name="Phenotype")
        frames.append(df)
        levels.update(df.index.names)
    summary = utils.merge_dataframe_by_level(datas=frames, levels=levels)
    summary.to_csv(self.output / "enfc" / "individual_lipids.csv")
    return summary
399360

400-
def _generate_enfc_class_summary(self) -> pd.DataFrame:
    """Write the lipid-class ENFC tables and return the merged summary.

    ``self.filtered`` is first collapsed to one row per ``"Category"`` by
    summing, and ENFC is recalculated on that aggregated data (stored in
    ``self.enfcs``). For every phenotype the per-mode frames are concatenated
    column-wise and written to
    ``<output>/enfc/<phenotype>_by_<control>_lipid_classes.csv``.

    Returns
    -------
    pd.DataFrame
        The merged frame, which is also written to
        ``<output>/enfc/lipid_classes.csv``.
    """
    logger.debug("Generating class ENFC summary files...")
    # NOTE(review): this overwrites self.filtered with the per-class sums, so any
    # later reader of self.filtered sees class-level data - confirm this is intended.
    self.filtered = {mode: df.groupby(axis="index", level="Category").sum() for mode, df in self.filtered.items()}
    self.enfcs = self._calculate_enfc()
    frames = []
    levels = set()
    for phenotype, data in self.enfcs.items():
        df = pd.concat(data, axis="columns")
        df.to_csv(self.output / "enfc" / f"{phenotype}_by_{self.control}_lipid_classes.csv")
        # Tag the columns with the phenotype only after the per-phenotype file is written.
        df.columns = utils.add_level_to_index(index=df.columns, new_level=phenotype, new_level_name="Phenotype")
        frames.append(df)
        levels.update(df.index.names)
    summary = utils.merge_dataframe_by_level(datas=frames, levels=levels)
    summary.to_csv(self.output / "enfc" / "lipid_classes.csv")
    return summary
420376

421-
def _generate_jaccard_distance_summary(self) -> pd.DataFrame:
    """Write the Jaccard similarity tables and return the merged summary.

    For every distinct phenotype in ``self.conditions`` the A, Bc, Bp, N2 and
    U Jaccard results are concatenated column-wise and written to
    ``<output>/jaccard/<phenotype>_to_<control>_jaccard_similarity.csv``.
    Each frame then gets an extra ``"Phenotype"`` column level before merging.

    Returns
    -------
    pd.DataFrame
        The merged frame, which is also written to
        ``<output>/jaccard/jaccard_similarity.csv``.
    """
    logger.debug("Generating Jaccard distance summary files...")
    frames = []
    levels = set()
    # De-duplicate while keeping first-seen order. Iterating a plain set() would
    # visit phenotypes in a hash-dependent order, making the order in which frames
    # are merged vary between runs.
    for phenotype in dict.fromkeys(self.conditions):
        jaccard = pd.concat(
            {
                **self.a_jaccard[phenotype],
                **self.bc_jaccard[phenotype],
                **self.bp_jaccard[phenotype],
                **self.n2_jaccard[phenotype],
                **self.u_jaccard[phenotype],
            },
            axis="columns",
        )
        jaccard.columns.names = ["type_compartment_mode", "Metrics"]
        jaccard.to_csv(self.output / "jaccard" / f"{phenotype}_to_{self.control}_jaccard_similarity.csv")
        # Tag the columns with the phenotype only after the per-phenotype file is written.
        jaccard.columns = utils.add_level_to_index(
            index=jaccard.columns, new_level=phenotype, new_level_name="Phenotype"
        )
        frames.append(jaccard)
        levels.update(jaccard.index.names)
    summary = utils.merge_dataframe_by_level(datas=frames, levels=levels)
    summary.to_csv(self.output / "jaccard" / "jaccard_similarity.csv")
    return summary
448402

449403
def run(self) -> None:
450404
"""Run the full LTA pipeline.
@@ -490,8 +444,30 @@ def run(self) -> None:
490444
).fillna(False)
491445
summary.columns.names = ["type_compartment_mode", "Phenotype"]
492446
summary.to_csv(self.output / "switch_individual_lipids.csv")
493-
summary.groupby(axis="index", level="Category").sum().to_csv(
494-
self.output / "switch_lipid_classes.csv"
495-
)
447+
lipid_classes = summary.groupby(axis="index", level="Category").sum()
448+
lipid_classes.to_csv(self.output / "switch_lipid_classes.csv")
449+
450+
jaccard_similarity = self._generate_jaccard_distance_summary()
496451

497-
self._generate_jaccard_distance_summary()
452+
lipid_classes.columns = utils.add_level_to_index(
453+
index=lipid_classes.columns, new_level="-", new_level_name="Metrics"
454+
)
455+
jaccard_similarity.columns = utils.reorder_index(
456+
index=jaccard_similarity.columns, orders=lipid_classes.columns.names
457+
)
458+
merged_lipid_classes = utils.merge_dataframe_by_level(
459+
datas=[
460+
lipid_classes,
461+
jaccard_similarity,
462+
],
463+
levels=lipid_classes.index.names,
464+
)
465+
merged_lipid_classes = utils.sort_columns(
466+
data=merged_lipid_classes, level="type_compartment_mode",
467+
pressing=lipid_classes.columns.get_level_values("type_compartment_mode").unique().to_list()
468+
)
469+
merged_lipid_classes = utils.sort_columns(data=merged_lipid_classes, level="Phenotype", pressing=[self.control])
470+
merged_lipid_classes = utils.sort_columns(
471+
data=merged_lipid_classes, level="Metrics", pressing=["-"]
472+
)
473+
merged_lipid_classes.to_csv(self.output / "merged_lipid_classes.csv")

lta/helpers/utils.py

+43-6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import itertools
2+
import operator
13
import typing
24
import pandas as pd
35

@@ -11,14 +13,49 @@ def merge_dataframe_by_level(
1113

1214

1315
def add_level_to_index(
    *, index: typing.Union[pd.MultiIndex, pd.Index], new_level: typing.Any, new_level_name: str
) -> pd.MultiIndex:
    """Return a MultiIndex equal to ``index`` plus one constant innermost level.

    Parameters
    ----------
    index
        The index to extend. Both flat and multi-level indexes are accepted.
    new_level
        The value placed in the new level for every entry.
    new_level_name
        The name given to the new level.

    Returns
    -------
    pd.MultiIndex
        A new index whose names are the original names followed by
        ``new_level_name``.
    """
    names = list(index.names) + [new_level_name]
    if isinstance(index, pd.MultiIndex):
        # Entries are already tuples, so just extend each one.
        tuples = [tuple(val) + (new_level,) for val in index.to_flat_index()]
    else:
        # A flat index holds scalars; they cannot be concatenated with a tuple,
        # so build the two-element tuple explicitly.
        tuples = [(val, new_level) for val in index]
    return pd.MultiIndex.from_tuples(tuples, names=names)
22+
23+
24+
def reorder_index(
    *,
    index: typing.Union[pd.MultiIndex, pd.Index],
    orders: typing.List[typing.Union[str, int]],
) -> typing.Union[pd.MultiIndex, pd.Index]:
    """Rearrange the levels of ``index`` into the order given by ``orders``.

    Parameters
    ----------
    index
        The index whose levels should be rearranged.
    orders
        Level names or positions, in the desired order.

    Returns
    -------
    pd.MultiIndex or pd.Index
        The reordered MultiIndex. A flat index has only one level, so it is
        returned unchanged (the very same object).
    """
    if not isinstance(index, pd.MultiIndex):
        return index
    return index.reorder_levels(orders)
32+
33+
34+
def sort_columns(
    *,
    data: pd.DataFrame,
    level: typing.Union[str, int],
    pressing: typing.List[typing.Union[str, int]],
) -> pd.DataFrame:
    """Reorder the columns of ``data`` along one column level.

    Values listed in ``pressing`` come first, in the given order; all other
    values of that level follow in sorted order.

    Parameters
    ----------
    data
        The frame whose columns should be reordered.
    level
        Name or position of the column level to order by.
    pressing
        Level values that must lead the ordering.

    Returns
    -------
    pd.DataFrame
        A reindexed copy of ``data`` with the new column order.
    """
    idx = data.columns
    pressed = set(pressing)
    # Unique, sorted values of the level that were not explicitly prioritised.
    remaining = sorted({v for v in idx.get_level_values(level).values if v not in pressed})
    # Copy ``pressing`` so any sequence type is accepted.
    level_ordering = list(pressing) + remaining

    if not isinstance(idx, pd.MultiIndex):
        # ``reindex(level=...)`` only applies to a MultiIndex; flat columns are
        # reordered directly by label.
        return data.reindex(columns=level_ordering)

    # Let pandas compute the per-level ordering, then rebuild the full column
    # index from it and reindex all columns against that.
    new_level_index = data.reindex(level=level, columns=level_ordering).columns
    new_column_order_map = dict(zip(idx[idx.isin(new_level_index)], new_level_index))
    new_multi_index = pd.MultiIndex.from_tuples(
        [new_column_order_map.get(x, x) for x in idx],
        names=idx.names,
    )
    return data.reindex(new_multi_index, axis=1)

tests/unit/helpers/__init__.py

Whitespace-only changes.

tests/unit/helpers/test_utils.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
from lta.helpers import utils
5+
6+
7+
def test_merge_dataframe_by_level() -> None:
    """Merging two frames that share an index keeps the index and lines the columns up side by side."""
    idx = pd.MultiIndex.from_product([["A", "B", "C"], ["1", "2", "3"]], names=["alphabet", "numeric"])
    col = ["c1", "c2", "c3", "c4"]

    # Two data blocks with the same shape whose cells all differ (prefix d1_ vs d2_).
    df1_data, df2_data = (np.array([[f"d{i+1}_{i1}{i2}_{c}" for c in col] for i1, i2 in idx]) for i in range(2))
    assert (df1_data != df2_data).all()
    frames = [
        pd.DataFrame(df1_data, idx, col),
        pd.DataFrame(df2_data, idx, col),
    ]
    # Pass the level *names*, matching how the pipeline calls this helper
    # (it collects ``df.index.names``), rather than the level value arrays.
    res = utils.merge_dataframe_by_level(datas=frames, levels=idx.names)
    assert res.index.identical(idx)
    assert res.columns.identical(pd.Index(col * 2))
    assert (res.to_numpy() == np.append(df1_data, df2_data, axis=1)).all()
21+
22+
23+
def test_add_level_to_multi_index() -> None:
    """Adding a level to a two-level index yields the same product with a constant third level."""
    letters = ["A", "B", "C"]
    digits = ["1", "2", "3"]
    source = pd.MultiIndex.from_product([letters, digits], names=["alphabet", "numeric"])
    wanted = pd.MultiIndex.from_product(
        [letters, digits, ["ADDED"]],
        names=["alphabet", "numeric", "newly added"],
    )
    actual = utils.add_level_to_index(index=source, new_level="ADDED", new_level_name="newly added")
    assert actual.identical(wanted)
31+
32+
33+
def test_reorder_column_index() -> None:
    """Reordering swaps the level order of every entry and carries the level names along."""
    idx = pd.MultiIndex.from_product([["A", "B", "C"], ["1", "2", "3"]], names=["alphabet", "numeric"])
    new_idx = utils.reorder_index(index=idx, orders=["numeric", "alphabet"])

    # Check the reordered index directly; the frame comparison below assigns
    # ``new_idx`` to both sides and so cannot detect a wrong level order.
    assert list(new_idx.names) == ["numeric", "alphabet"]
    assert list(new_idx) == [(numeric, alphabet) for alphabet, numeric in idx]

    col = ["c1", "c2", "c3", "c4"]
    df_data = np.array([[f"d{i1}{i2}_{c}" for c in col] for i1, i2 in idx])
    expected = pd.DataFrame(df_data, new_idx, col)
    res = pd.DataFrame(df_data, idx, col)
    res.index = new_idx
    assert expected.equals(res)
42+
43+
44+
def test_sort_columns() -> None:
    """Pressing "2" and "3" on the numeric level moves them ahead of "1" within every alphabet group."""
    rows = ["c1", "c2", "c3", "c4"]
    level_names = ["alphabet", "numeric"]
    letters = ["A", "B", "C"]

    def build(columns: pd.MultiIndex) -> pd.DataFrame:
        # One labelled cell per (row, column) pair, so misplaced data is detectable.
        cells = np.array([[f"d{i1}{i2}_{c}" for c in rows] for i1, i2 in columns]).T
        return pd.DataFrame(cells, rows, columns)

    expected = build(pd.MultiIndex.from_product([letters, ["2", "3", "1"]], names=level_names))
    df = build(pd.MultiIndex.from_product([letters, ["1", "2", "3"]], names=level_names))
    res = utils.sort_columns(data=df, level="numeric", pressing=["2", "3"])
    assert expected.equals(res)

0 commit comments

Comments
 (0)