@@ -100,9 +100,7 @@ def __post_init__(self) -> None:
100
100
logger .exception (f"{ self .file } is a directory. A full traceback follows..." )
101
101
raise
102
102
except pd .errors .EmptyDataError :
103
- logger .exception (
104
- f"{ self .file } contains no data. A full traceback follows..."
105
- )
103
+ logger .exception (f"{ self .file } contains no data. A full traceback follows..." )
106
104
raise
107
105
logger .debug ("Binarizing data..." )
108
106
self .binary = {
@@ -117,19 +115,13 @@ def __post_init__(self) -> None:
117
115
}
118
116
logger .debug ("Filtering data..." )
119
117
self .filtered = {
120
- group : df .loc [self .binary [group ].index , :]
121
- for group , df in data .groupby (axis = "columns" , level = self .mode )
118
+ group : df .loc [self .binary [group ].index , :] for group , df in data .groupby (axis = "columns" , level = self .mode )
122
119
}
123
120
Path (self .output , "enfc" ).mkdir (exist_ok = True , parents = True )
124
121
Path (self .output , "jaccard" ).mkdir (exist_ok = True , parents = True )
125
122
126
- conditions = [
127
- df .columns .get_level_values (self .level ).unique ()
128
- for df in self .filtered .values ()
129
- ]
130
- self .conditions = [
131
- val for mode in conditions for val in mode if val != self .control
132
- ]
123
+ conditions = [df .columns .get_level_values (self .level ).unique () for df in self .filtered .values ()]
124
+ self .conditions = [val for mode in conditions for val in mode if val != self .control ]
133
125
134
126
def _calculate_enfc (self ) -> Dict [str , Dict [str , pd .DataFrame ]]:
135
127
"""Calculate error-normalised fold change.
@@ -178,9 +170,7 @@ def _get_a_lipids(self) -> Dict[str, pd.DataFrame]:
178
170
logger .info ("Calculating A-lipids..." )
179
171
results = {
180
172
f"a_{ mode } " : (
181
- df .groupby (axis = "columns" , level = self .level )
182
- .all ()
183
- .pipe (lambda x : x .loc [x .any (axis = "columns" ), :])
173
+ df .groupby (axis = "columns" , level = self .level ).all ().pipe (lambda x : x .loc [x .any (axis = "columns" ), :])
184
174
)
185
175
for mode , df in self .binary .items ()
186
176
}
@@ -231,16 +221,10 @@ def _get_b_lipids(self, picky: bool = True) -> Dict[str, pd.DataFrame]:
231
221
# This assumes that self.binary and a_lip will have the same keys
232
222
# Which is definitely True
233
223
if picky :
234
- data = {
235
- mode : df .drop (index = a_lip [f"a_{ mode } " ])
236
- for mode , df in self .binary .items ()
237
- }
224
+ data = {mode : df .drop (index = a_lip [f"a_{ mode } " ]) for mode , df in self .binary .items ()}
238
225
subtype = "p"
239
226
else :
240
- data = {
241
- mode : df .loc [a_lip [f"a_{ mode } " ], :]
242
- for mode , df in self .binary .items ()
243
- }
227
+ data = {mode : df .loc [a_lip [f"a_{ mode } " ], :] for mode , df in self .binary .items ()}
244
228
subtype = "c"
245
229
246
230
logger .info (f"Calculating B{ subtype } -lipids..." )
@@ -296,47 +280,32 @@ def _get_n_lipids(self, n: int) -> Dict[str, pd.DataFrame]:
296
280
compartments = df .columns .get_level_values (self .compartment )
297
281
# Mask required to prevent dropping levels
298
282
# The initial check must be done with all compartments (only n)...
299
- mask = (
300
- df .groupby (axis = "columns" , level = self .compartment )
301
- .any ()
302
- .sum (axis = "columns" )
303
- == n
304
- )
283
+ mask = df .groupby (axis = "columns" , level = self .compartment ).any ().sum (axis = "columns" ) == n
305
284
306
285
data = [
307
286
(group , df .loc [mask , compartments .isin (group )])
308
287
for group in itertools .combinations (compartments .unique (), n )
309
288
]
310
- logger .debug (
311
- f"N{ n } compartment groups before filtering: { [group for group , _ in data ]} "
312
- )
289
+ logger .debug (f"N{ n } compartment groups before filtering: { [group for group , _ in data ]} " )
313
290
# ...which necessitates a second check to drop those that are not
314
291
# Again, mask necessary for keeping info
315
292
# Also, we only care for groups with lipids
316
293
masks = [
317
- df .groupby (axis = "columns" , level = self .compartment )
318
- .any ()
319
- .sum (axis = "columns" )
320
- == n
321
- for _ , df in data
294
+ df .groupby (axis = "columns" , level = self .compartment ).any ().sum (axis = "columns" ) == n for _ , df in data
322
295
]
323
296
data = [
324
297
(group , df .loc [mask , :].groupby (axis = "columns" , level = self .level ).all ())
325
298
for (group , df ), mask in zip (data , masks )
326
299
if mask .sum () != 0
327
300
]
328
- logger .debug (
329
- f"N{ n } compartment groups after filtering: { [group for group , _ in data ]} "
330
- )
301
+ logger .debug (f"N{ n } compartment groups after filtering: { [group for group , _ in data ]} " )
331
302
for compartments , df in data :
332
303
n_type = "u" if n == 1 else f"n{ n } "
333
304
group = "_" .join ([x .upper () for x in compartments ])
334
305
results [f"{ n_type } _{ group } _{ mode } " ] = df
335
306
return results
336
307
337
- def _jaccard (
338
- self , data : Dict [str , pd .DataFrame ], group : str
339
- ) -> Dict [str , Dict [str , pd .DataFrame ]]:
308
+ def _jaccard (self , data : Dict [str , pd .DataFrame ], group : str ) -> Dict [str , Dict [str , pd .DataFrame ]]:
340
309
"""Calculate jaccard similarity and p-values.
341
310
342
311
This takes a dictionary of data.
@@ -367,84 +336,69 @@ def _jaccard(
367
336
mode : lipids .loc [:, [group , self .control ]]
368
337
.pipe (lambda df : df .loc [df .sum (axis = 1 ) != 0 , :])
369
338
.groupby (axis = "index" , level = "Category" )
370
- .apply (
371
- lambda x : jac .bootstrap (
372
- x .loc [:, group ], x .loc [:, self .control ], n = self .n
373
- )
374
- )
339
+ .apply (lambda x : jac .bootstrap (x .loc [:, group ], x .loc [:, self .control ], n = self .n ))
375
340
for mode , lipids in data .items ()
376
341
}
377
342
for group in self .conditions
378
343
}
379
344
return jaccard
380
345
381
def _generate_enfc_summary(self) -> pd.DataFrame:
    """Write the individual-lipid ENFC tables and return the combined one.

    For every entry returned by ``self._calculate_enfc()`` the inner
    mapping of frames is concatenated column-wise and written to
    ``<output>/enfc/<phenotype>_by_<control>_individual_lipids.csv``.
    All per-phenotype tables are then merged and written to
    ``<output>/enfc/individual_lipids.csv``.

    Returns
    -------
    pd.DataFrame
        The merged table that was written to ``individual_lipids.csv``.
    """
    logger.debug("Generating ENFC summary files...")
    enfcs = self._calculate_enfc()
    enfc_dir = self.output / "enfc"

    frames = []
    levels = set()
    for phenotype, data in enfcs.items():
        # Concatenating a dict puts its keys in the outermost column level.
        df = pd.concat(data, axis="columns")
        # The per-phenotype file is written BEFORE the "Phenotype" level is
        # attached, so its header does not carry that extra level.
        df.to_csv(enfc_dir / f"{phenotype}_by_{self.control}_individual_lipids.csv")
        # NOTE(review): presumably adds a column level named "Phenotype"
        # holding `phenotype` -- confirm against utils.add_level_to_index.
        df.columns = utils.add_level_to_index(
            index=df.columns, new_level=phenotype, new_level_name="Phenotype"
        )
        frames.append(df)
        # Collect every row-index level name seen across the frames.
        levels.update(df.index.names)

    summary = utils.merge_dataframe_by_level(datas=frames, levels=levels)
    summary.to_csv(enfc_dir / "individual_lipids.csv")
    return summary
399
360
400
def _generate_enfc_class_summary(self) -> pd.DataFrame:
    """Write the lipid-class ENFC tables and return the combined one.

    ``self.filtered`` is first collapsed to one row per ``"Category"``
    index level (row-wise sum), then ``self._calculate_enfc()`` is re-run
    and its result stored on ``self.enfcs``. Each entry is written to
    ``<output>/enfc/<phenotype>_by_<control>_lipid_classes.csv`` and the
    merged table to ``<output>/enfc/lipid_classes.csv``.

    Side effects: ``self.filtered`` is REPLACED by the class-level sums and
    ``self.enfcs`` is (re)assigned, so anything reading ``self.filtered``
    after this call sees the aggregated data, not individual lipids.

    Returns
    -------
    pd.DataFrame
        The merged table that was written to ``lipid_classes.csv``.
    """
    logger.debug("Generating class ENFC summary files...")
    # Row-wise grouping is the groupby default; the explicit `axis` keyword
    # is deprecated in recent pandas, so it is omitted here.
    self.filtered = {
        mode: df.groupby(level="Category").sum()
        for mode, df in self.filtered.items()
    }
    self.enfcs = self._calculate_enfc()
    enfc_dir = self.output / "enfc"

    frames = []
    levels = set()
    for phenotype, data in self.enfcs.items():
        # Concatenating a dict puts its keys in the outermost column level.
        df = pd.concat(data, axis="columns")
        # Written before the "Phenotype" level is attached to the columns.
        df.to_csv(enfc_dir / f"{phenotype}_by_{self.control}_lipid_classes.csv")
        # NOTE(review): presumably adds a column level named "Phenotype"
        # holding `phenotype` -- confirm against utils.add_level_to_index.
        df.columns = utils.add_level_to_index(
            index=df.columns, new_level=phenotype, new_level_name="Phenotype"
        )
        frames.append(df)
        levels.update(df.index.names)

    summary = utils.merge_dataframe_by_level(datas=frames, levels=levels)
    summary.to_csv(enfc_dir / "lipid_classes.csv")
    return summary
420
376
421
def _generate_jaccard_distance_summary(self) -> pd.DataFrame:
    """Write the Jaccard similarity tables and return the combined one.

    For each distinct value in ``self.conditions`` the per-phenotype
    entries of ``self.a_jaccard``, ``self.bc_jaccard``, ``self.bp_jaccard``,
    ``self.n2_jaccard`` and ``self.u_jaccard`` are merged into one mapping,
    concatenated column-wise and written to
    ``<output>/jaccard/<phenotype>_to_<control>_jaccard_similarity.csv``.
    The per-phenotype tables are then merged and written to
    ``<output>/jaccard/jaccard_similarity.csv``.

    Returns
    -------
    pd.DataFrame
        The merged table that was written to ``jaccard_similarity.csv``.
    """
    logger.debug("Generating Jaccard distance summary files...")
    jaccard_dir = self.output / "jaccard"

    frames = []
    levels = set()
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # frames are processed in a reproducible order (a plain set of strings
    # iterates in hash order, which can differ between interpreter runs).
    for phenotype in dict.fromkeys(self.conditions):
        jaccard = pd.concat(
            {
                # On a key collision the later mapping wins.
                **self.a_jaccard[phenotype],
                **self.bc_jaccard[phenotype],
                **self.bp_jaccard[phenotype],
                **self.n2_jaccard[phenotype],
                **self.u_jaccard[phenotype],
            },
            axis="columns",
        )
        # Outer level = dict key from the concat above, inner level = the
        # columns of each concatenated frame.
        jaccard.columns.names = ["type_compartment_mode", "Metrics"]
        # Written before the "Phenotype" level is attached to the columns.
        jaccard.to_csv(
            jaccard_dir / f"{phenotype}_to_{self.control}_jaccard_similarity.csv"
        )
        # NOTE(review): presumably adds a column level named "Phenotype"
        # holding `phenotype` -- confirm against utils.add_level_to_index.
        jaccard.columns = utils.add_level_to_index(
            index=jaccard.columns, new_level=phenotype, new_level_name="Phenotype"
        )
        frames.append(jaccard)
        levels.update(jaccard.index.names)

    summary = utils.merge_dataframe_by_level(datas=frames, levels=levels)
    summary.to_csv(jaccard_dir / "jaccard_similarity.csv")
    return summary
448
402
449
403
def run (self ) -> None :
450
404
"""Run the full LTA pipeline.
@@ -490,8 +444,30 @@ def run(self) -> None:
490
444
).fillna (False )
491
445
summary .columns .names = ["type_compartment_mode" , "Phenotype" ]
492
446
summary .to_csv (self .output / "switch_individual_lipids.csv" )
493
- summary .groupby (axis = "index" , level = "Category" ).sum ().to_csv (
494
- self .output / "switch_lipid_classes.csv"
495
- )
447
+ lipid_classes = summary .groupby (axis = "index" , level = "Category" ).sum ()
448
+ lipid_classes .to_csv (self .output / "switch_lipid_classes.csv" )
449
+
450
+ jaccard_similarity = self ._generate_jaccard_distance_summary ()
496
451
497
- self ._generate_jaccard_distance_summary ()
452
+ lipid_classes .columns = utils .add_level_to_index (
453
+ index = lipid_classes .columns , new_level = "-" , new_level_name = "Metrics"
454
+ )
455
+ jaccard_similarity .columns = utils .reorder_index (
456
+ index = jaccard_similarity .columns , orders = lipid_classes .columns .names
457
+ )
458
+ merged_lipid_classes = utils .merge_dataframe_by_level (
459
+ datas = [
460
+ lipid_classes ,
461
+ jaccard_similarity ,
462
+ ],
463
+ levels = lipid_classes .index .names ,
464
+ )
465
+ merged_lipid_classes = utils .sort_columns (
466
+ data = merged_lipid_classes , level = "type_compartment_mode" ,
467
+ pressing = lipid_classes .columns .get_level_values ("type_compartment_mode" ).unique ().to_list ()
468
+ )
469
+ merged_lipid_classes = utils .sort_columns (data = merged_lipid_classes , level = "Phenotype" , pressing = [self .control ])
470
+ merged_lipid_classes = utils .sort_columns (
471
+ data = merged_lipid_classes , level = "Metrics" , pressing = ["-" ]
472
+ )
473
+ merged_lipid_classes .to_csv (self .output / "merged_lipid_classes.csv" )
0 commit comments