80
80
"tor" : "Siemens" ,
81
81
}
82
82
83
+ variable_to_label = {
84
+ 'MEAN(area)' : 'CSA [$mm^2$]' ,
85
+ 'edss_M0' : 'EDSS' ,
86
+ 'lesion_volume' : 'Lesion volume [$mm^3$]' ,
87
+ }
88
+
83
89
84
90
def get_parser ():
85
91
parser = argparse .ArgumentParser (
86
92
description = "Generate figure for T2w C2-C3 CSA. The figure is saved to the same folder as the input .csv file."
87
93
)
88
94
parser .add_argument (
89
- '-i -canproco' ,
95
+ '-csa -canproco' ,
90
96
required = True ,
91
97
metavar = '<file_path>' ,
92
- help = "input .csv file with canproco CSA values" )
98
+ help = "Path to the input .csv file with canproco CSA values. " )
93
99
parser .add_argument (
94
- '-i -spinegeneric' ,
100
+ '-csa -spinegeneric' ,
95
101
required = True ,
96
102
metavar = '<file_path>' ,
97
- help = "input .csv file with spine-generic CSA values" )
103
+ help = "Path to the input .csv file with spine-generic CSA values. " )
98
104
parser .add_argument (
99
- '-participants-file- canproco' ,
105
+ '-participants-canproco' ,
100
106
required = True ,
101
107
metavar = '<file_path>' ,
102
- help = "canproco participants.tsv file (includes pathology and phenotype columns)" )
108
+ help = "Path to the canproco participants.tsv file (includes pathology and phenotype columns). " )
103
109
parser .add_argument (
104
- '-participants-file- spinegeneric' ,
110
+ '-participants-spinegeneric' ,
105
111
required = True ,
106
112
metavar = '<file_path>' ,
107
- help = "spine-generic participants.tsv file (includes vendor column)" )
113
+ help = "Path to the spine-generic participants.tsv file (includes vendor column)." )
114
+ parser .add_argument (
115
+ '-lesion-folder' ,
116
+ required = False ,
117
+ metavar = '<file_path>' ,
118
+ help = "Path to the folder with .xls files with lesion volumes generated by sct_analyze_lesion." )
108
119
109
120
return parser
110
121
@@ -305,10 +316,7 @@ def compute_partial_correlation(canproco_pd, site):
305
316
:return:
306
317
"""
307
318
# Work only with MS patients
308
- if site == 'all' :
309
- ms_pd = canproco_pd [canproco_pd ['pathology' ] == 'MS' ]
310
- else :
311
- ms_pd = canproco_pd [(canproco_pd ['pathology' ] == 'MS' ) & (canproco_pd ['site' ] == site )]
319
+ ms_pd = canproco_pd [(canproco_pd ['pathology' ] == 'MS' ) & (canproco_pd ['site' ] == site )]
312
320
# Convert str to int (to be compatible with partial correlation)
313
321
ms_pd = ms_pd .replace ({'phenotype' : {'RRMS' : 0 , 'PPMS' : 1 , 'RIS' : 2 }})
314
322
stats = pg .partial_corr (data = ms_pd , x = 'MEAN(area)' , y = 'edss_M0' , covar = 'phenotype' , method = 'spearman' )
@@ -356,33 +364,32 @@ def compute_regression(x, y):
356
364
return x_vals , y_vals
357
365
358
366
359
- def create_csa_edss_correlation_figure_persite (canproco_pd , fname_fig ):
367
+ def create_correlation_figures_persite (canproco_pd , pair , fname_fig ):
360
368
"""
361
- Plot the relationship between EDSS score and CSA per-site and per-phenotype. Also, plot linear fit per-phenotype and
369
+ Plot the relationship between pairs of variables per-site and per-phenotype. Also, plot linear fit per-phenotype and
362
370
for the whole cohort.
363
- :param canproco_pd:
364
- :param fname_fig:
371
+ :param canproco_pd: pandas dataframe: canproco data
372
+ :param pair: tuple: pairs of variables for which correlation will be computed, for example ('MEAN(area)', 'edss_M0')
373
+ :param fname_fig: str: figure name
365
374
:return:
366
375
"""
376
+ # Drop rows with NaN values for the pair of variables
377
+ canproco_pd = canproco_pd .dropna (subset = list (pair ))
367
378
# Create main figure
368
- fig , axes = plt .subplots (2 , 3 , figsize = (20 , 14 ), sharey = True )
379
+ fig , axes = plt .subplots (2 , 3 , figsize = (20 , 14 ), sharex = True , sharey = True )
369
380
# Flatten 2D array into 1D to allow iteration by loop
370
381
ax = axes .ravel ()
371
382
# Loop across sites (all means all sites together)
372
383
for index , site in enumerate (site_to_vendor_title .keys ()):
373
384
# Compute partial correlation (with phenotype as a covariate)
374
385
r , p_val = compute_partial_correlation (canproco_pd , site )
375
- print (f'{ site } : Partial correlation EDSS vs CSA : r={ r } , p-value{ format_pvalue (p_val , alpha = 0.05 )} ' )
386
+ print (f'{ site } : Partial correlation { pair [ 0 ] } vs { pair [ 1 ] } : r={ r } , p-value{ format_pvalue (p_val , alpha = 0.05 )} ' )
376
387
# Compute linear regression for all MS patients together (i.e., across all phenotypes) --> ['pathology'] == 'MS'
377
- if site == 'all' :
378
- csa = canproco_pd [canproco_pd ['pathology' ] == 'MS' ]['MEAN(area)' ]
379
- edss = canproco_pd [canproco_pd ['pathology' ] == 'MS' ]['edss_M0' ]
380
- #phen = canproco_pd[canproco_pd['pathology'] == 'MS']['phenotype']
381
- else :
382
- csa = canproco_pd [(canproco_pd ['pathology' ] == 'MS' ) & (canproco_pd ['site' ] == site )]['MEAN(area)' ]
383
- edss = canproco_pd [(canproco_pd ['pathology' ] == 'MS' ) & (canproco_pd ['site' ] == site )]['edss_M0' ]
384
- #phen = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['phenotype']
385
- x_vals , y_vals = compute_regression (csa , edss )
388
+ var1 = canproco_pd [(canproco_pd ['pathology' ] == 'MS' ) & (canproco_pd ['site' ] == site )][pair [0 ]]
389
+ var2 = canproco_pd [(canproco_pd ['pathology' ] == 'MS' ) & (canproco_pd ['site' ] == site )][pair [1 ]]
390
+ #phen = canproco_pd[(canproco_pd['pathology'] == 'MS') & (canproco_pd['site'] == site)]['phenotype']
391
+ x_vals , y_vals = compute_regression (var1 , var2 )
392
+ # TODO - Consider replacing by sns.regplot
386
393
ax [index ].plot (x_vals , y_vals , '--' , color = 'black' , alpha = .5 , linewidth = 3 )
387
394
388
395
# Insert text with corr coef and pval into every subplot/axis
@@ -392,25 +399,20 @@ def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
392
399
393
400
for color , phenotype in enumerate (['RRMS' , 'PPMS' , 'RIS' ]):
394
401
# Prepare variables for plotting
395
- if site == 'all' :
396
- csa = canproco_pd [canproco_pd ['phenotype' ] == phenotype ]['MEAN(area)' ]
397
- edss = canproco_pd [canproco_pd ['phenotype' ] == phenotype ]['edss_M0' ]
398
- r , p_val = compute_correlation (csa , edss )
399
- else :
400
- csa = canproco_pd [(canproco_pd ['phenotype' ] == phenotype ) & (canproco_pd ['site' ] == site )]['MEAN(area)' ]
401
- edss = canproco_pd [(canproco_pd ['phenotype' ] == phenotype ) & (canproco_pd ['site' ] == site )]['edss_M0' ]
402
- r , p_val = compute_correlation (csa , edss )
403
- print (f'{ site } , { phenotype } : Correlation EDSS vs CSA: r={ r } , p-value{ format_pvalue (p_val , alpha = 0.05 )} ' )
402
+ var1 = canproco_pd [(canproco_pd ['phenotype' ] == phenotype ) & (canproco_pd ['site' ] == site )][pair [0 ]]
403
+ var2 = canproco_pd [(canproco_pd ['phenotype' ] == phenotype ) & (canproco_pd ['site' ] == site )][pair [1 ]]
404
+ r , p_val = compute_correlation (var1 , var2 )
405
+ print (f'{ site } , { phenotype } : Correlation { pair [0 ]} vs { pair [1 ]} : r={ r } , p-value{ format_pvalue (p_val , alpha = 0.05 )} ' )
404
406
# Plot individual scatter plots
405
- ax [index ].scatter (csa , edss , color = color_pallete [color ], alpha = .8 , label = phenotype , s = 100 )
406
- x_vals , y_vals = compute_regression (csa , edss )
407
+ ax [index ].scatter (var1 , var2 , color = color_pallete [color ], alpha = .8 , label = phenotype , s = 100 )
408
+ x_vals , y_vals = compute_regression (var1 , var2 )
407
409
ax [index ].plot (x_vals , y_vals , '--' , color = color_pallete [color ], alpha = .8 , linewidth = 3 )
408
410
if site == 'all' :
409
411
ax [index ].set_title (site_to_vendor_title [site ], fontsize = FONTSIZE_CORR , fontweight = 'bold' )
410
412
else :
411
413
ax [index ].set_title (site_to_vendor_title [site ], fontsize = FONTSIZE_CORR )
412
414
if index > 2 :
413
- ax [index ].set_xlabel ('CSA [$mm^2$]' , fontsize = FONTSIZE_CORR )
415
+ ax [index ].set_xlabel (variable_to_label [ pair [ 0 ]] , fontsize = FONTSIZE_CORR )
414
416
415
417
# # Set fixed number of y-ticks
416
418
# xmin, xmax = ax[index].get_xlim()
@@ -419,7 +421,7 @@ def create_csa_edss_correlation_figure_persite(canproco_pd, fname_fig):
419
421
# ax[index].set_xticklabels(custom_ticks)
420
422
421
423
if index == 0 or index == 3 :
422
- ax [index ].set_ylabel ('EDSS' , fontsize = FONTSIZE_CORR )
424
+ ax [index ].set_ylabel (variable_to_label [ pair [ 1 ]] , fontsize = FONTSIZE_CORR )
423
425
# Increase size of xticks and yticks
424
426
plt .setp (ax [index ].xaxis .get_majorticklabels (), fontsize = FONTSIZE_CORR )
425
427
plt .setp (ax [index ].yaxis .get_majorticklabels (), fontsize = FONTSIZE_CORR )
@@ -552,20 +554,63 @@ def read_participants_file(file_path):
552
554
raise FileNotFoundError (f'{ file_path } not found' )
553
555
554
556
557
+ def read_lesion_files (lesion_folder ):
558
+ """
559
+ Read xls files containing lesion volume generated by sct_analyze_lesion and aggregate them into one pandas DF
560
+ """
561
+ list_of_files = os .listdir (lesion_folder )
562
+ # Ignore .DS_Store
563
+ if '.DS_Store' in list_of_files :
564
+ list_of_files .remove ('.DS_Store' )
565
+ list_of_files .sort ()
566
+
567
+ # Initialize pandas dataFrame where lesion volume across all subjects will be stored
568
+ lesion_df = pd .DataFrame (columns = ['subject_id' , 'lesion_volume' , 'number_of_lesions' ])
569
+
570
+ # Loop across subjects
571
+ for file in list_of_files :
572
+ # Get subject ID
573
+ subject_id = file .split ('_' )[0 ]
574
+ lesion_dict = {'subject_id' : [], 'lesion_volume' : [], 'number_of_lesions' : []}
575
+ # Construct path to xls file
576
+ file_path = os .path .join (lesion_folder , file )
577
+ # Read xls file as pandas dataFrame
578
+ # run 'pip install xlrd' if you get an error
579
+ df = pd .read_excel (file_path )
580
+ lesion_dict ['subject_id' ] = subject_id
581
+ # Sum lesion volume across all lesions
582
+ lesion_dict ['lesion_volume' ] = df ['volume [mm3]' ].sum ()
583
+ # Get number of lesions (number of rows in the dataFrame)
584
+ lesion_dict ['number_of_lesions' ] = df .shape [0 ]
585
+ # Insert lesion_dict into lesion_df as a new row
586
+ lesion_df .loc [subject_id ] = lesion_dict
587
+
588
+ return lesion_df
589
+
590
+
555
591
def main ():
556
592
parser = get_parser ()
557
593
args = parser .parse_args ()
558
594
595
+ # ------------------------------------------------------
596
+ # Read input files as pandas DataFrames
597
+ # ------------------------------------------------------
559
598
# Read .csv file for canproco subjects
560
- canproco_pd = read_csv_file (args .i_canproco , subjects_to_exclude_canproco )
599
+ canproco_pd = read_csv_file (args .csa_canproco , subjects_to_exclude_canproco )
561
600
# Read canproco participants.tsv file (includes pathology and phenotype columns)
562
- canproco_participants_pd = read_participants_file (args .participants_file_canproco )
601
+ canproco_participants_pd = read_participants_file (args .participants_canproco )
563
602
564
603
# Read .csv file for spine-generic subjects
565
- spinegeneric_pd = read_csv_file (args .i_spinegeneric , subjects_to_exclude_spinegeneric )
604
+ spinegeneric_pd = read_csv_file (args .csa_spinegeneric , subjects_to_exclude_spinegeneric )
566
605
# Read spine-generic participants.tsv file (includes manufacturer column)
567
- spinegeneric_participants_pd = read_participants_file (args .participants_file_spinegeneric )
606
+ spinegeneric_participants_pd = read_participants_file (args .participants_spinegeneric )
568
607
608
+ if args .lesion_folder :
609
+ lesion_df = read_lesion_files (args .lesion_folder )
610
+
611
+ # ------------------------------------------------------
612
+ # Merge and prepare DataFrames for further analysis
613
+ # ------------------------------------------------------
569
614
# Merge pathology and phenotype columns to the canproco dataframe with CSA values
570
615
canproco_pd = pd .merge (canproco_pd , canproco_participants_pd [['participant_id' , 'pathology' , 'phenotype' , 'edss_M0' ]],
571
616
how = 'left' , left_on = 'subject_id' , right_on = 'participant_id' )
@@ -576,10 +621,18 @@ def main():
576
621
# Replace n/a in phenotype by HC to allow sorting in violinplot
577
622
canproco_pd ['phenotype' ].fillna (canproco_pd ['pathology' ], inplace = True )
578
623
624
+ # Merge lesion_df to the canproco dataframe with CSA values
625
+ if args .lesion_folder :
626
+ canproco_pd = pd .merge (canproco_pd , lesion_df [['subject_id' , 'lesion_volume' , 'number_of_lesions' ]],
627
+ how = 'left' , left_on = 'subject_id' , right_on = 'subject_id' )
628
+
579
629
# Merge manufacturer column to the spine-generic dataframe with CSA values
580
630
spinegeneric_pd = pd .merge (spinegeneric_pd , spinegeneric_participants_pd [['participant_id' , 'manufacturer' ]],
581
631
how = 'left' , left_on = 'subject_id' , right_on = 'participant_id' )
582
632
633
+ # ------------------------------------------------------
634
+ # Compute descriptive statistics
635
+ # ------------------------------------------------------
583
636
# Compute median, mean, std, cov persite and phenotype
584
637
statistic = canproco_pd .groupby (['site' , 'phenotype' ]).agg ([np .median , np .mean , np .std , stats .variation ])
585
638
print (f'\n Descriptive statistics:\n { statistic } ' )
@@ -593,8 +646,11 @@ def main():
593
646
temp_pd ['site' ] = 'all'
594
647
canproco_pd = pd .concat ([canproco_pd , temp_pd ])
595
648
649
+ # ------------------------------------------------------
650
+ # Create plots and compute between sites statistics
651
+ # ------------------------------------------------------
596
652
# Create rain plot
597
- fname_fig = args .i_canproco .replace ('.csv' , '_rainplot.png' )
653
+ fname_fig = args .csa_canproco .replace ('.csv' , '_rainplot.png' )
598
654
create_rainplot (canproco_pd , spinegeneric_pd , fname_fig )
599
655
600
656
# Compute ANOVA among phenotypes
@@ -611,9 +667,16 @@ def main():
611
667
# Compare CSA values between canproco healthy controls and spine-generic per manufacturer
612
668
compare_healthy_controls (canproco_pd , spinegeneric_pd )
613
669
614
- # Compute and plot correlation between EDSS and CSA persite
615
- fname_fig = args .i_canproco .replace ('.csv' , '_correlation_persite.png' )
616
- create_csa_edss_correlation_figure_persite (canproco_pd , fname_fig )
670
+ # Define pairs of variables for which correlation will be computed
671
+ variable_pairs = [('MEAN(area)' , 'edss_M0' ),]
672
+ if args .lesion_folder :
673
+ variable_pairs .append (('MEAN(area)' , 'lesion_volume' ))
674
+ variable_pairs .append (('edss_M0' , 'lesion_volume' ))
675
+ # Compute and plot correlation between variables persite (including also the whole cohort).
676
+ # For example between EDSS and CSA, or between EDSS and lesion volume
677
+ for pair in variable_pairs :
678
+ fname_fig = args .csa_canproco .replace ('.csv' , '_correlation_' + pair [0 ] + '_vs_' + pair [1 ] + '_persite.png' )
679
+ create_correlation_figures_persite (canproco_pd , pair , fname_fig )
617
680
618
681
619
682
if __name__ == "__main__" :
0 commit comments